需要 CasperJS 网页抓取(scraping)方面的帮助

时间:2014-04-24 09:03:32

标签: casperjs

我正试图访问这个页面(this page),并从其中的每个链接中抓取每篇论文的“标题”和“作者”。到目前为止,我写了下面这段代码(我需要帮助的问题写在代码的注释中):

// CasperJS helper module; utils.dump() is used below to print the results.
var utils = require('utils');
// Single Casper instance shared by every function in this script.
var casper = require('casper').create({
  verbose: true,
  logLevel: 'error',
  pageSettings: {
    // Images and plugins are switched off; only text is read from the pages.
    loadImages: false,
    loadPlugins: false,
    userAgent: 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36'
  },
  // NOTE(review): jQuery is listed here but nothing in the visible code references it.
  clientScripts: ['lib/jquery.min.js']
});

// Index of the next entry of `links` to process; advanced by loopThroughThesisLinks.
var i = 0;
// Thesis page URLs; filled in by the casper.start callback.
var links = [];
// Collected { title, author } records, one pushed per visited thesis page.
var thesis_data = [];

/**
 * Collects the `href` attribute of every element matched in the current
 * document. Reads the global `document`, so it is meant to be handed to
 * casper.evaluate().
 *
 * @returns {Array} one `href` attribute value per matched element
 */
function getThesisLinks () {
  // Not sure what should go in ('')
  var anchors = document.querySelectorAll('');
  var readHref = function (anchor) {
    return anchor.getAttribute('href');
  };
  return Array.prototype.map.call(anchors, readHref);
}

/**
 * Processes the link at the global index `i`, then passes itself to
 * this.run() so the following link is handled on the next round. Once every
 * entry of `links` has been visited it dumps `thesis_data` and exits.
 * Must be invoked with the Casper instance as `this`.
 */
function loopThroughThesisLinks() {
  // Every link has been handled: print the collected records and stop.
  if (i >= links.length) {
    utils.dump(thesis_data);
    this.exit();
    return;
  }

  var currentLink = links[i];
  this.echo('[LINK #' + i + '] ' + currentLink);
  getThesisData.call(this, currentLink);
  i += 1;
  // Recurse through run() until the list is exhausted.
  this.run(loopThroughThesisLinks);
}

/**
 * Starts a navigation to one thesis page and, in its callback, appends a
 * { title, author } record to the global `thesis_data` array.
 * Must be invoked with the Casper instance as `this`.
 *
 * @param {string} link URL of the thesis page to open
 */
function getThesisData(link) {
  var recordThesis = function () {
    var entry = {};

    // Get title of thesis - not sure what element to insert for this.fetchText
    entry.title = this.fetchText('');

    // Get name of authors - not sure what element to insert for this.fetchText
    entry.author = this.fetchText('');

    // Add the title & author data to the thesis_data array
    thesis_data.push(entry);
  };

  this.start(link, recordThesis);
}

// Entry point: open the browse listing, collect the thesis links from it,
// then let loopThroughThesisLinks visit them one by one.
casper.start('http://ses.library.usyd.edu.au/handle/2123/345/browse?type=dateissued&sort_by=2&order=DESC&rpp=1495&etal=0&submit_browse=Update', function() {
  // evaluate() may not hand back an array (presumably null when the
  // page-side function fails, e.g. on an invalid selector); fall back to an
  // empty list so the `.length` reads below cannot throw.
  links = this.evaluate(getThesisLinks) || [];

  // Convert relative links to absolute URLs
  // NOTE(review): assumes every href is relative to /handle/ - confirm against the page markup.
  // The index is named `idx` so it is not confused with the global cursor `i`.
  for (var idx = 0; idx < links.length; idx++) {
    links[idx] = "http://ses.library.usyd.edu.au/handle/" + links[idx];
  }

  utils.dump(links);
});

// Once the start step has finished, begin the per-link loop.
casper.run(loopThroughThesisLinks);

任何帮助都将不胜感激。

1 个答案:

答案 0 :(得分:1)

这是所有链接的简单CSS选择器:

// Matches each <a> that is a direct child of the third <td> in a row of table.misctable.
var links = document.querySelectorAll(
           'table.misctable > tbody > tr > td:nth-of-type(3) > a');

你也可以像这样使用XPath:

var x = require('casper').selectXPath; // goes to the beginning of the file
// Reads the text of the second cell in the table row whose first cell contains "Title:".
var title = this.fetchText(x('//table//tr/td[1][contains(text(),"Title:")]/../td[2]'));

我想你可以自己写出 authors 的查询。我可能会换一种方式来完成抓取:在循环中使用 casper.thenOpen,因为当额外的 start 和 run 调用分散在不同的函数中时,代码会很难阅读。

使用casper.thenOpen,它看起来像这样:

var x = require('casper').selectXPath; // goes to the beginning of the file

/**
 * Processes the link at the global index `i`, then passes itself to
 * this.then() so the following link is handled afterwards. Once every entry
 * of `links` has been visited it dumps `thesis_data` and exits.
 * Must be invoked with the Casper instance as `this`.
 */
function loopThroughThesisLinks() {
  // Every link has been handled: print the collected records and stop.
  if (i >= links.length) {
    utils.dump(thesis_data);
    this.exit();
    return;
  }

  var currentLink = links[i];
  this.echo('[LINK #' + i + '] ' + currentLink);
  getThesisData.call(this, currentLink);
  i += 1;
  // Recurse through then() until the list is exhausted.
  this.then(loopThroughThesisLinks);
}

/**
 * Calls thenOpen() for one thesis page and, in its callback, appends a
 * { title, author } record to the global `thesis_data` array. Both values
 * are read from the second cell of the table row whose first cell contains
 * the matching label. Must be invoked with the Casper instance as `this`.
 *
 * @param {string} link URL of the thesis page to open
 */
function getThesisData(link) {
  var titlePath = '//table//tr/td[1][contains(text(),"Title:")]/../td[2]';
  var authorPath = '//table//tr/td[1][contains(text(),"Authors:")]/../td[2]';

  this.thenOpen(link, function scrapeThesisPage() {
    var record = {
      title: this.fetchText(x(titlePath)),
      author: this.fetchText(x(authorPath))
    };

    // Add the title & author data to the thesis_data array
    thesis_data.push(record);
  });
}