CasperJS Scraper在启动后立即退出

时间:2016-03-06 20:29:11

标签: javascript phantomjs casperjs

我在尝试编写一个相当简单的CasperJS刮刀时遇到了一些严重的问题。基本上,我想在网站上遍历一些搜索结果,跟踪每个结果,收集一些数据,然后返回当前搜索页面。完成此过程后,我想将结果写入文件。我有以下代码根本无法正常工作。请原谅任何明显的错误,我是javascript的新手,是Java,Ruby,C ++原生。

// This site can also be queried via URL, I initially wrote this serializer
// to use this approach, but I ended up going with CasperJS navigation instead.
// My problems seem agnostic to whether or not I navigate using page links or URL.
function serialize(json) {
    var str = [];
    for(var prop in json) {
        if(json.hasOwnProperty(prop)) {
            str.push(encodeURIComponent(prop) + "=" + encodeURIComponent(obj[prop]));
        }
    }
    return str.join("&");
}


// Scrape Links and Names from the current page in the searh results
function getPageLinks() {
    var dancers = document.querySelectorAll('h4 > a');
    return Array.prototype.map.call(links, function(e) {
        var result = {};
        result[e.textContent] = e.getAttribute('href');
        return result;
    });
}

// For a given dancer, scrape the block of html containing the name of each donor,
// their donation amount, and any comments.
function scrapeDonorInfo() {
    var donors = document.querySelectorAll('div.msgBottomInnCont > div.meta');
    return Array.prototype.map.call(links, function(e) {
        return e.innerHtml;
    });
}

// Use Tail recursion to scrape the donors for every dancer in each page of the search results.
function scrapeAllDonors(dancers, startIndex) {
    // Inject Underscore.js for utility methods (namely _.union())
    this.page.injectJs('https://cdnjs.cloudflare.com/ajax/libs/underscore.js/1.8.3/underscore-min.js');

    // Populate the links object only after there are links to scrape
    casper.waitForSelector('h4 > a', function() {
        var links = this.evaluate(getPageLinks);
        dancers = this.evaluate(_.union(dancers, links));
    });
    this.echo('Links object populated', 'INFO'); // Log the message, 
                                                 // using this.echo() for colored tags

    // For every dancer page link on this page of search results,
    // fetch their fundraising page, scrape their donors, 
    // 
    dancers.forEach(function(element, index, array) {
        if(index >= startIndex) {
            var name = Object.keys(element)[0];
            var link = baseURL + element[name];
            casper.thenOpen(link);
            casper.waitForSelector('div.meta', function() {
                var viewMore = 'a.viewMore';
                if(casper.visible(viewMoreActivity)) {
                    casper.thenClick(viewMore);
                }

                element[name] = {"donor_info": this.evaluate(getDonorInfo)};
            });
            casper.back();
        }
    });

    var nextLink = "a#next";
    casper.waitForSelector(nextLink, function() {

        // If the next button in the results is clickable, click it.
        if (casper.visible(nextLink)) {
            casper.thenClick(nextLink);
            casper.thenEvaluate(scrapeAllDonors(dancers, dancers.length()));
        } else {
            // Otherwise, write the final results to file.
            fs.write(save, dancers, 'w');
            casper.echo("END")
        }
    });
}


// Note: This is the Phantom.js package 'fs', not the Node.js package.
var fs = require('fs');

// Create a dated file for scrape results
var fname = new Date().getTime() + '.txt';
var save = fs.pathJoin(fs.workingDirectory, 'data', fname);

// Initialize Casper.js with desired settings
var casper = require('casper').create({
    verbose: true,
    logLevel: 'debug',
    pageSettings: {
        loadImages:  false,     
        loadPlugins: false         
    }
});

// Handler for Resource Errors
casper.on("resource.error", function(resourceError) {
    console.log('Unable to load resource (#' + resourceError.id + 'URL:' + resourceError.url + ')');
    console.log('Error code: ' + resourceError.errorCode + '. Description: ' + resourceError.errorString);
});

// Handler for Page Errors
casper.on("page.error", function (msg, trace) {
    console.log( 'Error: ' + msg, 'ERROR' );
    console.log( 'Trace: ' + trace, 'TRACE' );
});

// Handler for Blocking requests made by social components (facebook in particular)
casper.on("resource.requested", function(requestData, networkRequest){
    console.log('Request (#' + requestData.id + '): ' + JSON.stringify(requestData) + "\n");
    if (requestData.url.indexOf("facebook") !== -1) {
        networkRequest.abort();
    }
});

// BaseURL for the site, convenient for scrapeAllDonors
var baseURL = 'https://fundraise.nudm.org/';

casper.start('https://fundraise.nudm.org/search/fundraisers?page=1');

casper.then(scrapeAllDonors([], 0));

// Run everything in the stack, then notify and exit
casper.run(function() {
    this.echo("DONE", 'INFO');
    this.exit();
});

为了解决问题,Casper / Phantom拒绝打印任何日志消息,我无法弄清楚原因。当我没有调试运行时,我得到:

casperjs --ssl-protocol=tlsv1 Crawler.js
[info] [phantom] Starting...
Unsafe JavaScript attempt to access frame with URL about:blank from frame with URL file:///usr/local/Cellar/casperjs/1.1-beta4/libexec/bin/bootstrap.js. Domains, protocols and ports must match.

启用调试后,我得到:

 casperjs --ssl-protocol=tlsv1 -debug=true Crawler.js
Unable to open file: -debug=true
Unable to load script -debug=true; check file syntax
dhcp-199-74-85-154:NUDM Expose williambyrne$ casperjs --ssl-protocol=tlsv1 --debug=true Crawler.js
2016-03-06T14:22:31 [DEBUG] CookieJar - Created but will not store cookies (use option '--cookies-file=<filename>' to enable persisten cookie storage) 
2016-03-06T14:22:31 [DEBUG] Phantom - execute: Configuration 
2016-03-06T14:22:31 [DEBUG]      0 objectName : "" 
2016-03-06T14:22:31 [DEBUG]      1 cookiesFile : "" 
2016-03-06T14:22:31 [DEBUG]      2 diskCacheEnabled : "false" 
2016-03-06T14:22:31 [DEBUG]      3 maxDiskCacheSize : "-1" 
2016-03-06T14:22:31 [DEBUG]      4 ignoreSslErrors : "false" 
2016-03-06T14:22:31 [DEBUG]      5 localToRemoteUrlAccessEnabled : "false" 
2016-03-06T14:22:31 [DEBUG]      6 outputEncoding : "UTF-8" 
2016-03-06T14:22:31 [DEBUG]      7 proxyType : "http" 
2016-03-06T14:22:31 [DEBUG]      8 proxy : ":1080" 
2016-03-06T14:22:31 [DEBUG]      9 proxyAuth : ":" 
2016-03-06T14:22:31 [DEBUG]      10 scriptEncoding : "UTF-8" 
2016-03-06T14:22:31 [DEBUG]      11 webSecurityEnabled : "true" 
2016-03-06T14:22:31 [DEBUG]      12 offlineStoragePath : "" 
2016-03-06T14:22:31 [DEBUG]      13 offlineStorageDefaultQuota : "-1" 
2016-03-06T14:22:31 [DEBUG]      14 printDebugMessages : "true" 
2016-03-06T14:22:31 [DEBUG]      15 javascriptCanOpenWindows : "true" 
2016-03-06T14:22:31 [DEBUG]      16 javascriptCanCloseWindows : "true" 
2016-03-06T14:22:31 [DEBUG]      17 sslProtocol : "tlsv1" 
2016-03-06T14:22:31 [DEBUG]      18 sslCertificatesPath : "" 
2016-03-06T14:22:31 [DEBUG]      19 webdriver : ":" 
2016-03-06T14:22:31 [DEBUG]      20 webdriverLogFile : "" 
2016-03-06T14:22:31 [DEBUG]      21 webdriverLogLevel : "INFO" 
2016-03-06T14:22:31 [DEBUG]      22 webdriverSeleniumGridHub : "" 
2016-03-06T14:22:31 [DEBUG] Phantom - execute: Script & Arguments 
2016-03-06T14:22:31 [DEBUG]      script: "/usr/local/Cellar/casperjs/1.1-beta4/libexec/bin/bootstrap.js" 
2016-03-06T14:22:31 [DEBUG]      0 arg: "--casper-path=/usr/local/Cellar/casperjs/1.1-beta4/libexec" 
2016-03-06T14:22:31 [DEBUG]      1 arg: "--cli" 
2016-03-06T14:22:31 [DEBUG]      2 arg: "Crawler.js" 
2016-03-06T14:22:31 [DEBUG] Phantom - execute: Starting normal mode 
2016-03-06T14:22:31 [DEBUG] WebPage - setupFrame "" 
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/fs.js" QMap(("mode", QVariant(QString, "r") ) )  
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/system.js" QMap(("mode", QVariant(QString, "r") ) )  
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/_coffee-script.js" QMap(("mode", QVariant(QString, "r") ) )  
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/../coffee-script/package.json" QMap(("mode", QVariant(QString, "r") ) )  
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/../coffee-script/./lib/coffee-script/coffee-script.js" QMap(("mode", QVariant(QString, "r") ) )  
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/../coffee-script/./lib/coffee-script/./lexer.js" QMap(("mode", QVariant(QString, "r") ) )  
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/../coffee-script/./lib/coffee-script/././rewriter.js" QMap(("mode", QVariant(QString, "r") ) )  
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/../coffee-script/./lib/coffee-script/././helpers.js" QMap(("mode", QVariant(QString, "r") ) )  
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/../coffee-script/./lib/coffee-script/./parser.js" QMap(("mode", QVariant(QString, "r") ) )  
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/../coffee-script/./lib/coffee-script/./helpers.js" QMap(("mode", QVariant(QString, "r") ) )  
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/../coffee-script/./lib/coffee-script/./nodes.js" QMap(("mode", QVariant(QString, "r") ) )  
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/../coffee-script/./lib/coffee-script/././scope.js" QMap(("mode", QVariant(QString, "r") ) )  
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/../coffee-script/./lib/coffee-script/./././helpers.js" QMap(("mode", QVariant(QString, "r") ) )  
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/../coffee-script/./lib/coffee-script/././lexer.js" QMap(("mode", QVariant(QString, "r") ) )  
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/../coffee-script/./lib/coffee-script/./././rewriter.js" QMap(("mode", QVariant(QString, "r") ) )  
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/webpage.js" QMap(("mode", QVariant(QString, "r") ) )  
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: "/usr/local/Cellar/casperjs/1.1-beta4/libexec/package.json" QMap(("mode", QVariant(QString, "r") ) )  
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: "/usr/local/Cellar/casperjs/1.1-beta4/libexec/modules/cli.js" QMap(("mode", QVariant(QString, "r") ) )  
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: "/usr/local/Cellar/casperjs/1.1-beta4/libexec/modules/utils.js" QMap(("mode", QVariant(QString, "r") ) )  
2016-03-06T14:22:31 [DEBUG] Phantom - injectJs: "Crawler.js" 
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: "/usr/local/Cellar/casperjs/1.1-beta4/libexec/modules/casper.js" QMap(("mode", QVariant(QString, "r") ) )  
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: "/usr/local/Cellar/casperjs/1.1-beta4/libexec/modules/colorizer.js" QMap(("mode", QVariant(QString, "r") ) )  
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: "/usr/local/Cellar/casperjs/1.1-beta4/libexec/modules/events.js" QMap(("mode", QVariant(QString, "r") ) )  
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: "/usr/local/Cellar/casperjs/1.1-beta4/libexec/modules/http.js" QMap(("mode", QVariant(QString, "r") ) )  
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: "/usr/local/Cellar/casperjs/1.1-beta4/libexec/modules/mouse.js" QMap(("mode", QVariant(QString, "r") ) )  
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: "/usr/local/Cellar/casperjs/1.1-beta4/libexec/modules/pagestack.js" QMap(("mode", QVariant(QString, "r") ) )  
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: "/usr/local/Cellar/casperjs/1.1-beta4/libexec/modules/querystring.js" QMap(("mode", QVariant(QString, "r") ) )  
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: "/usr/local/Cellar/casperjs/1.1-beta4/libexec/modules/tester.js" QMap(("mode", QVariant(QString, "r") ) )  
[info] [phantom] Starting...
2016-03-06T14:22:31 [DEBUG] WebpageCallbacks - getJsConfirmCallback 
2016-03-06T14:22:31 [DEBUG] WebpageCallbacks - getGenericCallback 
2016-03-06T14:22:31 [DEBUG] WebpageCallbacks - getJsConfirmCallback 
2016-03-06T14:22:31 [DEBUG] WebPage - setupFrame "" 
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/fs.js" QMap(("mode", QVariant(QString, "r") ) )  
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/system.js" QMap(("mode", QVariant(QString, "r") ) )  
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/_coffee-script.js" QMap(("mode", QVariant(QString, "r") ) )  
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/webpage.js" QMap(("mode", QVariant(QString, "r") ) )  
2016-03-06T14:22:31 [DEBUG] WebPage - updateLoadingProgress: 10 
2016-03-06T14:22:31 [DEBUG] WebPage - setupFrame "" 
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/fs.js" QMap(("mode", QVariant(QString, "r") ) )  
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/system.js" QMap(("mode", QVariant(QString, "r") ) )  
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/_coffee-script.js" QMap(("mode", QVariant(QString, "r") ) )  
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/webpage.js" QMap(("mode", QVariant(QString, "r") ) )  
2016-03-06T14:22:31 [DEBUG] WebPage - updateLoadingProgress: 100 
Unsafe JavaScript attempt to access frame with URL about:blank from frame with URL file:///usr/local/Cellar/casperjs/1.1-beta4/libexec/bin/bootstrap.js. Domains, protocols and ports must match.

2016-03-06T14:22:31 [DEBUG] WebPage - updateLoadingProgress: 10 
2016-03-06T14:22:31 [DEBUG] WebPage - updateLoadingProgress: 100 

有什么想法吗?

更新(在做出第一个建议的更改后)

williambyrne$ casperjs --ssl-protocol=tlsv1  Crawler.js
[info] [phantom] Starting...
[info] [phantom] Running suite: 3 steps
[debug] [phantom] opening url: https://fundraise.nudm.org/search/fundraisers?page=1, HTTP GET
[debug] [phantom] Navigation requested: url=https://fundraise.nudm.org/search/fundraisers?page=1, type=Other, willNavigate=true, isMainFrame=true
Request (#1): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Accept","value":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"}],"id":1,"method":"GET","time":"2016-03-06T21:03:49.874Z","url":"https://fundraise.nudm.org/search/fundraisers?page=1"}

[debug] [phantom] url changed to "https://fundraise.nudm.org/search/fundraisers?page=1"
Request (#2): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Accept","value":"text/css,*/*;q=0.1"},{"name":"Referer","value":"https://fundraise.nudm.org/search/fundraisers?page=1"}],"id":2,"method":"GET","time":"2016-03-06T21:03:51.112Z","url":"https://fundraise.nudm.org/css/sc_global.css?cuiv=1456860159443"}

Request (#3): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Accept","value":"text/css,*/*;q=0.1"},{"name":"Referer","value":"https://fundraise.nudm.org/search/fundraisers?page=1"}],"id":3,"method":"GET","time":"2016-03-06T21:03:51.113Z","url":"https://fundraise.nudm.org/stylesheets/css/charity/search.css?cuiv=1456860159443"}

Request (#4): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Accept","value":"text/css,*/*;q=0.1"},{"name":"Referer","value":"https://fundraise.nudm.org/search/fundraisers?page=1"}],"id":4,"method":"GET","time":"2016-03-06T21:03:51.113Z","url":"https://fundraise.nudm.org/css/white_label_header_v3.4.3.1.css?cuiv=1456860159443"}

Request (#5): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Accept","value":"text/css,*/*;q=0.1"},{"name":"Referer","value":"https://fundraise.nudm.org/search/fundraisers?page=1"}],"id":5,"method":"GET","time":"2016-03-06T21:03:51.114Z","url":"https://fundraise.nudm.org/css/white_label_header_responsive.css?cuiv=1456860159443"}

Request (#6): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Accept","value":"*/*"},{"name":"Referer","value":"https://fundraise.nudm.org/search/fundraisers?page=1"}],"id":6,"method":"GET","time":"2016-03-06T21:03:51.114Z","url":"https://ajax.googleapis.com/ajax/libs/jquery/1.8.1/jquery.min.js"}

Request (#7): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Accept","value":"*/*"},{"name":"Referer","value":"https://fundraise.nudm.org/search/fundraisers?page=1"}],"id":7,"method":"GET","time":"2016-03-06T21:03:51.114Z","url":"https://fundraise.nudm.org/js/front_scripts.js?cuiv=1456860159443"}

Request (#8): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Accept","value":"*/*"},{"name":"Referer","value":"https://fundraise.nudm.org/search/fundraisers?page=1"}],"id":8,"method":"GET","time":"2016-03-06T21:03:51.115Z","url":"https://fundraise.nudm.org/js/mobile_share.js?cuiv=1456860159443"}

Request (#9): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Accept","value":"*/*"},{"name":"Referer","value":"https://fundraise.nudm.org/search/fundraisers?page=1"}],"id":9,"method":"GET","time":"2016-03-06T21:03:51.115Z","url":"https://fundraise.nudm.org/js/search.js?cuiv=1456860159443"}

Request (#10): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Accept","value":"*/*"},{"name":"Referer","value":"https://fundraise.nudm.org/search/fundraisers?page=1"}],"id":10,"method":"GET","time":"2016-03-06T21:03:51.116Z","url":"https://fundraise.nudm.org/js/mobile.js?cuiv=1456860159443"}

Request (#11): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Accept","value":"*/*"},{"name":"Referer","value":"https://fundraise.nudm.org/search/fundraisers?page=1"}],"id":11,"method":"GET","time":"2016-03-06T21:03:51.304Z","url":"https://ssl.google-analytics.com/ga.js"}

Request (#12): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Accept","value":"*/*"},{"name":"Referer","value":"https://fundraise.nudm.org/search/fundraisers?page=1"}],"id":12,"method":"GET","time":"2016-03-06T21:03:51.304Z","url":"https://www.google-analytics.com/analytics.js"}

Request (#13): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Referer","value":"https://fundraise.nudm.org/search/fundraisers?page=1"},{"name":"Accept","value":"*/*"}],"id":13,"method":"GET","time":"2016-03-06T21:03:51.309Z","url":"https://fundraise.nudm.org/css/fonts/proximanova/ProximaNova-Reg-webfont.woff"}

Request (#14): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Accept","value":"*/*"},{"name":"Referer","value":"https://fundraise.nudm.org/search/fundraisers?page=1"}],"id":14,"method":"GET","time":"2016-03-06T21:03:51.313Z","url":"https://connect.facebook.com/en_US/sdk.js"}

Request (#15): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Referer","value":"https://fundraise.nudm.org/search/fundraisers?page=1"},{"name":"Accept","value":"*/*"}],"id":15,"method":"GET","time":"2016-03-06T21:03:51.314Z","url":"https://fundraise.nudm.org/css/fonts/proximanova/ProximaNova-Sbold-webfont.woff"}

Request (#16): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Referer","value":"https://fundraise.nudm.org/search/fundraisers?page=1"},{"name":"Accept","value":"*/*"}],"id":16,"method":"GET","time":"2016-03-06T21:03:51.315Z","url":"https://fundraise.nudm.org/css/fonts/pictos/pictos-webfont.woff"}

Request (#17): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Referer","value":"https://fundraise.nudm.org/search/fundraisers?page=1"},{"name":"Accept","value":"*/*"}],"id":17,"method":"GET","time":"2016-03-06T21:03:51.315Z","url":"https://fundraise.nudm.org/css/fonts/proximanova/ProximaNova-Bold-webfont.woff"}

Request (#18): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Referer","value":"https://fundraise.nudm.org/search/fundraisers?page=1"},{"name":"Accept","value":"*/*"}],"id":18,"method":"GET","time":"2016-03-06T21:03:51.316Z","url":"https://fundraise.nudm.org/css/fonts/proximanova/ProximaNova-Thin-webfont.woff"}

Request (#19): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Referer","value":"https://fundraise.nudm.org/search/fundraisers?page=1"},{"name":"Accept","value":"*/*"}],"id":19,"method":"GET","time":"2016-03-06T21:03:51.317Z","url":"https://fundraise.nudm.org/css/fonts/entypo/entypo.woff"}

Unable to load resource (#14URL:)
Error code: 301. Description: Protocol "" is unknown
Request (#20): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Accept","value":"*/*"},{"name":"Referer","value":"https://fundraise.nudm.org/search/fundraisers?page=1"}],"id":20,"method":"GET","time":"2016-03-06T21:03:51.796Z","url":"https://js-agent.newrelic.com/nr-885.min.js"}

Request (#21): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Accept","value":"*/*"},{"name":"Referer","value":"https://fundraise.nudm.org/search/fundraisers?page=1"}],"id":21,"method":"GET","time":"2016-03-06T21:03:53.756Z","url":"https://bam.nr-data.net/1/67fe2a1b26?a=10291124&v=885.a559836&to=ZV0HYUJUCEYEU0QLC1wXJFZEXAlbSlRVBAVHVBEaQ1AHRwZYHwQRXFwXVFlGA0cW&rst=2645&ap=775&fe=686&dc=204&f=%5B%5D&at=SRoEFwpOG0g%3D&jsonp=NREUM.setToken"}

[debug] [phantom] Successfully injected Casper client-side utilities
[debug] [phantom] start page is loaded
[info] [phantom] Step anonymous 3/3 https://fundraise.nudm.org/search/fundraisers?page=1 (HTTP 200)
Links object populated
[info] [phantom] Step anonymous 3/3: done in 3944ms.
[info] [phantom] Step _step 4/5 https://fundraise.nudm.org/search/fundraisers?page=1 (HTTP 200)
[info] [phantom] Step _step 4/5: done in 3965ms.
[info] [phantom] waitFor() finished in 40ms.
[info] [phantom] Step anonymous 5/6 https://fundraise.nudm.org/search/fundraisers?page=1 (HTTP 200)
Error: ReferenceError: Can't find variable: links
Trace: [object Object],[object Object],[object Object]
Unsafe JavaScript attempt to access frame with URL about:blank from frame with URL file:///usr/local/Cellar/casperjs/1.1-beta4/libexec/bin/bootstrap.js. Domains, protocols and ports must match.

Unsafe JavaScript attempt to access frame with URL about:blank from frame with URL file:///usr/local/Cellar/casperjs/1.1-beta4/libexec/bin/bootstrap.js. Domains, protocols and ports must match.

Unsafe JavaScript attempt to access frame with URL about:blank from frame with URL file:///usr/local/Cellar/casperjs/1.1-beta4/libexec/bin/bootstrap.js. Domains, protocols and ports must match.

似乎“链接”的范围存在一些问题。阵列。

更新2 :(对scrapeAllDonors的更改)

// Use Tail recursion to scrape the donors for every dancer in each page of the search results.
function scrapeAllDonors(dancers, startIndex) {
    // Inject Underscore.js for utility methods (namely _.union())
    this.page.injectJs('https://cdnjs.cloudflare.com/ajax/libs/underscore.js/1.8.3/underscore-min.js');

    // Populate the links object only after there are links to scrape
    casper.waitForSelector('h4 > a', function() {
        var links = this.evaluate(getPageLinks);
        dancers = this.evaluate(_.union(dancers, links));

        // For every dancer page link on this page of search results,
        // fetch their fundraising page, scrape their donors, 
        // 
        dancers.forEach(function(element, index, array) {
            if(index >= startIndex) {
                var name = Object.keys(element)[0];
                var link = baseURL + element[name];
                casper.thenOpen(link);
                casper.waitForSelector('div.meta', function(name) {
                    var viewMore = 'a.viewMore';
                    if(casper.visible(viewMoreActivity)) {
                        casper.thenClick(viewMore);
                    }

                    element[name] = {"donor_info": this.evaluate(getDonorInfo)};
                }, name);
                casper.back();
            }
        });

        // If the next button in the results is clickable, click it.
        var nextLink = "a#next";
        if (casper.visible(nextLink)) {
            casper.thenClick(nextLink);
            casper.then(function() {
                scrapeAllDonors.call(this, dancers, dancers.length());
            });
        } else {
            // Otherwise, write the final results to file.
            fs.write(save, dancers, 'w');
            casper.echo("END")
        }
    });
    this.echo('Donor Information Scraped', 'INFO'); // Log the message, 
                                                 // using this.echo() for colored tags
}

1 个答案:

答案 0 :(得分:1)

您已经犯了立即调用scrapeAllDonors的错误,而不是稍后将其传递执行,此处:

casper.thenEvaluate(scrapeAllDonors(dancers, dancers.length()));

在这里:

casper.then(scrapeAllDonors([], 0));

这意味着它甚至在加载第一页之前执行,因此尝试对about:blank进行操作。如果你想这样调用它,你需要重构scrapeAllDonors,以便它返回一个步进函数:

function scrapeAllDonors(dancers, startIndex) {
    return function(){
        // Inject Underscore.js for utility methods (namely _.union())
        this.page.injectJs('https://cdnjs.cloudflare.com/ajax/libs/underscore.js/1.8.3/underscore-min.js');

        // ...

        var nextLink = "a#next";
        casper.waitForSelector(nextLink, function() {
            // ...
        });
    };
}

如果您不想更改scrapeAllDonors,则可以通过替换

来重构对它的调用
casper.then(scrapeAllDonors(...));

casper.then(function(){
    scrapeAllDonors.call(this, ...)
});

我对What must be wrapped in then() statements in CasperJS? How to determine execution order of sync/async functions?的回答可能有助于理解CasperJS中异步执行的复杂性。