无法构建自定义Web scraper

时间:2015-02-25 18:57:00

标签: javascript node.js asynchronous

我尝试使用node.js上的request,cheerio和async构建自定义Web scraper。 我无法弄清楚为什么我在公司参数中得到一些未定义的值导致瀑布崩溃。 我知道代码有点乱,但它大部分都是可读的。我认为我的问题可能来自回调电话。

async.waterfall([
  function(callback){
    var base_url = 'http://www.architonic.com/fr/pmfairexh/imm-cologne/8550409/';
    _.times(3, function(n){
      var url = base_url+(n+1);
      request(url, function (error, response, html) {
        if (error)
          callback(url);
        if (!error && response.statusCode == 200) {
          var $ = cheerio.load(html);
          var links_companies = $('#sheet_content_inside > ul > li > div h2 > a'),
              //links_companies = $('a', li_companies),
              companies = [];

          //console.log(colors.blue(url), links_companies.length);
          links_companies.each(function(i, a) {
            companies.push({name: $(this).attr('title'), url_from: url, next_url: $(this).attr('href')});
          });
          callback(null, companies);
        }
      });
    });
  },
  function(companies, callback){
    async.map(companies, function(c, cb){
      var url = c.next_url;
      if(!_.isUndefined(url))
      request(url, function (error, response, html) {
        console.log(url, c.name);
        if (error)
          cb(url);
        if (!error && response.statusCode == 200) {
          var $ = cheerio.load(html),
              profile_link = $('#head_main_content > div > div:nth-child(4) > h6 > a'),
              cons = (typeof profile_link == 'undefined')?  "WHAT?" : profile_link.attr('href');
              //console.log("url founded: "+cons);
              // if(typeof profile_link == 'undefined')
              //   cb(c);
              c.origin_url = c.url_from;
              c.next_url = profile_link.attr('href');
              c.url_from = url;
              c.profile_url = c.next_url;
              //console.log(c);
              // if (!_.isUndefined(c.next_url))
                cb(null, c);
              // else
              //   cb(c);
              //return _.extend({}, c, {profile_url: profile_link.attr('href'), origin_url: c.url_from});
        }
      });
    }, function(err, _companies){
      callback(null, _companies);
    });
  },
  function(companies, callback){
    async.map(companies, function(c, cb){
      var url = c.next_url;
      console.log(colors.green(url), colors.red(c.name));
      // if (_.isUndefined(url))
      //   return cb(c);
      if(!_.isUndefined(url))
        request(url, function (error, response, html) {
          //console.log(url, c.name);
          if (error)
            cb(url);
          if (!error && response.statusCode == 200) {
            var $ = cheerio.load(html),
                left_zone = $('#sheet_content_inside > div.margin_top_20 > div.left'),
                right_zone = $('#sheet_content_inside > div.margin_top_20 > div.right.width_195');
            //console.log(left_zone.html(), right_zone.html());
            var name = $('span[itemprop="name"]', left_zone).text(),
                s_address = $('span[itemprop="streetAddress"]', left_zone).text(),
                p_code = $('span[itemprop="postalCode"]', left_zone).text(),
                city = $('span[itemprop="addressLocality"]', left_zone).text(),
                country = $('span[itemprop="addressCountry"]', left_zone).text();

            console.log(name, s_address, p_code, city, country);
            cb(null, c);
          }
        });
    }, function(err, _companies){
      callback(null, _companies);
    });
  }
  ], function(err, companies){
    console.log(colors.warn("end"), companies.length);
    console.log(err);
    console.log(companies);
});

1 个答案:

答案 0 :(得分:0)

在找到真正的罪魁祸首之前,您可以清理公司数组,然后再将其传递给下一个函数或每个async.map之前。像这样;

companies = _.compact(companies)

这将确保列表清除空值和未定义