Why does scrapy repeatedly scrape one result only?

时间:2015-08-07 02:21:29

标签: python web-scraping scrapy scrapy-spider

Please help me with this problem: the following spider code is expected to return all the jobs listed at the start URL. However, it only returns many copies of the first job. The XPath expressions were tested successfully in "XPath Checker". What is wrong? Thanks for your input!

from scrapy.spiders import Spider
from scrapy.selector import Selector
from Testjobs.items import TestjobsItem, TestjobsItemLoader

class TestjobSpider(Spider):
    """Spider that loads one item per job row found on the search results page.

    This is the code as posted in the question; it yields one item per
    matched row, but every item ends up with the same title (see the
    NOTE in ``parse``).
    """

    name = "test"
    # NOTE(review): this entry is a full URL (scheme and trailing slash).
    # Scrapy documents allowed_domains as a list of domain names, e.g.
    # 'careers.pathologyjobstoday.org' -- confirm and adjust.
    allowed_domains = ['http://careers.pathologyjobstoday.org/']
    start_urls = [
        'http://careers.pathologyjobstoday.org/jobseeker/search/results'
    ]

    def parse(self, response):
        """Yield one loaded item for each <tr> whose id contains "jt_jobrow_"."""
        hxs = Selector(response)  
        # One selector per job row in the results table.
        sites = hxs.xpath('//tr[contains(@id, "jt_jobrow_")]') 

        for site in sites:
            # The loader is bound to the current row through selector=site.
            il = TestjobsItemLoader(response=response, selector=site)
            # NOTE: the inner XPath starts with '//', which is an absolute
            # path evaluated from the document root, not from `site`.
            # normalize-space() then uses the first matching node in the
            # whole document, so every row yields the first job's title.
            # A row-relative path would start with './/' instead.
            il.add_xpath('title', 'normalize-space(//div[@class="jt_jobs_title"]/text())') 
            yield il.load_item()

1 个答案:

答案 0 :(得分:1)

您需要在“内部”XPath 的开头加一个点，使其相对于当前上下文节点（即循环中的每一行）进行匹配，例如：normalize-space(.//div[@class="jt_jobs_title"]/text())

/**
 * Abbreviates {@code input} by replacing its middle part with {@code middle},
 * cutting only at word boundaries.
 *
 * <p>The text kept on each side is at least
 * {@code (length - middle.length()) / 2} characters long and is extended
 * to the nearest word boundary, so the result may be longer than
 * {@code length}.
 *
 * @param input  the text to abbreviate; may be {@code null}
 * @param middle the replacement inserted between the kept head and tail
 * @param length inputs longer than this many characters are abbreviated
 * @return the abbreviated text, or {@code input} unchanged when it is
 *         {@code null}, when {@code middle} is {@code null}, when it is not
 *         longer than {@code length}, when {@code middle} is too long for
 *         {@code length}, or when the pattern does not match (for example,
 *         input containing line terminators, which {@code .} does not match
 *         by default)
 */
public static String abbreviateMiddle(String input, String middle, int length) {
    // Nothing to abbreviate, or nothing to abbreviate with.
    if (input == null || middle == null || input.length() <= length) {
        return input;
    }

    int half = (length - middle.length()) / 2;
    if (half < 0) {
        // A negative minimum would produce an invalid quantifier such as
        // ".{-3,}" and make Pattern.compile throw PatternSyntaxException.
        return input;
    }

    // Lazy head and tail groups take the fewest characters (>= half) that
    // end/start on a word boundary; the greedy ".*" swallows the rest.
    Pattern pattern = Pattern.compile(
            "^(.{" + half + ",}?)" + "\\b.*\\b" + "(.{" + half + ",}?)$");
    Matcher matcher = pattern.matcher(input);

    if (matcher.matches()) {
        return matcher.group(1) + middle + matcher.group(2);
    }

    return input;
}