此脚本只返回第一个结果(即 .extract()[0]);如果我把 0 改成 1,则只返回下一个条目。为什么它没有遍历整个 xpath 的全部匹配结果?
规则(rules)部分也不起作用。我知道问题出在 response.xpath 上,应该怎么处理呢?
我的其他脚本都能正常运行,唯独这个不行。
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
class CompItem(scrapy.Item):
    """Container for one scraped complaint.

    The spider below fills ``date``, ``name``, ``title`` and ``link`` from
    the listing page and ``data`` from the complaint's own page.
    """

    # Text of the complaint's heading link (td.complaint/h4/a).
    title = scrapy.Field()
    # href of the complaint's heading link, joined with the page URL when
    # it does not contain 'http://'.
    link = scrapy.Field()
    # List of text nodes taken from td.compl-text/div on the complaint page.
    data = scrapy.Field()
    # Text of the first link found inside a td.small cell.
    name = scrapy.Field()
    # Second text node found directly inside the td.small cells.
    date = scrapy.Field()
class criticspider(CrawlSpider):
    """Spider from the question: scrape complaints from the listing page.

    Each complaint found on the listing page is turned into a ``CompItem``
    and a follow-up request is issued so that ``anchor_page`` can add the
    complaint text before the item is emitted.
    """

    name = "hand"
    allowed_domains = ["consumercomplaints.in"]
    start_urls = ["http://www.consumercomplaints.in/bysubcategory/mobile-handsets/page/1"]
    rules = (
        Rule(
            # NOTE(review): the allow pattern contains literal double-quote
            # characters, so it presumably never matches a real URL -- confirm.
            SgmlLinkExtractor(allow=('"/bysubcategory/mobile-handsets/page/1/+"',)),
            callback="parse_start_url",
            follow=True),
    )

    # NOTE(review): the Scrapy documentation warns against overriding
    # parse() on a CrawlSpider because CrawlSpider uses parse() itself to
    # apply ``rules``; this override is the likely reason the rule above
    # has no effect -- confirm against the Scrapy version in use.
    def parse(self, response):
        """Yield one detail-page request per table matched on the page.

        Each request carries a partially filled ``CompItem`` in its meta
        under the key ``'item'``.
        """
        # Selects every <table width="100%">; each match is treated as one
        # complaint, and only the first title/link/name inside it is used.
        sites = response.xpath('//table[@width="100%"]')
        # ``items`` is appended to below but never read or returned.
        items = []
        for site in sites:
            item = CompItem()
            # The indexing below raises IndexError when the table lacks the
            # expected cells.
            item['date'] = site.xpath('.//td[@class="small"]/text()').extract()[1]
            item['name'] = site.xpath('.//td[@class="small"]//a/text()').extract()[0]
            item['title'] = site.xpath('.//td[@class="complaint"]/h4/a/text()').extract()[0]
            item['link'] = site.xpath('.//td[@class="complaint"]/h4/a/@href').extract()[0]
            if item['link']:
                # Links without 'http://' are resolved against the page URL.
                if 'http://' not in item['link']:
                    item['link'] = urljoin(response.url, item['link'])
                yield scrapy.Request(item['link'],
                                     meta={'item': item},
                                     callback=self.anchor_page)
            items.append(item)

    def anchor_page(self, response):
        """Add the complaint text to the item carried in the request meta."""
        old_item = response.request.meta['item']
        # Stores the full list of extracted text nodes, not a joined string.
        old_item['data'] = response.xpath('.//td[@class="compl-text"]/div/text()').extract()
        yield old_item
答案 0 :(得分:1)
问题在于 sites 的定义方式。
目前它只是 //table[@width="100%"],这会匹配到整张表格。相反,应该查找直接位于 td 元素内、带有 id 属性的所有 div 元素:
sites = response.xpath("//td/div[@id]")
至于 rules 部分,我会采用这样的做法:使用一个不同于 parse 的回调方法来收集搜索结果。下面是包含若干其他改进的完整代码:
from urlparse import urljoin
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
class CompItem(scrapy.Item):
    """Container for one scraped complaint.

    The spider below fills ``date``, ``name``, ``title`` and ``link`` from
    a results page and ``data`` from the complaint's own page.
    """

    # Text of the complaint's heading link (td.complaint/h4/a).
    title = scrapy.Field()
    # href of the complaint's heading link, joined with the page URL when
    # it does not contain 'http://'.
    link = scrapy.Field()
    # List of text nodes taken from td.compl-text/div on the complaint page.
    data = scrapy.Field()
    # Text of the first link found inside a td.small cell.
    name = scrapy.Field()
    # Second text node found directly inside the td.small cells.
    date = scrapy.Field()
class criticspider(CrawlSpider):
    """Crawl the mobile-handsets complaint listing and scrape each complaint.

    Pagination links found inside ``div.pagelinks`` are followed, and every
    results page is handed to ``parse_results``. For each complaint on a
    results page a request to the complaint's own page is issued;
    ``anchor_page`` then adds the complaint text and emits the finished
    ``CompItem``.
    """

    name = "hand"
    allowed_domains = ["consumercomplaints.in"]
    start_urls = ["http://www.consumercomplaints.in/bysubcategory/mobile-handsets"]
    rules = (
        Rule(
            LinkExtractor(restrict_xpaths="//div[@class='pagelinks']"),
            follow=True,
            callback="parse_results",
        ),
    )

    def parse_start_url(self, response):
        """Scrape the start page with the same logic as the paginated pages.

        CrawlSpider calls this hook for the ``start_urls`` responses; without
        it, only pages reached through ``rules`` are passed to
        ``parse_results``.
        """
        return self.parse_results(response)

    def parse_results(self, response):
        """Yield one detail-page request per complaint on a results page.

        Each request carries a partially filled ``CompItem`` in its meta
        under the key ``'item'``. Blocks that lack any of the expected
        cells, or whose link is empty, are skipped.
        """
        # One <div id=...> directly inside a <td> is treated as one complaint.
        sites = response.xpath("//td/div[@id]")
        for site in sites:
            dates = site.xpath('.//td[@class="small"]/text()').extract()
            names = site.xpath('.//td[@class="small"]//a/text()').extract()
            titles = site.xpath('.//td[@class="complaint"]/h4/a/text()').extract()
            links = site.xpath('.//td[@class="complaint"]/h4/a/@href').extract()

            # Skip incomplete blocks instead of letting an IndexError abort
            # the generator and lose the remaining complaints on the page.
            if len(dates) < 2 or not (names and titles and links):
                self.log("Skipping incomplete complaint block on %s" % response.url)
                continue
            if not links[0]:
                continue

            item = CompItem()
            item['date'] = dates[1]  # the date is the second text node
            item['name'] = names[0]
            item['title'] = titles[0]
            # urljoin leaves absolute URLs as they are and resolves relative
            # ones against the page URL, so no scheme check is needed.
            item['link'] = urljoin(response.url, links[0])
            yield scrapy.Request(item['link'],
                                 meta={'item': item},
                                 callback=self.anchor_page)

    def anchor_page(self, response):
        """Add the complaint text to the item carried in the request meta.

        ``data`` receives the list of text nodes found under
        ``td.compl-text/div``; the completed item is then yielded.
        """
        item = response.meta['item']
        item['data'] = response.xpath('.//td[@class="compl-text"]/div/text()').extract()
        yield item