I want to get some data from a website, so I wrote a spider with Scrapy, but when I hand off to another callback, "parse_zai", it seems to fail. How can I get it working? Please help!!
# -*- coding: utf-8 -*-
from scrapy.http import Request
from scrapy.selector import Selector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from tencentnews.items import TencentnewsItem


class Tencentnews_spider(CrawlSpider):
    name = "Tnews"
    allowed_domains = ["news.qq.com"]  # the site being scraped
    start_urls = [
        "http://news.qq.com/china_index.shtml",
        "http://news.qq.com/world_index.shtml",
        "http://news.qq.com/society_index.shtml",
    ]
    rules = [
        # article pages go to parse_item; section index pages are only followed
        Rule(SgmlLinkExtractor(allow=('/a/\d{8}/\d{6}\.htm',)), follow=True, callback='parse_item'),
        Rule(SgmlLinkExtractor(allow=('/(.+)\.shtml',)), follow=True),
    ]

    def parse_item(self, response):
        self.log('Hi, this is an item page! %s' % response.url)
        sel = Selector(response)
        item = TencentnewsItem()
        item['articlename'] = sel.xpath("//div[@id='C-Main-Article-QQ']/div[1]/h1/text()").extract()  # get the news article title
        item['reportsource'] = sel.xpath("//span[@class='color-a-1']/a/text()").extract()
        item['articletime'] = sel.xpath("//span[@class='article-time']/text()").extract()
        item['commentnumber'] = sel.xpath("//a[@id='cmtNum']/text()").extract()
        item['commenturl'] = sel.xpath("//a[@id='cmtNum']/@href").extract()
        print repr(item).decode("unicode-escape") + '\n'
        for url in item['commenturl']:
            # pass the partially filled item along to parse_zai via the request meta
            request = Request(url, callback=self.parse_zai)
            request.meta['item'] = item
            return request

    def parse_zai(self, response):
        print 'helloworld'
        sel = Selector(response)
        item = response.meta['item']
        item['title'] = sel.xpath("//div[@class='bigTitle']/h1/a/text()").extract()
        print repr(item).decode("unicode-escape") + '\n'
        return item
Answer 0 (score: 0)
After looking at the site, I found that the comment pages live on the domain coral.qq.com/xxx, which is not in allowed_domains.
Add that domain to allowed_domains, or simply remove the allowed_domains attribute altogether.
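As a minimal sketch of the first option (assuming, as observed above, that the comment URLs are served from coral.qq.com), the spider's attribute would become:

allowed_domains = ["news.qq.com", "coral.qq.com"]  # include the comment domain so the parse_zai requests are not filtered

With that change, the requests created in parse_item should reach parse_zai instead of being dropped by the offsite middleware.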
You can check whether your requests are being dropped because of allowed_domains by filtering the DEBUG-level Scrapy log:
grep "DEBUG: Filtered offsite request" yourlogfile.log
By the way, to keep the log clean, Scrapy only logs the first filtered request for each domain that is not in allowed_domains. The entry looks like this:
DEBUG: Filtered offsite request to 'www.example.com': <GET http://www.example.com>
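If there is no log file to grep yet, one way to produce it (a sketch: the spider name Tnews comes from the code above, and the exact options may vary between Scrapy versions) is to run the crawl at DEBUG level with a log file:

scrapy crawl Tnews --loglevel=DEBUG --logfile=yourlogfile.log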