在我的start_urls中,如果我定义主页,则scrapy不会抓取页面,并且“if”检查parse_item函数永远不会被命中(例如:'someurl.com/medical/patient-info')。但是当我在start url中提供相同的页面url(即start_urls ='someurl.com/medical/patient-info)时,它会抓取它并点击下面的parse_item检查
from scrapy.spider import BaseSpider
from scrapy.contrib.spiders.init import InitSpider
from scrapy.http import Request, FormRequest
from scrapy.selector import HtmlXPathSelector
from tutorial.items import DmozItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
import urlparse
from scrapy import log
class MySpider(CrawlSpider):
items = []
failed_urls = []
duplicate_responses = []
name = 'myspiders'
allowed_domains = ['someurl.com']
login_page = 'someurl.com/login_form'
start_urls = 'someurl.com/' # Facing problem for the url here
rules = [Rule(SgmlLinkExtractor(deny=('logged_out', 'logout',)), follow=True, callback='parse_item')]
def start_requests(self):
yield Request(
url=self.login_page,
callback=self.login,
dont_filter=False
)
def login(self, response):
"""Generate a login request."""
return FormRequest.from_response(response,
formnumber=1,
formdata={'username': 'username', 'password': 'password' },
callback=self.check_login_response)
def check_login_response(self, response):
"""Check the response returned by a login request to see if we are
successfully logged in.
"""
if "Logout" in response.body:
self.log("Successfully logged in. Let's start crawling! :%s" % response, level=log.INFO)
self.log("Response Url : %s" % response.url, level=log.INFO)
return Request(url=self.start_urls)
else:
self.log("Bad times :(", loglevel=log.INFO)
def parse_item(self, response):
# Scrape data from page
hxs = HtmlXPathSelector(response)
self.log('response came in from : %s' % (response), level=log.INFO)
# check for some important page to crawl
if response.url == 'someurl.com/medical/patient-info' :
self.log('yes I am here', level=log.INFO)
urls = hxs.select('//a/@href').extract()
urls = list(set(urls))
for url in urls :
self.log('URL extracted : %s' % url, level=log.INFO)
item = DmozItem()
if response.status == 404 or response.status == 500:
self.failed_urls.append(response.url)
self.log('failed_url : %s' % self.failed_urls, level=log.INFO)
item['failed_urls'] = self.failed_urls
else :
if url.startswith('http') :
if url.startswith('someurl.com'):
item['internal_link'] = url
self.log('internal_link :%s' % url, level=log.INFO)
else :
item['external_link'] = url
self.log('external_link :%s' % url, level=log.INFO)
self.items.append(item)
self.items = list(set(self.items))
return self.items
else :
self.log('did not recieved expected response', level=log.INFO)
答案 0 :(得分:1)
我猜start_urls
必须是一个列表。
请尝试以下操作:start_urls = ['http://www.someurl.com/', ]