Scrapy is not executing in the correct order

Time: 2016-08-31 22:44:14

Tags: python css web scrapy web-crawler

I am currently working on a web scraper that should visit a list of websites from a directory, visit each site's CSS stylesheets, check for an @media tag (a basic way of checking for responsive design; I know there are other cases to consider), and print all of the websites that do not use responsive design to a file.

I am fairly sure that my method of actually checking the CSS for an @media tag works correctly, but the spider is not visiting all of the CSS files before deciding whether it has found one with an @media tag. I have a test file that logs debug output as the program runs, and it shows strange patterns, such as finishing checking all of the CSS files and only then printing out what was found in them, which should not be happening.

I was hoping someone could look over my code and help me figure out why this is not happening in the order I want. For reference, the goal is to:

  1. Visit a website from the list
  2. Visit every CSS file in the head element of that website's HTML
  3. If an @media tag is found, we are done and the site uses responsive design
  4. If not, move on and check more CSS files
  5. If no CSS file contains an @media tag, the site does not use responsive design and should be added to the list
Here is my code (not everything works perfectly; for example, the program times out because I haven't worked with TimeoutError yet, but for the most part I feel it should do the job of evaluating the websites correctly, and it isn't doing that):

    import scrapy
    import re
    import os.path
    from scrapy.linkextractors import LinkExtractor
    from scrapy.contrib.spiders import CrawlSpider, Rule
    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
    from twisted.internet.error import TimeoutError
    import time
    
    class LCCISpider(CrawlSpider):
        name = "lcci"
        start_urls = ["http://www.lancasterchamber.com/busdirectory.aspx?mode=category"]
        #Calls parse_item for every category link on main page
        rules = (Rule(SgmlLinkExtractor(restrict_xpaths=('//div[@id="catListingResults"]/table/tr')), 
                callback = 'parse_item', follow = True),)
        website_list = []
        found_media = False
    
        #Called for each category
        def parse_item(self, response):
            #For each site on the page, calls parse_website
    
    
            sites = response.xpath('//div[@id="busListingResults"]/table/tr')
            for site in sites:
                urls = site.xpath('.//td/a[4]/@href').extract()
                for url in urls:
                    if len(url) == 0:
                        continue
                    else:
                        new_site = response.urljoin(url)
                        yield scrapy.Request(new_site, callback=self.parse_website,
                                                        errback=self.errback_website)
    
    
    
    
        def parse_website(self, response):
    
            f = open('output2.txt', 'a')
            f.write("NOW VISITING")
            f.flush()
            f.write(response.url)
            f.flush()
            f.write("\n")
            f.flush()
            f.close()
            #reset found_media to false for each website
            self.found_media = False
            #for every link in the header, check potential css for @media tag
            for href in response.css("head > link::attr('href')"):
                url = response.urljoin(href.extract())
                #if @media tag has not been found, continue checking css
                if self.found_media == False:
                    #Call check_css for the url of the css file
                    yield scrapy.Request(url, callback=self.check_css,
                                              errback=self.errback_website)
    
                    f = open('output2.txt', 'a')
                    f.write("step\n")
                    f.flush()
                    f.close()
                else:
                    break
    
            #if no @media tag is found in any link in the header, add the url to the website_list
    
            if self.found_media == False:
                #self.website_list.append(response.url)
                f = open('output2.txt', 'a')
                f.write("No @media tag in")
                f.flush()
                f.write(response.url)
                f.flush()
                f.write("\n")
                f.flush()
                f.close()
    
                f = open('outputfalse2.txt', 'a')
                f.write(response.url)
                f.write("\n")
                f.close()
    
            else:
                f = open('outputtrue.txt', 'a')
                f.write(reponse.url)
                f.write("\n")
                f.close()
    
        def check_css(self, response):
    
            #Just a way of converting url into a string, the ".txt" is otherwise meaningless
            string = str(response.url)
            f = open('output2.txt', 'a')
            f.write("Checking CSS in ")
            f.write(response.url)
            f.write("\n")
            f.flush()
            f.close()
            #only perform regex search if it's a .css file
            if (string[-4:] == ".css"): 
                media_match = re.search(r'@media', response.body, flags=0)
                if media_match != None:
                    f = open('output2.txt', 'a')
                    f.write("found @media tag in " + response.url + "\n")
                    f.flush()
                    #If an @media tag is found, set found_media to True
                    self.found_media = True
                    f.close()
            else:
                f = open('output2.txt', 'a')
                f.write("not css")
                f.flush()
                f.close()
    
        def errback_website(self, failure):
            if failure.check(TimeoutError):
                request = failure.request
                self.logger.error = ('TimeoutError on %s', request.url)
    

1 Answer:

Answer (score: 1)

I went through it and couldn't help but clean up the whole thing. Here is the fully cleaned-up code. There are barely any changes logic-wise. What it does now is:

  1. Connect to the website
  2. Get all of the categories
  3. Get all of the websites from the categories
  4. Connect to the first page of every website
  5. Find .css links
  6. Connect to the .css links
     6.1 If the @media regex matches, yield an item with the css url and the website

The only issue here is that, because of scrapy's asynchronous nature, you will end up with a lot of duplicates, since you may be crawling multiple .css files at the same time. For that we can use a simple pipeline that detects and drops the duplicates.

For future reference, you shouldn't use file writing for debugging. Take a look at the scrapy shell; you can even use it inside parse to open a shell during the crawl, like:

    def parse(self, response):
        from scrapy.shell import inspect_response  # needed for the shell to open here
        inspect_response(response, self)
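
Scrapy's per-spider logger is another lightweight option for this kind of tracing instead of ad-hoc file writes; a minimal sketch (the method and message are just illustrative):

    def parse_website(self, response):
        # every Scrapy spider already has a configured self.logger
        self.logger.debug('now visiting %s', response.url)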
    

Here is the working spider:

    import re
    from scrapy.spiders import CrawlSpider, Rule
    from scrapy.exceptions import DropItem
    from scrapy.linkextractors import LinkExtractor
    from twisted.internet.error import TimeoutError
    from scrapy import Request
    
    
    class DupePipeline(object):
        def __init__(self):
            self.known_websites = set()
    
        def process_item(self, item, spider):
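            # drop items for websites we have already seen; only the first matching stylesheet per site is kept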
            if item['website'] in self.known_websites:
                raise DropItem('duplicate')
            self.known_websites.add(item['website'])
            return item
    
    
    class LCCISpider(CrawlSpider):
        name = "lcci"
        start_urls = ["http://www.lancasterchamber.com/busdirectory.aspx?mode=category"]
        custom_settings = {
            'ROBOTSTXT_OBEY': False,
            'ITEM_PIPELINES': {
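                # note: this dotted path is a placeholder; point it at wherever DupePipeline lives in your project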
                'myproject.spiders.spider.DupePipeline': 666,
            }
        }
        # Calls parse_item for every category link on main page
        rules = (Rule(LinkExtractor(restrict_xpaths=['//div[@id="catListingResults"]/table/tr']),
                      callback='parse_item', follow=True),)  # why follow?
    
        # Called for each category
        def parse_item(self, response):
            # For each site on the page, calls parse_website
            sites = response.xpath('//div[@id="busListingResults"]/table/tr')
            for site in sites:
                urls = site.xpath('.//td/a[4]/@href').extract()
                for url in urls:
                    if not url:
                        continue
                    new_site = response.urljoin(url)
                    yield Request(new_site,
                                  callback=self.parse_website,
                                  errback=self.errback_website)
    
        def parse_website(self, response):
            # for every link in the header, check potential css for @media tag
            for href in response.css("head > link::attr('href')").extract():
                if not href.endswith('.css'):  # only css files
                    continue
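                # pass the originating site along in meta so check_css can report which website matched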
                yield Request(response.urljoin(href),
                              meta={'website': response.url},
                              callback=self.check_css,
                              errback=self.errback_website)
    
        def check_css(self, response):
            media_match = re.search(r'@media', response.text)  # match on the decoded text so the str pattern works on bytes-free responses
            if media_match:
                # return item!
                yield {'url': response.url,
                       'website': response.meta['website']}
    
        def errback_website(self, failure):
            if failure.check(TimeoutError):
                request = failure.request
                self.logger.error('TimeoutError on %s', request.url)
    

After running it for a few minutes with scrapy crawl lcci -o test.json I got these results: http://pastebin.com/raw/kfsTKqUY
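
If you then want that back as a plain list of websites that do use an @media rule, a small post-processing sketch over the exported file could look like this (it assumes the item format yielded above and the test.json name from the command):

    import json

    # items exported by scrapy crawl lcci -o test.json
    with open('test.json') as f:
        items = json.load(f)

    # each item pairs a matching stylesheet URL with the website it came from
    responsive_sites = sorted({item['website'] for item in items})
    for site in responsive_sites:
        print(site)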
