How to format the output of a Scrapy crawler

Date: 2019-09-24 23:02:17

Tags: python excel csv web-crawler output

I have a web crawler that has been working for some time. I am trying to add a semantic component to it, and I think I have succeeded, because the crawler now matches keywords while it searches; I can see it doing so in the CMD prompt. However, I am trying to figure out how to print two separate outputs to two separate files. For the first file, I need the web pages and their associated keywords (what I can currently see in the CMD prompt). For the second, I need the output I originally got from the non-semantic crawler, which lists the pages each web page links to. I am having trouble achieving this.

import re
from urllib.parse import urljoin

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.item import Item

def find_all_substrings(string, sub):
    # return the start index of every occurrence of sub in string
    starts = [match.start() for match in re.finditer(re.escape(sub), string)]
    return starts
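
# For the first CSV described below, a per-page occurrence count is just the
# length of the returned list, e.g.:
#   find_all_substrings("spam and spam", "spam")       -> [0, 9]
#   len(find_all_substrings("spam and spam", "spam"))  -> 2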

class GraphSpider(CrawlSpider):

    name = "examplespider"
    custom_settings = {
        'DEPTH_LIMIT': '2',
    }
    allowed_domains = []
    start_urls = (
        'https://www.exampleurl.com',
    )

    rules = (
        Rule(LinkExtractor(allow=r'/'), callback='check_buzzwords', follow=True),
    )

    crawl_count = 0
    words_found = 0                                 

    def parse_item(self, response):
        # yield one item per outgoing link: the page and the page it links to
        for href in response.xpath('//a/@href').getall():
            if not href.lower().startswith("javascript"):
                yield {
                    'url': response.url,
                    # 'http_status': response.status,
                    'linkedurl': urljoin(response.url, href),
                }

    def check_buzzwords(self, response):

        self.__class__.crawl_count += 1

        wordlist = [
            "keyword1",
            "keyword2",
        ]

        url = response.url
        # response.text decodes the body with the response's declared encoding
        data = response.text

        for word in wordlist:
            substrings = find_all_substrings(data, word)
            for pos in substrings:
                self.__class__.words_found += 1
                print(word + ";" + url + ";")
        return Item()

    def _requests_to_follow(self, response):
        # only follow links from responses that actually have a text encoding
        if getattr(response, "encoding", None) is not None:
            return CrawlSpider._requests_to_follow(self, response)
        return []

    def _response_downloaded(self, response):
        # save every downloaded page to disk before running the rule callback
        filename = response.url.split("/")[-1] + '.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
        rule = self._rules[response.meta['rule']]
        return self._parse_response(response, rule.callback, rule.cb_kwargs, rule.follow)
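
One possible way to get both files, sketched below, is to subclass the spider, open a csv.writer per output file when it starts, and close the files in Scrapy's closed() hook. The class name CsvGraphSpider and the file names keywords.csv and links.csv are placeholders of my own, not anything from the original code:

import csv

class CsvGraphSpider(GraphSpider):
    """Sketch: variant of the spider above that writes both CSV files."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # placeholder file names, one csv.writer per output file
        self.kw_file = open('keywords.csv', 'w', newline='', encoding='utf-8')
        self.link_file = open('links.csv', 'w', newline='', encoding='utf-8')
        self.kw_writer = csv.writer(self.kw_file)
        self.link_writer = csv.writer(self.link_file)

    def check_buzzwords(self, response):
        # first file: keyword, website, number of occurrences
        data = response.text
        for word in ["keyword1", "keyword2"]:
            count = len(find_all_substrings(data, word))
            if count:
                self.kw_writer.writerow([word, response.url, count])
        # second file: page, linked page (reuses parse_item above)
        for item in self.parse_item(response):
            self.link_writer.writerow([item['url'], item['linkedurl']])
            yield item

    def closed(self, reason):
        # Scrapy calls closed() when the spider finishes
        self.kw_file.close()
        self.link_file.close()

With this, the console print in check_buzzwords is no longer needed; the rows land in the two files instead.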

Basically, I need one CSV file with the following:

keyword1, website1, number of occurrences

keyword1, website2, number of occurrences

keyword2, website3, number of occurrences

and a second CSV file with:

website1, website2

website1, website3

website2, website4

I am still learning Python, so I would appreciate any help in getting this working.
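
Alternatively, the file handling could sit in an item pipeline rather than in the spider, with check_buzzwords yielding keyword dicts (for example {'word': ..., 'url': ..., 'count': ...}) and parse_item yielding its link dicts unchanged. A rough sketch with the same placeholder file names; the pipeline would still have to be registered in the project's ITEM_PIPELINES setting:

import csv

class TwoFileCsvPipeline:
    """Sketch: route keyword items and link items to separate CSV files."""

    def open_spider(self, spider):
        self.kw_file = open('keywords.csv', 'w', newline='', encoding='utf-8')
        self.link_file = open('links.csv', 'w', newline='', encoding='utf-8')
        self.kw_writer = csv.writer(self.kw_file)
        self.link_writer = csv.writer(self.link_file)

    def close_spider(self, spider):
        self.kw_file.close()
        self.link_file.close()

    def process_item(self, item, spider):
        if 'word' in item:
            # keyword row: keyword, website, number of occurrences
            self.kw_writer.writerow([item['word'], item['url'], item['count']])
        elif 'linkedurl' in item:
            # link row: page, linked page
            self.link_writer.writerow([item['url'], item['linkedurl']])
        return item

# settings.py (the module path is a placeholder):
# ITEM_PIPELINES = {'myproject.pipelines.TwoFileCsvPipeline': 300}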
