Spider crawls the data successfully but does not write it to the CSV file

Time: 2017-08-01 07:48:07

Tags: python web-scraping scrapy web-crawler bots

I added DOWNLOAD_DELAY = 2 and COOKIES_ENABLED = False to my settings. My spider still crawls the website, but it no longer writes any items to my CSV file. I don't think this is normal, because everything worked fine before I added these two settings... Can someone help me?

I call my spider from the command prompt with this line: scrapy crawl CDiscount -o items.csv
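The -o option relies on Scrapy's built-in feed exports. For reference, the same CSV output could also be configured in settings.py instead of on the command line (a minimal sketch using the standard FEED_URI and FEED_FORMAT settings, not something that is currently in my project):

# Feed-export configuration in settings.py, equivalent to "-o items.csv" (sketch only)
FEED_URI = 'items.csv'   # File the exported items are written to.
FEED_FORMAT = 'csv'      # Export format; 'json', 'jsonlines' and 'xml' also work.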

Here is my spider:

# -*- coding: utf-8 -*-
# Every import is done for a specific use
import scrapy                                       # Core Scrapy library; needed to use Scrapy in your code.
import re                                           # For regular expressions; the .re() calls below extract just part of the crawled text.
import numbers                                      # For numeric helpers.
from cdiscount_test.items import CdiscountTestItem  # The item class to return. Each item field is declared in items.py, in the cdiscount_test directory.
from urllib.parse import urljoin                    # urljoin turns relative URLs into absolute ones the spider can follow.
from scrapy.spiders import CrawlSpider, Rule        # Rule (together with LinkExtractor) lets the spider follow every URL on the crawled page.
from scrapy.linkextractors import LinkExtractor     # See above.

# Your spider
class CdiscountsellersspiderSpider(CrawlSpider):
    name = 'CDiscount'             # The name of your spider. You call it in the anaconda prompt.
    handle_httpstatus_list = [502, 503]
    allowed_domains = ['cdiscount.com']         # Domains the spider is allowed to crawl; it cannot visit pages outside them.
    start_urls = ['http://www.cdiscount.com']   # The first link you crawl.

    # Allow the spider to follow the URLs found on the current page.
    rules = (
        Rule(LinkExtractor(allow=()), callback='parse_1'),
    )

    # Your "crawl" function
    def parse_1(self, response):
        item = CdiscountTestItem() # The spider now knows that the items you want are stored in this item variable.

        # Extraction of the data you are looking for by following their paths in the HTML code.
        # response.xpath() is to find the location of the data you want (the HTML code) and .extract() or .re() is to extract the data from the HTML code.
        name = response.xpath('//div[@class="shtName"]/div[@class="shtOver"]/h1[@itemprop="name"]/text()').extract()
        country = response.xpath('//div[@class="shtName"]/span[@class="shTopCExp"]/text()').extract()

        # If the data are on the page, extract them.
        # To check, look at their length: if the variables holding the data are empty, the data are obviously not on that page.
        if ((len(name) != 0) & (len(country) != 0)):
            sales = response.xpath('//div[@class="shcSell"]//span//text()').re(r'([\d]*).*')
            nbproducts_tot = response.xpath('//div[@class="jsFacetListing mgFacetListing mgFOpen"][1]/div[@class="mgFAllList"][1]/ul/li/label/span/text()').re(r'\(([\d]*)\)')
            avgcost_tot = response.xpath('//div[@class="jsFacetListing mgFacetListing mgFOpen"][1]/div[@class="mgFAllList"][1]/ul/li/label/span/text()').re(r'\<?(.*[\d]*)\€+')
            id_tmp = response.xpath('//ul[@class="sCgvLegal"]/li/span[contains(.,"Immatriculation")]/following-sibling::text()').extract()
            address = response.xpath('//ul[@class="sCgvLegal"]/li/span[contains(.,"Adresse")]/following-sibling::text()').extract()
            business = response.xpath('//ul[@class="sCgvLegal"]/li/span[contains(.,"Sociale")]/following-sibling::text()').extract()

            # Compute the average cost, the total number of products the seller offers, and the turnover over the last 12 months.
            # First, the average cost:
                # avgcost is a list of lists. Each sub-list holds one of the price ranges that exist for this seller's products.
                # The price ranges and the number of products they contain are listed in the same order.
                # Then, sum the max and min of each price range and divide by two (or leave it as is, depending on whether the range sits between two numbers or is open-ended like "under 10" or "above 1000").
                    # That gives the average cost of each price range.
            avgcost = [avgcost_tot[i].split(' ') for i in range(len(avgcost_tot))]
            if (len(avgcost) != 0):
                for i in range(len(avgcost)):
                    avgcost[i].remove(avgcost[i][len(avgcost[i])-1])
                if len(avgcost[0]) != 3:
                    if len(avgcost[len(avgcost)-1]) != 3:
                        for i in range(1,len(avgcost)-1):
                            avgcost[i].remove(avgcost[i][1])
                            avgcost[i] = sum(map(int, avgcost[i]))/len(avgcost[i])
                        avgcost[len(avgcost)-1] = sum(map(int, avgcost[len(avgcost)-1]))/len(avgcost[len(avgcost)-1])
                    else:
                        for i in range(1, len(avgcost)):
                            avgcost[i].remove(avgcost[i][1])
                            avgcost[i] = sum(map(int, avgcost[i]))/len(avgcost[i])
                    avgcost[0] = sum(map(int, avgcost[0]))/len(avgcost[0])
                else:
                    if len(avgcost[len(avgcost)-1]) != 3:
                        for i in range(0,len(avgcost)-1):
                            avgcost[i].remove(avgcost[i][1])
                            avgcost[i] = sum(map(int, avgcost[i]))/len(avgcost[i])
                        avgcost[len(avgcost)-1] = sum(map(int, avgcost[len(avgcost)-1]))/len(avgcost[len(avgcost)-1])
                    else:
                        for i in range(0, len(avgcost)):
                            avgcost[i].remove(avgcost[i][1])
                            avgcost[i] = sum(map(int, avgcost[i]))/len(avgcost[i])

                # Pull out the number of products in each price range and put them in a list.
                nbproducts_list = list(map(int, nbproducts_tot))
                # Multiply the average cost of each price range by the number of products that range contains.
                    # This gives the estimated total value of the products in each price range.
                cost_list = [avgcost[i]*nbproducts_list[i] for i in range(len(avgcost))]

                # The total number of products sold by the seller.
                nbproducts = sum(nbproducts_list)
                # The average cost of one product sold by this seller.
                cost = int(sum(cost_list)/nbproducts)

                item['Average_cost'] = ''.join(str(cost)).strip()
                item['Products_number'] = ''.join(str(nbproducts)).strip()

                if (len(sales) != 0):
                    # Turnover over the last 12 months (average cost of one product times the number of products sold in the last 12 months).
                    turnover = cost*int(sales[0])

                    item['TurnOver_12months'] = ''.join(str(turnover)).strip()

            # Store the data at the right place in your output file
            item['Storefront_name'] = ''.join(name).strip()
            item['Business_name'] = ''.join(business).strip()
            item['Address'] = ''.join(address).strip()
            item['Country'] = ''.join(country).strip()
            item['ID'] = ''.join(id_tmp).strip()
            item['Sales_12months'] = ''.join(sales).strip()
            item['url_seller'] = ''.join(str(response.url))
            # Return the data
            yield item

        # Otherwise there was no data on this page: find all the links on it and run the first function on them.
        else:
            for sel in response.xpath('//html/body'):
                list_urls = sel.xpath('//a/@href').extract()
                list_iframe = response.xpath('//div[@class="ProductPriceBox-item detail"]/div/a/@href').extract()
                if (len(list_iframe) != 0):
                    for iframe_url in list_iframe:
                        yield scrapy.Request(response.urljoin(iframe_url), callback=self.parse_start_url, meta={'dont_redirect': True})
                for url in list_urls:
                    yield scrapy.Request(response.urljoin(url), callback=self.parse_start_url, meta={'dont_redirect': True})
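For reference, the CdiscountTestItem class imported above has to declare a field for every key the spider assigns. A minimal sketch of items.py (field names taken from the spider above, the rest is standard Scrapy boilerplate):

# -*- coding: utf-8 -*-
import scrapy

class CdiscountTestItem(scrapy.Item):
    # One scrapy.Field() per key used in parse_1 above.
    Storefront_name = scrapy.Field()
    Business_name = scrapy.Field()
    Address = scrapy.Field()
    Country = scrapy.Field()
    ID = scrapy.Field()
    Sales_12months = scrapy.Field()
    Average_cost = scrapy.Field()
    Products_number = scrapy.Field()
    TurnOver_12months = scrapy.Field()
    url_seller = scrapy.Field()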

Here are my settings:

BOT_NAME = 'cdiscount_test'

SPIDER_MODULES = ['cdiscount_test.spiders']
NEWSPIDER_MODULE = 'cdiscount_test.spiders'

DOWNLOAD_DELAY = 2
COOKIES_ENABLED = False
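
These two options could also be set per spider through the custom_settings class attribute instead of the project settings (a sketch; I assume the behaviour is the same either way):

class CdiscountsellersspiderSpider(CrawlSpider):
    name = 'CDiscount'
    # Per-spider overrides, equivalent to the two lines above in settings.py.
    custom_settings = {
        'DOWNLOAD_DELAY': 2,
        'COOKIES_ENABLED': False,
    }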

EDIT:

Now my spider doesn't even scrape the website any more. It just crawls everything and finds all the links to follow, but it doesn't get any data, and I don't understand why. I haven't changed anything since it last worked.

Thanks!

0 Answers:

There are no answers yet.