为什么scrapy没有将数据存储到mongodb?

时间:2015-06-08 05:20:52

标签: python mongodb web-scraping scrapy scrapy-spider

我的主要文件:

from urlparse import urljoin

import scrapy
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.exceptions import CloseSpider
from scrapy.http import Request


class Product(scrapy.Item):
    """Container for one scraped review from mouthshut.com."""
    brand = scrapy.Field()
    # BUG FIX: `title` was declared twice in the original; the duplicate
    # assignment was redundant (the second simply rebound the same name).
    title = scrapy.Field()
    link = scrapy.Field()
    name = scrapy.Field()
    date = scrapy.Field()
    heading = scrapy.Field()
    data = scrapy.Field()
    Model_name = scrapy.Field()


class aqaqspider(CrawlSpider):
    """Crawl Yu Yureka review pages on mouthshut.com and yield Product items.

    parse_start_url() scrapes the review listing on each page and follows
    each individual review link; anchor_page() attaches the full review
    body to the item and yields it to the pipelines.
    """
    name = "mouth_shut_new"
    allowed_domains = ["mouthshut.com"]
    start_urls = [
        "http://www.mouthshut.com/mobile-phones/Yu-Yureka-reviews-925723476",
    ]
    rules = (
        # Follow pagination links (URLs containing "-page-"); each page is
        # handled by parse_start_url so page 1 and later pages share code.
        Rule(
            SgmlLinkExtractor(allow=('.*\-page-.*',)),
            callback="parse_start_url",
            follow=True),
    )

    def parse_start_url(self, response):
        """Yield a Request per review on the page, carrying a partial item.

        Raises CloseSpider when a page has no reviews, stopping the crawl.
        """
        products = response.xpath('//div[@id="allreviews"]/ul/li')
        if not products:
            raise CloseSpider("No more products!")

        for product in products:
            item = Product()
            # NOTE: extract()[0] raises IndexError if a selector matches
            # nothing — assumes every review <li> has these child nodes.
            item['name'] = product.xpath('.//li[@class="profile"]/div/a/span/text()').extract()[0]
            item['title'] = product.xpath('.//div[@class="reviewtitle fl"]/strong/a/text()').extract()[0]
            item['date'] = product.xpath('.//div[@class="reviewrate"]//span[@class="datetime"]/span/span/span/text()').extract()[0]
            item['link'] = product.xpath('.//div[@class="reviewtitle fl"]/strong/a/@href').extract()[0]
            if item['link']:
                if 'http://' not in item['link']:
                    # BUG FIX: urljoin was called without ever being
                    # imported (NameError); now imported at module level.
                    item['link'] = urljoin(response.url, item['link'])
                yield scrapy.Request(item['link'],
                                     meta={'item': item},
                                     callback=self.anchor_page)
        # Removed dead code: the original built an `items` list that was
        # never yielded or returned (a generator cannot return a value).

    def anchor_page(self, response):
        """Complete the item passed via meta with the review body, then yield it."""
        old_item = response.request.meta['item']
        old_item['data'] = response.xpath('.//div[@itemprop="description"]/p/text()').extract()
        yield old_item

    # yield Request(url="http://www.mouthshut.com/Product/mobileListing.aspx?cid=925602729&f1=1&view=list&nsort1=0&nsort2=2015-06-01%2016:12:23.000&ntype=3&mpad=1&ran=0.3691624044781373&dcal=Intex%20Aqua%20Xtreme" ,
                      # headers={"Referer": "http://www.mouthshut.com/mobile-phones.php", "X-Requested-With": "XMLHttpRequest"},
                      # callback=self.parse, 
                      # dont_filter=True)

我的settings.py:

# -*- coding: utf-8 -*-

# Scrapy settings for mouth project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#

# Project identity and spider discovery.
BOT_NAME = 'mouth'

SPIDER_MODULES = ['mouth.spiders']
NEWSPIDER_MODULE = 'mouth.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'mouth (+http://www.yourdomain.com)'


# Register the MongoDB pipeline; 300 is its order in the pipeline chain
# (lower numbers run first).
ITEM_PIPELINES = {'mouth.pipelines.MongoDBPipeline':300}

# Connection parameters read by MongoDBPipeline via scrapy.conf.settings.
MONGODB_HOST = 'localhost' # Change in prod
MONGODB_PORT = 27017 # Change in prod
MONGODB_DATABASE = "mobiles_complaints" # Change in prod
MONGODB_COLLECTION = "Yu_Yureka"
MONGODB_USERNAME = "" # Change in prod
MONGODB_PASSWORD = "" # Change in prod

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'consumer (+http://www.yourdomain.com)'

我的pipelines.py:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


import pymongo
from scrapy.conf import settings
from scrapy import log


class MongoDBPipeline(object):
    """Item pipeline that opens the MongoDB collection named in settings."""

    def __init__(self):
        # pymongo.Connection is the legacy (pre-3.0) client class.
        client = pymongo.Connection(settings['MONGODB_HOST'], settings['MONGODB_PORT'])
        database = client[settings['MONGODB_DATABASE']]
        self.collection = database[settings['MONGODB_COLLECTION']]

# NOTE(review): this def sits at MODULE level, not indented inside
# MongoDBPipeline, so Scrapy's pipeline machinery never finds or calls it —
# which is why nothing is written to MongoDB (this is the bug the accepted
# answer below identifies).
def process_item(self, item, spider):
    # Insert the item as a plain dict and log the destination.
    self.collection.insert(dict(item))
    log.msg("Item wrote to MongoDB database {}, collection {}, at host {}, port {}".format(
        settings['MONGODB_DATABASE'],
        settings['MONGODB_COLLECTION'],
        settings['MONGODB_HOST'],
        settings['MONGODB_PORT']))
    return item

我运行了 scrapy crawl mouth_shut_new。但是我的数据没有存储到数据库中。按预期,输出中应该显示数据已写入 Mongo 的哪个数据库和集合。我缺少什么?

2 个答案:

答案 0 :(得分:2)

process_item()方法没有正确缩进,应该是:

class MongoDBPipeline(object):
    """Pipeline writing every scraped item into the configured MongoDB collection."""

    def __init__(self):
        # Open the collection once when the pipeline is instantiated.
        client = pymongo.Connection(settings['MONGODB_HOST'], settings['MONGODB_PORT'])
        database = client[settings['MONGODB_DATABASE']]
        self.collection = database[settings['MONGODB_COLLECTION']]

    def process_item(self, item, spider):
        """Insert the item, log where it went, and hand it on unchanged."""
        self.collection.insert(dict(item))
        message = "Item wrote to MongoDB database {}, collection {}, at host {}, port {}".format(
            settings['MONGODB_DATABASE'],
            settings['MONGODB_COLLECTION'],
            settings['MONGODB_HOST'],
            settings['MONGODB_PORT'])
        log.msg(message)
        return item

答案 1 :(得分:0)

你没有在回调函数(callback="parse_start_url")中 yield 该项,你应该这样做:

# NOTE(review): "parse_start_ul" looks like a typo — the callback registered
# in the spider's Rule is "parse_start_url"; presumably that name was meant.
# The `...` / `....` lines are placeholder pseudocode, not runnable Python.
def parse_start_ul(self, response):
    ...

    for product in products:
        item = Product()
        ....
        # The key point of this answer: the item must be yielded.
        yield item