scrapy结果只能在循环中保存一个项目

时间:2017-09-14 11:00:56

标签: python scrapy scrapy-spider

import scrapy
from universities.items import UniversitiesItem


def clean_full_name(full_name):
    sp = full_name.split(',')
    last_name = sp[0].strip()
    first_name = sp[1].replace('\r\n', '').strip()
    first_name = ' '.join(first_name.split()[:-1]).strip()
    return ' '.join([last_name, first_name])


class DerexlUniversity(scrapy.Spider):
    name = 'drexel_university'
    allowed_domains = ['drexel.edu']
    start_urls = ['http://drexel.edu/search?t=people&q=']

    def __init__(self):
        self.last_name = ''

    def parse(self, response):
        with open('kw.txt') as file_object:
            last_names = file_object.readlines()

        for ln in ['Chong', 'Zhao']:
            self.last_name = ln.strip()
            print('-----------------------------------------------------')
            print("scraping last name: ", self.last_name)
            query = response.url + self.last_name
            yield scrapy.Request(query, callback=self.parse_item)

    def parse_item(self, response):
        self.logger.info('This is item page %s', self.last_name)
        result_rows = response.xpath('//table//tr[@class="result-row"]')
        result_details = response.xpath('//table//tr[@class="result-details"]')

        for row, detail in zip(result_rows, result_details):
            full_name = row.xpath('.//span[@class="fullname"]/text()').extract_first()
            if full_name:
                full_name = clean_full_name(full_name)
                if self.last_name in full_name.split():
                    item = UniversitiesItem()
                    item['fullname'] = full_name
                    item['university'] = 'Drexel University'
                    try:
                        item['email'] = row.xpath('.//span[@class="email-address"]/a/@href').extract_first()[7:]
                        item['phone'] = row.xpath('.//span[@class="phone-numbers"]/a/@href').extract_first()[4:]

                        person_detail = detail.xpath('.//span[@class="person-detail"]/text()').extract()
                    except ValueError:
                        pass
                    else:
                        person_detail_clean = ', '.join([pd.strip() for pd in person_detail[0].split(',')][1:])
                        item['person_detail'] = person_detail_clean

                    yield item

for循环中有两个关键词,即' Chong'和'赵'我试图将结果保存在CSV文件中。我每次都在parse_item函数的for循环中生成一个新项。但是,只有'赵'正在得救。我无法弄清楚原因。

1 个答案:

答案 0 :(得分:0)

您的问题与let incomeCalc = Decimal((arc4random_uniform(50)+75)) / 100 print("incomeCalc") print(incomeCalc) 有关。您不应该在响应中使用类级别变量。你应该self.last_name。以下对我有用

response.meta