我在 Stack Overflow 上找到了一段关于如何把各个 URL 的输出拆分到单独 CSV 文件的代码，并据此写出了下面的代码。但是，我在代码中无法再使用 fields_to_export 了。我想知道如何设置要导出的字段，使其按 fields_to_export = ['itemA','itemB','itemC'] 导出。
from scrapy import signals
from scrapy.exporters import CsvItemExporter
import re
class appPipeline(object):
    """Scrapy pipeline that splits exported items into one CSV per name.

    A CsvItemExporter is created for every name extracted from
    data/listOfUrls.txt; process_item routes each item to the exporter
    whose name matches item['myItem'].
    """

    # Read the URL list once at class-definition time; the context
    # manager closes the file instead of leaking the handle.
    with open('data/listOfUrls.txt') as _url_file:
        urls = [line.strip() for line in _url_file]

    # NOTE(review): the original pattern had no capturing group, so
    # m.group(1) raised IndexError on every match; the whole match is
    # now group(1).  The pattern itself looks like a redacted
    # placeholder — confirm it against the real URL format.
    names = [m.group(1)
             for line in urls
             for m in [re.search(r'(https://www\.google\.co\.uk/)',
                                 line, re.M | re.I)]
             if m]

    def __init__(self):
        # name -> open file handle, and name -> CsvItemExporter;
        # both are populated in spider_opened.
        self.files = {}
        self.exporters = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Standard Scrapy hook: build the pipeline and wire the
        spider_opened / spider_closed signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open one CSV file and one exporter per name, configure the
        exported fields, and start exporting."""
        self.files = {name: open('results/' + name + '.csv', 'w+b')
                      for name in self.names}
        self.exporters = {name: CsvItemExporter(self.files[name])
                          for name in self.names}
        # Fix for the question: fields_to_export must be set on each
        # exporter INSTANCE, not on the dict that holds them.
        for exporter in self.exporters.values():
            exporter.fields_to_export = ['itemA', 'itemB', 'itemC']
            exporter.start_exporting()

    def spider_closed(self, spider):
        """Flush every exporter, then close the underlying files."""
        for exporter in self.exporters.values():
            exporter.finish_exporting()
        for f in self.files.values():
            f.close()

    def process_item(self, item, spider):
        """Route the item to the exporter named by item['myItem'];
        items with no matching exporter pass through unexported."""
        name = item['myItem']
        # Dict membership is O(1); the original rebuilt set(self.names)
        # on every item and could still KeyError into self.exporters.
        if name in self.exporters:
            self.exporters[name].export_item(item)
        return item
到目前为止,我一直试图覆盖项目中的键。我试图序列化项目,我一直在寻找如何通过键列表对字典中的值进行排序。他们都没有工作。
谢谢你的帮助。
答案 0（得分：0）
这一行:
self.exporters.fields_to_export = ['itemA','itemB','itemC']
将(无用的)属性添加到导出器的字典中。我很确定你的意思是:
# Assign the export-field list on every exporter instance.
wanted_fields = ['itemA', 'itemB', 'itemC']
for exp in self.exporters.values():
    exp.fields_to_export = wanted_fields
实际上我会把它与上下两行合并起来以提高效率：
# Build, configure, and start each exporter in a single pass.
for name in self.names:
    # Fixed: the original line had a stray extra ')' after
    # CsvItemExporter(self.files[name]) — a syntax error.
    self.exporters[name] = CsvItemExporter(self.files[name])
    self.exporters[name].fields_to_export = ['itemA', 'itemB', 'itemC']
    self.exporters[name].start_exporting()
没有必要硬塞进列表推导式里。