Question

我创建了一个网络抓取工具，用于提取在数字图书馆（sample document）中发布的研究论文的信息。

基本上我正在提取每篇论文的标题，摘要和参考文献列表，并将它们存储在文本文件中。对所有引用的论文也重复这个过程。

我使用了一个队列来存储文档ID。
我需要从至少5000篇论文中提取这些信息，但程序太慢，需要3个小时才能完成250-300篇论文。

提高抓取工具速度的可行方法有哪些？

以下是代码：

# _*_ coding:utf-8 _*_
import urllib2
import json
import Queue

# IDs of the papers that have already been scheduled for crawling.
crawled = list()

# Shared output file; it is opened for writing once, when the module loads.
fo = open("paper.txt", "w")

class Paper(object):
    """A document on IEEE Xplore, identified by its document ID.

    Creating an instance immediately downloads the document's title and
    abstract, so construction performs one HTTP request and may raise
    whatever ``urllib2.urlopen`` / ``json.load`` raise, or ``KeyError`` when
    the response lacks a ``title`` or ``abstract`` key.
    """

    BASE_URL = "http://ieeexplore.ieee.org/rest/document/{0}/{1}"

    # Cache for fetch_ieee_references(); None means "not downloaded yet".
    # Kept as a class-level default so instances always have the attribute.
    _references = None

    def __init__(self, paper_id):
        """Store *paper_id* and download the paper's title and abstract."""
        self.paper_id = paper_id
        self.title, self.abstract = self.fetch_data()

    def _fetch_json(self, section):
        """Download and decode one REST *section* of this paper.

        *section* is the last URL component, e.g. ``"abstract"`` or
        ``"references"``. The decoded JSON value is returned.
        """
        data_url = self.BASE_URL.format(self.paper_id, section)
        response = urllib2.urlopen(data_url)
        try:
            # Parse straight from the response object instead of reading all
            # lines and re-joining them first.
            return json.load(response)
        finally:
            response.close()

    @staticmethod
    def _reference_ids(data):
        """Return the document IDs linked from a decoded references payload.

        Entries without a ``links.documentLink`` value, or with a malformed
        one, are skipped. Anything that is not a dict holding a list under
        ``"references"`` yields an empty list.
        """
        if not isinstance(data, dict):
            return []
        entries = data.get("references")
        if not isinstance(entries, list):
            return []
        ids = []
        for ref in entries:
            try:
                ref_link = ref["links"]["documentLink"]
                ref_paper_id = ref_link.split("/")[-1]
            except (KeyError, TypeError, AttributeError):
                # Reference has no usable link to another document.
                continue
            if ref_paper_id:
                ids.append(ref_paper_id)
        return ids

    def fetch_data(self):
        """Download this paper's metadata and return ``(title, abstract)``."""
        data = self._fetch_json("abstract")
        return data["title"], data["abstract"]

    def fetch_ieee_references(self):
        """Return a list of ``Paper`` objects for the linked references.

        The reference list is downloaded only on the first call; later calls
        reuse the cached result, so calling this method repeatedly does not
        repeat the HTTP requests. References that cannot be downloaded are
        left out (best effort). Errors while downloading the reference list
        itself propagate to the caller.
        """
        if self._references is None:
            data = self._fetch_json("references")
            references = []
            for ref_paper_id in self._reference_ids(data):
                try:
                    references.append(Paper(ref_paper_id))
                except Exception:
                    # Best effort: one unreachable or malformed reference
                    # must not discard the others. KeyboardInterrupt is not
                    # an Exception subclass and still stops the crawl.
                    continue
            self._references = references
        # Hand out a copy so callers cannot modify the cached list.
        return list(self._references)

    def extract_paper(self):
        """Print this paper and persist it to disk.

        Writes the title and abstract to ``<paper_id>.txt`` (UTF-8) and
        appends one ``<paper_id> <reference_id>`` line per reference to the
        module-level ``fo`` file. Failures are reported on stdout instead of
        being raised, so a single bad paper does not stop the crawl.
        """
        import io  # function-scope import: only needed for the UTF-8 write

        try:
            print("Paper ID")
            print(self.paper_id)
            fname = str(self.paper_id) + ".txt"
            print("")
            print("Title")
            print(self.title)
            print("Abstract")
            print(self.abstract)
            # Explicit encoding so non-ASCII titles/abstracts can be written;
            # the context manager guarantees the file gets closed.
            with io.open(fname, "w", encoding="utf-8") as fcon:
                fcon.write(u"{0}\n".format(self.title))
                fcon.write(u"{0}\n".format(self.abstract))
            print("References")
            for ref in self.fetch_ieee_references():
                print(u"{0} {1}".format(ref.paper_id, ref.title))
                fo.write("{0} {1}\n".format(self.paper_id, ref.paper_id))
        except Exception as err:
            print("Failed to extract paper {0}: {1!r}".format(self.paper_id, err))


def new_func(n_id=6639344, max_papers=None):
    """Crawl papers breadth-first, following references from a seed paper.

    Every dequeued paper is downloaded and written out through
    ``Paper.extract_paper()``; the IDs of its references that have not been
    seen before are recorded in the module-level ``crawled`` list and queued.

    Args:
        n_id: Document ID of the paper the crawl starts from.
        max_papers: Stop after this many papers have been processed;
            ``None`` (the default) crawls until the queue is empty.
    """
    from collections import deque  # local import keeps the block self-contained

    # Reference IDs are taken from URLs and are therefore strings; store the
    # seed as a string too so it is recognised when another paper cites it.
    seed = str(n_id)

    # The set gives O(1) membership tests; ``crawled`` is still appended to
    # so the module-level record keeps its discovery order.
    seen = set(crawled)
    seen.add(seed)
    crawled.append(seed)

    pending = deque([seed])
    processed = 0
    while pending:
        if max_papers is not None and processed >= max_papers:
            break
        p_id = pending.popleft()
        paper = Paper(p_id)
        paper.extract_paper()
        processed += 1
        for ref in paper.fetch_ieee_references():
            ref_id = ref.paper_id
            if ref_id not in seen:
                seen.add(ref_id)
                crawled.append(ref_id)
                pending.append(ref_id)

# Start the crawl with the default arguments when this script is executed.
new_func()

Answer 1

正如其他用户已经提到的，速度主要取决于HTTP请求的耗时，因此您受制于站点的服务器。为了加快速度，您可以把待抓取的文档分配给多个进程并行处理。另外，我不明白您为什么要先读取html再调用json.loads；您可以直接对响应对象使用json.load，这会稍微快一点。

提高网络抓取工具的速度

1 个答案: