Question

在判断是否应该使用并行编程时，我一直卡在下面这段代码上。

代码采用包含两列的文本文件：第一列包含一个单词，第二列包含一个URL。

在String_stripper_function()中，文本文件的每一行都会按特定方式进行格式化（因此才有那么多对replace()函数的调用）。

然后我们在第一列和第二列之间进行比较，如果第一列中的单词包含在第二列的url中，那么该行将被写入一个新文件（称之为Result.txt）

此外，如果第一列中的单词包含4个大写字母，而第二列中的URL包含数字，则将该行添加到同一个新文件（Result.txt）。

现在这段代码可以正常工作，我已经检查过很多次，但是在一台配备16GB内存的i7计算机上运行非常慢：<100> 行就要花费几个小时。

该文件包含1923014行（或者说1923014条记录，随您怎么称呼）；文件大小为97.9 MB。

所以我的问题是：就性能而言，我的代码有什么问题？

# -*- coding: utf-8 -*-
"""
Created on Sun Apr 12 16:44:35 2015

@author: Steve
"""
import re
import multiprocessing as mp
import numpy as np
def hasNumbers(inputString):
    """Report whether ``inputString`` contains at least one digit.

    Every element produced by iterating ``inputString`` is tested with its
    ``isdigit()`` method.  Returns True on the first hit and False when the
    input is empty or no element is a digit.
    """
    for symbol in inputString:
        if symbol.isdigit():
            return True
    return False
    #This code strips the urls into their main domain
def url_stripper(url):
    """Break a URL into its '/'-separated pieces with the scheme text removed.

    The substrings 'http://', 'https://' and then any remaining 'http' are
    deleted wherever they occur, every '/' is turned into a space, and the
    result is split on whitespace, so empty segments are dropped.

    Returns a list of the remaining pieces (the host comes first for a
    typical URL); the list is empty when nothing is left.
    """
    substitutions = (
        ('http://', ''),
        ('https://', ''),
        ('http', ''),
        ('/', ' '),
    )
    # Order matters: each replacement works on the output of the previous one.
    for old, new in substitutions:
        url = url.replace(old, new)
    return url.split()


# Ordered (old, new) substitutions applied to a title before it is compared
# with its URL.  Each pass runs on the output of the previous one, so the
# order must not be changed.
_TITLE_SUBSTITUTIONS = (
    ('_', ' '),
    ('-', ' '),
    ('(', ''),
    (')', ''),
    ('The ', ''),
    (' The ', ''),
    (', The ', ''),
    (' ,The ', ''),
    (',The ', ''),
    ('...', ''),
    ('A ', ''),
    (' A ', ''),
    (', A ', ''),
    (' ,A ', ''),
    (',A ', ''),
    ('An ', ''),
    (' An ', ''),
    (', An ', ''),
    (' ,An ', ''),
    (',An ', ''),
    (',', ''),
)


def _clean_title(title):
    """Return ``title`` after applying every entry of _TITLE_SUBSTITUTIONS."""
    for old, new in _TITLE_SUBSTITUTIONS:
        title = title.replace(old, new)
    return title


def String_stripper_function(input_path="homepages.txt",
                             output_path="results.txt"):
    """Filter a two-column "title url" file down to the lines that match.

    Each input line is split on whitespace; the first field is the title and
    the second is the URL.  A line is written to the output (once per distinct
    title, in input order, as ``'<title> \\t <url>\\n'``) when the cleaned
    title has at least one word and either

    * some word of the cleaned title occurs, case-insensitively, in the
      URL, or
    * the cleaned title has exactly four upper-case letters and the first
      piece returned by ``url_stripper`` contains a digit.

    Args:
        input_path: file to read; defaults to the original "homepages.txt".
        output_path: file to (over)write; defaults to "results.txt".

    Lines with fewer than two fields are skipped instead of raising
    IndexError.  'lol no url fam' is printed when ``url_stripper`` returns
    nothing for a URL.
    """
    # A set gives constant-time duplicate checks; the previous np.append /
    # "not in array" combination rescanned and copied every result per line.
    seen_titles = set()

    with open(input_path) as infile, open(output_path, "w") as outfile:
        for line in infile:
            fields = line.split()
            if len(fields) < 2:
                continue
            word_original, url_original = fields[0], fields[1]

            url_parts = url_stripper(url_original)
            if url_parts:
                domain = url_parts[0]
            else:
                print('lol no url fam')
                domain = ''

            if word_original in seen_titles:
                continue

            # NOTE: the earlier regex pass for text in parentheses ran after
            # the parentheses had already been removed, so it could never
            # match; it is omitted here without changing the output.
            title = _clean_title(word_original)
            words = title.split()
            if not words:
                continue

            url_lower = url_original.lower()
            capital_letters = sum(1 for c in title if c.isupper())
            four_caps_and_digit = capital_letters == 4 and hasNumbers(domain)

            if four_caps_and_digit or any(w.lower() in url_lower
                                          for w in words):
                seen_titles.add(word_original)
                outfile.write('%s \t %s\n' % (word_original, url_original))

# Run the filter only when this file is executed directly as a script.
if __name__=="__main__":
         String_stripper_function()

Answer 1

首先，这个问题应该在Code Review上提出。

我对您的代码进行了一些更改，以考虑一些假设。

在一个大字符串上一次性调用str.replace()，比逐行迭代调用要快。
在url_stripper中，我只查找“://”，因为我认为它只会出现在URL的开头。
与其先用.replace("/"," ")再用.split()，不如直接用.split("/")，这样应该更快。split可以指定自己的分隔符，请参阅文档中的split。

我做了一些其他的小改动。在我的测试中，我的版本运行500次1000次测试，这个版本需要0.054秒，而你的版本需要0.133秒。这是代码：

# -*- coding: utf-8 -*-
"""
Created on Sun Apr 12 16:44:35 2015

@author: Steve
@edit: IronManMark20
"""
import timer
import re
def hasNumbers(inputString):
    """Tell whether any element of ``inputString`` is a digit.

    Iterates ``inputString`` and calls ``isdigit()`` on each element,
    stopping at the first one that qualifies.  An empty input gives False.
    """
    for ch in inputString:
        if ch.isdigit():
            return True
    return False
    #This code strips the urls into their main domain
def url_stripper(url):
    """Split ``url`` on "/" after dropping a leading scheme such as "http://".

    Args:
        url: URL string, with or without a scheme prefix.

    Returns:
        The list produced by ``str.split("/")`` on the part after the scheme
        (or on the whole string when there is no scheme).  For a typical URL
        the first element is the host.
    """
    scheme, separator, remainder = url.partition("://")
    # Only treat "://" as the scheme marker when it precedes the first "/";
    # otherwise it belongs to the path or query (e.g. "...?next=http://...").
    # The previous slice kept the text *before* "://" and lost the host.
    if separator and "/" not in scheme:
        url = remainder
    return url.split("/")



def String_stripper_function(input_path="./homepages.txt",
                             output_path="results.txt"):
    """Filter a two-column "title url" file down to the lines that match.

    Each input line is split on whitespace; the first field is the title and
    the second is the URL.  All titles are cleaned with ``Mass_List`` and a
    line is written to the output (once per distinct title, in input order,
    as ``'<title> <url>\\n'``) when either

    * the cleaned title occurs, case-insensitively, in the URL, or
    * the cleaned title has exactly four upper-case letters and the URL
      contains a digit.

    Args:
        input_path: file to read; defaults to the original "./homepages.txt".
        output_path: file to (over)write; defaults to "results.txt".

    Lines with fewer than two fields are skipped instead of raising
    IndexError.  'lol no url fam' is printed when ``url_stripper`` returns
    an empty list for a URL.
    """
    titles = []
    uris = []
    with open(input_path) as infile:
        for line in infile:
            fields = line.split()
            if len(fields) < 2:
                continue
            if len(url_stripper(fields[1])) == 0:
                print('lol no url fam')
            titles.append(fields[0])
            uris.append(fields[1])

    # Clean every title in one Mass_List call on a "/"-joined string.
    cleaned = Mass_List("/".join(titles)) if titles else []
    if len(cleaned) != len(titles):
        # Some title contains "/" itself, so the bulk result no longer lines
        # up with the input; clean title by title to keep the pairing.
        cleaned = ["/".join(Mass_List(title)) for title in titles]

    # Titles already written.  The duplicate check must use the title of the
    # line being examined, not whichever title was read last from the file.
    seen_titles = set()

    with open(output_path, "w") as outfile:
        # zip keeps title, URL and cleaned title paired by position, which is
        # also correct when two different titles clean to the same text.
        for word_original, url_original, w in zip(titles, uris, cleaned):
            if word_original in seen_titles:
                continue
            capital_letters = sum(1 for c in w if c.isupper())
            # NOTE(review): the whole cleaned title is matched here, not its
            # individual words -- confirm this is the intended comparison.
            if (w.lower() in url_original.lower()
                    or (capital_letters == 4 and hasNumbers(url_original))):
                seen_titles.add(word_original)
                outfile.write(word_original + " " + url_original + "\n")
def Mass_List(lines):
    """Clean a "/"-delimited string of titles and return them as a list.

    Underscores and hyphens become spaces; parentheses, the listed forms of
    the articles "The", "A" and "An", "..." and commas are removed.  The
    cleaned string is then split on "/".

    Args:
        lines: titles joined with "/" (a single title also works).

    Returns:
        List of cleaned titles, one per "/"-separated segment of ``lines``.
    """
    # Applied in this exact order; each pass sees the previous pass's output.
    # NOTE: the earlier regex search for text in parentheses ran after the
    # parentheses had been removed and could never match, so it is omitted
    # here without changing the result.
    replacements = (
        ('_', ' '),
        ('-', ' '),
        ('(', ''),
        (')', ''),
        ('The ', ''),
        (' The ', ''),
        (', The ', ''),
        (' ,The ', ''),
        (',The ', ''),
        ('...', ''),
        ('A ', ''),
        (' A ', ''),
        (', A ', ''),
        (' ,A ', ''),
        (',A ', ''),
        ('An ', ''),
        (' An ', ''),
        (', An ', ''),
        (' ,An ', ''),
        (',An ', ''),
        (',', ''),
    )
    word = lines
    for old, new in replacements:
        word = word.replace(old, new)
    return word.split('/')

# Run the filter only when this file is executed directly as a script.
if __name__=="__main__":
         String_stripper_function()

在python中格式化大型文本文件

1 个答案: