在python中格式化大型文本文件

时间:2015-04-24 22:56:06

标签: python parallel-processing text-files

在尝试确定是否应该使用并行编程时,我一直坚持使用这段代码。

代码采用包含两列的文本文件:第一列包含一个单词,第二列包含一个URL。

在String_stripper_function()中,文本文件的每一行都以特定方式格式化(因此对replace()函数的所有调用。)。

然后我们在第一列和第二列之间进行比较,如果第一列中的单词包含在第二列的url中,那么该行将被写入一个新文件(称之为Result.txt

此外,如果第一列中的单词包含4个大写字母,而第二列中的URL包含数字,则将该行添加到同一个新文件(Result.txt)。

现在这种情况有效,我已经多次检查,但是在具有16GB内存的 i7 计算机上需要花费很长时间,<100>行 几小时

该文件包含1923014行(如果您愿意,则为行);它是97.9 MB。

所以我的问题是:表现明智我的代码有什么问题

# -*- coding: utf-8 -*-
"""
Created on Sun Apr 12 16:44:35 2015

@author: Steve
"""
import re
import multiprocessing as mp
import numpy as np
def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)
    #This code strips the urls into their main domain
def url_stripper(url):
    url=url.replace('http://','')
    url=url.replace('https://','')
    url=url.replace('http','')
    url=url.replace('/',' ')
    url=url.split()
    return url


def String_stripper_function():
   with open("homepages.txt") as infile:
    i=0
    l1=np.array([])
    l2=np.array([])
    l3=np.array([])
    l4=np.array([])
    for line in infile:        
        word_original=line.split()[0]
        url_original=line.split()[1]
        url=url_stripper(url_original)
        if len(url)==0:
            print 'lol no url fam'
        else:
            url=url[0]
        word=word_original.replace('_',' ')
        word=word.replace('-',' ')
        word=word.replace('(','')
        word=word.replace(')','')
        regex = re.compile(".*?\((.*?)\)")
        word_in_parenthesis = re.findall(regex, word)
        for i in xrange(len(word_in_parenthesis)):
            word=word.replace(word_in_parenthesis[i],'')
        word=word.replace('The ','')
        word=word.replace(' The ','')
        word=word.replace(', The ','')
        word=word.replace(' ,The ','')
        word=word.replace(',The ','')
        word=word.replace('...','')
        word=word.replace('A ','')
        word=word.replace(' A ','')
        word=word.replace(', A ','')
        word=word.replace(' ,A ','')
        word=word.replace(',A ','')
        word=word.replace('An ','')
        word=word.replace(' An ','')
        word=word.replace(', An ','')
        word=word.replace(' ,An ','')
        word=word.replace(',An ','')
        word=word.replace(',','')
        #condition 2&3
        words=word.split()
#        print word.lower().split()
#        print url_original.lower()
        Capital_Letters=sum(1 for c in word if c.isupper())
        decision=hasNumbers(url)
        for w in words:
            #comment the following for 
            if w.lower() in url_original.lower():
                if word_original not in l1:
                    l1=np.append(l1,word_original)
                    l2=np.append(l2,url_original)
                else:
                    print ""
                #Uncomment the following for Domain only
#            if w.lower() in url.lower():
#                    l1=np.append(l1,word_original)
#                    l2=np.append(l2,url_original)
            elif Capital_Letters==4 and decision==True:
                if word_original not in l1:
                    l1=np.append(l1,word_original)
                    l2=np.append(l2,url_original) 
                else:
                    print ""
#        if word_original not in l1:
#            if word_original not in l3:
#                    l3=np.append(l3,word_original)
#                    l4=np.append(l4,url_original)
            else:
                print ""


    file = open("results.txt", "w")
    for index in xrange(len(l1)):
        file.write( '%s \t %s\n' % (str(l1[index]),str(l2[index])))
    file.close()
#    file1 = open("results_failedConditions.txt", "w")
#    for index in xrange(len(l3)):
#        file1.write( '%s \t %s\n' % (str(l3[index]),str(l4[index])))
#    file1.close()

if __name__=="__main__":
         String_stripper_function()

1 个答案:

答案 0 :(得分:0)

首先,这个问题应该在Code Review上提出。

我对您的代码进行了一些更改,以考虑一些假设。

  1. str.replace()通过在一个大字符串上运行而不是逐行迭代来运行得更快。
  2. urlStripper我刚刚找到://,因为我相信开始时间才会发生。
  3. 而不是使用.replace("/"," ")然后使用.split(),而.split("/")应该更快。您可以使用自己的分隔符进行拆分。请参阅文档中的split
  4. 我做了一些其他的小改动。在我的测试中,我的版本运行500次1000次测试,这个版本需要0.054秒,而你的版本需要0.133秒。 这是代码:

    # -*- coding: utf-8 -*-
    """
    Created on Sun Apr 12 16:44:35 2015
    
    @author: Steve
    @edit: IronManMark20
    """
    import timer
    import re
    def hasNumbers(inputString):
        return any(char.isdigit() for char in inputString)
        #This code strips the urls into their main domain
    def url_stripper(url):
        try:
            index=url.index("://") #only happens at beginning
        except:
            return url.split("/") #you can set the splitter
        url=url[:index]
        return url.split("/")
    
    
    
    def String_stripper_function():
        with open("./homepages.txt") as infile:
            i=0
            l1=[]
            l2=[]
            #l3=[]
            #l4=[]
            lines_string="" #We'll use this later
            uris=[]#needed for uris
            for line in infile:
                word_original=line.split()[0]
                url_original=line.split()[1]
                url=url_stripper(url_original)
                if len(url)==0:
                    print 'lol no url fam'
                else:
                    url=url[0]
                lines_string+=word_original
                lines_string+="/" #add a delimiter that we don't search for later
                uris+=[url_original]
            words=Mass_List(lines_string)
            words=words[:len(words)-1]
            for w in words:
                lines=lines_string.split("/")#split for later use
                Capital_Letters=sum(1 for c in w if c.isupper())
                url_original=uris[words.index(w)] #get url for each line
                decision=hasNumbers(url_original)
    
                #comment the following for 
                if w.lower() in url_original.lower():
                    if word_original not in l1:
                        l1+=[lines[words.index(w)]]
                        l2+=[uris[words.index(w)]]
                #   else:
                #       print ""
                    #Uncomment the following for Domain only
    #           if w.lower() in url.lower():
    #                    l1=np.append(l1,word_original)
    #                    l2=np.append(l2,url_original)
                elif Capital_Letters==4 and decision==True:
                    if word_original not in l1:
                        l1+=[lines[words.index(w)]]
                        l2+=[uris[words.index(w)]]
                #   else:
                #       print ""
    #           if word_original not in l1:
    #               if word_original not in l3:
    #                    l3=np.append(l3,word_original)
    #                    l4=np.append(l4,url_original)
                #else:
                #   print ""
    
    
        file = open("results.txt", "w") 
        for i in range(len(l1)):
            file.write(l1[i]+" "+l2[i]+"\n")
        file.close()
    #    file1 = open("results_failedConditions.txt", "w")
    #    for index in xrange(len(l3)):
    #        file1.write( '%s \t %s\n' % (str(l3[index]),str(l4[index])))
    #    file1.close()
    def Mass_List(lines):
            word=lines.replace('_',' ')
            word=word.replace('-',' ')
            word=word.replace('(','')
            word=word.replace(')','')
            regex = re.compile(".*?\((.*?)\)")
            word_in_parenthesis = re.findall(regex, word)
            for i in xrange(len(word_in_parenthesis)):
                word=word.replace(word_in_parenthesis[i],'')
            word=word.replace('The ','')
            word=word.replace(' The ','')
            word=word.replace(', The ','')
            word=word.replace(' ,The ','')
            word=word.replace(',The ','')
            word=word.replace('...','')
            word=word.replace('A ','')
            word=word.replace(' A ','')
            word=word.replace(', A ','')
            word=word.replace(' ,A ','')
            word=word.replace(',A ','')
            word=word.replace('An ','')
            word=word.replace(' An ','')
            word=word.replace(', An ','')
            word=word.replace(' ,An ','')
            word=word.replace(',An ','')
            word=word.replace(',','')
            words=word.split('/') #changed to split in arbitrary delimiter
            return words
    
    if __name__=="__main__":
             String_stripper_function()