Question

我想请大家帮忙重构这段代码，以减少冗余的代码行和重复的概念。这个 def 里的代码基本上重复了 3 次。

限制： - 我是新手，所以过于花哨的列表推导式，或者把东西改写成带有 dunder 方法和方法重写的对象，都超出了我目前的水平。 - 只能使用内置模块。这是 Python 2.7 代码，只导入了 os 和 re。

整个脚本的作用：查找具有固定前缀的文件。这些文件是以管道分隔的文本文件。第一行是标题。它有一个可以是1行或更多行的页脚。根据前缀，脚本会从文本文件中抛弃另一步中不需要的“列”。它以逗号分隔的数据保存在扩展名为.csv的新文件中。

大部分工作都是在processRawFiles（）中完成的。这就是我要重构的内容，因为它非常重复。

def separateTranslationTypes(translationFileList):
    '''Group translation filenames into rate-code, room-type and source-code lists.

    translationFileList is an iterable of filename strings. Each name is
    tested against one regular expression per file type; a name that matches
    is stored as the matched text, which starts at the type prefix, so any
    characters that precede the prefix in the input string are dropped.

    One progress line per type is printed, in the order rate codes, room
    types, source codes.

    Returns a dict with the keys 'rateCodeFiles', 'roomTypeFiles' and
    'sourceCodeFiles', each mapped to the list of matched names in input order.
    '''
    # (result key, label shown in the progress line, filename pattern).
    # NOTE(review): the '+' quantifies only the letter just before it
    # ('r' or 's'); the patterns are kept as they were so the same names match.
    file_types = (
        ('rateCodeFiles', 'rateCode', r'cf_ratecodeheader+(.*)'),
        ('roomTypeFiles', 'roomType', r'cf_roomtypes+(.*)'),
        ('sourceCodeFiles', 'sourceCode', r'cf_sourcecodes+(.*)'),
    )
    names = list(translationFileList)
    grouped = {}
    for key, label, pattern in file_types:
        found = [re.search(pattern, afile) for afile in names]
        # re.search returns None when a name does not match; keep only hits.
        grouped[key] = [m.group() for m in found if m]
        # A single pre-formatted string prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print('... %s files ::  %s \n' % (label, grouped[key]))
    return grouped

# Group the candidate filenames by translation type.
# allFilestoProcess is not defined anywhere in this excerpt.
groupedFilestoProcess = separateTranslationTypes(allFilestoProcess)


def processRawFiles(groupedFileDict):
    '''Write a two-column CSV for every existing file named in groupedFileDict.

    groupedFileDict maps the keys 'rateCodeFiles', 'roomTypeFiles' and
    'sourceCodeFiles' to lists of filenames. Other keys are ignored, and so
    are filenames that do not exist.

    Each input is read as pipe-delimited text whose first line holds the
    column headers. The two columns to keep are located by header name
    (which names depends on the key). The output name is the input name with
    its last four characters removed, prefixed with 'forUpload_' and suffixed
    with '.csv'.

    Data rows are the lines after the header. When a line starting with
    'LOGO' is present, rows stop one line short of the line directly before
    it; when no such line exists, every remaining line is treated as data.
    Rows that have too few columns for the wanted indexes are skipped.

    Raises ValueError when a header line lacks one of the wanted columns;
    in that case no output file is created for that input.
    '''
    # b'' literals are plain str on Python 2.7; they let the same code run
    # on Python 3, where files opened with 'rb'/'wb' deal in bytes.
    wanted_headers = {
        'rateCodeFiles': (b'RATE_CODE', b'DESCRIPTION'),
        'roomTypeFiles': (b'LABEL', b'SHORT_DESCRIPTION'),
        'sourceCodeFiles': (b'SOURCE_CODE', b'DESCRIPTION'),
    }

    def convert_one(source_name, code_header, meaning_header):
        # Convert a single pipe-delimited file to its two-column CSV.
        with open(source_name, 'rb') as source:
            lines = source.read().split(b'\n')

        # Drops '.txt' or any other three-character extension.
        outname = 'forUpload_' + source_name[:-4] + '.csv'
        print('\n')
        print('outname:  %s \n' % outname)
        print('\n')

        headers = lines[0].split(b'|')
        code_at = headers.index(code_header)
        meaning_at = headers.index(meaning_header)
        needed = max(code_at, meaning_at)

        # Without a footer marker every line after the header is data.
        stop = len(lines)
        for position, line in enumerate(lines):
            if line.startswith(b'LOGO'):
                # NOTE(review): the slice end is one less than the LOGO
                # index, so the line directly before LOGO is excluded too.
                # Kept from the previous implementation - confirm that line
                # is really part of the footer.
                stop = max(position - 1, 1)
                break

        # The output is opened only after the header lookup succeeded, and
        # 'with' closes it even if a write fails.
        with open(outname, 'wb') as target:
            for line in lines[1:stop]:
                fields = line.split(b'|')
                if len(fields) <= needed:
                    # e.g. the empty string left after a trailing newline
                    continue
                target.write(fields[code_at] + b',' + fields[meaning_at] + b'\n')

    for key in groupedFileDict:
        if key not in wanted_headers:
            continue
        code_header, meaning_header = wanted_headers[key]
        for fname_Value in groupedFileDict[key]:
            if os.path.exists(fname_Value):
                convert_one(fname_Value, code_header, meaning_header)

# Module-level call: run the conversion over the grouped filenames.
processRawFiles(groupedFilestoProcess)

Answer 1

由于出现了新情况——有问题的文件既没有标题行，也没有页脚行——我不得不重写代码。不过，我需要的列仍然以相同的顺序出现，所以可以直接按列的位置保留它们。此外，如果后面某一行的列数少于所用两个索引中较大的那个，我们就停止读取。

至于减少重复，processRawFiles包含两个def，无需重复之前的大量解析代码。

def separateTranslationTypes(translationFileList):
    '''Sort translation filenames into rate-code, room-type and source-code groups.

    translationFileList is an iterable of filename strings. Every name is
    checked against one regular expression per file type. A matching name is
    recorded as the text of the match, which begins at the type prefix;
    characters in front of the prefix are therefore not part of the result.

    Prints one progress line per type (rate codes, room types, source codes,
    in that order).

    Returns a dict keyed by 'rateCodeFiles', 'roomTypeFiles' and
    'sourceCodeFiles'; each value is the list of matched names in input order.
    '''
    # (result key, label shown in the progress line, filename pattern).
    # NOTE(review): '+' applies only to the single letter before it
    # ('e' or 's'); the patterns are unchanged so the same names match.
    type_table = (
        ('rateCodeFiles', 'rateCode', r'cf_ratecode+(.*)'),
        ('roomTypeFiles', 'roomType', r'cf_roomtypes+(.*)'),
        ('sourceCodeFiles', 'sourceCode', r'cf_sourcecodes+(.*)'),
    )
    candidates = list(translationFileList)
    by_type = {}
    for key, label, pattern in type_table:
        hits = []
        for afile in candidates:
            match = re.search(pattern, afile)
            if match:  # None means this name is not of this type
                hits.append(match.group())
        by_type[key] = hits
        # One pre-formatted string gives identical output from the Python 2
        # print statement and the Python 3 print function.
        print('... %s files ::  %s \n' % (label, hits))
    return by_type

# Build the per-type filename lists consumed by processRawFiles below.
# allFilestoProcess is not defined anywhere in this excerpt.
groupedFilestoProcess = separateTranslationTypes(allFilestoProcess)

def processRawFiles(groupedFileDict):
    '''Write a two-column CSV for every existing file named in groupedFileDict.

    groupedFileDict maps 'rateCodeFiles', 'roomTypeFiles' and
    'sourceCodeFiles' to lists of filenames; the key selects the fixed,
    zero-based column positions of the code and its description. Filenames
    that do not exist are ignored.

    Every line of the input, the first one included, is treated as
    pipe-delimited data. Lines with too few columns to contain both wanted
    positions are skipped. The output name is the input name with its last
    four characters removed, prefixed with 'forUpload_' and suffixed with
    '.csv'; it is printed before the file is written.

    Raises KeyError for a key that has no column positions defined.
    '''
    # key -> (code column index, description column index)
    column_positions = {
        'rateCodeFiles': (4, 7),
        'roomTypeFiles': (1, 13),
        'sourceCodeFiles': (2, 3),
    }

    def convert_files(file_names, code_index, description_index):
        # A row is usable only if it reaches the larger of the two indexes.
        widest = max(code_index, description_index)
        for fname_Value in file_names:
            if not os.path.exists(fname_Value):
                continue
            # b'' literals are plain str on Python 2.7 and keep the code
            # runnable on Python 3, where 'rb'/'wb' files deal in bytes.
            with open(fname_Value, 'rb') as workingfile:
                filedatalines = workingfile.read().split(b'\n')
            # Drops '.txt' or any other three-character extension.
            outname = 'forUpload_' + fname_Value[:-4] + '.csv'
            print(outname)
            # The file is closed once, after all lines are written; closing
            # it inside the line loop made the second write fail.
            with open(outname, 'wb') as outputfile:
                for dataline in filedatalines:
                    fields = dataline.split(b'|')
                    if len(fields) > widest:
                        outputfile.write(
                            fields[code_index] + b',' +
                            fields[description_index] + b'\n')

    for key in groupedFileDict:
        code_index, description_index = column_positions[key]
        convert_files(groupedFileDict[key], code_index, description_index)

# Module-level call: convert every grouped file.
processRawFiles(groupedFilestoProcess)

重构此Python代码以简化重复的简单方法

1 个答案: