递归文本拆分的麻烦

时间:2013-05-10 22:00:30

标签: python regex string parsing recursion

尝试使用递归通过文本定义的边界标记分割文本,并创建包含原始文本文件的所有有组织部分的列表和字符串列表。

分裂没有发生。

以下是简短版本:真正的问题脚本:

def separate(text,boundary = None):
    if boundary == None:
        m = re.findall(r'(?<=boundary=).*',text)
        i = 0
        while i < len(m): #have all levels of Boundary/headers named
            boundary = m[i]
            textList = recursiveSplit(text,boundary)
            i += 1
        pdb.set_trace()
        return textList

def recursiveSplit(chunk,boundary):
    if type(chunk) is types.StringType:
        ar = re.split(r'(?P<boundary>)(?!--)',chunk)
        return ar
    if type(chunk) is types.ListType:
        i = 0
        while i < len(chunk):
            chunk[i] = recursiveSplit(chunk[i],boundary)
            i += 1
        return obj

我之前发过这个剧本,人们希望我把它全部发布,所以我会这样做

  #Textbasics email parser
#based on a "show original" file converted into text

from sys import argv
import re, os, pdb, types

script, filename = argv
text = open(filename).read()
type = "text only" #Set the default type of email

#cut the email up by sections
#--A section is defined as any time there are two line breaks in a row
textList = re.split(r"\n\n", text)
header = textList[0]
if re.search(r'MIME-Version',header):
    type = "MIME"

# If mail has no attachments, parse as a text-only email
class Parser(object):

    def __init__(self,textList):
        a = 1
        self.body = ""
        self.textList = textList
        self.header = textList[0]
        while a < len(textList):
            self.body = self.body + textList[a] + '\n\n'
            a += 1

        m = re.search(r'(?<=Subject: ).*', self.header)
        self.subject = m.group(0)

        m = re.search(r'(?<=From: ).*', self.header)
        self.fromVar = m.group(0)

        m = re.search(r'(?<=To: ).*', self.header)
        self.toVar = m.group(0)

        m = re.search(r'(?<=Date: )\w+\s\w+\s\w+', self.header)
        self.date = m.group(0)

    def returnParsed(self,descriptor = "all"):
        if descriptor == "all":
            retv = "Subject: " + self.subject + "\n" + "From: " + self.fromVar + "\n" + "To: " + self.toVar + "\n" + "Date: " + self.date + "\n" + "\n" + self.body
            return retv

        if descriptor == "subject":
            return self.subject
        if descriptor == "fromVar":
            return self.fromVar
        if descriptor == "toVar":
            return self.toVar
        if descriptor == "date":
            return self.date
        if descriptor == "body":
            return self.body

class MIMEParser(Parser):

    class MIMEDataDecoder(object):
        def __init__(self,decodeString,type):
            pass    


    def __init__(self,textList):
        self.textList = textList
        self.nestedItems = []
        newItem = NestedItem(self)
        newItem.setContentType("Header")
        newItem.setValue(self.textList[0])
        self.nestedItems.append(newItem)
        if re.search(r'(boundary=)',newItem.value):
            helperItem = NestedItem(self)
            helperItem.value = (self.textList[0])
            m = re.search(r'(?<=Content-Type: ).+(?=;)',newItem.value)
            helperItem.setContentType(m.group(0))
            self.nestedItems.append(helperItem)

        self.organizeData()   
        """i = 0
        while i < len(self.textList):
            newItem = NestedItem(self)
            ct = self.nextContentType
            newItem.setContentType(ct)
            newItem.setValue(self.textList[i])
            self.nestedItems.append(newItem)
            m = re.search(r'(?<=Content-Type: ).+(?=;)',self.textList[i])
            if m:
                self.nextContentType = m.group(0)
            i += 1
            """

    def nestItem (self,item):
        self.nestedItems.append(item)

    def organizeData(self):
        self.nestLevel = 1
        self.currentSuper = self
        m = re.search(r'(?<=boundary=).*',self.textList[0])
        self.currentBoundary = m.group(0)
        self.currentList = self.textList
        self.currentList.remove(self.textList[0])
        self.formerObjectDatabase = {}
        pdb.set_trace()
        while self.nestLevel > 0:
            i = 0
            while i < len(self.currentList):

                boundary = self.currentBoundary
                #If block is a "normal block", containing a current boundary identifier
                p = re.search(r'--(?P<boundary>)(?!--)', text)
                if p:
                    newItem = NestedItem(self.currentSuper)
                    newItem.setValue(self.currentList[i])
                    r = re.search(r'(?<=Content-Type: ).+(?=;)',newItem.value)
                    if r:
                        newItem.setContentType(r.group(0))
                    self.currentObject = newItem
                    self.currentSuper.nestItem(self.currentObject)
                #If the block contains a new block boundary
                m = re.search(r'(?<=boundary=).*',self.currentList[i])
                if m:
                    #begin new layer of recursive commands
                    newFormerObject = self.FormerCurrentObject(self.currentList,self.currentSuper,self.currentBoundary)
                    self.formerObjectDatabase[self.nestLevel] = newFormerObject
                    self.currentSuper = self.currentObject
                    self.nestLevel += 1
                    self.currentBoundary = m.group(0)
                    boundary = self.currentBoundary
                    #self.currentList = re.split(r'--(?P<boundary>)(?!--)', self.currentList[i])
                boundary = self.currentBoundary
                #If block contains an "end of boundary" marker
                q = re.search(r'(?P<boundary>)--', text)
                if q:
                    self.nestLevel -= 1
                    currentObject = self.formerObjectDatabase[self.nestLevel]
                    self.currentList = currentObject.formerList
                    self.currentSuper = currentObject.formerSuper
                    self.currentBoundary = currentObject.formerBoundary
                i += 1                    


    class FormerCurrentObject:
        def __init__(self,formerList,formerSuper,formerBoundary):
            self.formerList = formerList
            self.formerSuper = formerSuper
            self.formerBoundary = formerBoundary




    def printAll(self):
        print "printing all: %d" % len(self.nestedItems)
        i = 0
        while i < len(self.nestedItems):
            print "printing out item %d" % i
            self.nestedItems[i].printOut()
            i += 1

class NestedItem(object):
    def __init__(self,superObject,contentType=" ",value = " "):
        self.superObject = superObject
        self.contentType = contentType
        self.value = value
        self.nestedItems = []

    def nestItem(self,item):
        self.nestedItems.append(item)

    def printOut(self,printBuffer = ""):
        print printBuffer + '++%s' % self.contentType
        print printBuffer + self.value
        a = 0
        printBuffer = printBuffer + "  "
        while a < len(self.nestedItems):
            self.nestedItems[a].printOut(printBuffer)

    def setContentType(self,contentType):
        self.contentType = contentType

    def setValue(self,value):
        self.value = value



if type == "text only":
    p = Parser(textList)
    print p.returnParsed()
# ---PROBLEM CODE STARTS HERE---
def separate(text,boundary = None):
    pdb.set_trace()
    if boundary == None:
        m = re.findall(r'(?<=boundary=).*',text)
        i = 0
        textList = [text]
        while i < len(m): #have all levels of Boundary/headers named
            boundary = m[i]
            textList = recursiveSplit(textList,boundary)
            i += 1

    return textList

def recursiveSplit(chunk,boundary):
    if type(chunk) is types.ListType: #<<--error occurs here
        for obj in chunk:
            recursiveSplit(obj,boundary)
    if type(chunk) is types.StringType:
        list = re.split(r'(?P<boundary>)(?!--)',chunk)
        return list
    return None
#---PROBLEM CODE ENDS(?) HERE---

if type == "MIME":
    #separate the text file instead by its boundary identifier
    p = MIMEParser(separate(text))
    p.printAll()

您可以使用任何MIME类型的电子邮件来运行此操作。这是我为方便起见而使用的那个

MIME-Version: 1.0
Received: by 10.112.170.40 with HTTP; Fri, 3 May 2013 05:08:21 -0700 (PDT)
Date: Fri, 3 May 2013 08:08:21 -0400
Delivered-To: MYEMAIL@gmail.com
Message-ID: <@mail.gmail.com>
Subject: MiB 5/3/13 7:43AM (EST)
From: ME<MYEMAIL@gmail.com>
To: SOMEONE <SOMEONE@aol.com>
Content-Type: multipart/mixed; boundary=BNDRY1

--BNDRY1
Content-Type: multipart/alternative; boundary=BNDRY2

--BNDRY2
Content-Type: text/plain; charset=ISO-8859-1

-changed signature methods to conform more to working clinic header
methods(please test/not testable in simulator)
-confirmed that signature image is showing up in simulator. Awaiting
further tests
-Modified findings spacing/buffer. See if you like it

--BNDRY2
Content-Type: text/html; charset=ISO-8859-1

<div dir="ltr">-changed signature methods to conform more to working clinic header methods(please test/not testable in simulator)<div style>-confirmed that signature image is showing up in simulator. Awaiting further tests</div>
<div style>-Modified findings spacing/buffer. See if you like it</div></div>

--BNDRY2--
--BNDRY1
Content-Type: application/zip; name="Make it Brief.ipa.zip"
Content-Disposition: attachment; filename="Make it Brief.ipa.zip"
Content-Transfer-Encoding: base64
X-Attachment-Id: f_hg9biuno0

<<FILE DATA>>
--BNDRY1--

1 个答案:

答案 0 :(得分:2)

问题在于正则表达式。可能有一种更酷的方法,但我只是根据变量创建了一个搜索字符串文字。

def recursiveSplit(chunk,boundary):
    if type(chunk) is types.StringType:
        #ar = re.split(r'(?P<boundary>)(?!--)',chunk)
        searchString = "--%s" % boundary
        print searchString
        ar = re.split(searchString,chunk)
        return ar
    if type(chunk) is types.ListType:
        i = 0
        while i < len(chunk):
            chunk[i] = recursiveSplit(chunk[i],boundary)
            i += 1
        return obj