从EML文件中提取附件

时间:2017-06-02 08:07:45

标签: python-3.x

我目前使用此代码从EML文件中提取附件。我想知道我是否可以将附件链接到邮件(EML文件)。也就是说,添加eml文件名作为附件名称前缀。 所以我可以知道附件属于什么邮件。 谢谢

import os, re
import email
import argparse
import olefile

def extractAttachment(msg, eml_files, output_path):
    #print len(msg.get_payload())
    #print msg.get_payload()
    if len(msg.get_payload()) > 2:
        if isinstance(msg.get_payload(), str):
            try:
                extractOLEFormat(eml_files, output_path)
            except IOError:
                #print 'Could not process %s. Try manual extraction.' % (eml_files)
                #print '\tHeader of file: %s\n' % (msg.get_payload()[:8])
                pass

        elif isinstance(msg.get_payload(), list):
            count = 0
            while count < len(msg.get_payload()):
                payload = msg.get_payload()[count]
                #récupérer les pièces jointes 
                filename = payload.get_filename()
                #os.rename(filename,'rrrrr'+filename)
                #filename=os.path.join(str(filename), str(eml_files))
                if filename is not None:
                    try:
                        magic = payload.get_payload(decode=True)[:4]
                    except TypeError:
                        magic = "None"                    
                    # Print the magic deader and the filename for reference.
                    printIT(eml_files, magic, filename)
                    # Write the payload out.
                    writeFile(filename, payload, output_path)
                count += 1

    elif len(msg.get_payload()) == 2:
        payload = msg.get_payload()[1]
        filename = payload.get_filename()
        try:
            magic = payload.get_payload(decode=True)[:4]
        except TypeError:
            magic = "None"
        # Print the magic deader and the filename for reference.
        printIT(eml_files, magic, filename)
        # Write the payload out.
        writeFile(filename, payload, output_path)        

    elif len(msg.get_payload()) == 1:
        attachment = msg.get_payload()[0]
        payload = attachment.get_payload()[1]
        filename = attachment.get_payload()[1].get_filename()
        try:
            magic = payload.get_payload(decode=True)[:4]
        except TypeError:
            magic = "None"        
        # Print the magic deader and the filename for reference.
        printIT(eml_files, magic, filename)
        # Write the payload out.
        writeFile(filename, payload, output_path)
    #else:
    #    print 'Could not process %s\t%s' % (eml_files, len(msg.get_payload()))

def extractOLEFormat(eml_files, output_path):
    data = '__substg1.0_37010102'
    filename = olefile.OleFileIO(eml_files)
    msg = olefile.OleFileIO(eml_files)
    attachmentDirs = []
    for directories in msg.listdir():
        if directories[0].startswith('__attach') and directories[0] not in attachmentDirs:
            attachmentDirs.append(directories[0])

    for dir in attachmentDirs:
        filename = [dir, data]
        if isinstance(filename, list):
            filenames = "/".join(filename)
            filename = msg.openstream(dir + '/' + '__substg1.0_3707001F').read().replace('\000', '')


            payload = msg.openstream(filenames).read()
            magic = payload[:4]
            # Print the magic deader and the filename for reference.
            printIT(eml_files, magic, filename)
            # Write the payload out.
            writeOLE(filename, payload, output_path)
#filename = str(eml_files)+"--"+str(filename)
def printIT(eml_files, magic, filename):
    filename = str(eml_files)+"--"+str(filename)
    print ('Email Name: %s\n\tMagic: %s\n\tSaved File as: %s\n' % (eml_files, magic, filename))

def writeFile(filename, payload, output_path):

    filename = str(eml_files)+"--"+str(filename)
    try:
        file_location = output_path + filename
        open(os.path.join(file_location), 'wb').write(payload.get_payload(decode=True))
    except (TypeError, IOError):
        pass

def writeOLE(filename, payload, output_path):
    open(os.path.join(output_path + filename), 'wb')
def main():
    parser = argparse.ArgumentParser(description='Attempt to parse the attachment from EML messages.')
    parser.add_argument('-p', '--path',default='C:\\Users\\hamd\\Desktop\\TEX\\emails' ,help='eml')#Path to EML files
    parser.add_argument('-o', '--out', default='C:\\Users\\hamd\\Desktop\\TEX\\PJ\\eml_files\\',help='pj')#Path to write attachments to.
    args = parser.parse_args()    

    if args.path:
        input_path = args.path
    else:
        print ("You need to specify a path to your EML files.")
        exit(0)

    if args.out:
        output_path = args.out
    else:
        print ("You need to specify a path to write your attachments to.")
        exit(0)

    for root, subdirs, files in os.walk(input_path):
        for file_names in files:
            eml_files = os.path.join(root, file_names)
            msg = email.message_from_file(open(eml_files))
            extractAttachment(msg, eml_files, output_path)

if __name__ == "__main__":
    main()

1 个答案:

答案 0 :(得分:0)

我试着把它写成评论,但太长了。我不会给出一个完整的解决方案,但我会解释这个想法。

一个可能的解决方案是创建一个指向提取附件的硬链接,为硬链接提供与 EML 文件相同的名称。如果同一个 EML 文件中有更多附件,可以附加一个增量后缀:

whatever.eml    (original email file)
whatever_001.attch    (hard link to first extracted attachment)
whatever_002.attch    (hard link to second extracted attachment)
...

这样:

  • 您可以自由地将提取的附件移动到其他任何位置(但在同一个磁盘中,因为硬链接根据定义只能在同一个磁盘上工作)
  • 您可以将附件(硬链接)的副本与 EML 文件一起保存,而不会占用磁盘空间
  • 如果提取的文件被删除,您将拥有附件(硬链接)的备份副本,而不会占用磁盘空间

在 Python 中,您可以简单地使用以下命令创建硬链接:

import os
os.link(existing_target_file, new_link_name)