将结构化的大型XML文件转换为CSV而不使用标签名称

时间:2016-02-08 15:44:12

标签: python csv xml-parsing

我正在尝试将大型XML文件转换为CSV格式。下面是示例文件(XML文件的一部分)和我的代码。

<PostalAddress>
    <Id>5464443597076195439</Id>
    <AddUserId>SYSTEM_USER</AddUserId>
    <AddDate>2013-01-05T18:08:42-06:00</AddDate>
    <LastPersistenceTransactionUserId>SYSTEM_USER</LastPersistenceTransactionUserId>
    <LastPersistenceTransactionDate>2013-07-11T08:21:34-05:00</LastPersistenceTransactionDate>
    <LastPersistenceTransactionType tc="2"/>
    <ExternalReferenceId>3200723</ExternalReferenceId>
    <SchemaVersion>2</SchemaVersion>
    <Type tc="1"/>
    <Usage tc="2"/>
    <Valid>true</Valid>
    <Overridable>true</Overridable>
    <Preferred>false</Preferred>
    <Line1>4849 RONSON CT</Line1>
    <Line2>STE 208</Line2>
    <City>SAN DIEGO</City>
    <State tc="6"/>
    <PostalCode>92111</PostalCode>
    <Country tc="1"/>
</PostalAddress>

以下是我的代码

import codecs
import xml.etree.ElementTree as et
import sys
class gokul:
    """Stream an XML file and write its repeated record elements as CSV rows.

    The input is parsed incrementally with ``iterparse`` so the whole
    document is never held in memory. Every element named ``tag`` found
    below the document root is treated as one record; the text of each of
    its descendant elements becomes one quoted CSV field.

    Only element text is exported: attributes (for example ``tc="2"``) are
    not read, so attribute-only elements produce an empty field.
    """

    def __init__(self, input_file, output_file, encoding='utf-8'):
        """Open the XML parser on *input_file* and the CSV *output_file*.

        Args:
            input_file: Path (or file object) of the XML document to read.
            output_file: Path of the CSV file to create or overwrite.
            encoding: Text encoding used for the output file.

        Raises:
            IOError, OSError, LookupError: If the output file cannot be
                opened; a short message is printed before re-raising.
        """
        self.output_buffer = []
        self.output = None
        self.context = et.iterparse(input_file, events=("start", "end"))
        try:
            self.output = codecs.open(output_file, "w", encoding=encoding)
        except (IOError, OSError, LookupError):
            print("Failed to open the output file")
            raise

    def convert(self, tag="item", delimiter=",", ignore=(), noheader=False,
                limit=-1, buffer_size=1000):
        """Convert the records of the XML input into CSV lines.

        Args:
            tag: Name of the element that delimits one record. The start
                event of the document root is consumed before scanning, so
                the root element itself is never recognised as a record.
            delimiter: Field separator used for the header and the rows.
            ignore: Collection of element names whose text is skipped.
            noheader: When true, no header line is written.
            limit: Stop after this many records; ``-1`` means no limit.
            buffer_size: Number of rows accumulated before they are
                flushed to the output file.

        Returns:
            The number of records written.
        """
        # The first event is the start of the document root; keep a
        # reference so finished records can be detached from it later.
        event, root = next(self.context)

        items = []
        header_line = []
        field_name = ''

        tagged = False   # True once the first record (and header) is complete
        started = False  # True once the first record element has been seen
        n = 0

        try:
            for event, elem in self.context:
                should_write = (elem.tag != tag and started
                                and elem.tag not in ignore)
                should_tag = not tagged and should_write and not noheader

                if event == 'start':
                    if elem.tag == tag and not started:
                        started = True
                    elif should_tag:
                        # Nested elements get a header name made of the
                        # enclosing tag names joined with '_'.
                        field_name = ('_'.join((field_name, elem.tag))
                                      if field_name else elem.tag)
                    continue

                # 'end' event from here on.
                if should_write:
                    if should_tag:
                        header_line.append(field_name)
                        # Drop this element's own name from the running path.
                        field_name = field_name.rpartition('_' + elem.tag)[0]
                    text = '' if elem.text is None else elem.text.strip()
                    # Double embedded quotes, as CSV quoting requires.
                    items.append(text.replace('"', '""'))

                elif elem.tag == tag and items:
                    if header_line and not tagged:
                        self.output.write(delimiter.join(header_line) + '\n')
                    tagged = True
                    quoted_sep = '"' + delimiter + '"'
                    self.output_buffer.append('"' + quoted_sep.join(items) + '"')
                    items = []
                    n += 1
                    if n == limit:
                        break
                    if len(self.output_buffer) > buffer_size:
                        self._write_buffer()
                    # Detach already-processed children from the root so
                    # they do not accumulate while streaming a large file.
                    root.clear()

                elem.clear()

            self._write_buffer()
        finally:
            # Close the output even if parsing fails part-way through.
            self.output.close()

        return n

    def _write_buffer(self):
        """Write records from buffer to the output file"""
        if not self.output_buffer:
            # Nothing pending: avoid emitting a stray blank line.
            return
        self.output.write('\n'.join(self.output_buffer) + '\n')
        self.output_buffer = []


if __name__ == "__main__":
    if len(sys.argv) < 4:
        sys.stderr.write(
            "usage: %s <input.xml> <output.csv> <record_tag>\n" % sys.argv[0])
        sys.exit(2)
    converter = gokul(sys.argv[1], sys.argv[2], encoding="utf-8")
    converter.convert(tag=sys.argv[3])

0 个答案:

没有答案