以Pythonic的方式快速将XML流解析为字典(dict)列表的方法

时间:2015-10-12 19:58:46

标签: python xml dictionary

我在将XML解析为类似Pandas数据帧(DataFrame)的字典列表时卡住了。

# -*- coding: utf-8 -*-
import pandas as pd
"""
It's very important for parsing!
"""
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

import xml.etree.cElementTree as ET
from table import Table

def xml_to_pd(xml):
    """Parse an XML string into a pandas DataFrame, one row per record.

    The tree is walked in document order and flattened into record dicts:

    * an element that carries attributes starts a new record, seeded with
      a copy of those attributes;
    * an element with text adds ``{tag: text}`` to the current record;
    * an element with neither attributes nor text (a pure container such
      as the root) closes the current record.

    A record equal to one that was already collected is skipped.

    Args:
        xml: The XML document as a string.

    Returns:
        A ``pandas.DataFrame`` built from the list of record dicts.

    Raises:
        Whatever ``ET.fromstring`` raises for malformed input propagates
        unchanged.
    """
    records = []
    current = {}

    for elem in ET.fromstring(xml).iter():
        if elem.attrib:
            # An attributed element opens a new record: close the previous
            # one first so fields of different records never get mixed.
            if current and current not in records:
                records.append(current)
            # Copy the attributes instead of aliasing ``elem.attrib``.
            # This keeps the attributes of the very first attributed
            # element (previously lost) and leaves the parsed tree intact.
            current = dict(elem.attrib)
            if elem.text:
                current[elem.tag] = elem.text
        elif elem.text:
            current[elem.tag] = elem.text
        else:
            # Container element without data: it ends the current record.
            if current and current not in records:
                records.append(current)
            current = {}

    # Flush the record that was still open when the document ended.
    if current and current not in records:
        records.append(current)

    return pd.DataFrame(records)

# Demo driver: parse each sample message and accumulate the rows in one
# shared table. Messages with different fields end up in the same frame.
table = pd.DataFrame()

allxml = ['<markets><market id="1">MMVB</market><market id="4">FORTS</market><market id="15">ETS</market></markets>',
'<sec_info_upd><secid>1538</secid><seccode>SV16BL5</seccode><market>4</market><bgo_c>11908.97</bgo_c><bgo_nc>10307.27</bgo_nc><bgo_buy>4789.49</bgo_buy></sec_info_upd>',
'<quotes><quote secid="3630"><board>FUT</board><seccode>SiZ5</seccode><price>68079</price><buy>-1</buy></quote><quote secid="3630"><board>FUT</board><seccode>SiZ5</seccode><price>68132</price><buy>2</buy></quote></quotes>']

for xml in allxml:
    res = xml_to_pd(xml)
    # Append the parsed frame exactly once. Iterating a DataFrame yields
    # its column labels, so looping over ``res`` and concatenating inside
    # the loop added the same rows once per column.
    table = pd.concat([table, res])
    # Parenthesised single-argument print behaves the same as the print
    # statement under Python 2 and is also valid Python 3.
    print('\n\r')
    print(table)

我的想法是从每个XML表达式构建一个表,但我得到了奇怪的混合结果,也不太确定我的做法是否正确。

请不要在意Pandas,实际上我将使用另一个轻量级的表存储对象,它像pandas DataFrame一样接受字典(dict)列表。

时间也非常关键,因为XML数据流(feed)每10毫秒就会从股票市场推送一次。所以,问题是:我该如何正确而快速地做到这一点?

真的需要你的帮助,因为我完全陷入了这个xml地狱。 提前谢谢。

1 个答案:

答案 0 :(得分:0)

这是我(丑陋)的解决方案:

def xml_to_pd(self, xml):
    """Flatten an XML string into a list of record dicts.

    The root tag is stored on ``self.xmltag``. Elements are visited in
    document order; attributes and ``{tag: text}`` pairs are collected
    into the current record. The current record is closed when an
    element's tag is already a key of it, or when it is non-empty, not
    yet collected, and one of the element's attribute names is already a
    key of it.

    Args:
        xml: The XML document as a string.

    Returns:
        List of dicts, one per record. A trailing record that is empty or
        equal to an already collected one is not appended.
    """
    root = ET.fromstring(xml)
    self.xmltag = root.tag

    rows = []
    row = {}
    for node in root.iter():
        # Seeing a tag again means the previous record is complete.
        if node.tag in row:
            rows.append(row)
            row = {}

        attrs = node.attrib
        if attrs:
            # An attribute name that is already present also marks the
            # start of the next record (only for a not-yet-stored row).
            if row and row not in rows:
                if any(name in row for name in attrs):
                    rows.append(row)
                    row = {}
            row.update(attrs)

        if node.text:
            row[node.tag] = node.text

    if row and row not in rows:
        rows.append(row)
    return rows

Table() is my custom 'dataframe' class. 

一年后的更新:

def xml_to_dict(self, xmltext):
    """Flatten the top level of an XML document into one dict.

    The text is parsed with ``xmltodict.parse``. For every top-level
    entry whose value is a dict, that dict's entries are merged into the
    result with ``'@'`` stripped from both ends of each key (presumably
    xmltodict's attribute marker). Any other top-level value is stored
    under its own key unchanged.

    Args:
        xmltext: The XML text handed to ``xmltodict.parse``.

    Returns:
        The flattened dict.
    """
    parsed = xmltodict.parse(xmltext)
    flat = {}

    for tag, content in parsed.iteritems():
        if not isinstance(content, dict):
            flat[tag] = content
            continue
        # Merge the nested mapping, dropping '@' from the key edges.
        flat.update((name.strip('@'), content[name]) for name in content)

    return flat