Python 编程时出现错误 xml.etree.ElementTree.ParseError:格式不正确(not well-formed)

时间:2018-07-13 16:36:50

标签: python xml elasticsearch xml.etree

我写了一个 Python 脚本,用来索引若干文件夹中 XML 文件里的数据,但只有在索引其中一个文件夹时才会出现以下错误:xml.etree.ElementTree.ParseError:格式不正确(无效令牌):第 5933 行,第 23 列(英文原文应为 not well-formed (invalid token): line 5933, column 23)。我的 Python 代码如下:

import os
from xml.etree import ElementTree
from elasticsearch import Elasticsearch

es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
def _child_text(parent, tag):
    """Return the text of the first ``tag`` child of ``parent``, or None if absent."""
    child = parent.find(tag)
    return child.text if child is not None else None


def start(path):
    """Parse one XML file and index every ``DOC`` element found in it.

    For each ``DOC`` element a dictionary with the keys ``title``, ``text``
    and ``date`` is built and passed to ``insert_to_elastic``.  ``date`` is
    the text of the first ``DATE`` child whose ``calender`` attribute equals
    ``"Western"``, or an empty string when there is no such child.

    Args:
        path: Path of the XML file to parse.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    tree = ElementTree.parse(path)
    root = tree.getroot()
    print(root)
    for doc in tree.findall('.//DOC'):
        date = ''
        for d in doc.findall('DATE'):
            # "calender" (sic) is the attribute name the original code reads;
            # presumably it matches the corpus spelling -- verify against data.
            # .get() skips DATE elements lacking the attribute instead of
            # raising KeyError.
            if d.get("calender") == "Western":
                date = d.text
                break

        # Index inside the loop so every DOC is sent, not only the last one.
        insert_to_elastic({
            'title': _child_text(doc, 'TITLE'),
            'text': _child_text(doc, 'TEXT'),
            'date': date,
        })

def insert_to_elastic(doc):
    """Index ``doc`` through the module-level ``es`` client.

    The request targets index ``hamshahri`` with document type ``document``
    and uses ``doc`` as the request body.

    Args:
        doc: The document body handed to ``es.index``.
    """
    request = {
        'index': 'hamshahri',
        'doc_type': 'document',
        'body': doc,
    }
    es.index(**request)

def xml_files(base_dir='/home/course/web/Hamshahri/',
              folders=('2002', '2003', '2004', '2005')):
    """Collect the paths of all ``.xml`` files in the given sub-folders.

    Args:
        base_dir: Directory that contains the sub-folders.  Defaults to the
            location the script was originally written for.
        folders: Names of the sub-folders of ``base_dir`` to scan, in the
            order they should be visited.

    Returns:
        A list of full paths ending in ``.xml``, grouped by folder in the
        order given and sorted by file name within each folder.

    Raises:
        OSError: If one of the folders cannot be listed (for example, it
            does not exist).
    """
    xml_list = []
    for item in folders:
        path = os.path.join(base_dir, item)
        # os.listdir returns entries in arbitrary order; sort so repeated
        # runs visit the files in the same sequence.
        for filename in sorted(os.listdir(path)):
            if filename.endswith('.xml'):
                xml_list.append(os.path.join(path, filename))
    return xml_list

# Index every discovered XML file.  A file that is not well-formed XML is
# reported and skipped so that one bad file does not abort the whole run.
xmls = xml_files()
for path in xmls:
    print(path)
    try:
        start(path)
    except ElementTree.ParseError as err:
        # The error text carries the line and column of the offending token,
        # which is what is needed to locate and repair the file.
        print('Skipping malformed XML file {}: {}'.format(path, err))

0 个答案:

没有答案