将字典的文本文件转换为csv Python

时间:2017-07-19 16:15:46

标签: python json csv dictionary

我有一个由不同词典组成的文本文件,它看起来像这样:

{"destination.fqdn": "194-65-57-128.ctt.pt", "feed.provider": "MyFeed", "source.abuse_contact": "coisas@foo.com", "raw": "bWFsd2FyZSwyMTAuMjguNTYuMSxodHRwOi8vd3d3LmN0dC5wdCAsMTk0LTY1LTU3LTEyOC5jdHQucHQsY29pc2FzQGZvby5jb20sMTk0LjIzOS4xNjcuNSx3d3cudmVyeWJhZC5jb20gLHZlcnkudmVyeWJhZC5jb20sLCwsMjAxMC0wMi0xOFQwMDowMDowMCswMDowMA0K", "feed.name": "FileCollector", "destination.geolocation.latitude": 32.2109, "destination.geolocation.cc": "CN", "source.geolocation.longitude": 12.069, "event_description.text": "ctt", "source.ip": "194.239.167.5", "source.geolocation.city": "Frederikssund", "destination.geolocation.city": "Zhenjiang", "destination.url": "http://www.ctt.pt", "classification.taxonomy": "malicious code", "source.url": "http://www.verybad.com", "source.fqdn": "very.verybad.com", "feed.url": "file://localhost/opt/intelmq/teste_ip_url_fqdn.csv", "feed.accuracy": 100.0, "time.observation": "2017-07-18T13:15:48+00:00", "destination.geolocation.longitude": 119.4551, "source.geolocation.latitude": 55.8396, "classification.type": "malware", "destination.ip": "210.28.56.1", "time.source": "2010-02-18T00:00:00+00:00", "source.geolocation.cc": "DK"}
{"destination.url": "http://www2.ctt.pt", "classification.taxonomy": "malicious code", "source.url": "http://www.telecom.pt", "feed.provider": "MyFeed", "time.observation": "2017-07-18T13:15:48+00:00", "destination.fqdn": "ctt-pt.mail.protection.outlook.com", "source.abuse_contact": "coisas7@foo.com", "source.geolocation.cc": "TN", "feed.url": "file://localhost/opt/intelmq/teste_ip_url_fqdn.csv", "raw": "YyZjLDI1MS4xNTQuNjUuOSxodHRwOi8vd3d3Mi5jdHQucHQsY3R0LXB0Lm1haWwucHJvdGVjdGlvbi5vdXRsb29rLmNvbSxjb2lzYXM3QGZvby5jb20sMTk3LjEzLjEwNS44LHd3dy50ZWxlY29tLnB0LCwsLCwyMDEwLTAyLTE4VDAwOjAwOjAwKzAwOjAwDQo=", "feed.name": "FileCollector", "classification.type": "c&c", "source.geolocation.latitude": 34.0, "source.geolocation.longitude": 9.0, "destination.ip": "251.154.65.9", "event_description.text": "ctt", "source.ip": "197.13.105.8", "time.source": "2010-02-18T00:00:00+00:00", "feed.accuracy": 100.0}

每一行都是字典,有些字典比其他字典有更多的键,我想将文本文件转换为csv文件。

我有以下代码:

import json
import csv
import ast

def json_to_csv(txt_file, csv_file):

    lista = []
    with open(txt_file, 'rb') as fin:
        lines = fin.readlines()
        for line in lines:
            dict_line = ast.literal_eval(line)
            lista.append(line)
    list_json = json.dumps(lista)

    read_json = json.loads(list_json)

    header =["feed.accuracy","feed.url","source.geolocation.longitude","event_description.text","raw","destination.geolocation.city","source.ip","classification.taxonomy",
                "time.observation","destination.geolocation.latitude","destination.ip","source.asn","feed.name","source.geolocation.latitude","time.source","feed.provider",
                "destination.geolocation.longitude","destination.geolocation.cc","destination.asn","source.abuse_contact","source.geolocation.cc","classification.type"]
    with open(csv_file, 'wb+') as f:
        dict_writer = csv.DictWriter(f, header)
        dict_writer.writeheader()
        dict_writer.writerows(read_json)

首先我读取文本文件,然后将其内容转换为JSON,然后尝试将转换后的数据写入csv文件,但是它返回以下错误:

Traceback (most recent call last):
  File "<pyshell#38>", line 1, in <module>
    json_to_csv('ctt.txt','ctt.csv')
  File "C:/Users/Marisa/Documents/json_to_csv.py", line 26, in json_to_csv
    dict_writer.writerows(read_json)
  File "C:\Python27\lib\csv.py", line 157, in writerows
    rows.append(self._dict_to_list(rowdict))
  File "C:\Python27\lib\csv.py", line 148, in _dict_to_list
    + ", ".join([repr(x) for x in wrong_fields]))
ValueError: dict contains fields not in fieldnames: u'{', u'"', u'f', u'e', u'e', u'd', u'.', u'a', u'c', u'c', u'u', u'r', u'a', u'c', u'y', u'"', u':', u' ', u'1', u'0', u'0', u'.', u'0', u',', u' ', u'"', u'c', u'l', u'a', u's', u's', u'i', u'f', u'i', u'c', u'a', u't', u'i', u'o', u'n', u'.', u't', u'a', u'x',...

1 个答案:

答案 0 :(得分:1)

你使它变得比它需要的复杂一点,并且你错过了上面你自己的示例数据中的一些字段。我们可以摆脱ast依赖关系和背面&amp;第四次JSON处理,添加缺少的字段,以下内容将使用您提供的示例数据:

import json
import csv

def json_to_csv(txt_file, csv_file):

    lista = []
    with open(txt_file, 'r') as in_file:
        lines = in_file.readlines()
        for line in lines:
            try:
                dict_line = json.loads(line)
                lista.append(dict_line)
            except Exception as err:
                print(err)

    header = [
        "feed.accuracy", "feed.url", "source.geolocation.longitude",
        "event_description.text", "raw", "destination.geolocation.city",
        "source.ip", "classification.taxonomy", "time.observation",
        "destination.geolocation.latitude", "destination.ip", "source.asn",
        "feed.name", "source.geolocation.latitude", "time.source",
        "feed.provider", "destination.geolocation.longitude",
        "destination.geolocation.cc", "destination.asn",
        "source.abuse_contact", "source.geolocation.cc", "classification.type",
        'destination.fqdn', 'source.fqdn', 'source.geolocation.city',
        'source.url', 'destination.url'
    ]
    with open(csv_file, 'w+') as out_file:
        dict_writer = csv.DictWriter(out_file, header)
        dict_writer.writeheader()
        dict_writer.writerows(lista)

请注意,如果您的真实数据中包含的示例中包含的字段更多,则您还需要添加这些字段。

另请注意,如果输入数据是正确的JSON数组,如:

[{"destination.fqdn": "194-65-57-128.ctt.pt", "feed.provider": "MyFeed", "source.abuse_contact": "coisas@foo.com", "raw": "bWFsd2FyZSwyMTAuMjguNTYuMSxodHRwOi8vd3d3LmN0dC5wdCAsMTk0LTY1LTU3LTEyOC5jdHQucHQsY29pc2FzQGZvby5jb20sMTk0LjIzOS4xNjcuNSx3d3cudmVyeWJhZC5jb20gLHZlcnkudmVyeWJhZC5jb20sLCwsMjAxMC0wMi0xOFQwMDowMDowMCswMDowMA0K", "feed.name": "FileCollector", "destination.geolocation.latitude": 32.2109, "destination.geolocation.cc": "CN", "source.geolocation.longitude": 12.069, "event_description.text": "ctt", "source.ip": "194.239.167.5", "source.geolocation.city": "Frederikssund", "destination.geolocation.city": "Zhenjiang", "destination.url": "http://www.ctt.pt", "classification.taxonomy": "malicious code", "source.url": "http://www.verybad.com", "source.fqdn": "very.verybad.com", "feed.url": "file://localhost/opt/intelmq/teste_ip_url_fqdn.csv", "feed.accuracy": 100.0, "time.observation": "2017-07-18T13:15:48+00:00", "destination.geolocation.longitude": 119.4551, "source.geolocation.latitude": 55.8396, "classification.type": "malware", "destination.ip": "210.28.56.1", "time.source": "2010-02-18T00:00:00+00:00", "source.geolocation.cc": "DK"},
{"destination.url": "http://www2.ctt.pt", "classification.taxonomy": "malicious code", "source.url": "http://www.telecom.pt", "feed.provider": "MyFeed", "time.observation": "2017-07-18T13:15:48+00:00", "destination.fqdn": "ctt-pt.mail.protection.outlook.com", "source.abuse_contact": "coisas7@foo.com", "source.geolocation.cc": "TN", "feed.url": "file://localhost/opt/intelmq/teste_ip_url_fqdn.csv", "raw": "YyZjLDI1MS4xNTQuNjUuOSxodHRwOi8vd3d3Mi5jdHQucHQsY3R0LXB0Lm1haWwucHJvdGVjdGlvbi5vdXRsb29rLmNvbSxjb2lzYXM3QGZvby5jb20sMTk3LjEzLjEwNS44LHd3dy50ZWxlY29tLnB0LCwsLCwyMDEwLTAyLTE4VDAwOjAwOjAwKzAwOjAwDQo=", "feed.name": "FileCollector", "classification.type": "c&c", "source.geolocation.latitude": 34.0, "source.geolocation.longitude": 9.0, "destination.ip": "251.154.65.9", "event_description.text": "ctt", "source.ip": "197.13.105.8", "time.source": "2010-02-18T00:00:00+00:00", "feed.accuracy": 100.0}]

该解决方案简化了整个初始with open块变得更加简单:

with open(txt_file, 'r') as in_file:
    lista = json.load(in_file)