使用XDocument解析嵌套XML

时间:2017-07-07 21:25:19

标签: c# xml linq xml-parsing

我有一个以下格式的xml。该xml有2个部分(job_type="REQUESTED" 和 job_type="RECOMMENDED");我只想解析REQUESTED块中的值。另外,Applicants标签中有两种类型的申请人(type="PB" 和 type="CB")。

我想将结果解析为以下格式的csv

id , social_security_number (where type = "PB"), first_name(where type = "PB"), city(where type = "PB" and item_code="CURRENT"), state_code_id(where type = "PB" and item_code="CURRENT"), com(where item_code="PEMAIL" and type ="PB"), social_security_number (where type = "CB"), first_name(where type = "CB"), city(where type = "CB" and item_code="CURRENT"), state_code_id(where type = "CB" and item_code="CURRENT"), com(where item_code="PEMAIL" and type ="CB")

例如结果:

2407132 ,999999999, Thomas, Portland, MI, edison@gmail.com, 123456789, Mary, BarHarBor, MI, mary@gmail.com

<?xml version="1.0" encoding="utf-8"?>
<JobApplications xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" id="2407132" bundle_id="2407132" version="1.0">
<JobApplication job_type="REQUESTED" request_date="2014-08-02T12:26:00.0000000">
<JobApplicationStates>
<JobApplicationState type="USEDCL" payment_call_flag="False">
<Applicants>
<Applicant social_security_number="999999999" type="PB" date_of_birth="1972-10-01T00:00:00.0000000" first_name="Thomas" last_name="Edison">
<Addresses>
<Address city="Portland" state_code_id="MI" country_code="USA" postal_code="12345" item_code="CURRENT" street_number="6297" street="LAKE ARBOR" />
<Address item_code="PREVIOUS" />
</Addresses>
<Communications>
<Communication item_code="PEMAIL" com="edison@gmail.com" contact_type="CU"/>
<Communication item_code="HOME" com="(123)-456-7890" contact_type="CU"/>
<Communication item_code="OTHER" contact_type="CU"/>
<Communication item_code="WORK" com="(100)-200-3000" contact_type="CU"/>
</Communications>
</Applicant>
<Applicant social_security_number="123456789" type="CB" date_of_birth="1976-10-01T00:00:00.0000000" first_name="Mary" last_name="Edison">
<Addresses>
<Address city="BarHarBor" state_code_id="MI" country_code="USA" postal_code="12345" item_code="CURRENT" street_number="6297" street="LAKE ARBOR" />
<Address item_code="PREVIOUS" />
</Addresses>
<Communications>
<Communication item_code="PEMAIL" com="mary@gmail.com" contact_type="CU"/>
<Communication item_code="HOME" com="(999)-456-7890" contact_type="CU"/>
<Communication item_code="OTHER" contact_type="CU"/>
<Communication item_code="WORK" com="(300)-200-3000" contact_type="CU"/>
</Communications>
</Applicant>
</Applicants> 
</JobApplicationState>
</JobApplicationStates>
</JobApplication>
<JobApplication job_type="RECOMMENDED" request_date="2014-08-02T12:26:00.0000000">
<JobApplicationStates>
<JobApplicationState type="USEDCL" payment_call_flag="False">
<Applicants>
<Applicant social_security_number="999999999" type="PB" date_of_birth="1972-10-01T00:00:00.0000000" first_name="Thomas" last_name="Edison">
<Addresses>
<Address city="Portland" state_code_id="MI" country_code="USA" postal_code="12345" item_code="CURRENT" street_number="6297" street="LAKE ARBOR" />
<Address item_code="PREVIOUS" />
</Addresses>
<Communications>
<Communication item_code="PEMAIL" com="edison@gmail.com" contact_type="CU"/>
<Communication item_code="HOME" com="(123)-456-7890" contact_type="CU"/>
<Communication item_code="OTHER" contact_type="CU"/>
<Communication item_code="WORK" com="(100)-200-3000" contact_type="CU"/>
</Communications>
</Applicant>
<Applicant social_security_number="123456789" type="CB" date_of_birth="1976-10-01T00:00:00.0000000" first_name="Mary" last_name="Edison">
<Addresses>
<Address city="BarHarBor" state_code_id="MI" country_code="USA" postal_code="12345" item_code="CURRENT" street_number="6297" street="LAKE ARBOR" />
<Address item_code="PREVIOUS" />
</Addresses>
<Communications>
<Communication item_code="PEMAIL" com="mary@gmail.com" contact_type="CU"/>
<Communication item_code="HOME" com="(999)-456-7890" contact_type="CU"/>
<Communication item_code="OTHER" contact_type="CU"/>
<Communication item_code="WORK" com="(300)-200-3000" contact_type="CU"/>
</Communications>
</Applicant>
</Applicants> 
</JobApplicationState>
</JobApplicationStates>
</JobApplication>
</JobApplications>

如何使用XDocument将其解析为所需的格式?希望能得到一些指点。我有数百万个xml文件要解析。

2 个答案:

答案 0 :(得分:0)

通常可以将xml文件展平,以便更容易存入数据库并进行读取。在你的情况下,我认为这是最好的方法。见下面的代码。既然解析完成后可以对DataTable结果进行过滤,我认为没有必要为了只解析某些项而使解析方法复杂化。

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Data;
using System.Xml;
using System.Xml.Linq;

namespace ConsoleApplication1
{
    /// <summary>
    /// Flattens a JobApplications XML document into a <see cref="DataTable"/> that holds
    /// one row per Applicant element, repeating the bundle and job-application level
    /// values on every row.
    /// </summary>
    class Program
    {
        const string FILENAME = @"c:\temp\test.xml";

        /// <summary>
        /// Creates the flat table, loads <see cref="FILENAME"/> and fills the table from it.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            DataTable dt = CreateTable();

            XDocument doc = XDocument.Load(FILENAME);

            FillTable(dt, doc.Root);
        }

        /// <summary>
        /// Builds the empty table with one column per flattened value.
        /// </summary>
        /// <returns>A new table with 20 columns and no rows.</returns>
        static DataTable CreateTable()
        {
            DataTable dt = new DataTable();

            // Bundle / job application level.
            dt.Columns.Add("ID", typeof(int));
            dt.Columns.Add("Job Type", typeof(string));
            dt.Columns.Add("Request Date", typeof(DateTime));
            dt.Columns.Add("Job State Type", typeof(string));
            dt.Columns.Add("Payment Call Flag", typeof(Boolean));

            // Applicant level. The spelling of "Appliacant Type" is kept as-is so that
            // any code addressing the column by name keeps working.
            dt.Columns.Add("SSN", typeof(string));
            dt.Columns.Add("Appliacant Type", typeof(string));
            dt.Columns.Add("DOB", typeof(DateTime));
            dt.Columns.Add("First Name", typeof(string));
            dt.Columns.Add("Last Name", typeof(string));

            // Current address.
            dt.Columns.Add("City", typeof(string));
            dt.Columns.Add("State", typeof(string));
            dt.Columns.Add("Country", typeof(string));
            dt.Columns.Add("Postal Code", typeof(string));
            dt.Columns.Add("Street Number", typeof(string));
            dt.Columns.Add("Street", typeof(string));

            // Communications, one column per item_code.
            dt.Columns.Add("Email", typeof(string));
            dt.Columns.Add("Home Phone", typeof(string));
            dt.Columns.Add("Other", typeof(string));
            dt.Columns.Add("Work Phone", typeof(string));

            return dt;
        }

        /// <summary>
        /// Adds one row to <paramref name="dt"/> for every Applicant found under every
        /// JobApplication child of <paramref name="jobApplications"/>.
        /// </summary>
        /// <param name="dt">Table created by <see cref="CreateTable"/>.</param>
        /// <param name="jobApplications">Root JobApplications element.</param>
        /// <exception cref="ArgumentNullException">
        /// The root element has no "id" attribute (the non-nullable int conversion rejects null).
        /// </exception>
        static void FillTable(DataTable dt, XElement jobApplications)
        {
            int id = (int)jobApplications.Attribute("id");

            foreach (XElement jobApplication in jobApplications.Elements("JobApplication"))
            {
                string job_type = (string)jobApplication.Attribute("job_type");
                // Nullable conversions yield null for an absent attribute instead of throwing.
                DateTime? request_date = (DateTime?)jobApplication.Attribute("request_date");

                // Looked up once and reused; may be null when the element is absent.
                XElement jobState = jobApplication.Descendants("JobApplicationState").FirstOrDefault();
                string job_state_type = (string)Attr(jobState, "type");
                bool? payment_call_flag = (bool?)Attr(jobState, "payment_call_flag");

                foreach (XElement applicant in jobApplication.Descendants("Applicant"))
                {
                    string social_security_number = (string)applicant.Attribute("social_security_number");
                    string applicant_type = (string)applicant.Attribute("type");
                    DateTime? date_of_birth = (DateTime?)applicant.Attribute("date_of_birth");
                    string first_name = (string)applicant.Attribute("first_name");
                    string last_name = (string)applicant.Attribute("last_name");

                    // Only the CURRENT address is flattened; an applicant without one
                    // simply gets empty address columns.
                    XElement address = applicant.Descendants("Address")
                        .FirstOrDefault(x => (string)x.Attribute("item_code") == "CURRENT");
                    string city = (string)Attr(address, "city");
                    string state = (string)Attr(address, "state_code_id");
                    string country = (string)Attr(address, "country_code");
                    string postal_code = (string)Attr(address, "postal_code");
                    string street_number = (string)Attr(address, "street_number");
                    string street = (string)Attr(address, "street");

                    XElement communications = applicant.Descendants("Communications").FirstOrDefault();
                    string email = CommunicationValue(communications, "PEMAIL");
                    string home_phone = CommunicationValue(communications, "HOME");
                    string other = CommunicationValue(communications, "OTHER");
                    string work_phone = CommunicationValue(communications, "WORK");

                    // A null entry makes DataRowCollection.Add store the column default (DBNull).
                    dt.Rows.Add(new object[] {
                        id,
                        job_type, request_date, job_state_type, payment_call_flag,
                        social_security_number, applicant_type, date_of_birth, first_name, last_name,
                        city, state, country, postal_code, street_number, street,
                        email, home_phone, other, work_phone
                    });
                }
            }
        }

        /// <summary>
        /// Null-tolerant attribute lookup.
        /// </summary>
        /// <param name="element">Element to read from; may be null.</param>
        /// <param name="name">Attribute name.</param>
        /// <returns>The attribute, or null when the element or the attribute is missing.</returns>
        static XAttribute Attr(XElement element, string name)
        {
            if (element == null)
            {
                return null;
            }

            return element.Attribute(name);
        }

        /// <summary>
        /// Returns the "com" value of the first child of <paramref name="communications"/>
        /// whose "item_code" equals <paramref name="itemCode"/>.
        /// </summary>
        /// <param name="communications">Communications element; may be null.</param>
        /// <param name="itemCode">item_code to match, e.g. "PEMAIL".</param>
        /// <returns>The "com" value, or null when nothing matches or "com" is absent.</returns>
        static string CommunicationValue(XElement communications, string itemCode)
        {
            if (communications == null)
            {
                return null;
            }

            return communications.Elements()
                .Where(x => (string)x.Attribute("item_code") == itemCode)
                .Select(x => (string)x.Attribute("com"))
                .FirstOrDefault();
        }
    }
}

答案 1 :(得分:0)

这应该可以帮助您使用开源库Cinchoo ETL

只需几行代码即可将xml文件转换为csv

由于xml是结构化格式的文件,因此您需要展平以生成CSV。在XPath和Cinchoo ETL库的帮助下,您可以快速生成CSV文件。

下面的示例代码显示了如何解析xml并将其转换为csv

// Reads "sample.xml" through a ChoXmlReader configured with eleven named fields and
// writes whatever the reader yields to "sample.csv" through a ChoCSVWriter.
// Each WithField call pairs an output field name with the XPath its value is taken from.
// Every applicant-level XPath is restricted to the JobApplication whose job_type is
// 'REQUESTED'; the PB_* fields select the Applicant with type 'PB', the CB_* fields the
// Applicant with type 'CB'.
// NOTE(review): how WithXPath("JobApplications") and the leading "/" of the field XPaths
// are resolved is defined by the Cinchoo ETL library — confirm against its documentation.
using (var parser = new ChoXmlReader("sample.xml").WithXPath("JobApplications")
    // Bundle identifier taken from the "id" attribute.
    .WithField("ID", xPath: "@id")
    // Applicant type 'PB': SSN, first name, CURRENT address city/state, PEMAIL value.
    .WithField("PB_SSN", xPath: "/JobApplication[@job_type='REQUESTED']/JobApplicationStates/JobApplicationState/Applicants/Applicant[@type='PB']/@social_security_number")
    .WithField("PB_FIRST_NAME", xPath: "/JobApplication[@job_type='REQUESTED']/JobApplicationStates/JobApplicationState/Applicants/Applicant[@type='PB']/@first_name")
    .WithField("PB_CITY", xPath: "/JobApplication[@job_type='REQUESTED']/JobApplicationStates/JobApplicationState/Applicants/Applicant[@type='PB']/Addresses/Address[@item_code='CURRENT']/@city")
    .WithField("PB_STATE", xPath: "/JobApplication[@job_type='REQUESTED']/JobApplicationStates/JobApplicationState/Applicants/Applicant[@type='PB']/Addresses/Address[@item_code='CURRENT']/@state_code_id")
    .WithField("PB_PEMAIL", xPath: "/JobApplication[@job_type='REQUESTED']/JobApplicationStates/JobApplicationState/Applicants/Applicant[@type='PB']/Communications/Communication[@item_code='PEMAIL']/@com")
    // Applicant type 'CB': the same five values.
    .WithField("CB_SSN", xPath: "/JobApplication[@job_type='REQUESTED']/JobApplicationStates/JobApplicationState/Applicants/Applicant[@type='CB']/@social_security_number")
    .WithField("CB_FIRST_NAME", xPath: "/JobApplication[@job_type='REQUESTED']/JobApplicationStates/JobApplicationState/Applicants/Applicant[@type='CB']/@first_name")
    .WithField("CB_CITY", xPath: "/JobApplication[@job_type='REQUESTED']/JobApplicationStates/JobApplicationState/Applicants/Applicant[@type='CB']/Addresses/Address[@item_code='CURRENT']/@city")
    .WithField("CB_STATE", xPath: "/JobApplication[@job_type='REQUESTED']/JobApplicationStates/JobApplicationState/Applicants/Applicant[@type='CB']/Addresses/Address[@item_code='CURRENT']/@state_code_id")
    .WithField("CB_PEMAIL", xPath: "/JobApplication[@job_type='REQUESTED']/JobApplicationStates/JobApplicationState/Applicants/Applicant[@type='CB']/Communications/Communication[@item_code='PEMAIL']/@com")
)
{
    // The writer is disposed before the reader because its using statement is nested inside.
    using (var writer = new ChoCSVWriter("sample.csv"))
        writer.Write(parser);
}

输出:

2407132,999999999,Thomas,Portland,MI,edison@gmail.com,123456789,Mary,BarHarBor,MI,mary@gmail.com

披露:我是这个库的作者。