快速替换XML节点值

时间:2013-09-04 11:34:00

标签: java xml regex xslt

我有一堆XML文档,其中包含我需要用假数据替换的个人信息。 Person节点包含以下元素:

  • uuid - 必填,不应该被触及。
  • firstName - 可选
  • lastName - 可选
  • address - 可选
  • personID - 必需

一个人可能会出现很多次,在这种情况下应该使用相同的假数据,即如果两个Person节点具有相同的personID,它们都应该得到相同的假ID。

我已经实现了一些Java代码,它们从XML字符串构建DOM树,并在将节点写回字符串之前替换它们。这很好,但由于我有这么多文件,我想知道是否有更快的方法。也许通过正则表达式或XSLT或其他东西?

以下是一个示例文档:

<ADocument>
  <Stuff>
    ...
  </Stuff>
  <OtherStuff>
    ...
  </OtherStuff>
  <Person>
    <uuid>11111111-1111-1111-1111-111111111111</uuid>
    <firstName>Some</firstName>
    <lastName>Person</lastName>
    <personID>111111111111</personID>
  </Person>
  <Person>
    <uuid>22222222-2222-2222-2222-222222222222</uuid>
    <firstName>Another Person</firstName>
    <address>Main St. 2</address>
    <personID>222222222222</personID>
  </Person>
  <Person>
    <uuid>33333333-3333-3333-3333-333333333333</uuid>
    <firstName>Some</firstName>
    <lastName>Person</lastName>
    <personID>111111111111</personID>
  </Person>
  <MoreStuff>
    ...
  </MoreStuff>
</ADocument>

这是我目前的实施:

/**
 * Replaces the personal data of every {@code Person} element in the given XML
 * with fake data and returns the resulting XML as a string.
 *
 * <p>The fake person is looked up by the text of the {@code personID} child, so
 * it is up to {@code getFakePerson} to hand out the same fake identity for the
 * same ID. {@code firstName}, {@code lastName} and {@code address} are only
 * overwritten when the element is present.
 *
 * @param xmlInstance the XML document to process, as a string
 * @return the serialized document after the replacements
 */
public String replaceWithFalseData(String xmlInstance) {
    final Document document = toDOM(xmlInstance);
    final List<Node> persons =
            XPathExpressionFactory.createXPathExpression("//Person").evaluateAsNodeList(document);

    for (final Node person : persons) {
        final Map<String, Node> fields = getChildNodes(person);
        final Node idNode = fields.get("personID");

        // The original ID must be read before any field is overwritten,
        // because the personID element itself is replaced below.
        final Person replacement = getFakePerson(idNode.getTextContent());

        setIfExists(fields.get("firstName"), replacement.getFirstName());
        setIfExists(fields.get("lastName"), replacement.getLastName());
        setIfExists(fields.get("address"), replacement.getAddress());
        setIfExists(idNode, replacement.getPersonID());
    }

    return toString(document);
}

/**
 * Indexes the direct children of {@code parent} by their local name.
 *
 * <p>Children whose {@code getLocalName()} is {@code null} are skipped. When
 * several children share a local name, the last one wins.
 *
 * @param parent the node whose children are indexed
 * @return a new map from local name to child node
 */
public Map<String, Node> getChildNodes(Node parent) {
    final Map<String, Node> byLocalName = new HashMap<String, Node>();
    Node current = parent.getFirstChild();
    while (current != null) {
        final String localName = current.getLocalName();
        if (localName != null) {
            byLocalName.put(localName, current);
        }
        current = current.getNextSibling();
    }
    return byLocalName;
}

/**
 * Sets the text content of {@code node} to {@code value}; does nothing when
 * {@code node} is {@code null} (i.e. the optional element is absent).
 *
 * @param node  the node to update, may be {@code null}
 * @param value the new text content
 */
public void setIfExists(Node node, String value) {
    if (node == null) {
        return;
    }
    node.setTextContent(value);
}

4 个答案:

答案 0 :(得分:2)

您正在使用基于DOM的API。使用 XML的流式API(StAX)可以实现更快的替换,在许多情况下,它可以胜过基于DOM的API:   StAX versus DOM

DOM API比StAX占用更多内存,这会降低性能,但比StAX API更容易使用。

您的示例的工作解决方案 - 在150 MB xml文件上测试,在10秒内更换:

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import javax.xml.stream.XMLEventFactory;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLEventWriter;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.XMLEvent;


/**
 * Streams an XML document with StAX and replaces the personal data inside every
 * {@code Person} element ({@code firstName}, {@code lastName}, {@code address}
 * and {@code personID}) with fake data. Everything else, including
 * {@code uuid}, is copied through unchanged.
 *
 * <p>Persons that share a {@code personID} receive the same fake identity for
 * the lifetime of the JVM. The class keeps that mapping in a plain static map
 * and is therefore not thread-safe.
 */
public class ReplaceXmlWithFakeUser
{
  /** Input file used when no path is passed on the command line. */
  private static final String DEFAULT_INPUT_PATH = "c:\\temp\\persons.xml";

  /** Output file used when no path is passed on the command line. */
  private static final String DEFAULT_OUTPUT_PATH = "c:\\temp\\fakePersons.xml";

  /** Fake persons handed out so far, keyed by the real personID they replace. */
  private static final Map<String, Person> FAKE_PERSONS = new HashMap<String, Person>();

  /**
   * Anonymizes one XML file.
   *
   * @param args optional: {@code args[0]} is the input file, {@code args[1]} the
   *             output file; missing entries fall back to the default paths
   * @throws XMLStreamException if the input is not well-formed, or a Person
   *                            holds personal data but no personID
   * @throws IOException        if a file cannot be opened or closed
   */
  public static void main(String[] args) throws XMLStreamException, IOException
  {
    String inputPath = args != null && args.length > 0 ? args[0] : DEFAULT_INPUT_PATH;
    String outputPath = args != null && args.length > 1 ? args[1] : DEFAULT_OUTPUT_PATH;

    // The StAX reader/writer do not close the underlying streams, so the
    // streams are closed here explicitly (closing the output also flushes it).
    BufferedInputStream in = new BufferedInputStream(new FileInputStream(inputPath));
    try
    {
      BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(outputPath));
      try
      {
        replacePersons(in, out);
      }
      finally
      {
        out.close();
      }
    }
    finally
    {
      in.close();
    }
  }

  /** Copies the document from {@code in} to {@code out}, anonymizing every Person element. */
  private static void replacePersons(BufferedInputStream in, BufferedOutputStream out) throws XMLStreamException
  {
    XMLEventFactory eventFactory = XMLEventFactory.newInstance();
    XMLEventReader eventReader = XMLInputFactory.newInstance().createXMLEventReader(in);
    try
    {
      XMLEventWriter writer = XMLOutputFactory.newInstance().createXMLEventWriter(out);
      try
      {
        while (eventReader.hasNext())
        {
          XMLEvent event = eventReader.nextEvent();
          writer.add(event);

          if (isStartOf(event, "Person"))
          {
            writePersonContent(readPersonContent(eventReader), writer, eventFactory);
          }
        }
        writer.flush();
      }
      finally
      {
        writer.close();
      }
    }
    finally
    {
      eventReader.close();
    }
  }

  /**
   * Buffers all events up to and including the next Person end tag. Buffering is
   * needed because personID may come after the elements that must be replaced.
   */
  private static List<XMLEvent> readPersonContent(XMLEventReader eventReader) throws XMLStreamException
  {
    List<XMLEvent> personEvents = new ArrayList<XMLEvent>();
    XMLEvent event;
    do
    {
      event = eventReader.nextEvent();
      personEvents.add(event);
    }
    while (!isEndOf(event, "Person"));
    return personEvents;
  }

  /** Writes the buffered Person content, substituting fake text for the personal data elements. */
  private static void writePersonContent(List<XMLEvent> personEvents, XMLEventWriter writer,
      XMLEventFactory eventFactory) throws XMLStreamException
  {
    String personId = findPersonId(personEvents);
    Person fakePerson = personId == null ? null : getFakePerson(personId);

    for (Iterator<XMLEvent> events = personEvents.iterator(); events.hasNext(); )
    {
      XMLEvent event = events.next();
      writer.add(event);

      if (!event.isStartElement())
      {
        continue;
      }
      String elementName = event.asStartElement().getName().getLocalPart();
      if (!isPersonalData(elementName))
      {
        continue;
      }
      if (fakePerson == null)
      {
        throw new XMLStreamException(
            "Person element contains <" + elementName + "> but no personID", event.getLocation());
      }

      writer.add(eventFactory.createCharacters(fakeValueFor(elementName, fakePerson)));

      // Drop the real content, however many events it consists of (none for an
      // empty element, several if the parser splits the text), then write the
      // matching end tag.
      int depth = 1;
      while (events.hasNext())
      {
        XMLEvent skipped = events.next();
        if (skipped.isStartElement())
        {
          depth++;
        }
        else if (skipped.isEndElement() && --depth == 0)
        {
          writer.add(skipped);
          break;
        }
      }
    }
  }

  /** Returns the text of the first personID element in the buffered events, or null if there is none. */
  private static String findPersonId(List<XMLEvent> personEvents)
  {
    StringBuilder personId = null;
    for (XMLEvent event : personEvents)
    {
      if (personId == null)
      {
        if (isStartOf(event, "personID"))
        {
          personId = new StringBuilder();
        }
      }
      else if (event.isCharacters())
      {
        personId.append(event.asCharacters().getData());
      }
      else if (event.isEndElement())
      {
        break;
      }
    }
    return personId == null ? null : personId.toString();
  }

  /** Tells whether the element with the given local name holds data that must be replaced. */
  private static boolean isPersonalData(String elementName)
  {
    return "personID".equals(elementName)
        || "firstName".equals(elementName)
        || "lastName".equals(elementName)
        || "address".equals(elementName);
  }

  /** Returns the fake replacement text for one of the personal data elements. */
  private static String fakeValueFor(String elementName, Person fakePerson)
  {
    if ("personID".equals(elementName))
    {
      return fakePerson.personId;
    }
    if ("firstName".equals(elementName))
    {
      return fakePerson.firstName;
    }
    if ("lastName".equals(elementName))
    {
      return fakePerson.lastName;
    }
    return fakePerson.address;
  }

  /** Tells whether the event is a start tag with the given local name. */
  private static boolean isStartOf(XMLEvent event, String localName)
  {
    return event.isStartElement()
        && localName.equals(event.asStartElement().getName().getLocalPart());
  }

  /** Tells whether the event is an end tag with the given local name. */
  private static boolean isEndOf(XMLEvent event, String localName)
  {
    return event.isEndElement()
        && localName.equals(event.asEndElement().getName().getLocalPart());
  }

  /**
   * Returns the fake person for a real personID, creating and caching it on
   * first use so that repeated IDs always map to the same fake identity.
   */
  private static Person getFakePerson(String personId)
  {
    Person fakePerson = FAKE_PERSONS.get(personId);
    if (fakePerson == null)
    {
      //create simple fake user...
      fakePerson = new Person();
      // Sequential, zero-padded 12-digit ID, so the real ID never reaches the output.
      fakePerson.personId = String.format("%012d", FAKE_PERSONS.size() + 1);
      fakePerson.firstName = "fake first name: " + Math.random();
      fakePerson.lastName = "fake last name: " + Math.random();
      fakePerson.address = "fake address: " + Math.random();
      FAKE_PERSONS.put(personId, fakePerson);
    }
    return fakePerson;
  }

  /** Plain holder for the fake values written in place of a real person's data. */
  static class Person
  {
    String personId;
    String firstName;
    String lastName;
    String address;

  }
}

使用persons.xml作为输入:

<ADocument>
    <Stuff>
        <StuffA></StuffA>
    </Stuff>
    <OtherStuff>
        <OtherStuff>
            <ABC>yada yada</ABC>
        </OtherStuff>
    </OtherStuff>

    <Person>
        <uuid>11111111-1111-1111-1111-111111111111</uuid>
        <firstName>Some</firstName>
        <lastName>Person</lastName>
        <personID>111111111111</personID>
    </Person>
    <Person>
        <uuid>22222222-2222-2222-2222-222222222222</uuid>
        <firstName>Another Person</firstName>
        <address>Main St. 2</address>
        <personID>222222222222</personID>
    </Person>
    <Person>
        <uuid>33333333-3333-3333-3333-333333333333</uuid>
        <firstName>Some</firstName>
        <lastName>Person</lastName>
        <personID>111111111111</personID>
    </Person>

    <MoreStuff>
        <foo></foo>
        <foo>fooo</foo>
        <foo><bar></bar></foo>
        <foo>
            <bar></bar>
            <bar/>
            <bar>bb</bar>
        </foo>
        <bar/>
    </MoreStuff>

</ADocument>

生成此fakePersons.xml结果:

<?xml version="1.0" encoding="UTF-8"?><ADocument>
    <Stuff>
        <StuffA></StuffA>
    </Stuff>
    <OtherStuff>
        <OtherStuff>
            <ABC>yada yada</ABC>
        </OtherStuff>
    </OtherStuff>

    <Person>
        <uuid>11111111-1111-1111-1111-111111111111</uuid>
        <firstName>fake first name: 0.9518514637129984</firstName>
        <lastName>fake last name: 0.3495378044884426</lastName>
        <personID>111111111111</personID>
    </Person>
    <Person>
        <uuid>22222222-2222-2222-2222-222222222222</uuid>
        <firstName>fake first name: 0.8945739434355868</firstName>
        <address>fake address: 0.40784763231471777</address>
        <personID>222222222222</personID>
    </Person>
    <Person>
        <uuid>33333333-3333-3333-3333-333333333333</uuid>
        <firstName>fake first name: 0.7863207851479257</firstName>
        <lastName>fake last name: 0.09918620445731652</lastName>
        <personID>111111111111</personID>
    </Person>

    <MoreStuff>
        <foo></foo>
        <foo>fooo</foo>
        <foo><bar></bar></foo>
        <foo>
            <bar></bar>
            <bar></bar>
            <bar>bb</bar>
        </foo>
        <bar></bar>
    </MoreStuff>

</ADocument>

答案 1 :(得分:0)

我不确定XSLT是否可以在这里提供帮助。也许我对XSLT的了解不够深入,但XSLT通常用于基于现有XML的数据创建新的XML结构。而你在这里想做的似乎恰恰相反:保持相同的结构,但根据动态值更新数据。要写出这样的XSLT可能会比较困难。 优化取决于相当多的参数:每个XML中Person元素的数量、XML中相同personID的数量、要处理的XML的数量……如果你处理的是大文件,可以考虑切换到SAX实现,以优化内存消耗。如果同一个XML中有大量相同的personID,你可以为假数据建立缓存结构,以减少对DOM的访问次数(可以直接用缓存的节点替换原节点,再用原始的uuid覆盖缓存节点中的uuid)。 如果你有很多包含相同personID的小文件,并且可以接受在多个XML文件中使用相同的假数据,那么可以使用跨XML的缓存。

此外,我相信您可以删除PersonID上的'setIfExists',因为它被声明为必填字段。

答案 2 :(得分:0)

我无法对相对性能发表评论,但这是解决您问题的XSLT解决方案。

以下XSLT样式表:

<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  <xsl:output method="xml" indent="yes" omit-xml-declaration="yes"/>

  <!-- Must be called with a child of a Person element as the context node.
       Among the Person siblings of the enclosing Person, finds the first one (in
       document order) with the same personID and outputs how many Person siblings
       precede it. The result is a number shared by all sibling Persons with that
       personID. -->
  <xsl:template name="get-position-id">
    <xsl:value-of select="count(../../Person[personID=current()/../personID][1]/preceding-sibling::Person)"/>
  </xsl:template>

  <!-- personID elements: the element is kept, but its content is replaced with a
       12-digit number derived from the position id computed by get-position-id. -->
  <xsl:template match="personID">
    <xsl:copy>
      <xsl:variable name="position-id">
        <xsl:call-template name="get-position-id"/>
      </xsl:variable>
      <xsl:call-template name="create-person-id">
        <xsl:with-param name="input" select="$position-id"/>
      </xsl:call-template>
    </xsl:copy>
  </xsl:template>

  <!-- firstName, lastName and address elements: the element is kept, but its
       content is replaced with a fixed string followed by the position id, which
       ties the fake value to a particular personID. -->
  <xsl:template match="firstName|lastName|address">
    <xsl:copy>
      <xsl:variable name="position-id">
        <xsl:call-template name="get-position-id"/>
      </xsl:variable>
      <xsl:call-template name="create-fake-string">
        <xsl:with-param name="input" select="$position-id"/>
      </xsl:call-template>
    </xsl:copy>    
  </xsl:template>

  <!-- The identity transform: every attribute and node not matched by a more
       specific template above is copied unchanged. -->
  <xsl:template match="@*|node()">
    <xsl:copy>
      <xsl:apply-templates select="@*|node()"/>
    </xsl:copy>
  </xsl:template>

  <!-- Generates the replacement personID from the numeric input parameter. -->
  <xsl:template name="create-person-id">
    <xsl:param name="input"/>
    <!-- Format the input as a number zero-padded to at least 12 digits. -->
    <xsl:value-of select="format-number($input, '000000000000') "/>
  </xsl:template>

  <!-- Generates the replacement text for name and address elements. -->
  <xsl:template name="create-fake-string">
    <xsl:param name="input"/>
    <!-- The literal FAKEDATA followed by the input parameter. -->
    <xsl:text>FAKEDATA</xsl:text>
    <xsl:value-of select="$input"/>
  </xsl:template>

</xsl:stylesheet>

在应用于示例文档时生成以下XML:

<ADocument>
  <Stuff>
    ...
  </Stuff>
  <OtherStuff>
    ...
  </OtherStuff>
  <Person>
    <uuid>11111111-1111-1111-1111-111111111111</uuid>
    <firstName>FAKEDATA0</firstName>
    <lastName>FAKEDATA0</lastName>
    <personID>000000000000</personID>
  </Person>
  <Person>
    <uuid>22222222-2222-2222-2222-222222222222</uuid>
    <firstName>FAKEDATA1</firstName>
    <address>FAKEDATA1</address>
    <personID>000000000001</personID>
  </Person>
  <Person>
    <uuid>33333333-3333-3333-3333-333333333333</uuid>
    <firstName>FAKEDATA0</firstName>
    <lastName>FAKEDATA0</lastName>
    <personID>000000000000</personID>
  </Person>
  <MoreStuff>
    ...
  </MoreStuff>
</ADocument>

答案 3 :(得分:0)

感谢所有贡献的人!我使用我的DOM实现,Sergej的StAX实现和Ben的XSLT实现以及我自己的另一个实现,使用正则表达式对一组2000 XML文档进行了性能测试。结果如下:

  • DOM:23,93s
  • StAX:20,37s
  • XSLT:83,52s
  • 正则表达式:7,83s

以下是胜利者:

/** Matches one complete Person element, including content that spans several lines. */
private static final Pattern PERSON_PATTERN =
        Pattern.compile("<Person>.*?</Person>", Pattern.DOTALL);

/** Matches the personID element; group 1 is its text. */
private static final Pattern PERSON_ID_PATTERN =
        Pattern.compile("<personID>(.*?)</personID>", Pattern.DOTALL);

/** Matches the firstName element. */
private static final Pattern FIRST_NAME_PATTERN =
        Pattern.compile("<firstName>.*?</firstName>", Pattern.DOTALL);

/** Matches the lastName element. */
private static final Pattern LAST_NAME_PATTERN =
        Pattern.compile("<lastName>.*?</lastName>", Pattern.DOTALL);

/** Matches the address element. */
private static final Pattern ADDRESS_PATTERN =
        Pattern.compile("<address>.*?</address>", Pattern.DOTALL);

/**
 * Replaces the personal data of every {@code <Person>} element in the given XML
 * text with fake data, using regular expressions instead of an XML parser.
 *
 * <p>Only the exact tag spellings without attributes or namespace prefixes are
 * recognized. The fake values are inserted verbatim (no XML escaping), so
 * {@code getFakePerson} must return text that is valid element content.
 *
 * @param xmlInstance the XML document to process, as a string
 * @return the document text with the Person data replaced
 * @throws IllegalStateException if a Person element has no personID element
 */
public String replaceWithFalseData(String xmlInstance) {
    Matcher personMatcher = PERSON_PATTERN.matcher(xmlInstance);
    // Matcher.appendReplacement requires a StringBuffer before Java 9.
    StringBuffer xmlBuffer = new StringBuffer();

    while (personMatcher.find()) {
        String personXml = personMatcher.group();

        Matcher idMatcher = PERSON_ID_PATTERN.matcher(personXml);
        if (!idMatcher.find()) {
            throw new IllegalStateException(
                    "Person element without personID at offset " + personMatcher.start());
        }
        Person fakePerson = getFakePerson(idMatcher.group(1));

        personXml = replaceElement(personXml, FIRST_NAME_PATTERN, "firstName", fakePerson.getFirstName());
        personXml = replaceElement(personXml, LAST_NAME_PATTERN, "lastName", fakePerson.getLastName());
        personXml = replaceElement(personXml, ADDRESS_PATTERN, "address", fakePerson.getAddress());
        personXml = replaceElement(personXml, PERSON_ID_PATTERN, "personID", fakePerson.getPersonID());

        // Quote the text: appendReplacement would otherwise interpret '$' and
        // '\' anywhere in the Person XML as group references or escapes.
        personMatcher.appendReplacement(xmlBuffer, Matcher.quoteReplacement(personXml));
    }

    personMatcher.appendTail(xmlBuffer);
    return xmlBuffer.toString();
}

/**
 * Replaces the first match of {@code elementPattern} in {@code personXml} with
 * the element {@code elementName} containing {@code fakeValue}, taken literally.
 * The text is returned unchanged when the element is absent.
 */
private static String replaceElement(String personXml, Pattern elementPattern,
        String elementName, String fakeValue) {
    String replacement = "<" + elementName + ">" + fakeValue + "</" + elementName + ">";
    return elementPattern.matcher(personXml).replaceFirst(Matcher.quoteReplacement(replacement));
}