如何使用XSLT删除这个棘手的XML重复节点?

时间:2012-05-21 15:54:37

标签: xml xslt

我想从同一个父中删除连续重复节点,并且还有完全相同的孩子

输入方案1:

<myroot>
    <nodeA id="a">
        <section id="i">  
            <item1 id="0" method="create"> 
                <somechild>a</somechild>
            </item1>

            <item1 id="1" method="create">
                <otherchild>a</otherchild>
            </item1>
        </section>        

        <section id="i">
            <item1 id="0" method="create"> <!-- second consecutive create, we remove this -->
                <somechild>a</somechild>
            </item1>
            <item1 id="0" method="create"> <!-- third consecutive create, but children have different value , so we don't remove this -->
                <somechild>bbb</somechild>
            </item1>
            <item1 id="3" method="create">
                <other>xx</other>    
            </item1>

            <item1 id="0" method="change"> 
                <otherchild>a</otherchild>
            </item1>
        </section>
    </nodeA>

    <nodeA id="b">
        <section id="i">  
            <item1 id="0" method="create"> 
                <somechild>a</somechild>
            </item1>

            <item1 id="1" method="create">
                <otherchild>a</otherchild>
            </item1>
        </section>        

        <section id="i">
            <item1 id="0" method="create"> <!-- second consecutive create, we remove this -->
                <somechild>a</somechild>
            </item1>
            <item1 id="0" method="create"> <!-- third consecutive create, but children have different value , so we don't remove this -->
                <somechild>bbb</somechild>
            </item1>
            <item1 id="3" method="create">
                <other>xx</other>    
            </item1>

            <item1 id="0" method="change"> 
                <otherchild>a</otherchild>
            </item1>
        </section>
    </nodeA>

    <nodeB id="b">
        <section id="i">  
            <item1 id="0" method="create"> 
                <somechild>a</somechild>
            </item1>

            <item1 id="1" method="create">
                <otherchild>a</otherchild>
            </item1>
        </section>        

        <section id="i">
            <item1 id="0" method="create"> <!-- second consecutive create, we remove this -->
                <somechild>a</somechild>
            </item1>
            <item1 id="0" method="create"> <!-- third consecutive create, but children have different value , so we don't remove this -->
                <somechild>bbb</somechild>
            </item1>
            <item1 id="3" method="create">
                <other>xx</other>    
            </item1>

            <item1 id="0" method="change"> 
                <otherchild>a</otherchild>
            </item1>
        </section>
    </nodeB>
</myroot>

我的结果:

<myroot>
    <nodeA id="a">
        <section id="i">
            <item1 id="0" method="create">
                <somechild>a</somechild>
            </item1>
            <item1 id="1" method="create">
                <otherchild>a</otherchild>
            </item1>
        </section>
        <section id="i">
            <item1 id="0" method="create">
                <somechild>bbb</somechild>
            </item1>
            <item1 id="3" method="create">
                <other>xx</other>
            </item1>
            <item1 id="0" method="change">
                <otherchild>a</otherchild>
            </item1>
        </section>
    </nodeA>
    <nodeA id="b">
        <section id="i"/>
        <section id="i"/>
    </nodeA>
    <nodeB id="b">
        <section id="i"/>
        <section id="i"/>
    </nodeB>
</myroot>

预期输出:

<myroot>
    <nodeA id="a">
        <section id="i">  
            <item1 id="0" method="create"> 
                <somechild>a</somechild>
            </item1>

            <item1 id="1" method="create">
                <otherchild>a</otherchild>
            </item1>
        </section>        

        <section id="i">
            <item1 id="0" method="create"> 
                <somechild>bbb</somechild>
            </item1>
            <item1 id="3" method="create">
                <other>xx</other>    
            </item1>

            <item1 id="0" method="change"> 
                <otherchild>a</otherchild>
            </item1>
        </section>
    </nodeA>

    <nodeA id="b">
        <section id="i">  
            <item1 id="0" method="create"> 
                <somechild>a</somechild>
            </item1>

            <item1 id="1" method="create">
                <otherchild>a</otherchild>
            </item1>
        </section>        

        <section id="i">
            <item1 id="0" method="create"> 
                <somechild>bbb</somechild>
            </item1>
            <item1 id="3" method="create">
                <other>xx</other>    
            </item1>

            <item1 id="0" method="change"> 
                <otherchild>a</otherchild>
            </item1>
        </section>
    </nodeA>

    <nodeB id="b">
        <section id="i">  
            <item1 id="0" method="create"> 
                <somechild>a</somechild>
            </item1>

            <item1 id="1" method="create">
                <otherchild>a</otherchild>
            </item1>
        </section>        

        <section id="i">
            <item1 id="0" method="create"> 
                <somechild>bbb</somechild>
            </item1>
            <item1 id="3" method="create">
                <other>xx</other>    
            </item1>

            <item1 id="0" method="change"> 
                <otherchild>a</otherchild>
            </item1>
        </section>
    </nodeB>
</myroot>

在上面的第一个场景中:只有第二个连续的create具有相同的子节点,而第三个连续的create方法具有不同的子节点,这就是为什么我们只删除第二个子节点。

第二种输入方案(更多变化):

<myroot>
    <nodeB id="a">
        <cell id="i">              
            <item2 id="1" method="create">
                <otherchild>a</otherchild>
            </item2>

            <item2 id="0" method="create"> 
                <otherchild>a</otherchild>
            </item2>

            <item2 id="1" method="modify">
                <otherchild>a</otherchild>
            </item2>        
        </cell>        

        <cell id="i">
            <item2 id="1" method="modify"> <!-- second consecutive modify, we remove this -->
                <otherchild>a</otherchild>
            </item2>

            <item2 id="1" method="modify"> <!-- third consecutive modify, BUT different chldren, we do NOT remove this -->
                <otherchild>a</otherchild>
                <somechild>aa</somechild>
            </item2>

            <item2 id="1" method="delete" /> 

            <item2 id="0" method="create"> 
                <somechild>bbb</somechild>
            </item2>

            <item2 id="1" method="delete" /> <!-- second consecutive delete, we remove this -->

            <item2 id="3" method="create">
                <other>xx</other>    
            </item2>

            <item2 id="1" method="delete" />  <!-- third consecutive delete, we remove this -->           
        </cell>
    </nodeB>
</myroot>

输出:

<myroot>
    <nodeB id="a">
        <cell id="i">              
            <item2 id="1" method="create">
                <otherchild>a</otherchild>
            </item2>

             <item2 id="0" method="create"> 
                <otherchild>a</otherchild>
            </item2>

            <item2 id="1" method="modify">
                <otherchild>a</otherchild>
            </item2>        
        </cell>        

        <cell id="i">
            <item2 id="1" method="modify">
                <otherchild>a</otherchild>
                <somechild>aa</somechild>
            </item2>

            <item2 id="1" method="delete" /> 

            <item2 id="0" method="create"> 
                <somechild>bbb</somechild>
            </item2>           
        </cell>
    </nodeB>
</myroot>

不执行此操作:

    <myroot>     
        <node1 id="a">
            <section id="i">
                <item1 id="0" method="start">
                    <somechild>a</somechild>
                </item1>
                <item1 id="0" method="start"> <!-- this one is successive from the previous so we eliminate -->
                    <somechild>a</somechild>
                </item1>
                <item1 id="0" method="stop"/>                
                <item1 id="0" method="start"> <!-- this will be treated as new starting point -->
                    <somechild>a</somechild>
                </item1>
            </section>  

            <section id="i">
                <item1 id="0" method="start"> <!-- this one is successive from the previous so we eliminate -->
                    <somechild>a</somechild>
                </item1>
            </section>                
        </node1>
    </myroot>

output:

    <myroot>     
        <node1 id="a">
            <section id="i">
                <item1 id="0" method="start">
                    <somechild>a</somechild>
                </item1>
                <item1 id="0" method="start"> <!-- this one is successive from the previous so we eliminate -->
                    <somechild>a</somechild>
                </item1>
                <item1 id="0" method="stop"/>                          
            </section>                  
            <section id="i"/>                              
        </node1>
    </myroot>

The correct output should be:

    <myroot>     
        <node1 id="a">
            <section id="i">
                <item1 id="0" method="start">
                    <somechild>a</somechild>
                </item1>                   
                <item1 id="0" method="stop"/>                
                <item1 id="0" method="start"> <!-- this will be treated as new starting point -->
                    <somechild>a</somechild>
                </item1>
            </section>                  
            <section id="i" />                   
        </node1>
    </myroot>

任何人都可以帮助我使用XSLT进行这种棘手的删除吗? 非常感谢。

约翰

1 个答案:

答案 0 :(得分:5)

这是一个XSLT 2.0示例样式表,应该可以完成这项工作,或者至少可以让您了解如何使用deep-equal提供帮助:

<?xml version="1.0" encoding="UTF-8"?>
<!--
  Removes duplicate "item" elements, i.e. the elements four levels deep such
  as /myroot/nodeA/section/item1.

  An item is dropped when an earlier item under the same second-level element
  (the same nodeA / nodeB) is deep-equal to it. Items under a different
  second-level element are never compared, so identical content that is
  repeated under another nodeA / nodeB is preserved.

  Limitation: every later duplicate is removed, not only directly consecutive
  ones, and sections are not distinguished by name or id.
-->
<xsl:stylesheet 
  xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
  version="2.0">

  <!-- Strip whitespace-only text nodes so that differences in indentation
       do not make otherwise identical items compare as unequal. -->
  <xsl:strip-space elements="*"/>
  <xsl:output indent="yes"/>

  <!-- Identity template: copies every node that no other template handles. -->
  <xsl:template match="@* | node()">
    <xsl:copy>
      <xsl:apply-templates select="@* , node()"/>
    </xsl:copy>
  </xsl:template>

  <!-- Empty template: suppresses an item when an earlier item is deep-equal
       to it. The candidates are the preceding siblings in the same section
       plus the items of the preceding sibling sections, which keeps the
       comparison inside the current second-level element. deep-equal compares
       element names, attributes and content; comment children are ignored. -->
  <xsl:template match="/*/*/*/*[some $el in (preceding-sibling::* , ../preceding-sibling::*/*) satisfies deep-equal(., $el)]"/>

</xsl:stylesheet>

[编辑] 根据新要求,我没有看到使用deep-equal解决该问题的方法,因此我编写了一个使用for-each-groupdeep-equal的新样式表:

<!--
  Removes consecutive duplicate "item" elements (the elements four levels
  deep, e.g. /myroot/nodeA/section/item1).

  Within one second-level element (nodeA / nodeB), all sections that share
  the same element name and @id are treated as one logical sequence. Inside
  that sequence the items are tracked per @id: an item is dropped only when
  the immediately preceding item with the same @id is deep-equal to it (same
  name, same attributes including @method, same content). An item that
  follows a different item for the same @id, for example a "start" after a
  "stop", is therefore kept as a new starting point.
-->
<xsl:stylesheet 
  xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
  version="2.0">

  <!-- Strip whitespace-only text nodes so that differences in indentation
       do not make otherwise identical items compare as unequal. -->
  <xsl:strip-space elements="*"/>
  <xsl:output indent="yes"/>

  <!-- Identity template: copies every node that no other template handles.
       It is named so that the item template below can call it. -->
  <xsl:template match="@* | node()" name="identity">
    <xsl:copy>
      <xsl:apply-templates select="@* , node()"/>
    </xsl:copy>
  </xsl:template>

  <!-- Second-level elements: work out which descendant items survive and
       pass that node set down to the item template as a tunnel parameter. -->
  <xsl:template match="/*/*">
    <xsl:copy>
      <xsl:variable name="first-in-group" as="element()*">
        <!-- Outer grouping: sections with the same name and @id belong together. -->
        <xsl:for-each-group select="*" group-by="concat(node-name(.), '|', @id)">
          <!-- Inner grouping: the items of those sections, per @id. string()
               is used so that an item without @id still gets a grouping key
               (the empty string) instead of being left out of every group. -->
          <xsl:for-each-group select="current-group()/*" group-by="string(@id)">
            <!-- Keep the first item of the group and every later item that
                 differs from its direct predecessor in the group. -->
            <xsl:sequence 
              select="for $pos in 1 to count(current-group())
                      return current-group()[$pos]
                              [$pos eq 1 
                               or not(deep-equal(., current-group()[$pos - 1]))]"/>
          </xsl:for-each-group>
        </xsl:for-each-group>
      </xsl:variable>
      <xsl:apply-templates select="@*"/>
      <xsl:apply-templates>
        <xsl:with-param name="first-in-group" select="$first-in-group" tunnel="yes"/>
      </xsl:apply-templates>
    </xsl:copy>
  </xsl:template>

  <!-- Items: copied only when they are part of the surviving set. -->
  <xsl:template match="/*/*/*/*">
    <xsl:param name="first-in-group" as="element()*" tunnel="yes"/>
    <!-- "intersect" compares by node identity, not by value. -->
    <xsl:if test="$first-in-group intersect .">
      <xsl:call-template name="identity"/>
    </xsl:if>
  </xsl:template>

</xsl:stylesheet>

使用样式表Saxon 9.4,对样本应用时

<myroot>
    <nodeA id="a">
        <section id="i">  
            <item1 id="0" method="create"> 
                <somechild>a</somechild>
            </item1>

            <item1 id="1" method="create">
                <otherchild>a</otherchild>
            </item1>
        </section>        

        <section id="i">
            <item1 id="0" method="create"> <!-- second consecutive create, we remove this -->
                <somechild>a</somechild>
            </item1>
            <item1 id="0" method="create"> <!-- third consecutive create, but children have different value , so we don't remove this -->
                <somechild>bbb</somechild>
            </item1>
            <item1 id="3" method="create">
                <other>xx</other>    
            </item1>

            <item1 id="0" method="change"> 
                <otherchild>a</otherchild>
            </item1>
        </section>
    </nodeA>

    <nodeA id="b">
        <section id="i">  
            <item1 id="0" method="create"> 
                <somechild>a</somechild>
            </item1>

            <item1 id="1" method="create">
                <otherchild>a</otherchild>
            </item1>
        </section>        

        <section id="i">
            <item1 id="0" method="create"> <!-- second consecutive create, we remove this -->
                <somechild>a</somechild>
            </item1>
            <item1 id="0" method="create"> <!-- third consecutive create, but children have different value , so we don't remove this -->
                <somechild>bbb</somechild>
            </item1>
            <item1 id="3" method="create">
                <other>xx</other>    
            </item1>

            <item1 id="0" method="change"> 
                <otherchild>a</otherchild>
            </item1>
        </section>
    </nodeA>

    <nodeB id="b">
        <section id="i">  
            <item1 id="0" method="create"> 
                <somechild>a</somechild>
            </item1>

            <item1 id="1" method="create">
                <otherchild>a</otherchild>
            </item1>
        </section>        

        <section id="i">
            <item1 id="0" method="create"> <!-- second consecutive create, we remove this -->
                <somechild>a</somechild>
            </item1>
            <item1 id="0" method="create"> <!-- third consecutive create, but children have different value , so we don't remove this -->
                <somechild>bbb</somechild>
            </item1>
            <item1 id="3" method="create">
                <other>xx</other>    
            </item1>

            <item1 id="0" method="change"> 
                <otherchild>a</otherchild>
            </item1>
        </section>
    </nodeB>
</myroot>

输出

<myroot>
   <nodeA id="a">
      <section id="i">
         <item1 id="0" method="create">
            <somechild>a</somechild>
         </item1>
         <item1 id="1" method="create">
            <otherchild>a</otherchild>
         </item1>
      </section>
      <section id="i">
         <item1 id="0" method="create"><!-- third consecutive create, but children have different value , so we don't re
move this --><somechild>bbb</somechild>
         </item1>
         <item1 id="3" method="create">
            <other>xx</other>
         </item1>
         <item1 id="0" method="change">
            <otherchild>a</otherchild>
         </item1>
      </section>
   </nodeA>
   <nodeA id="b">
      <section id="i">
         <item1 id="0" method="create">
            <somechild>a</somechild>
         </item1>
         <item1 id="1" method="create">
            <otherchild>a</otherchild>
         </item1>
      </section>
      <section id="i">
         <item1 id="0" method="create"><!-- third consecutive create, but children have different value , so we don't re
move this --><somechild>bbb</somechild>
         </item1>
         <item1 id="3" method="create">
            <other>xx</other>
         </item1>
         <item1 id="0" method="change">
            <otherchild>a</otherchild>
         </item1>
      </section>
   </nodeA>
   <nodeB id="b">
      <section id="i">
         <item1 id="0" method="create">
            <somechild>a</somechild>
         </item1>
         <item1 id="1" method="create">
            <otherchild>a</otherchild>
         </item1>
      </section>
      <section id="i">
         <item1 id="0" method="create"><!-- third consecutive create, but children have different value , so we don't re
move this --><somechild>bbb</somechild>
         </item1>
         <item1 id="3" method="create">
            <other>xx</other>
         </item1>
         <item1 id="0" method="change">
            <otherchild>a</otherchild>
         </item1>
      </section>
   </nodeB>
</myroot>
希望所有内容都被移除,您希望将其删除,并保留所有内容。