Question

输入XML

KeyError                                  Traceback (most recent call last)
pandas/_libs/index.pyx in pandas._libs.index.DatetimeEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()

KeyError: 1420243200000000000

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
~/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2524             try:
-> 2525                 return self._engine.get_loc(key)
   2526             except KeyError:

pandas/_libs/index.pyx in pandas._libs.index.DatetimeEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.DatetimeEngine.get_loc()

KeyError: Timestamp('2015-01-03 00:00:00')

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
pandas/_libs/index.pyx in pandas._libs.index.DatetimeEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()

KeyError: 1420243200000000000

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
~/anaconda3/lib/python3.6/site-packages/statsmodels/tsa/base/tsa_model.py in _get_predict_end(self, end)
    172             try:
--> 173                 end = self._get_dates_loc(dates, dtend)
    174             except KeyError as err: # end is greater than dates[-1]...probably

~/anaconda3/lib/python3.6/site-packages/statsmodels/tsa/base/tsa_model.py in _get_dates_loc(self, dates, date)
     94     def _get_dates_loc(self, dates, date):
---> 95         date = dates.get_loc(date)
     96         return date

~/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/datetimes.py in get_loc(self, key, method, tolerance)
   1425             key = Timestamp(key, tz=self.tz)
-> 1426             return Index.get_loc(self, key, method, tolerance)
   1427 

~/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2526             except KeyError:
-> 2527                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2528 

pandas/_libs/index.pyx in pandas._libs.index.DatetimeEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.DatetimeEngine.get_loc()

KeyError: Timestamp('2015-01-03 00:00:00')

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
<ipython-input-206-505c74789333> in <module>()
      3 ax = price.loc['2012-01-03':].plot(ax=ax, label='observed')
      4 
----> 5 fig = model_fit.plot_predict('2014-01-03','2015-01-03', dynamic=False, ax=ax, plot_insample=False)
      6 
      7 plt.show()

~/anaconda3/lib/python3.6/site-packages/statsmodels/tsa/arima_model.py in plot_predict(self, start, end, exog, dynamic, alpha, plot_insample, ax)
   1885 
   1886         # use predict so you set dates
-> 1887         forecast = self.predict(start, end, exog, 'levels', dynamic)
   1888         # doing this twice. just add a plot keyword to predict?
   1889         start = self.model._get_predict_start(start, dynamic=dynamic)

~/anaconda3/lib/python3.6/site-packages/statsmodels/tsa/arima_model.py in predict(self, start, end, exog, typ, dynamic)
   1808     def predict(self, start=None, end=None, exog=None, typ='linear',
   1809                 dynamic=False):
-> 1810         return self.model.predict(self.params, start, end, exog, typ, dynamic)
   1811     predict.__doc__ = _arima_results_predict
   1812 

~/anaconda3/lib/python3.6/site-packages/statsmodels/tsa/arima_model.py in predict(self, params, start, end, exog, typ, dynamic)
   1184             if not dynamic:
   1185                 predict = super(ARIMA, self).predict(params, start, end, exog,
-> 1186                                                      dynamic)
   1187 
   1188                 start = self._get_predict_start(start, dynamic)

~/anaconda3/lib/python3.6/site-packages/statsmodels/tsa/arima_model.py in predict(self, params, start, end, exog, dynamic)
    732         # will return an index of a date
    733         start = self._get_predict_start(start, dynamic)
--> 734         end, out_of_sample = self._get_predict_end(end, dynamic)
    735         if out_of_sample and (exog is None and self.k_exog > 0):
    736             raise ValueError("You must provide exog for ARMAX")

~/anaconda3/lib/python3.6/site-packages/statsmodels/tsa/arima_model.py in _get_predict_end(self, end, dynamic)
   1062         Handling of inclusiveness should be done in the predict function.
   1063         """
-> 1064         end, out_of_sample = super(ARIMA, self)._get_predict_end(end, dynamic)
   1065         if 'mle' not in self.method and not dynamic:
   1066             end -= self.k_ar

~/anaconda3/lib/python3.6/site-packages/statsmodels/tsa/arima_model.py in _get_predict_end(self, end, dynamic)
    673     def _get_predict_end(self, end, dynamic=False):
    674         # pass through so predict works for ARIMA and ARMA
--> 675         return super(ARMA, self)._get_predict_end(end)
    676 
    677     def geterrors(self, params):

~/anaconda3/lib/python3.6/site-packages/statsmodels/tsa/base/tsa_model.py in _get_predict_end(self, end)
    177                     freq = self.data.freq
    178                     out_of_sample = datetools._idx_from_dates(dates[-1], dtend,
--> 179                                             freq)
    180                 else:
    181                     if freq is None:

~/anaconda3/lib/python3.6/site-packages/statsmodels/tsa/base/datetools.py in _idx_from_dates(d1, d2, freq)
    100     return len(DatetimeIndex(start=_maybe_convert_period(d1),
    101                              end=_maybe_convert_period(d2),
--> 102                              freq=_freq_to_pandas[freq])) - 1
    103 
    104 

~/anaconda3/lib/python3.6/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
    116                 else:
    117                     kwargs[new_arg_name] = new_arg_value
--> 118             return func(*args, **kwargs)
    119         return wrapper
    120     return _deprecate_kwarg

~/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/datetimes.py in __new__(cls, data, freq, start, end, periods, copy, name, tz, verify_integrity, normalize, closed, ambiguous, dtype, **kwargs)
    303 
    304         if data is None and freq is None:
--> 305             raise ValueError("Must provide freq argument if no data is "
    306                              "supplied")
    307 

ValueError: Must provide freq argument if no data is supplied

输出XML

NaN

问题陈述：需要一个XSLT，将上面的输入XML转换为输出XML。输入XML的实际样本大约为10 MB。我编写了一个转换，对PayrollGroup进行了两次循环，时间复杂度为O(n²)。这对我的服务器来说负担过重，并且抛出了内存不足异常。

有人可以提供性能更好的XSLT吗？

编辑：

以下是我正在使用的两个XSLT，一个接一个

<PayrollGroup xmlns="http://www.example.org">
    <Payroll>
      <EmpID>1</EmpID>
      <Name>Jacob</Name>
      <WeekNumber>12</WeekNumber>
      <HoursType>Regular</HoursType>
      <Hours>80</Hours>
      <EarningsType></EarningsType>
      <Earnings></Earnings>
    </Payroll>
    <Payroll>
      <EmpID>1</EmpID>
      <Name>Jacob</Name>
      <WeekNumber>12</WeekNumber>
      <HoursType></HoursType>
      <Hours></Hours>
      <EarningsType>Regular</EarningsType>
      <Earnings>800.00</Earnings>
     </Payroll>
    <Payroll>
      <EmpID>2</EmpID>
      <Name>John</Name>
      <WeekNumber>12</WeekNumber>
      <HoursType></HoursType>
      <Hours></Hours>
      <EarningsType>Regular</EarningsType>
      <Earnings>1000.00</Earnings>
     </Payroll>
     <Payroll>
      <EmpID>3</EmpID>
      <Name>Augira</Name>
      <WeekNumber>12</WeekNumber>
      <HoursType>Other</HoursType>
      <Hours>12</Hours>
      <EarningsType></EarningsType>
      <Earnings></Earnings>
     </Payroll>
     <Payroll>
      <EmpID>4</EmpID>
      <Name>Satya</Name>
      <WeekNumber>12</WeekNumber>
      <HoursType>SMT</HoursType>
      <Hours>40</Hours>
      <EarningsType></EarningsType>
      <Earnings></Earnings>
     </Payroll>
    <Payroll>
      <EmpID>4</EmpID>
      <Name>Satya</Name>
      <WeekNumber>12</WeekNumber>
      <HoursType></HoursType>
      <Hours></Hours>
      <EarningsType>SMT</EarningsType>
      <Earnings>600.00</Earnings>
     </Payroll>
 </PayrollGroup>

之后，下面的XSLT删除重复项

<PayrollGroup xmlns="http://www.example.org">
    <Payroll>
      <EmpID>1</EmpID>
      <Name>Jacob</Name>
      <WeekNumber>12</WeekNumber>
      <HoursType>Regular</HoursType>
      <Hours>80</Hours>
      <EarningsType>Regular</EarningsType>
      <Earnings>800.00</Earnings>
    </Payroll>
    <Payroll>
      <EmpID>2</EmpID>
      <Name>John</Name>
      <WeekNumber>12</WeekNumber>
      <HoursType></HoursType>
      <Hours></Hours>
      <EarningsType>Regular</EarningsType>
      <Earnings>1000.00</Earnings>
     </Payroll>
    <Payroll>
      <EmpID>3</EmpID>
      <Name>Augira</Name>
      <WeekNumber>12</WeekNumber>
      <HoursType>Other</HoursType>
      <Hours>12</Hours>
      <EarningsType></EarningsType>
      <Earnings></Earnings>
    </Payroll>
    <Payroll>
      <EmpID>4</EmpID>
      <Name>Satya</Name>
      <WeekNumber>12</WeekNumber>
      <HoursType>SMT</HoursType>
      <Hours>40</Hours>
      <EarningsType>SMT</EarningsType>
      <Earnings>600.00</Earnings>
     </Payroll>
 </PayrollGroup>

Answer 1

目前尚不清楚您是在使用一个还是两个单独的XSLT样式表。但是，您只需要一个，在XSLT 2.0中删除重复项的方法是使用xsl:for-each-group。所以，不要这样做...

<xsl:for-each select="/ns0:PayrollGroup/ns0:Payroll">

执行此操作...

<xsl:for-each-group select="/ns0:PayrollGroup/ns0:Payroll" group-by="ns0:EmpID">

除此之外，以下行（及类似内容）可能会影响性能：

<xsl:value-of select="/ns0:PayrollGroup/ns0:Payroll[ns0:EmpID = $empId and ns0:HoursType = $earningsType]/ns0:HoursType"/>

这是因为它们必须在整个XML文档中搜索匹配项。在使用xsl:for-each-group时，您可以将搜索限制为仅当前组（即具有相同EmpID的Payroll元素）：

<xsl:value-of select="current-group()[ns0:HoursType = $earningsType]/ns0:HoursType"/>

尝试使用此XSLT

<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:ns0="http://www.example.org"
    version="2.0">

<!--
  Collapses the Payroll records of each employee into one Payroll element.

  Records are grouped by EmpID with xsl:for-each-group, so every lookup for
  the complementary hours/earnings record is confined to current-group()
  instead of scanning the whole document.

  Inside xsl:for-each-group the context item is the first Payroll of the
  group; all relative paths below (ns0:EmpID, ns0:HoursType, ...) therefore
  read from that first record.
-->

<xsl:output method="xml" indent="yes" />

<xsl:template match="/">
<ns0:PayrollGroup>
  <xsl:for-each-group select="/ns0:PayrollGroup/ns0:Payroll" group-by="ns0:EmpID">
    <!-- Type codes of the group's first record, used to find its counterpart. -->
    <xsl:variable name="earningsType" select="ns0:EarningsType"/>
    <xsl:variable name="hoursType" select="ns0:HoursType"/>
    <ns0:Payroll>
      <ns0:EmpID>
        <xsl:value-of select="ns0:EmpID"/>
      </ns0:EmpID>
      <ns0:Name>
        <xsl:value-of select="ns0:Name"/>
      </ns0:Name>
      <ns0:WeekNumber>
        <xsl:value-of select="ns0:WeekNumber"/>
      </ns0:WeekNumber>
      <xsl:choose>
        <!-- First record carries earnings only: take the hours from the
             group member whose HoursType equals this EarningsType. When no
             such member exists the hours elements are emitted empty. -->
        <xsl:when test="ns0:HoursType = '' and ns0:Hours ='' and ns0:EarningsType !='' and ns0:Earnings !='' ">
          <ns0:HoursType>
            <xsl:value-of select="current-group()[ns0:HoursType = $earningsType]/ns0:HoursType"/>
          </ns0:HoursType>
          <ns0:Hours>
            <xsl:value-of select="current-group()[ns0:HoursType = $earningsType]/ns0:Hours"/>
          </ns0:Hours>
          <ns0:EarningsType>
            <xsl:value-of select="ns0:EarningsType"/>
          </ns0:EarningsType>
          <ns0:Earnings>
            <xsl:value-of select="ns0:Earnings"/>
          </ns0:Earnings>
        </xsl:when>
        <!-- First record carries hours only: take the earnings from the
             group member whose EarningsType equals this HoursType. When no
             such member exists the earnings elements are emitted empty. -->
        <xsl:when test="ns0:HoursType != '' and ns0:Hours !='' and ns0:EarningsType ='' and ns0:Earnings ='' ">
          <ns0:HoursType>
            <xsl:value-of select="ns0:HoursType"/>
          </ns0:HoursType>
          <ns0:Hours>
            <xsl:value-of select="ns0:Hours"/>
          </ns0:Hours>
          <ns0:EarningsType>
            <xsl:value-of select="current-group()[ns0:EarningsType = $hoursType]/ns0:EarningsType"/>
          </ns0:EarningsType>
          <ns0:Earnings>
            <xsl:value-of select="current-group()[ns0:EarningsType = $hoursType]/ns0:Earnings"/>
          </ns0:Earnings>
        </xsl:when>
        <!-- Any other combination: copy the first record's four fields as they are. -->
        <xsl:otherwise>
          <ns0:HoursType>
            <xsl:value-of select="ns0:HoursType"/>
          </ns0:HoursType>
          <ns0:Hours>
            <xsl:value-of select="ns0:Hours"/>
          </ns0:Hours>
          <ns0:EarningsType>
            <xsl:value-of select="ns0:EarningsType"/>
          </ns0:EarningsType>
          <ns0:Earnings>
            <xsl:value-of select="ns0:Earnings"/>
          </ns0:Earnings>
        </xsl:otherwise>
      </xsl:choose>
    </ns0:Payroll>
  </xsl:for-each-group>
</ns0:PayrollGroup>
</xsl:template>

</xsl:stylesheet>

需要性能更好的XSLT

输入XML

输出XML

1 个答案: