data.table子集,具有不规则的日期范围边界

时间:2014-03-11 21:44:42

标签: r data.table

我在此data.table ....

中组织的每种工具都有不同的期间开始
  

取值

   Instrument       Date
1:        DBC 2006-02-08
2:        IEF 2002-08-02
3:        VEU 2007-03-10
4:        VNQ 2004-10-03
5:        VTI 2001-06-17

和另一个data.table .....

  

DT

   Instrument       Date   Open   High    Low  Close  Volume Adjusted
 1:        DBC 2006-02-06  24.55  24.66  24.10  24.20  771500    22.69
 2:        DBC 2006-02-07  24.05  24.20  23.45  23.50  674400    22.04
 3:        DBC 2006-02-08  23.49  23.67  23.33  23.40  335400    21.94
 4:        DBC 2006-02-09  23.56  23.82  23.51  23.62  705900    22.15
 5:        DBC 2006-02-10  23.65  23.69  23.10  23.19  276700    21.75
 6:        IEF 2002-07-31  82.05  82.58  82.05  82.52   32600    55.22
 7:        IEF 2002-08-01  82.54  82.90  82.52  82.86   71400    55.44
 8:        IEF 2002-08-02  83.02  83.70  82.90  83.50  120300    55.87
 9:        IEF 2002-08-05  83.68  83.92  83.53  83.92  159300    56.15
10:        IEF 2002-08-06  83.51  83.51  83.03  83.24   90100    55.70
11:        VEU 2007-03-08  50.71  51.10  50.71  50.96  113600    43.05
12:        VEU 2007-03-09  51.15  51.36  51.02  51.30  139100    43.34
13:        VEU 2007-03-12  51.25  51.59  51.20  51.54  108000    43.54
14:        VEU 2007-03-13  51.20  51.23  50.17  50.19  216900    42.40
15:        VEU 2007-03-14  50.00  50.26  49.39  50.25  154900    42.45
16:        VNQ 2004-10-01  50.35  51.25  50.25  51.18  129800    32.84
17:        VNQ 2004-10-04  51.39  51.55  51.39  51.40    8100    32.98
18:        VNQ 2004-10-05  51.40  51.50  51.35  51.39   11900    32.97
19:        VNQ 2004-10-06  51.50  51.63  51.49  51.63    9300    33.12
20:        VNQ 2004-10-07  51.65  51.65  51.10  51.10    7900    32.78
21:        VTI 2001-06-15 110.85 112.01 110.35 111.33 1067400    44.58
22:        VTI 2001-06-18 111.63 111.83 110.62 110.62  282600    44.29
23:        VTI 2001-06-19 112.30 112.33 110.47 110.93 1777600    44.42
24:        VTI 2001-06-20 110.75 112.15 110.75 112.10  476000    44.88
25:        VTI 2001-06-21 112.00 113.45 111.75 113.00  240400    45.24
    Instrument       Date   Open   High    Low  Close  Volume Adjusted

...,我希望将 Date S作为每个乐器的起点进行分组:


可重复数据:

S <- structure(list(Instrument = c("DBC", "IEF", "VEU", "VNQ", "VTI"), Date = structure(c(13187, 11901, 13582, 12694, 11490), class = "Date")), row.names     =c(NA, -5L), class = c("data.table", "data.frame"), .Names = c("Instrument", "Date"), sorted = c("Instrument", "Date"))

DT <- structure(list(Instrument = c("DBC", "DBC", "DBC", "DBC", "DBC", "IEF", "IEF", "IEF", "IEF", "IEF", "VEU", "VEU", "VEU", "VEU", "VEU", "VNQ", "VNQ", "VNQ", "VNQ", "VNQ", "VTI", "VTI", "VTI", "VTI", "VTI"), Date = structure(c(13185, 13186, 13187, 13188, 13189, 11899,11900, 11901, 11904, 11905, 13580, 13581, 13584, 13585, 13586, 12692, 12695, 12696, 12697, 12698, 11488, 11491, 11492, 11493, 11494), class = c("IDate", "Date")), Open = c(24.55, 24.05, 23.49, 23.56, 23.65, 82.05, 82.54, 83.02, 83.68, 83.51, 50.71, 51.15, 51.25,51.2, 50, 50.35, 51.39, 51.4, 51.5, 51.65, 110.85, 111.63, 112.3, 110.75, 112), High = c(24.66, 24.2, 23.67, 23.82, 23.69, 82.58, 82.9, 83.7, 83.92, 83.51, 51.1, 51.36, 51.59,51.23, 50.26, 51.25, 51.55, 51.5, 51.63, 51.65, 112.01, 111.83, 112.33, 112.15, 113.45), Low = c(24.1, 23.45, 23.33, 23.51, 23.1, 82.05, 82.52, 82.9, 83.53, 83.03, 50.71, 51.02, 51.2, 50.17, 49.39, 50.25, 51.39, 51.35, 51.49, 51.1, 110.35, 110.62, 110.47, 110.75, 111.75), Close = c(24.2, 23.5, 23.4, 23.62, 23.19, 82.52, 82.86, 83.5, 83.92, 83.24, 50.96, 51.3, 51.54, 50.19, 50.25, 51.18, 51.4, 51.39, 51.63, 51.1, 111.33, 110.62, 110.93, 112.1, 113), Volume = c(771500, 674400, 335400, 705900, 276700, 32600, 71400, 120300, 159300, 90100, 113600, 139100, 108000, 216900, 154900, 129800, 8100, 11900, 9300, 7900,1067400, 282600, 1777600, 476000, 240400), Adjusted = c(22.69, 22.04, 21.94, 22.15, 21.75, 55.22, 55.44, 55.87, 56.15, 55.7, 43.05, 43.34, 43.54, 42.4, 42.45, 32.84, 32.98, 32.97, 33.12, 32.78, 44.58, 44.29, 44.42, 44.88, 45.24)), .Names = c("Instrument", "Date", "Open", "High", "Low", "Close", "Volume", "Adjusted"), row.names = c(NA, -25L), class = c("data.table", "data.frame"), sorted = c("Instrument", "Date"))

2 个答案:

答案 0 :(得分:2)

您可以合并2套,然后按仪器过滤:

setkey(S,Instrument)
setkey(DT,Instrument)
merge(DT,S)[,.SD[Date.x>=Date.y],Instrument]

  Instrument     Date.x   Open   High    Low  Close  Volume Adjusted     Date.y
 1:        DBC 2006-02-08  23.49  23.67  23.33  23.40  335400    21.94 2006-02-08
 2:        DBC 2006-02-09  23.56  23.82  23.51  23.62  705900    22.15 2006-02-08
 3:        DBC 2006-02-10  23.65  23.69  23.10  23.19  276700    21.75 2006-02-08
 4:        IEF 2002-08-02  83.02  83.70  82.90  83.50  120300    55.87 2002-08-02
 5:        IEF 2002-08-05  83.68  83.92  83.53  83.92  159300    56.15 2002-08-02
 6:        IEF 2002-08-06  83.51  83.51  83.03  83.24   90100    55.70 2002-08-02
 7:        VEU 2007-03-12  51.25  51.59  51.20  51.54  108000    43.54 2007-03-10
 8:        VEU 2007-03-13  51.20  51.23  50.17  50.19  216900    42.40 2007-03-10
 9:        VEU 2007-03-14  50.00  50.26  49.39  50.25  154900    42.45 2007-03-10
10:        VNQ 2004-10-04  51.39  51.55  51.39  51.40    8100    32.98 2004-10-03
11:        VNQ 2004-10-05  51.40  51.50  51.35  51.39   11900    32.97 2004-10-03
12:        VNQ 2004-10-06  51.50  51.63  51.49  51.63    9300    33.12 2004-10-03
13:        VNQ 2004-10-07  51.65  51.65  51.10  51.10    7900    32.78 2004-10-03
14:        VTI 2001-06-18 111.63 111.83 110.62 110.62  282600    44.29 2001-06-17
15:        VTI 2001-06-19 112.30 112.33 110.47 110.93 1777600    44.42 2001-06-17
16:        VTI 2001-06-20 110.75 112.15 110.75 112.10  476000    44.88 2001-06-17
17:        VTI 2001-06-21 112.00 113.45 111.75 113.00  240400    45.24 2001-06-17

答案 1 :(得分:2)

以下是@agstudy和@Arun

提供的三种解决方案的总结和基准测试
  1. 合并(DT,S)[,. SD [Date.x&GT = Date.y],文书]
  2. DT [S,.SD [日期&gt; = i.Date]]
  3. DT [S] [日期和GT = Date.1]
  4. <强>基准

    require(rbenchmark)
    setkey(DT, Instrument, Date) # initial condition
    setkey(S, Instrument, Date) # initial condition
    
    agstudy <- function() {
      setkey(DT, Instrument)
      setkey(S, Instrument)
    
      out <- merge(DT,S)[,.SD[Date.x>=Date.y],Instrument]
    
      out[,Date.y:=NULL]
      setnames(out,"Date.x", "Date")
      setkey(out, Instrument, Date)
      return(out)
    }
    
    Arun <- function(){
      setkey(S, Instrument)
    
      out <- DT[S, .SD[Date >= i.Date]]
    
      setkey(out, Instrument, Date)
      return(out)
    }
    
    DanielKrizian <- function(){
        setkey(S, Instrument)
    
        out <- DT[S][Date>=Date.1]
    
        setkey(out, Instrument, Date)
        out[,Date.1:=NULL]
        return(out)
    }
    
    setkey(DT, Instrument, Date) # initial condition
    setkey(S, Instrument, Date) # initial condition
    benchmark(ag <- agstudy(),replications=1000)
    
    setkey(DT, Instrument, Date) 
    setkey(S, Instrument, Date)
    benchmark(ar <- Arun(),replications=1000)
    
    setkey(DT, Instrument, Date)
    setkey(S, Instrument, Date)
    benchmark(dk <- DanielKrizian(),replications=1000)
    

    <强>输出

               test replications elapsed relative user.self sys.self user.child sys.child
    ag <- agstudy()         1000    9.39        1      8.55     0.07         NA        NA
    ar <- Arun()            1000    7.16        1      6.53     0.07         NA        NA
    dk <- DanielKrizian()   1000    2.95        1       2.7        0         NA        NA
    

    检查

    all(identical(ag, ar),identical(ag,dk),identical(ar,dk))
    [1]TRUE