data.table: subset of observations within 24 hours of the most recent observation, per group

Time: 2016-07-29 19:53:25

Tags: r data.table data-manipulation

Suppose I have a data.table like the following:

dt = data.table(group = c(1,1,1,2,2,2,3,3,3),time = c("2016-03-09T08:31:00-05:00","2016-03-08T11:31:00-05:00","2016-03-06T08:31:00-05:00",
                                                "2016-04-04T23:28:00-04:00","2016-04-10T23:28:00-04:00","2016-04-09T23:28:00-04:00",
                                                "2016-05-11T19:52:00-04:00","2016-05-10T20:52:00-04:00","2016-04-11T19:52:00-04:00"))

dt
   group                      time
1:     1 2016-03-09T08:31:00-05:00
2:     1 2016-03-08T11:31:00-05:00
3:     1 2016-03-06T08:31:00-05:00
4:     2 2016-04-04T23:28:00-04:00
5:     2 2016-04-10T23:28:00-04:00
6:     2 2016-04-09T23:28:00-04:00
7:     3 2016-05-11T19:52:00-04:00
8:     3 2016-05-10T20:52:00-04:00
9:     3 2016-04-11T19:52:00-04:00

For each group in this data.table, I only want to keep the observations that fall within 24 hours of the most recent observation. I have hacked together an ugly solution for this, but it is not as fast as I need it to be on large datasets.

library(lubridate)
set(dt,j = "time",value = ymd_hms(dt[["time"]]))
dt[,.(mostRecent = max(time),time),by = group][
  time > (mostRecent - days(1)),.(group,time)]

   group                time
1:     1 2016-03-09 13:31:00
2:     1 2016-03-08 16:31:00
3:     2 2016-04-11 03:28:00
4:     3 2016-05-11 23:52:00
5:     3 2016-05-11 00:52:00
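A side note on the printed times, in case the shift looks surprising: lubridate::ymd_hms() parses the UTC offsets and returns the instants in UTC, so 08:31:00-05:00 displays as 13:31:00 above; the filtering itself is unaffected, since the comparisons operate on the underlying instants. For example:

ymd_hms("2016-03-09T08:31:00-05:00")
# [1] "2016-03-09 13:31:00 UTC"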

Does anyone have tips on how to accomplish this more elegantly / faster?

3 Answers:

Answer 0 (score: 4)

First, put the thresholds in a table:

thresh_dt = dt[, .(time = max(time)), by=group][, time := time - 24*60*60][]

The max is taken separately from the subtraction of a day's worth of seconds in order to take advantage of the "GForce" optimized max. See also ?datatable.optimize.
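If you want to confirm that GForce actually kicks in, the verbose option prints the optimized j; a quick check (the exact message wording may differ between data.table versions):

options(datatable.verbose = TRUE)
dt[, .(time = max(time)), by = group]   # look for a "GForce optimized j" line in the console output
options(datatable.verbose = FALSE)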

Next, do a rolling or non-equi join:

thresh_dt[dt, on=c("group", "time"), roll=TRUE, nomatch=0]

# or, on data.table 1.9.7+
thresh_dt[dt, on=.(group, time <= time), nomatch=0]

   group                time
1:     1 2016-03-09 13:31:00
2:     1 2016-03-08 16:31:00
3:     2 2016-04-11 03:28:00
4:     2 2016-04-10 03:28:00
5:     3 2016-05-11 23:52:00
6:     3 2016-05-11 00:52:00
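Optional sanity check, as a sketch: the rolling join is equivalent to a direct per-group ">=" filter (it keeps an observation sitting exactly 24 hours before the group max, which is why group 2 has one more row here than in the question's output, where ">" was used):

joined = thresh_dt[dt, on = c("group", "time"), roll = TRUE, nomatch = 0]
direct = dt[, .SD[time >= max(time) - 24*60*60], by = group]
all.equal(joined, direct, check.attributes = FALSE)   # expect TRUE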

Benchmarks. The advantage of the GForce max and the rolling join only shows up once you have enough groups. My example data extends @sbstn's so that the number of groups is a parameter:

N = 5e6
ng = 1e5

all_times = seq(from = as.POSIXct('2016-01-01 10:00:00'),
                to = as.POSIXct('2016-06-30 10:00:00'),
                by = 60)
all_times_int = as.integer(all_times)    
idx = sample(seq.int(length(all_times)), N, replace = TRUE)
dt = data.table(group = sample(ng, N, replace = TRUE),
                time = all_times[idx],
                time_int = all_times_int[idx])

# sbstn, no gmax
system.time({
  dt[, cutoff_time := max(time) - 24*60*60, by = group]
  dt[time >= cutoff_time]
})
#    user  system elapsed 
#    8.50    0.01    8.47 

# sbstn, with gmax
system.time({
  dt[, maxtime := max(time), by = group][, cutoff_time := maxtime - 24*60*60]
  dt[time >= cutoff_time]
})
#    user  system elapsed 
#    4.98    0.01    4.99

# gmax and roll
system.time({
  thresh_dt = dt[, .(time = max(time)), by=group][, time := time - 24*60*60]
  thresh_dt[dt, on=c("group", "time"), roll=TRUE, nomatch=0][, list(group, time)]
})
#    user  system elapsed 
#    1.29    0.06    1.36 
# (Caveat: I didn't verify that these results match.)

My answer groups the rows twice (once to compute the max, and again to join back to the original data). Clayton Stanley's answer also gets fast by boiling it down to a single grouping operation (at least I think that's what it's doing):

system.time(dt[order(group, -time)
    ][, groupP := shift(group, type='lag')
    ][, head := is.na(groupP) | group != groupP
    ][, copy(.SD)[.SD[head == T], rTime := i.time, on=c(group='group')]
    ][time > (rTime - 24*60*60)
    ][, .(group, time)
    ][order(group, -time)
    ])
#    user  system elapsed 
#    1.32    0.25    1.14 

Answer 1 (score: 4)

A simple solution that creates a cutoff time for each group (assuming time is already converted):

dt[, cutoff_time := max(time) - 24*60*60, by = group]
dt[time > cutoff_time]
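For reference, the same filter can also be written as a single grouped expression; this sketch is more compact but typically slower on big data, since it materializes .SD for every group (it is not part of the benchmarks below):

dt[, .SD[time > max(time) - 24*60*60], by = group]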

EDIT

The comment about the "GForce optimized max" made me curious, so I created some bigger fake data to compare speeds. Note that integer plays nicely with both max and >=:
require(data.table)

require(microbenchmark)

N = 100000
N_g = 100

all_times = seq(from = as.POSIXct('2016-01-01 10:00:00'),
                to = as.POSIXct('2016-06-30 10:00:00'),
                by = 60)

all_times_int = as.integer(all_times)

idx = sample(seq.int(length(all_times)), N, replace = TRUE)

dt = data.table(group = sample(seq.int(N_g), N, replace = TRUE),
                time = all_times[idx],
                time_int = all_times_int[idx])

f1a = function (x) {
  x[, cutoff_time := max(time) - 24*60*60, by = group]
  x[time >= cutoff_time, list(group, time)]
}

f1b = function (x) {
  x[, cutoff_time := max(time_int) - 24*60*60, by = group]
  x[time_int >= cutoff_time, list(group, time)]
}

f2 = function (x) {
  thresh_dt = x[, .(time = max(time)), by=group][, time := time - 24*60*60]
  thresh_dt[x, on=c("group", "time"), roll=TRUE, nomatch=0][, list(group, time)]
}

microbenchmark(f1a(dt),
               f1b(dt),
               f2(dt))

Unit: milliseconds
   expr       min        lq      mean   median        uq       max neval
 f1a(dt)  9.842106 10.593243 11.593148 11.62311 12.478853 14.335338   100
 f1b(dt)  3.391178  3.763598  4.403264  4.00142  5.018182  8.335717   100
  f2(dt) 14.422669 15.701397 17.090674 16.56990 17.695653 52.926897   100

identical(f1a(dt), f1b(dt)) # TRUE
identical(f1a(dt), f2(dt)) # TRUE

编辑2: 还有一个N = 1,000,000N_g = 10,000组:

> microbenchmark(f1a(dt),
+                f1b(dt),
+                f2(dt),
+                times = 10)
Unit: milliseconds
    expr       min       lq      mean    median        uq      max neval
 f1a(dt) 634.91473 647.5662 670.74597 663.28238 694.29595 728.2481    10
 f1b(dt)  64.61488  67.3692  76.68925  68.42335  72.36862 113.1407    10
  f2(dt) 205.67688 208.6491 229.65610 213.59476 249.16703 278.7713    10

> microbenchmark(f1a(dt),
+                f1b(dt),
+                f2(dt),
+                times = 10)
Unit: milliseconds
    expr       min        lq     mean    median        uq       max neval
 f1a(dt) 620.11090 624.33587 645.0220 642.13648 657.74347 697.27674    10
 f1b(dt)  64.80214  67.43851  67.9140  67.99647  68.63552  69.74466    10
  f2(dt) 198.39200 199.56088 209.6908 204.60183 216.23255 241.76792    10

> microbenchmark(f1a(dt),
+                f1b(dt),
+                f2(dt),
+                times = 10)
Unit: milliseconds
    expr      min        lq      mean    median        uq      max neval
 f1a(dt) 619.2903 645.22617 656.58883 660.99508 664.82678 682.7618    10
 f1b(dt)  63.2454  67.31781  72.10255  68.19679  71.91441 106.7493    10
  f2(dt) 195.9335 210.06171 222.19868 215.75979 241.74100 245.9022    10

Answer 2 (score: 3)

Likely the bottleneck is the max(*) call, computed for every group. If that's the case:

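A sketch of that single-grouping-pass idea, adapted from the version benchmarked in the first answer above (the groupP, head and rTime helper columns are just working names):

dt[order(group, -time)                   # most recent observation first within each group
  ][, groupP := shift(group, type = 'lag')
  ][, head := is.na(groupP) | group != groupP          # TRUE on each group's most recent row
  ][, copy(.SD)[.SD[head == TRUE], rTime := i.time, on = c(group = 'group')]  # spread that row's time to the whole group
  ][time > (rTime - 24*60*60)            # keep the trailing 24 hours
  ][, .(group, time)]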