Question

我有两个问题。首先，您建议阅读哪些资源来提高数据处理能力？我最近处理的数据集越来越大，一直在努力适应——感觉自己撞上了一堵墙，不知道该从哪里入手（许多在线资源还没有打好基础就变得过于复杂）。

例如，我正在尝试解决下面这个问题。我有一个数百万行的数据框（df），想对它进行简化并分析趋势，下面给出了一个 dput 示例。我想按每个 ID 分别取出每一年的最小值（某些 ID 包含的年份比其他 ID 多）。简化数据之后，我还想添加一列百分比变化。由于这是一个跨度 20 多年的时间序列，我暂时可以忽略月份，因为用某一年的最小值与另一年的最小值相比，应该能得到合理的百分比变化。

谢谢！

输入：

# Sample input as produced by dput(): a 26-row data.frame with columns
# ID (factor, levels "a" and "b"), Date (factor of m/d/yyyy strings),
# Year (integer) and Value (numeric). The expression is not assigned to a
# name here; assign it (e.g. `df1 <- structure(...)`) before using it.
structure(list(ID = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L), .Label = c("a", "b"), class = "factor"), Date = structure(c(1L, 
2L, 3L, 4L, 5L, 6L, 10L, 12L, 14L, 7L, 8L, 9L, 11L, 13L, 5L, 
6L, 10L, 12L, 14L, 7L, 8L, 9L, 11L, 13L, 15L, 16L), .Label = c("2/21/2009", 
"2/22/2009", "2/23/2009", "2/24/2009", "2/25/2009", "2/26/2009", 
"3/2/2011", "3/3/2011", "3/4/2011", "3/5/2010", "3/5/2011", "3/6/2010", 
"3/6/2011", "3/7/2010", "3/7/2011", "3/8/2011"), class = "factor"), 
    Year = c(2009L, 2009L, 2009L, 2009L, 2009L, 2009L, 2010L, 
    2010L, 2010L, 2011L, 2011L, 2011L, 2011L, 2011L, 2009L, 2009L, 
    2010L, 2010L, 2010L, 2011L, 2011L, 2011L, 2011L, 2011L, 2011L, 
    2011L), Value = c(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 
    20, 21, 22, 5, 6, 7, 8, 8, 9, 10, 11, 12, 15, 23, 25, 27)), .Names = c("ID", 
"Date", "Year", "Value"), class = "data.frame", row.names = c(NA, 
-26L))

预期产出：

# Expected result as dput() output: a 6-row data.frame, one row per ID/Year
# combination, holding the row with the smallest Value for that pair plus a
# Percent.Increase column (NA for the first year of each ID).
structure(list(ID = structure(c(1L, 1L, 1L, 2L, 2L, 2L), .Label = c("a", 
"b"), class = "factor"), Date = structure(c(1L, 4L, 5L, 2L, 4L, 
3L), .Label = c("2/21/2009", "2/25/2009", "3/2/2011", "3/5/2010", 
"3/6/2011"), class = "factor"), Year = c(2009L, 2010L, 2011L, 
2009L, 2010L, 2011L), Value = c(10, 16, 5, 6, 8, 10), Percent.Increase = c(NA, 
0.6, -0.6875, NA, 0.333333333, 0.25)), .Names = c("ID", "Date", 
"Year", "Value", "Percent.Increase"), class = "data.frame", row.names = c(NA, 
-6L))

Answer 1

按 `ID` 和 `Year` 分组后，我们用 `slice` 取出每组中 `Value` 最小的那一行；然后再按 `ID` 分组，用 `Value` 减去 `Value` 的 `lag`（即上一行的值），再除以该 `lag`，从而创建 `Percent.Increase` 列。

# Keep, for every ID/Year pair, the single row with the smallest Value, then
# compute the relative change of that minimum against the previous row of the
# same ID. `df1` is expected to be the input data frame with columns ID, Year
# and Value.
library(dplyr) # provides %>%, group_by(), slice(), mutate(), ungroup(), lag()

res <- df1 %>%
  group_by(ID, Year) %>%
  # which.min() returns the first minimum, so ties still yield one row
  slice(which.min(Value)) %>%
  group_by(ID) %>%
  # dplyr::lag() is spelled out: stats::lag() does not shift a plain vector
  mutate(
    Percent.Increase = (Value - dplyr::lag(Value)) / dplyr::lag(Value)
  ) %>%
  # drop the grouping so later verbs do not silently operate per ID
  ungroup()

Answer 2

在 data.table 实现 HAVING 子句之前，下面这种写法似乎是相当高效的方式：

# Benchmark: dplyr versus data.table for "row with the minimum Value per
# ID/Year, then relative change within ID" on simulated data.
library(dplyr)
library(data.table)

N <- 5e7
set.seed(1)
df <- data.frame(
  ID = sample(2L, size = N, replace = TRUE),
  Date = sample(16L, size = N, replace = TRUE),
  Year = sample(2009:2011, size = N, replace = TRUE),
  Value = sample(N / 10, size = N, replace = TRUE)
)
dt <- as.data.table(df)

# dplyr variant
system.time({
  res <- df %>%
    group_by(ID, Year) %>%
    slice(which.min(Value)) %>%
    group_by(ID) %>%
    mutate(Percent_Increase = (Value - lag(Value)) / lag(Value))
})
#   user  system elapsed
#  1.676   2.176   3.847

# data.table variant: the inner call collects the row index (.I) of the
# minimum per ID/Year (keyby also sorts the groups); the outer call subsets
# those rows and adds the new column by reference within each ID.
system.time({
  r <- dt[
    dt[, .I[which.min(Value)], keyby = .(ID, Year)]$V1
  ][
    , Percent_Increase := {
      previous <- shift(Value)
      (Value - previous) / previous
    },
    by = .(ID)
  ]
})
#   user  system elapsed
#  0.940   0.460   1.334

# Both approaches should agree, ignoring row/column order and attributes.
all.equal(
  r,
  as.data.table(res),
  ignore.col.order = TRUE,
  check.attributes = FALSE,
  ignore.row.order = TRUE
)
#[1] TRUE

在 5e7 行数据上检查耗时。

{{1}}

按年简化数据框并计算百分比变化

2 个答案: