按变量列表执行滚动求和的优雅、快速方法

时间:2014-03-25 12:20:09

标签: r variables data.table plyr

有没有人开发出一种优雅、快捷的方式来按日期计算滚动总和?例如,如果我想按 Cust_ID 分组,为以下数据集计算 180 天的滚动总计,有没有办法更快地完成(比如使用 data.table 中的某些功能)?我一直在使用下面的示例代码来计算滚动总和,但我担心它效率低下。

library("zoo")
library("plyr")
library("lubridate")

## Build the sample columns: three customers with twelve transactions each
set.seed(1)
# Customers 2 and 3 reuse customer 1's day offsets shifted by one and two days;
# outer() lays the three shifted copies out back to back.
Trans_Dates <- as.Date("2010-01-01") +
  as.vector(outer(c(31, 33, 65, 96, 150, 187, 210, 212, 240, 273, 293, 320),
                  c(0, 1, 2), "+"))
Cust_ID <- rep(c(1, 2, 3), each = 12)
Target <- rpois(n = 36, lambda = 3)

## Assemble the three columns into a single data frame
Example.Data <- data.frame(Trans_Dates = Trans_Dates,
                           Cust_ID = Cust_ID,
                           Target = Target)

## Add a 180-day rolling sum of Target, computed separately for each customer
Example.Data2 <- ddply(Example.Data, .(Cust_ID), function(cust_rows) {
  adply(cust_rows, 1, function(current_row) {
    # The window is (current date - 180 days, current date], restricted to
    # this customer's rows.
    window_end <- current_row$Trans_Dates
    window_start <- as.Date(window_end) - 180
    in_window <- cust_rows$Trans_Dates > window_start &
      cust_rows$Trans_Dates <= window_end
    # which() drops NA comparisons, matching what subset() does
    data.frame(Target_Running_Total = sum(cust_rows$Target[which(in_window)]))
  })
})

# Show the augmented data
Example.Data2

3 个答案:

答案 0 :(得分:2)

假设您的各个分组大致平衡,那么我猜测 expand.grid 与 ave 的组合会非常快(您必须对自己的数据进行基准测试才能确定)。我使用 expand.grid 来补齐缺失的日期,这样就可以直接用 cumsum 得到累计和,然后用 head 减去最近 180 天之前的所有累计值。

- 另外有一个问题(请教更熟练的 R 用户):为什么我的 identical 调用总是失败? -

我建立了相同的数据。

## Length of the rolling window in days; the current day is part of the window.
window_days <- 180

## Build every (calendar day, customer) combination so that, after the merge,
## row position within a customer corresponds to day position.
full <- expand.grid(
  seq(from = min(Example.Data$Trans_Dates),
      to = max(Example.Data$Trans_Dates), by = 1),
  unique(Example.Data$Cust_ID)
)
Example.Data3 <- merge(Example.Data, full,
                       by.x = c("Trans_Dates", "Cust_ID"),
                       by.y = c("Var1", "Var2"), all = TRUE)
Example.Data3 <- Example.Data3[with(Example.Data3, order(Cust_ID, Trans_Dates)), ]

## Filler days (no transaction) contribute zero to the sums
Example.Data3$Target.New <- ifelse(is.na(Example.Data3$Target), 0, Example.Data3$Target)

## Rolling sum = cumulative sum minus the cumulative sum `window_days` rows
## earlier. The zero padding is capped at the group length so the subtraction
## stays aligned even when the filled date range is shorter than the window.
Example.Data3$Target_Running_Total <- ave(
  Example.Data3$Target.New, Example.Data3$Cust_ID,
  FUN = function(x) {
    running <- cumsum(x)
    lagged <- c(rep(0, min(window_days, length(x))), head(running, -window_days))
    running - lagged
  }
)

## Drop the helper column and the filler rows (those have NA in Target)
Example.Data3$Target.New <- NULL
Example.Data3 <- Example.Data3[complete.cases(Example.Data3), ]
row.names(Example.Data3) <- seq_len(nrow(Example.Data3))
Example.Data3

## Compare against the plyr result.
## identical() is FALSE here even though the values agree: it also compares
## storage type, and this column is double (the literal 0 in ifelse()/rep()
## promotes it) whereas summing the integer rpois() draws yields integers.
identical(Example.Data2$Target_Running_Total, Example.Data3$Target_Running_Total)
## all.equal() compares the numeric values with a tolerance and ignores the
## integer/double distinction, so it is the appropriate check.
isTRUE(all.equal(Example.Data2$Target_Running_Total, Example.Data3$Target_Running_Total))
sum(Example.Data2$Target_Running_Total - Example.Data3$Target_Running_Total)
(Example.Data2$Target_Running_Total - Example.Data3$Target_Running_Total)

产生以下结果。

> (Example.Data2$Target_Running_Total - Example.Data3$Target_Running_Total) 
 [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

答案 1 :(得分:1)

我想我偶然发现了一个相当有效的答案。

## Recreate the sample columns used in the question
set.seed(1)
# Customer 1's day offsets repeated three times, shifted by 0, 1 and 2 days
Trans_Dates <- as.Date(
  rep(c(31, 33, 65, 96, 150, 187, 210, 212, 240, 273, 293, 320), times = 3) +
    rep(c(0, 1, 2), each = 12),
  origin = "2010-01-01"
)
Cust_ID <- rep(c(1, 2, 3), times = c(12, 12, 12))
Target <- rpois(36, lambda = 3)

## Make simulated data into a data.table
library(data.table)
data <- data.table(Cust_ID, Trans_Dates, Target)

## Length of the rolling window in days. The window is
## (date - window_days, date], i.e. the same strict lower bound the question
## uses (Trans_Dates > date - 180), so a transaction exactly window_days
## earlier is excluded.
window_days <- 180

## Give each customer a sequential group number; it is used below as the
## row index into Ref
data[, Cust_No := .GRP, by = "Cust_ID"]

## One row per customer holding, as list columns, all of that customer's
## targets and transaction dates
Ref <- data[, list(Compare_Value = list(I(Target)),
                   Compare_Date = list(I(Trans_Dates))),
            by = "Cust_No"]

## For each transaction, sum the same customer's targets whose date falls
## inside the window ending on that transaction's date. `:=` adds the column
## by reference instead of copying the table as `data$Roll.Val <-` would.
data[, Roll.Val := mapply(FUN = function(RD, NUM) {
  # Day offset of each of this customer's transactions from the current one
  d <- as.numeric(Ref$Compare_Date[[NUM]] - RD)
  sum((d <= 0 & d > -window_days) * Ref$Compare_Value[[NUM]])
}, RD = Trans_Dates, NUM = Cust_No)]

## Print out data
data <- data[, list(Cust_ID, Trans_Dates, Target, Roll.Val)][order(Cust_ID, Trans_Dates)]
data

答案 2 :(得分:1)

library(data.table)

set.seed(1)

# Sample transactions: three customers with twelve dates each. Customers 2
# and 3 reuse customer 1's day offsets shifted by one and two days.
data <- data.table(
  Cust_ID = rep(c(1, 2, 3), each = 12),
  Trans_Dates = as.Date("2010-01-01") +
    as.vector(outer(c(31, 33, 65, 96, 150, 187, 210, 212, 240, 273, 293, 320),
                    c(0, 1, 2), "+")),
  Target = rpois(n = 36, lambda = 3)
)

# Length of the rolling window in days. The window is
# (date - window_days, date], the same strict lower bound the question uses
# (Trans_Dates > date - 180), so a transaction exactly window_days earlier is
# excluded.
window_days <- 180

# For each (date, customer) group, sum Target over that customer's rows whose
# date falls inside the window ending on the group's date.
data[, RollingSum := {
         # Day offset of every row in the table from this group's date
         d <- as.numeric(data$Trans_Dates - Trans_Dates)
         sum(data$Target[Cust_ID == data$Cust_ID & d <= 0 & d > -window_days])
       },
     by = list(Trans_Dates, Cust_ID)]