Question

我想计算多个日期变量之间的成对平均天数和中位数天数。

我的原始数据df可能如下所示：

id     invitation    account_date   first_order    second_order    third_order
1    1/1/2016      1/7/2016       1/20/2016      1/22/2016        NA
2    1/1/2016      1/8/2016       1/22/2016      1/23/2016        1/25/2016
3    1/1/2016      1/5/2016       1/20/2016      2/1/2016         NA
4    1/1/2016      1/2/2016       1/18/2016      2/4/2016         2/6/2016

由于我的数据已经正确格式化为日期，只要先算出各对日期之间的天数差，就很容易手动计算这些日期组合的平均差值和中位数差值，例如：

id     inv_to_act act_to_first    act_to_sec    act_to_third
1      6          13              2             NA
2      7          14              1             2
3      4          15              12            NA
4      1          16              17            2

然后使用 base R 计算，例如：mean(df$act_to_first,na.rm=T)。

但是，我想在多个数据集或同一数据集的若干子集上进行这些计算，因此一遍又一遍地手动执行每个步骤是不可扩展的。另外，我非常确定一定存在基于 melt 或 plyr 的解决方案，只是我还没有想到。

Answer 1

您可以遍历每一对日期列，并使用 difftime 来计算每对日期之间的天数差：

# Enumerate every pair of date columns; column 1 (id) is not a date.
combos <- combn(names(df)[-1], 2)
# One column of day-differences per pair: second column of the pair minus
# the first. apply() flattens the difftime results into a numeric matrix.
diffs <- apply(combos, 2, function(pair) {
  start <- df[, pair[1]]
  end <- df[, pair[2]]
  difftime(end, start, units = "days")
})
# Label each column "<from>_TO_<to>" so the pair it describes is visible.
colnames(diffs) <- paste(combos[1, ], combos[2, ], sep = "_TO_")
diffs
#      invitation_TO_account_date invitation_TO_first_order invitation_TO_second_order invitation_TO_third_order account_date_TO_first_order
# [1,]                          6                        19                         21                        NA                          13
# [2,]                          7                        21                         22                        24                          14
# [3,]                          4                        19                         31                        NA                          15
# [4,]                          1                        17                         34                        36                          16
#      account_date_TO_second_order account_date_TO_third_order first_order_TO_second_order first_order_TO_third_order second_order_TO_third_order
# [1,]                           15                          NA                           2                         NA                          NA
# [2,]                           15                          17                           1                          3                           2
# [3,]                           27                          NA                          12                         NA                          NA
# [4,]                           33                          35                          17                         19                           2

执行该步骤后，您应该能够轻松计算每列的平均值：

# Average each pair's day-differences over the rows; na.rm=TRUE ignores the
# NA entries in diffs instead of letting them turn the whole mean into NA.
colMeans(diffs, na.rm=TRUE)
#   invitation_TO_account_date    invitation_TO_first_order   invitation_TO_second_order    invitation_TO_third_order  account_date_TO_first_order 
#                          4.5                         19.0                         27.0                         30.0                         14.5 
# account_date_TO_second_order  account_date_TO_third_order  first_order_TO_second_order   first_order_TO_third_order  second_order_TO_third_order 
#                         22.5                         26.0                          8.0                         11.0                          2.0

有了这些步骤之后，您可以把它们封装进一个函数，并将该函数应用于任意输入的 df：

#' Mean number of days between every pair of date columns
#'
#' The first column of `df` is skipped (treated as an identifier); every
#' remaining column is treated as a date/date-time. For each pair of those
#' columns, taken in column order, the second column minus the first is
#' computed in days and averaged over the rows, ignoring `NA` differences.
#'
#' @param df A data frame whose first column is an id and whose other
#'   columns hold values accepted by `difftime()`.
#' @return A named numeric vector with one element per column pair, named
#'   `<first>_TO_<second>`. A pair with no non-missing difference yields
#'   `NaN`.
meanDateRanges <- function(df) {
  date_cols <- tail(names(df), -1)
  if (length(date_cols) < 2L) {
    stop("`df` needs at least two date columns after the id column",
         call. = FALSE)
  }
  combos <- combn(date_cols, 2)
  n_rows <- nrow(df)
  # `[[` always extracts the column itself (also for tibbles), and vapply()
  # pins the result type, unlike apply() whose shape depends on the input.
  diffs <- vapply(
    seq_len(ncol(combos)),
    function(j) {
      as.numeric(
        difftime(df[[combos[2, j]]], df[[combos[1, j]]], units = "days")
      )
    },
    numeric(n_rows)
  )
  # With a single row vapply() returns a plain vector; force the matrix
  # shape so `colnames<-` and colMeans() work for any number of rows.
  diffs <- matrix(diffs, nrow = n_rows, ncol = ncol(combos))
  colnames(diffs) <- paste0(combos[1, ], "_TO_", combos[2, ])
  colMeans(diffs, na.rm = TRUE)
}

您可以用 meanDateRanges(df) 对单个输入数据框运行此函数，也可以用 lapply(df.list, meanDateRanges) 对一个数据框列表逐一运行此函数。

数据：

# Example data as produced by dput(): a 4-row data frame with an integer `id`
# column followed by five POSIXlt columns (invitation, account_date,
# first_order, second_order, third_order). All dates fall in January or
# February 2016 (year = 116, mon = 0 or 1) with zone "EST"; third_order is
# missing (NA fields, isdst = -1, empty zone) for rows 1 and 3.
df <- structure(list(id = 1:4, invitation = structure(list(sec = c(0, 
0, 0, 0), min = c(0L, 0L, 0L, 0L), hour = c(0L, 0L, 0L, 0L), 
    mday = c(1L, 1L, 1L, 1L), mon = c(0L, 0L, 0L, 0L), year = c(116L, 
    116L, 116L, 116L), wday = c(5L, 5L, 5L, 5L), yday = c(0L, 
    0L, 0L, 0L), isdst = c(0L, 0L, 0L, 0L), zone = c("EST", "EST", 
    "EST", "EST"), gmtoff = c(NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_)), .Names = c("sec", "min", "hour", "mday", "mon", 
"year", "wday", "yday", "isdst", "zone", "gmtoff"), class = c("POSIXlt", 
"POSIXt")), account_date = structure(list(sec = c(0, 0, 0, 0), 
    min = c(0L, 0L, 0L, 0L), hour = c(0L, 0L, 0L, 0L), mday = c(7L, 
    8L, 5L, 2L), mon = c(0L, 0L, 0L, 0L), year = c(116L, 116L, 
    116L, 116L), wday = c(4L, 5L, 2L, 6L), yday = c(6L, 7L, 4L, 
    1L), isdst = c(0L, 0L, 0L, 0L), zone = c("EST", "EST", "EST", 
    "EST"), gmtoff = c(NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_)), .Names = c("sec", "min", "hour", "mday", "mon", 
"year", "wday", "yday", "isdst", "zone", "gmtoff"), class = c("POSIXlt", 
"POSIXt")), first_order = structure(list(sec = c(0, 0, 0, 0), 
    min = c(0L, 0L, 0L, 0L), hour = c(0L, 0L, 0L, 0L), mday = c(20L, 
    22L, 20L, 18L), mon = c(0L, 0L, 0L, 0L), year = c(116L, 116L, 
    116L, 116L), wday = c(3L, 5L, 3L, 1L), yday = c(19L, 21L, 
    19L, 17L), isdst = c(0L, 0L, 0L, 0L), zone = c("EST", "EST", 
    "EST", "EST"), gmtoff = c(NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_)), .Names = c("sec", "min", "hour", "mday", "mon", 
"year", "wday", "yday", "isdst", "zone", "gmtoff"), class = c("POSIXlt", 
"POSIXt")), second_order = structure(list(sec = c(0, 0, 0, 0), 
    min = c(0L, 0L, 0L, 0L), hour = c(0L, 0L, 0L, 0L), mday = c(22L, 
    23L, 1L, 4L), mon = c(0L, 0L, 1L, 1L), year = c(116L, 116L, 
    116L, 116L), wday = c(5L, 6L, 1L, 4L), yday = c(21L, 22L, 
    31L, 34L), isdst = c(0L, 0L, 0L, 0L), zone = c("EST", "EST", 
    "EST", "EST"), gmtoff = c(NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_)), .Names = c("sec", "min", "hour", "mday", "mon", 
"year", "wday", "yday", "isdst", "zone", "gmtoff"), class = c("POSIXlt", 
"POSIXt")), third_order = structure(list(sec = c(NA, 0, NA, 0
), min = c(NA, 0L, NA, 0L), hour = c(NA, 0L, NA, 0L), mday = c(NA, 
25L, NA, 6L), mon = c(NA, 0L, NA, 1L), year = c(NA, 116L, NA, 
116L), wday = c(NA, 1L, NA, 6L), yday = c(NA, 24L, NA, 36L), 
    isdst = c(-1L, 0L, -1L, 0L), zone = c("", "EST", "", "EST"
    ), gmtoff = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
    )), .Names = c("sec", "min", "hour", "mday", "mon", "year", 
"wday", "yday", "isdst", "zone", "gmtoff"), class = c("POSIXlt", 
"POSIXt"))), .Names = c("id", "invitation", "account_date", "first_order", 
"second_order", "third_order"), row.names = c(NA, -4L), class = "data.frame")

计算日期组合之间的平均差异

1 个答案: