Question

我有一个数据集，其中包含4种客户类型和110个分支机构的到达时间（年-月-日-小时-分钟）和每一行的出席时间。使用这两个变量，我试图为每行创建一个队列长度新的列（例如，到达时间早于给定事务的到达时间但尚未参加的客户）。

下面的代码可以正常运行，但是已经运行了很多时间。关于如何加快此代码的任何想法？

library(tidyverse)    
library(data.table)
library(parallel)

transaccion_data<- tibble(transaction_Id = seq(1:10),
arrival_time = c("2018-11-01 09:05:00 CST", "2018-11-01 09:03:00 CST"),
attended_time =c("2018-11-01 09:10:00 CST", "2018-11-01 09:06:00 CST"), 
queue = c(NA, NA))

hours<-seq(ymd_hms("2018-11-01 09:00:00 CST"),ymd_hms("2018-11-01 16:00:00 CST"), 60)

queue_matrix是一个数据表，其中11月每天有nrow = 9到16 hrs（相差1分钟），列等于client_type＆branch（452列）

queue_matrix[1:13441, ] <-parSapply(cl = cluster,
  function (x) transaction_data %>%
  group_by(branch_type_client) %>%
  summarise(queue = sum(arrival_time <= x & attended_time>x)) %>%
  column_to_rownames(var = "branch_type_client") %>%
  transpose()
)

Answer 1

您的数据有点小（并且不完整），因此我生成了自己的数据：

library(tidyverse)
library(lubridate)
library(rlang)
n_items <- 1e6
sample_data <- tibble(
  arrival_time = make_date(2018, 11, floor(runif(n_items, 1, 31))) +
    dhours(9) + dseconds(floor(runif(n_items, 0, 6 * 60 * 60 + 1))),
  attended_time = arrival_time +
    dseconds(floor(runif(n_items, 0, 60 * 60 + 1))),
  branch_type_client = sample(LETTERS, n_items, replace = TRUE)
)

现在，我们需要计算每分钟到达和参加的人数。我假设直到13:06:00才有人到达13:05:01。

arrived <- sample_data %>% 
  count(branch_type_client, time = ceiling_date(arrival_time, "minutes"))

attended <- sample_data %>% 
  count(branch_type_client, time = ceiling_date(attended_time, "minutes"))

现在，我们将两者结合在一起，填写所有所需的日期序列，然后计算到达与出席之间的累计差额。

all_times <- rep(seq(ymd("2018-11-01"), ymd("2018-11-30"), by = "1 day"), each = 7 * 60 + 1) +
  dhours(9) + rep(dminutes(0:(60 * 7)), 30)

queue <- full_join(arrived, attended, by = c("branch_type_client", "time"),
                   suffix = c("_arrived", "_attended")) %>% 
  complete(branch_type_client, time = all_times) %>% 
  replace_na(list(n_arrived = 0, n_attended = 0)) %>% 
  arrange(branch_type_client, time) %>% 
  group_by(branch_type_client) %>% 
  mutate(queue_length = cumsum(n_arrived - n_attended))

如果您希望每个branch_type_client包含一列，则可以使用tidyr::spread：

queue_wide <- queue %>% 
  select(time, branch_type_client, queue_length) %>% 
  spread(branch_type_client, queue_length)

这一切（包括生成一百万行的示例数据）在我没有使用并行功能的7岁笔记本电脑上大约需要6秒钟。

加快双循环R

1 个答案: