我有一个由三列组成的数据框:x、ID 和 date_time。“x”列是变量 x 每五分钟一次的记录,ID 标识被记录的对象,date_time 表示记录的时间。数据见下文。
我想根据这个数据框计算出一个新的数据框,它有七列:“测量”、“ID”、“日期”、“x_4_5_night_15min_yes/no”、“x_4_5_night_time_15min”、“x_4_5_night_events_15min”和“x_<4_night_events_15min”。
每个唯一的测量对应一行。到目前为止,我的代码可以正确返回以下几列:“测量”、“ID”和“日期”:
library(dplyr)

# The sample data names the identifier column `id`, while the pipeline below
# refers to `ID`. This rename is a no-op when the column is already `ID`.
names(df1)[names(df1) == "id"] <- "ID"

# Calendar date of each record.
df1$mydate <- as.Date(df1$date_time, format = "%Y.%m.%d %H:%M:%S")

# Record time as seconds since the epoch. date_time is text such as
# "2020.03.02 22:00:17", so it must be parsed first: as.numeric() applied to
# that text directly yields NA. UTC is used so that `tm` is on the same scale
# as `dts`, which is derived from a Date (days since the epoch, no time zone).
df1$tm <- as.numeric(
  as.POSIXct(df1$date_time, format = "%Y.%m.%d %H:%M:%S", tz = "UTC")
)

# Midnight of the record's calendar date, in seconds since the epoch.
df1$dts <- 86400 * as.numeric(df1$mydate)

# One row per (ID, date), numbered within each ID.
# Records from 23:00 (dts + 82800 s) onwards are assigned to the next date.
df2 <- df1 %>%
  mutate(
    date = case_when(
      (dts - 3600) < tm & tm < (dts + 82800) ~ as.character(mydate),
      (dts + 82800) <= tm ~ as.character(mydate + 1)
    )
  ) %>%
  select(ID, date) %>%
  distinct() %>%
  group_by(ID) %>%
  mutate(measurement = row_number())
但是我不知道如何计算剩下的几列。
这是预期的输出:
# Expected result: one row per ID/date combination.
# read.table() passes the header through make.names(), so the "yes/no" and
# "<4" parts of the header appear as "yes.no" and ".4" in the column names.
expected_output <- read.table(
  text = "
ID Date x_4_5_night_events_15min x_4_5_night_15min_yes/no x_4_5_night_time_15min x_<4_night_events_15min
12 2020.03.02 0 0 0 0
12 2020.03.03 1 1 20 1
13 2020.05.09 0 0 0 0
14 2020.03.03 2 1 40 0
",
  header = TRUE
)
这是数据:
# Sample input as produced by dput(): a data.frame declared with 66 rows and
# three columns:
#   date_time - character timestamps in "%Y.%m.%d %H:%M:%S" form
#   id        - integer identifier (values 12, 13 and 14)
#   x         - measurements stored as character; "" marks a missing value
# Note that the identifier column is named `id` here, whereas the code in this
# post refers to it as `ID`.
structure(list(date_time = c("2020.03.02 22:00:17", "2020.03.02 22:05:17",
"2020.03.02 22:10:17", "2020.03.02 22:35:17", "2020.03.02 22:40:17",
"2020.03.02 22:45:17", "2020.03.02 22:50:17", "2020.03.02 22:55:17",
"2020.03.02 23:00:17", "2020.03.02 23:05:17", "2020.03.02 23:10:17",
"2020.03.02 23:15:17", "2020.03.02 23:20:17", "2020.03.02 23:25:17",
"2020.03.02 23:30:17", "2020.03.02 23:35:17", "2020.03.02 23:40:17",
"2020.03.02 23:45:17", "2020.03.02 23:50:17", "2020.03.02 23:55:17",
"2020.03.03 00:00:17", "2020.03.03 00:55:17", "2020.03.03 01:00:17",
"2020.03.03 01:05:17", "2020.03.03 01:10:17", "2020.03.03 01:15:17",
"2020.03.03 01:20:17", "2020.03.03 01:25:17", "2020.05.09 08:39:32",
"2020.05.09 08:44:32", "2020.05.09 08:49:33", "2020.05.09 08:54:33",
"2020.05.09 08:59:34", "2020.05.09 09:04:34", "2020.05.09 09:09:35",
"2020.05.09 09:14:35", "2020.05.09 09:19:36", "2020.05.09 09:24:36",
"2020.05.09 09:29:37", "2020.05.09 09:34:37", "2020.05.09 09:39:38",
"2020.05.09 09:44:38", "2020.05.09 09:49:39", "2020.05.09 09:54:39",
"2020.05.09 09:59:40", "2020.03.02 22:40:17", "2020.03.02 22:45:17",
"2020.03.02 22:50:17", "2020.03.02 22:55:17", "2020.03.02 23:00:17",
"2020.03.02 23:05:17", "2020.03.02 23:10:17", "2020.03.02 23:15:17",
"2020.03.02 23:20:17", "2020.03.02 23:25:17", "2020.03.02 23:30:17",
"2020.03.02 23:35:17", "2020.03.02 23:40:17", "2020.03.02 23:45:17",
"2020.03.02 23:50:17", "2020.03.02 23:55:17", "2020.03.03 00:00:17",
"2020.03.03 00:55:17", "2020.03.03 01:00:17", "2020.03.03 01:05:17",
"2020.03.03 01:10:17"), id = c(12L, 12L, 12L, 12L, 12L, 12L,
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L,
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 13L, 13L, 13L, 13L,
13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L,
14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L,
14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L), x = c("7.55", "4.55",
"4.55", "12", "12", "10", "10", "4.3", "", "", "4.3", "4.3",
"4.3", "", "4.3", "12", "12", "12", "2", "12", "12", "", "8",
"3", "3", "2", "2", "", "12", "10", "10", "4.3", "4.3", "4.3",
"4.3", "4.3", "4.3", "4.3", "4.3", "12", "12", "12", "12", "12",
"12", "2", "12", "12", "", "8", "3", "3", "2", "2", "", "12",
"10", "10", "4.3", "4.3", "4.3", "4.3", "4.3", "4.3", "4.3",
"4.3")), row.names = c(NA, 66L), class = "data.frame")
任何建议都非常感谢,谢谢!
答案 0 :(得分:1)
将初始数据框命名为 df11,下面的代码即可得到所需的输出。请注意,我假设(标准 5、6、7 中的)15 分钟是指连续 3 条记录的值都在 4 到 5 之间(对于 #7,则是小于 4),与标准 4 相同。我相信其他人可以写出更简短、更优雅的代码。这里我把代码分步写出,以便您检查每个步骤。
library(dplyr)
library(tidyr)
library(lubridate)

# The sample data names the identifier column `id`, while the pipeline below
# refers to `ID`. This rename is a no-op when the column is already `ID`.
names(df11)[names(df11) == "id"] <- "ID"

# Numeric copy of the measurement; empty strings become NA.
df11$xn <- as.numeric(df11$x)

# Per-record indicators (a missing measurement counts as 0 in all three):
#   xmin - minutes credited to the record: 5 when 4 <= x <= 5, otherwise 0
#   xlt4 - 1 when x < 4, otherwise 0
#   x45  - 1 when 4 <= x <= 5, otherwise 0
df1 <- df11 %>%
  mutate(
    xmin = ifelse(xn < 4 | xn > 5 | is.na(xn), 0, 5),
    xlt4 = ifelse(xn >= 4 | is.na(xn), 0, 1),
    x45 = ifelse(xn < 4 | xn > 5 | is.na(xn), 0, 1)
  )

# Timestamps: `tm` is the record time and `dts` is midnight of the record's
# calendar date, both in seconds since the epoch (as_datetime() parses in UTC).
df1$dateTime <- as_datetime(df1$date_time, format = "%Y.%m.%d %H:%M:%S")
df1$mydate <- as.Date(df1$date_time, format = "%Y.%m.%d %H:%M:%S")
df1$tm <- as.numeric(df1$dateTime)
df1$dts <- 86400 * as.numeric(df1$mydate)

# Assign each record to a reporting date and to day or night:
#   - records from 23:00 (dts + 82800 s) onwards belong to the next date;
#   - "day" is 07:00 (dts + 25200 s) up to, but not including, 23:00.
# The rows are sorted first because the lag()/cumsum() steps further down
# depend on chronological order within each ID.
df2 <- df1 %>%
  arrange(ID, dateTime) %>%
  mutate(
    date = case_when(
      (dts - 3600) < tm & tm < (dts + 82800) ~ as.character(mydate),
      (dts + 82800) <= tm ~ as.character(mydate + 1)
    ),
    dayrnight = ifelse(tm >= (dts + 25200) & tm < (dts + 82800), "day", "night")
  )

# Summarise each (ID, date, day/night) group, then keep the night figures.
df3 <- df2 %>%
  group_by(ID, date, dayrnight) %>%
  # Previous record's indicator within the group; the first row has none.
  mutate(x45l1 = lag(x45), xlt4l1 = lag(xlt4)) %>%
  mutate(across(where(is.numeric), ~ replace_na(.x, 0))) %>%
  # For the first two rows of a group this is the running count of matching
  # records. From the third row on, the running count is multiplied by the
  # current and the previous indicator, so it is non-zero only where two
  # consecutive records match.
  mutate(
    x45cum = ifelse(row_number() > 2, cumsum(x45) * x45 * x45l1, cumsum(x45)),
    xlt4cum = ifelse(row_number() > 2, cumsum(xlt4) * xlt4 * xlt4l1, cumsum(xlt4))
  ) %>%
  # An event is flagged where that value is a positive multiple of 3.
  # NOTE(review): the running count is not reset between separate runs, so
  # this only approximates "3 consecutive records" - confirm on real data.
  mutate(
    x45event = ifelse((x45cum %% 3) == 0, 1 * (x45cum > 0), 0),
    xlt4event = ifelse((xlt4cum %% 3) == 0, 1 * (xlt4cum > 0), 0)
  ) %>%
  mutate(x45event_tot = cumsum(x45event), xlt4event_tot = cumsum(xlt4event)) %>%
  # Cumulative in-range minutes, counted only once an event has occurred.
  mutate(x45min = cumsum(xmin) * (x45event_tot > 0)) %>%
  dplyr::summarise(
    x45mint = max(x45min),
    x45eventt = max(x45event_tot),
    xlt4eventt = max(xlt4event_tot),
    .groups = "drop_last"
  ) %>%
  mutate(x45_15min_yn = ifelse(x45eventt > 0, 1, 0)) %>%
  pivot_wider(
    id_cols = c(ID, date),
    names_from = dayrnight,
    values_from = c("x45mint", "x45eventt", "x45_15min_yn", "xlt4eventt")
  ) %>%
  # Dates without a night group get 0 instead of NA.
  mutate(across(where(is.numeric), ~ replace_na(.x, 0))) %>%
  # x_lt4_night_events_15min now carries the event count (xlt4eventt) rather
  # than the yes/no flag, matching its name.
  select(
    ID, date,
    x_4_5_night_events_15min = x45eventt_night,
    x_4_5_night_15min_yesorno = x45_15min_yn_night,
    x_4_5_night_time_15min = x45mint_night,
    x_lt4_night_events_15min = xlt4eventt_night
  )
> df3
# A tibble: 5 x 6
# Groups: ID, date [5]
ID date x_4_5_night_events_15min x_4_5_night_15min_yesorno x_4_5_night_time_15min x_lt4_night_events_15min
<int> <chr> <dbl> <dbl> <dbl> <dbl>
1 12 2020-03-02 0 0 0 0
2 12 2020-03-03 1 1 20 1
3 13 2020-05-09 0 0 0 0
4 14 2020-03-02 0 0 0 0
5 14 2020-03-03 2 1 40 1