我想将数据框1与数据框2合并,但由于两者结构不同,我不知道该如何合并。在数据框1中,日期和时间存放在同一列中;而在数据框2中,时间跨度由三列(DATE、In、Out)共同表示。
Dataframe 1
datetime PM
<dttm> <dbl>
1 2017-05-17 07:00:26 2.5
2 2017-05-17 08:00:26 4.17
3 2017-05-17 09:00:26 0.333
4 2017-05-17 10:00:26 0
5 2017-05-17 11:00:26 0
6 2017-05-17 12:00:26 0
7 2017-05-17 13:00:26 0
8 2017-05-17 14:00:26 0
9 2017-05-17 15:00:26 0
10 2017-05-17 16:00:26 0
11 2017-05-17 17:00:27 0
12 2017-05-17 18:00:27 0
13 2017-05-17 19:00:27 0.5
14 2017-05-17 20:00:27 1.67
15 2017-05-17 21:00:27 2
16 2017-05-17 22:00:27 2.67
Dataframe 2
DATE SHIP In Out PAX
<dttm> <chr> <chr> <chr> <dbl>
1 2017-05-17 00:00:00 Rotterdam 07:00 17:00 1404
2 2017-05-17 00:00:00 Deutschland 08:00 14:00 600
3 2017-05-18 00:00:00 Serenade 07:00 17:00 2200
4 2017-05-18 00:00:00 AIDAsol 11:00 20:00 2194
5 2017-05-19 00:00:00 Marco Polo 07:30 15:00 800
6 2017-05-21 00:00:00 Balmoral 07:30 16:00 2000
Expected result
datetime PM1 Shipname1 ShipPAX1 Shipname2 ShipPAX2
17.5.17 7:00 5,0 Rotterdam 1404,00 Deutschland
17.5.17 8:00 4,0 Rotterdam 1404,00 Deutschland 600,00
17.5.17 9:00 1,0 Rotterdam 1404,00 Deutschland 600,00
17.5.17 10:00 1,0 Rotterdam 1404,00 Deutschland 600,00
17.5.17 11:00 2,0 Rotterdam 1404,00 Deutschland 600,00
17.5.17 12:00 5,0 Rotterdam 1404,00 Deutschland 600,00
17.5.17 13:00 3,0 Rotterdam 1404,00 Deutschland 600,00
17.5.17 14:00 6,0 Rotterdam 1404,00 Deutschland 600,00
17.5.17 15:00 2,0 Rotterdam 1404,00 Deutschland NA
17.5.17 16:00 3,0 Rotterdam 1404,00 Deutschland NA
17.5.17 17:00 4,0 Rotterdam 1404,00 NA NA
17.5.17 18:00 8,0 NA NA NA NA
答案 0 :(得分:0)
我认为棘手的部分在于:您的第一个数据框是按小时记录的时间点,而第二个数据框给出的是开始时间和结束时间。因此,您首先需要使用 seq 为每艘船生成从开始到结束、按小时递增的时间序列,并据此创建一个数据框。然后,您可以使用 dplyr 中的 left_join 将这些数据框逐个连接到第一个数据框上。
library(dplyr)
# Hourly timestamps for 17 May 2017, written as day.month.year hour:minute.
datetime <- c(
  "17.5.17 07:00", "17.5.17 08:00", "17.5.17 09:00", "17.5.17 10:00",
  "17.5.17 11:00", "17.5.17 12:00", "17.5.17 13:00", "17.5.17 14:00",
  "17.5.17 15:00", "17.5.17 16:00", "17.5.17 17:00", "17.5.17 18:00"
)
# PM reading for each timestamp, kept as text with a decimal comma.
PM1 <- c(
  "5,0", "4,0", "1,0", "1,0", "2,0", "5,0",
  "4,0", "6,0", "2,0", "3,0", "4,0", "8,0"
)

# Build the frame first, then swap the text timestamps for POSIXct values so
# they can later be matched against other POSIXct columns.
df1 <- data.frame(datetime, PM1)
df1[["datetime"]] <- as.POSIXct(df1[["datetime"]], format = "%d.%m.%y %H:%M")
df1
datetime PM1
1 2017-05-17 07:00:00 5,0
2 2017-05-17 08:00:00 4,0
3 2017-05-17 09:00:00 1,0
4 2017-05-17 10:00:00 1,0
5 2017-05-17 11:00:00 2,0
6 2017-05-17 12:00:00 5,0
7 2017-05-17 13:00:00 4,0
8 2017-05-17 14:00:00 6,0
9 2017-05-17 15:00:00 2,0
10 2017-05-17 16:00:00 3,0
11 2017-05-17 17:00:00 4,0
12 2017-05-17 18:00:00 8,0
# Port calls: one entry per ship, with its arrival (In) and departure (Out)
# clock times and its passenger count (PAX).
SHIP <- c("Rotterdam", "Deutschland")
PAX <- c(1404, 600)
In <- c("07:00", "08:00")
Out <- c("17:00", "14:00")
# A single calendar day; data.frame() recycles it across both ships.
DATE <- "17.5.17 00:00"

df <- data.frame(DATE = DATE, SHIP = SHIP, In = In, Out = Out, PAX = PAX)
df
DATE SHIP In Out PAX
1 17.5.17 00:00 Rotterdam 07:00 17:00 1404
2 17.5.17 00:00 Deutschland 08:00 14:00 600
# Normalise the date/time columns ----
# Strip the midnight placeholder so DATE holds only the day, then combine that
# day with the In/Out clock times to get POSIXct arrival/departure timestamps.
df$DATE <- gsub(" 00:00", "", df$DATE)
df$In <- as.POSIXct(paste(df$DATE, df$In), format = "%d.%m.%y %H:%M")
df$Out <- as.POSIXct(paste(df$DATE, df$Out), format = "%d.%m.%y %H:%M")

# Attach one pair of columns per ship ----
# seq_len() keeps the loop from running when df has no rows (1:nrow(df) would
# iterate over c(1, 0)).
for (i in seq_len(nrow(df))) {
  # One row per hour the ship is in port, from arrival to departure inclusive
  ship_hours <- data.frame(datetime = seq(df$In[i], df$Out[i], by = "hour"))
  # Name the columns explicitly (Shipname1, ShipPAX1, Shipname2, ...) rather
  # than relying on left_join()'s automatic .x/.y suffixes, which only give
  # predictable names for the first two ships
  ship_hours[[paste0("Shipname", i)]] <- df$SHIP[i]
  ship_hours[[paste0("ShipPAX", i)]] <- df$PAX[i]
  # Hours in which this ship is not in port get NA in its two columns
  df1 <- left_join(df1, ship_hours, by = "datetime")
}
df1
# datetime PM1 Shipname1 ShipPAX1 Shipname2 ShipPAX2
# 1 2017-05-17 07:00:00 5,0 Rotterdam 1404 <NA> NA
# 2 2017-05-17 08:00:00 4,0 Rotterdam 1404 Deutschland 600
# 3 2017-05-17 09:00:00 1,0 Rotterdam 1404 Deutschland 600
# 4 2017-05-17 10:00:00 1,0 Rotterdam 1404 Deutschland 600
# 5 2017-05-17 11:00:00 2,0 Rotterdam 1404 Deutschland 600
# 6 2017-05-17 12:00:00 5,0 Rotterdam 1404 Deutschland 600
# 7 2017-05-17 13:00:00 4,0 Rotterdam 1404 Deutschland 600
# 8 2017-05-17 14:00:00 6,0 Rotterdam 1404 Deutschland 600
# 9 2017-05-17 15:00:00 2,0 Rotterdam 1404 <NA> NA
# 10 2017-05-17 16:00:00 3,0 Rotterdam 1404 <NA> NA
# 11 2017-05-17 17:00:00 4,0 Rotterdam 1404 <NA> NA
# 12 2017-05-17 18:00:00 8,0 <NA> NA <NA> NA
答案 1 :(得分:0)
一种基于 data.table 的解决方案。
样本数据
library( data.table)
# Sample data ----
# A literal "T" is placed between date and time in the text below so that
# each timestamp is read in as one string/column.
DT2 <- fread("DATE SHIP In Out PAX
17.5.17T0:00 Rotterdam 07:00 17:00 1404,00
17.5.17T0:00 Deutschland 08:00 14:00 600,00
")
DT1 <- fread("datetime PM1
17.5.17T7:00 5,0
17.5.17T8:00 4,0
17.5.17T9:00 1,0
17.5.17T10:00 1,0
17.5.17T11:00 2,0
17.5.17T12:00 5,0
17.5.17T13:00 3,0
17.5.17T14:00 6,0
17.5.17T15:00 2,0
17.5.17T16:00 3,0
17.5.17T17:00 4,0
17.5.17T18:00 8,0")

# Parse the text timestamps into POSIXct, updating both tables by reference
DT2[, DATE := as.POSIXct(DATE, format = "%d.%m.%yT%H:%M")]
DT1[, datetime := as.POSIXct(datetime, format = "%d.%m.%yT%H:%M")]

# Turn the In/Out clock times into full POSIXct timestamps by prefixing each
# with the calendar day taken from DATE
DT2[, `:=`(
  In = as.POSIXct(paste0(as.IDate(DATE), "T", In), format = "%Y-%m-%dT%H:%M"),
  Out = as.POSIXct(paste0(as.IDate(DATE), "T", Out), format = "%Y-%m-%dT%H:%M")
)]
代码
# Non-equi join (not foverlaps): for each hourly reading in DT1, pick up every
# ship in DT2 whose stay covers it (In <= datetime and Out >= datetime).
# Readings with no ship in port are kept, with SHIP and PAX set to NA.
ans <- DT2[DT1, on = .(In <= datetime, Out >= datetime)]
# Cast to wide format, using SHIP as column name and PAX as value
wide <- dcast(ans, In + PM1 ~ SHIP, value.var = "PAX")
# After the join, `In` carries the reading's timestamp taken from DT1 rather
# than the ship's arrival time, so label it as what it is
setnames(wide, "In", "datetime")
# Readings without any ship have SHIP = NA, which dcast() turns into a column
# literally named "NA"; it is empty, so leave it out (a no-op if it is absent)
wide <- wide[, setdiff(names(wide), "NA"), with = FALSE]
wide
# datetime PM1 Deutschland Rotterdam
# 1: 2017-05-17 07:00:00 5,0 <NA> 1404,00
# 2: 2017-05-17 08:00:00 4,0 600,00 1404,00
# 3: 2017-05-17 09:00:00 1,0 600,00 1404,00
# 4: 2017-05-17 10:00:00 1,0 600,00 1404,00
# 5: 2017-05-17 11:00:00 2,0 600,00 1404,00
# 6: 2017-05-17 12:00:00 5,0 600,00 1404,00
# 7: 2017-05-17 13:00:00 3,0 600,00 1404,00
# 8: 2017-05-17 14:00:00 6,0 600,00 1404,00
# 9: 2017-05-17 15:00:00 2,0 <NA> 1404,00
# 10: 2017-05-17 16:00:00 3,0 <NA> 1404,00
# 11: 2017-05-17 17:00:00 4,0 <NA> 1404,00
# 12: 2017-05-17 18:00:00 8,0 <NA> <NA>