不包括独特日期

时间:2015-11-11 09:21:32

标签: r

我的数据在excel中看起来像这样,并扩展到更多(Date5,Date6 ....)

 Date1   Value1 Date2    Value2 Date3    Value3 Date4   Value4
 1/2/2004   17  1/3/2004    27  1/1/2004    17  1/3/2004    31
 1/3/2004   26  1/4/2004    30  1/3/2004    29  1/4/2004    36
 1/4/2004   22  1/5/2004    22  1/4/2004    28  1/5/2004    33
 1/5/2004   17  1/6/2004    28  1/5/2004    36  1/6/2004    50
 1/13/2004  15  1/7/2004    17  1/12/2004   15  1/8/2004    9
 1/14/2004  10  1/14/2004   21  1/14/2004   12  1/14/2004   11

我想排除所有系列中不存在关联日期的任何值。

我发布的样本数据结果如下:

Date    Value1  Value2  Value3  Value4
1/3/2004    26    27    29      31
1/4/2004    22    30    28      36
1/5/2004    17    22    36      33
1/14/2004   10    21    12      11

5 个答案:

答案 0 :(得分:3)

Date <- Reduce(intersect, list(df$Date1, df$Date2, df$Date3, df$Date4))
Value1 <- df[df$Date1 %in% Date, ]$Value1
Value2 <- df[df$Date2 %in% Date, ]$Value2
Value3 <- df[df$Date3 %in% Date, ]$Value3
Value4 <- df[df$Date4 %in% Date, ]$Value4
data.frame(Date, Value1, Value2, Value3, Value4)


# Date Value1 Value2 Value3 Value4
# 1  1/3/2004     26     27     29     31
# 2  1/4/2004     22     30     28     36
# 3  1/5/2004     17     22     36     33
# 4 1/14/2004     10     21     12     11

正如@docendo提到的那样,在多列的情况下,这可能会很长,更新的方式将是

Date <- Reduce(intersect, list(df$Date1, df$Date2, df$Date3, df$Date4))
Values <- df[, seq(0, ncol(df), by=2)]
Dates <- df[, seq(1, ncol(df), by=2)]
mat <- apply(Dates, 2, function(x) {x %in% Date})
data.frame(Date, matrix(Values[mat], nrow = 4))

# Date X1 X2 X3 X4
# 1  1/3/2004 26 27 29 31
# 2  1/4/2004 22 30 28 36
# 3  1/5/2004 17 22 36 33
# 4 1/14/2004 10 21 12 11

根据@David的评论,使用

可以进一步改善这一点
Values <- df[c(FALSE, TRUE)]
Dates <- df[c(TRUE, FALSE)]
Date <- Reduce(intersect, as.list(Dates))
mat <- apply(Dates, 2, function(x) {x %in% Date}) 
data.frame(Date, matrix(Values[mat], nrow = ncol(df)/2))

#        Date X1 X2 X3 X4
# 1  1/3/2004 26 27 29 31
# 2  1/4/2004 22 30 28 36
# 3  1/5/2004 17 22 36 33
# 4 1/14/2004 10 21 12 11

答案 1 :(得分:3)

这是dplyr / tidyr方法:

library(dplyr); library(tidyr)
gather(DF, key1, Date, -starts_with("Value")) %>%
  gather(key2, Val, starts_with("Value")) %>% 
  filter(Date %in% Reduce(intersect, select(DF, starts_with("Date"))) & 
           gsub("[^0-9]", "", key1) == gsub("[^0-9]", "", key2)) %>% 
  select(-key1) %>% spread(key2, Val)

#       Date Value1 Value2 Value3 Value4
#1 1/14/2004     10     21     12     11
#2  1/3/2004     26     27     29     31
#3  1/4/2004     22     30     28     36
#4  1/5/2004     17     22     36     33
#Warning:
#attributes are not identical across measure variables; they will be dropped 

警告与转换为factor的{​​{1}}列有关。

-

在@ AntoniosK的评论之后编辑

答案 2 :(得分:1)

我的尝试没有额外的包:

d <- read.table(header=TRUE, text=
'Date1   Value1 Date2    Value2 Date3    Value3 Date4   Value4
1/2/2004   17  1/3/2004    27  1/1/2004    17  1/3/2004    31
1/3/2004   26  1/4/2004    30  1/3/2004    29  1/4/2004    36
1/4/2004   22  1/5/2004    22  1/4/2004    28  1/5/2004    33
1/5/2004   17  1/6/2004    28  1/5/2004    36  1/6/2004    50
1/13/2004  15  1/7/2004    17  1/12/2004   15  1/8/2004    9
1/14/2004  10  1/14/2004   21  1/14/2004   12  1/14/2004   11')

l <- length(d) %/% 2
D <- "Date"
dneu.i <- function(i) {
  di <- d[, (2*i-1):(2*i)]
  names(di) <- c("Date", "Value")
  di$I <- paste0(D, i)
  di
}
dneu <- dneu.i(1)
for (i in 2:l) dneu <- rbind(dneu, dneu.i(i))
dneu.w <- reshape(dneu, dir="wide", idvar="Date", timevar="I")
subset(dneu.w, apply(dneu.w[,-1], 1, function(x) !any(is.na(x))))

答案 3 :(得分:1)

dt = read.table(text="Date1   Value1 Date2    Value2 Date3    Value3 Date4   Value4
                1/2/2004   17  1/3/2004    27  1/1/2004    17  1/3/2004    31
                1/3/2004   26  1/4/2004    30  1/3/2004    29  1/4/2004    36
                1/4/2004   22  1/5/2004    22  1/4/2004    28  1/5/2004    33
                1/5/2004   17  1/6/2004    28  1/5/2004    36  1/6/2004    50
                1/13/2004  15  1/7/2004    17  1/12/2004   15  1/8/2004    9
                1/14/2004  10  1/14/2004   21  1/14/2004   12  1/14/2004   11", header=T)

library(dplyr)
library(tidyr)


dt %>% 
  select(starts_with("Date")) %>%                               ## get the dates columns
  gather(DateGroup,Date,starts_with("Date")) %>%                ## reshape them to create a single column of dates and in which initial column they belong
  cbind(dt %>% 
          select(starts_with("Value")) %>%                      ## get the values columns
          gather(ValueGroup,Value,starts_with("Value"))) %>%    ## reshape them to create a single column of values and in which initial column they belong
  group_by(Date) %>%                                            ## for each date
  mutate(Group_count = n_distinct(DateGroup)) %>%               ## count in how many inital columns they exist
  ungroup() %>%                                                 ## forget about the grouping
  filter(Group_count == length(unique(DateGroup))) %>%          ## keep columns that exists in all initial columns
  select(Date, ValueGroup, Value) %>%                           ## select appropriate columns
  spread(ValueGroup, Value)                                     ## reshape dataset

#        Date Value1 Value2 Value3 Value4
#       (chr)  (int)  (int)  (int)  (int)
# 1 1/14/2004     10     21     12     11
# 2  1/3/2004     26     27     29     31
# 3  1/4/2004     22     30     28     36
# 4  1/5/2004     17     22     36     33

答案 4 :(得分:0)

我添加答案有点迟了。无论如何... 显然,你正在处理一个多变量的时间序列。因此,您应该使用时间序列对象(如zoo)来存储数据。最近我在SO上看到了很多问题,其中人们使用数据帧和矩阵来存储多变量时间序列对象。我强烈建议你不要这样做。

以下是我使用zoo的解决方案:

library(zoo)
do.call("merge",c(lapply(split(1:ncol(d),sort(rep(1:(ncol(d)/2),times=2))),
                         function(x) zoo(d[,x[2],drop=FALSE],
                                         as.Date(d[,x[1]],format="%m/%d/%Y"))),
                  all=FALSE))

结果是

#           Value1 Value2 Value3 Value4
#2004-01-03     26     27     29     31
#2004-01-04     22     30     28     36
#2004-01-05     17     22     36     33
#2004-01-14     10     21     12     11