根据列值合并数据框中的行值

时间:2017-06-16 17:58:35

标签: r csv

我试图加入给定列中具有相等值的行的非空值。这是一个例子:

Date        x        y
2017-06-01  5        NA   <- to merge
2017-06-01  NA       8    <- to merge
2017-05-02  55       33

我需要的是将以上内容转换为:

Date        x        y
2017-06-01  5        8    <- merged lines
2017-05-02  55       33

如何使用列表推导或lambda表达式来完成此操作?

4 个答案:

答案 0 :(得分:1)

使用与使用in this answer非常相似的方法,我们可以使用 dplyr 包执行您想要的操作。请注意,我创建了一个与您的非常相似的示例数据集。

library(dplyr)
# generate sample data
dat <- data.frame(grp = c('a','a','b'),
                  x = c(5, NA, 55),
                  y = c(NA, 8, 33), stringsAsFactors = FALSE)

dat

#   grp  x  y
# 1   a  5 NA
# 2   a NA  8
# 3   b 55 33

my_fun <- function(x) x[!is.na(x)]

dat %>%
    group_by(grp) %>%
    summarise_all(funs(my_fun))

# A tibble: 2 × 3
#     grp     x     y
#   <chr> <dbl> <dbl>
# 1     a     5     8
# 2     b    55    33

编辑 - 组中一列中的多个非缺失值。

我生成了一些在组的列中具有多个非缺失值的数据。我们可以使用expand.griduniquecomplete.cases来查找非缺失值的所有组合。请注意,这就是我认为您希望问题得到解决的方式,但如果没有更多细节,很难确定。

dat <- structure(list(grp = c("a", "a", "a", "b", "b"), x = c(5, NA, 
3, 8, NA), y = c(8, 9, NA, NA, 3)), .Names = c("grp", "x", "y"
), row.names = c(NA, -5L), class = "data.frame")

#   grp  x  y
# 1   a  5  8
# 2   a NA  9
# 3   a  3 NA
# 4   b  8 NA
# 5   b NA  3

do.call('rbind', by(dat, dat$grp, function(d){
  new_d <- unique(expand.grid(d, stringsAsFactors = FALSE))
  new_d[complete.cases(new_d), ]
}))

#      grp x y
# a.1    a 5 8
# a.7    a 3 8
# a.10   a 5 9
# a.16   a 3 9
# b      b 8 3

答案 1 :(得分:1)

只要你在每一行中都有相同的数据,并且只缺少其中几个,这应该可行。

# sample data
myData <- data.frame(date = c(Sys.Date(), Sys.Date(), Sys.Date(), Sys.Date()-2),
                  x = c(5, NA, 5, 55),
                  y = c(NA, 8, 8, 33),  stringsAsFactors = FALSE)
#myData
#        date  x  y
# 2017-12-13  5 NA
# 2017-12-13 NA  8
# 2017-12-13  5  8
# 2017-12-11 55 33

# merge the lines
myData <- aggregate(myData[-1], list(myData$date), FUN = mean, na.rm = TRUE)

#> myData
#     Group.1  x  y
# 2017-12-11 55 33
# 2017-12-13  5  8

答案 2 :(得分:0)

如果它不适合您的使用案例,请告诉我。我做了一些有趣的事情,并希望它是通过排除令牌来按行分组合并问题的一般解决方案,或者是由用户提供的函数指定的值绑定。如果在独占合并上没有剩余值,则可以给出替换值。应该成功处理NA值作为替换值和data.frame的一部分。

一切都是基础R

#' Token in data.frame?
#' 
#' Looks through all elements of data.frame to see if token is there.
#' Will only look in columns type compatible with token
#' 
#' @param token The token to look for
#' @param df, data.frame to look in
#' @return Scalar boolean
contains <- function(token, df) {
  mask <- class(token) == lapply(df, class)
  all(
    unlist(
      lapply(df[, mask], function(col) {
        found <- token %in% col
        if(any(found)) return(TRUE)
        found
      })
    )
  )
}

#' Matrix Mask
#' 
#' Calculate a boolean matrix mask of where
#' token is in data.frame. Token match is TRUE
#' in matrix, everything else FALSE
#' 
#' @param token, token to look for
#' @param df, data.frame to look in
#' @return matrix of boolean type
matrix.mask <- function(token, df){
  as.matrix(
    lapply(df, function(c){
      if(is.na(token)) {
        sapply(c, is.na) #NA need special handling
      } else {
        token == c
      }
    })
  )
}

#' Is Equal
#' 
#' Test for equality by value and accounts for special handling of NA
#' All value sin lhs and rhs need to be of same type. If one contains any NA, value is FALSE
#' 
#' @param lhs symbol to test for value equality against rhs
#' @param rhs symbol to test for value equality agains lhs
#' @return boolean vector or scalar depending on lhs and rhs
is.equal <- function(lhs, rhs) {
  if(all(is.na(lhs)) && all(is.na(rhs))) {
    return(rep(TRUE, max(length(lhs), length(rhs))))
  } else if (any(is.na(lhs)) || any(is.na(rhs))) 
    {
      return(rep(FALSE, max(length(lhs), length(rhs))))
    }
    else 
    {
      lhs == rhs
    }

  }

#' Merge rows grouped by index column with exclusion of token
#' 
#' Merge rows in a data.frame by excluding token. 
#' Rows are considered by grouping as given by factor 'ind'
#' If exclusion of token does not render the group of rows
#' at that particular column value to only have one value
#' left, the vector of values left in that column will be
#' transformed to scalar by means of supplied tie function. 
#' 
#' @param df data.frame to do row-wise merge in
#' @param ind name or index of column to apply grouping of rows against
#' @param token a value to exclude during merge of rows
#' @param tie a function that takes a vector and returns a scalar
#' @param replace value to substitute with if column becomes efter merge, ie the group
#' rows contained only token
#' @param return merged data.frame
merge.rows = function(df, ind, token, tie, replace) {
  #check invariants
  col.names <- names(df)
  stopifnot(ind %in% col.names | ind <= length(col.names))


  if(is.character(ind)) {ind <- which(ind %in% col.names)} #make sure ind is numeric
  if(!(is.factor(df[, ind]))) {
    ind.type <- lapply(df[ind], class)
    df[ind] <- lapply(df[ind], as.factor)
  }
  ind.name <- col.names[ind]

  #fast return
  if(
    ! contains(token, df[, -ind])
  ) {
     return(df)   
      }
  #list for each group with values for each column
  merged.outer <-
  by(df, as.list(df[ind]), simplify = F, FUN = function(rows) {
    #short circuit with same return type

    if(nrow(rows) == 1) {return(as.list(rows))} 

    index.inner <- rows[1, ind]
    rows.data <- rows[-ind] #calculate without index factor

    rows.mask <- matrix.mask(token, rows.data)

    #list with value for each column in index group
    merged.inner <- 
      mapply(function(rs, mask){
      keep <- rs[! mask]

      if(length(keep) > 1){ #put back index
          tie(keep) #tie if many values
        } else {
          keep
        }
    }, rows.data, rows.mask, SIMPLIFY = F, USE.NAMES = T)

    should.replace <- lapply(merged.inner, is.equal, token)

    merged.inner.replaced <- 
    mapply(function(val, should) {
      if(should) {val <- replace} else {val}
    }, merged.inner, should.replace)

    c(as.list(rows[ind]), merged.inner.replaced)

  })

  #make df again

  df.merged <- Reduce(rbind, lapply(merged.outer, as.data.frame))
  df.merged

}

d1 = data.frame(Date = as.Date(c('2017-06-01', '2017-06-01', '2017-05-02'), format='%Y-%m-%d'),
                X = c(5, NA, 55),
                Y = c(NA, 8, 33))


a1 <- merge.rows(d1, "Date", NA)
#         Date  X  Y
# 1 2017-05-02 55 33
# 2 2017-06-01  5  8

d2 = data.frame(Date = as.Date(c('2017-06-01', '2017-06-01', '2017-05-02'), format='%Y-%m-%d'),
                X = c(5, 6, 55),
                Y = c(6, 8, 33))
a2 <- merge.rows(d2, "Date", 6)
#         Date  X  Y
# 1 2017-05-02 55 33
# 2 2017-06-01  5  8

#data from bouncyball for better comparison
dat <- structure(list(grp = c("a", "a", "a", "b", "b"), 
                  x = c(5, NA, 3, 8, NA), y = c(8, 9, NA, NA, 3)), 
                 .Names = c("grp", "x", "y"), row.names = c(NA, -5L), class = "data.frame")

#apply insert the mean if more than one value in grouped column
merge.rows(dat, "grp", NA, mean)
#   grp x   y
# 1   a 4 8.5
# 2   b 8 3.0

#apply insert the max if more than one value in grouped column
merge.rows(dat, "grp", NA, max)
#   grp x y
# 1   a 5 9
# 2   b 8 3

dat2 <- structure(list(grp1 = rep(c("a", "b"), each=4),
                       grp2 = rep(c("c", "d"), times = 2, each=2),
                      x = c(5, NA, 3, 8, NA, 8, 3, 9), y = c(8, 9, NA, NA, 3, 6, 2, NA)), 
                 .Names = c("grp1", "grp2", "x", "y"), row.names = c(NA, 8L), class = "data.frame")

#merge on group by two columns
merge.rows(dat2, c("grp1", "grp2"), NA, mean, 999)
#   grp1 grp2   x     y
# 1    a    c 5.0   8.5
# 2    a    c 5.0   8.5
# 3    b    c 8.0   4.5
# 4    b    c 8.0   4.5
# 5    a    d 5.5 999.0
# 6    a    d 5.5 999.0
# 7    b    d 6.0   2.0
# 8    b    d 6.0   2.0

答案 3 :(得分:0)

对于每个变量和组只有一个非NA值的给定样本数据,这可以通过简洁的&#34;单行&#34;来解决:

library(data.table)
# coerce to class data.table, group by column Date,
# for each of the other columns remove any NA values
setDT(DF)[, lapply(.SD, na.omit), Date]

         Date  x  y
1: 2017-06-01  5  8
2: 2017-05-02 55 33

数据

DF <- structure(list(Date = c("2017-06-01", "2017-06-01", "2017-05-02"
), x = c(5L, NA, 55L), y = c(NA, 8L, 33L)), .Names = c("Date", 
"x", "y"), row.names = c(NA, -3L), class = "data.frame")