比较两个数据集并找出共同的基因名称

时间:2015-04-13 15:28:18

标签: r dataframe

如何比较两个数据集,并在两者的 CNA 和 chr 都相同的前提下,找出它们共同的基因名称?

dt1

    CNA     chr   Genes
    gain    5     Sall3,Kcng2,Atp9b,Nfatc1,Ctdp1
    loss    5     RNU6-866P,TRIM5,TRIM34,TRIM22,TRIM5
    gain    2     PDIA5,SEMA5B

dt2

    CNA     chr   Genes
    gain    5     Sall3,Nfatc1,SNORA5,SNORA5
    gain    5     RNU6-866P,OR8J1,OR8K3,OR8K3
    gain    2     PDIA5,DCC

预期输出

df3

    CNA     chr   Genes
    gain    5     Sall3,Nfatc1
    gain    2     PDIA5

我知道这可能是一个很简单的问题,但如果能得到一些建议,我将不胜感激。

2 个答案:

答案 0 :(得分:3)

这是一种方法:

library(data.table)

## Pool df2's gene lists so that there is exactly one row per (CNA, chr) group.
## The pooled column is named Genes.1 explicitly instead of relying on the
## join to rename the clashing column: recent data.table releases name it
## i.Genes, which would make the Genes.1 reference below fail.
df2 <- setDT(df2)[, list(Genes.1 = paste(Genes, collapse = ",")),
                  by = list(CNA, chr)]

## Join on the two key columns. `on =` avoids setkey(), which would reorder
## df1 by reference; nomatch = 0L drops df2 groups that have no row in df1.
res <- setDT(df1)[df2, on = c("CNA", "chr"), nomatch = 0L]

#    CNA chr                          Genes                                                Genes.1
#1: gain   5 Sall3,Kcng2,Atp9b,Nfatc1,Ctdp1 Sall3,Nfatc1,SNORA5,SNORA5,RNU6-866P,OR8J1,OR8K3,OR8K3
#2: gain   2                   PDIA5,SEMA5B                                              PDIA5,DCC

## For each (CNA, chr) group, split both comma-separated lists and keep the
## genes present on both sides. unlist() pools every row of the group, so a
## group with several df1 rows is handled as well.
res[, list(Genes = paste(intersect(unlist(strsplit(Genes, ",")),
                                   unlist(strsplit(Genes.1, ","))),
                         collapse = ","))
    , by = list(CNA, chr)]

#    CNA chr        Genes
#1: gain   5 Sall3,Nfatc1
#2: gain   2        PDIA5

数据:

# Example input df1: three rows, each with a CNA label, a chromosome number
# and a comma-separated gene list.
df1 <- data.frame(
  CNA = c("gain", "gain", "loss"),
  chr = c(2L, 5L, 5L),
  Genes = c(
    "PDIA5,SEMA5B",
    "Sall3,Kcng2,Atp9b,Nfatc1,Ctdp1",
    "RNU6-866P,TRIM5,TRIM34,TRIM22,TRIM5"
  ),
  stringsAsFactors = FALSE
)

# Example input df2: three rows; the (gain, 5) combination appears twice.
df2 <- data.frame(
  CNA = c("gain", "gain", "gain"),
  chr = c(5L, 5L, 2L),
  Genes = c(
    "Sall3,Nfatc1,SNORA5,SNORA5",
    "RNU6-866P,OR8J1,OR8K3,OR8K3",
    "PDIA5,DCC"
  ),
  stringsAsFactors = FALSE
)

答案 1 :(得分:2)

不是很优雅,但可以这样做:

# Example table dt1, parsed from inline whitespace-separated text; the first
# line is the header and character columns are kept as character.
dt1 <- read.table(
  stringsAsFactors = FALSE,
  header = TRUE,
  text = "CNA     chr   Genes
gain    5     Sall3,Kcng2,Atp9b,Nfatc1,Ctdp1
loss    5     RNU6-866P,TRIM5,TRIM34,TRIM22,TRIM5
gain    2     PDIA5,SEMA5B"
)


# Example table dt2, parsed the same way; the (gain, 5) combination appears
# on two rows.
dt2 <- read.table(
  stringsAsFactors = FALSE,
  header = TRUE,
  text = "CNA     chr   Genes
gain    5     Sall3,Nfatc1,SNORA5,SNORA5
gain    5     RNU6-866P,OR8J1,OR8K3,OR8K3
gain    2     PDIA5,DCC"
)



## Find the genes shared by two data frames, row by row.
##
## Row i of `x` is paired with row i of `y` (positional pairing, not a keyed
## join). A row is kept only when its first two columns (CNA and chr in the
## example data) are equal in both inputs; the kept row carries the
## comma-separated intersection of the two gene lists.
##
## Arguments:
##   x, y  data frames whose first two columns are the matching keys; `y`
##         must have at least as many rows as `x` (extra rows are ignored).
##   z     name of the column holding the comma-separated gene names.
##
## Returns a data frame holding the first two columns of `x` plus a `Genes`
## column, restricted to the rows whose keys match. Row names of `x` are
## preserved. A zero-row `x` yields a zero-row result.
f <- function(x, y, z = 'Genes') {
  n <- nrow(x)
  if (nrow(y) < n) {
    stop("`y` must have at least as many rows as `x`", call. = FALSE)
  }
  ## seq_len() is empty for n == 0, unlike 1:n which would yield c(1, 0)
  rows <- seq_len(n)

  ## split the genes out and find the common ones for each pair of rows;
  ## as.character() lets factor columns through strsplit()
  xx <- strsplit(as.character(x[[z]]), ",")
  yy <- strsplit(as.character(y[[z]])[rows], ",")
  common <- Map(intersect, xx, yy)

  ## collapse every intersection back into one string; vapply() guarantees a
  ## character vector even when there are no rows
  genes <- vapply(common, paste, character(1), collapse = ",",
                  USE.NAMES = FALSE)
  res <- data.frame(x[, 1:2], Genes = genes,
                    check.names = FALSE, stringsAsFactors = FALSE)

  ## keep only the rows whose first two columns agree in both inputs; a
  ## comparison involving NA counts as "no match" instead of producing NA rows
  idx <- x[[1]] == y[[1]][rows] & x[[2]] == y[[2]][rows]
  idx[is.na(idx)] <- FALSE
  res[idx, , drop = FALSE]
}


## Apply f() to the two example tables; rows are paired by position.
f(dt1, dt2)

## Printed result. Row names 1 and 3 are the positions of the kept rows in
## dt1; row 2 is dropped because dt1 has loss/5 there while dt2 has gain/5.
#    CNA chr        Genes
# 1 gain   5 Sall3,Nfatc1
# 3 gain   2        PDIA5