如何匹配2个数据框列并提取列值和列名?

时间:2015-12-22 08:27:23

标签: r

我有一个名为mymat的矩阵,还有一个名为geno <- c("01","N1","11","1N","10")的向量,以及另一个名为key.table的表。我想把key.table中的key列与mymat中的key列进行匹配;如果匹配行中任何一列的值属于geno中的元素,就从mymat中提取该列的列名以及匹配到的geno元素,并把它们粘贴到key.table中名为matched.extract的新列里(放在每个key对应的行中),从而得到下面的结果。

  # Sample input 1: a 5 x 4 character matrix with row names "34".."38".
  # The first column holds the key; the other three columns hold
  # two-character codes, one column per sample. Values are listed column by
  # column, which is the order matrix() fills in by default.
  mymat <- matrix(
    c(
      "chr5:12111", "chr5:12111", "chr5:12113", "chr5:12114", "chr5:12118",
      "0N", "0N", "1N", "0N", "0N",
      "00", "00", "00", "11", "10",
      "00", "00", "1N", "0N", "00"
    ),
    nrow = 5L,
    ncol = 4L,
    dimnames = list(
      c("34", "35", "36", "37", "38"),
      c("key", "AMLM12001KP", "AMAS-11.3-Diagnostic", "AMLM12014N-R")
    )
  )

# Sample input 2: an 11 x 2 character matrix with row names "34".."44",
# pairing each key with a variantId.
key.table <- matrix(
  c(
    "chr5:12111", "chr5:12111", "chr5:12113", "chr5:12114", "chr5:12118",
    "chr5:12122", "chr5:12123", "chr5:12123", "chr5:12125", "chr5:12127",
    "chr5:12129",
    "9920068", "9920069", "9920070", "9920071", "9920072", "9920073",
    "9920074", "9920075", "9920076", "9920077", "9920078"
  ),
  nrow = 11L,
  ncol = 2L,
  dimnames = list(
    c("34", "35", "36", "37", "38", "39", "40", "41", "42", "43", "44"),
    c("key", "variantId")
  )
)

结果

  key          variantId    matched.extract
34 "chr5:12111" "9920068"     NA
35 "chr5:12111" "9920069"     NA
36 "chr5:12113" "9920070"     AMLM12001KP (1N),AMLM12014N-R (1N)
37 "chr5:12114" "9920071"     AMAS-11.3-Diagnostic (11)
38 "chr5:12118" "9920072"     AMAS-11.3-Diagnostic (10)
39 "chr5:12122" "9920073"     NA
40 "chr5:12123" "9920074"     NA
41 "chr5:12123" "9920075"     NA
42 "chr5:12125" "9920076"     NA
43 "chr5:12127" "9920077"     NA
44 "chr5:12129" "9920078"     NA

3 个答案:

答案 0 :(得分:7)

使用data.table包,我会这样做:

library(data.table)
# codes to report, exactly as listed in the question (the full set, so that
# "01" and "N1" are picked up too if they occur)
geno <- c("01", "N1", "11", "1N", "10")
# convert the 'key.table' matrix to a data.table; the row names go to column 'rn'
kt <- as.data.table(key.table, keep.rownames = TRUE)
# convert the 'mymat' matrix to a data.table and melt it into long format:
# one row per (rn, key, variable) with the cell content in 'value'
mm_long <- melt(as.data.table(mymat, keep.rownames = TRUE),
                id.vars = c("rn", "key"))
# label the cells holding one of the wanted codes as "<column name> (<code>)";
# every other cell gets a character NA, so 'val' always exists and is character
mm_long[, val := ifelse(value %in% geno,
                        paste0(variable, " (", value, ")"),
                        NA_character_)]
# paste the non-NA labels of each (rn, key) group together, comma separated
mm <- mm_long[, .(val = paste(val[!is.na(val)], collapse = ",")),
              by = .(rn, key)]
# groups without any label end up as "", turn those into NA
mm[val == "", val := NA_character_]
# join the 'mm' and 'kt' data.tables: update 'kt' by reference with the
# matching 'val' of 'mm'
kt[mm, matched := val, on = c("rn", "key")]

给出:

> kt
    rn        key variantId                            matched
 1: 34 chr5:12111   9920068                                 NA
 2: 35 chr5:12111   9920069                                 NA
 3: 36 chr5:12113   9920070 AMLM12001KP (1N),AMLM12014N-R (1N)
 4: 37 chr5:12114   9920071          AMAS-11.3-Diagnostic (11)
 5: 38 chr5:12118   9920072          AMAS-11.3-Diagnostic (10)
 6: 39 chr5:12122   9920073                                 NA
 7: 40 chr5:12123   9920074                                 NA
 8: 41 chr5:12123   9920075                                 NA
 9: 42 chr5:12125   9920076                                 NA
10: 43 chr5:12127   9920077                                 NA
11: 44 chr5:12129   9920078                                 NA

解释

  • kt <- as.data.table(key.table, keep.rownames=TRUE)会将矩阵key.table转换为data.table(这是一个增强的data.frame),并将这些rownames存储在rn列中。
  • mm <- melt(as.data.table(mymat, keep.rownames=TRUE), id=c("rn","key"))会将矩阵mymat转换为data.table,将rownames存储在rn列中,并将data.table融合为长格式。
  • 部分[value %in% c("1N","11","10"), val := paste0(variable," (",value,")")]会把variable的值(即mymat中的列名)与value的值按所需格式粘贴在一起,但仅限于value为1N、11或10的行。
  • 部分[, .(val = paste(val[!is.na(val)], collapse = ",")), by = .(rn,key)]会按rn和key变量分组,把每组中val不为NA的行粘贴在一起。
  • 部分[val=="", val:=NA]会将val的空行转换为NA - 值
  • 最后,kt[mm, matched := val, on=c("rn","key")]会在rn和key变量上匹配kt与mm这两个data.table,并通过引用把mm中对应的val值写入kt的matched列。

警告:使用data.table时,最好不要将key用作变量名称,因为key也是data.table中的一个参数。有关详细信息,请参阅data.table的文档。

答案 1 :(得分:4)

我不熟悉dplyr函数。您可以尝试基础R的merge函数:

/* Declares time_t as an alias for __time_t.
   NOTE(review): this is a C declaration sitting inside an R answer; it looks
   unrelated to the surrounding text - confirm whether it belongs here. */
typedef __time_t time_t;

用于粘贴基因型(geno)所在列的列名的函数:

# Left-join the columns of 'mymat' onto 'key.table' by the shared "key"
# column (merge() turns both matrices into data frames first).
# all.x = TRUE keeps the rows of 'key.table' that have no match in 'mymat';
# their 'mymat' columns are filled with NA. TRUE is spelled out because the
# shorthand T is an ordinary variable that can be reassigned.
# Note: a key that occurs more than once in both inputs is matched in every
# combination, so such keys produce extra rows.
mm <- merge(key.table, mymat, by = "key", all.x = TRUE)
mm

最终数据框:

# Label the codes in 'x' that belong to the wanted set.
#   x    character vector of codes (may contain NA)
#   y    label(s) to paste in front of the code, e.g. the column names;
#        recycled against 'x' by paste0()
#   geno codes that should be reported; defaults to the set from the question
# Returns a character vector as long as 'x': "<y> (<x>)" where the code is in
# 'geno', NA otherwise. Matching against 'geno' (rather than excluding "00"
# and "0N") keeps any other code, such as "NN", out of the result.
get.geno <- function(x, y, geno = c("01", "N1", "11", "1N", "10")) {
  # is.na() comes first so the names of 'x' carry over to the result
  wanted <- !is.na(x) & x %in% geno
  ifelse(wanted, paste0(y, " (", x, ")"), NA_character_)
}
# Run get.geno over every row of columns 3 to 5 of 'mm', handing it the names
# of those columns as labels. apply() returns one column per input row, so the
# result is transposed to get one row per row of 'mm' again.
a <- t(apply(X = mm[, 3:5], MARGIN = 1, FUN = get.geno, colnames(mm)[3:5]))

答案 2 :(得分:1)

不完全确定你想要什么,但它可能接近这个:

# For each row of 'a', drop the NA entries and join the remaining labels with
# commas; a row with nothing left gives an empty string.
mm$result <- apply(a, 1, function(hits) paste(hits[!is.na(hits)], collapse = ","))
# Show the merged table without columns 3 to 5.
mm[, -(3:5)]
          key   variantId                           result
1  chr5:12111   9920068                                   
2  chr5:12111   9920068                                   
3  chr5:12111   9920069                                   
4  chr5:12111   9920069                                   
5  chr5:12113   9920070 AMLM12001KP (1N),AMLM12014N-R (1N)
6  chr5:12114   9920071          AMAS-11.3-Diagnostic (11)
7  chr5:12118   9920072          AMAS-11.3-Diagnostic (10)
8  chr5:12122   9920073                                   
9  chr5:12123   9920074                                   
10 chr5:12123   9920075                                   
11 chr5:12125   9920076                                   
12 chr5:12127   9920077                                   
13 chr5:12129   9920078    

结果如下:

library(reshape2)
# Sample input 1: character matrix built column by column - the key column
# followed by one column of two-character codes per sample.
mymat <- cbind(
  "key" = c("chr5:12111", "chr5:12111", "chr5:12113", "chr5:12114",
            "chr5:12118"),
  "AMLM12001KP" = c("0N", "0N", "1N", "0N", "0N"),
  "AMAS-11.3-Diagnostic" = c("00", "00", "00", "11", "10"),
  "AMLM12014N-R" = c("00", "00", "1N", "0N", "00")
)
rownames(mymat) <- c("34", "35", "36", "37", "38")

# Sample input 2: character matrix pairing each key with a variantId.
key.table <- cbind(
  "key" = c("chr5:12111", "chr5:12111", "chr5:12113", "chr5:12114",
            "chr5:12118", "chr5:12122", "chr5:12123", "chr5:12123",
            "chr5:12125", "chr5:12127", "chr5:12129"),
  "variantId" = c("9920068", "9920069", "9920070", "9920071", "9920072",
                  "9920073", "9920074", "9920075", "9920076", "9920077",
                  "9920078")
)
rownames(key.table) <- c("34", "35", "36", "37", "38", "39", "40", "41",
                         "42", "43", "44")

# Codes to report, as listed in the question.
geno <- c("01", "N1", "11", "1N", "10")

# Work with data frames.
# check.names = FALSE keeps column names such as "AMAS-11.3-Diagnostic"
# exactly as they are (the default rewrites them into syntactic names), and
# stringsAsFactors = FALSE keeps the values as character on R versions
# before 4.0 as well.
mmdf <- data.frame(mymat, check.names = FALSE, stringsAsFactors = FALSE)
ktdf <- data.frame(key.table, check.names = FALSE, stringsAsFactors = FALSE)

# Inner join on "key": every row of 'mmdf' gets the variantId(s) of its key.
tdf <- merge(mmdf, ktdf, by = "key")
# Long format: one row per key / variantId / original column, with the
# column name in 'variable' and the cell content in 'value'.
mltdf <- melt(tdf, id.vars = c("key", "variantId"))
# Keep only the rows whose value is one of the wanted codes. %in% never
# yields NA, so missing values are dropped instead of producing NA rows.
mltdf1 <- mltdf[mltdf$value %in% geno, ]

mltdf1