我有一个名为 mymat 的矩阵,一个名为 geno 的向量:geno <- c("01","N1","11","1N","10"),还有另一个名为 key.table 的表。
我想将 key.table 中的 key 列与 mymat 中的 key 列进行匹配;如果匹配行中任何一列的值属于 geno 的元素,就从 mymat 中提取该列的名称以及匹配到的 geno 元素,并将其粘贴到 key.table 的新列 matched.extract 中每个 key 对应的行里,从而得到下面的结果。
# Example genotype matrix: a 'key' column followed by three columns of
# two-character codes; row names run from "34" to "38".
mymat <- cbind(
  key = c("chr5:12111", "chr5:12111", "chr5:12113", "chr5:12114",
          "chr5:12118"),
  "AMLM12001KP" = c("0N", "0N", "1N", "0N", "0N"),
  "AMAS-11.3-Diagnostic" = c("00", "00", "00", "11", "10"),
  "AMLM12014N-R" = c("00", "00", "1N", "0N", "00")
)
rownames(mymat) <- c("34", "35", "36", "37", "38")

# Lookup table pairing each 'key' with a 'variantId'; row names run from
# "34" to "44".
key.table <- cbind(
  key = c("chr5:12111", "chr5:12111", "chr5:12113", "chr5:12114",
          "chr5:12118", "chr5:12122", "chr5:12123", "chr5:12123",
          "chr5:12125", "chr5:12127", "chr5:12129"),
  variantId = c("9920068", "9920069", "9920070", "9920071", "9920072",
                "9920073", "9920074", "9920075", "9920076", "9920077",
                "9920078")
)
rownames(key.table) <- c("34", "35", "36", "37", "38", "39", "40", "41",
                         "42", "43", "44")
结果
key variantId matched.extract
34 "chr5:12111" "9920068" NA
35 "chr5:12111" "9920069" NA
36 "chr5:12113" "9920070" AMLM12001KP (1N),AMLM12014N-R (1N)
37 "chr5:12114" "9920071" AMAS-11.3-Diagnostic (11)
38 "chr5:12118" "9920072" AMAS-11.3-Diagnostic (10)
39 "chr5:12122" "9920073" NA
40 "chr5:12123" "9920074" NA
41 "chr5:12123" "9920075" NA
42 "chr5:12125" "9920076" NA
43 "chr5:12127" "9920077" NA
44 "chr5:12129" "9920078" NA
答案 0 :(得分:7)
使用data.table,我会这样做:
library(data.table)
# genotype codes of interest, exactly as listed in the question; using the
# full set (instead of only the codes present in the sample data) also
# catches "01" and "N1" when they occur
geno <- c("01", "N1", "11", "1N", "10")

# convert the 'key.table' matrix to a data.table; the row names are kept in
# the 'rn' column
kt <- as.data.table(key.table, keep.rownames = TRUE)

# convert the 'mymat' matrix to a data.table and melt it into long format,
# one row per (rn, key, column) combination
mm <- melt(as.data.table(mymat, keep.rownames = TRUE),
           id.vars = c("rn", "key"))

# build the "column (genotype)" label only where the value is one of 'geno';
# all other rows keep val = NA
mm[value %in% geno, val := paste0(variable, " (", value, ")")]

# paste the labels of each (rn, key) group together; groups without any
# label produce "" and are turned into a character NA
mm <- mm[, .(val = paste(val[!is.na(val)], collapse = ",")), by = .(rn, key)
         ][val == "", val := NA_character_]

# join the 'mm' and 'kt' data.tables: update 'kt' by reference
kt[mm, matched := val, on = c("rn", "key")]
给出:
> kt
    rn        key variantId                            matched
 1: 34 chr5:12111   9920068                                 NA
 2: 35 chr5:12111   9920069                                 NA
 3: 36 chr5:12113   9920070 AMLM12001KP (1N),AMLM12014N-R (1N)
 4: 37 chr5:12114   9920071          AMAS-11.3-Diagnostic (11)
 5: 38 chr5:12118   9920072          AMAS-11.3-Diagnostic (10)
 6: 39 chr5:12122   9920073                                 NA
 7: 40 chr5:12123   9920074                                 NA
 8: 41 chr5:12123   9920075                                 NA
 9: 42 chr5:12125   9920076                                 NA
10: 43 chr5:12127   9920077                                 NA
11: 44 chr5:12129   9920078                                 NA
解释:
- kt <- as.data.table(key.table, keep.rownames=TRUE) 会将矩阵 key.table 转换为 data.table(一种增强的 data.frame),并将行名存储在 rn 列中。
- mm <- melt(as.data.table(mymat, keep.rownames=TRUE), id=c("rn","key")) 会将矩阵 mymat 转换为 data.table,将行名存储在 rn 列中,并将该 data.table 融合(melt)为长格式。
- [value %in% c("1N","11","10"), val := paste0(variable," (",value,")")] 仅在 value 为 1N、11 或 10 的情况下,将 variable 的值(即 mymat 中的列名)与 value 的值粘贴在一起。
- [, .(val = paste(val[!is.na(val)], collapse = ",")), by = .(rn,key)] 会按 rn 和 key 变量分组,将 val 中非 NA 的行粘贴在一起。
- [val=="", val:=NA] 会将 val 中的空字符串转换为 NA 值。
- kt[mm, matched := val, on=c("rn","key")] 会在 rn 和 key 变量匹配的行上,用 mm 这个 data.table 中的 val 值通过引用更新 kt 这个 data.table。
警告:使用 data.table 时,最好不要将 key 用作变量名,因为 key 也是 data.table 中的一个参数。有关详细信息,请参阅 ?key。
答案 1 :(得分:4)
我不熟悉 dplyr 的函数。你可以试试 base R 的 merge 函数:
typedef __time_t time_t;
用于粘贴组织类型的列名称的功能:
# Left join of 'key.table' with 'mymat' on the 'key' column: every row of
# 'key.table' is kept (all.x = TRUE), and keys without a partner in 'mymat'
# get NA in the genotype columns. merge() coerces both matrices to data
# frames. TRUE is spelled out because T is an ordinary, reassignable variable.
mm <- merge(key.table, mymat, by = "key", all.x = TRUE)
mm
最终数据框:
# Build "<name> (<genotype>)" labels.
#   x: vector of genotype codes (may contain NA)
#   y: vector of names paired element-wise with x
# Returns the label where the code is neither missing nor one of "00"/"0N",
# and NA everywhere else.
get.geno <- function(x, y) {
  keep <- !(is.na(x) | x %in% c("00", "0N"))
  ifelse(keep, paste0(y, " (", x, ")"), NA)
}
# Run get.geno() over every row of columns 3 to 5 of 'mm', passing the names
# of those columns as its 'y' argument. apply() returns one column per input
# row, so the result is transposed back to one row per row of 'mm'.
a <- t(apply(
  X = mm[, 3:5],
  MARGIN = 1,
  FUN = get.geno,
  y = colnames(mm)[3:5]
))
答案 2 :(得分:1)
不完全确定你想要什么,但它可能接近这个:
# Collapse the non-missing labels in each row of 'a' into one comma-separated
# string and store it as the 'result' column of 'mm'.
mm$result <- apply(a, 1, function(labels) {
  present <- labels[!is.na(labels)]
  paste(present, collapse = ",")
})
# Print 'mm' without columns 3 to 5.
mm[, -(3:5)]
key variantId result
1 chr5:12111 9920068
2 chr5:12111 9920068
3 chr5:12111 9920069
4 chr5:12111 9920069
5 chr5:12113 9920070 AMLM12001KP (1N),AMLM12014N-R (1N)
6 chr5:12114 9920071 AMAS-11.3-Diagnostic (11)
7 chr5:12118 9920072 AMAS-11.3-Diagnostic (10)
8 chr5:12122 9920073
9 chr5:12123 9920074
10 chr5:12123 9920075
11 chr5:12125 9920076
12 chr5:12127 9920077
13 chr5:12129 9920078
得到:
library(reshape2)
# Example genotype matrix (5 x 4, filled column by column): a 'key' column
# followed by three columns of two-character codes.
mymat <- matrix(
  c("chr5:12111", "chr5:12111", "chr5:12113", "chr5:12114", "chr5:12118",
    "0N", "0N", "1N", "0N", "0N",
    "00", "00", "00", "11", "10",
    "00", "00", "1N", "0N", "00"),
  nrow = 5L,
  ncol = 4L,
  dimnames = list(
    c("34", "35", "36", "37", "38"),
    c("key", "AMLM12001KP", "AMAS-11.3-Diagnostic", "AMLM12014N-R")
  )
)

# Lookup table (11 x 2, filled column by column) pairing each 'key' with a
# 'variantId'.
key.table <- matrix(
  c("chr5:12111", "chr5:12111", "chr5:12113", "chr5:12114", "chr5:12118",
    "chr5:12122", "chr5:12123", "chr5:12123", "chr5:12125", "chr5:12127",
    "chr5:12129",
    "9920068", "9920069", "9920070", "9920071", "9920072", "9920073",
    "9920074", "9920075", "9920076", "9920077", "9920078"),
  nrow = 11L,
  ncol = 2L,
  dimnames = list(
    c("34", "35", "36", "37", "38", "39", "40", "41", "42", "43", "44"),
    c("key", "variantId")
  )
)
# work with data frames
# check.names = FALSE keeps column names such as "AMAS-11.3-Diagnostic"
# intact (data.frame() would otherwise rewrite them to syntactic names like
# "AMAS.11.3.Diagnostic"); stringsAsFactors = FALSE keeps the codes as plain
# character strings on R versions older than 4.0.
mmdf <- data.frame(mymat, check.names = FALSE, stringsAsFactors = FALSE)
ktdf <- data.frame(key.table, check.names = FALSE, stringsAsFactors = FALSE)

# inner join on 'key'; a key that occurs several times in both tables
# (e.g. "chr5:12111") yields one row per pairing
tdf <- merge(mmdf, ktdf, by = "key")

# long format: one row per (key, variantId, column) combination
mltdf <- melt(tdf, id.vars = c("key", "variantId"))

# drop the "0N" and "00" calls; %in% never returns NA, so a missing value
# cannot introduce an all-NA row the way a chain of != comparisons would
mltdf1 <- mltdf[!(mltdf$value %in% c("0N", "00")), ]
mltdf1