我需要帮助:把数据框 mydf 中 start 列取值相同的行合并成一行,并把这些行的 "ALT" 列内容用逗号连接起来,从而去掉 start 值重复的行。期望得到如下 result 所示的结果。谢谢你的帮助。
> mydf
chr start end REF ALT TYPE refGene
chr10 chr10:176131 176131 C A snp nonsynonymous SNV
chr10 chr10:159149 159149 C G snp:17659149 nonsynonymous SNV
chr10 chr10:159149 159149 C T snp:17659149 nonsynonymous SNV
chr10 chr10:241469 241469 T C snp splicing
> result
chr start end REF ALT TYPE refGene
chr10 chr10:176131 176131 C A snp nonsynonymous SNV
chr10 chr10:159149 159149 C G,T snp:17659149 nonsynonymous SNV
chr10 chr10:241469 241469 T C snp splicing
dput 的输出如下:
# dput() output of the sample data: a data.frame with 4 rows and 8 columns.
# The second and third rows share start "chr3:75786337" (ALT "A" and "T");
# they are the pair that should collapse into one row with ALT "A,T".
# NOTE: the expression is not assigned to a name here; prefix it with
# `mydf <- ` before running the answer snippets against it.
structure(list(chr = c("chr3", "chr3", "chr3", "chr3"), start = c("chr3:75786036",
"chr3:75786337", "chr3:75786337", "chr3:75788226"), end = c(75786036,
75786337, 75786337, 75788226), REF = c("A", "G", "G", "C"), ALT = c("G",
"A", "T", "A"), TYPE = c("snp:75786036", "snp:75786337", "snp:75786337",
"snp:75788226"), `refGene::location` = c("nonsynonymous SNV",
"nonsynonymous SNV", "nonsynonymous SNV", "nonsynonymous SNV"
), `refGene::type` = c("ZNF717:NM_001290208:exon5:c.T2738C:p.F913S,ZNF717:NM_001128223:exon5:c.T2738C:p.F913S,ZNF717:NM_001290209:exon5:c.T2588C:p.F863S,",
"ZNF717:NM_001290208:exon5:c.C2437T:p.P813S,ZNF717:NM_001128223:exon5:c.C2437T:p.P813S,ZNF717:NM_001290209:exon5:c.C2287T:p.P763S,",
"ZNF717:NM_001290208:exon5:c.C2437A:p.P813T,ZNF717:NM_001128223:exon5:c.C2437A:p.P813T,ZNF717:NM_001290209:exon5:c.C2287A:p.P763T,",
"ZNF717:NM_001290208:exon5:c.G548T:p.C183F,ZNF717:NM_001128223:exon5:c.G548T:p.C183F,ZNF717:NM_001290209:exon5:c.G398T:p.C133F,"
)), .Names = c("chr", "start", "end", "REF", "ALT", "TYPE", "refGene::location",
"refGene::type"), row.names = c("4041", "4051", "4052", "4128"
), class = "data.frame")
答案 0 :(得分:3)
这是一个使用 dplyr 的解决方案:
library(dplyr)
# Collapse ALT within each `start` group, then keep one row per `start`.
# - The input is `mydf` (the question's data frame), not `df`.
# - ungroup() drops the grouping so the result behaves like a plain table.
# - .keep_all = TRUE is required: without it distinct() returns only the
#   `start` column and every other column (including ALT) is lost.
mydf %>%
  group_by(start) %>%
  mutate(ALT = paste(ALT, collapse = ",")) %>%
  ungroup() %>%
  distinct(start, .keep_all = TRUE)
答案 1 :(得分:1)
在 base R 中可以尝试 aggregate:
# Join the ALT values of each `start` group with "," (no space, matching the
# requested "G,T" output; toString() would give "G, T").
alt_by_start <- aggregate(ALT ~ start, data = mydf,
                          FUN = function(x) paste(x, collapse = ","))
# Keep the first row of every `start` value, in the original row order.
newdf <- mydf[!duplicated(mydf$start), ]
# aggregate() returns its groups sorted by `start`, which need not be the row
# order of newdf, so look each value up by key instead of assigning by position.
newdf$ALT <- alt_by_start$ALT[match(newdf$start, alt_by_start$start)]
答案 2 :(得分:0)
类似下面的代码应该可行(未经测试,请用 dput 提供数据):
library(data.table)
# Work on a copy: setDT() would convert mydf by reference, and re-running an
# in-place := on it would paste the already-collapsed ALT values again.
dt <- as.data.table(mydf)
# Collapse ALT (not REF) within each `start` value (not TYPE). := writes the
# result back into dt; .() would only return a separate summary table.
dt[, ALT := paste(ALT, collapse = ","), by = start]
# Keep the first row of each `start` group; the printed result follows.
unique(dt, by = "start")
chr start end REF ALT TYPE refGene::location
1: chr3 chr3:75786036 75786036 A G snp:75786036 nonsynonymous SNV
2: chr3 chr3:75786337 75786337 G A,T snp:75786337 nonsynonymous SNV
3: chr3 chr3:75788226 75788226 C A snp:75788226 nonsynonymous SNV
refGene::type
1: ZNF717:NM_001290208:exon5:c.T2738C:p.F913S,ZNF717:NM_001128223:exon5:c.T2738C:p.F913S,ZNF717:NM_001290209:exon5:c.T2588C:p.F863S,
2: ZNF717:NM_001290208:exon5:c.C2437T:p.P813S,ZNF717:NM_001128223:exon5:c.C2437T:p.P813S,ZNF717:NM_001290209:exon5:c.C2287T:p.P763S,
3: ZNF717:NM_001290208:exon5:c.G548T:p.C183F,ZNF717:NM_001128223:exon5:c.G548T:p.C183F,ZNF717:NM_001290209:exon5:c.G398T:p.C133F,
>