我正在用示例数据和我自己的数据,按照 https://f1000research.com/articles/5-1492/v2 中描述的工作流程进行分析。流程本身运行正常,但我现在无法生成以“OTU00004”(或者更好的“kingdom_phylum_..._Pseudomonas_OTU00004”)这样的名称作为列标题的OTU表。我想用这样的表来查找并绘制某个OTU在多个样本中的丰度。
我创建了一个名为ps的对象,看来没问题:
# Bundle the taxonomy table, the sample metadata, the count table (taxa are
# the columns, hence taxa_are_rows = FALSE) and the tree stored in fitGTR
# into a single phyloseq object.
ps <- phyloseq(
  tax_table(taxtab),
  sample_data(samdf),
  otu_table(seqtab, taxa_are_rows = FALSE),
  phy_tree(fitGTR$tree)
)
> ps
phyloseq-class experiment-level object
otu_table() OTU Table: [ 454 taxa and 360 samples ]
sample_data() Sample Data: [ 360 samples by 14 sample variables ]
tax_table() Taxonomy Table: [ 454 taxa by 6 taxonomic ranks ]
phy_tree() Phylogenetic Tree: [ 454 tips and 452 internal nodes ]
但是OTU表的列标题和分类表中对应的行名都是实际的序列(此处已截短):
> head(otu_table(ps)[1])
GCAAGCGTTACTCGGAATCACTGGGCGTAAAGAGCGCGTAGGCGG#shortened
F3D0 0
> head(tax_table(ps)[1])
Taxonomy Table: [1 taxa by 6 taxonomic ranks]:
Kingdom
GCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGCAGGCGGA#shortened "Bacteria"
是否有一种方法可以合并otu表和分类表中的信息,并用编号的OTU id替换序列?我检查了几个phyloseq资源和常见问题解答,但是找不到答案。
我想要一张像这样的表格:
taxonomy_OTU00001 taxonomy_OTU00002 taxonomy_OTU00003
F3D0 #counts #counts #counts
F3D1 #counts #counts #counts
F3D11 #counts #counts #counts
F3D125 #counts #counts #counts
由于此步骤之前的工作流程非常耗时,所以我不确定如何为该问题提供可重复的示例。
# Small subset for a reproducible example: the first six taxa (columns) and,
# via head()'s default, the first six rows of the OTU table.
short_otu <- head(otu_table(ps)[, 1:6])            # sequences are the column names
short_tax <- tax_table(ps)[colnames(short_otu), ]  # sequences are the row names

# Working copies whose sequence labels are cut to their first 50 characters;
# the shortened labels must still be unique.
short_otu2 <- short_otu
short_tax2 <- short_tax
colnames(short_otu2) <- substr(colnames(short_otu), 1L, 50L)
rownames(short_tax2) <- substr(rownames(short_tax), 1L, 50L)
library(phyloseq)
> dput(short_otu2)
new("otu_table", .Data = structure(c(526L, 375L, 2931L, 994L,
2061L, 419L, 319L, 330L, 1737L, 623L, 1868L, 350L, 402L, 207L,
1880L, 577L, 887L, 303L, 413L, 64L, 838L, 698L, 939L, 484L, 146L,
126L, 496L, 440L, 1183L, 184L, 462L, 37L, 26L, 782L, 271L, 310L
), .Dim = c(6L, 6L), .Dimnames = list(c("F3D0", "F3D1", "F3D11",
"F3D125", "F3D13", "F3D141"), c("GCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGCAGGCGGAAGAT",
"GCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGCAGGCGGACTCT", "GCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGGCTGT",
"GCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGGCTTT", "CCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGTGGATTGT",
"GCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGCCTGC"))), taxa_are_rows = FALSE)
> dput(short_tax2)
new("taxonomyTable", .Data = structure(c("Bacteria", "Bacteria",
"Bacteria", "Bacteria", "Bacteria", "Bacteria", "Bacteroidetes",
"Bacteroidetes", "Bacteroidetes", "Bacteroidetes", "Bacteroidetes",
"Bacteroidetes", "Bacteroidia", "Bacteroidia", "Bacteroidia",
"Bacteroidia", "Bacteroidia", "Bacteroidia", "Bacteroidales",
"Bacteroidales", "Bacteroidales", "Bacteroidales", "Bacteroidales",
"Bacteroidales", "Bacteroidales_S24-7_group", "Bacteroidales_S24-7_group",
"Bacteroidales_S24-7_group", "Bacteroidales_S24-7_group", "Bacteroidaceae",
"Bacteroidales_S24-7_group", NA, NA, NA, NA, "Bacteroides", NA
), .Dim = c(6L, 6L), .Dimnames = list(c("GCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGCAGGCGGAAGAT",
"GCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGCAGGCGGACTCT", "GCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGGCTGT",
"GCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGGCTTT", "CCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGTGGATTGT",
"GCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGCCTGC"), c("Kingdom",
"Phylum", "Class", "Order", "Family", "Genus"))))
答案 0 :(得分:1)
这个回答针对的是问题中的这一部分:“如何用OTU id替换实际序列(phyloseq / dada2)?”
我联系了phyloseq / dada2的开发人员,并根据苏珊·福尔摩斯(Susan Holmes)的回复(https://github.com/joey711/phyloseq/issues/1030)写出了下面这段代码,它把作为标题的扩增子序列替换为带编号的OTU id。
进一步的讨论可以在这里找到:https://github.com/joey711/phyloseq/issues/213
# Replace the sequence-valued taxa names with short, zero-padded ids
# (Seq_001, Seq_002, ...). Ids are assigned in the current taxa order of `ps`.
taxa_names(ps)  # before: the names are the full sequences
n_taxa <- ntaxa(ps)
# seq_len() yields an empty vector for zero taxa, whereas seq(0) gives c(1, 0).
n_seqs <- seq_len(n_taxa)
# Number of digits of the largest index, so that every id has the same width.
len_n_seqs <- nchar(n_taxa)
taxa_names(ps) <- paste(
  "Seq",
  formatC(n_seqs, width = len_n_seqs, flag = "0"),
  sep = "_"
)
taxa_names(ps)  # after: Seq_001, Seq_002, ...
将分类法包含在标题中的一种可能方法如下(从上面继续):
# Build an export table (samples in rows, OTUs in columns) whose column
# headers are the full taxonomy path followed by the OTU id, e.g.
# Bacteria__Bacteroidetes__...__Bacteroides__Seq_005.
tax_ranks <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus")

# Plain count matrix with OTUs in the columns, whichever orientation the
# otu_table of `ps` is stored in.
otu_mat <- as(otu_table(ps), "matrix")
if (taxa_are_rows(ps)) {
  otu_mat <- t(otu_mat)
}
otu_ids <- colnames(otu_mat)

# Look the taxonomy up BY OTU id instead of relying on both tables happening
# to be in the same order; fail loudly if an OTU has no taxonomy row.
tax_mat <- as(tax_table(ps), "matrix")
stopifnot(all(otu_ids %in% rownames(tax_mat)))
tax_df <- as.data.frame(tax_mat[otu_ids, tax_ranks, drop = FALSE])

# Full taxonomy path for every OTU, in the column order of otu_mat. "__" is
# the separator so it can be told apart from "_" inside rank names; ranks
# that are NA end up as the text "NA" in the path.
wholetax <- do.call(paste, c(tax_df, sep = "__"))

# Turn the counts into a data.frame and overwrite the old column names with
# "<taxonomy path>__<OTU id>" in one vectorised step.
otu_export <- as.data.frame(otu_mat)
names(otu_export) <- paste(wholetax, otu_ids, sep = "__")
> head(otu_export)[5]
# output:
Bacteria__Bacteroidetes__Bacteroidia__Bacteroidales__Bacteroidaceae__Bacteroides__Seq_005
F3D0 146
F3D1 126
F3D11 496
F3D125 440
F3D13 1183
F3D141 184
这还不包括表之间正确排序的测试!因此,请确保粘贴和覆盖正确。
这样,您就可以把一个data.frame导出到单个文件中,其中包含可按“__”拆分为各分类等级的分类信息、OTU id、样品名称和计数。同时,除了导出的文件之外,您仍然保留着phyloseq结构,其中OTU id把otu_table()和tax_table()等不同的表关联在一起。另一种方法是把taxa_names()向量直接传给生成wholetax的命令,但我尚未对此进行测试。
我们非常欢迎您提出改进建议!