我们知道,由三个碱基组成的密码子编码氨基酸,例如 ATG 只编码 M(蛋氨酸),而 ATC、ATA、ATT 都编码 I(异亮氨酸)。对于 M,ATG 在其密码子中所占的比例始终为 1;对于 I,ATC 所占的比例为 0.33,ATA 和 ATT 也是如此。我想写一个函数,统计序列中各密码子的数量,然后计算每个密码子在编码同一氨基酸的密码子中所占的百分比。
# Codon -> amino-acid lookup, one codon family (shared first two bases) per row.
codon <- list(
  ATA = "I", ATC = "I", ATT = "I", ATG = "M",
  ACA = "T", ACC = "T", ACG = "T", ACT = "T",
  AAC = "N", AAT = "N", AAA = "K", AAG = "K",
  AGC = "S", AGT = "S", AGA = "R", AGG = "R",
  CTA = "L", CTC = "L", CTG = "L", CTT = "L",
  CCA = "P", CCC = "P", CCG = "P", CCT = "P",
  CAC = "H", CAT = "H", CAA = "Q", CAG = "Q",
  CGA = "R", CGC = "R", CGG = "R", CGT = "R",
  GTA = "V", GTC = "V", GTG = "V", GTT = "V",
  GCA = "A", GCC = "A", GCG = "A", GCT = "A",
  GAC = "D", GAT = "D", GAA = "E", GAG = "E",
  GGA = "G", GGC = "G", GGG = "G", GGT = "G",
  TCA = "S", TCC = "S", TCG = "S", TCT = "S",
  TTC = "F", TTT = "F", TTA = "L", TTG = "L",
  TAC = "Y", TAT = "Y", TAA = "stop", TAG = "stop",
  TGC = "C", TGT = "C", TGA = "stop", TGG = "W"
)
# Reciprocal of the number of codons listed for each amino acid; the outer
# parentheses make the assignment print its value at the console.
(fracs <- 1 / table(unlist(codon)))
A C D E F G H I K L M
0.2500000 0.5000000 0.5000000 0.5000000 0.5000000 0.2500000 0.5000000 0.3333333 0.5000000 0.1666667 1.0000000
N P Q R S stop T V W Y
0.5000000 0.2500000 0.5000000 0.1666667 0.1666667 0.3333333 0.2500000 0.2500000 1.0000000 0.5000000
# For every codon, look up the value stored in `fracs` for its amino acid.
# lapply() carries the codon names over from `codon`, so the result is a list
# keyed by codon.
codonfracs <- lapply(codon, function(aa) unname(fracs[aa]))
# Show the structure of the first few entries.
str(head(codonfracs))
List of 6
$ ATA: num 0.333
$ ATC: num 0.333
$ ATT: num 0.333
$ ATG: num 1
$ ACA: num 0.25
$ ACC: num 0.25
# Example sequence. Each of the three pasted lines ends in a newline character,
# and those newlines are part of the string.
s <- paste0(
  "AAGGCCTGCGCAAATATTTCCACTCCTTCCCGGGTGCTCCTGAGTTGAACCCGC\n",
  "TTAGAGACTCCGAAATCAACGACGACTTCCACCAGTGGGCCCAGTGACCGCCACACTGGA\n",
  "CCCCATACCACTTCTTTTTGTTATTCTTAAATATGTT\n"
)
# Cut the string `s` into consecutive pieces of `k` characters (default 3).
# The last piece is shorter when nchar(s) is not a multiple of `k`.
# Returns a character vector of the pieces, in order.
strsplit3 <- function(s, k=3) {
  total <- nchar(s)
  from <- seq.int(1, total, by = k)
  # each piece ends right before the next one starts; the last ends at `total`
  to <- c(from[-1] - 1, total)
  substring(s, from, to)
}
# Split the example sequence `s` into 3-character pieces and display the result.
strsplit3(s)
[1] "AAG" "GCC" "TGC" "GCA" "AAT" "ATT" "TCC" "ACT" "CCT" "TCC" "CGG" "GTG" "CTC" "CTG" "AGT" "TGA"
[17] "ACC" "CGC" "\nTT" "AGA" "GAC" "TCC" "GAA" "ATC" "AAC" "GAC" "GAC" "TTC" "CAC" "CAG" "TGG" "GCC"
[33] "CAG" "TGA" "CCG" "CCA" "CAC" "TGG" "A\nC" "CCC" "ATA" "CCA" "CTT" "CTT" "TTT" "GTT" "ATT" "CTT"
[49] "AAA" "TAT" "GTT" "\n"
我已经把输入序列按每 3 个碱基一组进行了切分。请指导我如何统计序列中每个密码子的出现次数及其所占的百分比。我期望的输出是一张包含四列的表格:密码子、它编码的氨基酸、密码子的数量,以及它在编码该氨基酸的密码子中所占的百分比。
答案 0 :(得分:0)
更新后的答案:评论中的示例非常有用。下面的代码似乎复制了那些计算。
由于您的问题使用的是基础 R,我的回答也保持使用基础 R。但是,如果您打算继续用 R 做这类分析,建议考虑使用 R 包 `tidyverse` 进行数据处理和制表。它的语法和清晰度明显优于基础 R。
这给出了结果
#
# The codon table is first written as a named character vector (codon ->
# amino acid) and then converted to a data.frame with one row per codon.
# Each row of the literal holds one codon family (shared first two bases).
#
Amino_Acid <- c(
  ATA = "I", ATC = "I", ATT = "I", ATG = "M",
  ACA = "T", ACC = "T", ACG = "T", ACT = "T",
  AAC = "N", AAT = "N", AAA = "K", AAG = "K",
  AGC = "S", AGT = "S", AGA = "R", AGG = "R",
  CTA = "L", CTC = "L", CTG = "L", CTT = "L",
  CCA = "P", CCC = "P", CCG = "P", CCT = "P",
  CAC = "H", CAT = "H", CAA = "Q", CAG = "Q",
  CGA = "R", CGC = "R", CGG = "R", CGT = "R",
  GTA = "V", GTC = "V", GTG = "V", GTT = "V",
  GCA = "A", GCC = "A", GCG = "A", GCT = "A",
  GAC = "D", GAT = "D", GAA = "E", GAG = "E",
  GGA = "G", GGC = "G", GGG = "G", GGT = "G",
  TCA = "S", TCC = "S", TCG = "S", TCT = "S",
  TTC = "F", TTT = "F", TTA = "L", TTG = "L",
  TAC = "Y", TAT = "Y", TAA = "stop", TAG = "stop",
  TGC = "C", TGT = "C", TGA = "stop", TGG = "W"
)
amino_acid <- data.frame(
  Amino_Acid = Amino_Acid,
  Triplet = names(Amino_Acid),
  stringsAsFactors = FALSE
)
#
# Build the sequence from several short literals with paste0 so that it
# contains no end-of-line characters and no source line gets very long.
#
s <- paste0(
  "AAGGCCTGCGCAAATATTTCCACTCCTTCCCGGGTGCTCCTGAGTTGAACCCGCTTAGAGACTCCG",
  "AAATCAACGACGACTTCCACCAGTGGGCCCAGTGACCGCCACACTGGACCCCATACCACTTCTTTT",
  "TGTTATTCTTAAATATGTT"
)
#
# function to split sequence into codons
#
# string - a single character string holding the sequence
# k      - number of characters per piece (default 3, i.e. one codon)
#
# Returns a character vector of consecutive k-character pieces. The last piece
# is shorter when nchar(string) is not a multiple of k. An empty string gives
# character(0) instead of an error from seq().
#
strsplit3 <- function(string, k = 3) {
  n_chars <- nchar(string)
  if (n_chars == 0) {
    return(character(0))
  }
  starts <- seq(1, n_chars, by = k)
  # substring() is vectorised over the start/stop positions and clamps the
  # final stop to the end of the string
  substring(string, starts, starts + k - 1)
}
#
# function to return counts in data.frame with given names
#
# data       - object indexed as data[, column] to get the values to tabulate
# column     - name of the column to tabulate; also used as the name of the
#              first column of the result
# count_name - name given to the result column that holds the counts
#
table_df <- function(data, column, count_name) {
  counts <- table(data[, column], dnn = column)
  # dispatches to the table method, which produces one row per distinct value
  as.data.frame(counts, responseName = count_name)
}
#
# split the sequence into triplets and look up the amino acid of each one
# NOTE(review): num_triplets is taken before the join, so it counts every piece
# returned by strsplit3, including any piece with no entry in amino_acid -
# confirm that this is the intended denominator for Freq_1k
#
codon_seq <- data.frame(Triplet = strsplit3(s))
num_triplets <- nrow(codon_seq)
codon_seq <- merge(codon_seq, amino_acid, by = "Triplet")
#
# count amino acids, then triplets, joining each count table back on its key
#
codon_seq <- merge(
  codon_seq,
  table_df(codon_seq, column = "Amino_Acid", count_name = "AA_cnt"),
  by = "Amino_Acid"
)
codon_seq <- merge(
  codon_seq,
  table_df(codon_seq, column = "Triplet", count_name = "Number"),
  by = "Triplet"
)
#
# drop duplicate rows, then build the final table in the column order of the
# example: Fraction is the triplet count relative to its amino acid's count,
# Freq_1k is the triplet count per thousand triplets
#
codon_seq <- unique(codon_seq)
codon_seq <- data.frame(
  Triplet = codon_seq$Triplet,
  Amino_Acid = codon_seq$Amino_Acid,
  Fraction = round(codon_seq$Number / codon_seq$AA_cnt, 2),
  Freq_1k = round(1000 * codon_seq$Number / num_triplets, 1),
  Number = codon_seq$Number
)