如何计算字符的出现次数,然后查找其百分比

时间:2018-06-09 08:43:32

标签: r bioinformatics dna-sequence

我们知道,三联体密码子编码氨基酸,例如只有 ATG 编码 M(蛋氨酸),而 ATC、ATA、ATT 都编码 I(异亮氨酸)。因此,在编码 M 的密码子中,ATG 所占的比例总是 1;在编码 I 的密码子中,ATC 所占的比例总是 0.33,ATA 和 ATT 也是如此。我想写一个函数,先统计序列中各个密码子的数量,然后计算每个密码子在编码同一种氨基酸的所有密码子中所占的百分比。

# Codon table: each DNA triplet (element name) maps to the one-letter code of
# the amino acid it encodes, or to "stop" for a termination codon.
codon <- list(
  ATA = "I", ATC = "I", ATT = "I", ATG = "M",
  ACA = "T", ACC = "T", ACG = "T", ACT = "T",
  AAC = "N", AAT = "N", AAA = "K", AAG = "K",
  AGC = "S", AGT = "S", AGA = "R", AGG = "R",
  CTA = "L", CTC = "L", CTG = "L", CTT = "L",
  CCA = "P", CCC = "P", CCG = "P", CCT = "P",
  CAC = "H", CAT = "H", CAA = "Q", CAG = "Q",
  CGA = "R", CGC = "R", CGG = "R", CGT = "R",
  GTA = "V", GTC = "V", GTG = "V", GTT = "V",
  GCA = "A", GCC = "A", GCG = "A", GCT = "A",
  GAC = "D", GAT = "D", GAA = "E", GAG = "E",
  GGA = "G", GGC = "G", GGG = "G", GGT = "G",
  TCA = "S", TCC = "S", TCG = "S", TCT = "S",
  TTC = "F", TTT = "F", TTA = "L", TTG = "L",
  TAC = "Y", TAT = "Y", TAA = "stop", TAG = "stop",
  TGC = "C", TGT = "C", TGA = "stop", TGG = "W"
)

# For every amino acid, the reciprocal of the number of codons that encode it
# (e.g. 1/3 for "I", 1 for "M"); the bare name on the last line prints it.
fracs <- 1 / table(unlist(codon))
fracs





      A         C         D         E         F         G         H         I         K         L         M 
0.2500000 0.5000000 0.5000000 0.5000000 0.5000000 0.2500000 0.5000000 0.3333333 0.5000000 0.1666667 1.0000000 
        N         P         Q         R         S      stop         T         V         W         Y 
0.5000000 0.2500000 0.5000000 0.1666667 0.1666667 0.3333333 0.2500000 0.2500000 1.0000000 0.5000000

# Give every codon the share it has among the codons of its amino acid.
# lapply() keeps the names of `codon`, so the result is a list keyed by triplet.
codonfracs <- lapply(codon, function(aa) unname(fracs[aa]))
str(head(codonfracs))

List of 6
 $ ATA: num 0.333
 $ ATC: num 0.333
 $ ATT: num 0.333
 $ ATG: num 1
 $ ACA: num 0.25
 $ ACC: num 0.25

# Example sequence. The line breaks of the pasted text are part of the string
# value (spelled out here as "\n"), including the one after the last base.
s <- paste0(
  "AAGGCCTGCGCAAATATTTCCACTCCTTCCCGGGTGCTCCTGAGTTGAACCCGC\n",
  "TTAGAGACTCCGAAATCAACGACGACTTCCACCAGTGGGCCCAGTGACCGCCACACTGGA\n",
  "CCCCATACCACTTCTTTTTGTTATTCTTAAATATGTT\n"
)

# Split a string into consecutive chunks of `k` characters.
#
# s: a single character string.
# k: chunk width; the default of 3 gives one codon per chunk.
#
# Returns a character vector with the chunks in reading order. The last chunk
# is shorter when nchar(s) is not a multiple of k, and an empty string gives
# character(0). The string is split as-is, so any newline or other whitespace
# characters contained in `s` end up inside the chunks.
strsplit3 <- function(s, k=3) {
  n_chars <- nchar(s)
  # seq.int(1, 0, by = k) stops with "wrong sign in 'by' argument", so the
  # empty string is answered directly instead of crashing.
  if (n_chars == 0L) {
    return(character(0))
  }
  starts <- seq.int(1, n_chars, by = k)
  stops <- pmin(starts + k - 1, n_chars)
  # substring() is vectorised over start/stop, so no mapply() is needed.
  substring(s, starts, stops)
}
# Split the example sequence `s` into chunks of three characters (default k).
strsplit3(s)

[1] "AAG"  "GCC"  "TGC"  "GCA"  "AAT"  "ATT"  "TCC"  "ACT"  "CCT"  "TCC"  "CGG"  "GTG"  "CTC"  "CTG"  "AGT"  "TGA" 
[17] "ACC"  "CGC"  "\nTT" "AGA"  "GAC"  "TCC"  "GAA"  "ATC"  "AAC"  "GAC"  "GAC"  "TTC"  "CAC"  "CAG"  "TGG"  "GCC" 
[33] "CAG"  "TGA"  "CCG"  "CCA"  "CAC"  "TGG"  "A\nC" "CCC"  "ATA"  "CCA"  "CTT"  "CTT"  "TTT"  "GTT"  "ATT"  "CTT" 
[49] "AAA"  "TAT"  "GTT"  "\n" 

我已经把输入序列按每 3 个字符一组进行了拆分。请指导我如何统计输入序列中每个密码子的个数及其出现的百分比。我想要的输出是一张表格,包含四列:密码子、它编码的氨基酸、密码子的数量,以及该密码子在编码同一种氨基酸的密码子中所占的百分比。

1 个答案:

答案 0 :(得分:0)

更新后的答案:评论中的示例非常有用。下面的代码似乎复制了那些计算。

由于您的问题使用的是基础 R,我的答案也保持使用基础 R。但是,如果您打算继续用 R 进行这类分析,应该考虑使用 R 包 tidyverse 来进行数据操作和制表。它的语法和清晰度明显优于基础 R。

tidyverse

这给出了结果

#
# Codon table as a named character vector (triplet -> amino acid one-letter
# code, "stop" for termination codons). Ideally this would be created as a
# data.frame right away; here the vector is defined first and then turned
# into a two-column lookup table (the triplets also become its row names).
#
  Amino_Acid <- c(
    ATA = "I", ATC = "I", ATT = "I", ATG = "M",
    ACA = "T", ACC = "T", ACG = "T", ACT = "T",
    AAC = "N", AAT = "N", AAA = "K", AAG = "K",
    AGC = "S", AGT = "S", AGA = "R", AGG = "R",
    CTA = "L", CTC = "L", CTG = "L", CTT = "L",
    CCA = "P", CCC = "P", CCG = "P", CCT = "P",
    CAC = "H", CAT = "H", CAA = "Q", CAG = "Q",
    CGA = "R", CGC = "R", CGG = "R", CGT = "R",
    GTA = "V", GTC = "V", GTG = "V", GTT = "V",
    GCA = "A", GCC = "A", GCG = "A", GCT = "A",
    GAC = "D", GAT = "D", GAA = "E", GAG = "E",
    GGA = "G", GGC = "G", GGG = "G", GGT = "G",
    TCA = "S", TCC = "S", TCG = "S", TCT = "S",
    TTC = "F", TTT = "F", TTA = "L", TTG = "L",
    TAC = "Y", TAT = "Y", TAA = "stop", TAG = "stop",
    TGC = "C", TGT = "C", TGA = "stop", TGG = "W"
  )

  amino_acid <- data.frame(
    Amino_Acid = Amino_Acid,
    Triplet = names(Amino_Acid),
    stringsAsFactors = FALSE
  )
#
#  Assemble the sequence from several short single-line pieces so that no
#  end-of-line characters get into it and no source line becomes very long.
#
  s <- paste(
    c(
      "AAGGCCTGCGCAAATATTTCCACTCCTTCCCGGGTGCTCCTGAGTTGAACCCGCTTAGAGACTCCG",
      "AAATCAACGACGACTTCCACCAGTGGGCCCAGTGACCGCCACACTGGACCCCATACCACTTCTTTT",
      "TGTTATTCTTAAATATGTT"
    ),
    collapse = ""
  )
#
# Split a sequence into consecutive three-character chunks (codons).
#
# string: a single character string holding the sequence.
#
# Returns a character vector with one element per chunk, in reading order
# starting at the first character. If nchar(string) is not a multiple of 3,
# the last element holds the 1-2 leftover characters. An empty string gives
# character(0).
#
  strsplit3 <- function(string) {
    n_chars <- nchar(string)
    # seq(1, 0, 3) stops with "wrong sign in 'by' argument", so the empty
    # sequence is answered directly instead of crashing.
    if (n_chars == 0L) {
      return(character(0))
    }
    starts <- seq(1, n_chars, by = 3)
    # substring() is vectorised over the start/stop positions; it replaces the
    # sapply() loop and always returns a character vector.
    substring(string, starts, starts + 2)
  }
#
# Tabulate one column of a data.frame and return the counts as a data.frame.
#
# data:       data.frame holding the column to count.
# column:     name of that column (string); also used as the name of the
#             value column in the result.
# count_name: name (string) to give the count column in the result.
#
  table_df <- function(data, column, count_name) {
    values <- data[, column]
    counts <- table(values, dnn = column)
    as.data.frame(counts, responseName = count_name)
  }
#
# identify triplets and amino acid in sequence
#
  codon_seq <- data.frame(Triplet = strsplit3(s), stringsAsFactors = FALSE)
  # A sequence whose length is not a multiple of 3 ends in a 1-2 base fragment.
  # That fragment is not a codon, so drop it before taking the total that is
  # used as the denominator of Freq_1k.
  codon_seq <- codon_seq[nchar(codon_seq$Triplet) == 3, , drop = FALSE]
  num_triplets <- nrow(codon_seq)
  # Inner join: one row per codon occurrence, with its amino acid attached.
  codon_seq <- merge(codon_seq, amino_acid, by = "Triplet")
#
# count triplets and amino acids
#
  # AA_cnt = occurrences of the amino acid, Number = occurrences of the triplet.
  # Both are counted on the per-occurrence rows, i.e. before de-duplication.
  codon_seq <- merge(
    codon_seq,
    table_df(codon_seq, column = "Amino_Acid", count_name = "AA_cnt"),
    by = "Amino_Acid"
  )
  codon_seq <- merge(
    codon_seq,
    table_df(codon_seq, column = "Triplet", count_name = "Number"),
    by = "Triplet"
  )
#
# remove duplicate rows and calc frequencies and fractions
#
  codon_seq <- unique(codon_seq)
  # Freq_1k: occurrences of the triplet per 1000 codons of the sequence.
  codon_seq$Freq_1k <- round(1000 * codon_seq$Number / num_triplets, 1)
  # Fraction: share of the triplet among all codons of the same amino acid.
  codon_seq$Fraction <- round(codon_seq$Number / codon_seq$AA_cnt, 2)
#
# Arrange columns in same order as example
#
  codon_seq <- with(codon_seq, data.frame(Triplet, Amino_Acid, Fraction, Freq_1k, Number))
相关问题