Обновленный ответ: Пример в вашем комментарии был очень полезным.Код ниже, похоже, повторяет эти вычисления.
Поскольку ваш ОП был в базе R, я сохранил свой ответ в базе R. Однако, если вы собираетесь продолжать использовать R для такого рода анализов, вам следует рассмотреть возможность использования пакета R tidyverse
для манипулирования данными и табулирования.Его синтаксис и ясность значительно лучше, чем у базового R.
#
# This should be created as a data.frame but we'll work with the data as given and
# first define Amino_Acid as a vector and then convert to data.frame
#
Amino_Acid <- c(ATA = "I", ATC = "I", ATT = "I", ATG = "M", ACA = "T",
ACC = "T", ACG = "T", ACT = "T", AAC = "N", AAT = "N", AAA = "K",
AAG = "K", AGC = "S", AGT = "S", AGA = "R", AGG = "R", CTA = "L",
CTC = "L", CTG = "L", CTT = "L", CCA = "P", CCC = "P", CCG = "P",
CCT = "P", CAC = "H", CAT = "H", CAA = "Q", CAG = "Q", CGA = "R",
CGC = "R", CGG = "R", CGT = "R", GTA = "V", GTC = "V", GTG = "V",
GTT = "V", GCA = "A", GCC = "A", GCG = "A", GCT = "A", GAC = "D",
GAT = "D", GAA = "E", GAG = "E", GGA = "G", GGC = "G", GGG = "G",
GGT = "G", TCA = "S", TCC = "S", TCG = "S", TCT = "S", TTC = "F",
TTT = "F", TTA = "L", TTG = "L", TAC = "Y", TAT = "Y", TAA = "stop",
TAG = "stop", TGC = "C", TGT = "C", TGA = "stop", TGG = "W")
amino_acid <- data.frame(Amino_Acid=Amino_Acid, Triplet=names(Amino_Acid), stringsAsFactors = FALSE)
#
# to avoid end-of-line chararacters in short lines or having a very long line,
# use paste to combine several one line strings into one long string
#
s <- paste0("AAGGCCTGCGCAAATATTTCCACTCCTTCCCGGGTGCTCCTGAGTTGAACCCGCTTAGAGACTCCG",
"AAATCAACGACGACTTCCACCAGTGGGCCCAGTGACCGCCACACTGGACCCCATACCACTTCTTTT",
"TGTTATTCTTAAATATGTT")
#
# function to split sequence into codons
#
strsplit3 <- function(string) {sapply(X=seq(1,nchar(string), 3),
FUN=function(starts=X, y=string) substr(y, starts, starts+2) ) }
#
# function to return counts in data.frame with given names
#
table_df <- function(data, column, count_name) {
tab <- table(data[, column], dnn = column)
as.data.frame.table(tab, responseName = count_name)
}
#
# identify triplets and amino acid in sequence
#
codon_seq <- data.frame(Triplet=strsplit3(s))
num_triplets <- nrow(codon_seq)
codon_seq <- merge(codon_seq, amino_acid, by = "Triplet")
#
# count triplets and amino acids
#
codon_seq <- merge(codon_seq, table_df(codon_seq, column = "Amino_Acid", count_name = "AA_cnt"))
codon_seq <- merge(codon_seq, table_df(codon_seq, column = "Triplet", count_name = "Number") )
#
# remove duplicate rows and calc frequencies and fractions
#
codon_seq <- unique(codon_seq)
codon_seq$Freq_1k <- round(1000*codon_seq$Number/num_triplets, 1)
codon_seq$Fraction <- round(codon_seq$Number/codon_seq$AA_cnt, 2)
#
# Arrange columns in same order as example
#
codon_seq <- with(codon_seq, data.frame(Triplet,Amino_Acid, Fraction, Freq_1k, Number))
Это дает результат
codon_seq
Triplet Amino_Acid Fraction Freq_1k Number
1 AAA K 0.50 19.6 1
2 AAG K 0.50 19.6 1
3 AAT N 1.00 19.6 1
4 ACC T 0.30 58.8 3
5 ACG T 0.20 39.2 2
6 ACT T 0.50 98.0 5
7 AGT S 0.33 58.8 3
8 ATA I 0.50 19.6 1
9 ATT I 0.50 19.6 1
10 CAC H 0.50 19.6 1
11 CAT H 0.50 19.6 1
…