таблица кодонов, содержащая ВСЕ 64 кодона, даже невырожденные (они составляют группы из одного элемента)
подсчёт числа вхождений группы каждого кодона одновременно с подсчётом самих кодонов, по мере их появления в ходе итерации
таблица кодонов, содержащая названия кодируемых аминокислот -> удобный вывод результатов
код:
from collections import defaultdict
# Leading nucleotides of GenBank entry AAHX01097212.1.
# NOTE: the literal below holds 360 bases (6 lines of 60), i.e. 120 codons.
adn = ("tcccccgcagcttcgggaacgtgcgggctcgggagggaggggcctggcgccgggcgcgcg"
       "cctgcgccccaccccgccccaccctggcgggtctcgcgcgcccggcccgcctcctgtcaa"
       "ccccagcgcggcggtcaggtggtccccagcccttggccccagcctccagcttcctggtcc"
       "ctcgggctctgagtcctgtctccggcagatcgcctttctgattgttctcctgcgcagctg"
       "gaggtgtatagcccctagccgagctatggtgcctcagcagatgtgaggaggtagtgggtc"
       "aggataaacccgcgcactccataataacgtgccagggctcagtgacttgggtctgcatta")

# Switch to RNA notation: upper-case the sequence, then replace T with U.
arn = adn.upper().replace('T', 'U')

# RNA codon table from http://en.wikipedia.org/wiki/Genetic_code
# Each entry is (tuple of synonymous codons, amino-acid name). The 21 groups
# together list all 64 codons; a non-degenerate codon forms a group of one.
codon_table = (
    (('GCU', 'GCC', 'GCA', 'GCG'), 'Alanine'),
    (('UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG'), 'Leucine'),
    (('CGU', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'), 'Arginine'),
    (('AAA', 'AAG'), 'Lysine'),
    (('AAU', 'AAC'), 'Asparagine'),
    (('AUG',), 'Methionine'),
    (('GAU', 'GAC'), 'Aspartic acid'),
    (('UUU', 'UUC'), 'Phenylalanine'),
    (('UGU', 'UGC'), 'Cysteine'),
    (('CCU', 'CCC', 'CCA', 'CCG'), 'Proline'),
    (('CAA', 'CAG'), 'Glutamine'),
    (('UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC'), 'Serine'),
    (('GAA', 'GAG'), 'Glutamic acid'),
    (('ACU', 'ACC', 'ACA', 'ACG'), 'Threonine'),
    (('GGU', 'GGC', 'GGA', 'GGG'), 'Glycine'),
    (('UGG',), 'Tryptophane'),
    (('CAU', 'CAC'), 'Histidine'),
    (('UAU', 'UAC'), 'Tyrosine'),
    (('AUU', 'AUC', 'AUA'), 'Isoleucine'),
    (('GUU', 'GUC', 'GUA', 'GUG'), 'Valine'),
    (('UAA', 'UGA', 'UAG'), 'STOP'),
)

# Map every codon to the group (tuple of synonymous codons) it belongs to.
siblings = dict((cod, codgroup)
                for codgroup, aa in codon_table
                for cod in codgroup)

# cod_count: occurrences of each codon; grp_count: occurrences of each group;
# freq: share of each codon inside its own group.
cod_count, grp_count, freq = defaultdict(int), defaultdict(int), {}

# Read the sequence in frame 0, three bases at a time. The upper bound drops a
# trailing partial codon, which would otherwise raise KeyError in `siblings`.
# `range` is used instead of `xrange` so the script runs on Python 2 and 3.
for cod in (arn[i:i + 3] for i in range(0, len(arn) - len(arn) % 3, 3)):
    cod_count[cod] += 1
    grp_count[siblings[cod]] += 1

# Iterating the dict directly yields its keys, i.e. the 64 codons.
for cod in siblings:
    if siblings[cod] in grp_count:
        freq[cod] = float(cod_count[cod]) / grp_count[siblings[cod]]
    else:
        # No codon of this group occurs in the sequence: no ratio to compute.
        freq[cod] = '-* Missing *-'

# One line per codon. The first codon of a group shares its line with the
# amino-acid name (13 columns + 5), the following ones are padded to the same
# column (18) so that all codons line up.
display = '\n'.join(
    aa.rjust(13)
    + '\n'.join('%s %-16s' % (cod.rjust(18 if i else 5), freq[cod])
                for i, cod in enumerate(codgrp))
    for codgrp, aa in codon_table)
# editing addition:
def outputResults(filename, arn, codon_table, displ):
    """Write the sequence, the codon table and the frequency report to a file.

    The file is made of five sections separated by one blank line: the file
    name, the sequence wrapped at 42 characters per line, the sequence
    length, the codon table (one "group : name" line per entry) and the
    frequency report.

    Parameters:
        filename: path of the text file to create or overwrite.
        arn: the RNA sequence, as a string.
        codon_table: iterable of (codon_group, amino_acid_name) pairs.
        displ: already formatted frequency report, written verbatim.

    Returns:
        None.
    """
    li = ['This file is named %s' % filename]
    # `range` rather than `xrange` keeps the function usable on Python 2 and 3.
    li.append('The sequence of ARN:\n%s' %
              '\n'.join(arn[i:i + 42] for i in range(0, len(arn), 42)))
    li.append('Size of the sequence : ' + str(len(arn)))
    li.append('Codon_table:\n' +
              '\n'.join('%s : %s' % (u, v) for u, v in codon_table))
    li.append('Frequency results :\n' + displ)
    with open(filename, 'w') as f:
        # A single write(): writelines() given a str iterates it one
        # character at a time.
        f.write('\n\n'.join(li))
# Save the sequence, the codon table and the report to ARN_mem.txt, then show
# the report on standard output.
outputResults('ARN_mem.txt', arn, codon_table, display)
# Parenthesised form: same output with the Python 2 print statement and the
# Python 3 print function.
print(display)
.
ПРАВКА
Я добавил функцию outputResults(), чтобы показать способ записи данных и результатов в файл.
Полученное содержимое файла:
This file is named ARN_mem.txt
The sequence of ARN:
UCCCCCGCAGCUUCGGGAACGUGCGGGCUCGGGAGGGAGGGG
CCUGGCGCCGGGCGCGCGCCUGCGCCCCACCCCGCCCCACCC
UGGCGGGUCUCGCGCGCCCGGCCCGCCUCCUGUCAACCCCAG
CGCGGCGGUCAGGUGGUCCCCAGCCCUUGGCCCCAGCCUCCA
GCUUCCUGGUCCCUCGGGCUCUGAGUCCUGUCUCCGGCAGAU
CGCCUUUCUGAUUGUUCUCCUGCGCAGCUGGAGGUGUAUAGC
CCCUAGCCGAGCUAUGGUGCCUCAGCAGAUGUGAGGAGGUAG
UGGGUCAGGAUAAACCCGCGCACUCCAUAAUAACGUGCCAGG
GCUCAGUGACUUGGGUCUGCAUUA
Size of the sequence : 360
Codon_table:
('GCU', 'GCC', 'GCA', 'GCG') : Alanine
('UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG') : Leucine
('CGU', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG') : Arginine
('AAA', 'AAG') : Lysine
('AAU', 'AAC') : Asparagine
('AUG',) : Methionine
('GAU', 'GAC') : Aspartic acid
('UUU', 'UUC') : Phenylalanine
('UGU', 'UGC') : Cysteine
('CCU', 'CCC', 'CCA', 'CCG') : Proline
('CAA', 'CAG') : Glutamine
('UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC') : Serine
('GAA', 'GAG') : Glutamic acid
('ACU', 'ACC', 'ACA', 'ACG') : Threonine
('GGU', 'GGC', 'GGA', 'GGG') : Glycine
('UGG',) : Tryptophane
('CAU', 'CAC') : Histidine
('UAU', 'UAC') : Tyrosine
('AUU', 'AUC', 'AUA') : Isoleucine
('GUU', 'GUC', 'GUA', 'GUG') : Valine
('UAA', 'UGA', 'UAG') : STOP
Frequency results :
Alanine GCU 0.1875
GCC 0.375
GCA 0.25
GCG 0.1875
Leucine UUA 0.125
UUG 0.0
CUU 0.25
CUC 0.375
etc.............