Это будет работать.Сначала мы хотим разобрать ваш файл fasta в заголовки и последовательности, затем сгладить список списков последовательностей в список строк, затем подсчитать количество каждого нуклеотида в строке, а затем вывести:
import sys
fasta = sys.argv[1]
def fastaParser(infile):
seqs = []
headers = []
with open(infile) as f:
sequence = ""
header = None
for line in f:
if line.startswith('>'):
headers.append(line[1:-1])
if header:
seqs.append([sequence])
sequence = ""
header = line[1:]
else:
sequence += line.rstrip()
seqs.append([sequence])
return headers, seqs
headers, seqs = fastaParser(fasta)
flat_seqs = [item for sublist in seqs for item in sublist]
def countNucs(instring):
# will count upper and lower case sequences, if do not want lower case remove .upper()
g = instring.upper().count('G')
c = instring.upper().count('C')
a = instring.upper().count('A')
t = instring.upper().count('T')
return 'G = {}, C = {}, A = {}, T = {}'.format(g, c, a, t)
for header, seq in zip(headers, flat_seqs):
print(header, countNucs(seq))
используя ваш пример fasta для запуска в командной строке:
[me]$ python3.5 fasta.py infasta.fa
output:
chr12_9180206_+:chr12_118582391_+:a1;2 total_counts: 115 Seed: 4 K: 20 length: 79 G = 24, C = 17, A = 14, T = 24
chr12_9180206_+:chr12_118582391_+:a2;2 total_counts: 135 Seed: 4 K: 20 length: 80 G = 16, C = 30, A = 17, T = 17
chr1_8969882_-:chr1_568670_-:a1;113 total_counts: 7600 Seed: 225 K: 20 length: 86 G = 12, C = 31, A = 27, T = 16
chr1_8969882_-:chr1_568670_-:a2;69 total_counts: 6987 Seed: 197 K: 20 length: 120 G = 20, C = 41, A = 31, T = 28