Вы можете использовать OneHotEncoder из scikit-learn:
import numpy as np
from sklearn.preprocessing import OneHotEncoder
# DNA fragment to encode; adjacent string literals are joined into one string.
sequence = (
    'TCTGAGTCCCAATACACAAGAGGTTCCCTCACCTGTTCTGGTGTCAGACCCTCCCAGATGATCACCTCTCCTATGGCG'
    'GGGAAGGTGCCTGGATGTCTAAAGCCTGAAATGGGGATCTATCCCAGAAGCTGTGTAGCTTCTGCCTGTCCCAGAAGC'
    'TGTGTTGTTTCTGTATTCAGCTTGCTCACCCTCCGCAGTCCATTGATCTGCACAGACTGTTCTCAGATGGACTCGTGA'
    'GACAAGATGGCTCCTTCACCTGCTCTGGGGATCAGAACCCTCCCAGGTGGCCACCTCTCCTGTGGTGGGGAAGGTACC'
    'TGGAAGTCTTCAGCCCAAAACAGGGCCTGTCCCAGAAGCTGTGTCTCTTCTGCCTATCCCAGAAGCTGTATTGCTTCT'
    'GCTGTCCACTTGCTCACCCTCTGCAGTCTGCATGCTGATCTGCGCAGACTGTTCTCAGAGGGATCTGGCAGACAAGTT'
    'GGCTCCCTCACCTGCTCTGGGGCGGGGGGGGGGGGTTCAGAGCCCTCCTGGGCAGCCACCTCTCCTCTAGCAGAGAAG'
    'GTGCTGGGATGTCTTGAGCAGGAAACGGGGTATGTCCCAGAAGCTGTCTTGCTTCTGCAATCCACATGCTCAGCCTCT'
    'GCAGTCTGTGAGCTAATCTGGGCAGTCTGGTCTCAGGGGACTCTGGAGACAAGATGGCTCCCTCACCTGCTCTGGGGG'
    'TCAAAGCCCTCCTTGGCAGCCACCTTTTTCAGGCGGAGAAGGTGCCCGGATGTCTGGAGCCTGAAACAGGGGTATGTC'
    'CCAGACACTGTGTAGCTTCTGCCTGCCCCAGAAGATGTGTCACTTCCTCAGTCTGCTTGTTCACCCTCCACAGTCTGC'
    'AAGCTGATCTGCACAGACTGGTCTCAGAGGGACCTAGAAGACAAGATCAAGAAAAGTCTTATAGGTATAATGAATCAA'
    'GCAGAAAATGAAACATCAGAAGCTTAAGATAAAATACAGGATCTAGTCCAAATTAGCAAGAAGTA'
)

# Encoder with default settings; its categories are determined when it is fitted.
encoder = OneHotEncoder()

# Split the string into single characters and add a trailing axis, giving an
# (N, 1) column: one character per row, a single feature column.
seq_arr = np.array(list(sequence))[:, np.newaxis]

# Fit and encode in one step, then convert the result to a dense array
# via .toarray().
seq_1hot = (
    encoder
    .fit_transform(seq_arr)
    .toarray()
)
seq_1hot
# returns:
array([[0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       ...,
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.]])
Вы можете увидеть, какие буквы соответствуют какому столбцу, посмотрев на:
# categories_ lists the labels the fitted encoder found; their order is the
# column order of the one-hot output.
encoder.categories_
# returns:
[array(['A', 'C', 'G', 'T'], dtype='<U1')]
Таким образом, в данном случае столбцы расположены в алфавитном порядке: A, C, G, T.