Это все еще не 100%, но я вернул выходные данные на 3365/3510 моего набора данных. Те немногие, которые я проверил, выстроились в ряд:)
В моем github (ссылка ниже) есть входы и выходы в csv, txt (соответственно).
Пожалуйста, игнорируйте глобальные переменные; Я собирался переключить код, чтобы увидеть, было ли заметное улучшение со скоростью, но не обойти его.
В настоящее время эта версия имеет проблемы с порядком операций относительно чередования и операторы начала / конца строки (^ $), если они являются вариантами чередования в начале или конце строки. Я уверен, что это связано с прецедентом; но я не смог организовать его достаточно хорошо.
Вызов функции для кода находится в последней ячейке. Вместо запуска всего DataFrame с
for x in range(len(df)):
try:
df_expression = df.iloc[x, 2]
df_subsequence = df.iloc[x, 1]
# call function
identify_submatches(df_expression, df_subsequence)
print(dataframe_counting)
dataframe_counting += 1
except:
pass
Вы можете легко протестировать по одному, передавая шаблон и соответствующую последовательность функции следующим образом:
p = ''
s = ''
identify_submatches(p, s)
Код: https://github.com/jameshollisandrew/just_for_fun/blob/master/motif_matching/motif_matching_02.ipynb
Входы: https://github.com/jameshollisandrew/just_for_fun/blob/master/motif_matching/elm_compiled_ss_re.csv
Выходы: https://github.com/jameshollisandrew/just_for_fun/blob/master/motif_matching/motif_matching_02_outputs.txt
"""exp_a as input expression
sub_a as input subject string"""
input_exp = exp_a
input_sub = sub_a
m_gro = '\^*\((?:[^()]+|(?R))*+\)({.+?})*\$*'
m_set = '\^*\[.+?\]({.+?})*\$*'
m_alt = '\|'
m_lit = '\^*[.\w]({.+?})*\$*|\$'
# PRINTOUT
if (print_type == 1):
print('\nExpression Input: {}\nSequence Input: {}'.format(exp_a, sub_a))
if (print_type == 3):
print('\n\nSTART ITERATION\nINPUTS\n exp: {}\n seq: {}'.format(exp_a, sub_a))
# return the pattern match (USE IF SUB IS NOT MATCHED PRIMARY)
if r.search(exp_a, sub_a) is not None:
m = r.search(exp_a, sub_a)
sub_a = m.group()
# >>>PRINTOUT<<<
if print_type == 3:
print('\nSEQUENCE TYPE M\n exp: {}\n seq: {}'.format(exp_a, sub_a))
elif m is None:
print('Search expression: {} in sequence: {} returned no matches.\n\n'.format(exp_a, sub_a))
return None
if (print_type == 1):
print('Subequence Match: {}'.format(sub_a))
# check if main expression has unnested alternation
if len(alt_states(exp_a)) > 0:
# returns matching alternative
exp_a = alt_evaluation(exp_a, sub_a)
# >>>PRINTOUT<<<
if print_type == 3:
print('\nALTERNATION RETURN\n exp: {}\n seq: {}'.format(exp_a, sub_a))
# get initial expression list
exp_list = get_states(exp_a)
# count possible expression constructions
status, matched_tuples = finite_state(exp_list, sub_a)
# >>>PRINTOUT<<<
if print_type == 3:
print('\nCONFIRM EXPRESSION\n exp: {}'.format(matched_tuples))
# index matches
indexer(input_exp, input_sub, matched_tuples)
def indexer(exp_a, sub_a, matched_tuples):
sub_length = len(sub_a)
sub_b = r.search(exp_a, sub_a)
adj = sub_b.start()
sub_b = sub_b.group()
print('')
for pair in matched_tuples:
pattern, match = pair
start = adj
adj = adj + len(match)
end = adj
index_pos = (start, end)
sub_b = slice_string(match, sub_b)
print('\t{}\t{}'.format(pattern, index_pos))
def strip_nest(s):
s = s[1:]
s = s[:-1]
return s
def slice_string(p, s):
pat = p
string = s
# handles escapes
p = r.escape(p)
# slice the input string on input pattern
s = r.split(pattern = p, string = s, maxsplit = 1)[1]
# >>>PRINTOUT<<<
if print_type == 4:
print('\nSLICE STRING\n pat: {}\n str: {}\n slice: {}'.format(pat, string, s))
return s
def alt_states(exp):
# check each character in string
idx = 0 # index tracker
op = 0 # open parenth
cp = 0 # close parenth
free_alt = [] # amend with index position of unnested alt
for c in exp:
if c == '(':
op += 1
elif c == ')':
cp += 1
elif c == '|':
if op == cp:
free_alt.append(idx)
if idx < len(exp)-1:
idx+=1
# split string if found
alts = []
if free_alt:
_ = 0
for i in free_alt:
alts.append(exp[_:i])
alts.append(exp[i+1:])
# the truth value of this check can be checked against the length of the return
# len(free_alt) > 0 means unnested "|" found
return alts
def alt_evaluation(exp, sub):
# >>>PRINTOUT<<<
if print_type == 3:
print('\nALTERNATION SELECTION\n EXP: {}\n SEQ: {}'.format(exp, sub))
# gets alt index position
alts = alt_states(exp)
# variables for eval
a_len = 0 # length of alternate match
keep_len = 0 # length of return match
keep = '' # return match string
# evaluate alternatives
for alt in alts:
m = r.search(alt, sub)
if m is not None:
a_len = len(m.group()) # length of match string
# >>>PRINTOUT<<<
if print_type == 3:
print(' pat: {}\n str: {}\n len: {}'.format(alt, m.group(0), len(m.group(0))))
if a_len >= keep_len:
keep_len = a_len # sets alternate length to keep length
exp = alt # sets alt as keep variable
# >>>PRINTOUT<<<
if print_type == 3:
print(' OUT: {}'.format(exp))
return exp
def get_states(exp):
"""counts number of subexpressions to be checked
creates FSM"""
# >>>PRINTOUT<<<
if print_type == 3:
print('\nGET STATES\n EXP: {}'.format(exp))
# List of possible subexpression regex matches
m_gro = '\^*\((?:[^()]+|(?R))*+\)({.+?})*\$*'
m_set = '\^*\[.+?\]({.+?})*\$*'
m_alt = '\|'
m_lit = '\^*[.\w]({.+?})*\$*|\$'
# initialize capture list
exp_list = []
# loop through first level of subexpressions:
while exp != '':
if r.match(m_gro, exp):
_ = r.match(m_gro, exp).group(0)
exp_list.append(_)
exp = slice_string(_, exp)
elif r.match(m_set, exp):
_ = r.match(m_set, exp).group(0)
exp_list.append(_)
exp = slice_string(_, exp)
elif r.match(m_alt, exp):
_ = ''
elif r.match(m_lit, exp):
_ = r.match(m_lit, exp).group(0)
exp_list.append(_)
exp = slice_string(_, exp)
else:
print('ERROR getting states')
break
n_states = len(exp_list)
# >>>PRINTOUT<<<
if print_type == 3:
print('GET STATES OUT\n states:\n {}\n # of states: {}'.format(exp_list, n_states))
return exp_list
def finite_state(exp_list, seq, level = 0, pattern_builder = '', iter_count = 0, pat_match = [], seq_match = []):
# >>>PRINTOUT<<<
if (print_type == 3):
print('\nSTARTING MACHINE\n EXP: {}\n SEQ: {}\n LEVEL: {}\n matched: {}\n pat_match: {}'.format(exp_list, seq, level, pattern_builder, pat_match))
# patterns
m_gro = '\^*\((?:[^()]+|(?R))*+\)({.+?})*\$*'
m_set = '\^*\[.+?\]({.+?})*\$*'
m_alt = '\|'
m_squ = '\{(.),(.)\}'
m_lit = '\^*[.\w]({.+?})*\$*|\$'
# set state, n_state
state = 0
n_states = len(exp_list)
#save_state = []
#save_expression = []
# temp exp
local_seq = seq
# >>>PRINTOUT<<<
if print_type == 3:
print('\n >>>MACHINE START')
# set failure cap so no endless loop
failure_cap = 1000
# since len(exp_list) returns + 1 over iteration (0 index) use the last 'state' as success state
while state != n_states:
for exp in exp_list:
# iterations
iter_count+=1
# >>>PRINTOUT<<<
if print_type == 3:
print(' iteration count: {}'.format(iter_count))
# >>>PRINTOUT<<<
if print_type == 3:
print('\n evaluating: {}\n for string: {}'.format(exp, local_seq))
# alternation reset
if len(alt_states(exp)) > 0:
# get operand options
operands = alt_states(exp)
# create temporary exp list
temp_list = exp_list[state+1:]
# add level
level = level + 1
# >>>PRINTOUT<<<
if print_type == 3:
print(' ALT MATCH: {}\n state: {}\n opers returned: {}\n level in: {}'.format(exp, state, operands, level))
# compile local altneration
for oper in operands:
# get substates
_ = get_states(oper)
# compile list
oper_list = _ + temp_list
# send to finite_state, sublevel
alt_status, pats = finite_state(oper_list, local_seq, level = level, pattern_builder=pattern_builder, iter_count=iter_count, pat_match=pat_match)
if alt_status == 'success':
return alt_status, pats
# group cycle
elif r.match(m_gro, exp) is not None:
# get operand options
operands = group_states(exp)
# create temporary exp list
temp_list = exp_list[state+1:]
# add level
level = level + 1
# >>>PRINTOUT<<<
if print_type == 3:
print(' GROUP MATCH: {}\n state: {}\n opers returned: {}\n level in: {}'.format(exp, state, operands, level))
# compile local
oper_list = operands + temp_list
# send to finite_state, sublevel
group_status, pats = finite_state(oper_list, local_seq, level=level, pattern_builder=pattern_builder, iter_count=iter_count, pat_match=pat_match)
if group_status == 'success':
return group_status, pats
# quantifier reset
elif r.search(m_squ, exp) is not None:
# get operand options
operands = quant_states(exp)
# create temporary exp list
temp_list = exp_list[state+1:]
# add level
level = level + 1
# >>>PRINTOUT<<<
if print_type == 3:
print(' QUANT MATCH: {}\n state: {}\n opers returned: {}\n level in: {}'.format(exp, state, operands, level))
# compile local
for oper in reversed(operands):
# compile list
oper_list = [oper] + temp_list
# send to finite_state, sublevel
quant_status, pats = finite_state(oper_list, local_seq, level=level, pattern_builder=pattern_builder, iter_count=iter_count, pat_match=pat_match)
if quant_status == 'success':
return quant_status, pats
# record literal
elif r.match(exp, local_seq) is not None:
# add to local pattern
m = r.match(exp, local_seq).group(0)
local_seq = slice_string(m, local_seq)
# >>>PRINTOUT<<<
if print_type == 3:
print(' state transition: {}\n state {} ==> {} of {}'.format(exp, state, state+1, n_states))
# iterate state for match
pattern_builder = pattern_builder + exp
pat_match = pat_match + [(exp, m)]
state += 1
elif r.match(exp, local_seq) is None:
# >>>PRINTOUT<<<
if print_type == 3:
print(' Return FAIL on {}, level: {}, state: {}'.format(exp, level, state))
status = 'fail'
return status, pattern_builder
# machine success
if state == n_states:
# >>>PRINTOUT<<<
if print_type == 3:
print(' MACHINE SUCCESS\n level: {}\n state: {}\n exp: {}'.format(level, state, pattern_builder))
status = 'success'
return status, pat_match
# timeout
if iter_count == failure_cap:
state = n_states
# >>>PRINTOUT<<<
if print_type == 3:
print('===============\nFAILURE CAP MET\n level: {}\n exp state: {}\n==============='.format(level, state))
break
def group_states(exp):
# patterns
m_gro = '\^*\((?:[^()]+|(?R))*+\)({.+?})*\$*'
m_set = '\^*\[.+?\]({.+?})*\$*'
m_alt = '\|'
m_squ = '\{(.),(.)\}'
m_lit = '\^*[.\w]({.+?})*\$*'
ret_list = []
# iterate over groups
groups = r.finditer(m_gro, exp)
for gr in groups:
_ = strip_nest(gr.group())
# alternation reset
if r.search(m_alt, _):
ret_list.append(_)
else:
_ = get_states(_)
for thing in _:
ret_list.append(thing)
return(ret_list)
def quant_states(exp):
# >>>PRINTOUT<<<
if print_type == 4:
print('\nGET QUANT STATES\n EXP: {}'.format(exp))
squ_opr = '(.+)\{.,.\}'
m_squ = '\{(.),(.)\}'
# create states
states_list = []
# get operand
operand_obj = r.finditer(squ_opr, exp)
for match in operand_obj:
operand = match.group(1)
# get repetitions
fa = r.findall(m_squ, exp)
for m, n in fa:
# loop through range
for x in range(int(m), (int(n)+1)):
# construct string
_ = operand + '{' + str(x) + '}'
# append to list
states_list.append(_)
# >>>PRINTOUT<<<
if print_type == 4:
print(' QUANT OUT: {}\n'.format(states_list))
return states_list
%%time
print_type = 1
"""0:
1: includes input
2:
3: all output prints on """
dataframe_counting = 0
for x in range(len(df)):
try:
df_expression = df.iloc[x, 2]
df_subsequence = df.iloc[x, 1]
# call function
identify_submatches(df_expression, df_subsequence)
print(dataframe_counting)
dataframe_counting += 1
except:
pass
Примеры выходного возврата
Выходные значения (т.е. подвыражение и набор индексов) с разделителями табуляции .
Expression Input: [KR]{1,4}[KR].[KR]W.
Sequence Input: TRQARRNRRRRWRERQRQIH
Subequence Match: RRRRWR
[KR]{1} (7, 8)
[KR] (8, 9)
. (9, 10)
[KR] (10, 11)
W (11, 12)
. (12, 13)
2270
Expression Input: [KR]{1,4}[KR].[KR]W.
Sequence Input: TASQRRNRRRRWKRRGLQIL
Subequence Match: RRRRWK
[KR]{1} (7, 8)
[KR] (8, 9)
. (9, 10)
[KR] (10, 11)
W (11, 12)
. (12, 13)
2271
Expression Input: [KR]{1,4}[KR].[KR]W.
Sequence Input: TRKARRNRRRRWRARQKQIS
Subequence Match: RRRRWR
[KR]{1} (7, 8)
[KR] (8, 9)
. (9, 10)
[KR] (10, 11)
W (11, 12)
. (12, 13)
2272
Expression Input: [KR]{1,4}[KR].[KR]W.
Sequence Input: LDFPSKKRKRSRWNQDTMEQ
Subequence Match: KKRKRSRWN
[KR]{4} (5, 9)
[KR] (9, 10)
. (10, 11)
[KR] (11, 12)
W (12, 13)
. (13, 14)
2273
Expression Input: [KR]{1,4}[KR].[KR]W.
Sequence Input: ASQPPSKRKRRWDQTADQTP
Subequence Match: KRKRRWD
[KR]{2} (6, 8)
[KR] (8, 9)
. (9, 10)
[KR] (10, 11)
W (11, 12)
. (12, 13)
2274
Expression Input: [KR]{1,4}[KR].[KR]W.
Sequence Input: GGATSSARKNRWDETPKTER
Subequence Match: RKNRWD
[KR]{1} (7, 8)
[KR] (8, 9)
. (9, 10)
[KR] (10, 11)
W (11, 12)
. (12, 13)
2275
Expression Input: [KR]{1,4}[KR].[KR]W.
Sequence Input: PTPGASKRKSRWDETPASQM
Subequence Match: KRKSRWD
[KR]{2} (6, 8)
[KR] (8, 9)
. (9, 10)
[KR] (10, 11)
W (11, 12)
. (12, 13)
2276
Expression Input: [VMILF][MILVFYHPA][^P][TASKHCV][AVSC][^P][^P][ILVMT][^P][^P][^P][LMTVI][^P][^P][LMVCT][ILVMCA][^P][^P][AIVLMTC]
Sequence Input: LLNAATALSGSMQYLLNYVN
Subequence Match: LLNAATALSGSMQYLLNYV
[VMILF] (0, 1)
[MILVFYHPA] (1, 2)
[^P] (2, 3)
[TASKHCV] (3, 4)
[AVSC] (4, 5)
[^P] (5, 6)
[^P] (6, 7)
[ILVMT] (7, 8)
[^P] (8, 9)
[^P] (9, 10)
[^P] (10, 11)
[LMTVI] (11, 12)
[^P] (12, 13)
[^P] (13, 14)
[LMVCT] (14, 15)
[ILVMCA] (15, 16)
[^P] (16, 17)
[^P] (17, 18)
[AIVLMTC] (18, 19)
2277
Expression Input: [VMILF][MILVFYHPA][^P][TASKHCV][AVSC][^P][^P][ILVMT][^P][^P][^P][LMTVI][^P][^P][LMVCT][ILVMCA][^P][^P][AIVLMTC]
Sequence Input: IFEASKKVTNSLSNLISLIG
Subequence Match: IFEASKKVTNSLSNLISLI
[VMILF] (0, 1)
[MILVFYHPA] (1, 2)
[^P] (2, 3)
[TASKHCV] (3, 4)
[AVSC] (4, 5)
[^P] (5, 6)
[^P] (6, 7)
[ILVMT] (7, 8)
[^P] (8, 9)
[^P] (9, 10)
[^P] (10, 11)
[LMTVI] (11, 12)
[^P] (12, 13)
[^P] (13, 14)
[LMVCT] (14, 15)
[ILVMCA] (15, 16)
[^P] (16, 17)
[^P] (17, 18)
[AIVLMTC] (18, 19)
2278
Expression Input: [VMILF][MILVFYHPA][^P][TASKHCV][AVSC][^P][^P][ILVMT][^P][^P][^P][LMTVI][^P][^P][LMVCT][ILVMCA][^P][^P][AIVLMTC]
Sequence Input: IYEKAKEVSSALSKVLSKID
Subequence Match: IYEKAKEVSSALSKVLSKI
[VMILF] (0, 1)
[MILVFYHPA] (1, 2)
[^P] (2, 3)
[TASKHCV] (3, 4)
[AVSC] (4, 5)
[^P] (5, 6)
[^P] (6, 7)
[ILVMT] (7, 8)
[^P] (8, 9)
[^P] (9, 10)
[^P] (10, 11)
[LMTVI] (11, 12)
[^P] (12, 13)
[^P] (13, 14)
[LMVCT] (14, 15)
[ILVMCA] (15, 16)
[^P] (16, 17)
[^P] (17, 18)
[AIVLMTC] (18, 19)
2279
Expression Input: [VMILF][MILVFYHPA][^P][TASKHCV][AVSC][^P][^P][ILVMT][^P][^P][^P][LMTVI][^P][^P][LMVCT][ILVMCA][^P][^P][AIVLMTC]
Sequence Input: IYKAAKDVTTSLSKVLKNIN
Subequence Match: IYKAAKDVTTSLSKVLKNI
[VMILF] (0, 1)
[MILVFYHPA] (1, 2)
[^P] (2, 3)
[TASKHCV] (3, 4)
[AVSC] (4, 5)
[^P] (5, 6)
[^P] (6, 7)
[ILVMT] (7, 8)
[^P] (8, 9)
[^P] (9, 10)
[^P] (10, 11)
[LMTVI] (11, 12)
[^P] (12, 13)
[^P] (13, 14)
[LMVCT] (14, 15)
[ILVMCA] (15, 16)
[^P] (16, 17)
[^P] (17, 18)
[AIVLMTC] (18, 19)
2280
Данные из: ELM (Ресурс линейного мотива Eukaryoti c для функциональных сайтов в белках) 2020. Получено из http://elm.eu.org/searchdb.html