Я нашел замечательный учебник по созданию хорошей программы для сопоставления строк. Тем не менее, Python выдает мне ошибку:
неопределенное имя 'word_count_dict' , как только я достигаю функции get_potential_matches. Даже после повторного изучения я не смог ее решить.
Пожалуйста, смотрите код ниже:
#counting terms
term_count_dict = {}
for index, entity in leitest2['LegalName'].iteritems():
entity = list(set(entity.split()))
for term in entity:
term_count_dict[term] = term_count_dict.setdefault(term, 0) + 1
#mapping rare terms to entities
rarity = 500
term_to_entities_dict = {}
for index, entity in leitest2['LegalName'].iteritems():
for term in list(set(entity.split())):
# Only include rare words (Less than rarity occurrences)
if term_count_dict[term] < rarity:
term_to_entities_dict.setdefault(term, [])
term_to_entities_dict[term].append(entity)
#determine potential matching - matching engine
def get_potential_matches(entity, term_count_dict, term_to_entities_dict):
term_counts = []
for term in list(set(entity.split())):
if term in term_to_entities_dict:
count = word_count_dict[term] #python marks the issue to be here
term_counts.append((term, count))
term_counts = sorted(term_counts, key=lambda x: x[1])
potential_matches = []
for i in range(len(term_counts)):
term = term_counts[i][0]
potential_matches.extend(term_to_entities_dict[term])
# Limit the # of potential matches per entity
if len(potential_matches) > 5:
break
# Limit to only use the i least common terms per entity
if i >= 3:
break
potential_matches = list(set(potential_matches)) # Remove duplicates
potential_matches.sort() # For easier output / validation
return term_counts, potential_matches
Здесь также ссылка на учебник (http://blog.keyrus.co.uk/fuzzy_matching_101_part_ii.html). Буду очень признателен за вашу помощь.