Я использую fuzzywuzzy для дедупликации. Я хочу сохранить дубликаты, а не удалять их, и помечать каждую запись номером кластера на основе её ближайшего совпадения. Например:
FirstName|LastName|Cluster
Jennifer |Lopez |0
Kanye |West |1
Jennifer |L |0
K | West |1
. Я пытаюсь изменить код на уровне библиотечной функции. Уже два дня ломаю над этим голову и не могу разобраться, поэтому решил обратиться за помощью. Вот код:
def dedupe(contains_dupes, threshold, scorer=fuzz.token_set_ratio, with_clusters=False):
    """Fuzzy-deduplicate *contains_dupes*.

    For each item, every fuzzy match scoring strictly above *threshold* is
    collected; the longest match (ties broken alphabetically) is taken as
    that item's canonical representative.

    Args:
        contains_dupes: list of strings to deduplicate.
        threshold: score (0-100); matches scoring above it count as duplicates.
        scorer: fuzzywuzzy scoring function (default: fuzz.token_set_ratio).
        with_clusters: when True, nothing is dropped — instead a list of
            (item, cluster_id) pairs is returned, where items sharing a
            canonical representative share the same cluster id (ids are
            assigned in first-seen order).

    Returns:
        By default, the list of unique canonical items in first-seen order,
        or the original list unchanged when no duplicates were found.
        When *with_clusters* is True, a list of (item, cluster_id) pairs
        covering every input item.
    """
    # Canonical representative for each input item, kept in input order so
    # it can be zipped back against contains_dupes for cluster labelling.
    canonical = []
    for item in contains_dupes:
        # All matches of *item* against the full list, then threshold-filter.
        matches = process.extract(item, contains_dupes, limit=None, scorer=scorer)
        filtered = [m for m in matches if m[1] > threshold]
        if len(filtered) == 1:
            # Only matched itself: this item has no duplicates.
            canonical.append(filtered[0][0])
        else:
            # Longest string wins, ties broken alphabetically. One sort on a
            # composite key replaces the original pair of stable sorts
            # (alpha sort followed by length-descending sort).
            filtered.sort(key=lambda m: (-len(m[0]), m[0]))
            canonical.append(filtered[0][0])

    if with_clusters:
        # Keep every original row; label it with the cluster id of its
        # canonical representative.
        cluster_ids = {}
        labeled = []
        for item, canon in zip(contains_dupes, canonical):
            cluster_ids.setdefault(canon, len(cluster_ids))
            labeled.append((item, cluster_ids[canon]))
        return labeled

    # Uniquify while preserving first-seen order (dict keeps insertion order);
    # returning a real list instead of a dict_keys view.
    extractor = list(dict.fromkeys(canonical))
    # If nothing collapsed, duplicates were not found: hand back the original.
    if len(extractor) == len(contains_dupes):
        return contains_dupes
    return extractor
# The CSV is loaded through pandas because dedupe() expects a flat list of
# strings, not a DataFrame.
df = pd.read_csv('Some_csv')
# Combine first and last name into one field; '+' marks the field boundary.
df['combined'] = df['FirstName'] + '+' + df['LastName']
# Series.tolist() is the correct method name — .toList() raises AttributeError.
contain_dupes = df['combined'].tolist()
# Capture the result instead of discarding dedupe()'s return value.
result = dedupe(contain_dupes, 70)