Вот один из способов сделать это, используя pandas:
import pandas as pd
import numpy as np
import re
from collections import OrderedDict
# a function that splits a string into text and number
def my_splitter(s):
    """Split a string into its alternating text and digit runs.

    The string is cut around every run of decimal digits. The digit runs
    themselves are kept (the pattern uses a capturing group) and empty
    pieces are dropped.

    Args:
        s: The string to split.

    Returns:
        A list of the non-empty pieces in their original order, e.g.
        ``'Transatlantic, Inc 458'`` -> ``['Transatlantic, Inc ', '458']``.
    """
    # Build a real list: on Python 3 filter() is a lazy iterator, which
    # supports neither len() nor indexing.
    return [piece for piece in re.split(r'(\d+)', s) if piece]
# Read the data as a single-column dataframe from the file
# (tab-separated, first line skipped, the column is named 'Name').
df = pd.read_csv('dataset.txt', sep='\t', header=None, skiprows=1, names=['Name'])
join = []
for raw_name in df['Name']:
    # Split once per row. list() keeps the result indexable even if
    # my_splitter hands back a lazy iterator.
    parts = list(my_splitter(raw_name))
    # Exactly two pieces means "<text><number>"; any other shape is
    # recorded with the placeholder ID 'na'.
    company_id = parts[1] if len(parts) == 2 else 'na'
    join.append({'Name': parts[0], 'ID': company_id})
df_new = pd.DataFrame(join)
# Build a dictionary mapping each known ID to the whitespace-separated
# words of the name it appeared with (a later row with the same ID
# replaces an earlier one).
diction = OrderedDict()
for name, company_id in zip(df_new['Name'], df_new['ID']):
    if company_id != 'na':
        diction[company_id] = name.split()

# Rows still marked 'na' take the ID of a known name that shares at least
# one word with them. The row labels are collected up front, so the
# assignments below do not affect which rows are visited.
for row in df_new.index[df_new['ID'] == 'na']:
    name_words = set(df_new.at[row, 'Name'].split())
    for company_id, known_words in diction.items():
        if not name_words.isdisjoint(known_words):
            # Write through .at: chained indexing (df_new['ID'][row] = ...)
            # may assign to a temporary copy. No break here, so the last
            # matching ID wins.
            df_new.at[row, 'ID'] = company_id
# Show the frame as read from the input file, a separator line, and then
# the frame with the resolved IDs.
for shown in (df, "####################", df_new):
    print(shown)
# Save the result, tab-separated, to dataset.txt (the same path the input
# was read from).
df_new.to_csv('dataset.txt', sep='\t')
Вывод:
Name
0 T. Rowe Price Group
1 Group, T. Rowe Price 576
2 T. ROWE PRICE GROUP
3 Transatlantic, Inc 458
4 Transatlantic, Incorporated
5 Transatlantic, Inc 458
####################
ID Name
0 576 T. Rowe Price Group
1 576 Group, T. Rowe Price
2 576 T. ROWE PRICE GROUP
3 458 Transatlantic, Inc
4 458 Transatlantic, Incorporated
5 458 Transatlantic, Inc