Используйте модуль re:
import re
# Score each candidate name by how well it matches one of the keywords.
#
# Final score = base tier + occurrence bonus, stored in ``match`` as a string:
#   * base tier 0.5  - keyword is followed by whitespace ("close" match)
#   * base tier 0.75 - candidate is exactly the keyword
#   * base tier 0.25 - keyword merely occurs somewhere in the candidate
#   * bonus up to 0.25 - proportional to how often the keyword occurs in the
#     candidate relative to the highest occurrence count seen in any hit.

# Candidate names to be scored.  Raw strings are used where the text contains
# a backslash so that no invalid escape sequence is produced.
love = [
    'Intel(R) software',
    'Intel IT',
    'IntelliCAD Technology Consortium',
    'Huaian Ningda intelligence Project co.,Ltd',
    'Intellon Corporation',
    r'INTEL\Giovanni',
    'Internal - Intel® Identity Protection Technology Software',
    '*.google.com',
    'GoogleHit',
    'http://www.google.com',
    'Google Play - Olmsted County',
    'Microsoft Windows Component Publisher',
    'Microsoft Windows 2000 Publisher',
    'Microsoft Windows XP Publisher',
    'Windows Embedded Signer',
    'Windows Corporation',
    r'Windows7-PC\Windows7',
]

# Keywords searched for (case-sensitively) in every candidate.
regex_words = ['Intel', 'Windows', 'Google']

match = {}      # candidate -> score (as a string)
counts = {}     # running hit index -> keyword occurrences for that hit
item_hits = {}  # candidate -> occurrences for the hit that set its base tier
no = 0          # number of (keyword, candidate) hits recorded so far

for word in regex_words:
    # "Close" match: the keyword followed by whitespace.  The leading
    # whitespace is optional, so only the trailing whitespace is required.
    close_pattern = re.compile(r'\s?' + word + r'\s')

    for item in love:
        occurrences = len(re.findall(word, item))
        if not occurrences:
            continue

        counts[no] = occurrences
        no += 1
        # Remember the count per candidate so the bonus pass below does not
        # depend on ``counts`` and ``match`` staying index-aligned (they drift
        # apart as soon as one candidate is hit by more than one keyword).
        item_hits[item] = occurrences

        if close_pattern.search(item):
            base = 0.5
        elif item == word:
            base = 0.75
        else:
            base = 0.25
        match[item] = str(base)

# Occurrence bonus: compare each candidate's occurrence count against the
# highest count recorded for any hit.
highest = max(counts.values(), default=0)

for item, base in match.items():
    # Share of the highest count, as a percentage.
    share = item_hits[item] / highest * 100
    # Portion of the remaining 25 points earned, scaled back to 0..0.25.
    bonus = round(share / 100 * 25 / 100, 2)
    match[item] = str(float(base) + bonus)
Будет выводиться
{'Intel(R) software': '0.37', 'Intel IT': '0.62', 'IntelliCAD Technology Consortium': '0.37', 'Intellon Corporation': '0.37', 'Internal - Intel® Identity Protection Technology Software': '0.37', 'Microsoft Windows Component Publisher': '0.62', 'Microsoft Windows 2000 Publisher': '0.62', 'Microsoft Windows XP Publisher': '0.62', 'Windows Embedded Signer': '0.62', 'Windows Corporation': '0.62', 'Windows7-PC\\Windows7': '0.5', 'GoogleHit': '0.37', 'Google Play - Olmsted County': '0.62'}