2018-09-18_CIP.ipynb
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.tokenize import TreebankWordTokenizer as tok
# English stop words, materialised once as a set; sector_tagger tests each
# lowercased token for membership in it.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be
# available locally - confirm.
english_stops = set(stopwords.words('english'))
# Gets synsets for a given term.
def get_synset(word):
    """Return the name of the first WordNet synset found for ``word``.

    Args:
        word: Term to look up via ``wn.synsets``.

    Returns:
        The ``name()`` of the first synset returned for ``word``, or
        ``None`` when the lookup yields no synsets.
    """
    synsets = wn.synsets(word)
    if not synsets:
        # Explicit "no match" result; callers test for None before get_def.
        return None
    return synsets[0].name()
#Gets definitions for a synset.
def get_def(syn):
    """Return the WordNet definition text for a synset name.

    Args:
        syn: Synset name as produced by ``get_synset``
            (for example ``'animal.n.01'``).

    Returns:
        The result of ``wn.synset(syn).definition()``.
    """
    return wn.synset(syn).definition()
# Creates a dataframe called sector_matrix based on another dataframe's column. Should be followed with an export.
def sector_tagger(frame):
    """Tokenize one text value and tag each word with a synset and definition.

    Args:
        frame: The text to process. Despite the name it is passed straight
            to the tokenizer, so it is a single string, not a DataFrame.

    Returns:
        A DataFrame with one row per non-stop-word token and the columns
        ``'Categories'`` (lowercased token), ``'Synsets'`` (result of
        ``get_synset``, possibly ``None``) and ``'Definition'`` (result of
        ``get_def``, or ``''`` when there is no synset).
    """
    lowered = [token.lower() for token in tok().tokenize(frame)]
    clean_words = [w for w in lowered if w not in english_stops]
    synsets = [get_synset(w) for w in clean_words]
    # Words without a synset get an empty definition instead of a lookup.
    definitions = [get_def(s) if s is not None else '' for s in synsets]
    return pd.DataFrame({
        'Categories': clean_words,
        'Synsets': synsets,
        'Definition': definitions,
    })
# Run sector_tagger on every value of the 'Category' column.
# NOTE(review): `df` is not defined in the visible code - presumably it is
# loaded earlier; confirm.
agri_matrix = df['Category'].apply(sector_tagger)
Если это отвечает на ваш вопрос, пожалуйста, отметьте его как ответ.
Вывод get_def представляет собой список фраз.
Альтернативный подход
def sector_tagger(frame):
    """Clean and tokenize one text value, then look up synsets and definitions.

    Args:
        frame: The text to process (a single string, despite the name).

    Returns:
        A 3-tuple ``(clean_words, synset, def_matrix)`` of parallel lists:
        the lowercased non-stop-word tokens, the ``get_synset`` result for
        each token (possibly ``None``), and the ``get_def`` result for each
        synset (``''`` where the synset is ``None``).
    """
    # One pass over the text: '/' becomes a space, while '(', ')' and ','
    # are removed.
    cleaned = frame.translate(str.maketrans('/', ' ', '(),'))
    lowered = [token.lower() for token in tok().tokenize(cleaned)]
    clean_words = [w for w in lowered if w not in english_stops]
    synset = [get_synset(w) for w in clean_words]
    def_matrix = [get_def(s) if s is not None else '' for s in synset]
    return clean_words, synset, def_matrix
# Run sector_tagger on every value of the 'Category' column; each element of
# the result is the (clean_words, synset, def_matrix) tuple it returns.
# NOTE(review): `df` is not defined in the visible code - confirm its source.
poo = df['Category'].apply(sector_tagger)
poo[0] =
(['agricultural', 'domestic', 'animal', 'services'],
['agricultural.a.01', 'domestic.n.01', 'animal.n.01', 'services.n.01'],
['relating to or used in or promoting agriculture or farming',
'a servant who is paid to perform menial tasks around the household',
'a living organism characterized by voluntary movement',
'performance of duties or provision of space and equipment helpful to others'])
# Split the per-row (clean_words, synset, def_matrix) tuples held in `poo`
# into three parallel lists, one entry per row.
list_clean_words = [row[0] for row in poo]
list_synset = [row[1] for row in poo]
list_def_matrix = [row[2] for row in poo]

# Assemble the per-row lists into a frame; every cell holds a list.
agri_matrix = pd.DataFrame()
agri_matrix['Categories'] = list_clean_words
agri_matrix['Synsets'] = list_synset
agri_matrix['Definition'] = list_def_matrix
agri_matrix
Categories Synsets Definition
0 [agricultural, domestic, animal, services] [agricultural.a.01, domestic.n.01, animal.n.01... [relating to or used in or promoting agricultu...
1 [agricultural, food, products, processing] [agricultural.a.01, food.n.01, merchandise.n.0... [relating to or used in or promoting agricultu...
2 [agricultural, business, management] [agricultural.a.01, business.n.01, management.... [relating to or used in or promoting agricultu...
3 [agricultural, mechanization] [agricultural.a.01, mechanization.n.01] [relating to or used in or promoting agricultu...
4 [agricultural, production, operations] [agricultural.a.01, production.n.01, operation... [relating to or used in or promoting agricultu...
Развернуть каждый список списков в один длинный список (порядок элементов сохраняется)
def create_long_list_from_list_of_lists(list_of_lists):
    """Flatten one level of nesting into a single list.

    Args:
        list_of_lists: An iterable whose items are themselves iterables.

    Returns:
        A new list containing every inner item, in the order encountered
        (outer order first, then inner order).
    """
    return [item for one_list in list_of_lists for item in one_list]
# Flatten each of the three per-row list-of-lists into one flat list.
long_list_clean_words = create_long_list_from_list_of_lists(list_clean_words)
long_list_synset = create_long_list_from_list_of_lists(list_synset)
long_list_def_matrix = create_long_list_from_list_of_lists(list_def_matrix)
Превратить их в DataFrame с уникальными категориями
# Build one row per word from the flattened lists, drop exact duplicate rows
# and renumber the index from zero.
agri_df = (
    pd.DataFrame({
        'Categories': long_list_clean_words,
        'Synsets': long_list_synset,
        'Definitions': long_list_def_matrix,
    })
    .drop_duplicates()
    .reset_index(drop=True)
)
agri_df.head(4)
Categories Synsets Definitions
0 ceramic ceramic.n.01 an artifact made of hard brittle material prod...
1 horticultural horticultural.a.01 of or relating to the cultivation of plants
2 construction construction.n.01 the act of constructing something
3 building building.n.01 a structure that has a roof and walls and stan...
Заключительное примечание
from nltk.tokenize import TreebankWordTokenizer as tok
или:
from nltk.tokenize import word_tokenize
для использования:
tok().tokenize(string_text_phrase) # text is a string phrase, not a list of words
или:
word_tokenize(string_text_phrase)
Похоже, что оба метода дают один и тот же результат — список слов.
# Illustrative example only: a sample phrase and the token list that the note
# above reports for both tokenizers.
# NOTE(review): `input` shadows the Python builtin of the same name.
input = "Agricultural and domestic animal services"
output_of_both_methods = ['Agricultural', 'and', 'domestic', 'animal', 'services']