Существует множество примеров моделирования тем LDA Mallet, однако ни один из них не показывает, как добавить доминирующую тему, процентный вклад и ключевые слова темы в исходный фрейм данных. Давайте предположим, что это набор данных и мой код
Набор данных:
Document_Id Text
1 'Here goes one example sentence that is generic'
2 'My car drives really fast and I have no brakes'
3 'Your car is slow and needs no brakes'
4 'Your and my vehicle are both not as fast as the airplane'
Код
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import pandas as pd
df = pd.read_csv('data_above.csv')
data = df.Text.values.tolist()
# Assuming I have done all the preprocessing, lemmatization and so on and ended up with data_lemmatized:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)
# Create Corpus
texts = data_lemmatized
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]
model = gensim.models.ldamodel.LdaModel(corpus=corpus,
id2word=id2word,
num_topics=50,random_state=100,
chunksize = 1000, update_every=1,
passes=10, alpha='auto', per_word_topics=True)
Я пробовал что-то подобное, но это не работает ...
def format_topics_sentences(ldamodel, corpus, df):
# Init output
sent_topics_df = pd.DataFrame()
# Get main topic in each document
for i, row in enumerate(ldamodel[corpus]):
row = sorted(row, key=lambda x: (x[1]), reverse=True)
# Get the Dominant topic, Perc Contribution and Keywords for each document
for j, (topic_num, prop_topic) in enumerate(row):
if j == 0: # => dominant topic
wp = ldamodel.show_topic(topic_num)
topic_keywords = ", ".join([word for word, prop in wp])
sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
else:
break
sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
# Add original text to the end of the output
contents = df
sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
return(sent_topics_df)