Я новичок в области топи c и пытаюсь учиться. Я следую инструкциям, приведенным на сайте ниже
Однако, когда я хочу визуализировать мои темы, используя LDA Я получаю сообщение об ошибке ?! Это код, скопированный с сайта:
# Import required packages
import numpy as np
import logging
import pyLDAvis.gensim
import json
import warnings
warnings.filterwarnings('ignore') # To ignore all warnings that arise here to enhance clarity
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary
from numpy import array
# Import dataset
p_df = pd.read_csv('C:/Users/kamal/Desktop/R project/Reviews.csv')
# Create sample of 10,000 reviews
p_df = p_df.sample(n = 10000)
# Convert to array
docs =array(p_df['Text'])
# Define function for tokenize and lemmatizing
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
def docs_preprocessor(docs):
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(docs)):
docs[idx] = docs[idx].lower() # Convert to lowercase.
docs[idx] = tokenizer.tokenize(docs[idx]) # Split into words.
# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isdigit()] for doc in docs]
# Remove words that are only one character.
docs = [[token for token in doc if len(token) > 3] for doc in docs]
# Lemmatize all words in documents.
lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]
return docs
# Perform function on our document
docs = docs_preprocessor(docs)
#Create Biagram & Trigram Models
from gensim.models import Phrases
# Add bigrams and trigrams to docs,minimum count 10 means only that appear 10 times or more.
bigram = Phrases(docs, min_count=10)
trigram = Phrases(bigram[docs])
for idx in range(len(docs)):
for token in bigram[docs[idx]]:
if '_' in token:
# Token is a bigram, add to document.
for token in trigram[docs[idx]]:
if '_' in token:
# Token is a bigram, add to document.
#Remove rare & common tokens
# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)
dictionary.filter_extremes(no_below=10, no_above=0.2)
#Create dictionary and corpus required for Topic Modeling
corpus = [dictionary.doc2bow(doc) for doc in docs]
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))
# Set parameters.
num_topics = 5
chunksize = 500
passes = 20
iterations = 400
eval_every = 1
# Make a index to word dictionary.
temp = dictionary[0] # only to "load" the dictionary.
id2word = dictionary.id2token
lda_model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, \
alpha='auto', eta='auto', \
iterations=iterations, num_topics=num_topics, \
passes=passes, eval_every=eval_every)
# Print the Keyword in the 5 topics
Вот мой способ использования LDA для просмотра тем.
import pyLDAvis.gensim
import pickle
import pyLDAvis
# Visualize the topics
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
И ошибка, которую я получаю:
AttributeError Traceback (most recent call last)
<ipython-input-22-8a46a8151430> in <module>
4 # Visualize the topics
5 pyLDAvis.enable_notebook()
----> 6 LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
7 LDAvis_prepared
C:\ProgramData\Anaconda3\lib\site-packages\pyLDAvis\gensim.py in prepare(topic_model, corpus, dictionary, doc_topic_dist, **kwargs)
116 See `pyLDAvis.prepare` for **kwargs.
117 """
--> 118 opts = fp.merge(_extract_data(topic_model, corpus, dictionary, doc_topic_dist), kwargs)
119 return vis_prepare(**opts)
C:\ProgramData\Anaconda3\lib\site-packages\pyLDAvis\gensim.py in _extract_data(topic_model, corpus, dictionary, doc_topic_dists)
24 corpus = gensim.matutils.Sparse2Corpus(corpus_csc)
---> 26 vocab = list(dictionary.token2id.keys())
27 # TODO: add the hyperparam to smooth it out? no beta in online LDA impl.. hmm..
28 # for now, I'll just make sure we don't ever get zeros...
AttributeError: 'dict' object has no attribute 'token2id'