Topi c Modeling (Denisim) Визуализация с использованием LDA - PullRequest
0 голосов
/ 19 апреля 2020

Я новичок в области топи c и пытаюсь учиться. Я следую инструкциям, приведенным на сайте ниже

https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/

Однако, когда я хочу визуализировать мои темы, используя LDA Я получаю сообщение об ошибке ?! Это код, скопированный с сайта:

# Import required packages
import numpy as np
import logging
import pyLDAvis.gensim
import json
import warnings
warnings.filterwarnings('ignore')  # To ignore all warnings that arise here to enhance clarity

from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary
from numpy import array

# Import dataset
p_df = pd.read_csv('C:/Users/kamal/Desktop/R project/Reviews.csv')
# Create sample of 10,000 reviews
p_df = p_df.sample(n = 10000)
# Convert to array
docs =array(p_df['Text'])
# Define function for tokenize and lemmatizing
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

def docs_preprocessor(docs):
    tokenizer = RegexpTokenizer(r'\w+')
    for idx in range(len(docs)):
        docs[idx] = docs[idx].lower()  # Convert to lowercase.
        docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

    # Remove numbers, but not words that contain numbers.
    docs = [[token for token in doc if not token.isdigit()] for doc in docs]

    # Remove words that are only one character.
    docs = [[token for token in doc if len(token) > 3] for doc in docs]

    # Lemmatize all words in documents.
    lemmatizer = WordNetLemmatizer()
    docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

    return docs
# Perform function on our document
docs = docs_preprocessor(docs)
#Create Biagram & Trigram Models 
from gensim.models import Phrases
# Add bigrams and trigrams to docs,minimum count 10 means only that appear 10 times or more.
bigram = Phrases(docs, min_count=10)
trigram = Phrases(bigram[docs])

for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)
    for token in trigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)
#Remove rare & common tokens 
# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)
dictionary.filter_extremes(no_below=10, no_above=0.2)
#Create dictionary and corpus required for Topic Modeling
corpus = [dictionary.doc2bow(doc) for doc in docs]
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))
print(corpus[:1])

# Set parameters.
num_topics = 5
chunksize = 500 
passes = 20 
iterations = 400
eval_every = 1  

# Make a index to word dictionary.
temp = dictionary[0]  # only to "load" the dictionary.
id2word = dictionary.id2token

lda_model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, \
                       alpha='auto', eta='auto', \
                       iterations=iterations, num_topics=num_topics, \
                       passes=passes, eval_every=eval_every)
# Print the Keyword in the 5 topics
print(lda_model.print_topics())

Вот мой способ использования LDA для просмотра тем.

import pyLDAvis.gensim
import pickle 
import pyLDAvis
# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
LDAvis_prepared

И ошибка, которую я получаю:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-22-8a46a8151430> in <module>
      4 # Visualize the topics
      5 pyLDAvis.enable_notebook()
----> 6 LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
      7 LDAvis_prepared

C:\ProgramData\Anaconda3\lib\site-packages\pyLDAvis\gensim.py in prepare(topic_model, corpus, dictionary, doc_topic_dist, **kwargs)
    116     See `pyLDAvis.prepare` for **kwargs.
    117     """
--> 118     opts = fp.merge(_extract_data(topic_model, corpus, dictionary, doc_topic_dist), kwargs)
    119     return vis_prepare(**opts)

C:\ProgramData\Anaconda3\lib\site-packages\pyLDAvis\gensim.py in _extract_data(topic_model, corpus, dictionary, doc_topic_dists)
     24       corpus = gensim.matutils.Sparse2Corpus(corpus_csc)
     25 
---> 26    vocab = list(dictionary.token2id.keys())
     27    # TODO: add the hyperparam to smooth it out? no beta in online LDA impl.. hmm..
     28    # for now, I'll just make sure we don't ever get zeros...

AttributeError: 'dict' object has no attribute 'token2id'
...