Темы LDA не имеют изменений - PullRequest
0 голосов
/ 25 сентября 2019

Я пытаюсь найти категории для веб-сайта () по содержимому, которое присутствует на том же самом, но моя модель LDA продолжает возвращать очень похожие темы.Не уверен, что у моего корпуса или словаря есть проблемы

Я пытался следовать многим онлайн-учебникам, которые показывают очень похожий код, но почему-то мой код работает неправильно.Я также пробовал многоядерный LDA.Я думаю использовать TF-IDF, но думаю, что это должно было работать само по себе.Я открыт для любых предложений, если LDA не подходит, пожалуйста, дайте мне знать.

Я не рассмотрел весь сайт здесь, но это конечная цель.

import random
import pandas as pd
import re
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from collections import Counter
from nltk.collocations import *
from nltk import FreqDist
import string
from sklearn.feature_extraction.text import CountVectorizer
import csv
import gensim
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel, LdaModel, LdaMulticore
from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
from pprint import pprint
import matplotlib.pyplot as plt
import tokenize
from urllib.request import urlopen
from bs4 import BeautifulSoup
from nltk import word_tokenize
import requests
random.seed(1999)
np.random.seed(1999)

url_list=['https://www.aarp.org/caregiving/answers/info-2017/adult-day-care.html','https://www.aarp.org/caregiving/answers/info-2017/aging-parent-stop-driving.html','https://www.aarp.org/travel/vacation-ideas/cruises/info-2019/how-to-save.html',"https://www.aarp.org/caregiving/home-care/info-2018/hiring-caregiver.html","https://www.aarp.org/travel/vacation-ideas/cruises/info-2019/plan-by-budget.html"]
All_text=[]
myWords=[]
myWords_internal=[]
myWords_External=[]
custom_stopwords=["share", "print", "en", "español","espanol", "register",  "login",    "search",   "join",     "discussion",   "please",   "leave ",   "comment", "comments", "must", "logged",    "aarp",     "email",    "org",  "using",    "linkedin",     "twitter",  "facebook","javascript", "www", "qa"]
ps=nltk.WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')
for i in custom_stopwords:
    stopwords.append(i)

for url in url_list:
    All_text=[]
    myWords=[]
    myWords_internal=[]

    answer = requests.get(url)

    html = urlopen(url)
    soup = BeautifulSoup(html, "html5lib")

    text = (''.join(s.findAll(text=True))for s in soup.findAll({'h1' : True,'h2' : True,'h3' : True,'h4' : True,'p' : True}))

    for y in text:
#   
      g=y.lower()
#      
      z=re.split('\W+',g)

      texts= " ".join([word for word in z if word not in string.punctuation])

      tokens= re.split('\W+',texts)

      texts2= " ".join([ps.lemmatize(word) for word in tokens if word not in stopwords])

      tokens= re.split('\W+',texts2)

      texts3= " ".join([word for word in tokens if word.isnumeric() == False])
      tokens= re.split('\W+',texts3)

      texts4=" ".join([word for word in tokens if word != "  "])
      tokens= re.split('\W+',texts4)
      texts5=" ".join([word for word in tokens if word != ""])
      tokens= re.split('\W+',texts5)
      str_list = list(filter(None, tokens))

      myWords_internal.append(str_list)

      myWords = [item for sublist in myWords_internal for item in sublist]

    myWords_External.append(myWords)

dictionary = corpora.Dictionary(myWords_External)

corpus1 = [dictionary.doc2bow(i) for i in myWords_External]

limit=15
start=1 
step=1
coherence_values = []
model_list = []
for num_topics in range(start, limit, step):
  print(num_topics)
  lda_model = LdaModel(corpus=corpus1, num_topics=num_topics, id2word=dictionary)
#  lda_model = gensim.models.LdaMulticore(corpus1, num_topics=num_topics, id2word=dictionary, passes=1, workers=2)

  print(lda_model.show_topics())

  model_list.append(lda_model)

  coherencemodel = CoherenceModel(model=lda_model, texts=myWords_External, dictionary=dictionary, coherence='c_v')
  coherence_values.append(coherencemodel.get_coherence())

print(coherence_values)                               #Cross-Verifying the Coherence value
print(*coherence_values, sep = "\n")

Ниже приведен пример результатов, если принять во внимание весь сайт.

1 [(0, '0.010*"receive" + 0.008*"member" + 0.008*"benefit" + 0.008*"save" + 0.005*"people" + 0.004*"say" + 0.004*"related" + 0.004*"make" + 0.004*"age" + 0.004*"year"')] 
2 [(0, '0.008*"member" + 0.007*"receive" + 0.007*"save" + 0.006*"benefit" + 0.005*"people" + 0.005*"say" + 0.005*"one" + 0.005*"age" + 0.004*"related" + 0.004*"year"'), (1, '0.013*"receive" + 0.009*"member" + 0.009*"benefit" + 0.008*"save" + 0.006*"enter" + 0.005*"valid" + 0.005*"people" + 0.005*"make" + 0.004*"social" + 0.004*"free"')] 
3 [(0, '0.011*"receive" + 0.010*"member" + 0.009*"benefit" + 0.008*"save" + 0.006*"people" + 0.005*"subscription" + 0.005*"valid" + 0.005*"provider" + 0.004*"social" + 0.004*"leave"'), (1, '0.008*"save" + 0.007*"member" + 0.007*"receive" + 0.007*"benefit" + 0.004*"age" + 0.004*"make" + 0.004*"one" + 0.004*"say" + 0.004*"related" + 0.004*"people"'), (2, '0.010*"receive" + 0.007*"member" + 0.007*"benefit" + 0.006*"save" + 0.005*"people" + 0.005*"say" + 0.005*"year" + 0.005*"one" + 0.004*"provider" + 0.004*"get"')] 
4 [(0, '0.008*"member" + 0.007*"receive" + 0.006*"save" + 0.005*"year" + 0.005*"benefit" + 0.005*"people" + 0.004*"related" + 0.004*"one" + 0.004*"subscription" + 0.004*"also"'), (1, '0.010*"receive" + 0.008*"member" + 0.008*"save" + 0.007*"benefit" + 0.006*"people" + 0.005*"age" + 0.005*"free" + 0.005*"volunteering" + 0.005*"provider" + 0.005*"make"'), (2, '0.011*"receive" + 0.011*"benefit" + 0.010*"member" + 0.009*"save" + 0.006*"social" + 0.006*"valid" + 0.006*"enter" + 0.005*"say" + 0.005*"age" + 0.005*"people"'), (3, '0.008*"receive" + 0.006*"member" + 0.006*"people" + 0.006*"save" + 0.005*"benefit" + 0.005*"say" + 0.004*"year" + 0.004*"related" + 0.004*"also" + 0.004*"make"')] 
5 [(0, '0.008*"member" + 0.008*"receive" + 0.007*"benefit" + 0.006*"save" + 0.005*"make" + 0.005*"people" + 0.004*"subscription" + 0.004*"insurance" + 0.004*"volunteering" + 0.004*"leave"'), (1, '0.009*"receive" + 0.008*"member" + 0.006*"say" + 0.006*"save" + 0.006*"benefit" + 0.005*"people" + 0.005*"way" + 0.005*"make" + 0.004*"year" + 0.004*"one"'), (2, '0.012*"receive" + 0.008*"member" + 0.007*"save" + 0.007*"benefit" + 0.006*"age" + 0.006*"home" + 0.005*"related" + 0.005*"people" + 0.005*"year" + 0.005*"one"'), (3, '0.011*"benefit" + 0.011*"receive" + 0.010*"member" + 0.010*"save" + 0.008*"enter" + 0.007*"valid" + 0.006*"social" + 0.006*"people" + 0.005*"related" + 0.005*"location"'), (4, '0.008*"receive" + 0.007*"save" + 0.007*"member" + 0.006*"benefit" + 0.005*"people" + 0.004*"year" + 0.004*"say" + 0.004*"make" + 0.004*"provider" + 0.004*"volunteering"')] 
6 [(0, '0.012*"enter" + 0.012*"valid" + 0.007*"location" + 0.007*"save" + 0.007*"receive" + 0.006*"room" + 0.006*"member" + 0.006*"date" + 0.005*"people" + 0.004*"benefit"'), (1, '0.009*"member" + 0.008*"receive" + 0.007*"save" + 0.006*"benefit" + 0.005*"year" + 0.005*"say" + 0.005*"people" + 0.005*"make" + 0.005*"one" + 0.004*"leave"'), (2, '0.009*"receive" + 0.008*"member" + 0.008*"save" + 0.008*"benefit" + 0.005*"people" + 0.005*"one" + 0.005*"related" + 0.004*"make" + 0.004*"provider" + 0.004*"say"'), (3, '0.012*"benefit" + 0.009*"receive" + 0.008*"social" + 0.008*"member" + 0.007*"save" + 0.006*"security" + 0.005*"age" + 0.005*"say" + 0.005*"year" + 0.005*"make"'), (4, '0.015*"receive" + 0.011*"member" + 0.009*"benefit" + 0.009*"save" + 0.006*"age" + 0.006*"people" + 0.006*"related" + 0.006*"subscription" + 0.005*"provider" + 0.005*"insurance"'), (5, '0.009*"receive" + 0.007*"member" + 0.007*"save" + 0.006*"say" + 0.005*"people" + 0.005*"benefit" + 0.005*"day" + 0.004*"make" + 0.004*"one" + 0.004*"volunteering"')] 
7 [(0, '0.008*"receive" + 0.008*"member" + 0.006*"save" + 0.006*"benefit" + 0.005*"get" + 0.004*"social" + 0.004*"say" + 0.004*"make" + 0.004*"also" + 0.004*"one"'), (1, '0.012*"valid" + 0.011*"enter" + 0.008*"save" + 0.008*"receive" + 0.008*"location" + 0.007*"member" + 0.006*"room" + 0.006*"benefit" + 0.006*"date" + 0.004*"people"'), (2, '0.008*"benefit" + 0.008*"receive" + 0.007*"age" + 0.007*"say" + 0.006*"people" + 0.006*"social" + 0.005*"member" + 0.005*"save" + 0.005*"medicare" + 0.005*"year"'), (3, '0.008*"receive" + 0.007*"member" + 0.006*"save" + 0.005*"one" + 0.005*"benefit" + 0.005*"people" + 0.004*"subscription" + 0.004*"year" + 0.004*"family" + 0.004*"home"'), (4, '0.009*"receive" + 0.008*"member" + 0.008*"save" + 0.006*"benefit" + 0.005*"people" + 0.005*"related" + 0.004*"community" + 0.004*"say" + 0.004*"u" + 0.004*"new"'), (5, '0.014*"receive" + 0.012*"member" + 0.010*"save" + 0.009*"benefit" + 0.006*"provider" + 0.006*"people" + 0.006*"volunteering" + 0.006*"confirm" + 0.006*"social" + 0.006*"leave"'), (6, '0.011*"receive" + 0.010*"benefit" + 0.009*"member" + 0.008*"save" + 0.006*"people" + 0.005*"age" + 0.005*"say" + 0.005*"make" + 0.005*"provider" + 0.004*"related"')] 
8 [(0, '0.006*"receive" + 0.006*"member" + 0.006*"save" + 0.005*"benefit" + 0.005*"people" + 0.004*"one" + 0.004*"make" + 0.004*"year" + 0.004*"age" + 0.004*"also"'), (1, '0.007*"say" + 0.006*"receive" + 0.006*"member" + 0.006*"save" + 0.006*"year" + 0.005*"people" + 0.005*"enter" + 0.005*"benefit" + 0.005*"time" + 0.005*"one"'), (2, '0.011*"receive" + 0.011*"member" + 0.010*"benefit" + 0.008*"save" + 0.006*"social" + 0.006*"related" + 0.006*"volunteering" + 0.006*"people" + 0.006*"age" + 0.006*"leave"'), (3, '0.009*"benefit" + 0.008*"receive" + 0.006*"people" + 0.006*"save" + 0.006*"member" + 0.006*"say" + 0.005*"get" + 0.005*"year" + 0.004*"way" + 0.004*"make"'), (4, '0.010*"receive" + 0.009*"member" + 0.008*"save" + 0.006*"benefit" + 0.006*"drug" + 0.005*"people" + 0.005*"health" + 0.005*"say" + 0.004*"year" + 0.004*"make"'), (5, '0.010*"receive" + 0.009*"member" + 0.008*"valid" + 0.008*"enter" + 0.008*"save" + 0.007*"benefit" + 0.005*"location" + 0.005*"home" + 0.005*"room" + 0.004*"people"'), (6, '0.013*"receive" + 0.011*"member" + 0.010*"save" + 0.010*"benefit" + 0.006*"provider" + 0.005*"related" + 0.005*"people" + 0.005*"age" + 0.005*"confirm" + 0.005*"subscription"'), (7, '0.009*"receive" + 0.006*"benefit" + 0.006*"save" + 0.005*"member" + 0.005*"social" + 0.005*"related" + 0.005*"use" + 0.004*"security" + 0.004*"subscription" + 0.004*"provider"')] 
9 [(0, '0.010*"enter" + 0.009*"valid" + 0.006*"member" + 0.006*"receive" + 0.006*"save" + 0.005*"room" + 0.005*"date" + 0.005*"location" + 0.004*"say" + 0.004*"also"'), (1, '0.011*"valid" + 0.011*"receive" + 0.010*"enter" + 0.010*"member" + 0.007*"benefit" + 0.007*"save" + 0.007*"location" + 0.006*"date" + 0.006*"people" + 0.005*"room"'), (2, '0.012*"receive" + 0.011*"member" + 0.011*"save" + 0.010*"benefit" + 0.006*"subscription" + 0.006*"people" + 0.006*"related" + 0.006*"leave" + 0.006*"provider" + 0.005*"volunteering"'), (3, '0.006*"receive" + 0.006*"people" + 0.005*"save" + 0.005*"year" + 0.004*"say" + 0.004*"benefit" + 0.004*"home" + 0.004*"get" + 0.004*"age" + 0.004*"make"'), (4, '0.011*"receive" + 0.010*"member" + 0.008*"save" + 0.008*"benefit" + 0.006*"provider" + 0.006*"make" + 0.006*"medicare" + 0.005*"people" + 0.005*"related" + 0.004*"volunteering"'), (5, '0.008*"receive" + 0.005*"save" + 0.005*"member" + 0.005*"say" + 0.005*"home" + 0.005*"benefit" + 0.004*"one" + 0.004*"get" + 0.004*"people" + 0.004*"related"'), (6, '0.009*"benefit" + 0.009*"receive" + 0.008*"save" + 0.007*"member" + 0.007*"social" + 0.005*"age" + 0.005*"year" + 0.005*"security" + 0.004*"community" + 0.004*"make"'), (7, '0.009*"receive" + 0.008*"member" + 0.008*"benefit" + 0.007*"say" + 0.006*"people" + 0.006*"save" + 0.006*"year" + 0.005*"make" + 0.005*"age" + 0.004*"related"'), (8, '0.007*"receive" + 0.007*"benefit" + 0.006*"member" + 0.006*"save" + 0.005*"people" + 0.004*"home" + 0.004*"say" + 0.004*"make" + 0.004*"one" + 0.004*"year"')] 
10 [(0, '0.013*"receive" + 0.011*"member" + 0.010*"save" + 0.009*"benefit" + 0.006*"people" + 0.006*"provider" + 0.005*"age" + 0.005*"subscription" + 0.005*"free" + 0.005*"related"'), (1, '0.009*"receive" + 0.007*"member" + 0.006*"save" + 0.005*"benefit" + 0.005*"make" + 0.004*"people" + 0.004*"one" + 0.004*"related" + 0.004*"also" + 0.004*"volunteering"'), (2, '0.006*"say" + 0.006*"receive" + 0.006*"save" + 0.006*"member" + 0.005*"get" + 0.005*"benefit" + 0.005*"year" + 0.005*"people" + 0.004*"make" + 0.004*"one"'), (3, '0.009*"receive" + 0.007*"member" + 0.006*"save" + 0.006*"benefit" + 0.006*"year" + 0.005*"make" + 0.005*"people" + 0.005*"provider" + 0.005*"valid" + 0.004*"enter"'), (4, '0.010*"receive" + 0.010*"save" + 0.008*"benefit" + 0.008*"member" + 0.006*"age" + 0.005*"year" + 0.005*"subscription" + 0.005*"say" + 0.004*"provider" + 0.004*"social"'), (5, '0.011*"receive" + 0.008*"benefit" + 0.008*"member" + 0.007*"save" + 0.006*"people" + 0.006*"health" + 0.005*"care" + 0.005*"also" + 0.005*"home" + 0.005*"volunteering"'), (6, '0.013*"receive" + 0.011*"save" + 0.010*"member" + 0.008*"benefit" + 0.006*"enter" + 0.006*"valid" + 0.005*"provider" + 0.005*"location" + 0.005*"related" + 0.005*"people"'), (7, '0.007*"member" + 0.007*"valid" + 0.006*"enter" + 0.006*"receive" + 0.006*"save" + 0.005*"say" + 0.005*"people" + 0.004*"day" + 0.004*"year" + 0.004*"new"'), (8, '0.015*"benefit" + 0.013*"member" + 0.011*"receive" + 0.009*"social" + 0.009*"save" + 0.008*"security" + 0.006*"people" + 0.006*"age" + 0.006*"year" + 0.005*"related"'), (9, '0.008*"enter" + 0.008*"valid" + 0.008*"receive" + 0.006*"save" + 0.006*"member" + 0.005*"say" + 0.005*"location" + 0.005*"benefit" + 0.005*"date" + 0.004*"related"')] 
11 [(10, '0.015*"enter" + 0.015*"valid" + 0.010*"receive" + 0.009*"location" + 0.009*"save" + 0.008*"member" + 0.008*"room" + 0.007*"date" + 0.007*"benefit" + 0.004*"volunteering"'), (5, '0.010*"receive" + 0.008*"member" + 0.007*"benefit" + 0.006*"medicare" + 0.006*"care" + 0.006*"health" + 0.006*"insurance" + 0.005*"people" + 0.005*"drug" + 0.005*"save"'), (1, '0.014*"benefit" + 0.013*"social" + 0.011*"security" + 0.010*"receive" + 0.009*"member" + 0.008*"save" + 0.005*"also" + 0.005*"people" + 0.005*"day" + 0.004*"make"'), (8, '0.008*"receive" + 0.007*"member" + 0.006*"benefit" + 0.005*"also" + 0.005*"people" + 0.005*"save" + 0.004*"say" + 0.004*"subscription" + 0.004*"age" + 0.004*"hearing"'), (4, '0.007*"receive" + 0.007*"benefit" + 0.007*"age" + 0.007*"member" + 0.007*"save" + 0.007*"people" + 0.006*"make" + 0.005*"related" + 0.005*"provider" + 0.005*"say"'), (9, '0.013*"receive" + 0.010*"member" + 0.009*"save" + 0.008*"benefit" + 0.005*"one" + 0.005*"say" + 0.005*"related" + 0.005*"people" + 0.005*"provider" + 0.005*"subscription"'), (3, '0.006*"member" + 0.006*"say" + 0.006*"receive" + 0.005*"one" + 0.004*"save" + 0.004*"year" + 0.004*"benefit" + 0.004*"u" + 0.004*"time" + 0.004*"home"'), (0, '0.013*"receive" + 0.010*"member" + 0.008*"benefit" + 0.007*"save" + 0.007*"people" + 0.006*"provider" + 0.005*"subscription" + 0.005*"say" + 0.005*"related" + 0.005*"confirm"'), (2, '0.011*"receive" + 0.010*"save" + 0.009*"member" + 0.006*"people" + 0.005*"benefit" + 0.005*"leave" + 0.005*"volunteering" + 0.005*"help" + 0.005*"age" + 0.005*"make"'), (7, '0.008*"say" + 0.007*"benefit" + 0.006*"receive" + 0.006*"year" + 0.005*"save" + 0.005*"member" + 0.005*"people" + 0.004*"like" + 0.004*"time" + 0.004*"one"')] 
12 [(0, '0.009*"benefit" + 0.009*"social" + 0.008*"receive" + 0.006*"save" + 0.006*"security" + 0.006*"member" + 0.004*"medicare" + 0.004*"make" + 0.004*"tax" + 0.004*"year"'), (7, '0.013*"receive" + 0.008*"save" + 0.008*"member" + 0.007*"benefit" + 0.005*"time" + 0.005*"subscription" + 0.004*"provider" + 0.004*"related" + 0.004*"leave" + 0.004*"insurance"'), (8, '0.012*"receive" + 0.008*"save" + 0.007*"member" + 0.007*"benefit" + 0.006*"insurance" + 0.006*"age" + 0.005*"people" + 0.005*"health" + 0.005*"help" + 0.005*"make"'), (2, '0.015*"benefit" + 0.011*"receive" + 0.009*"save" + 0.008*"social" + 0.008*"member" + 0.006*"security" + 0.006*"age" + 0.005*"year" + 0.005*"people" + 0.005*"related"'), (6, '0.009*"receive" + 0.007*"member" + 0.006*"benefit" + 0.006*"save" + 0.005*"medicare" + 0.004*"people" + 0.004*"provider" + 0.004*"age" + 0.004*"say" + 0.004*"also"'), (10, '0.019*"enter" + 0.018*"valid" + 0.010*"location" + 0.009*"room" + 0.009*"date" + 0.007*"save" + 0.006*"member" + 0.006*"receive" + 0.005*"benefit" + 0.005*"flight"'), (3, '0.007*"member" + 0.007*"receive" + 0.005*"benefit" + 0.005*"say" + 0.005*"people" + 0.004*"one" + 0.004*"way" + 0.004*"save" + 0.004*"related" + 0.004*"year"'), (1, '0.013*"receive" + 0.013*"member" + 0.012*"save" + 0.010*"benefit" + 0.008*"people" + 0.007*"related" + 0.006*"provider" + 0.006*"say" + 0.006*"volunteering" + 0.006*"confirm"'), (5, '0.007*"member" + 0.007*"receive" + 0.006*"year" + 0.006*"percent" + 0.005*"people" + 0.005*"save" + 0.005*"say" + 0.004*"job" + 0.004*"u" + 0.004*"benefit"'), (11, '0.012*"member" + 0.008*"receive" + 0.008*"save" + 0.007*"benefit" + 0.005*"get" + 0.004*"also" + 0.004*"people" + 0.004*"volunteering" + 0.004*"confirm" + 0.004*"free"')] 
13 [(2, '0.015*"benefit" + 0.014*"receive" + 0.013*"social" + 0.011*"security" + 0.011*"save" + 0.010*"member" + 0.007*"age" + 0.006*"people" + 0.005*"provider" + 0.005*"year"'), (0, '0.009*"receive" + 0.009*"member" + 0.007*"save" + 0.006*"people" + 0.006*"benefit" + 0.006*"home" + 0.006*"year" + 0.005*"say" + 0.005*"age" + 0.005*"related"'), (1, '0.009*"enter" + 0.008*"receive" + 0.007*"member" + 0.007*"valid" + 0.007*"save" + 0.006*"room" + 0.005*"benefit" + 0.005*"date" + 0.004*"make" + 0.004*"year"'), (5, '0.020*"valid" + 0.017*"enter" + 0.012*"location" + 0.009*"room" + 0.008*"date" + 0.008*"save" + 0.008*"receive" + 0.008*"member" + 0.007*"benefit" + 0.005*"flight"'), (8, '0.008*"member" + 0.008*"receive" + 0.007*"save" + 0.006*"benefit" + 0.005*"people" + 0.005*"health" + 0.005*"drug" + 0.005*"say" + 0.004*"related" + 0.004*"also"'), (7, '0.010*"receive" + 0.008*"save" + 0.007*"member" + 0.005*"benefit" + 0.005*"make" + 0.004*"related" + 0.004*"volunteering" + 0.004*"car" + 0.004*"say" + 0.004*"provider"'), (11, '0.008*"receive" + 0.008*"say" + 0.008*"member" + 0.007*"save" + 0.007*"benefit" + 0.006*"year" + 0.006*"people" + 0.005*"make" + 0.005*"subscription" + 0.005*"age"'), (12, '0.010*"receive" + 0.008*"benefit" + 0.007*"people" + 0.007*"member" + 0.006*"save" + 0.005*"one" + 0.005*"related" + 0.005*"say" + 0.005*"provider" + 0.004*"age"'), (6, '0.014*"member" + 0.014*"receive" + 0.011*"save" + 0.009*"benefit" + 0.006*"related" + 0.006*"provider" + 0.005*"subscription" + 0.005*"leave" + 0.005*"confirm" + 0.005*"people"'), (3, '0.010*"receive" + 0.008*"benefit" + 0.007*"save" + 0.007*"medicare" + 0.007*"insurance" + 0.006*"member" + 0.006*"people" + 0.005*"health" + 0.005*"get" + 0.005*"volunteering"')] 
14 [(2, '0.009*"member" + 0.009*"receive" + 0.008*"benefit" + 0.008*"medicare" + 0.006*"save" + 0.005*"people" + 0.005*"related" + 0.005*"year" + 0.005*"home" + 0.004*"volunteering"'), (5, '0.009*"receive" + 0.008*"benefit" + 0.008*"member" + 0.007*"save" + 0.005*"year" + 0.004*"day" + 0.004*"one" + 0.004*"subscription" + 0.004*"say" + 0.004*"make"'), (4, '0.008*"save" + 0.007*"receive" + 0.006*"valid" + 0.006*"car" + 0.005*"location" + 0.005*"enter" + 0.005*"member" + 0.005*"benefit" + 0.004*"related" + 0.004*"say"'), (0, '0.008*"receive" + 0.007*"year" + 0.007*"job" + 0.007*"benefit" + 0.006*"say" + 0.006*"people" + 0.006*"member" + 0.005*"make" + 0.004*"work" + 0.004*"one"'), (9, '0.007*"receive" + 0.006*"benefit" + 0.006*"member" + 0.005*"eye" + 0.005*"people" + 0.005*"age" + 0.005*"make" + 0.004*"say" + 0.004*"one" + 0.004*"may"'), (11, '0.033*"valid" + 0.032*"enter" + 0.018*"location" + 0.017*"room" + 0.016*"date" + 0.008*"member" + 0.008*"flight" + 0.007*"save" + 0.007*"receive" + 0.005*"seat"'), (7, '0.009*"member" + 0.007*"benefit" + 0.007*"receive" + 0.007*"save" + 0.005*"one" + 0.004*"people" + 0.004*"make" + 0.004*"insurance" + 0.004*"say" + 0.004*"volunteering"'), (8, '0.008*"receive" + 0.008*"benefit" + 0.007*"save" + 0.007*"age" + 0.006*"people" + 0.006*"social" + 0.005*"member" + 0.004*"make" + 0.004*"one" + 0.004*"also"'), (10, '0.011*"receive" + 0.008*"health" + 0.008*"save" + 0.008*"member" + 0.006*"people" + 0.006*"age" + 0.005*"community" + 0.005*"help" + 0.005*"benefit" + 0.005*"subscription"'), (1, '0.012*"benefit" + 0.011*"receive" + 0.010*"member" + 0.009*"save" + 0.006*"related" + 0.006*"people" + 0.006*"social" + 0.006*"provider" + 0.005*"confirm" + 0.005*"volunteering"')]
...