Я работаю над проектом Twitter-python для начинающих, и я следую за страницей ниже:
https://www.earthdatascience.org/courses/earth-analytics-python/using-apis-natural-language-processing-twitter/calculate-tweet-word-bigrams-networks-in-python/
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import collections
import tweepy as tw
import nltk
from nltk import bigrams
from nltk.corpus import stopwords
import re
import networkx as nx
import warnings
warnings.filterwarnings('ignore')
sns.set(font_scale = 1.5)
sns.set_style('whitegrid')
consumer_key = 'xxx'
consumer_secret = 'xxx'
access_token = 'xxx'
access_token_secret = 'xxx'
auth = tw.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tw.API(auth, wait_on_rate_limit = True)
search_term = 'Ocado -filter:retweets'
tweets = tw.Cursor(api.search,
q = search_term,
lang = 'en',
since = '2020-03-01').items(100)
def remove_url(txt):
"""Replace URLs found in a text string with nothing
(i.e. it will remove the URL from the string).
Parameters
----------
txt: a string
A text string that you want to parse and remove urls.
Returns
-------
The same txt string with URLs removed.
"""
return " ".join(re.sub("([^0-9A-Za-z \t])|(\w+:\/\/\S+)", "", txt).split())
# Remove URLs
tweets_no_urls = [remove_url(tweet.text) for tweet in tweets]
# Create a sublist of lower case words for each tweet
words_in_tweet = [tweet.lower().split() for tweet in tweets_no_urls]
# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
# Remove stop words from each tweet list of words
tweets_nsw = [[word for word in tweet_words if not word in stop_words]
for tweet_words in words_in_tweet]
# Remove collection words
collection_words = ['ocado', 'delivery', 'change']
tweets_nsw_nc = [[w for w in word if not w in collection_words]
for word in tweets_nsw]
#Create a list of lists containing bigrams in tweets
terms_bigram = [list(bigrams(tweet)) for tweet in tweets_nsw_nc]
#View bigrams from the first tweet
terms_bigram[0]
Проблема происходит с Bigrams:
Я вижу ошибку:
* IndexError Traceback (последний последний вызов) в 3 4 # Просмотр биграмм из первого твита ----> 5 term_bigram [0]
IndexError: список индексов вне диапазона *
Я не могу этого понять. Может ли кто-нибудь помочь мне понять, почему индекс списка выходит за пределы допустимого диапазона, когда я пытаюсь выполнить условия term_bigrams?