Below is the process I use to find the most common words per day in tweets from three days of Twitter data I collected. As you can see, I simply repeat the same code for each day. It works and gets the job done, but I'm fairly new to programming and I feel there must be a much cleaner way to do this than repeating the code three times.
Any help on how to automate this and avoid the repeated code would be appreciated (I've put a rough sketch of what I'm imagining at the end).
# imports used below; the day1/day2/day3 dataframes and the stopword lists
# are built earlier in my script
import itertools
import nltk
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize

# strip grouped tweets by day into lists
day1tweets = day1['text'].to_list()
day2tweets = day2['text'].to_list()
day3tweets = day3['text'].to_list()
############# Day 1
# tokenizing tweet text data
day1tweet = [word_tokenize(i) for i in day1tweets]
# flatten the per-tweet token lists into one list of words
day1tweet = list(itertools.chain(*day1tweet))
#lowercase
day1tweet = [word.lower() for word in day1tweet]
# remove stopwords
day1tweet = [word for word in day1tweet if word not in default_stopwords]
day1tweet = [word for word in day1tweet if word not in custom_stopwords]
#removing 1 letter words and nonetype
day1tweet = [word for word in day1tweet if len(word) > 1]
# obtaining word frequencies, then converting to a dataframe of most common words
d1dist = nltk.FreqDist(day1tweet)
top20d1 = []
for word, frequency in d1dist.most_common(20):
    top20d1.append(u'{} {}'.format(word, frequency))
top20d1 = pd.DataFrame({'Name': top20d1})
top20d1 = top20d1.Name.str.split(expand=True)
top20d1.columns = ['words', 'frequency']
top20d1['frequency'] = top20d1['frequency'].astype(int)
# Create horizontal bars
plt.barh(top20d1['words'], top20d1['frequency'], color= '#005EB8')
plt.title('Most common words in tweets, day 1')
plt.ylabel('Top 20 words')
plt.xlabel('Frequency')
plt.gca().invert_yaxis()
plt.gca().set_xlim([1000,23000])
plt.gcf().set_size_inches(10, 8)
############# Day 2
# tokenizing tweet text data
day2tweet = [word_tokenize(i) for i in day2tweets]
# flatten the per-tweet token lists into one list of words
day2tweet = list(itertools.chain(*day2tweet))
#lowercase
day2tweet = [word.lower() for word in day2tweet]
# remove stopwords
day2tweet = [word for word in day2tweet if word not in default_stopwords]
day2tweet = [word for word in day2tweet if word not in custom_stopwords]
#removing 1 letter words and nonetype
day2tweet = [word for word in day2tweet if len(word) > 1]
# obtaining word frequencies, then converting to a dataframe of most common words
d2dist = nltk.FreqDist(day2tweet)
day2tweet = []
for word, frequency in d2dist.most_common(20):
    day2tweet.append(u'{} {}'.format(word, frequency))
day2tweet = pd.DataFrame({'Name': day2tweet})
day2tweet = day2tweet.Name.str.split(expand=True)
day2tweet.columns = ['words', 'frequency']
day2tweet['frequency'] = day2tweet['frequency'].astype(int)
# Create horizontal bars
plt.barh(day2tweet['words'], day2tweet['frequency'], color= '#005EB8')
plt.title('Most common words in tweets, day 2')
plt.ylabel('Top 20 words')
plt.xlabel('Frequency')
plt.gca().invert_yaxis()
plt.gca().set_xlim([1000,23000])
plt.gcf().set_size_inches(10, 8)
############# Day 3
# tokenizing tweet text data
day3tweet = [word_tokenize(i) for i in day3tweets]
# flatten the per-tweet token lists into one list of words
day3tweet = list(itertools.chain(*day3tweet))
#lowercase
day3tweet = [word.lower() for word in day3tweet]
# remove stopwords
day3tweet = [word for word in day3tweet if word not in default_stopwords]
day3tweet = [word for word in day3tweet if word not in custom_stopwords]
#removing 1 letter words and nonetype
day3tweet = [word for word in day3tweet if len(word) > 1]
# obtaining word frequencies, then converting to a dataframe of most common words
d3dist = nltk.FreqDist(day3tweet)
day3tweet = []
for word, frequency in d3dist.most_common(20):
    day3tweet.append(u'{} {}'.format(word, frequency))
day3tweet = pd.DataFrame({'Name': day3tweet})
day3tweet = day3tweet.Name.str.split(expand=True)
day3tweet.columns = ['words', 'frequency']
day3tweet['frequency'] = day3tweet['frequency'].astype(int)
# Create horizontal bars
plt.barh(day3tweet['words'], day3tweet['frequency'], color= '#005EB8')
plt.title('Most common words in tweets, day 3')
plt.ylabel('Top 20 words')
plt.xlabel('Frequency')
plt.gca().invert_yaxis()
#plt.gca().set_xlim([1000,34000])
plt.gcf().set_size_inches(10, 8)
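What I have in mind (but haven't managed to write properly) is wrapping the per-day steps into a function and looping over the days, roughly like the sketch below. It is untested, top_words_per_day is just a placeholder name, and it reuses the same day1/day2/day3 dataframes and stopword lists as above.

# rough sketch: one helper that does the tokenize/clean/count steps for a single day
def top_words_per_day(day_df, n=20):
    # tokenize every tweet and flatten into one list of lowercase words
    tokens = [word_tokenize(text) for text in day_df['text'].to_list()]
    words = [word.lower() for word in itertools.chain(*tokens)]
    # drop stopwords and one-letter tokens
    words = [w for w in words
             if w not in default_stopwords
             and w not in custom_stopwords
             and len(w) > 1]
    # count frequencies and keep the n most common as a dataframe
    dist = nltk.FreqDist(words)
    return pd.DataFrame(dist.most_common(n), columns=['words', 'frequency'])

# then loop over the three days and draw one chart per day
for i, day_df in enumerate([day1, day2, day3], start=1):
    top20 = top_words_per_day(day_df)
    plt.figure(figsize=(10, 8))
    plt.barh(top20['words'], top20['frequency'], color='#005EB8')
    plt.title('Most common words in tweets, day {}'.format(i))
    plt.ylabel('Top 20 words')
    plt.xlabel('Frequency')
    plt.gca().invert_yaxis()
    plt.show()

Is this roughly the right direction, or is there a more idiomatic pandas/nltk way to do it?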