Как автоматизировать этот повторяющийся процесс в python - PullRequest
0 голосов
/ 20 апреля 2020

Ниже приведён код, который находит наиболее часто встречающиеся слова в твитах за каждый из трёх дней сбора данных в Твиттере. Как видно ниже, я просто повторяю один и тот же код для каждого дня. Он работает и делает своё дело, но я новичок в программировании и чувствую, что есть гораздо более чистый способ, чем повторять код три раза.

Буду благодарен за любую помощь: как автоматизировать это, чтобы избежать повторяющегося кода?

# Pull the 'text' column of each day's frame out into a plain Python list.
day1tweets, day2tweets, day3tweets = (
    frame['text'].to_list() for frame in (day1, day2, day3)
)

############# Day 1
# Tokenize every tweet, flatten the per-tweet token lists into one list,
# and lowercase each token.
day1tweet = [
    token.lower()
    for token in itertools.chain.from_iterable(
        word_tokenize(text) for text in day1tweets
    )
]
# Drop default and custom stopwords as well as one-character tokens.
day1tweet = [
    word for word in day1tweet
    if word not in default_stopwords
    and word not in custom_stopwords
    and len(word) > 1
]

# Count token frequencies and keep the 20 most common as a dataframe.
# The (word, count) pairs are loaded directly with an ordered list of column
# names: labelling columns from a set literal gives no guaranteed order, so
# the word and count columns could end up swapped.
d1dist = nltk.FreqDist(day1tweet)
top20d1 = pd.DataFrame(d1dist.most_common(20), columns=['words', 'frequency'])
top20d1['frequency'] = top20d1['frequency'].astype(int)

# Create horizontal bars on a fresh figure so this plot does not share axes
# with a plot drawn for another day.
plt.figure(figsize=(10, 8))
plt.barh(top20d1['words'], top20d1['frequency'], color='#005EB8')
plt.title('Most common words in user profile descriptions')
plt.ylabel('Top 20 words')
plt.xlabel('Frequency')
# Most frequent word at the top of the chart.
plt.gca().invert_yaxis()
plt.gca().set_xlim([1000, 23000])

############# Day 2
# Tokenize every tweet, flatten the per-tweet token lists into one list,
# and lowercase each token.
day2tweet = [
    token.lower()
    for token in itertools.chain.from_iterable(
        word_tokenize(text) for text in day2tweets
    )
]
# Drop default and custom stopwords as well as one-character tokens.
day2tweet = [
    word for word in day2tweet
    if word not in default_stopwords
    and word not in custom_stopwords
    and len(word) > 1
]

# Count token frequencies and keep the 20 most common as a dataframe.
# The (word, count) pairs are loaded directly with an ordered list of column
# names: labelling columns from a set literal gives no guaranteed order, so
# the word and count columns could end up swapped.
d2dist = nltk.FreqDist(day2tweet)
# NOTE: as in the original script, `day2tweet` is rebound from the token list
# to the resulting dataframe.
day2tweet = pd.DataFrame(d2dist.most_common(20), columns=['words', 'frequency'])
day2tweet['frequency'] = day2tweet['frequency'].astype(int)

# Create horizontal bars on a fresh figure so this plot does not share axes
# with a plot drawn for another day.
plt.figure(figsize=(10, 8))
plt.barh(day2tweet['words'], day2tweet['frequency'], color='#005EB8')
plt.title('Most common words in user profile descriptions')
plt.ylabel('Top 20 words')
plt.xlabel('Frequency')
# Most frequent word at the top of the chart.
plt.gca().invert_yaxis()
plt.gca().set_xlim([1000, 23000])


############# Day 3
# Tokenize every tweet, flatten the per-tweet token lists into one list,
# and lowercase each token.
day3tweet = [
    token.lower()
    for token in itertools.chain.from_iterable(
        word_tokenize(text) for text in day3tweets
    )
]
# Drop default and custom stopwords as well as one-character tokens.
day3tweet = [
    word for word in day3tweet
    if word not in default_stopwords
    and word not in custom_stopwords
    and len(word) > 1
]

# Count token frequencies and keep the 20 most common as a dataframe.
# The (word, count) pairs are loaded directly with an ordered list of column
# names: labelling columns from a set literal gives no guaranteed order, so
# the word and count columns could end up swapped.
d3dist = nltk.FreqDist(day3tweet)
# NOTE: as in the original script, `day3tweet` is rebound from the token list
# to the resulting dataframe.
day3tweet = pd.DataFrame(d3dist.most_common(20), columns=['words', 'frequency'])
day3tweet['frequency'] = day3tweet['frequency'].astype(int)

# Create horizontal bars on a fresh figure so this plot does not share axes
# with a plot drawn for another day.
plt.figure(figsize=(10, 8))
plt.barh(day3tweet['words'], day3tweet['frequency'], color='#005EB8')
plt.title('Most common words in user profile descriptions')
plt.ylabel('Top 20 words')
plt.xlabel('Frequency')
# Most frequent word at the top of the chart.
plt.gca().invert_yaxis()
# No fixed x-axis limits for this day; matplotlib's autoscaling applies.

1 Ответ

0 голосов
/ 20 апреля 2020

Вы можете вынести этот код в функцию, например такую. Тогда вам не придётся писать один и тот же код много раз.

def do_something(tweet, xlim=(1000, 23000)):
    """Plot the 20 most common words found in a list of tweet texts.

    Each text is tokenized with ``word_tokenize``, tokens are lowercased,
    and tokens that appear in ``default_stopwords`` or ``custom_stopwords``
    or that are a single character long are discarded. The remaining tokens
    are counted with ``nltk.FreqDist`` and the top 20 are drawn as a
    horizontal bar chart on a new matplotlib figure.

    Relies on ``word_tokenize``, ``nltk``, ``pd``, ``plt``,
    ``default_stopwords`` and ``custom_stopwords`` being available in the
    enclosing module.

    Args:
        tweet: Iterable of tweet text strings.
        xlim: ``(left, right)`` limits for the frequency axis. Pass ``None``
            to keep matplotlib's automatic limits.
    """
    # One combined set gives a single constant-time membership test per token.
    stop_words = set(default_stopwords) | set(custom_stopwords)

    # Tokenize, flatten and lowercase in one pass.
    tokens = [
        token.lower()
        for token in itertools.chain.from_iterable(
            word_tokenize(text) for text in tweet
        )
    ]
    tokens = [
        token for token in tokens
        if token not in stop_words and len(token) > 1
    ]

    # Load the (word, count) pairs directly with an ordered list of column
    # names. Labelling columns from a set literal gives no guaranteed order,
    # so the word and count columns could end up swapped.
    dist = nltk.FreqDist(tokens)
    top20 = pd.DataFrame(dist.most_common(20), columns=['words', 'frequency'])
    top20['frequency'] = top20['frequency'].astype(int)

    # A new figure per call keeps successive calls from drawing onto the
    # same axes (and from re-inverting an already inverted y-axis).
    plt.figure(figsize=(10, 8))
    plt.barh(top20['words'], top20['frequency'], color='#005EB8')
    plt.title('Most common words in user profile descriptions')
    plt.ylabel('Top 20 words')
    plt.xlabel('Frequency')
    # Most frequent word at the top of the chart.
    plt.gca().invert_yaxis()
    if xlim is not None:
        plt.gca().set_xlim(xlim)

# Convert each day's tweet text column to a list and plot its top words.
# (The calls were previously misspelled `do_somthing`, which raises NameError.)
for day in (day1, day2, day3):
    do_something(day['text'].to_list())
Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...