Как предварительно обработать текст для NLP (привести к нижнему регистру, удалить специальные символы, цифры, адреса электронной почты и т. д.) за один проход с помощью Python?
Here are all the things I want to do to a Pandas dataframe in one pass in Python:
1. Lowercase text
2. Remove whitespace
3. Remove numbers
4. Remove special characters
5. Remove emails
6. Remove stop words
7. Remove NaN
8. Remove weblinks
9. Expand contractions (if possible; not necessary)
10. Tokenize
Вот как я сейчас выполняю каждый из этих шагов по отдельности:
def preprocess(self, dataframe):
    """Run every cleaning step in sequence and return the tokenized result.

    The steps are applied in this order: drop missing values, lowercase,
    normalise whitespace, remove emails, remove web links, remove special
    characters, remove numbers, remove stop words, tokenize.

    Args:
        dataframe: The text data to clean. The helper methods use the
            ``.str`` accessor, so this is presumably a pandas Series of
            strings -- confirm against the caller.

    Returns:
        Whatever ``self.tokenize`` returns for the fully cleaned data.
    """
    self.log.info("In preprocess function.")
    cleaned = self.remove_nan(dataframe)
    cleaned = self.lowercase(cleaned)
    cleaned = self.remove_whitespace(cleaned)
    # Remove emails and websites before removing special characters:
    # their patterns rely on characters such as '@', ':' and '/'.
    cleaned = self.remove_emails(cleaned)
    cleaned = self.remove_website_links(cleaned)
    cleaned = self.remove_special_characters(cleaned)
    cleaned = self.remove_numbers(cleaned)
    # remove_stop_words may not return anything yet; in that case keep
    # the data produced by the previous step instead of passing None on.
    without_stop_words = self.remove_stop_words(cleaned)
    if without_stop_words is not None:
        cleaned = without_stop_words
    tokenized = self.tokenize(cleaned)
    self.log.info(f"Sample of preprocessed data: {tokenized.head()}")
    return tokenized
def remove_nan(self, dataframe):
    """Return ``dataframe.dropna()``, i.e. the data without missing entries."""
    without_missing = dataframe.dropna()
    return without_missing
def lowercase(self, dataframe):
    """Return the text data converted to lowercase.

    Logs through ``self.log`` like the sibling cleaning methods and uses the
    vectorised ``.str.lower()`` accessor instead of a per-element lambda.

    Args:
        dataframe: Text data exposing the pandas ``.str`` accessor
            (presumably a Series of strings -- confirm against the caller).

    Returns:
        A new object of the same shape with every string lowercased.
    """
    self.log.info("Converting dataframe to lowercase")
    return dataframe.str.lower()
def remove_special_characters(self, dataframe):
    """Delete every run of characters other than ASCII letters, digits and spaces."""
    self.log.info("Removing special characters from dataframe")
    return dataframe.replace(to_replace=r'[^A-Za-z0-9 ]+', value='', regex=True)
def remove_numbers(self, dataframe):
    """Return the text data with every run of digits removed.

    Args:
        dataframe: Text data exposing the pandas ``.str`` accessor
            (presumably a Series of strings -- confirm against the caller).

    Returns:
        A new object of the same shape with all digits stripped out.
    """
    self.log.info("Removing numbers from dataframe")
    # regex=True is required: recent pandas versions treat the pattern as a
    # literal string by default, which would leave the digits untouched.
    return dataframe.str.replace(r'\d+', '', regex=True)
def remove_whitespace(self, dataframe):
    """Collapse repeated whitespace and trim both ends of every string.

    Args:
        dataframe: Text data exposing the pandas ``.str`` accessor
            (presumably a Series of strings -- confirm against the caller).

    Returns:
        A new object of the same shape where each run of two or more
        whitespace characters is a single space and leading/trailing
        whitespace is removed.
    """
    self.log.info("Removing whitespace from dataframe")
    # Replace runs of two or more whitespace characters with one space.
    # regex=True is required: recent pandas versions treat the pattern as a
    # literal string by default.
    merged_spaces = dataframe.str.replace(r"\s\s+", ' ', regex=True)
    # Delete leading and trailing whitespace. The .str accessor is used on
    # the Series itself; individual elements are plain str and have no .str.
    return merged_spaces.str.strip()
def remove_stop_words(self, dataframe, stop_words=None):
    """Return the text data with stop words removed from every string.

    Each string is split on whitespace, words found in the stop-word
    collection are dropped, and the remaining words are re-joined with a
    single space. Matching is case-sensitive, so lowercase the text first
    when the stop words are lowercase.

    Args:
        dataframe: Text data whose elements are strings (presumably a
            pandas Series -- confirm against the caller).
        stop_words: Optional iterable of words to drop. When omitted, the
            English list from ``stopwords.words('english')`` is used.

    Returns:
        A new object of the same shape with the stop words removed.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup set once so each membership test is O(1).
    blocked = frozenset(stop_words)

    def _drop_blocked(text):
        return " ".join(word for word in text.split() if word not in blocked)

    return dataframe.apply(_drop_blocked)
def remove_website_links(self, dataframe):
    """Return the text data with ``http...`` links removed.

    Every token that starts with ``http`` (which also covers ``https``) is
    deleted up to the next whitespace character.

    Args:
        dataframe: Text data exposing the pandas ``.str`` accessor
            (presumably a Series of strings -- confirm against the caller).

    Returns:
        A new object of the same shape with the links stripped out.
    """
    self.log.info("Removing website links from dataframe")
    # regex=True is required: recent pandas versions treat the pattern as a
    # literal string by default, which would never match a real link.
    return dataframe.str.replace(r"http\S+", "", regex=True)
def tokenize(self, dataframe):
    """Apply ``word_tokenize`` to every entry and return the result."""
    return dataframe.apply(word_tokenize)
def remove_emails(self, dataframe):
    """Return the text data with email-like tokens removed.

    Any whitespace-delimited token containing ``@`` is deleted, together
    with one whitespace character that follows it, if present.

    Args:
        dataframe: Text data exposing the pandas ``.str`` accessor
            (presumably a Series of strings -- confirm against the caller).

    Returns:
        A new object of the same shape with the email tokens stripped out.
    """
    # The replacement string must be passed explicitly, and regex=True is
    # required because recent pandas versions default to literal matching.
    return dataframe.str.replace(r"\S*@\S*\s?", "", regex=True)
def expand_contractions(self, dataframe):
    """Placeholder for contraction expansion; currently returns the input as-is."""
    # TODO: Low priority for now -- implement the actual expansion later.
    unchanged = dataframe
    return unchanged