Вы можете создать трансформатор sklearn, который сделает все это за вас.
Например,
import re
from sklearn.base import TransformerMixin
class TextCleaner(TransformerMixin):
    """Configurable text-cleaning transformer for a pandas Series of strings.

    Every step is optional and controlled by a constructor flag, so the
    cleaner can be dropped into a scikit-learn ``Pipeline`` in front of a
    vectorizer.

    Parameters
    ----------
    words_only : bool, default False
        If true, only collapse runs of non-word characters into a single
        space. If false, the emoticons found in the raw text are additionally
        appended (concatenated without separator) to the cleaned text.
    emoji_normalize : bool, default False
        If true, takes precedence over ``words_only``: emoticons are appended
        space-separated, with ``;`` rewritten to ``:`` and ``-`` removed.
    remove_digits : bool, default True
        Delete the ASCII digits 0-9. This runs after emoticons are appended,
        so an ``8`` used as emoticon eyes is removed as well.
    lower_case : bool, default False
        Lower-case the text.
    stop_words : iterable of str or None, default None
        Whitespace-separated tokens found in this collection are dropped.
    token : bool, default False
        If true, ``transform`` returns lists of tokens instead of strings.
    word_normalize : bool, default False
        Stored on the instance only; ``transform`` does not read it.
    """

    # eyes [nose] mouth | mouth [nose] eyes pattern
    _EMOTICON_RE = re.compile(
        r"(?:[<>]?[:;=8][\-o\*\']?[\)\]\(\[dDpP/\:\}\{@\|\\]|[\)\]\(\[dDpP/\:\}\{@\|\\][\-o\*\']?[:;=8][<>]?)",
        re.VERBOSE | re.I | re.UNICODE,
    )
    _NON_WORD_RE = re.compile(r"[\W]+")
    _DIGIT_TABLE = str.maketrans("", "", "0123456789")

    def __init__(self,
                 words_only=False,
                 emoji_normalize=False,
                 remove_digits=True,
                 lower_case=False,
                 stop_words=None,
                 token=False,
                 word_normalize=False):
        self.words_only = words_only
        self.word_normalize = word_normalize
        self.emoji_normalize = emoji_normalize
        self.remove_digits = remove_digits
        self.lower_case = lower_case
        self.stop_words = stop_words
        self.token = token

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the cleaner has no fitted state."""
        return self

    def transform(self, X):
        """Clean every document in ``X`` according to the configured flags.

        Parameters
        ----------
        X : pandas.Series of str
            Raw documents. The implementation relies on ``Series.apply`` and
            the ``Series.str`` accessor.

        Returns
        -------
        pandas.Series
            Cleaned strings, or lists of tokens when ``token`` is true.
        """
        strip_non_words = self._NON_WORD_RE.sub
        find_emoticons = self._EMOTICON_RE.findall

        # Emoticons must be extracted from the raw text, because stripping
        # non-word characters destroys them. emoji_normalize wins over
        # words_only when both are set.
        if self.emoji_normalize:
            def clean(text):
                faces = ' '.join(find_emoticons(text))
                faces = faces.replace(';', ':').replace('-', '')
                return strip_non_words(' ', text) + faces
        elif self.words_only:
            def clean(text):
                return strip_non_words(' ', text)
        else:
            def clean(text):
                return strip_non_words(' ', text) + ''.join(find_emoticons(text))

        clean_text = X.apply(clean)

        if self.remove_digits:
            digit_table = self._DIGIT_TABLE
            clean_text = clean_text.apply(lambda doc: doc.translate(digit_table))

        if self.lower_case:
            clean_text = clean_text.str.lower()

        if self.stop_words is not None:
            # Build the lookup once; filtering is done per document.
            blocked = frozenset(self.stop_words)
            clean_text = clean_text.apply(
                lambda doc: ' '.join(
                    word for word in doc.split() if word not in blocked))

        if self.token:
            return clean_text.str.split()
        return clean_text
Вы можете связать этот трансформатор с другими трансформаторами и вашим классификатором. Например:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Example: chain the text cleaner with vectorization, TF-IDF weighting,
# feature selection and a classifier.
# `token`, `your_stop_words`, `X` and `y` are placeholders that must be
# defined by the caller before this snippet runs; `token` is presumably a
# user-defined tokenizer callable.
word_vec_para = dict(
    ngram_range=(1, 2),
    tokenizer=token,
    lowercase=True,
    min_df=1)

pipe = Pipeline(steps=[
    ('text_clean', TextCleaner(
        remove_digits=True, lower_case=True, stop_words=your_stop_words)),
    ('word_vec', CountVectorizer(**word_vec_para)),
    ('word_tdf', TfidfTransformer(sublinear_tf=True)),
    # k='all' keeps every feature, so this step performs no selection as is.
    ('fe_se', SelectKBest(chi2, k='all')),
    ('log_reg', LogisticRegression(verbose=1))])

pipe.fit(X, y)