Я работаю над классификационным текстом с 3 функциями (название, автор, текст). мои данные - 18285 строк, после очистки и помещения каждого вывода в корпус
# title_coups
title = []
for i in range(0,len(data)):
review = re.sub('[^a-zA-Z0-9]', ' ', data['title'][i])
review = review.lower()
review = review.split()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')
ps = PorterStemmer()
review = [ps.stem(word) for word in review if not word in
set(all_stopwords)]
review = ' '.join(review)
title.append(review)
#body_corpus
body = []
for i in range(0,len(data)):
review = re.sub('[^a-zA-Z0-9]', ' ', data['text'][i])
review = review.lower()
review = review.split()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')
ps = PorterStemmer()
review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
review = ' '.join(review)
body.append(review)
#author_corpus
author = []
for i in range(0,len(data)):
review = re.sub('[^a-zA-Z]', ' ', data['author'][i])
review = re.sub(r"and",'',review,flags=re.I)
review = review.lower()
review = review.split()
review = ' '.join(review)
author.append(review)
у меня проблема с TfidfVectorizer при лечении body_corpus
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(body)
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
df_body= pd.DataFrame(denselist, columns=feature_names)
the consul show me :
MemoryError Traceback (последний вызов последний)