Я не могу сделать pos-теги и удаление стоп-слов из текстовых документов в коде, который я разработал, получив помощь из интернет-источников. Я новичок в Python и занимаюсь многоклассовой классификацией текста, так что любой, кто может подсказать мне, как я могу делать теги pos и удаление стоп-слов в моем коде. Любая помощь будет высоко оценена. Спасибо
Я пробовал несколько способов использовать nltk для удаления стоп-слов и тегов pos, но не смог.
import pandas as pd
import numpy as np
from nltk import tokenize,PorterStemmer,pos_tag
from sklearn.feature_extraction.text import
CountVectorizer,TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import
confusion_matrix,accuracy_score,classification_report
from sklearn.svm import SVC
from sklearn import model_selection, naive_bayes, svm
from collections import defaultdict
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
df = pd.read_csv('projectdataayes.csv')
df = df[pd.notnull(df['types'])]
my_types = ['Requirement','Non-Requirement']
#converting to lower case
df['description'] = df.description.map(lambda x: x.lower())
#Removing the punctuation
df['description'] = df.description.str.replace('[^\w\s]', '')
#splitting the word into tokens
df['description'] = df['description'].apply(tokenize.word_tokenize)
#Stopwords Removal
STOPWORDS = set(stopwords.words('english'))
df['description'] = [w for w in df['description'] if not w in STOPWORDS]
#stemming
stemmer = PorterStemmer()
df['description'] = df['description'].apply(lambda x:
[stemmer.stem(y) for y in x])
print(df[:10])
## This converts the list of words into space-separated strings
df['description'] = df['description'].apply(lambda x: ' '.join(x))
count_vect = CountVectorizer()
counts = count_vect.fit_transform(df['description'])
#tf-idf
transformer = TfidfTransformer().fit(counts)
counts = transformer.transform(counts)
#splitting the data and training the model
#naives-bayes
X_train, X_test, y_train, y_test = train_test_split(counts,
df['types'], test_size=0.3, random_state=39)
model = MultinomialNB().fit(X_train, y_train)
Ошибка:
File "U:/FAHAD UL HASSAN/Python Code/happycsv.py", line 41, in <module>
df['description'] = [w for w in df['description'] if not w in STOPWORDS]
File "U:/FAHAD UL HASSAN/Python Code/happycsv.py", line 41, in <listcomp>
df['description'] = [w for w in df['description'] if not w in STOPWORDS]
TypeError: unhashable type: 'list'