def generate_tokens(sentence_list):
""" This function returns a list of lists of
clean tokens"""
final_tokens = []
for sentence in sentence_list:
new_text = re.sub("[^A-Za-z]"," ", sentence)
tokens = new_text.lower().split()
tokens = [el for el in tokens if el not in sw]
if tokens!=[]:
final_tokens.append(tokens[0])
return final_tokens
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
sw = nltk.corpus.stopwords.words('english')
df["sentences"] = df.Content.astype(str).str.split()
df["clean_tokens"] = df.sentences.apply(generate_tokens)
total = []
df.head()
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df["clean_tokens"])
print(X.toarray())
Как я могу это исправить?
AttributeError Traceback (most recent call last)
<ipython-input-101-3d88bc2d8f98> in <module>
2 vectorizer = CountVectorizer()
3
----> 4 X = vectorizer.fit_transform(df["clean_tokens"])
5
6 print(X.toarray())
~\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in fit_transform(self, raw_documents, y)
1056
1057 vocabulary, X = self._count_vocab(raw_documents,
-> 1058 self.fixed_vocabulary_)
1059
1060 if self.binary:
~\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in _count_vocab(self, raw_documents, fixed_vocab)
968 for doc in raw_documents:
969 feature_counter = {}
--> 970 for feature in analyze(doc):
971 try:
972 feature_idx = vocabulary[feature]
~\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in <lambda>(doc)
350 tokenize)
351 return lambda doc: self._word_ngrams(
--> 352 tokenize(preprocess(self.decode(doc))), stop_words)
353
354 else:
~\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in <lambda>(x)
254
255 if self.lowercase:
--> 256 return lambda x: strip_accents(x.lower())
257 else:
258 return strip_accents
AttributeError: 'list' object has no attribute 'lower'