Я пытаюсь повторить работу из урока «Мешок слов со словами — Часть 2: векторы слов» и нахожусь на этапе, который включает в себя «токенизацию». Получаю ошибку кодирования, хотя при загрузке набора данных я уже установил кодировку utf-8. Я уже пробовал это решение: ошибка кодирования.
import nltk.data
# Load NLTK's pre-trained Punkt sentence tokenizer for English
# (requires the 'punkt' data package: nltk.download('punkt')).
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
def review_to_sentences( review, remove_stopwords=False ):
    """Split a raw review into sentences, each tokenized into a word list.

    Uses the module-level Punkt ``tokenizer`` to break the review into
    sentences, then delegates per-sentence word cleanup to
    ``review_to_wordlist``.

    Parameters
    ----------
    review : str
        Raw review text. Must already be decoded text (``str``); if the
        dataset yields ``bytes``, decode with utf-8 *before* calling.
    remove_stopwords : bool, optional
        Forwarded to ``review_to_wordlist``; drop English stopwords
        when True.

    Returns
    -------
    list[list[str]]
        One word-list per non-empty sentence.
    """
    # 1. Split the paragraph into sentences with the NLTK tokenizer.
    raw_sentences = tokenizer.tokenize(review.strip())
    # 2. Tokenize each non-empty sentence into a list of words.
    return [review_to_wordlist(raw_sentence, remove_stopwords)
            for raw_sentence in raw_sentences
            if len(raw_sentence) > 0]
# Build the training corpus: one word-list per sentence across all reviews.
sentences = []  # accumulator for per-sentence word lists
print ("Parsing sentences from training set")
for review in train["review"]:
    # extend() flattens each review's list of sentences into the corpus
    sentences.extend(review_to_sentences(review))
Это ошибка, которую я получаю
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-60-9241f5864c9f> in <module>()
2 print ("Parsing sentences from training set")
3 for review in train["review"]:
----> 4 sentences += review_to_sentences(review )
<ipython-input-59-798565b6a57a> in review_to_sentences(review, remove_stopwords)
21 if len(raw_sentence) > 0:
22 # Otherwise, call review_to_wordlist to get a list of words
---> 23 sentences.append( review_to_wordlist( raw_sentence,remove_stopwords ))
24 #
25 # Return the list of sentences (each sentence is a list of words,
<ipython-input-58-edff2925e1af> in review_to_wordlist(review, remove_stopwords)
9 #
10 # 1. Remove HTML
---> 11 review_text = BeautifulSoup(review).get_text()
12 #
13 # 2. Remove non-letters
~/anaconda3/envs/Py35/lib/python3.5/site-packages/bs4/__init__.py in __init__(self, markup, features, builder, parse_only, from_encoding, exclude_encodings, **kwargs)
280 self.reset()
281 try:
--> 282 self._feed()
283 break
284 except ParserRejectedMarkup:
~/anaconda3/envs/Py35/lib/python3.5/site-packages/bs4/__init__.py in _feed(self)
341 self.builder.reset()
342
--> 343 self.builder.feed(self.markup)
344 # Close out any unfinished strings and close all the open tags.
345 self.endData()
~/anaconda3/envs/Py35/lib/python3.5/site-packages/bs4/builder/_html5lib.py in feed(self, markup)
70 else:
71 extra_kwargs['encoding'] = self.user_specified_encoding
---> 72 doc = parser.parse(markup, **extra_kwargs)
73
74 # Set the character encoding detected by the tokenizer.
~/anaconda3/envs/Py35/lib/python3.5/site-packages/html5lib/html5parser.py in parse(self, stream, *args, **kwargs)
287 "diffuseconstant": "diffuseConstant",
288 "edgemode": "edgeMode",
--> 289 "externalresourcesrequired": "externalResourcesRequired",
290 "filterres": "filterRes",
291 "filterunits": "filterUnits",
~/anaconda3/envs/Py35/lib/python3.5/site-packages/html5lib/html5parser.py in _parse(self, stream, innerHTML, container, scripting, **kwargs)
128
129 self.beforeRCDataPhase = None
--> 130
131 self.framesetOK = True
132
~/anaconda3/envs/Py35/lib/python3.5/site-packages/html5lib/_tokenizer.py in __init__(self, stream, parser, **kwargs)
~/anaconda3/envs/Py35/lib/python3.5/site-packages/html5lib/_inputstream.py in HTMLInputStream(source, **kwargs)
TypeError: __init__() got an unexpected keyword argument 'encoding'