I am trying to tokenize sentences and words using nltk. I can tokenize the sentences, but not the words.
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
df = pd.read_excel('sampleCases.xlsx',encoding = 'UTF')
df['Desc_sent_tokenized'] = df['Description'].apply(sent_tokenize)
df['Desc_words_tokenized'] = df['Desc_sent_tokenized'].apply(word_tokenize)
I am getting the following error:
~/anaconda3/lib/python3.7/site-packages/pandas/core/series.py in apply(self, func, convert_dtype, args, **kwds)
3192 else:
3193 values = self.astype(object).values
-> 3194 mapped = lib.map_infer(values, f, convert=convert_dtype)
3195
3196 if len(mapped) and isinstance(mapped[0], Series):
pandas/_libs/src/inference.pyx in pandas._libs.lib.map_infer()
~/anaconda3/lib/python3.7/site-packages/nltk/tokenize/__init__.py in word_tokenize(text, language, preserve_line)
141 :type preserve_line: bool
142 """
--> 143 sentences = [text] if preserve_line else sent_tokenize(text, language)
144 return [
145 token for sent in sentences for token in _treebank_word_tokenizer.tokenize(sent)
~/anaconda3/lib/python3.7/site-packages/nltk/tokenize/__init__.py in sent_tokenize(text, language)
103 """
104 tokenizer = load('tokenizers/punkt/{0}.pickle'.format(language))
--> 105 return tokenizer.tokenize(text)
106
107
~/anaconda3/lib/python3.7/site-packages/nltk/tokenize/punkt.py in tokenize(self, text, realign_boundaries)
1267 Given a text, returns a list of the sentences in that text.
1268 """
-> 1269 return list(self.sentences_from_text(text, realign_boundaries))
1270
1271 def debug_decisions(self, text):
~/anaconda3/lib/python3.7/site-packages/nltk/tokenize/punkt.py in sentences_from_text(self, text, realign_boundaries)
1321 follows the period.
1322 """
-> 1323 return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
1324
1325 def _slices_from_text(self, text):
~/anaconda3/lib/python3.7/site-packages/nltk/tokenize/punkt.py in <listcomp>(.0)
1321 follows the period.
1322 """
-> 1323 return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
1324
1325 def _slices_from_text(self, text):
~/anaconda3/lib/python3.7/site-packages/nltk/tokenize/punkt.py in span_tokenize(self, text, realign_boundaries)
1311 if realign_boundaries:
1312 slices = self._realign_boundaries(text, slices)
-> 1313 for sl in slices:
1314 yield (sl.start, sl.stop)
1315
~/anaconda3/lib/python3.7/site-packages/nltk/tokenize/punkt.py in _realign_boundaries(self, text, slices)
1352 """
1353 realign = 0
-> 1354 for sl1, sl2 in _pair_iter(slices):
1355 sl1 = slice(sl1.start + realign, sl1.stop)
1356 if not sl2:
~/anaconda3/lib/python3.7/site-packages/nltk/tokenize/punkt.py in _pair_iter(it)
315 """
316 it = iter(it)
--> 317 prev = next(it)
318 for el in it:
319 yield (prev, el)
~/anaconda3/lib/python3.7/site-packages/nltk/tokenize/punkt.py in _slices_from_text(self, text)
1325 def _slices_from_text(self, text):
1326 last_break = 0
-> 1327 for match in self._lang_vars.period_context_re().finditer(text):
1328 context = match.group() + match.group('after_tok')
1329 if self.text_contains_sentbreak(context):
TypeError: expected string or bytes-like object
df['Desc_sent_tokenized'].dtype
Out[10]: dtype('O')
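As far as I understand, dtype('O') just means the column holds arbitrary Python objects; here each cell is the list of sentences returned by sent_tokenize, while word_tokenize expects a string, which I assume is what triggers the TypeError. A minimal sketch of what I think is happening (the sample sentence is my own, not from the data):

from nltk.tokenize import sent_tokenize, word_tokenize

sents = sent_tokenize("Based on the analysis done so far there are few logs.")
print(sents)          # ['Based on the analysis done so far there are few logs.']
word_tokenize(sents)  # TypeError: expected string or bytes-like object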
I tried using
df['Desc_str_sent_tokenized'] = df['Desc_sent_tokenized'].astype(str)
df['Desc_words_tokenized']=df['Desc_str_sent_tokenized'].apply(word_tokenize)
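I suspect astype(str) just stores each list's repr, so word_tokenize then treats the brackets and quotes as part of the text, roughly like this:

s = str(['Based on the analysis done so far.'])
print(s)  # ['Based on the analysis done so far.']
# word_tokenize(s) then emits tokens such as '[' alongside the actual words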
But I get output like this:
<bound method NDFrame.head of 0 [[,'Based, on, the, analysis, done, so, far, there, are, few, logs, indicating, towards, network, issue, .,'
Name: Desc_words_tokenized, dtype: object>
However, I expect output like this:
['Based', 'on', 'the', 'analysis', 'done', 'so', 'far', 'there', 'are', 'few', 'logs', 'indicating', 'towards', 'network', 'issue']
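Is applying word_tokenize directly to the original string column the right way to get that flat word list? Something like this sketch (assuming each 'Description' cell is a plain string; the column names are just my own):

# flat word list per row, tokenizing the original string directly
df['Desc_words_tokenized'] = df['Description'].apply(word_tokenize)

# or, keeping the sentence split, one word list per sentence
df['Desc_words_per_sent'] = df['Desc_sent_tokenized'].apply(
    lambda sents: [word_tokenize(s) for s in sents]
)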