Я хочу стереть несколько слов, которых нет в списке стоп-слов. Я сделал словарь stopwords_id.txt из слов, которые хочу стереть. У меня есть 74 pdf файла для преобразования в txt и после этого сохраните его как txt.
def preProcessing(textPdf):
textPdfPre = []
for i in range(len(textPdf)):
text = textPdf[i][textPdf[i].index('Uraian')+7:]
#remove punctuations
text = re.sub(r'[^\w]|_',' ',text)
#remove digit from string
text = re.sub(r"\S*\d\S*", "", text).strip()
#remove digit or numbers
text = re.sub(r"\b\d+\b", " ", text)
#remove word with less than 3 alphabets
text = re.sub(r'\W*\b\w{1,3}\b', ' ', text)
#Remove additional white spaces
text = re.sub(r'[\s]+', ' ', text)
#to lower
textPdfPre.append(text.lower())
return textPdfPre
def removeStopword(textPdfPre):
stop_words = set(stopwords.words('stopwords_id.txt'))
word_tokens = word_tokenize(textPdfPre)
filtered_sentence = [w for w in word_tokens if not w in stop_words]
return ' '.join(filtered_sentence)
def savePDFPreToTxt(daftarFile, filtered_sentence):
pathTxt = '/home/a1-re/Documents/skripsi/txt2'
if not exists(pathTxt):
makedirs(pathTxt)
for i in range(len(textPdfPre)):
namaFileExt = basename(daftarFile[i])
namaFile = splitext(namaFileExt)[0]
file = open(pathTxt + '/'+ namaFile + '.txt','w')
file.write(textPdfPre[i])
file.close()
pathFilePdf = '/home/a1-re/Documents/skripsi/fk_epps'
daftarFile = []
for r, d, f in walk(pathFilePdf):
for file in f:
if isfile(join(r, file)):
daftarFile.append(join(r, file))
textPdf = bacaFilePdf(daftarFile)
textPdfPre = preProcessing(textPdf)
filtered_sentence = removeStopword(textPdfPre)
savePDFPreToTxt(daftarFile, filtered_sentence)
, но я нашел ошибку в этой области, и я не знаю, как ее решить.
TypeError Traceback (most recent call last)
<ipython-input-85-f5ba61006e2f> in <module>()
9 textPdf = bacaFilePdf(daftarFile)
10 textPdfPre = preProcessing(textPdf)
---> 11 filtered_sentence = removeStopword(textPdfPre)
12 savePDFPreToTxt(daftarFile, filtered_sentence)
<ipython-input-83-8d43b1980fe6> in removeStopword(textPdfPre)
27 def removeStopword(textPdfPre):
28 stop_words = set(stopwords.words('stopwords_id.txt'))
---> 29 word_tokens = word_tokenize(textPdfPre)
30 filtered_sentence = [w for w in word_tokens if not w in stop_words]
31
/home/a1-re/.local/lib/python2.7/site-packages/nltk/tokenize/__init__.pyc in word_tokenize(text, language, preserve_line)
142 :type preserve_line: bool
143 """
--> 144 sentences = [text] if preserve_line else sent_tokenize(text, language)
145 return [
146 token for sent in sentences for token in _treebank_word_tokenizer.tokenize(sent)
/home/a1-re/.local/lib/python2.7/site-packages/nltk/tokenize/__init__.pyc in sent_tokenize(text, language)
104 """
105 tokenizer = load('tokenizers/punkt/{0}.pickle'.format(language))
--> 106 return tokenizer.tokenize(text)
107
108
/home/a1-re/.local/lib/python2.7/site-packages/nltk/tokenize/punkt.pyc in tokenize(self, text, realign_boundaries)
1275 Given a text, returns a list of the sentences in that text.
1276 """
-> 1277 return list(self.sentences_from_text(text, realign_boundaries))
1278
1279 def debug_decisions(self, text):
/home/a1-re/.local/lib/python2.7/site-packages/nltk/tokenize/punkt.pyc in sentences_from_text(self, text, realign_boundaries)
1329 follows the period.
1330 """
-> 1331 return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
1332
1333 def _slices_from_text(self, text):
/home/a1-re/.local/lib/python2.7/site-packages/nltk/tokenize/punkt.pyc in span_tokenize(self, text, realign_boundaries)
1319 if realign_boundaries:
1320 slices = self._realign_boundaries(text, slices)
-> 1321 for sl in slices:
1322 yield (sl.start, sl.stop)
1323
TypeError: expected string or buffer
Есть идеи, почему существует эта ошибка типа? Я думаю, что в removeStopwords что-то не так.