Ошибка типа: ожидаемая строка или буфер в списке стоп-слов - PullRequest
0 голосов
/ 20 марта 2020

Я хочу стереть несколько слов, которых нет в списке стоп-слов. Я сделал словарь stopwords_id.txt из слов, которые хочу стереть. У меня есть 74 pdf файла для преобразования в txt и после этого сохраните его как txt.

def preProcessing(textPdf):
        textPdfPre = []
        for i in range(len(textPdf)):

            text = textPdf[i][textPdf[i].index('Uraian')+7:]

            #remove punctuations
            text = re.sub(r'[^\w]|_',' ',text)

            #remove digit from string
            text = re.sub(r"\S*\d\S*", "", text).strip()

            #remove digit or numbers
            text = re.sub(r"\b\d+\b", " ", text)

            #remove word with less than 3 alphabets
            text = re.sub(r'\W*\b\w{1,3}\b', ' ', text)

            #Remove additional white spaces
            text = re.sub(r'[\s]+', ' ', text) 

            #to lower
            textPdfPre.append(text.lower())

        return textPdfPre

    def removeStopword(textPdfPre):
        stop_words = set(stopwords.words('stopwords_id.txt'))
        word_tokens = word_tokenize(textPdfPre)
        filtered_sentence = [w for w in word_tokens if not w in stop_words]

        return ' '.join(filtered_sentence)

    def savePDFPreToTxt(daftarFile, filtered_sentence):

        pathTxt = '/home/a1-re/Documents/skripsi/txt2'

        if not exists(pathTxt):
            makedirs(pathTxt)

        for i in range(len(textPdfPre)):
            namaFileExt = basename(daftarFile[i])
            namaFile = splitext(namaFileExt)[0]

            file = open(pathTxt + '/'+ namaFile + '.txt','w') 
            file.write(textPdfPre[i])
            file.close() 

    pathFilePdf = '/home/a1-re/Documents/skripsi/fk_epps'

    daftarFile = []
    for r, d, f in walk(pathFilePdf):
        for file in f:
            if isfile(join(r, file)):
                daftarFile.append(join(r, file))

    textPdf = bacaFilePdf(daftarFile)
    textPdfPre = preProcessing(textPdf)
    filtered_sentence = removeStopword(textPdfPre)
    savePDFPreToTxt(daftarFile, filtered_sentence)

, но я нашел ошибку в этой области, и я не знаю, как ее решить.

TypeError                                 Traceback (most recent call last)
<ipython-input-85-f5ba61006e2f> in <module>()
      9 textPdf = bacaFilePdf(daftarFile)
     10 textPdfPre = preProcessing(textPdf)
---> 11 filtered_sentence = removeStopword(textPdfPre)
     12 savePDFPreToTxt(daftarFile, filtered_sentence)

<ipython-input-83-8d43b1980fe6> in removeStopword(textPdfPre)
     27 def removeStopword(textPdfPre):
     28     stop_words = set(stopwords.words('stopwords_id.txt'))
---> 29     word_tokens = word_tokenize(textPdfPre)
     30     filtered_sentence = [w for w in word_tokens if not w in stop_words]
     31 

/home/a1-re/.local/lib/python2.7/site-packages/nltk/tokenize/__init__.pyc in word_tokenize(text, language, preserve_line)
    142     :type preserve_line: bool
    143     """
--> 144     sentences = [text] if preserve_line else sent_tokenize(text, language)
    145     return [
    146         token for sent in sentences for token in _treebank_word_tokenizer.tokenize(sent)

/home/a1-re/.local/lib/python2.7/site-packages/nltk/tokenize/__init__.pyc in sent_tokenize(text, language)
    104     """
    105     tokenizer = load('tokenizers/punkt/{0}.pickle'.format(language))
--> 106     return tokenizer.tokenize(text)
    107 
    108 

/home/a1-re/.local/lib/python2.7/site-packages/nltk/tokenize/punkt.pyc in tokenize(self, text, realign_boundaries)
   1275         Given a text, returns a list of the sentences in that text.
   1276         """
-> 1277         return list(self.sentences_from_text(text, realign_boundaries))
   1278 
   1279     def debug_decisions(self, text):

/home/a1-re/.local/lib/python2.7/site-packages/nltk/tokenize/punkt.pyc in sentences_from_text(self, text, realign_boundaries)
   1329         follows the period.
   1330         """
-> 1331         return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
   1332 
   1333     def _slices_from_text(self, text):

/home/a1-re/.local/lib/python2.7/site-packages/nltk/tokenize/punkt.pyc in span_tokenize(self, text, realign_boundaries)
   1319         if realign_boundaries:
   1320             slices = self._realign_boundaries(text, slices)
-> 1321         for sl in slices:
   1322             yield (sl.start, sl.stop)
   1323 


TypeError: expected string or buffer

Есть идеи, почему существует эта ошибка типа? Я думаю, что в removeStopwords что-то не так.

...