Я использую токенайзер предложений, но как мне удалить ненужные символы \n из вывода?
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import PyPDF2 as p2

# Extract the text of each PDF page, normalize its whitespace (this is the
# fix for the unwanted '\n' fragments that PdfFileReader.extractText()
# scatters through the output), sentence-tokenize, lowercase, drop English
# stopwords, and print the surviving sentences per page.

# Hoisted out of the page loop: the stopword set is invariant.
stop_words = set(stopwords.words('english'))

# 'with' guarantees the file handle is closed even if parsing raises.
with open("Muhammad_CV.pdf", 'rb') as pdf_file:
    pdf_read = p2.PdfFileReader(pdf_file)
    for i in range(pdf_read.numPages):
        page = pdf_read.getPage(i)
        text = page.extractText()  # raw page text, full of stray newlines
        # Collapse every run of whitespace (spaces, tabs, and the unwanted
        # '\n' characters) into a single space: split() with no argument
        # splits on arbitrary whitespace, and ' '.join() reassembles.
        text = " ".join(text.split())
        tokenized = sent_tokenize(text)  # sentence tokens
        all_words = [sentence.lower() for sentence in tokenized]
        # NOTE(review): this compares whole *sentences* against the stopword
        # set, so it only removes sentences that consist of exactly one
        # stopword; use word_tokenize() if per-word filtering is intended.
        filtered = [s for s in all_words if s not in stop_words]
        print(filtered)
Вывод, который я получаю:
{'the specialization includes:\n \n\n \nintroduction\n \nto\n \ndata\n \nscience\n \n\n \nbig\n \ndata\n \n&\n \ncloud\n \ncomputing\n \n\n \ndata\n \nmining\n \n\n \nmachine\n \nlearn\ning'}
Желаемый вывод:
{'the specialization includes: introduction to data science big data cloud\n computing data mining machine learning'}