У меня есть приведённый ниже код для обработки текстовых данных с использованием TF-IDF в Python.
#!/usr/bin/python
# -*- coding: utf-8 -*-
import glob
# Merge every text file under Text/ into a single corpus file. The matches are
# sorted so the merge order is reproducible (glob makes no ordering promise).
files = sorted(glob.glob("Text/*.txt"))
with open("all_data.txt", "wb") as outfile:
    for f in files:
        with open(f, "rb") as infile:
            outfile.write(infile.read())
        # Separate documents with a newline so the last word of one file is
        # not fused with the first word of the next when a file has no
        # trailing newline.
        outfile.write(b"\n")
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import nltk #import library nltk
from nltk.tokenize import word_tokenize #import word_tokenize for tokenizing text into words
from nltk.tokenize import sent_tokenize #import sent_tokenize for tokenizing paragraph into sentences
from nltk.stem.porter import PorterStemmer #import Porter Stemmer Algorithm
from nltk.stem import WordNetLemmatizer #import WordNet lemmatizer
from nltk.corpus import stopwords #import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory #import Indonesian Stemmer
import re #import regular expression
from nltk.tokenize import RegexpTokenizer
# Load the merged corpus. The context manager closes the handle as soon as
# the read is done instead of leaving it open for the rest of the run.
with open('all_data.txt', 'r') as file:
    t = file.read()
text_data = t
# Case folding
def casefolding(s):
    """Return ``s`` converted to lower case."""
    return s.lower()
# Lower-case the whole corpus before any further cleaning.
cf = casefolding(text_data)
# Remove punctuation from a string
def removepunct(str):
    """Replace every non-word character in ``str`` with a single space.

    Anything that is not a word character (letter, digit or underscore) is
    replaced one-for-one, so the result has the same length as the input.
    """
    return re.sub(pattern=r"[\W]", repl=" ", string=str)
# Blank out punctuation in the lower-cased text.
rp = removepunct(cf)
# Remove digits from a string
def removeDigit(str):
    """Replace every ASCII digit (0-9) in ``str`` with a single space."""
    return re.sub(pattern=r"[0-9]", repl=" ", string=str)
# Blank out digits in the punctuation-free text.
rd = removeDigit(rp)
# Remove words of length 1-3
def removelg(str):
    """Drop space-delimited words that are one to three characters long.

    A short word is removed together with the space in front of it. The
    space after the word is only checked with a lookahead, not consumed, so
    it can also serve as the leading space of the next short word; runs of
    adjacent short words are therefore all removed.

    A word at the very start or very end of the string is not enclosed by
    spaces on both sides and is left in place.
    """
    return re.sub(r' \w{1,3}(?= )', '', str)
# Strip very short words (1-3 characters) from the digit-free text.
rl = removelg(rd)
# Collapse repeated spaces
def removespace(str):
    """Collapse every run of one or more spaces in ``str`` into one space."""
    return re.sub(pattern=' +', repl=' ', string=str)
# Squeeze the runs of spaces left behind by the previous replacements.
rms = removespace(rl)
# Indonesian stemming
def stemmingIndo(str):
    """Stem ``str`` with a stemmer built by Sastrawi's ``StemmerFactory``.

    A new factory and stemmer are created on every call.
    """
    return StemmerFactory().create_stemmer().stem(str)
# Stem the cleaned text.
stindo = stemmingIndo(rms)
# Remove stop words
def stpwrds(str):
    """Tokenize ``str`` and return its tokens with stop words removed.

    The text passed in is the text that gets tokenized; the function does
    not read any module-level variable. Stop words are taken from NLTK's
    'indonesian' list.

    Returns a list of the remaining tokens in their original order.
    """
    stop_words = set(stopwords.words('indonesian'))
    return [w for w in word_tokenize(str) if w not in stop_words]
# Filter stop words out of the stemmed text, then join the remaining tokens
# back into one space-separated string.
filt = stpwrds(stindo)
par = ' '.join(filt)
def word_tokenization(s):
    """Return the tokens NLTK's ``word_tokenize`` produces for ``s``."""
    return word_tokenize(s)
# Tokenize the filtered text; the token list doubles as the bag of words.
bowD = wordtoken = word_tokenization(par)
# Start every distinct word at zero, then tally one per occurrence.
wordSet = set(bowD)
wordDict = dict.fromkeys(wordSet, 0)
for word in bowD:
    wordDict[word] = wordDict[word] + 1
def computeTF(wordDict, bow):
    """Return the term frequency of every word in ``wordDict``.

    wordDict -- mapping of word to its occurrence count
    bow      -- the token list the counts came from; only its length is used

    Each count is divided by ``len(bow)``; the result maps word to that ratio.
    """
    total = float(len(bow))
    return {term: occurrences / total for term, occurrences in wordDict.items()}
# Relative frequency of each word within the merged corpus.
tf = computeTF(wordDict, bowD)
def computeIDF(docList):
    """Return the inverse document frequency of every word in ``docList``.

    docList -- list of dicts, one per document, mapping word to its count

    For a word that occurs (count > 0) in ``df`` of the ``N`` documents the
    value is ``log10((1 + N) / df)``. The vocabulary is the union of the keys
    of all documents, so a word missing from the first document is still
    handled. A word whose count is zero in every document gets 0.0 instead
    of triggering a division by zero. An empty ``docList`` yields ``{}``.
    """
    import math

    N = len(docList)
    # Document frequency: in how many documents each word actually occurs.
    docFreq = {}
    for doc in docList:
        for word, val in doc.items():
            docFreq.setdefault(word, 0)
            if val > 0:
                docFreq[word] += 1

    idfDict = {}
    for word, df in docFreq.items():
        idfDict[word] = math.log10((1 + N) / float(df)) if df else 0.0
    return idfDict
# NOTE: only one document (the merged corpus) is passed in, and every word in
# wordDict has a count of at least 1, so each word receives the same IDF,
# log10(2). The final weights are therefore the term frequencies scaled by a
# constant.
idf = computeIDF([wordDict])
def computeTFIDF(tfBow, idfs):
    """Return the TF-IDF weight of every word in ``tfBow``.

    tfBow -- mapping of word to term frequency
    idfs  -- mapping of word to inverse document frequency; it must contain
             every word of ``tfBow`` (a missing word raises KeyError)

    Each weight is the product of the word's two values.
    """
    return {term: freq * idfs[term] for term, freq in tfBow.items()}
tfidf = computeTFIDF(tf, idf)
# One-column frame: the words become the index, 'weight' holds their scores.
df = pd.DataFrame({'weight': tfidf})
#test = df.sort_values('tfidf', ascending=False)
# Highest-weighted words first.
test = df.sort_values(by = 'weight', ascending=False)
# NOTE: printing a DataFrame directly abbreviates long frames (the middle rows
# are replaced by "..."). To show every row, use print(test.to_string()) or
# call pd.set_option('display.max_rows', None) before printing.
print(test)
Мне удалось запустить его и получить приведённый ниже вывод. Не думаю, что в коде есть ошибка, но я не знаю, как получить полный вывод:
weight
butuh 0.026342
orang 0.019802
milik 0.009629
saudara 0.007267
hidup 0.006359
atur 0.006359
periksa 0.005450
hasil 0.005450
suka 0.004360
barang 0.003997
epps 0.003633
pengaruh 0.003270
perhati 0.003270
agresif 0.003088
salah 0.003088
laku 0.002907
prestasi 0.002907
gantung 0.002907
seksual 0.002907
muhammad 0.002725
rawat 0.002725
benda 0.002725
tolong 0.002725
manja 0.002543
percaya 0.002543
hadap 0.002543
harmonis 0.002543
gaul 0.002543
tekun 0.002362
ubah 0.002362
... ...
widad 0.000908
hubung 0.000727
manusia 0.000727
ekspresi 0.000727
aktivitas 0.000727
taruh 0.000727
pilih 0.000545
masuk 0.000545
putus 0.000545
peka 0.000545
kait 0.000545
ambil 0.000545
sulit 0.000545
paham 0.000545
raih 0.000545
rutin 0.000545
didik 0.000545
laksana 0.000363
kuat 0.000363
mudah 0.000363
jaga 0.000363
patuh 0.000363
gigih 0.000363
tonjol 0.000182
konvensi 0.000182
lingkung 0.000182
sosial 0.000182
interaksi 0.000182
urus 0.000182
tarik 0.000182
[150 rows x 1 columns]
Я получаю усечённое представление, но мне нужен полный вывод: я хочу видеть все 150 строк данных.
Есть ли способ это сделать? Нужно ли разбить вывод на два столбца, и если да, то как это работает?