Я пытаюсь создать классификатор текста, используя набор данных из твиттера для классификации текста по двум категориям: агрессивный или нормальный.
Мой классификатор работает с точностью 66%, но он очень медленный, так как набор данных довольно большой.
Любые предложения, как мне сделать это быстрее?
Набор данных можно увидеть здесь:
https://dataturks.com/projects/abhishek.narayanan/Dataset%20for%20Detection%20of%20Cyber-Trolls
Мой основной код:
'''
The dataset basically contains vectors with two components.
The first component is basically the String to analyse.
The Second component is the integer representing the sort of text
1 ----> Aggressive Text
0 ----> Normal (non-aggressive) Text
'''
import math
import random
from collections import Counter

import Preprocessdata as Dataset
def Dataset_by_class(Dataset):
    """Group the texts of a labelled dataset by their class label.

    Each instance is a sequence whose first element is the text and whose
    last element is the integer class label (1 = aggressive, 0 = normal).
    Returns a dict mapping label -> list of texts, in input order.
    """
    grouped = {}
    for instance in Dataset:
        label, text = instance[-1], instance[0]
        grouped.setdefault(label, []).append(text)
    return grouped
def String_by_class(Dataset):
    """Flatten each class's texts into a single list of words.

    Takes a dict mapping label -> list of texts (as produced by
    Dataset_by_class) and returns a dict mapping label -> list of all
    whitespace-separated tokens from that class's texts, in order.
    """
    return {
        label: [word for text in texts for word in text.split()]
        for label, texts in Dataset.items()
    }
def PreProcess_TestSet(Dataset):
    """Split (text, label) test instances into parallel lists.

    Returns a two-element list: [texts, labels], preserving input order.
    An empty dataset yields [[], []].
    """
    texts = [instance[0] for instance in Dataset]
    labels = [instance[1] for instance in Dataset]
    return [texts, labels]
def Laplace_Smoothening(Item, List, Alpha, Unique_Count, index):
    """Laplace-smoothed likelihood of the words in List under one class.

    Item is the class's flattened word list (its training corpus), List is
    the tokenized test instance, Alpha the smoothing constant, and
    Unique_Count[index] the class's vocabulary size.

    Performance fix: the original called Item.count(word) and len(Item)
    inside the loop, an O(len(Item)) scan per query word.  We now build a
    Counter over Item once and compute the denominator once, making the
    call O(len(Item) + len(List)) instead of O(len(Item) * len(List)).

    NOTE(review): the raw product of per-word probabilities can underflow
    to 0.0 for long texts; kept here for interface compatibility, but
    Predict should compare log-probabilities instead.
    """
    counts = Counter(Item)
    denominator = len(Item) + Alpha * Unique_Count[index]
    Prob = 1
    for String in List:
        Prob *= (counts[String] + Alpha) / denominator
    return Prob
def Predict(Dataset, TestSet):
    """Classify each test string with a Laplace-smoothed Naive Bayes model.

    Dataset maps class label (0 = normal, 1 = aggressive) -> flattened word
    list for that class; TestSet is a list of raw strings.  Returns a list
    of predicted labels, one per test string, using alpha = 1 smoothing.

    Performance fix: the original re-scanned each class corpus with
    list.count for every word of every test instance.  All per-class
    statistics (word Counter, corpus length, vocabulary size) are now
    computed once, up front, so each test word is an O(1) dict lookup.

    Robustness fix: probabilities are compared in log space; the original
    multiplied raw probabilities, which underflow to 0.0 on long texts and
    silently collapse the comparison.  The argmax is unchanged in exact
    arithmetic, so predicted labels are the same where no underflow occurs.
    """
    Alpha = 1
    word_counts = {}   # label -> Counter of corpus words
    corpus_size = {}   # label -> total word count of the corpus
    vocab_size = {}    # label -> number of distinct corpus words
    for label, words in Dataset.items():
        word_counts[label] = Counter(words)
        corpus_size[label] = len(words)
        vocab_size[label] = len(set(words))

    Predicted_Label = []
    for Test_Instance in TestSet:
        tokens = Test_Instance.split()
        log_prob = {}
        for label in Dataset:
            counts = word_counts[label]
            denominator = corpus_size[label] + Alpha * vocab_size[label]
            log_prob[label] = sum(
                math.log((counts[token] + Alpha) / denominator)
                for token in tokens
            )
        # Same tie-breaking as the original: label 1 wins on equality.
        Predicted_Label.append(0 if log_prob[0] > log_prob[1] else 1)
    return Predicted_Label
def Calculate_Accuracy(Prediction, Labels):
    """Return the percentage of predictions that match the true labels."""
    matches = sum(1 for predicted, actual in zip(Prediction, Labels)
                  if predicted == actual)
    return matches / len(Labels) * 100
# --- Driver script ---------------------------------------------------------
# Loads the labelled tweet dataset from JSON, trains the per-class word
# model on 70% of the data, and reports classification accuracy on the rest.

# Preprocessdata.PreProcess_Dataset builds the (text, label) dataset from JSON.
Data_Set = Dataset.PreProcess_Dataset('Dataset for Detection of Cyber-Trolls.json')
# Fraction of instances used for training; the remainder is the test set.
SplitRatio = 0.70
# Splits the dataset according to the split ratio.
# NOTE(review): Train_Test_Dataset is not defined in this file and is not
# qualified with the Dataset module prefix -- presumably it lives in
# Preprocessdata (i.e. Dataset.Train_Test_Dataset).  Verify; as written this
# line raises NameError unless the name is defined elsewhere.
TrainSet,TestSet = Train_Test_Dataset(Data_Set,SplitRatio)
# Group training texts by label, then flatten each label's texts into a
# single word list (the per-class corpus used by Predict).
DataSet_By_Class = Dataset_by_class(TrainSet)
String_By_Class = String_by_class(DataSet_By_Class)
# Split held-out instances into parallel text / label lists.
TestSet,Labels = PreProcess_TestSet(TestSet)
Prediction = Predict(String_By_Class,TestSet)
Accuracy = Calculate_Accuracy(Prediction,Labels)
print("The Accuracy is {:.2f}".format(Accuracy))
Результат хорош, но ужасно медленный.