Мой набор данных - это набор системных вызовов как для вредоносных программ, так и для безопасных, я предварительно обработал его, и теперь он выглядит так
Теперь я использую tfidf
для извлечения функций, а затем использую ngram
чтобы составить их последовательность
from __future__ import print_function
import numpy as np
import pandas as pd
from time import time
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import shuffle
from sklearn.svm import OneClassSVM
nGRAM1 = 8
nGRAM2 = 10
weight = 4
main_corpus_MAL = []
main_corpus_target_MAL = []
main_corpus_BEN = []
main_corpus_target_BEN = []
my_categories = ['benign', 'malware']
# feeding corpus the testing data
print("Loading system call database for categories:")
print(my_categories if my_categories else "all")
import glob
import os
malCOUNT = 0
benCOUNT = 0
for filename in glob.glob(os.path.join('C:\\Users\\alika\\Documents\\testingSVM\\sysMAL', '*.txt')):
fMAL = open(filename, "r")
aggregate = ""
for line in fMAL:
linea = line[:(len(line)-1)]
aggregate += " " + linea
malCOUNT += 1
for filename in glob.glob(os.path.join('C:\\Users\\alika\\Documents\\testingSVM\\sysBEN', '*.txt')):
fBEN = open(filename, "r")
aggregate = ""
for line in fBEN:
linea = line[:(len(line) - 1)]
aggregate += " " + linea
benCOUNT += 1
# weight as determined in the top of the code
train_corpus = main_corpus_BEN[:(weight*len(main_corpus_BEN)//(weight+1))]
train_corpus_target = main_corpus_target_BEN[:(weight*len(main_corpus_BEN)//(weight+1))]
test_corpus = main_corpus_MAL[(len(main_corpus_MAL)-(len(main_corpus_MAL)//(weight+1))):]
test_corpus_target = main_corpus_target_MAL[(len(main_corpus_MAL)-len(main_corpus_MAL)//(weight+1)):]
def size_mb(docs):
return sum(len(s.encode('utf-8')) for s in docs) / 1e6
# size of datasets
train_corpus_size_mb = size_mb(train_corpus)
test_corpus_size_mb = size_mb(test_corpus)
print("%d documents - %0.3fMB (training set)" % (
len(train_corpus_target), train_corpus_size_mb))
print("%d documents - %0.3fMB (test set)" % (
len(test_corpus_target), test_corpus_size_mb))
print("%d categories" % len(my_categories))
print("Benign Traces: "+str(benCOUNT)+" traces")
print("Malicious Traces: "+str(malCOUNT)+" traces")
print("Extracting features from the training data using a sparse vectorizer...")
t0 = time()
vectorizer = TfidfVectorizer(ngram_range=(nGRAM1, nGRAM2), min_df=1, use_idf=True, smooth_idf=True) ##############
analyze = vectorizer.build_analyzer()
X_train = vectorizer.fit_transform(train_corpus)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, train_corpus_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_train.shape)
print("Extracting features from the test data using the same vectorizer...")
t0 = time()
X_test = vectorizer.transform(test_corpus)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, test_corpus_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_test.shape)
Loading system call database for categories:
['benign', 'malware']
177 documents - 45.926MB (training set)
44 documents - 12.982MB (test set)
2 categories
Benign Traces: 72 traces
Malicious Traces: 150 traces
Extracting features from the training data using a sparse vectorizer...
done in 7.831695s at 5.864MB/s
n_samples: 177, n_features: 603170
Extracting features from the test data using the same vectorizer...
done in 1.624100s at 7.993MB/s
n_samples: 44, n_features: 603170
Теперь для учебного раздела я пытаюсь использовать sklearn OneClassSVM
print("Training: ")
classifier = OneClassSVM(kernel='linear', gamma='auto')
fraud_pred = classifier.predict(X_test)
unique, counts = np.unique(fraud_pred, return_counts=True)
print (np.asarray((unique, counts)).T)
fraud_pred = pd.DataFrame(fraud_pred)
fraud_pred= fraud_pred.rename(columns={0: 'prediction'})
main_corpus_target = pd.DataFrame(main_corpus_target)
main_corpus_target= main_corpus_target.rename(columns={0: 'Category'})
это вывод fraud_pred
и main_corpus_target
0 1
1 -1
2 1
3 1
4 1
5 -1
6 1
7 -1
30 rows * 1 column
0 1
1 1
2 1
3 1
4 1
217 0
218 0
219 0
220 0
221 0
222 rows * 1 column
, но когда я пытаюсь вычислить TP,TN,FP,FN
##Performance check of the model
TP = FN = FP = TN = 0
for j in range(len(main_corpus_target)):
if main_corpus_target['Category'][j]== 0 and fraud_pred['prediction'][j] == 1:
TP = TP+1
elif main_corpus_target['Category'][j]== 0 and fraud_pred['prediction'][j] == -1:
FN = FN+1
elif main_corpus_target['Category'][j]== 1 and fraud_pred['prediction'][j] == 1:
FP = FP+1
TN = TN +1
print (TP, FN, FP, TN)
, я получаю эту ошибку:
KeyError Traceback (most recent call last)
<ipython-input-32-1046cc75ba83> in <module>
7 elif main_corpus_target['Category'][j]== 0 and fraud_pred['prediction'][j] == -1:
8 FN = FN+1
----> 9 elif main_corpus_target['Category'][j]== 1 and fraud_pred['prediction'][j] == 1:
10 FP = FP+1
11 else:
c:\users\alika\appdata\local\programs\python\python36\lib\site-packages\pandas\core\series.py in __getitem__(self, key)
1069 key = com.apply_if_callable(key, self)
1070 try:
-> 1071 result = self.index.get_value(self, key)
1073 if not is_scalar(result):
c:\users\alika\appdata\local\programs\python\python36\lib\site-packages\pandas\core\indexes\base.py in get_value(self, series, key)
4728 k = self._convert_scalar_indexer(k, kind="getitem")
4729 try:
-> 4730 return self._engine.get_value(s, k, tz=getattr(series.dtype, "tz", None))
4731 except KeyError as e1:
4732 if len(self) > 0 and (self.holds_integer() or self.is_boolean()):
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
KeyError: 30
1) Я знаю, что ошибка в том, что он пытается получить доступ к ключу, которого нет в словаре, но я не могу просто вставить некоторые цифры в fraud_pred
для решения этой проблемы, любой предложения ??
2) я делаю что-то неправильно, что они не совпадают?
3) я хочу сравнить результаты с другими алгоритмами классификации одного класса, из-за моего метода, какие из них лучше можно использовать ??