Как обрабатывать различные входные размеры строки в нейронной сети - PullRequest
0 голосов
/ 28 сентября 2019

У меня есть набор данных — NSL-KDD. Я преобразовал все значения признаков и объединил их в одну строку. Теперь длина строки отличается от записи к записи. Проблема в том, что размер входных данных для каждой строки варьируется, поэтому нейронная сеть выдает ошибку. Есть ли способ, которым входной слой нейронной сети может обрабатывать эти переменные размеры?

 0.0,1.0,1.0,1.0,491.0,0.0,0.0,0.0,0.0,0.0,0.0,...
 0.0,2.0,2.0,1.0,146.0,0.0,0.0,0.0,0.0,0.0,0.0,...
 0.0,1.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....
 0.0,1.0,4.0,1.0,232.0,8153.0,0.0,0.0,0.0,0.0,0...
 0.0,1.0,4.0,1.0,199.0,420.0,0.0,0.0,0.0,0.0,0....
                            ...                        
 0.0,1.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....
 8.0,2.0,3.0,1.0,105.0,145.0,0.0,0.0,0.0,0.0,0....
 0.0,1.0,16.0,1.0,2231.0,384.0,0.0,0.0,0.0,0.0,...
 0.0,1.0,36.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...
 0.0,1.0,1.0,1.0,151.0,0.0,0.0,0.0,0.0,0.0,0.0,...

Выше приведен пример того, как каждая строка выглядит в pandas или в массивах. Итак, как я могу изменить вход нейронной сети в зависимости от длины каждой строки во время обучения? Существует небольшая разница между фактическими данными в наборе и тем, что я опубликовал, поскольку я заменил категориальные текстовые значения уникальными числовыми кодами. Вот ссылка на набор данных NSL-KDD.

Ниже приведен код, который преобразует исходные данные в тот вид, который показан выше:

"""Preprocess the NSL-KDD training set: encode categorical columns as
integers, collapse the 41 feature columns into one comma-separated string
column 's', and map the label to y in {-1.0, +1.0}."""
import random  # NOTE(review): unused in this snippet; kept in case the rest of the file needs it
import numpy as np
import pandas as pd

# Load the raw CSV (no header row); the trailing column of NSL-KDD files is
# the "difficulty level", which this pipeline discards.
df = pd.read_csv('../dataset/NSL-KDD/KDDTRAIN.csv', header=None)
df = df[df.columns[:-1]]

# 41 feature names + 'label' (42 columns after dropping the difficulty column).
col_names = ["duration","protocol_type","service","flag","src_bytes",
"dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
"logged_in","num_compromised","root_shell","su_attempted","num_root",
"num_file_creations","num_shells","num_access_files","num_outbound_cmds",
"is_host_login","is_guest_login","count","srv_count","serror_rate",
"srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
"diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
"dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
"dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
"dst_host_rerror_rate","dst_host_srv_rerror_rate","label"]
df.columns = col_names

# Binary label encoding: 'normal' traffic -> 0, every attack type -> 1.
mapping = {'normal':0 , 'neptune':1, 'warezclient':1, 'ipsweep':1, 'portsweep':1,
   'teardrop':1, 'nmap':1, 'satan':1, 'smurf':1, 'pod':1, 'back':1,
   'guess_passwd':1, 'ftp_write':1, 'multihop':1, 'rootkit':1,
   'buffer_overflow':1, 'imap':1, 'warezmaster':1, 'phf':1, 'land':1,
   'loadmodule':1, 'spy':1, 'perl':1}
# Integer codes for the three categorical feature columns.
mapping1 = {'tcp':1,'udp':2, 'icmp':3}
mapping2 = {'ftp_data':1, 'other':2, 'private':3, 'http':4, 'remote_job':5,
'name':6, 'netbios_ns':7, 'eco_i':8, 'mtp':9, 'telnet':10, 'finger':11,
'domain_u':12, 'supdup':13, 'uucp_path':14, 'Z39_50':15, 'smtp':16,
'csnet_ns':17, 'uucp':18, 'netbios_dgm':19, 'urp_i':20, 'auth':21,
'domain':22, 'ftp':23, 'bgp':24, 'ldap':25, 'ecr_i':26, 'gopher':27,
'vmnet':28, 'systat':29, 'http_443':30, 'efs':31, 'whois':32,
'imap4':33, 'iso_tsap':34, 'echo':35, 'klogin':36, 'link':37, 'sunrpc':38,
'login':39, 'kshell':40,'sql_net':41, 'time':42, 'hostnames':43,
'exec':44, 'ntp_u':45, 'discard':46, 'nntp':47, 'courier':48,'ctf':49,
'ssh':50, 'daytime':51, 'shell':52, 'netstat':53, 'pop_3':54, 'nnsp':55,
'IRC':56, 'pop_2':57,'printer':58, 'tim_i':59, 'pm_dump':60, 'red_i':61,
'netbios_ssn':62, 'rje':63, 'X11':64, 'urh_i':65, 'http_8001':66,
'aol':67, 'http_2784':68, 'tftp_u':69, 'harvest':70}
# BUG FIX: the original line started with a stray space, which raised an
# IndentationError at module level and made the whole script fail to run.
mapping3 = {'SF':1, 'S0':2, 'REJ':3, 'RSTR':4, 'SH':5, 'RSTO':6, 'S1':7, 'RSTOS0':8, 'S3':9, 'S2':10, 'OTH':11}

# Vectorized encoding, consistent with the .map() call used for 'y' below.
# NOTE(review): unlike the original list comprehensions (which raised
# KeyError on an unseen value), .map() yields NaN for unmapped values —
# on the standard NSL-KDD training file every value is covered.
df.protocol_type = df.protocol_type.map(mapping1)
df.service = df.service.map(mapping2)
df.flag = df.flag.map(mapping3)
df.label = df.label.map(mapping)

# Join all 41 feature columns into one comma-separated string per row.
feature_cols = col_names[:-1]  # everything except 'label'
df['s'] = df[feature_cols].apply(
    lambda x: ','.join(x.dropna().astype(str)),
    axis=1
)

# Drop the (now redundant) individual feature columns; reusing col_names
# avoids the duplicated 41-name list the original repeated verbatim.
df = df.drop(columns=feature_cols)

df.columns = ['y', 's']
# Remap the binary label to the {-1.0, +1.0} convention.
df['y'] = df['y'].map({0: -1.0, 1: 1.0})
print(df)

Я также опубликовал код на тот случай, если кто-то захочет попробовать набор данных в том виде, в который я его преобразовал.

...