Я хочу классифицировать вредоносные URL-адреса как хорошие или плохие, используя машинное обучение в python, и все идет хорошо, но появляется эта ошибка. Приведенный ниже код используется для моделирования набора данных, и последнее, что было написано, это проблема. более того, я записываю полную ошибку трассировки, я пытался удалить все нулевые значения, но отметил работу
[# EDA Packages
import pandas as pd
import numpy as np
import random
# Machine Learning Packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# Load Url Data
urls_data = pd.read_csv('C:/Users/user/Desktop/iot/data/dataot.csv',error_bad_lines=False, encoding = 'unicode_escape')
type(urls_data)
urls_data.head()
def makeTokens(f):
tkns_BySlash = str(f.encode('utf-8')).split('/') # make tokens after splitting by slash
total_Tokens = \[\]
for i in tkns_BySlash:
tokens = str(i).split('-') # make tokens after splitting by dash
tkns_ByDot = \[\]
for j in range(0,len(tokens)):
temp_Tokens = str(tokens\[j\]).split('.') # make tokens after splitting by dot
tkns_ByDot = tkns_ByDot + temp_Tokens
total_Tokens = total_Tokens + tokens + tkns_ByDot
total_Tokens = list(set(total_Tokens)) #remove redundant tokens
if 'com' in total_Tokens:
total_Tokens.remove('com') #removing .com since it occurs a lot of times and it should not be included in our features
return total_Tokens
# Labels
y = urls_data\["label"\]
# Features
url_list = urls_data\["url"\]
# Using Default Tokenizer
#vectorizer = TfidfVectorizer()
# Using Custom Tokenizer
vectorizer = TfidfVectorizer(tokenizer=makeTokens)
# Store vectors into X variable as Our XFeatures
X = vectorizer.fit_transform(url_list)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Model Building
#using logistic regression
logit = LogisticRegression()
logit.fit(X_train, y_train, errors='coerce')]
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-84-73b6ea97c396> in <module>
2 #using logistic regression
3 logit = LogisticRegression()
----> 4 logit.fit(X_train, y_train)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py in fit(self, X, y, sample_weight)
1284 X, y = check_X_y(X, y, accept_sparse='csr', dtype=_dtype, order="C",
1285 accept_large_sparse=solver != 'liblinear')
-> 1286 check_classification_targets(y)
1287 self.classes_ = np.unique(y)
1288 n_samples, n_features = X.shape
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\multiclass.py in check_classification_targets(y)
166 y : array-like
167 """
--> 168 y_type = type_of_target(y)
169 if y_type not in ['binary', 'multiclass', 'multiclass-multioutput',
170 'multilabel-indicator', 'multilabel-sequences']:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\multiclass.py in type_of_target(y)
285 return 'continuous' + suffix
286
--> 287 if (len(np.unique(y)) > 2) or (y.ndim >= 2 and len(y[0]) > 1):
288 return 'multiclass' + suffix # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]]
289 else:
C:\ProgramData\Anaconda3\lib\site-packages\numpy\lib\arraysetops.py in unique(ar, return_index, return_inverse, return_counts, axis)
231 ar = np.asanyarray(ar)
232 if axis is None:
--> 233 ret = _unique1d(ar, return_index, return_inverse, return_counts)
234 return _unpack_tuple(ret)
235
C:\ProgramData\Anaconda3\lib\site-packages\numpy\lib\arraysetops.py in _unique1d(ar, return_index, return_inverse, return_counts)
279 aux = ar[perm]
280 else:
--> 281 ar.sort()
282 aux = ar
283 mask = np.empty(aux.shape, dtype=np.bool_)
TypeError: '<' not supported between instances of 'float' and 'str'