TypeError: '<' not supported between instances of 'float' and 'str' in a logistic regression model
0 votes
asked 29 May 2019

I want to classify malicious URLs as good or bad using machine learning in Python. Everything was going well until this error appeared. The code below is used to model the dataset, and the last statement in it is the one that fails. I have also included the full traceback. I tried removing all the null values, but that did not work.
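What I tried for the null removal was roughly this (a sketch; it assumes the missing entries sit in the 'url' and 'label' columns):

    import pandas as pd

    urls_data = pd.read_csv('C:/Users/user/Desktop/iot/data/dataot.csv',
                            error_bad_lines=False, encoding='unicode_escape')
    # NaN is a float, so any NaN left in 'label' mixes float and str
    urls_data = urls_data.dropna(subset=['url', 'label'])

The full modelling code that triggers the error is below.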

    # EDA Packages
    import pandas as pd
    import numpy as np
    import random

    # Machine Learning Packages
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    # Load Url Data 
    urls_data = pd.read_csv('C:/Users/user/Desktop/iot/data/dataot.csv',error_bad_lines=False, encoding = 'unicode_escape')

    type(urls_data)

    urls_data.head()
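    # Note (illustrative): urls_data.isnull().sum() would show how many
    # missing values each column has; any NaN in 'label' is a float and
    # clashes with the string labels later on.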

    def makeTokens(f):
        tkns_BySlash = str(f.encode('utf-8')).split('/')    # make tokens after splitting by slash
        total_Tokens = []
        for i in tkns_BySlash:
            tokens = str(i).split('-')  # make tokens after splitting by dash
            tkns_ByDot = []
            for j in range(0, len(tokens)):
                temp_Tokens = str(tokens[j]).split('.')   # make tokens after splitting by dot
                tkns_ByDot = tkns_ByDot + temp_Tokens
            total_Tokens = total_Tokens + tokens + tkns_ByDot
        total_Tokens = list(set(total_Tokens))  # remove redundant tokens
        if 'com' in total_Tokens:
            total_Tokens.remove('com')  # remove 'com' since it occurs very often and should not be a feature
        return total_Tokens
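    # Quick check on a hypothetical input: makeTokens('www.example.com/path-a')
    # returns, in some order, ["b'www.example.com", "b'www", 'example', 'path', "a'"]
    # with 'com' removed. The b' prefix comes from str(f.encode('utf-8'));
    # f is already a str here, so f.split('/') would avoid it.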

    # Labels
    y = urls_data["label"]

    # Features
    url_list = urls_data["url"]

    # Using Default Tokenizer
    #vectorizer = TfidfVectorizer()

    # Using Custom Tokenizer
    vectorizer = TfidfVectorizer(tokenizer=makeTokens)

    # Store vectors into X variable as Our XFeatures
    X = vectorizer.fit_transform(url_list)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Model Building
    #using logistic regression
    logit = LogisticRegression()    
    logit.fit(X_train, y_train)

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-84-73b6ea97c396> in <module>
      2 #using logistic regression
      3 logit = LogisticRegression()
----> 4 logit.fit(X_train, y_train)

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py in fit(self, X, y, sample_weight)
   1284         X, y = check_X_y(X, y, accept_sparse='csr', dtype=_dtype, order="C",
   1285                          accept_large_sparse=solver != 'liblinear')
-> 1286         check_classification_targets(y)
   1287         self.classes_ = np.unique(y)
   1288         n_samples, n_features = X.shape

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\multiclass.py in check_classification_targets(y)
    166     y : array-like
    167     """
--> 168     y_type = type_of_target(y)
    169     if y_type not in ['binary', 'multiclass', 'multiclass-multioutput',
    170                       'multilabel-indicator', 'multilabel-sequences']:

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\multiclass.py in type_of_target(y)
    285         return 'continuous' + suffix
    286 
--> 287     if (len(np.unique(y)) > 2) or (y.ndim >= 2 and len(y[0]) > 1):
    288         return 'multiclass' + suffix  # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]]
    289     else:

C:\ProgramData\Anaconda3\lib\site-packages\numpy\lib\arraysetops.py in unique(ar, return_index, return_inverse, return_counts, axis)
    231     ar = np.asanyarray(ar)
    232     if axis is None:
--> 233         ret = _unique1d(ar, return_index, return_inverse, return_counts)
    234         return _unpack_tuple(ret)
    235 

C:\ProgramData\Anaconda3\lib\site-packages\numpy\lib\arraysetops.py in _unique1d(ar, return_index, return_inverse, return_counts)
    279         aux = ar[perm]
    280     else:
--> 281         ar.sort()
    282         aux = ar
    283     mask = np.empty(aux.shape, dtype=np.bool_)

TypeError: '<' not supported between instances of 'float' and 'str'
...
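For what it's worth, the failing comparison can be reproduced in isolation: fit() calls check_classification_targets, which runs np.unique(y), and np.unique sorts the labels. The sort fails as soon as the label column mixes strings with float NaN. A minimal sketch with made-up labels:

    import numpy as np
    import pandas as pd

    y = pd.Series(['good', 'bad', np.nan, 'good'])  # NaN is a float
    np.unique(y)           # raises TypeError: '<' not supported between instances of 'float' and 'str'
    np.unique(y.dropna())  # succeeds once the NaN rows are dropped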