Я пытаюсь запустить простые классификаторы ML для своих данных, но я получаю ошибку ниже. Я новичок, поэтому любезно объясните мне причину, когда вы предоставите решение. спасибо "Ошибка типа: неупорядоченные типы: str ()
ниже мой код
import numpy as np
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
tweets = pd.read_csv("C:\\Users\\data.csv")
tweets.shape
(4787, 2)
X = tweets.iloc[:, 0].values
y = tweets.iloc[:, 1].values
processed_tweets = []
for tweet in range(0, len(X)):
# Remove all the special characters
processed_tweet = re.sub(r'\W', ' ', str(X[tweet]))
# remove all single characters
processed_tweet = re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_tweet)
# Remove single characters from the start
processed_tweet = re.sub(r'\^[a-zA-Z]\s+', ' ', processed_tweet)
# Substituting multiple spaces with single space
processed_tweet= re.sub(r'\s+', ' ', processed_tweet, flags=re.I)
# Removing prefixed 'b'
processed_tweet = re.sub(r'^b\s+', '', processed_tweet)
# Converting to Lowercase
processed_tweet = processed_tweet.lower()
processed_tweets.append(processed_tweet)
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfconverter = TfidfVectorizer(max_features=2000, min_df=5, max_df=0.7,
stop_words=stopwords.words('english'))
X = tfidfconverter.fit_transform(processed_tweets).toarray()
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train.shape
(3829 , 1710)
y_train.shape
(3829,)
X.shape
(4787, 1710)
from sklearn.ensemble import RandomForestClassifier
text_classifier = RandomForestClassifier(n_estimators=100, random_state=0)
text_classifier.fit(X_train, y_train)
Ниже приведена ошибка, которую я получаю при запуске выше код
TypeError Traceback (most recent call last)
<ipython-input-24-7c5c1beb13e6> in <module>()
1 from sklearn.ensemble import RandomForestClassifier
2 text_classifier = RandomForestClassifier(n_estimators=100,
random_state=0)
----> 3 text_classifier.fit(X_train, y_train)
C:\miniconda3\envs\conda\lib\site-
packages\sklearn\ensemble\forest.py in fit(self, X, y,
sample_weight)
276 self.n_outputs_ = y.shape[1]
277
--> 278 y, expanded_class_weight =
self._validate_y_class_weight(y)
279
280 if getattr(y, "dtype", None) != DOUBLE or not
y.flags.contiguous:
C:\miniconda3\envs\conda\lib\site-
packages\sklearn\ensemble\forest.py in
_validate_y_class_weight(self, y)
476
477 def _validate_y_class_weight(self, y):
--> 478 check_classification_targets(y)
479
480 y = np.copy(y)
C:\miniconda3\envs\conda\lib\site-packages\sklearn\utils\multiclass.py in check_classification_targets(y)
166 y : array-like
167 """
--> 168 y_type = type_of_target(y)
169 if y_type not in ['binary', 'multiclass', 'multiclass-multioutput',
170 'multilabel-indicator', 'multilabel-sequences']:
C:\miniconda3\envs\conda\lib\site-packages\sklearn\utils\multiclass.py in type_of_target(y)
285 return 'continuous' + suffix
286
--> 287 if (len(np.unique(y)) > 2) or (y.ndim >= 2 and len(y[0]) > 1):
288 return 'multiclass' + suffix # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]]
289 else:
<__array_function__ internals> in unique(*args, **kwargs)
C:\miniconda3\envs\conda\lib\site-packages\numpy\lib\arraysetops.py in unique(ar, return_index, return_inverse, return_counts, axis)
261 ar = np.asanyarray(ar)
262 if axis is None:
--> 263 ret = _unique1d(ar, return_index, return_inverse, return_counts)
264 return _unpack_tuple(ret)
265
C:\miniconda3\envs\conda\lib\site-packages\numpy\lib\arraysetops.py in _unique1d(ar, return_index, return_inverse, return_counts)
309 aux = ar[perm]
310 else:
--> 311 ar.sort()
312 aux = ar
313 mask = np.empty(aux.shape, dtype=np.bool_)
TypeError: unorderable types: str() < float()