У меня проблема со следующим кодом:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.ensemble import ExtraTreesClassifier
data = pd.read_csv("movies.csv")
X = data.iloc[6:] #columns with words
y = genre_to_binary(data.iloc[1]) #target column i.e genre
bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(X,y)
def genre_to_binary(series):
# Convert from Pandas Series to List:
genres_list = series.values.tolist()
# The model cannot convert the genres into floats itself so we need to do this:
converted_genre=[]
for i in np.arange(len(genres_list)):
if genres_list[i]=="action":
converted_genre=np.append(converted_genre,0)
else:
converted_genre=np.append(converted_genre,1)
return converted_genre
Это вызывает следующую ошибку:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-78-b772929399c8> in <module>
1 #apply SelectKBest class to extract top 10 best features
2 bestfeatures = SelectKBest(score_func=chi2, k=10)
----> 3 fit = bestfeatures.fit(X,y)
4 dfscores = pd.DataFrame(fit.scores_)
5 dfcolumns = pd.DataFrame(X.columns)
/srv/app/venv/lib/python3.6/site-packages/sklearn/feature_selection/univariate_selection.py in fit(self, X, y)
339 self : object
340 """
--> 341 X, y = check_X_y(X, y, ['csr', 'csc'], multi_output=True)
342
343 if not callable(self.score_func):
/srv/app/venv/lib/python3.6/site-packages/sklearn/utils/validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
717 ensure_min_features=ensure_min_features,
718 warn_on_dtype=warn_on_dtype,
--> 719 estimator=estimator)
720 if multi_output:
721 y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False,
/srv/app/venv/lib/python3.6/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
534 # make sure we actually converted to numeric:
535 if dtype_numeric and array.dtype.kind == "O":
--> 536 array = array.astype(np.float64)
537 if not allow_nd and array.ndim >= 3:
538 raise ValueError("Found array with dim %d. %s expected <= 2."
ValueError: could not convert string to float: 'natural born killers'
Это не имеет смысла для меня, поскольку «прирожденные убийцы» не должны появляться в X или Y.
Спасибо за вашу помощь! Вы можете найти файл movies.csv здесь