Я работал с K-ближайшими соседями в руководстве по Python по Dataquest.Работали над проблемами уже несколько месяцев, и каждый раз это что-то новое.Я перешел к последним нескольким строкам кода, но не смог выяснить, почему каждый раз, когда я пытаюсь подогнать модель по данным обучения, возникает следующая ошибка:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-59-0e474d4c7797> in <module>()
----> 1 knn.fit(train[x_columns], train[y_column])
C:\Users\aduran\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\neighbors\base.py in fit(self, X, y)
739 """
740 if not isinstance(X, (KDTree, BallTree)):
--> 741 X, y = check_X_y(X, y, "csr", multi_output=True)
742 self._y = y
743 return self._fit(X)
C:\Users\aduran\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_X_y(X, y, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
519 X = check_array(X, accept_sparse, dtype, order, copy, force_all_finite,
520 ensure_2d, allow_nd, ensure_min_samples,
--> 521 ensure_min_features, warn_on_dtype, estimator)
522 if multi_output:
523 y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False,
C:\Users\aduran\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
405 % (array.ndim, estimator_name))
406 if force_all_finite:
--> 407 _assert_all_finite(array)
408
409 shape_repr = _shape_repr(array.shape)
C:\Users\aduran\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\utils\validation.py in _assert_all_finite(X)
56 and not np.isfinite(X).all()):
57 raise ValueError("Input contains NaN, infinity"
---> 58 " or a value too large for %r." % X.dtype)
59
60
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
Мой код предоставленниже и ссылка для фактического исследования .Любая помощь в том, почему, когда я делаю код knn.fit(train[x_columns], train[y_column]
, появляется сообщение об ошибке выше, было бы полезно.
import pandas
with open("C:/Users/aduran/Downloads/nba_data.csv", 'r') as csvfile:
nba = pandas.read_csv(csvfile)
print(nba.columns.values)
#Select Lebron James from our dataset
selected_player = nba[nba["player"] == "LeBron James"].iloc[0]
#Choose only the numeric columns(we'll use for computing euclidean distance)
distance_columns = ['age', 'g', 'gs', 'mp', 'fg', 'fga',
'fg.', 'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.', 'ft',
'fta', 'ft.', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf',
'pts']
#create euclidean distance
def euclidean_distance(row) :
inner_value = 0
for k in distance_columns:
print(k)
print(selected_player[k])
inner_value += (row[k] - selected_player[k]) ** 2
print(inner_value)
print('-----')
print(math.sqrt(inner_value))
print('=====')
return math.sqrt(inner_value)
import math
#find the distance from each player in the dataset to LeBron
lebron_distance = nba.apply(euclidean_distance, axis = 1)
#Select only the numeric columns from the NBA dataset
nba_numeric = nba[distance_columns]
#NORMALIZE ALL OF THE NUMERIC COLUMNS
nba_normalized = (nba_numeric - nba_numeric.mean() / nba_numeric.std())
#Finding the nearest neighbor
from scipy.spatial import distance
#Fill in the NA values in nba_normalized
nba_normalized.fillna(0, inplace=True)
#Find the normalized vector for Lebron James
lebron_normalized = nba_normalized[nba["player"] == "LeBron James"]
#Find the distance between Lebron James and everyon else
euclidean_distances = nba_normalized.apply(lambda row:
distance. euclidean(row, lebron_normalized), axis=1)
# Create a new dataframe with distances.
distance_frame = pandas.DataFrame(data={"dist": euclidean_distances, "idx": euclidean_distances.index})
distance_frame.sort("dist", inplace=True)
#Find the most similiar player to LeBron
second_smallest = distance_frame.iloc[1]["idx"]
most_similiar_to_lebron = nba.loc[int(second_smallest)]["player"]
#Generating Training and testing sets to accomodate overfitting
import random
from numpy.random import permutation
#Randomly shuffle the index of the nba
random_indices = permutation(nba.index)
#Set cutoff for how many items we want in the test set (in this case 1/3)
test_cutoff = math.floor(len(nba)/3)
#Generate the test set by taking the first 1/3 of the randomly shuffled indices
test = nba.loc[random_indices[1:test_cutoff]]
#Generate the train set with the rest of the data
train = nba.loc[random_indices[test_cutoff:]]
#The columns that weill be making predictions with
x_columns = ['age', 'g', 'gs', 'mp', 'fg', 'fga',
'fg.', 'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.', 'ft',
'fta', 'ft.', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf']
#the column that we want to predict
y_column = ["pts"]
from sklearn.neighbors import KNeighborsRegressor
#Create the knn model #Look at the 5 closest neighbors
knn = KNeighborsRegressor(n_neighbors=5)
#Fit the model on the training data
knn.fit(train[x_columns], train[y_column])