KNN- игроки НБА - PullRequest
       7

KNN- игроки НБА

0 голосов
/ 06 июня 2018

Я работал с K-ближайшими соседями в руководстве по Python по Dataquest.Работали над проблемами уже несколько месяцев, и каждый раз это что-то новое.Я перешел к последним нескольким строкам кода, но не смог выяснить, почему каждый раз, когда я пытаюсь подогнать модель по данным обучения, возникает следующая ошибка:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-59-0e474d4c7797> in <module>()
----> 1 knn.fit(train[x_columns], train[y_column])

C:\Users\aduran\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\neighbors\base.py in fit(self, X, y)
    739         """
    740         if not isinstance(X, (KDTree, BallTree)):
--> 741             X, y = check_X_y(X, y, "csr", multi_output=True)
    742         self._y = y
    743         return self._fit(X)

C:\Users\aduran\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_X_y(X, y, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
    519     X = check_array(X, accept_sparse, dtype, order, copy, force_all_finite,
    520                     ensure_2d, allow_nd, ensure_min_samples,
--> 521                     ensure_min_features, warn_on_dtype, estimator)
    522     if multi_output:
    523         y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False,

C:\Users\aduran\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    405                              % (array.ndim, estimator_name))
    406         if force_all_finite:
--> 407             _assert_all_finite(array)
    408 
    409     shape_repr = _shape_repr(array.shape)

C:\Users\aduran\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\utils\validation.py in _assert_all_finite(X)
     56             and not np.isfinite(X).all()):
     57         raise ValueError("Input contains NaN, infinity"
---> 58                          " or a value too large for %r." % X.dtype)
     59 
     60 

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

Мой код предоставленниже и ссылка для фактического исследования .Любая помощь в том, почему, когда я делаю код knn.fit(train[x_columns], train[y_column], появляется сообщение об ошибке выше, было бы полезно.

    import pandas
with open("C:/Users/aduran/Downloads/nba_data.csv", 'r') as csvfile:
    nba = pandas.read_csv(csvfile)

print(nba.columns.values)

#Select Lebron James from our dataset
selected_player = nba[nba["player"] == "LeBron James"].iloc[0]

#Choose only the numeric columns(we'll use for computing euclidean distance)
distance_columns = ['age', 'g', 'gs', 'mp', 'fg', 'fga',
'fg.', 'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.', 'ft',
'fta', 'ft.', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf',
'pts']

#create euclidean distance
def euclidean_distance(row) :
    inner_value = 0

    for k in distance_columns:
        print(k)
        print(selected_player[k])
        inner_value += (row[k] - selected_player[k]) ** 2
        print(inner_value)
        print('-----')
        print(math.sqrt(inner_value))
    print('=====')
    return math.sqrt(inner_value)

import math

#find the distance from each player in the dataset to LeBron
lebron_distance = nba.apply(euclidean_distance, axis = 1)

#Select only the numeric columns from the NBA dataset
nba_numeric = nba[distance_columns]

#NORMALIZE ALL OF THE NUMERIC COLUMNS
nba_normalized = (nba_numeric - nba_numeric.mean() / nba_numeric.std())

#Finding the nearest neighbor
from scipy.spatial import distance

#Fill in the NA values in nba_normalized
nba_normalized.fillna(0, inplace=True)

#Find the normalized vector for Lebron James
lebron_normalized = nba_normalized[nba["player"] == "LeBron James"]

#Find the distance between Lebron James and everyon else
euclidean_distances = nba_normalized.apply(lambda row:
                        distance. euclidean(row, lebron_normalized), axis=1)

# Create a new dataframe with distances.
distance_frame = pandas.DataFrame(data={"dist": euclidean_distances, "idx": euclidean_distances.index})
distance_frame.sort("dist", inplace=True)

#Find the most similiar player to LeBron
second_smallest = distance_frame.iloc[1]["idx"] 
most_similiar_to_lebron = nba.loc[int(second_smallest)]["player"]

#Generating Training and testing sets to accomodate overfitting

import random
from numpy.random import permutation

#Randomly shuffle the index of the nba
random_indices = permutation(nba.index)

#Set cutoff for how many items we want in the test set (in this case 1/3)
test_cutoff = math.floor(len(nba)/3)

#Generate the test set by taking the first 1/3 of the randomly shuffled indices
test = nba.loc[random_indices[1:test_cutoff]]

#Generate the train set with the rest of the data
train = nba.loc[random_indices[test_cutoff:]]

#The columns that weill be making predictions with
x_columns = ['age', 'g', 'gs', 'mp', 'fg', 'fga',
'fg.', 'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.', 'ft',
'fta', 'ft.', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf']

#the column that we want to predict
y_column = ["pts"] 

from sklearn.neighbors import KNeighborsRegressor

#Create the knn model #Look at the 5 closest neighbors
knn = KNeighborsRegressor(n_neighbors=5)

#Fit the model on the training data
knn.fit(train[x_columns], train[y_column])
...