Набор данных
Id,Cl.thickness,Cell.size,Cell.shape,Marg.adhesion,Epith.c.size,Bare.nuclei,Bl.cromatin,Normal.nucleoli,Mitoses,Class
1000025,5,1,1,1,2,1,3,1,1,benign
1002945,5,4,4,5,7,10,3,2,1,benign
Код ниже
import math
import numpy as np
import pandas as pd
#from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import learning_curve,GridSearchCV
from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold
from sklearn import preprocessing, metrics, svm, ensemble
from sklearn.metrics import accuracy_score, classification_report
import tabpy_client
# Breast Cancer dataset
# Citation: Dr. William H. Wolberg, University of Wisconsin Hospitals, Madison
# https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Original)
# Read the dataset (Note that the CSV provided for this demo has rows with the missing data removed)
df = pd.read_csv('breastcancer.csv', header=0)
# Take a look at the structure of the file
df.head(n=4)
# Drop Id column not used in analysis
df.drop(['Id'], 1, inplace=True)
# Use LabelEncoder to convert textual classifications to numeric.
# We will use the same encoder later to convert them back.
encoder = preprocessing.LabelEncoder()
df['Class'] = encoder.fit_transform(df['Class'])
# You could also do this manually in the following way:
# df['Class'] = df['Class'].map( {'benign': 0, 'malignant': 1} ).astype(int)
# Check the result of the transform
df.head(n=6)
# Split columns into independent/predictor variables vs dependent/response/outcome variable
X = np.array(df.drop(['Class'], 1))
y = np.array(df['Class'])
# Scale the data. We will use the same scaler later for scoring function
scaler = preprocessing.StandardScaler().fit(X)
X = scaler.transform(X)
# 10 fold stratified cross validation
kf = StratifiedKFold(y,n_splits=10, random_state=None, shuffle=True)
# Define the parameter grid to use for tuning the Support Vector Machine
parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
'C': [1, 10, 100, 1000]},
{'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
# Pick the goal you're optimizing for e.g. precision if you prefer fewer false-positives
# recall if you prefer fewer false-negatives. For demonstration purposes let's pick several
# Note that the final model selection will be based on the last item in the list
scoringmethods = ['f1','accuracy','precision', 'recall','roc_auc']
Почему n_splits
выдает ошибку
TypeError: __init__() got multiple values for argument 'n_splits'.
n_splits
- параметр в gridsearch