Я очень новичок в DataScience / Pandas в целом.Я в основном следовал за этим и мог заставить его работать с использованием разных классификаторов.
import pandas as pd
import src.helper as helper
import time
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
# Headings
headings = ['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor', 'gill-attachment', 'gill-spacing',
'gill-size', 'gill-color', 'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type',
'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat']
# Load the data
shrooms = pd.read_csv('data/shrooms_no_header.csv', names=headings, converters={"header": float})
# Replace the ? in 'stalk-root' with 0
shrooms.loc[shrooms['stalk-root'] == '?', 'stalk-root'] = np.nan
shrooms.fillna(0, inplace=True)
# Remove columns with only one unique value
for col in shrooms.columns.values:
if len(shrooms[col].unique()) <= 1:
print("Removing column {}, which only contains the value: {}".format(col, shrooms[col].unique()[0]))
shrooms.drop(col, axis=1, inplace=True)
# Col to predict later
col_predict = 'class'
# Binary Encoding
all_cols = list(shrooms.columns.values)
all_cols.remove(col_predict)
helper.encode(shrooms, [col_predict])
# Expand Shrooms DataFrame to Binary Values
helper.expand(shrooms, all_cols)
# Remove the class we want to predict
x_all = list(shrooms.columns.values)
x_all.remove(col_predict)
# Set Train/Test ratio
ratio = 0.7
# Split the DF
df_train, df_test, X_train, Y_train, X_test, Y_test = helper.split_df(shrooms, col_predict, x_all, ratio)
# Try different classifier
# TODO: Batch Use to compare
classifier = GradientBoostingClassifier(n_estimators=1000)
# TODO: Optimize Hyperparamter (where applicable)
# Time the training
timer_start = time.process_time()
classifier.fit(X_train, Y_train)
timer_stop = time.process_time()
time_diff = timer_stop - timer_start
# Get the score
score_train = classifier.score(X_train, Y_train)
score_test = classifier.score(X_test, Y_test)
print('Train Score {}, Test Score {}, Time {}'.format(score_train, score_test, time_diff))
# TODO: Test a manual DataFrame
"Помощники" - это функции, которые я не совсем понимаю, но они работают:
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
def split_df(df, y_col, x_cols, ratio):
"""
This method transforms a dataframe into a train and test set, for this you need to specify:
1. the ratio train : test (usually 0.7)
2. the column with the Y_values
"""
mask = np.random.rand(len(df)) < ratio
train = df[mask]
test = df[~mask]
y_train = train[y_col].values
y_test = test[y_col].values
x_train = train[x_cols].values
x_test = test[x_cols].values
return train, test, x_train, y_train, x_test, y_test
def encode(df, columns):
for col in columns:
le = LabelEncoder()
col_values_unique = list(df[col].unique())
le_fitted = le.fit(col_values_unique)
col_values = list(df[col].values)
le.classes_
col_values_transformed = le.transform(col_values)
df[col] = col_values_transformed
def expand(df, list_columns):
for col in list_columns:
colvalues = df[col].unique()
for colvalue in colvalues:
newcol_name = "{}_is_{}".format(col, colvalue)
df.loc[df[col] == colvalue, newcol_name] = 1
df.loc[df[col] != colvalue, newcol_name] = 0
df.drop(list_columns, inplace=True, axis=1)
def correlation_to(df, col):
correlation_matrix = df.corr()
correlation_type = correlation_matrix[col].copy()
abs_correlation_type = correlation_type.apply(lambda x: abs(x))
desc_corr_values = abs_correlation_type.sort_values(ascending=False)
y_values = list(desc_corr_values.values)[1:]
x_values = range(0, len(y_values))
xlabels = list(desc_corr_values.keys())[1:]
fig, ax = plt.subplots(figsize=(8, 8))
ax.bar(x_values, y_values)
ax.set_title('The correlation of all features with {}'.format(col), fontsize=20)
ax.set_ylabel('Pearson correlatie coefficient [abs waarde]', fontsize=16)
plt.xticks(x_values, xlabels, rotation='vertical')
plt.show()
Я бы хотел провести «ручной» тест, такой как ввод атрибутов x и получение прогноза на основе этого.
Так, например, я жестко кодирую DataFrame, как показано ниже:
manual = pd.DataFrame({
"cap-shape": ["x"],
"cap-surface": ["s"],
"cap-color": ["n"],
"bruises": ["f"],
"odor": ["n"],
"gill-attachment": ["a"],
"gill-spacing": ["c"],
"gill-size": ["b"],
"gill-color": ["y"],
"stalk-shape": ["e"],
"stalk-root": ["?"],
"stalk-surface-above-ring": ["s"],
"stalk-surface-below-ring": ["s"],
"stalk-color-above-ring": ["o"],
"stalk-color-below-ring": ["o"],
"veil-type": ["p"],
"veil-color": ["o"],
"ring-number": ["o"],
"ring-type": ["p"],
"spore-print-color": ["o"],
"population": ["c"],
"habitat": ["l"]
})
Как бы я применил ту же кодировку?В моем коде написано helper.encode(manual, [col_predict])
, но в руководстве ofc нет col_predict
?
Пожалуйста, имейте в виду, что я новичок, я искал в Интернете все, но не могу найти правильный источник/ учебник, который позволяет мне тестировать один набор.
Полный код можно найти здесь .