Трудно сказать, что вы пытаетесь сделать. С одной стороны, 100 тыс. записей — это НЕ много. Судя по тому, что вы упоминаете категориальные данные, речь идёт о задаче классификации, поэтому именно на ней я сосредоточусь в приведённом ниже примере кода.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
from pylab import rcParams
import seaborn as sb
import scipy
from scipy.stats import spearmanr
import sklearn
from sklearn import preprocessing
from sklearn.preprocessing import scale
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
#from sklearn.cross_validation import train_test_split
import sklearn.metrics as sm
# Import the example dataset (mtcars) from the web.
url = 'https://python-graph-gallery.com/wp-content/uploads/mtcars.csv'
df = pd.read_csv(url)

# Check for nulls; print the per-column counts so the result is visible
# when run as a script (a bare expression is silently discarded).
print(df.isnull().sum())

# Feature matrix: columns at positions 2 and 10.
# NOTE(review): the original comment suggested these are ["drat", "carb"],
# but that may not match positions 2 and 10 of this CSV — verify against
# df.columns before relying on the names.
data = df.iloc[:, [2, 10]].values
# Target vector; .ravel() flattens the (n, 1) column selection into the
# 1-D shape scikit-learn expects (avoids DataConversionWarning).
y = df.iloc[:, [1]].values.ravel()

# Classification.
# Bug fix: the original called LogReg.predict(x) without ever creating or
# fitting LogReg (and `x` was undefined) — that raises NameError. Create
# and fit the model first, then predict on the same features.
LogReg = LogisticRegression()
LogReg.fit(data, y)
y_predict = LogReg.predict(data)

from sklearn.metrics import classification_report
report = classification_report(y, y_predict)
print(report)
# Result:
#               precision    recall  f1-score   support
#            4       0.91      0.91      0.91        11
#            6       1.00      0.43      0.60         7
#            8       0.78      1.00      0.88        14
#     accuracy                           0.84        32
#    macro avg       0.90      0.78      0.79        32
# weighted avg       0.87      0.84      0.83        32
# continuing...
# Hold out 25% of the rows for evaluation. train_size=0.75 already implies
# the complementary test_size=0.25, so the redundant argument is dropped.
X_train, X_test, y_train, y_test = train_test_split(data, y,
                                                    train_size=0.75)

from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the training split.
# NOTE(review): the target is the same column used as class labels above —
# a linear regression on class labels is of limited use; confirm intent.
lm = LinearRegression()
# np.ravel flattens a (n, 1) target column to the 1-D shape scikit-learn
# expects (avoids DataConversionWarning; a no-op if y is already 1-D).
lm.fit(X_train, np.ravel(y_train))
predictions = lm.predict(X_test)

# Predicted vs. actual scatter plot; add plt.show() when running as a
# plain script (not needed in a notebook's inline backend).
plt.scatter(y_test, predictions)

# Find the best pipeline: run TPOT's genetic pipeline search, capped at
# 5 minutes of total optimization time.
tpot = TPOTClassifier(verbosity=2, max_time_mins=5, population_size=40)
tpot.fit(X_train, np.ravel(y_train))
# Final Result:
# Best pipeline: RandomForestClassifier(SGDClassifier(FastICA(input_matrix, tol=0.6000000000000001), alpha=0.01, eta0=1.0, fit_intercept=True, l1_ratio=0.75, learning_rate=invscaling, loss=modified_huber, penalty=elasticnet, power_t=0.1), bootstrap=False, criterion=entropy, max_features=0.45, min_samples_leaf=1, min_samples_split=3, n_estimators=100)
# Out[128]:
# TPOTClassifier(config_dict=None, crossover_rate=0.1, cv=5,
#                disable_update_check=False, early_stop=None, generations=100,
#                max_eval_time_mins=5, max_time_mins=5, memory=None,
#                mutation_rate=0.9, n_jobs=1, offspring_size=None,
#                periodic_checkpoint_folder=None, population_size=40,
#                random_state=None, scoring=None, subsample=1.0, template=None,
#                use_dask=False, verbosity=2, warm_start=False)