Я работаю в виртуальной среде, настроенной по инструкции: https://docs.python.org/3/tutorial/venv.html
Кроме того, я использую Jupyter Notebook.
В своем коде я использую sklearn.model_selection.cross_val_score(...). Похоже, проблема связана с параметром n_jobs: при n_jobs=1 ошибок нет, а при n_jobs=-1 возникает следующая ошибка:
---------------------------------------------------------------------------
_RemoteTraceback Traceback (most recent call last)
_RemoteTraceback:
'''
Traceback (most recent call last):
File "c:\users\chang\ml\lib\site-packages\sklearn\externals\joblib\externals\loky\process_executor.py", line 391, in _process_worker
call_item = call_queue.get(block=True, timeout=timeout)
File "C:\Users\chang\AppData\Local\Programs\Python\Python37-32\lib\multiprocessing\queues.py", line 99, in get
if not self._rlock.acquire(block, timeout):
PermissionError: [WinError 5] Access is denied
'''
The above exception was the direct cause of the following exception:
BrokenProcessPool Traceback (most recent call last)
<ipython-input-10-56afe11b41fd> in <module>
11 X_poly = poly.fit_transform(X)
12
---> 13 score = cross_val_score(lgr_clf, X_poly, y, cv=loo, scoring='accuracy', n_jobs=-1).mean()
14 scores.append(score)
15
c:\users\chang\ml\lib\site-packages\sklearn\model_selection\_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, error_score)
400 fit_params=fit_params,
401 pre_dispatch=pre_dispatch,
--> 402 error_score=error_score)
403 return cv_results['test_score']
404
c:\users\chang\ml\lib\site-packages\sklearn\model_selection\_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
238 return_times=True, return_estimator=return_estimator,
239 error_score=error_score)
--> 240 for train, test in cv.split(X, y, groups))
241
242 zipped_scores = list(zip(*scores))
c:\users\chang\ml\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
928
929 with self._backend.retrieval_context():
--> 930 self.retrieve()
931 # Make sure that we get a last message telling us we are done
932 elapsed_time = time.time() - self._start_time
c:\users\chang\ml\lib\site-packages\sklearn\externals\joblib\parallel.py in retrieve(self)
831 try:
832 if getattr(self._backend, 'supports_timeout', False):
--> 833 self._output.extend(job.get(timeout=self.timeout))
834 else:
835 self._output.extend(job.get())
c:\users\chang\ml\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in wrap_future_result(future, timeout)
519 AsyncResults.get from multiprocessing."""
520 try:
--> 521 return future.result(timeout=timeout)
522 except LokyTimeoutError:
523 raise TimeoutError()
~\AppData\Local\Programs\Python\Python37-32\lib\concurrent\futures\_base.py in result(self, timeout)
430 raise CancelledError()
431 elif self._state == FINISHED:
--> 432 return self.__get_result()
433 else:
434 raise TimeoutError()
~\AppData\Local\Programs\Python\Python37-32\lib\concurrent\futures\_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.
------------------------------------------------------------------------------
У меня есть второй компьютер, на котором этот код работает, но виртуальная среда там не настроена.
Запуск cmd от имени администратора не решает проблему.
Каталог виртуальной среды не добавлен в переменные среды, однако в них указан C:\Users\chang\AppData\Local\Programs\Python\Python37-32.
Я подозреваю, что упускаю какой-то важный шаг при настройке виртуальной среды, и это приводит к ошибке PermissionError: [WinError 5] Access is denied.
#!/usr/bin/env python
# coding: utf-8
# In[14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.linear_model as skl_lm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, LeaveOneOut, KFold, cross_val_score
from sklearn.preprocessing import PolynomialFeatures
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report, precision_score
from sklearn import preprocessing
from sklearn import neighbors
import statsmodels.api as sm
import statsmodels.formula.api as smf
# Notebook-export form of the `%matplotlib inline` magic; get_ipython() is
# only defined when this runs inside an IPython/Jupyter session.
get_ipython().run_line_magic('matplotlib', 'inline')
# NOTE(review): newer matplotlib releases renamed the bundled seaborn styles
# (e.g. 'seaborn-v0_8-white') -- confirm the installed version accepts this name.
plt.style.use('seaborn-white')
# In[15]:
# Load the data set from the working directory, using the first CSV column
# as the row index, then print a summary of its columns.
df = pd.read_csv('Default.csv', index_col = 0)
df.info()
# In[16]:
## Estimate the test error from 3 random train/test splits.
# For every (polynomial degree, split) pair: expand the predictors, hold out
# 30% of the rows at random, fit a logistic regression on the remainder and
# store the held-out accuracy in Z[degree index, split index].
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Predictors and response, with "Yes"/"No" recoded as 1/0.
cols = ['student', 'balance', 'income']
X = df[cols].replace("Yes", 1).replace("No", 0)
y = df['default'].replace("Yes", 1).replace("No", 0)

t_prop = 0.5
poly_order = np.arange(1, 4)  # degrees
r_state = np.arange(3)  # number of splits
Z = np.zeros((poly_order.size, r_state.size))
# X1[i, j] is the degree and Y1[i, j] the split number of grid cell (i, j).
X1, Y1 = np.meshgrid(poly_order, r_state, indexing='ij')

for (i, j), v in np.ndenumerate(Z):
    poly = PolynomialFeatures(int(X1[i, j]))
    X_poly = poly.fit_transform(X)
    # No random_state is passed, so every iteration draws a fresh split.
    X_train, X_test, y_train, y_test = train_test_split(
        X_poly, y, test_size=0.3
    )  # random_state=42
    y_train_default = y_train == 1
    y_test_default = y_test == 1
    lgr_clf = LogisticRegression(solver="lbfgs").fit(X_train, y_train_default)
    y_train_pred = lgr_clf.predict(X_train)
    y_test_pred = lgr_clf.predict(X_test)
    Z[i, j] = metrics.accuracy_score(y_test, y_test_pred)

# Each column of Z (one split) becomes one line: accuracy versus degree.
plt.plot(X1, Z)
plt.title('{} random splits of the data set'.format(r_state.max() + 1))
plt.xlabel('Degree of Polynomial')
plt.ylabel('Accuracy Score')
plt.xlim(1, 3)
plt.ylim(.94, 1)
# In[17]:
## LOOCV and 3-fold CV accuracy for each polynomial degree.
# Relies on X, y, poly_order, r_state and lgr_clf left behind by the
# preceding random-split cell.
loo = LeaveOneOut()
scores = list()
# Restrict both CV schemes to the first 2500 rows.
X = X.iloc[:2500]
y = y.iloc[:2500]
for degree in poly_order:
    poly = PolynomialFeatures(int(degree))
    X_poly = poly.fit_transform(X)
    # NOTE(review): the report accompanying this script shows this call
    # failing with BrokenProcessPool / PermissionError [WinError 5] inside a
    # Windows venv when n_jobs=-1, while n_jobs=1 runs without error.
    score = cross_val_score(
        lgr_clf, X_poly, y, cv=loo, scoring='accuracy', n_jobs=-1
    ).mean()
    scores.append(score)

# k-fold CV
folds = 3
elements = len(df.index)
X1, Y1 = np.meshgrid(poly_order, r_state, indexing='ij')
Z3 = np.zeros((poly_order.size, r_state.size))
for (i, j), v in np.ndenumerate(Z3):
    poly = PolynomialFeatures(int(X1[i, j]))
    X_poly = poly.fit_transform(X)
    # shuffle=True is required for random_state to take effect; without it
    # every seed yields the same contiguous folds (and recent scikit-learn
    # versions reject a random_state passed with shuffle=False).
    kfold = KFold(n_splits=folds, shuffle=True, random_state=int(Y1[i, j]))
    Z3[i, j] = cross_val_score(
        lgr_clf, X_poly, y, cv=kfold, scoring='accuracy'
    ).mean()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
# scoring='accuracy' is reported as-is (higher is better), so no sign flip
# is needed before plotting.
# Left plot
ax1.plot(poly_order, np.array(scores), '-o')
ax1.set_title('LOOCV')
# Right plot: one line per seed (column of Z3).
ax2.plot(X1, Z3, '-o')
ax2.set_title('3-fold CV')
for ax in fig.axes:
    # The plotted quantity is accuracy, not mean squared error.
    ax.set_ylabel('Accuracy')
    ax.set_xlabel('Degree of Polynomial')
    ax.set_ylim(0.9, 1)
    ax.set_xlim(0.5, 3.5)
# In[18]:
## Estimate the test error from 4 random train/test splits.
# For every (polynomial degree, split) pair: expand the predictors, hold out
# 30% of the rows at random, fit a logistic regression on the remainder and
# store the held-out accuracy in Z[degree index, split index].
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Predictors and response, with "Yes"/"No" recoded as 1/0.
cols = ['student', 'balance', 'income']
X = df[cols].replace("Yes", 1).replace("No", 0)
y = df['default'].replace("Yes", 1).replace("No", 0)

t_prop = 0.5
poly_order = np.arange(1, 4)  # degrees
r_state = np.arange(4)  # number of splits
Z = np.zeros((poly_order.size, r_state.size))
# X1[i, j] is the degree and Y1[i, j] the split number of grid cell (i, j).
X1, Y1 = np.meshgrid(poly_order, r_state, indexing='ij')

for (i, j), v in np.ndenumerate(Z):
    poly = PolynomialFeatures(int(X1[i, j]))
    X_poly = poly.fit_transform(X)
    # No random_state is passed, so every iteration draws a fresh split.
    X_train, X_test, y_train, y_test = train_test_split(
        X_poly, y, test_size=0.3
    )  # random_state=42
    y_train_default = y_train == 1
    y_test_default = y_test == 1
    lgr_clf = LogisticRegression(solver="lbfgs").fit(X_train, y_train_default)
    y_train_pred = lgr_clf.predict(X_train)
    y_test_pred = lgr_clf.predict(X_test)
    Z[i, j] = metrics.accuracy_score(y_test, y_test_pred)

# Each column of Z (one split) becomes one line: accuracy versus degree.
plt.plot(X1, Z)
plt.title('{} random splits of the data set'.format(r_state.max() + 1))
plt.xlabel('Degree of Polynomial')
plt.ylabel('Accuracy Score')
plt.xlim(1, 3)
plt.ylim(.94, 1)
# In[19]:
## LOOCV and 4-fold CV accuracy for each polynomial degree.
# Relies on X, y, poly_order, r_state and lgr_clf left behind by the
# preceding random-split cell.
loo = LeaveOneOut()
scores = list()
# Restrict both CV schemes to the first 2500 rows.
X = X.iloc[:2500]
y = y.iloc[:2500]
for degree in poly_order:
    poly = PolynomialFeatures(int(degree))
    X_poly = poly.fit_transform(X)
    # NOTE(review): the report accompanying this script shows this call
    # failing with BrokenProcessPool / PermissionError [WinError 5] inside a
    # Windows venv when n_jobs=-1, while n_jobs=1 runs without error.
    score = cross_val_score(
        lgr_clf, X_poly, y, cv=loo, scoring='accuracy', n_jobs=-1
    ).mean()
    scores.append(score)

# k-fold CV
folds = 4
elements = len(df.index)
X1, Y1 = np.meshgrid(poly_order, r_state, indexing='ij')
Z4 = np.zeros((poly_order.size, r_state.size))
for (i, j), v in np.ndenumerate(Z4):
    poly = PolynomialFeatures(int(X1[i, j]))
    X_poly = poly.fit_transform(X)
    # shuffle=True is required for random_state to take effect; without it
    # every seed yields the same contiguous folds (and recent scikit-learn
    # versions reject a random_state passed with shuffle=False).
    kfold = KFold(n_splits=folds, shuffle=True, random_state=int(Y1[i, j]))
    Z4[i, j] = cross_val_score(
        lgr_clf, X_poly, y, cv=kfold, scoring='accuracy'
    ).mean()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
# scoring='accuracy' is reported as-is (higher is better), so no sign flip
# is needed before plotting.
# Left plot
ax1.plot(poly_order, np.array(scores), '-o')
ax1.set_title('LOOCV')
# Right plot: one line per seed (column of Z4).
ax2.plot(X1, Z4, '-o')
ax2.set_title('4-fold CV')
for ax in fig.axes:
    # The plotted quantity is accuracy, not mean squared error.
    ax.set_ylabel('Accuracy')
    ax.set_xlabel('Degree of Polynomial')
    ax.set_ylim(0.9, 1)
    ax.set_xlim(0.5, 3.5)
# In[21]:
## Estimate the test error from 5 random train/test splits.
# For every (polynomial degree, split) pair: expand the predictors, hold out
# 30% of the rows at random, fit a logistic regression on the remainder and
# store the held-out accuracy in Z[degree index, split index].
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Predictors and response, with "Yes"/"No" recoded as 1/0.
cols = ['student', 'balance', 'income']
X = df[cols].replace("Yes", 1).replace("No", 0)
y = df['default'].replace("Yes", 1).replace("No", 0)

t_prop = 0.5
poly_order = np.arange(1, 4)  # degrees
r_state = np.arange(5)  # number of splits
Z = np.zeros((poly_order.size, r_state.size))
# X1[i, j] is the degree and Y1[i, j] the split number of grid cell (i, j).
X1, Y1 = np.meshgrid(poly_order, r_state, indexing='ij')

for (i, j), v in np.ndenumerate(Z):
    poly = PolynomialFeatures(int(X1[i, j]))
    X_poly = poly.fit_transform(X)
    # No random_state is passed, so every iteration draws a fresh split.
    X_train, X_test, y_train, y_test = train_test_split(
        X_poly, y, test_size=0.3
    )  # random_state=42
    y_train_default = y_train == 1
    y_test_default = y_test == 1
    lgr_clf = LogisticRegression(solver="lbfgs").fit(X_train, y_train_default)
    y_train_pred = lgr_clf.predict(X_train)
    y_test_pred = lgr_clf.predict(X_test)
    Z[i, j] = metrics.accuracy_score(y_test, y_test_pred)

# Each column of Z (one split) becomes one line: accuracy versus degree.
plt.plot(X1, Z)
plt.title('{} random splits of the data set'.format(r_state.max() + 1))
plt.xlabel('Degree of Polynomial')
plt.ylabel('Accuracy Score')
plt.xlim(1, 3)
plt.ylim(.94, 1)
# In[22]:
## LOOCV and 5-fold CV accuracy for each polynomial degree.
# Relies on X, y, poly_order, r_state and lgr_clf left behind by the
# preceding random-split cell.
loo = LeaveOneOut()
scores = list()
# Restrict both CV schemes to the first 2500 rows.
X = X.iloc[:2500]
y = y.iloc[:2500]
for degree in poly_order:
    poly = PolynomialFeatures(int(degree))
    X_poly = poly.fit_transform(X)
    # NOTE(review): the report accompanying this script shows this call
    # failing with BrokenProcessPool / PermissionError [WinError 5] inside a
    # Windows venv when n_jobs=-1, while n_jobs=1 runs without error.
    score = cross_val_score(
        lgr_clf, X_poly, y, cv=loo, scoring='accuracy', n_jobs=-1
    ).mean()
    scores.append(score)

# k-fold CV
folds = 5
elements = len(df.index)
X1, Y1 = np.meshgrid(poly_order, r_state, indexing='ij')
Z5 = np.zeros((poly_order.size, r_state.size))
for (i, j), v in np.ndenumerate(Z5):
    poly = PolynomialFeatures(int(X1[i, j]))
    X_poly = poly.fit_transform(X)
    # shuffle=True is required for random_state to take effect; without it
    # every seed yields the same contiguous folds (and recent scikit-learn
    # versions reject a random_state passed with shuffle=False).
    kfold = KFold(n_splits=folds, shuffle=True, random_state=int(Y1[i, j]))
    Z5[i, j] = cross_val_score(
        lgr_clf, X_poly, y, cv=kfold, scoring='accuracy'
    ).mean()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
# scoring='accuracy' is reported as-is (higher is better), so no sign flip
# is needed before plotting.
# Left plot
ax1.plot(poly_order, np.array(scores), '-o')
ax1.set_title('LOOCV')
# Right plot: one line per seed (column of Z5).
ax2.plot(X1, Z5, '-o')
ax2.set_title('5-fold CV')
for ax in fig.axes:
    # The plotted quantity is accuracy, not mean squared error.
    ax.set_ylabel('Accuracy')
    ax.set_xlabel('Degree of Polynomial')
    ax.set_ylim(0.9, 1)
    ax.set_xlim(0.5, 3.5)
# In[23]:
#Analysis
# When comparing LOOCV to the random splits, it can be seen that the
# LOOCV result is closest to the linear model with polynomial degree one.
# The same holds when LOOCV is compared to the k-fold CV.
# In addition, the number of folds does not cause a large deviation
# compared to LOOCV. This supports the statement made in class that having
# a large or small number of folds does not necessarily make the model better.
# In[ ]: