Буду признателен за любую помощь, так как я сталкиваюсь с ошибкой при использовании Pandas с функцией ниже.
Вот пример данных, с которыми я пытаюсь работать:
# Two equally long columns of sample values: ages and a target label.
example_data = dict(
    age=[37, 37, 27, 22, 32, 22, 42, 32, 37, 22],
    target=[0, 0, 2, 0, 0, 0, 0, 0, 2, 0],
)
# Build a DataFrame from the dict; each key becomes a column.
example_df = pd.DataFrame(example_data)
# Bare expression: displays the frame when run as a notebook cell.
example_df
Я вызвал функцию rdc следующим образом:
# Passes the plain Python lists stored in the example_data dict
# (not columns of example_df and not NumPy arrays).
# NOTE(review): the function shown further down is defined as `rdc`;
# `ldc` is presumably the same function under another name -- confirm.
ldc(x=example_data['age'],y=example_data['target'])
Однако я сталкиваюсь с проблемой:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-248-c4a80b9f6e55> in <module>()
----> 1 ldc(x=example_data['age'],y=example_data['target'])
<ipython-input-231-78a770305f24> in ldc(x, y, f, k, s, n)
29 return np.median(values)
30
---> 31 if len(x.shape) == 1: x = x.values.reshape((-1, 1))
32 if len(y.shape) == 1: y = y.values.reshape((-1, 1))
33
AttributeError: 'list' object has no attribute 'shape'
Ниже приводится сама функция, которую я использую:
"""
Implements the Randomized Dependence Coefficient
David Lopez-Paz, Philipp Hennig, Bernhard Schoelkopf
http://papers.nips.cc/paper/5138-the-randomized-dependence-coefficient.pdf
"""
import numpy as np
from scipy.stats import rankdata
def rdc(x, y, f=np.sin, k=20, s=1/6., n=1):
"""
Computes the Randomized Dependence Coefficient
x,y: numpy arrays 1-D or 2-D
If 1-D, size (samples,)
If 2-D, size (samples, variables)
f: function to use for random projection
k: number of random projections to use
s: scale parameter
n: number of times to compute the RDC and
return the median (for stability)
According to the paper, the coefficient should be relatively insensitive to
the settings of the f, k, and s parameters.
"""
if n > 1:
values = []
for i in range(n):
try:
values.append(rdc(x, y, f, k, s, 1))
except np.linalg.linalg.LinAlgError: pass
return np.median(values)
if len(x.shape) == 1: x = x.values.reshape((-1, 1))
if len(y.shape) == 1: y = y.values.reshape((-1, 1))
# Copula Transformation
cx = np.column_stack([rankdata(xc, method='ordinal') for xc in x.T])/float(x.size)
cy = np.column_stack([rankdata(yc, method='ordinal') for yc in y.T])/float(y.size)
# Add a vector of ones so that w.x + b is just a dot product
O = np.ones(cx.shape[0])
X = np.column_stack([cx, O])
Y = np.column_stack([cy, O])
# Random linear projections
Rx = (s/X.shape[1])*np.random.randn(X.shape[1], k)
Ry = (s/Y.shape[1])*np.random.randn(Y.shape[1], k)
X = np.dot(X, Rx)
Y = np.dot(Y, Ry)
# Apply non-linear function to random projections
fX = f(X)
fY = f(Y)
# Compute full covariance matrix
C = np.cov(np.hstack([fX, fY]).T)
# Due to numerical issues, if k is too large,
# then rank(fX) < k or rank(fY) < k, so we need
# to find the largest k such that the eigenvalues
# (canonical correlations) are real-valued
k0 = k
lb = 1
ub = k
while True:
# Compute canonical correlations
Cxx = C[:k, :k]
Cyy = C[k0:k0+k, k0:k0+k]
Cxy = C[:k, k0:k0+k]
Cyx = C[k0:k0+k, :k]
eigs = np.linalg.eigvals(np.dot(np.dot(np.linalg.inv(Cxx), Cxy),
np.dot(np.linalg.inv(Cyy), Cyx)))
# Binary search if k is too large
if not (np.all(np.isreal(eigs)) and
0 <= np.min(eigs) and
np.max(eigs) <= 1):
ub -= 1
k = (ub + lb) / 2
continue
if lb == ub: break
lb = k
if ub == lb + 1:
k = ub
else:
k = (ub + lb) / 2
return np.sqrt(np.max(eigs))