How to change the cost function and gradient descent for this dataset
0 votes
/ 13 May 2019

I am working on a project that predicts crop yield using logistic regression.

I have successfully declared all the functions, but I am unable to fit them to the requirements of my dataset.

Could someone help me with this?

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import numpy as np

#Importing the dataset
df = pd.read_csv(r'E:\shardul\Projects\Agri yantra\baramati.csv')
X = df.drop(['Production'], axis=1)   # take every column except the target 'Production' as the input matrix
y = df['Production']

#One-hot encode the 'crop' column
crop = pd.get_dummies(X['crop'], drop_first=True)
X = X.drop('crop', axis=1)

# concat the dummy variables
X = pd.concat([X, crop], axis=1)

#One-hot encode the 'region' column
region = pd.get_dummies(X['region'], drop_first=True)
X = X.drop('region', axis=1)

# concat the dummy variables
X = pd.concat([X, region], axis=1)
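
# Aside: the two get_dummies blocks above can also be written as one call;
# a minimal equivalent sketch (same drop_first behaviour assumed):
#
#     X = pd.get_dummies(X, columns=['crop', 'region'], drop_first=True)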
ynewtest = y    # keep unmodified copies for the single-row check at the end
xnewtest = X
y = y.values.reshape(-1, 1)   # convert the target Series to a column vector
print(X)
print('The shape of the input is {}'.format(X.shape))     #printing the shape of the input
print('The shape of the output is {}'.format(y.shape))    #printing the shape of the output
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=101)
print('The shape of the input training set is {}'.format(X_train.shape))
print('The shape of the output training set is {}'.format(y_train.shape))
print('The shape of the input testing set is {}'.format(X_test.shape))
print('The shape of the output testing set is {}'.format(y_test.shape))
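
# Aside (not part of the original post): the features span very different
# ranges (e.g. Area ~700 vs PH ~7), which tends to make gradient descent
# unstable at a fixed learning rate. A hedged sketch using standardization:
#
#     from sklearn.preprocessing import StandardScaler
#     scaler = StandardScaler()
#     X_train = scaler.fit_transform(X_train)   # fit the scaler on the training set only
#     X_test = scaler.transform(X_test)         # reuse the training statistics on the test set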

#We first define the sigmoid function, which is used by the later steps
def sigmoid(z):
    s = 1 / (1 + np.exp(-z))
    return s
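
# Aside: np.exp(-z) can overflow for large negative z. A numerically safer
# variant (a sketch, not in the original) clips z before exponentiating:
#
#     def sigmoid(z):
#         z = np.clip(z, -500, 500)    # keep np.exp in a representable range
#         return 1 / (1 + np.exp(-z))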

#This function initializes the weights and the bias to zero
def initialize_with_zeros(dim):
    w = np.zeros((dim, 1))
    b = 0
    return w, b

#Forward propagation: computes the activations and the cost, and returns the gradients
def propagate(w, b, X, y):
    m = X.shape[0]
    A = sigmoid(np.dot(X, w) + b)
    cost = -(1 / m) * np.sum(y * np.log(A) + (1 - y) * np.log(1 - A))   # cross-entropy cost (error) function
    dw = (1 / m) * np.dot(X.T, (A - y))   # derivative of the cost function with respect to w
    db = (1 / m) * np.sum(A - y)          # derivative of the cost function with respect to b
    grads = {'dw': dw, 'db': db}          # store the gradients in a dictionary to access them later
    return grads, cost
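
# Aside: judging from the output below, 'Production' is a continuous yield
# (e.g. 21000.0), not a 0/1 label, so the cross-entropy cost above is not
# well defined for it. One way to change the cost function, sketched under
# the assumption that plain linear regression is acceptable (this is my
# suggestion, not the poster's code):
#
#     def propagate_mse(w, b, X, y):
#         m = X.shape[0]
#         A = np.dot(X, w) + b                          # linear output, no sigmoid
#         cost = (1 / (2 * m)) * np.sum((A - y) ** 2)   # mean squared error
#         dw = (1 / m) * np.dot(X.T, (A - y))           # gradient w.r.t. w (same form as above)
#         db = (1 / m) * np.sum(A - y)                  # gradient w.r.t. b
#         return {'dw': dw, 'db': db}, cost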

#Gradient descent: update the parameters w and b using the gradients of the cost function
def optimize(w, b, X, y, num_iterations, learning_rate, print_cost=False):
    costs = []                      # collects the cost every 100 iterations
    for i in range(num_iterations):
        grads, cost = propagate(w, b, X, y)   # call the previously defined function
        dw = grads['dw']            # gradient of the cost with respect to w
        db = grads['db']            # gradient of the cost with respect to b
        w = w - learning_rate * dw  # update w so that the cost decreases over time
        b = b - learning_rate * db  # update b so that the cost decreases over time
        if i % 100 == 0:
            costs.append(np.squeeze(cost))    # record the cost in the list created above
        if print_cost and i % 1000 == 0:
            print("cost after iteration {}: {}".format(i, cost))
    plt.plot(costs)
    plt.ylabel('cost')
    plt.xlabel('iterations (per hundreds)')
    plt.title("Learning rate = " + str(learning_rate))
    plt.show()
    params = {'w': w, 'b': b}       # store the final parameters for later access
    grads = {'dw': dw, 'db': db}    # store the final gradients for later access
    return params, grads, costs
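
# Aside: a tiny standalone check of optimize() on toy data (names here are
# mine, for illustration only):
#
#     w0, b0 = initialize_with_zeros(2)
#     X_toy = np.array([[0., 0.], [1., 1.]])
#     y_toy = np.array([[0.], [1.]])
#     params, grads, costs = optimize(w0, b0, X_toy, y_toy, 2000, 0.1)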

#This function outputs 1 if the activation is greater than 0.5 and 0 otherwise
def predict(w, b, X):
    m = X.shape[0]
    y_prediction = np.zeros((m, 1))
    A = sigmoid(np.dot(X, w) + b)
    for i in range(A.shape[0]):
        if A[i, 0] <= 0.5:
            y_prediction[i, 0] = 0
        else:
            y_prediction[i, 0] = 1
    return y_prediction
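
# Aside: the loop above can be replaced by one vectorized line with the same
# behaviour (a sketch):
#
#     y_prediction = (sigmoid(np.dot(X, w) + b) > 0.5).astype(float)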

def model(X_train, X_test, y_train, y_test, num_iterations, learning_rate, print_cost=True):
    w, b = initialize_with_zeros(X_train.shape[1])
    parameters, grads, costs = optimize(w, b, X_train, y_train, num_iterations, learning_rate, print_cost=True)
    w = parameters["w"]
    b = parameters["b"]
    y_prediction_test = predict(w, b, X_test)
    y_prediction_train = predict(w, b, X_train)

    print('train accuracy: {} %'.format(100 - np.mean(np.abs(y_prediction_train - y_train)) * 100))
    print('test accuracy: {} %'.format(100 - np.mean(np.abs(y_prediction_test - y_test)) * 100))

    d = {"costs": costs,
         "y_prediction_test": y_prediction_test,
         "y_prediction_train": y_prediction_train,
         "w": w,
         "b": b,
         "learning_rate": learning_rate,
         "num_iterations": num_iterations}

    return d

d = model(X_train, X_test, y_train, y_test, num_iterations=100, learning_rate=0.0015, print_cost=True)
xpred = xnewtest
ypred = ynewtest
i = 10         # change this index to look at each row of the data and check the accuracy
xnewpred = xpred.iloc[i]
ynewpred = ypred.iloc[i]
print('The input values of the features are:')
print(xnewpred)
print('Actual crop yield is:')
print(float(ynewpred))
xnewpred = xnewpred.values.reshape(1, -1)   # shape the single row as (1, n_features)
ynew = predict(d["w"], d["b"], xnewpred)
print('The output of the predicted value is:')
print(ynew[0][0])
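
# Aside: classification_report and confusion_matrix are imported at the top
# but never used; if the target really were a 0/1 label, they could be
# applied like this (a sketch):
#
#     print(confusion_matrix(y_test, d["y_prediction_test"]))
#     print(classification_report(y_test, d["y_prediction_test"]))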

train accuracy: -1172738.8888888888 %
test accuracy: -2992973.0 %
The input values of the features are:
Area                       700.00
Iron\n                       9.20
IndexofProductivity        100.58
WaterHoldingCapacity        60.92
FieldCapacity               60.14
PH                           7.30
ElectricalConductivity       1.37
Organicmatter                0.93
Calciumcarbonate            10.88
Nitrogen                    71.00
phosphorous                 29.00
Potassium                  229.00
Rainfall                   457.00
jowar                        1.00
sugarcane                    0.00
wheat                        0.00
daund                        0.00
Name: 10, dtype: float64
Actual crop yield is:
21000.0
The output of the predicted value is:
1.0
...