H2o AutoML неожиданные результаты для прогноза модели - PullRequest
0 голосов
/ 17 марта 2020

Я новичок в AutoML. При работе с набором данных Lending Club вызов model.predict(test) даёт неожиданные результаты.

# Import necessary packages
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

# Initialize the H2O instance used by every step below.
h2o.init()

# Location of the Lending Club CSV. If possible, download the file from this
# S3 link and point `path` at the downloaded copy instead.
path = "http://h2o-public-test-data.s3.amazonaws.com/bigdata/laptop/lending-club/LoanStats3a.csv"

# Column type overrides applied at import time. Columns that are munged later
# are read as "string"; last_credit_pull_d is read as "factor".
types = {
    "int_rate": "string",
    "revol_util": "string",
    "emp_length": "string",
    "earliest_cr_line": "string",
    "issue_d": "string",
    "last_credit_pull_d": "factor",
    "verification_status": "string",
}

# Task 1: Import the file and look at the frame.
data = h2o.import_file(path=path, col_types=types)
data.describe()

# Task 2: Look at the levels in the response column, "loan_status".
# Hint: use the .table() function on the response column.
data["loan_status"].table()

# Task 3: Drop all loans that are still in progress, since they cannot yet be
# deemed good or bad.
ongoing_statuses = ["Current", "In Grace Period", "Late (16-30 days)", "Late (31-120 days)"]
is_ongoing = data["loan_status"].isin(ongoing_statuses)
data = data[~is_ongoing, :]

data.show()

# Task 4: Bin the response variable into good/bad loans only. The new
# "bad_loan" column is a binary indicator converted to a factor so that it is
# treated as a categorical response.
bad_statuses = ["Charged Off", "Default",
                "Does not meet the credit policy.  Status:Charged Off"]
data["bad_loan"] = data["loan_status"].isin(bad_statuses).asfactor()

data["bad_loan"]

# Task 5: Clean the string columns "int_rate", "revol_util" and "emp_length",
# then convert each of them to numeric.

# int_rate: strip the "%" sign, trim whitespace, convert to numeric.
data["int_rate"] = (
    data["int_rate"]
    .gsub(pattern="%", replacement="")
    .trim()
    .asnumeric()
)
data["int_rate"].show()

# revol_util: same treatment as int_rate.
data["revol_util"] = (
    data["revol_util"]
    .gsub(pattern="%", replacement="")
    .trim()
    .asnumeric()
)
data["revol_util"].show()

# emp_length, step 1: remove the " year"/" years" suffix and turn "n/a" into
# "", then trim any remaining surrounding whitespace.
data["emp_length"] = (
    data["emp_length"]
    .gsub(pattern="([ ]*+[a-zA-Z].*)|(n/a)", replacement="")
    .trim()
)

# emp_length, step 2: map "< 1" to "0.5" and "10+" to "10", then convert to
# numeric. Mind the space between "<" and "1" in the first pattern.
data["emp_length"] = (
    data["emp_length"]
    .gsub(pattern="< 1", replacement="0.5")
    .gsub(pattern="10\\+", replacement="10")
    .asnumeric()
)
data["emp_length"].show()

# Task 6: Extract month and year from earliest_cr_line into two new columns,
# earliest_cr_month and earliest_cr_year.
# The column is split on "-" once; part 0 is kept as the month and part 1 is
# converted to a numeric year. (The unsplit string column must not be passed
# to asnumeric() directly -- only the year part is numeric.)
earliest_cr_parts = data["earliest_cr_line"].strsplit(pattern="-")
data["earliest_cr_month"] = earliest_cr_parts[0]
data["earliest_cr_year"] = earliest_cr_parts[1].asnumeric()
data["earliest_cr_year"].show()

# Task 7: Extract month and year from issue_d into issue_d_month and
# issue_d_year, using the same split-once approach.
issue_d_parts = data["issue_d"].strsplit(pattern="-")
data["issue_d_month"] = issue_d_parts[0]
data["issue_d_year"] = issue_d_parts[1].asnumeric()
data["issue_d_year"].show()

# Task 8: credit_length is the issue year minus the year of the earliest
# credit line.
data["credit_length"] = data["issue_d_year"] - data["earliest_cr_year"]
data["credit_length"].show()

# Task 9: Collapse verification_status into two levels, i.e. "verified" and
# "not verified". The longer pattern is substituted first so that
# "VERIFIED - income source" is replaced as a whole before the shorter
# "VERIFIED - income" pattern is applied; the result is converted to a factor.
data["verification_status"] = (
    data["verification_status"]
    .sub(pattern="VERIFIED - income source", replacement="verified")
    .sub(pattern="VERIFIED - income", replacement="verified")
    .asfactor()
)

# Task 10: Do a test-train split (80-20).
# One uniform random number is drawn per row; rows with a value <= 0.80 go to
# train and the rest to test. The seed is fixed so that the split -- and
# therefore every metric and prediction below -- is reproducible across runs.
s = data["int_rate"].runif(seed=1234)
train = data[s <= 0.80]
test = data[s > 0.80]

# Task 11: Define the response and predictor variables.
y = "bad_loan"
x = ["loan_amnt", "credit_length", "revol_util",
     "home_ownership", "annual_inc", "purpose", "addr_state", "dti",
     "delinq_2yrs", "total_acc", "verification_status", "term"]

# Task 12: Set parameters for the GBM model.
# H2OGradientBoostingEstimator is already imported at the top of the script.
# With only 10 trees at learn_rate=0.05 the predicted probabilities barely
# move away from the base rate of bad loans, so more trees are used here.
# The seed makes training reproducible.
model = H2OGradientBoostingEstimator(model_id="BadLoanModel",
                                     score_each_iteration=True,
                                     ntrees=100,
                                     learn_rate=0.05,
                                     seed=1234)

# Task 13: Build the model; `test` is used as the validation frame.
model.train(x=x, y=y, training_frame=train, validation_frame=test)

# Task 14: View the model results.
model

# NOTE: for a binomial model the "predict" label is NOT "p1 > 0.5". Per the
# H2O documentation, the label is obtained by comparing p1 with the threshold
# that maximizes the F1 score. With an imbalanced response that threshold is
# usually well below 0.5, so a row can be labelled 1 even though p1 < p0.
predict = model.predict(test)
predict.head()

Результаты: Согласно моему пониманию

p0 — это вероятность (от 0 до 1) того, что объект относится к классу 0.

p1 — это вероятность (от 0 до 1) того, что объект относится к классу 1.

predict p0  p1
1   0.836788    0.163212
0   0.875218    0.124782
0   0.864133    0.135867
0   0.861971    0.138029
0   0.875354    0.124646
0   0.867809    0.132191
0   0.853071    0.146929
1   0.758071    0.241929
1   0.823545    0.176455
0   0.867809    0.132191

Результаты выглядят очень странно: p1 всегда меньше p0, даже в тех строках, где предсказан класс 1.

Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...