Я новичок в AutoML. Изучая набор данных Lending Club, я обнаружил, что вызовы model.predict(test) дают неожиданные результаты.
# Import the required packages
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

# Start (or connect to) a local H2O instance
h2o.init()

# If possible download from the s3 link and change the path to the dataset
path = "http://h2o-public-test-data.s3.amazonaws.com/bigdata/laptop/lending-club/LoanStats3a.csv"

# Force selected columns to "string" (or "factor") so they can be munged later
types = {
    "int_rate": "string",
    "revol_util": "string",
    "emp_length": "string",
    "earliest_cr_line": "string",
    "issue_d": "string",
    "last_credit_pull_d": "factor",
    "verification_status": "string",
}

# Task 1: Import the file and look at the frame
data = h2o.import_file(path=path, col_types=types)
data.describe()
# Task 2: Look at the levels in the response column, "loan_status"
# Hint: Use the .table() function on the response column
data["loan_status"].table()

# Task 3: Drop all loans that are still in progress and therefore cannot be
# deemed good/bad loans — these four statuses describe ongoing loans.
ongoing_statuses = ["Current", "In Grace Period", "Late (16-30 days)", "Late (31-120 days)"]
data = data[~data["loan_status"].isin(ongoing_statuses), :]
data.show()

# Task 4: Bin the response variable into good/bad loans only.
# "bad_loan" is a binary column (1 when the loan ended badly), stored as a
# factor so H2O treats the problem as binomial classification.
bad_statuses = ["Charged Off", "Default",
                "Does not meet the credit policy. Status:Charged Off"]
data["bad_loan"] = data["loan_status"].isin(bad_statuses)
data["bad_loan"] = data["bad_loan"].asfactor()
data["bad_loan"]
# Task 5: String munging to clean string columns before converting to numeric.
# Columns that need munging: "int_rate", "revol_util", "emp_length".

# "int_rate" and "revol_util" are percentages like "13.5%": strip the "%",
# trim whitespace, then convert to numeric.
for pct_col in ["int_rate", "revol_util"]:
    data[pct_col] = data[pct_col].gsub(pattern="%", replacement="")
    data[pct_col] = data[pct_col].trim()
    data[pct_col] = data[pct_col].asnumeric()
    data[pct_col].show()

# Clean up "emp_length": drop the " year"/" years" text and the "n/a" marker
data["emp_length"] = data["emp_length"].gsub(pattern="([ ]*+[a-zA-Z].*)|(n/a)",
                                             replacement="")
# Remove any trailing spaces left behind
data["emp_length"] = data["emp_length"].trim()
# Map the open-ended buckets to numbers: "< 1" -> 0.5, "10+" -> 10
# (the "+" must be escaped since gsub patterns are regexes)
data["emp_length"] = data["emp_length"].gsub(pattern="< 1", replacement="0.5")
data["emp_length"] = data["emp_length"].gsub(pattern="10\\+", replacement="10")
data["emp_length"] = data["emp_length"].asnumeric()
data["emp_length"].show()
# Task 6: Extract month and year from "earliest_cr_line" into two new columns,
# "earliest_cr_month" (string) and "earliest_cr_year" (numeric).
# FIX: the original code first assigned earliest_cr_line.asnumeric() to
# earliest_cr_year — converting the whole "Mon-YYYY" string, which yields NAs —
# before overwriting it with the correct value. That dead, misleading
# assignment is removed; each date column is split exactly once.
earliest_cr_split = data["earliest_cr_line"].strsplit(pattern="-")
data["earliest_cr_month"] = earliest_cr_split[0]
data["earliest_cr_year"] = earliest_cr_split[1].asnumeric()
data["earliest_cr_year"].show()

# Task 7: Extract month and year from "issue_d" into "issue_d_month" and
# "issue_d_year" the same way.
issue_d_split = data["issue_d"].strsplit(pattern="-")
data["issue_d_month"] = issue_d_split[0]
data["issue_d_year"] = issue_d_split[1].asnumeric()
data["issue_d_year"].show()

# Task 8: "credit_length" = years between the borrower's first credit line
# and the loan issue date.
data["credit_length"] = data["issue_d_year"] - data["earliest_cr_year"]
data["credit_length"].show()
# Task 9: Collapse "verification_status" to two levels: "verified" and
# "not verified". The longer pattern is substituted first so that
# "VERIFIED - income source" is not half-matched by "VERIFIED - income".
for verified_pattern in ["VERIFIED - income source", "VERIFIED - income"]:
    data["verification_status"] = data["verification_status"].sub(
        pattern=verified_pattern, replacement="verified")
data["verification_status"] = data["verification_status"].asfactor()

# Task 10: 80/20 train-test split driven by a column of uniform random draws
split_draws = data["int_rate"].runif()
train = data[split_draws <= 0.80]
test = data[split_draws > 0.80]
# Task 11: Define the response and predictor variables
y = "bad_loan"
x = ["loan_amnt", "credit_length", "revol_util",
     "home_ownership", "annual_inc", "purpose", "addr_state", "dti",
     "delinq_2yrs", "total_acc", "verification_status", "term"]

# Task 12: Set parameters for the GBM model.
# FIX: H2OGradientBoostingEstimator is already imported at the top of the
# file, so the duplicate import that used to sit here was removed.
model = H2OGradientBoostingEstimator(model_id="BadLoanModel",
                                     score_each_iteration=True,
                                     ntrees=10,
                                     learn_rate=0.05)

# Task 13: Build the model (train on `train`, validate on `test`)
model.train(x=x, y=y, training_frame=train, validation_frame=test)

# Task 14: View the model results
model

# NOTE(review): for binomial models H2O's predict() assigns the label using
# the max-F1 threshold from training metrics, not 0.5 (see H2O docs) — so a
# row can be labeled 1 even though p1 < p0. With a minority positive class
# such as bad loans, p1 being small on every row is expected behavior.
predict = model.predict(test)
predict.head()
Результаты: согласно моему пониманию,
p0 — это вероятность (между 0 и 1) того, что выбран класс 0;
p1 — это вероятность (между 0 и 1) того, что выбран класс 1.
predict p0 p1
1 0.836788 0.163212
0 0.875218 0.124782
0 0.864133 0.135867
0 0.861971 0.138029
0 0.875354 0.124646
0 0.867809 0.132191
0 0.853071 0.146929
1 0.758071 0.241929
1 0.823545 0.176455
0 0.867809 0.132191
Результаты выглядят очень странно, потому что p1 здесь всегда ниже p0 — даже в строках, где предсказан класс 1.