#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Structure of the code for the wide model
import pandas as pd
import tensorflow as tf
import tempfile
COLUMNS=["DA","PD","LO","MN1","MN2","MN3","MN4","MN5","MN6","MN7","MN8","MN9","MH1","MH2","MH3","MH4","MH5","MH6","MH7",
"MH8","MH9","MC1","MC2","MC3","MC4","MC5","MC6","MC7","MC8","MC9","IT1","IT2","OG1","OG2",
"NPK","NPE","GC1","GC2","GC3","SC1","SC2","SC3","QY","IS1","IS2"]
# train_file = "/home/davide/Scrivania/eventdataYN.csv"
# test_file = "/home/davide/Scrivania/eventtestYN.csv"
train_file = "/home/davide/Scrivania/eventdataCAT.csv"
test_file = "/home/davide/Scrivania/eventtestCAT.csv"
df_train = pd.read_csv(train_file, names=COLUMNS, skipinitialspace=True)
df_test = pd.read_csv(test_file, names=COLUMNS, skipinitialspace=True)
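# Optional sanity check (a debugging aid, not part of the original pipeline):
# confirm both files parsed into the 45 declared columns.
print("train shape: %s, test shape: %s" % (df_train.shape, df_test.shape))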
# LABELY_COLUMN = "labelY"
# df_train[LABELY_COLUMN] = (df_train["NPK"].apply(lambda x: "YES" in x)).astype(int)
# df_test[LABELY_COLUMN] = (df_test["NPK"].apply(lambda x: "YES" in x)).astype(int)
# LABELN_COLUMN = "labelN"
# df_train[LABELN_COLUMN] = (df_train["NPK"].apply(lambda x: "NO" in x)).astype(int)
# df_test[LABELN_COLUMN] = (df_test["NPK"].apply(lambda x: "NO" in x)).astype(int)
LABEL1_COLUMN = "label1"
df_train[LABEL1_COLUMN] = (df_train["NPK"].apply(lambda x: "<10" in x)).astype(int)
df_test[LABEL1_COLUMN] = (df_test["NPK"].apply(lambda x: "<10" in x)).astype(int)
# LABEL10_COLUMN = "label10"
# df_train[LABEL10_COLUMN] = (df_train["NPK"].apply(lambda x: "10-100" in x)).astype(int)
# df_test[LABEL10_COLUMN] = (df_test["NPK"].apply(lambda x: "10-100" in x)).astype(int)
# LABEL100_COLUMN = "label100"
# df_train[LABEL100_COLUMN] = (df_train["NPK"].apply(lambda x: "100-1000" in x)).astype(int)
# df_test[LABEL100_COLUMN] = (df_test["NPK"].apply(lambda x: "100-1000" in x)).astype(int)
# LABEL1000_COLUMN = "label1000"
# df_train[LABEL1000_COLUMN] = (df_train["NPK"].apply(lambda x: ">1000" in x)).astype(int)
# df_test[LABEL1000_COLUMN] = (df_test["NPK"].apply(lambda x: ">1000" in x)).astype(int)
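# Optional sanity check (a debugging aid, not part of the original pipeline):
# a heavily skewed label distribution alone can push predicted probabilities
# toward one class, so the class balance is worth inspecting.
print(df_train[LABEL1_COLUMN].value_counts())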
CATEGORICAL_COLUMNS = ["DA","PD","LO","MN1","MN2","MN3","MN4","MN5","MN6","MN7","MN8","MN9","MH1","MH2","MH3","MH4","MH5","MH6","MH7",
"MH8","MH9","MC1","MC2","MC3","MC4","MC5","MC6","MC7","MC8","MC9","IT1","IT2","OG1","OG2",
"NPK","GC1","GC2","GC3","SC1","SC2","SC3","QY","IS1","IS2"]
CONTINUOUS_COLUMNS = ["NPE"]
def input_fn(df):
    # Creates a dictionary mapping from each continuous feature column name (k)
    # to the values of that column stored in a constant Tensor.
    continuous_cols = {k: tf.constant(df[k].values)
                       for k in CONTINUOUS_COLUMNS}
    # Creates a dictionary mapping from each categorical feature column name (k)
    # to the values of that column stored in a tf.SparseTensor.
    categorical_cols = {k: tf.SparseTensor(
        indices=[[i, 0] for i in range(df[k].size)],
        values=df[k].values,
        dense_shape=[df[k].size, 1])
        for k in CATEGORICAL_COLUMNS}
    # Merges the two dictionaries into one. dict.items() views cannot be
    # concatenated with + in Python 3, so copy one dict and update it instead.
    feature_cols = dict(continuous_cols)
    feature_cols.update(categorical_cols)
    # Converts the label column into a constant Tensor.
    # labelY = tf.constant(df[LABELY_COLUMN].values)
    # labelN = tf.constant(df[LABELN_COLUMN].values)
    label1 = tf.constant(df[LABEL1_COLUMN].values)
    # label10 = tf.constant(df[LABEL10_COLUMN].values)
    # label100 = tf.constant(df[LABEL100_COLUMN].values)
    # label1000 = tf.constant(df[LABEL1000_COLUMN].values)
    # Returns the feature columns and the label.
    return feature_cols, label1
def train_input_fn():
    return input_fn(df_train)
def eval_input_fn():
    return input_fn(df_test)
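# Optional debugging aid (a minimal sketch, assuming the TF 1.x Session API;
# preview_input_fn is a hypothetical helper, not part of the original code):
# materialize the tensors produced by input_fn to verify values and shapes
# before handing the functions to the estimator.
def preview_input_fn():
    features, label = train_input_fn()
    with tf.Session() as sess:
        label_vals, npe_vals = sess.run([label, features["NPE"]])
    print("first labels: %s" % (label_vals[:5],))
    print("first NPE values: %s" % (npe_vals[:5],))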
# Define the feature columns (base sparse/real-valued columns and crossed columns).
DA = tf.contrib.layers.sparse_column_with_keys(column_name="DA", keys=["<1900s","1900s","1910s","1920s","1930s","1940s","1950s","1960s","1970s","1980s","1990s"])
PD = tf.contrib.layers.sparse_column_with_keys(column_name="PD", keys=["R", "U","Na"])
LO = tf.contrib.layers.sparse_column_with_hash_bucket("LO", hash_bucket_size=280)
DAxLO = tf.contrib.layers.crossed_column([DA, LO], hash_bucket_size=int(1e5))
MN1 = tf.contrib.layers.sparse_column_with_hash_bucket("MN1", hash_bucket_size=1450)
MN2 = tf.contrib.layers.sparse_column_with_hash_bucket("MN2", hash_bucket_size=375)
MN3 = tf.contrib.layers.sparse_column_with_hash_bucket("MN3", hash_bucket_size=170)
MN4 = tf.contrib.layers.sparse_column_with_hash_bucket("MN4", hash_bucket_size=90)
MN5 = tf.contrib.layers.sparse_column_with_hash_bucket("MN5", hash_bucket_size=30)
MN6 = tf.contrib.layers.sparse_column_with_hash_bucket("MN6", hash_bucket_size=15)
MN7 = tf.contrib.layers.sparse_column_with_keys(column_name="MN7", keys=["AEROSOLS","FUEL OIL","Na","NITRIC ACID","OIL","STYRENE"])
MN8 = tf.contrib.layers.sparse_column_with_keys(column_name="MN8", keys=["HG SEED DRESS","Na","NATURAL GAS","TOLUENE DIISOCYANATE"])
MN9 = tf.contrib.layers.sparse_column_with_keys(column_name="MN9", keys=["Na","ORG.PHOS.PESTIC"])
MN = tf.contrib.layers.crossed_column([MN1, MN2, MN3, MN4, MN5, MN6, MN7, MN8, MN9], hash_bucket_size=int(1e6))
MH1 = tf.contrib.layers.sparse_column_with_keys(column_name="MH1", keys=["AS","CD","CO","EX","FI","Na","OX","TO"])
MH2 = tf.contrib.layers.sparse_column_with_keys(column_name="MH2", keys=["AS","CD","CO","EX","FI","Na","OX","TO"])
MH3 = tf.contrib.layers.sparse_column_with_keys(column_name="MH3", keys=["AS","CO","EX","FI","Na","OX","TO"])
MH4 = tf.contrib.layers.sparse_column_with_keys(column_name="MH4", keys=["CO","FI","Na","OX","TO"])
MH5 = tf.contrib.layers.sparse_column_with_keys(column_name="MH5", keys=["CO","FI","Na","TO"])
MH6 = tf.contrib.layers.sparse_column_with_keys(column_name="MH6", keys=["CO","FI","Na","TO"])
MH7 = tf.contrib.layers.sparse_column_with_keys(column_name="MH7", keys=["CO","FI","Na"])
MH8 = tf.contrib.layers.sparse_column_with_keys(column_name="MH8", keys=["FI","Na","TO"])
MH9 = tf.contrib.layers.sparse_column_with_keys(column_name="MH9", keys=["Na","TO"])
MH = tf.contrib.layers.crossed_column([MH1, MH2, MH3, MH4, MH5, MH6, MH7, MH8, MH9], hash_bucket_size=int(1e6))
MC1 = tf.contrib.layers.sparse_column_with_hash_bucket("MC1", hash_bucket_size=350)
MC2 = tf.contrib.layers.sparse_column_with_hash_bucket("MC2", hash_bucket_size=180)
MC3 = tf.contrib.layers.sparse_column_with_hash_bucket("MC3", hash_bucket_size=95)
MC4 = tf.contrib.layers.sparse_column_with_hash_bucket("MC4", hash_bucket_size=55)
MC5 = tf.contrib.layers.sparse_column_with_hash_bucket("MC5", hash_bucket_size=23)
MC6 = tf.contrib.layers.sparse_column_with_hash_bucket("MC6", hash_bucket_size=15)
MC7 = tf.contrib.layers.sparse_column_with_keys(column_name="MC7", keys=["Na","1267","2031","1223","1950","2055"])
MC8 = tf.contrib.layers.sparse_column_with_keys(column_name="MC8", keys=["Na","1971","2588","2206"])
MC9 = tf.contrib.layers.sparse_column_with_keys(column_name="MC9", keys=["Na","2588"])
MC = tf.contrib.layers.crossed_column([MC1, MC2, MC3, MC4, MC5, MC6, MC7, MC8, MC9], hash_bucket_size=int(1e6))
IT1 = tf.contrib.layers.sparse_column_with_hash_bucket("IT1", hash_bucket_size=30)
IT2 = tf.contrib.layers.sparse_column_with_hash_bucket("IT2", hash_bucket_size=25)
IT = tf.contrib.layers.crossed_column([IT1, IT2], hash_bucket_size=int(1e4))
OG1 = tf.contrib.layers.sparse_column_with_hash_bucket("OG1", hash_bucket_size=18)
OG2 = tf.contrib.layers.sparse_column_with_hash_bucket("OG2", hash_bucket_size=25)
OG = tf.contrib.layers.crossed_column([OG1, OG2], hash_bucket_size=int(1e4))
# NPK = tf.contrib.layers.sparse_column_with_keys(column_name="NPK", keys=["YES", "NO"])
NPK = tf.contrib.layers.sparse_column_with_keys(column_name="NPK", keys=["0", "<10", "10-100", "100-1000", ">1000"])
NPE = tf.contrib.layers.real_valued_column("NPE")
GC1 = tf.contrib.layers.sparse_column_with_hash_bucket("GC1", hash_bucket_size=12)
GC2 = tf.contrib.layers.sparse_column_with_hash_bucket("GC2", hash_bucket_size=10)
GC3 = tf.contrib.layers.sparse_column_with_hash_bucket("GC3", hash_bucket_size=10)
GC = tf.contrib.layers.crossed_column([GC1, GC2, GC3], hash_bucket_size=int(1e5))
SC1 = tf.contrib.layers.sparse_column_with_hash_bucket("SC1", hash_bucket_size=72)
SC2 = tf.contrib.layers.sparse_column_with_hash_bucket("SC2", hash_bucket_size=65)
SC3 = tf.contrib.layers.sparse_column_with_hash_bucket("SC3", hash_bucket_size=50)
SC = tf.contrib.layers.crossed_column([SC1, SC2, SC3], hash_bucket_size=int(1e5))
QY = tf.contrib.layers.sparse_column_with_keys(column_name="QY", keys=["Na","1000-10000","100-1000","10-100","from1to10","<1",">10000"])
IS1 = tf.contrib.layers.sparse_column_with_hash_bucket("IS1", hash_bucket_size=12)
IS2 = tf.contrib.layers.sparse_column_with_hash_bucket("IS2", hash_bucket_size=20)
IS = tf.contrib.layers.crossed_column([IS1, IS2], hash_bucket_size=int(1e4))
model_dir = tempfile.mkdtemp()
m = tf.contrib.learn.LinearClassifier(
    feature_columns=[DA, PD, LO, MN1, MN2, MN3, MN4, MN5, MN6, MN7, MN8, MN9,
                     MH1, MH2, MH3, MH4, MH5, MH6, MH7, MH8, MH9,
                     MC1, MC2, MC3, MC4, MC5, MC6, MC7, MC8, MC9,
                     IT1, IT2, OG1, OG2, NPE, GC1, GC2, GC3, SC1, SC2, SC3,
                     QY, IS1, IS2, DAxLO, MN, MH, MC, IT, OG, GC, SC, IS],
    model_dir=model_dir)
# Train the model, then evaluate the results.
m.fit(input_fn=train_input_fn, steps=200)
results = m.evaluate(input_fn=eval_input_fn, steps=1)
for key in sorted(results):
    print("%s: %s" % (key, results[key]))
# Print the predicted probabilities for each test example.
predictions = list(m.predict_proba(input_fn=eval_input_fn))
print("\n".join(map(str, predictions)))
I need to print the probabilities so that I can check them and compare them with the results of the code. I would like to know why the probabilities the code prints are so high.
Is something wrong in the code?
Do I need to add something?
Is there a different structure for printing the probabilities?
Do I need to use other functions in the code?
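For comparison, here is a minimal sketch of a more readable output structure, assuming each element yielded by predict_proba is a [P(class 0), P(class 1)] pair, as in TF 1.x contrib.learn:

probs = list(m.predict_proba(input_fn=eval_input_fn))
for i, p in enumerate(probs):
    # p[1] is the predicted probability of label1 == 1 for test row i.
    print("row %d: P(label1=1) = %.4f, true label = %d"
          % (i, p[1], df_test[LABEL1_COLUMN].iloc[i]))

Printing the positive-class probability next to the true label makes it easier to see whether the high probabilities actually line up with the labels in the test set.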