Printing probabilities with a wide model in Python
0 votes
/ 20 September 2018
    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    # Structure of the code for the wide model

    import pandas as pd
    import tensorflow as tf
    import tempfile

COLUMNS=["DA","PD","LO","MN1","MN2","MN3","MN4","MN5","MN6","MN7","MN8","MN9","MH1","MH2","MH3","MH4","MH5","MH6","MH7",
    "MH8","MH9","MC1","MC2","MC3","MC4","MC5","MC6","MC7","MC8","MC9","IT1","IT2","OG1","OG2",
    "NPK","NPE","GC1","GC2","GC3","SC1","SC2","SC3","QY","IS1","IS2"]

    # train_file = "/home/davide/Scrivania/eventdataYN.csv" 
    # test_file = "/home/davide/Scrivania/eventtestYN.csv"
    train_file = "/home/davide/Scrivania/eventdataCAT.csv"
    test_file = "/home/davide/Scrivania/eventtestCAT.csv"

    df_train = pd.read_csv(train_file, names=COLUMNS, skipinitialspace=True)
    df_test = pd.read_csv(test_file, names=COLUMNS, skipinitialspace=True)
    # LABELY_COLUMN = "labelY"
    # df_train[LABELY_COLUMN] = (df_train["NPK"].apply(lambda x: "YES" in x)).astype(int)
    # df_test[LABELY_COLUMN] = (df_test["NPK"].apply(lambda x: "YES" in x)).astype(int)
    # LABELN_COLUMN = "labelN"
    # df_train[LABELN_COLUMN] = (df_train["NPK"].apply(lambda x: "NO" in x)).astype(int)
    # df_test[LABELN_COLUMN] = (df_test["NPK"].apply(lambda x: "NO" in x)).astype(int)
    LABEL1_COLUMN = "label1"
    df_train[LABEL1_COLUMN] = (df_train["NPK"].apply(lambda x: "<10" in x)).astype(int)
    df_test[LABEL1_COLUMN] = (df_test["NPK"].apply(lambda x: "<10" in x)).astype(int)
    # LABEL10_COLUMN = "label10"
    # df_train[LABEL10_COLUMN] = (df_train["NPK"].apply(lambda x: "10-100" in x)).astype(int)
    # df_test[LABEL10_COLUMN] = (df_test["NPK"].apply(lambda x: "10-100" in x)).astype(int)
    # LABEL100_COLUMN = "label100"
    # df_train[LABEL100_COLUMN] = (df_train["NPK"].apply(lambda x: "100-1000" in x)).astype(int)
    # df_test[LABEL100_COLUMN] = (df_test["NPK"].apply(lambda x: "100-1000" in x)).astype(int)
    # LABEL1000_COLUMN = "label1000"
    # df_train[LABEL1000_COLUMN] = (df_train["NPK"].apply(lambda x: ">1000" in x)).astype(int)
    # df_test[LABEL1000_COLUMN] = (df_test["NPK"].apply(lambda x: ">1000" in x)).astype(int)
    CATEGORICAL_COLUMNS = ["DA","PD","LO","MN1","MN2","MN3","MN4","MN5","MN6","MN7","MN8","MN9","MH1","MH2","MH3","MH4","MH5","MH6","MH7","MH8","MH9","MC1","MC2","MC3","MC4","MC5","MC6","MC7","MC8","MC9","IT1","IT2","OG1","OG2","NPK","GC1","GC2","GC3","SC1","SC2","SC3","QY","IS1","IS2"]
    CONTINUOUS_COLUMNS = ["NPE"]
    def input_fn(df):
       # Creates a dictionary mapping from each continuous feature column name (k) to
       # the values of that column stored in a constant Tensor.
       continuous_cols = {k: tf.constant(df[k].values)
                          for k in CONTINUOUS_COLUMNS}
       # Creates a dictionary mapping from each categorical feature column name (k)
       # to the values of that column stored in a tf.SparseTensor.
       categorical_cols = {k: tf.SparseTensor(
           indices=[[i, 0] for i in range(df[k].size)],
           values=df[k].values,
           dense_shape=[df[k].size, 1])
                           for k in CATEGORICAL_COLUMNS}
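       # For example, a column with values ["R", "U", "R"] becomes a
       # SparseTensor with indices [[0,0],[1,0],[2,0]], values ["R","U","R"]
       # and dense_shape [3, 1] -- one sparse entry per dataframe row.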
       # Merges the two dictionaries into one. dict.items() views cannot be
       # added in Python 3, so convert them to lists first.
       feature_cols = dict(list(continuous_cols.items()) + list(categorical_cols.items()))
       # Converts the label columns into constant Tensors.
       # labelY = tf.constant(df[LABELY_COLUMN].values)
       # labelN = tf.constant(df[LABELN_COLUMN].values)
       label1 = tf.constant(df[LABEL1_COLUMN].values)
       # label10 = tf.constant(df[LABEL10_COLUMN].values)
       # label100 = tf.constant(df[LABEL100_COLUMN].values)
       # label1000 = tf.constant(df[LABEL1000_COLUMN].values)
       # Returns the feature columns and the labels.
       return feature_cols, label1

    def train_input_fn():
       return input_fn(df_train)

    def eval_input_fn():
       return input_fn(df_test)

    # define the feature columns
    DA = tf.contrib.layers.sparse_column_with_keys(column_name="DA", keys=["<1900s","1900s","1910s","1920s","1930s","1940s","1950s","1960s","1970s","1980s","1990s"])
    PD = tf.contrib.layers.sparse_column_with_keys(column_name="PD", keys=["R", "U","Na"])
    LO = tf.contrib.layers.sparse_column_with_hash_bucket("LO", hash_bucket_size=280)
    DAxLO = tf.contrib.layers.crossed_column([DA, LO], hash_bucket_size=int(1e5))
    MN1 = tf.contrib.layers.sparse_column_with_hash_bucket("MN1", hash_bucket_size=1450)
    MN2 = tf.contrib.layers.sparse_column_with_hash_bucket("MN2", hash_bucket_size=375)
    MN3 = tf.contrib.layers.sparse_column_with_hash_bucket("MN3", hash_bucket_size=170)
    MN4 = tf.contrib.layers.sparse_column_with_hash_bucket("MN4", hash_bucket_size=90)
    MN5 = tf.contrib.layers.sparse_column_with_hash_bucket("MN5", hash_bucket_size=30)
    MN6 = tf.contrib.layers.sparse_column_with_hash_bucket("MN6", hash_bucket_size=15)
    MN7 = tf.contrib.layers.sparse_column_with_keys(column_name="MN7", keys=["AEROSOLS","FUEL OIL","Na","NITRIC ACID","OIL","STYRENE"])
    MN8 = tf.contrib.layers.sparse_column_with_keys(column_name="MN8", keys=["HG SEED DRESS","Na","NATURAL GAS","TOLUENE DIISOCYANATE"])
    MN9 = tf.contrib.layers.sparse_column_with_keys(column_name="MN9", keys=["Na","ORG.PHOS.PESTIC"])
    MN = tf.contrib.layers.crossed_column([MN1, MN2, MN3, MN4, MN5, MN6, MN7, MN8, MN9], hash_bucket_size=int(1e6))
    MH1 = tf.contrib.layers.sparse_column_with_keys(column_name="MH1", keys=["AS","CD","CO","EX","FI","Na","OX","TO"])
    MH2 = tf.contrib.layers.sparse_column_with_keys(column_name="MH2", keys=["AS","CD","CO","EX","FI","Na","OX","TO"])
    MH3 = tf.contrib.layers.sparse_column_with_keys(column_name="MH3", keys=["AS","CO","EX","FI","Na","OX","TO"])
    MH4 = tf.contrib.layers.sparse_column_with_keys(column_name="MH4", keys=["CO","FI","Na","OX","TO"])
    MH5 = tf.contrib.layers.sparse_column_with_keys(column_name="MH5", keys=["CO","FI","Na","TO"])
    MH6 = tf.contrib.layers.sparse_column_with_keys(column_name="MH6", keys=["CO","FI","Na","TO"])
    MH7 = tf.contrib.layers.sparse_column_with_keys(column_name="MH7", keys=["CO","FI","Na"])
    MH8 = tf.contrib.layers.sparse_column_with_keys(column_name="MH8", keys=["FI","Na","TO"])
    MH9 = tf.contrib.layers.sparse_column_with_keys(column_name="MH9", keys=["Na","TO"])
    MH = tf.contrib.layers.crossed_column([MH1, MH2, MH3, MH4, MH5, MH6, MH7, MH8, MH9], hash_bucket_size=int(1e6))
    MC1 = tf.contrib.layers.sparse_column_with_hash_bucket("MC1", hash_bucket_size=350)
    MC2 = tf.contrib.layers.sparse_column_with_hash_bucket("MC2", hash_bucket_size=180)
    MC3 = tf.contrib.layers.sparse_column_with_hash_bucket("MC3", hash_bucket_size=95)
    MC4 = tf.contrib.layers.sparse_column_with_hash_bucket("MC4", hash_bucket_size=55)
    MC5 = tf.contrib.layers.sparse_column_with_hash_bucket("MC5", hash_bucket_size=23)
    MC6 = tf.contrib.layers.sparse_column_with_hash_bucket("MC6", hash_bucket_size=15)
    MC7 = tf.contrib.layers.sparse_column_with_keys(column_name="MC7", keys=["Na","1267","2031","1223","1950","2055"])
    MC8 = tf.contrib.layers.sparse_column_with_keys(column_name="MC8", keys=["Na","1971","2588","2206"])
    MC9 = tf.contrib.layers.sparse_column_with_keys(column_name="MC9", keys=["Na","2588"])
    MC = tf.contrib.layers.crossed_column([MC1, MC2, MC3, MC4, MC5, MC6, MC7, MC8, MC9], hash_bucket_size=int(1e6))
    IT1 = tf.contrib.layers.sparse_column_with_hash_bucket("IT1", hash_bucket_size=30)
    IT2 = tf.contrib.layers.sparse_column_with_hash_bucket("IT2", hash_bucket_size=25)
    IT = tf.contrib.layers.crossed_column([IT1, IT2], hash_bucket_size=int(1e4))
    OG1 = tf.contrib.layers.sparse_column_with_hash_bucket("OG1", hash_bucket_size=18)
    OG2 = tf.contrib.layers.sparse_column_with_hash_bucket("OG2", hash_bucket_size=25)
    OG = tf.contrib.layers.crossed_column([OG1, OG2], hash_bucket_size=int(1e4))
    # NPK = tf.contrib.layers.sparse_column_with_keys(column_name="NPK", keys=["YES", "NO"])
    NPK = tf.contrib.layers.sparse_column_with_keys(column_name="NPK", keys=["0", "<10", "10-100", "100-1000", ">1000"])
    NPE = tf.contrib.layers.real_valued_column("NPE")
    GC1 = tf.contrib.layers.sparse_column_with_hash_bucket("GC1", hash_bucket_size=12)
    GC2 = tf.contrib.layers.sparse_column_with_hash_bucket("GC2", hash_bucket_size=10)
    GC3 = tf.contrib.layers.sparse_column_with_hash_bucket("GC3", hash_bucket_size=10)
    GC = tf.contrib.layers.crossed_column([GC1, GC2, GC3], hash_bucket_size=int(1e5))
    SC1 = tf.contrib.layers.sparse_column_with_hash_bucket("SC1", hash_bucket_size=72)
    SC2 = tf.contrib.layers.sparse_column_with_hash_bucket("SC2", hash_bucket_size=65)
    SC3 = tf.contrib.layers.sparse_column_with_hash_bucket("SC3", hash_bucket_size=50)
    SC = tf.contrib.layers.crossed_column([SC1, SC2, SC3], hash_bucket_size=int(1e5))
    QY = tf.contrib.layers.sparse_column_with_keys(column_name="QY", keys=["Na","1000-10000","100-1000","10-100","from1to10","<1",">10000"])
    IS1 = tf.contrib.layers.sparse_column_with_hash_bucket("IS1", hash_bucket_size=12)
    IS2 = tf.contrib.layers.sparse_column_with_hash_bucket("IS2", hash_bucket_size=20)
    IS = tf.contrib.layers.crossed_column([IS1, IS2], hash_bucket_size=int(1e4))
    model_dir = tempfile.mkdtemp()
    m = tf.contrib.learn.LinearClassifier(feature_columns=[DA, PD, LO, MN1, MN2, MN3, MN4, MN5, MN6, MN7, MN8, MN9, MH1, MH2, MH3, MH4, MH5, MH6, MH7, MH8, MH9, MC1, MC2, MC3, MC4, MC5, MC6, MC7, MC8, MC9, IT1, IT2, OG1, OG2, NPE, GC1, GC2, GC3, SC1, SC2, SC3, QY, IS1, IS2, DAxLO, MN, MH, MC, IT, OG, GC, SC, IS], model_dir=model_dir)

    # train the model
    m.fit(input_fn=train_input_fn, steps=200)

    # evaluate the results
    results = m.evaluate(input_fn=eval_input_fn, steps=1)
    for key in sorted(results):
        print("%s: %s" % (key, results[key]))

    # print the per-class probabilities for every test row
    predictions = list(m.predict_proba(input_fn=eval_input_fn))

    print("\n".join(map(str, predictions)))

I need to print the probabilities in order to inspect them and compare the results with the output of the code.
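
For reference, a minimal sketch of how I read the output at the moment, pairing each predicted probability of the positive class with the true label (it assumes `df_test` and `LABEL1_COLUMN` from the code above are still in scope):

    # Inspection helper (sketch): predict_proba yields one
    # [P(class=0), P(class=1)] pair per test row; zip them with the labels.
    probs = list(m.predict_proba(input_fn=eval_input_fn))
    for p, y in zip(probs, df_test[LABEL1_COLUMN].values):
        print("P(label1=1) = %.4f   true label = %d" % (p[1], y))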

I would like to understand why the probabilities that the code prints are so high.

Is something wrong in the code?

Do I need to add something?

Is there another structure for printing the probabilities (the sketch below shows the kind of thing I mean)?

Do I need to use other functions in the code?
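
To illustrate the previous two questions, this is the kind of alternative structure I have in mind: a sketch only, assuming TensorFlow >= 1.3, where tf.estimator.LinearClassifier exposes the probabilities through predict(), and where the columns would have to be rebuilt as tf.feature_column equivalents of the tf.contrib.layers ones above:

    # Sketch, not my working code: the tf.estimator API has no predict_proba;
    # predict() yields one dict per example instead.
    # feature_columns is assumed to be rebuilt with tf.feature_column.
    est = tf.estimator.LinearClassifier(feature_columns=feature_columns,
                                        model_dir=model_dir)
    est.train(input_fn=train_input_fn, steps=200)
    for pred in est.predict(input_fn=eval_input_fn):
        print(pred["probabilities"])  # [P(class=0), P(class=1)]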

...