from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.preprocessing import MultiLabelBinarizer
tf.logging.set_verbosity(tf.logging.INFO)
tf.app.flags.DEFINE_integer(
    'steps', 10, 'The number of steps to train the model')
tf.app.flags.DEFINE_integer(
    'batch_size', 32, 'The batch size used by the training input function')
tf.app.flags.DEFINE_string(
    'model_dir', './models/ckpt/', 'Dir to save the model and checkpoints')
tf.app.flags.DEFINE_string(
    'saved_dir', './models/pb/', 'Dir to save the model for TF Serving')
FLAGS = tf.app.flags.FLAGS
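# Example invocation (assuming this file is saved as train.py; flag values are
# illustrative):
#   python train.py --steps=1000 --batch_size=32 --model_dir=./models/ckpt/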
def load_and_preprocess():
    """Load the dataset and preprocess it:
    - Lowercase the text column to normalize tweets.
    - Drop rows whose sentiment is missing or marked not_relevant.
    - Add a feature column flagging tweets that contain a website or link.
    - Add a feature column flagging profanity from a word-list text file.
    """
    data = pd.read_csv(
        'https://www.figure-eight.com/wp-content/uploads/2016/03/Apple-Twitter-Sentiment-DFE.csv',
        encoding="ISO-8859-1")
    data['text'] = data['text'].str.lower()
    data = data.dropna(subset=['sentiment'])
    data = data[data.sentiment != "not_relevant"]
    data['contains_url'] = data['text'].str.contains('http').astype(int)
    # Flag tweets containing any word from the profanity list.
    profanity_list = pd.read_fwf('data/profanity.txt', header=None)
    has_profanity = [any(word in tokens for word in profanity_list[0].values)
                     for tokens in data['text'].str.split().values]
    data['contains_profanity'] = np.array(has_profanity, dtype=np.int32)
    sentiment = data['sentiment']
    text = data['text']
    url = data['contains_url']
    profanity = data['contains_profanity']
    return data, sentiment, text, url, profanity
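# For large word lists the per-tweet scan above is O(tokens x lexicon); a
# set-based membership test scales better. A minimal sketch (hypothetical
# helper, not wired into load_and_preprocess):
def _contains_any(tokens, lexicon_set):
    """Return True if any token appears in the lexicon set."""
    return not lexicon_set.isdisjoint(tokens)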
data, sentiment, text, url, profanity = load_and_preprocess()
def serving_input_receiver_fn():
    """Build placeholders matching the features the model is trained on."""
    feature_spec = {
        "text": tf.placeholder(dtype=tf.string, shape=[None]),
        "url": tf.placeholder(dtype=tf.float32, shape=[None]),
        "profanity": tf.placeholder(dtype=tf.float32, shape=[None]),
    }
    # The same placeholders act as the parsed features and the receiver
    # tensors, so clients feed raw values directly.
    return tf.estimator.export.ServingInputReceiver(feature_spec, feature_spec)
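# Once exported and served with TensorFlow Serving, the model can be queried
# over the REST API; a sketch, assuming the model is served under the name
# 'sentiment' on the default REST port:
#
#   POST http://localhost:8501/v1/models/sentiment:predict
#   {"instances": [{"text": "love my new iphone", "url": 0.0, "profanity": 0.0}]}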
def main(unused_argv):
    # 80/20 train/test split.
    train_size = int(len(text) * .8)
    train_text = text[:train_size]
    train_sentiment = sentiment[:train_size]
    train_url = url[:train_size]
    train_profanity = profanity[:train_size]
    test_text = text[train_size:]
    test_sentiment = sentiment[train_size:]
    test_url = url[train_size:]
    test_profanity = profanity[train_size:]
    # Embed the raw tweet text with the ELMo module from TF Hub;
    # trainable=True fine-tunes the module weights during training.
    text_embeddings = hub.text_embedding_column(
        "text",
        module_spec="https://tfhub.dev/google/elmo/2", trainable=True
    )
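    # hub.text_embedding_column lets the estimator consume raw strings from
    # the features dict. Under the hood it is roughly equivalent to calling
    # the module directly (illustrative sketch; the ELMo module's default
    # output is a fixed-size 1024-dim vector per input string):
    #
    #   elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)
    #   embeddings = elmo(tf.constant(["love my new iphone"]))  # [1, 1024]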
    # Each sentiment label is a single-character string ('1', '3', or '5'),
    # so the binarizer produces one-hot rows.
    encoder = MultiLabelBinarizer()
    encoder.fit(train_sentiment)
    train_encoded = encoder.transform(train_sentiment)
    test_encoded = encoder.transform(test_sentiment)
    num_classes = len(encoder.classes_)
multi_label_head = tf.contrib.estimator.multi_label_head(
num_classes,
loss_reduction=tf.losses.Reduction.SUM_OVER_BATCH_SIZE
)
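    # The multi-label head applies an independent sigmoid to each class logit
    # and averages sigmoid cross-entropy over the batch; roughly (sketch only,
    # the head also wires up metrics and prediction formatting):
    #
    #   loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
    #       labels=labels, logits=logits))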
    # Numeric columns expose the handcrafted URL and profanity signals to the
    # network alongside the text embedding.
    url_column = tf.feature_column.numeric_column("url")
    profanity_column = tf.feature_column.numeric_column("profanity")
    estimator = tf.contrib.estimator.DNNEstimator(
        head=multi_label_head,
        hidden_units=[64, 10],
        feature_columns=[text_embeddings, url_column, profanity_column],
        model_dir=FLAGS.model_dir
    )
    # Format the training data for numpy_input_fn.
    features = {
        "text": np.array(train_text).astype(np.str),
        "url": np.array(train_url, dtype=np.float32),
        "profanity": np.array(train_profanity, dtype=np.float32)
    }
    labels = np.array(train_encoded)
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        features,
        labels,
        shuffle=True,
        batch_size=FLAGS.batch_size,
        num_epochs=10
    )
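    # numpy_input_fn is roughly equivalent to this tf.data pipeline (sketch):
    #
    #   ds = tf.data.Dataset.from_tensor_slices((features, labels))
    #   ds = ds.shuffle(len(labels)).batch(FLAGS.batch_size).repeat(10)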
    # FLAGS.steps bounds training (the input function also stops after its
    # num_epochs passes over the data).
    estimator.train(input_fn=train_input_fn, steps=FLAGS.steps)
    estimator.export_savedmodel(
        FLAGS.saved_dir, serving_input_receiver_fn=serving_input_receiver_fn)
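    # export_savedmodel writes a timestamped SavedModel under FLAGS.saved_dir;
    # its signatures can be inspected with (path illustrative; substitute the
    # generated timestamp directory):
    #
    #   saved_model_cli show --dir ./models/pb/<timestamp> --all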
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        {
            "text": np.array(test_text).astype(np.str),
            "url": np.array(test_url, dtype=np.float32),
            "profanity": np.array(test_profanity, dtype=np.float32)
        },
        test_encoded.astype(np.int32),
        shuffle=False
    )
    estimator.evaluate(input_fn=eval_input_fn)
if __name__ == "__main__":
tf.app.run()