import plotly.plotly as py
# (*) Useful Python/Plotly tools
import plotly.tools as tls
from plotly.graph_objs import *
import numpy as np
import random
import sys, serial
import urllib2
# We'll consider everything with a Normalized Score below the 25th percentile to be rude.
# Above the 75th percentile is considered polite.
# Scores in the middle are considered neutral.
# Get the 25th and 75th percentiles of Normalized Scores.
rude_thresh = data.describe().loc["25%"]['Normalized Score']
polite_thresh = data.describe().loc["75%"]['Normalized Score']
label_list = ["rude", "neutral", "polite"]
def score_to_label(score):
    # Map a normalized score to a label using the percentile thresholds above.
    if score < rude_thresh:
        return "rude"
    elif score > polite_thresh:
        return "polite"
    return "neutral"
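A quick way to apply this labeling across the whole dataset, assuming the same `data` DataFrame and its 'Normalized Score' column (the new "label" column name is an assumption):

# Attach a categorical label to every row.
data["label"] = data["Normalized Score"].apply(score_to_label)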
from sklearn.model_selection import train_test_split
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime
# Install and import BERT's libraries
!pip install bert-tensorflow
import bert
from bert import run_classifier
from tensorflow import keras
import os
import re

# Load all files from a directory into a DataFrame.
# The loop body below is completed from the TensorFlow Hub text-classification
# tutorial this snippet follows: each file holds one sentence, and the
# sentiment score is encoded in the file name (e.g. "123_8.txt").
def load_directory_data(directory):
  data = {}
  data["sentence"] = []
  data["sentiment"] = []
  for file_path in os.listdir(directory):
    with tf.gfile.GFile(os.path.join(directory, file_path), "r") as f:
      data["sentence"].append(f.read())
      data["sentiment"].append(re.match(r"\d+_(\d+)\.txt", file_path).group(1))
  return pd.DataFrame.from_dict(data)
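In the tutorial this snippet is adapted from, load_directory_data is paired with a helper that merges the positive and negative example directories into one shuffled, labeled DataFrame. A sketch of that helper (the "pos"/"neg" directory layout and the "polarity" column are assumptions carried over from that tutorial):

def load_dataset(directory):
  # Load each class directory separately, then tag rows with a binary label.
  pos_df = load_directory_data(os.path.join(directory, "pos"))
  neg_df = load_directory_data(os.path.join(directory, "neg"))
  pos_df["polarity"] = 1
  neg_df["polarity"] = 0
  # Concatenate and shuffle so training batches mix both classes.
  return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)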
# Use the InputExample class from BERT's run_classifier code to create examples from the data.
train_InputExamples = train.apply(lambda x: bert.run_classifier.InputExample(guid=None,  # Globally unique ID for bookkeeping, unused in this example
                                                                             text_a=x[DATA_COLUMN],
                                                                             text_b=None,  # No second sentence; this is single-sentence classification
                                                                             label=x[LABEL_COLUMN]), axis=1)
test_InputExamples = test.apply(lambda x: bert.run_classifier.InputExample(guid=None,
                                                                           text_a=x[DATA_COLUMN],
                                                                           text_b=None,
                                                                           label=x[LABEL_COLUMN]), axis=1)
# This is a path to an uncased (all lowercase) version of BERT.
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

def create_tokenizer_from_hub_module():
  """Get the vocab file and casing info from the Hub module."""
  with tf.Graph().as_default():
    bert_module = hub.Module(BERT_MODEL_HUB)
    tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
    with tf.Session() as sess:
      vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
                                            tokenization_info["do_lower_case"]])
  return bert.tokenization.FullTokenizer(
      vocab_file=vocab_file, do_lower_case=do_lower_case)
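With the tokenizer in hand, the InputExamples created earlier can be converted into the numeric features BERT consumes, via bert-tensorflow's convert_examples_to_features. A sketch of that step (the MAX_SEQ_LENGTH value of 128 is an assumption; it just has to match the value used when building input functions later):

tokenizer = create_tokenizer_from_hub_module()

# Sequences longer than MAX_SEQ_LENGTH WordPiece tokens are truncated.
MAX_SEQ_LENGTH = 128
train_features = bert.run_classifier.convert_examples_to_features(
    train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
test_features = bert.run_classifier.convert_examples_to_features(
    test_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)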
def create_model(is_predicting, input_ids, input_mask, segment_ids, labels,
                 num_labels):
  """Creates a classification model."""
  bert_module = hub.Module(
      BERT_MODEL_HUB,
      trainable=True)  # trainable=True lets fine-tuning update the BERT weights
  bert_inputs = dict(
      input_ids=input_ids,
      input_mask=input_mask,
      segment_ids=segment_ids)
  bert_outputs = bert_module(
      inputs=bert_inputs,
      signature="tokens",
      as_dict=True)
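The preview cuts create_model off here. In the tutorial this snippet follows, the function continues with a classification head over BERT's pooled output; a hedged sketch of that continuation (indentation matches the function body above):

  # --- hedged continuation of create_model, per the TF Hub BERT tutorial ---
  # Use the pooled [CLS] representation, shape [batch_size, hidden_size],
  # for sentence-level classification.
  output_layer = bert_outputs["pooled_output"]
  hidden_size = output_layer.shape[-1].value

  output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))
  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):
    # Dropout helps prevent overfitting during fine-tuning.
    output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    # Cross-entropy loss against one-hot labels.
    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)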
# model_fn_builder actually creates our model function
# using the passed parameters for num_labels, learning_rate, etc.
def model_fn_builder(num_labels, learning_rate, num_train_steps,
                     num_warmup_steps):
  """Returns `model_fn` closure for TPUEstimator."""
  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator."""
    # Unpack the features produced by convert_examples_to_features.
    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]
# Compute train and warmup steps from batch size.
# These hyperparameters are copied from this colab notebook (https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 3.0
# Warmup is a period of time where the learning rate
# is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1
# Model configs
SAVE_CHECKPOINTS_STEPS = 500
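The comment above describes computing train and warmup steps from these hyperparameters; a sketch of that computation and of wiring model_fn_builder into a standard Estimator follows (train_features comes from convert_examples_to_features, and OUTPUT_DIR is an assumed checkpoint directory):

num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

run_config = tf.estimator.RunConfig(
    model_dir=OUTPUT_DIR,  # assumption: wherever checkpoints should be written
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)
estimator = tf.estimator.Estimator(
    model_fn=model_fn_builder(
        num_labels=len(label_list),
        learning_rate=LEARNING_RATE,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps),
    config=run_config,
    params={"batch_size": BATCH_SIZE})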
# Load our test data
test_input_fn = run_classifier.input_fn_builder(
    features=test_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=False)
estimator.evaluate(input_fn=test_input_fn, steps=None)
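Beyond evaluation, the same feature pipeline can score brand-new comments. A hedged sketch (the example sentences are hypothetical, the placeholder label is ignored at prediction time, and the "probabilities" key assumes the truncated model_fn above returns it, as the tutorial's does):

pred_sentences = ["Thanks, that fixed it!",           # hypothetical examples
                  "Did you even read the question?"]
pred_examples = [run_classifier.InputExample(guid=None, text_a=s,
                                             text_b=None, label="neutral")
                 for s in pred_sentences]
pred_features = run_classifier.convert_examples_to_features(
    pred_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
pred_input_fn = run_classifier.input_fn_builder(
    features=pred_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=False)
for sentence, pred in zip(pred_sentences, estimator.predict(pred_input_fn)):
    print(sentence, "->", label_list[pred["probabilities"].argmax()])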