#######################################
### -------- Load libraries ------- ###
# Load Huggingface transformers
from transformers import TFBertModel, BertConfig, BertTokenizerFast
# Then what you need from tensorflow.keras
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
# And pandas for data import + sklearn, because you always need sklearn
import pandas as pd
from sklearn.model_selection import train_test_split
#######################################
### --------- Import data --------- ###
# Import data from csv
data = pd.read_csv('dev/Fun with BERT/complaints.csv')
# Select required columns
data = data[['Consumer complaint narrative', 'Product', 'Issue']]
# Remove a row if any of the three remaining columns are missing
data = data.dropna()
# Remove rows where the label appears only once (can't be stratified in the split)
data = data.groupby('Issue').filter(lambda x : len(x) > 1)
data = data.groupby('Product').filter(lambda x : len(x) > 1)
# Set your model output as categorical and save in new label col
data['Issue_label'] = pd.Categorical(data['Issue'])
data['Product_label'] = pd.Categorical(data['Product'])
# Transform your output to numeric
data['Issue'] = data['Issue_label'].cat.codes
data['Product'] = data['Product_label'].cat.codes
# Split into train and test - stratify over Issue
data, data_test = train_test_split(data, test_size = 0.2, stratify = data[['Issue']])
#######################################
### --------- Setup BERT ---------- ###
# Name of the BERT model to use
model_name = 'bert-base-uncased'
# Max length of tokens
max_length = 100
# Load transformers config and set output_hidden_states to False
config = BertConfig.from_pretrained(model_name)
config.output_hidden_states = False
# Load BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path = model_name, config = config)
# Load the Transformers BERT model
transformer_model = TFBertModel.from_pretrained(model_name, config = config)
#######################################
### ------- Build the model ------- ###
# TF Keras documentation: https://www.tensorflow.org/api_docs/python/tf/keras/Model
# Load the MainLayer
bert = transformer_model.layers[0]
# Build your model input
input_ids = Input(shape=(max_length,), name='input_ids', dtype='int32')
# attention_mask = Input(shape=(max_length,), name='attention_mask', dtype='int32')
# inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}
inputs = {'input_ids': input_ids}
# Load the Transformers BERT model as a layer in a Keras model
# ([1] selects the pooled output, i.e. the [CLS] token representation)
bert_model = bert(inputs)[1]
dropout = Dropout(config.hidden_dropout_prob, name='pooled_output')
pooled_output = dropout(bert_model, training=False)
# Then build your model output
issue = Dense(units=len(data.Issue_label.value_counts()), kernel_initializer=TruncatedNormal(stddev=config.initializer_range), name='issue')(pooled_output)
product = Dense(units=len(data.Product_label.value_counts()), kernel_initializer=TruncatedNormal(stddev=config.initializer_range), name='product')(pooled_output)
outputs = {'issue': issue, 'product': product}
# And combine it all in a model object
model = Model(inputs=inputs, outputs=outputs, name='BERT_MultiLabel_MultiClass')
# Take a look at the model
model.summary()
#######################################
### ------- Train the model ------- ###
# Set an optimizer
optimizer = Adam(
    learning_rate=5e-05,
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0)
# Set loss and metrics
loss = {'issue': CategoricalCrossentropy(from_logits = True), 'product': CategoricalCrossentropy(from_logits = True)}
metric = {'issue': CategoricalAccuracy('accuracy'), 'product': CategoricalAccuracy('accuracy')}
# Compile the model
model.compile(
    optimizer = optimizer,
    loss = loss,
    metrics = metric)
# Ready output data for the model
y_issue = to_categorical(data['Issue'])
y_product = to_categorical(data['Product'])
# Tokenize the input (takes some time)
x = tokenizer(
    text=data['Consumer complaint narrative'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding=True,
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)
# Fit the model
history = model.fit(
    # x={'input_ids': x['input_ids'], 'attention_mask': x['attention_mask']},
    x={'input_ids': x['input_ids']},
    y={'issue': y_issue, 'product': y_product},
    validation_split=0.2,
    batch_size=64,
    epochs=10)
#######################################
### ----- Evaluate the model ------ ###
# Ready test data
test_y_issue = to_categorical(data_test['Issue'])
test_y_product = to_categorical(data_test['Product'])
test_x = tokenizer(
    text=data_test['Consumer complaint narrative'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding=True,
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = False,
    verbose = True)
# Run evaluation
model_eval = model.evaluate(
    x={'input_ids': test_x['input_ids']},
    y={'issue': test_y_issue, 'product': test_y_product}
)
Can you upload the model.predict() part? Or show how to apply the model.predict() function for this model to a dataframe of narratives?
Predict is the same as evaluate, but without target data (y) - see: https://www.tensorflow.org/api_docs/python/tf/keras/Model#predict
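For example, a minimal sketch of what that could look like (assuming the tokenizer, model, and max_length from the code above; new_data is just an illustrative name for a dataframe of narratives):

# Tokenize the new narratives exactly like the training data
new_x = tokenizer(
    text=new_data['Consumer complaint narrative'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding=True,
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=False,
    verbose=True)
# predict returns a dict with one array of logits per named output
predictions = model.predict(x={'input_ids': new_x['input_ids']})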
I think your tutorial is great. But, like other people who asked you to post the prediction code on the towardsdatascience article,
I too could not get predictions, a confusion matrix, or a classification report working.
Below is my code. I appreciate any help; I am a beginner.
predicted_raw = model.predict({'input_ids':x_test['input_ids']})
y_predicted = numpy.argmax(predicted_raw, axis = 1)
The error is here: y_predicted = numpy.argmax(predicted_raw, axis = 1). The error message says "axis 1 is out of bounds for array of dimension 1". When I change axis to zero, the new error message is "Singleton array 0 cannot be considered a valid collection." I think what the axis=0 error says is that y_predicted is null; I double-checked it with an if statement.
Haven't looked at this for a while, so it's not really fresh in my memory, sorry. But maybe have a look at your data filtering process; it looks like you might need to remove some missing values?
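For anyone hitting the same argmax error: because the model above has two named outputs, model.predict() returns a dict ({'issue': ..., 'product': ...}) rather than a single array, so calling numpy.argmax on the dict itself is what raises the dimension error. A minimal sketch of a likely fix (assuming x_test was tokenized like test_x in the code above):

import numpy
predicted_raw = model.predict({'input_ids': x_test['input_ids']})
# Take the argmax per named output, not over the dict
y_predicted_issue = numpy.argmax(predicted_raw['issue'], axis=1)
y_predicted_product = numpy.argmax(predicted_raw['product'], axis=1)
# Map the numeric codes back to the original category names
issue_names = data['Issue_label'].cat.categories[y_predicted_issue]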