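# Feature engineering for the Jigsaw Toxic Comment Classification dataset.
# Derives per-comment counts (words, characters, misspellings, stopwords,
# swear words, digits, punctuation), the corresponding ratios, and a combined
# "category" label from the six toxicity flags.
# Assumed dependencies: pandas, nltk, tqdm, and pyenchant (which also needs
# the system Enchant library with an en_US dictionary).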
import pandas as pd
import enchant
import nltk
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
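# First-run setup (safe to re-run; NLTK skips packages that are already installed)
nltk.download('punkt')
nltk.download('stopwords')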
# Regex tokenizers: one matching punctuation-like characters, one matching digits
punctuation_tokenizer = RegexpTokenizer(r'[^\sA-Za-z0-9]')
numbers_tokenizer = RegexpTokenizer(r'[0-9]')
# Register tqdm with pandas so progress_apply shows progress bars
tqdm.pandas()
# Enchant dictionary used to detect misspelled words
enchant_dict = enchant.Dict("en_US")
# Load data
df = pd.read_csv("./data/jigsaw.csv")
# Load swear words dataset (a set gives O(1) membership checks)
swear_words = set(pd.read_csv("./data/swear-words.csv")["word"])
def count_misspelled_words(tokens):
    # A token counts as misspelled if it fails the en_US dictionary check
    english = [enchant_dict.check(t) for t in tokens]
    correctly_spelled = sum(english)
    misspelled = len(english) - correctly_spelled
    return misspelled
# Pre-compute the stopword set once; rebuilding the list for every token is slow
english_stopwords = set(stopwords.words('english'))
def count_stopwords(tokens):
    return len([w for w in tokens if w in english_stopwords])
def count_swearwords(tokens):
    return len([w for w in tokens if w in swear_words])
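# Illustrative sanity check (assumes en_US flags the made-up token "qwkz"):
# count_misspelled_words(["the", "qwkz", "dog"])  -> 1
# count_stopwords(["the", "qwkz", "dog"])         -> 1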
# Tokenize
df["tokens"] = df.progress_apply(lambda row: nltk.word_tokenize(row.comment_text), axis=1)
# Get Number of tokens
df["num_words"] = df.progress_apply(lambda row: len(row.tokens), axis=1)
# Get Number of characters
df["num_chars"] = df.progress_apply(lambda row: len(row.comment_text), axis=1)
# Count misspelled words
df["misspelled"] = df.progress_apply(lambda row: count_misspelled_words(row.tokens), axis=1)
# Calculate correctly spelled words
df["correctly_spelled"] = df.progress_apply(lambda row: len(row.tokens) - row.misspelled, axis=1)
# Count stopwords
df["stopwords"] = df.progress_apply(lambda row: count_stopwords(row.tokens), axis=1)
# Count swear words
df["swear_words"] = df.progress_apply(lambda row: count_swearwords(row.tokens), axis=1)
# Count digit characters
df["numbers"] = df.progress_apply(lambda row: len(numbers_tokenizer.tokenize(row.comment_text)), axis=1)
# Count punctuation characters (any non-alphanumeric, non-whitespace character)
df["punctuation"] = df.progress_apply(lambda row: len(punctuation_tokenizer.tokenize(row.comment_text)), axis=1)
df["punctuation_ratio"] = df["punctuation"] / df["num_chars"]
df["swear_words_ratio"] = df["swear_words"] / df["num_words"]
df["stopwords_ratio"] = df["stopwords"] / df["num_words"]
df["correctly_spelled_ratio"] = df["correctly_spelled"] / df["num_words"]
df["mispelled_spelled_ratio"] = df["mispelled"] / df["num_words"]
df["toxic_string"] = ""
df.loc[df["toxic"] == 1, "toxic_string"] = " Toxic "
df["severe_toxic_string"] = ""
df.loc[df["severe_toxic"] == 1, "severe_toxic_string"] = " Severe-Toxic "
df["obscene_string"] = ""
df.loc[df["obscene"] == 1, "obscene_string"] = " Obscene "
df["threat_string"] = ""
df.loc[df["threat"] == 1, "threat_string"] = " Threat "
df["insult_string"] = ""
df.loc[df["insult"] == 1, "insult_string"] = " Insult "
df["identity_hate_string"] = ""
df.loc[df["identity_hate"] == 1, "identity_hate_string"] = " Identity-hate "
df["category"] = df["toxic_string"] + df["severe_toxic_string"] + \
df["obscene_string"] + df["threat_string"] + df["insult_string"] + df["identity_hate_string"]
df.loc[df["category"] == "", "category"] = "Non Toxic Comment"
df.drop(["comment_text", "tokens"], axis=1).to_csv("./data/jigsaw_dataset.csv", index=False)