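# Feature engineering for the Jigsaw Toxic Comment Classification dataset.
# Derives per-comment counts (words, characters, misspellings, stopwords,
# swear words, digits, punctuation), the corresponding ratios, and a combined
# "category" label from the six toxicity flags.
# Assumed dependencies: pandas, nltk, tqdm, and pyenchant (which also needs
# the system Enchant library with an en_US dictionary).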
import pandas as pd
import enchant
import nltk
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
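# First-run setup (safe to re-run; NLTK skips packages that are already installed)
nltk.download('punkt')
nltk.download('stopwords')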
# Regex tokenizers: one matching punctuation-like characters, one matching digits
punctuation_tokenizer = RegexpTokenizer(r'[^\sA-Za-z0-9]')
numbers_tokenizer = RegexpTokenizer(r'[0-9]')
# Register tqdm with pandas so progress_apply shows progress bars
tqdm.pandas()
# Enchant dictionary used to detect misspelled words
enchant_dict = enchant.Dict("en_US")
# Load data
df = pd.read_csv("./data/jigsaw.csv")
# Load swear words dataset (a set gives O(1) membership checks)
swear_words = set(pd.read_csv("./data/swear-words.csv")["word"])
def count_misspelled_words(tokens):
    # A token counts as misspelled if it fails the en_US dictionary check
    english = [enchant_dict.check(t) for t in tokens]
    correctly_spelled = sum(english)
    misspelled = len(english) - correctly_spelled
    return misspelled
# Pre-compute the stopword set once; rebuilding the list for every token is slow
english_stopwords = set(stopwords.words('english'))
def count_stopwords(tokens):
    return len([w for w in tokens if w in english_stopwords])
def count_swearwords(tokens):
    return len([w for w in tokens if w in swear_words])
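# Illustrative sanity check (assumes en_US flags the made-up token "qwkz"):
# count_misspelled_words(["the", "qwkz", "dog"])  -> 1
# count_stopwords(["the", "qwkz", "dog"])         -> 1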
# Tokenize
df["tokens"] = df.progress_apply(lambda row: nltk.word_tokenize(row.comment_text), axis=1)
# Get Number of tokens
df["num_words"] = df.progress_apply(lambda row: len(row.tokens), axis=1)
# Get Number of characters
df["num_chars"] = df.progress_apply(lambda row: len(row.comment_text), axis=1)
# Count misspelled words
df["misspelled"] = df.progress_apply(lambda row: count_misspelled_words(row.tokens), axis=1)
# Calculate correctly spelled words
df["correctly_spelled"] = df.progress_apply(lambda row: len(row.tokens) - row.misspelled, axis=1)
# Count stopwords
df["stopwords"] = df.progress_apply(lambda row: count_stopwords(row.tokens), axis=1)
# Count swear words
df["swear_words"] = df.progress_apply(lambda row: count_swearwords(row.tokens), axis=1)
# Count digit characters
df["numbers"] = df.progress_apply(lambda row: len(numbers_tokenizer.tokenize(row.comment_text)), axis=1)
# Count punctuation characters (any non-alphanumeric, non-whitespace character)
df["punctuation"] = df.progress_apply(lambda row: len(punctuation_tokenizer.tokenize(row.comment_text)), axis=1)
df["punctuation_ratio"] = df["punctuation"] / df["num_chars"]
df["swear_words_ratio"] = df["swear_words"] / df["num_words"]
df["stopwords_ratio"] = df["stopwords"] / df["num_words"]
df["correctly_spelled_ratio"] = df["correctly_spelled"] / df["num_words"]
df["mispelled_spelled_ratio"] = df["mispelled"] / df["num_words"]
df["toxic_string"] = ""
df.loc[df["toxic"] == 1, "toxic_string"] = " Toxic "
df["severe_toxic_string"] = ""
df.loc[df["severe_toxic"] == 1, "severe_toxic_string"] = " Severe-Toxic "
df["obscene_string"] = ""
df.loc[df["obscene"] == 1, "obscene_string"] = " Obscene "
df["threat_string"] = ""
df.loc[df["threat"] == 1, "threat_string"] = " Threat "
df["insult_string"] = ""
df.loc[df["insult"] == 1, "insult_string"] = " Insult "
df["identity_hate_string"] = ""
df.loc[df["identity_hate"] == 1, "identity_hate_string"] = " Identity-hate "
df["category"] = df["toxic_string"] + df["severe_toxic_string"] + \
df["obscene_string"] + df["threat_string"] + df["insult_string"] + df["identity_hate_string"]
df.loc[df["category"] == "", "category"] = "Non Toxic Comment"
df.drop(["comment_text", "tokens"], axis=1).to_csv("./data/jigsaw_dataset.csv", index=False)