Created
July 19, 2020 17:55
-
-
Save aialenti/82321db23a6f0d07a48b4289d5f30eaf to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
| import enchant | |
| import nltk | |
| from tqdm import tqdm | |
| from nltk.corpus import stopwords | |
| from nltk.tokenize import RegexpTokenizer | |
# Declare tokenizers
# Matches single characters that are neither whitespace nor alphanumeric
# (used below to count punctuation marks in the raw comment text).
punctuation_tokenizer = RegexpTokenizer(r'[^\sA-Za-z0-9]')
# Matches single digit characters (used below to count digits).
numbers_tokenizer = RegexpTokenizer(r'[0-9]')
# TQDM for nice loading bars: registers progress_apply on pandas objects.
tqdm.pandas()
# Enchant US-English dictionary used to test for misspelled words
enchant_dict = enchant.Dict("en_US")
# Load data — expects a "comment_text" column plus the six toxicity
# label columns (toxic, severe_toxic, obscene, threat, insult,
# identity_hate) referenced later in this script.
df = pd.read_csv("./data/jigsaw.csv")
# Load Swear words dataset as a plain Python list of strings
swear_words = pd.read_csv("./data/swear-words.csv")["word"].to_list()
def count_mispelled_words(tokens):
    """Return how many tokens are not recognised by the en_US dictionary.

    Counts directly with a generator instead of materialising the
    per-token boolean list.
    """
    return sum(1 for token in tokens if not enchant_dict.check(token))
def count_stopwords(tokens):
    """Return the number of tokens that are English stopwords.

    Fix: the original evaluated ``stopwords.words('english')`` inside the
    comprehension's condition, so NLTK rebuilt the entire stopword list
    once per token and membership was a linear list scan (O(n*m) per
    call). Build the stopword set once per call for O(1) lookups; the
    returned count is unchanged.
    """
    english_stopwords = set(stopwords.words('english'))
    return sum(1 for w in tokens if w in english_stopwords)
def count_swearwords(tokens):
    """Return how many tokens appear in the loaded swear-words list."""
    matches = 0
    for token in tokens:
        if token in swear_words:
            matches += 1
    return matches
# Tokenize each comment into word/punctuation tokens
df["tokens"] = df.progress_apply(lambda row: nltk.word_tokenize(row.comment_text), axis=1)
# Get Number of tokens
df["num_words"] = df.progress_apply(lambda row: len(row.tokens), axis=1)
# Get Number of characters (raw comment length)
df["num_chars"] = df.progress_apply(lambda row: len(row.comment_text), axis=1)
# Count Mispelled words (tokens the en_US dictionary does not recognise)
df["mispelled"] = df.progress_apply(lambda row: count_mispelled_words(row.tokens), axis=1)
# Calculate correctly spelled words (total tokens minus misspelled)
df["correctly_spelled"] = df.progress_apply(lambda row: len(row.tokens) - row.mispelled, axis=1)
# Count stopwords
df["stopwords"] = df.progress_apply(lambda row: count_stopwords(row.tokens), axis=1)
# Count swear words
df["swear_words"] = df.progress_apply(lambda row: count_swearwords(row.tokens), axis=1)
# Count Numbers (digit characters in the raw text)
df["numbers"] = df.progress_apply(lambda row: len(numbers_tokenizer.tokenize(row.comment_text)), axis=1)
# Count Punctuation (non-alphanumeric, non-whitespace characters)
df["punctuation"] = df.progress_apply(lambda row: len(punctuation_tokenizer.tokenize(row.comment_text)), axis=1)
# Ratio features normalised by comment length.
# NOTE(review): these divisions produce inf/NaN for rows where num_chars
# or num_words is 0 — confirm the dataset contains no empty comments.
df["punctuation_ratio"] = df["punctuation"] / df["num_chars"]
df["swear_words_ratio"] = df["swear_words"] / df["num_words"]
df["stopwords_ratio"] = df["stopwords"] / df["num_words"]
df["correctly_spelled_ratio"] = df["correctly_spelled"] / df["num_words"]
df["mispelled_spelled_ratio"] = df["mispelled"] / df["num_words"]
# Per-label tag columns: each label column maps to the tag string that is
# concatenated into the comment's combined "category" description.
label_tags = [
    ("toxic", " Toxic "),
    ("severe_toxic", " Severe-Toxic "),
    ("obscene", " Obscene "),
    ("threat", " Threat "),
    ("insult", " Insult "),
    ("identity_hate", " Identity-hate "),
]
for label_col, tag in label_tags:
    tag_col = label_col + "_string"
    df[tag_col] = ""
    df.loc[df[label_col] == 1, tag_col] = tag
# Concatenate the tag columns into a single human-readable category.
df["category"] = ""
for label_col, _ in label_tags:
    df["category"] = df["category"] + df[label_col + "_string"]
df.loc[df["category"] == "", "category"] = "Non Toxic Comment"
# Persist engineered features, dropping the raw text and token columns.
df.drop(["comment_text", "tokens"], axis=1).to_csv("./data/jigsaw_dataset.csv", index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment