Last active
November 1, 2015 22:12
-
-
Save scazon/0bf7753b03355132cda8 to your computer and use it in GitHub Desktop.
Offensive word detection in Python 3 via Wiktionary lists
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import urllib.parse
import urllib.request
def isOffensive(word):
    """Return True if Wiktionary files *word* under an offensive-looking category.

    Queries the English Wiktionary API for the categories of the page titled
    *word* (following redirects), fetching further result batches until the
    API stops returning a ``continue`` object. Each category title is printed
    as it is inspected.

    Args:
        word: The page title to look up. It is percent-encoded before being
            placed in the query string.

    Returns:
        True as soon as a category title contains (case-insensitively) one of
        the banned substrings; False once every batch has been inspected
        without a match. A page that reports no categories at all (for
        example, a title with no Wiktionary entry) yields False.

    Raises:
        urllib.error.URLError: If the HTTP request fails.
        ValueError: If the response body is not valid JSON.
    """
    wiki_url = "http://en.wiktionary.org/w/api.php?format=json&action=query&prop=categories&redirects=1&titles="
    # Substrings that mark a category as offensive; built once, not per title.
    banned_cats = ("offensive", "swear", "vulgarities", "slurs", "derogatory", "slang")

    # Percent-encode the title so spaces, non-ASCII characters, '&', '#', etc.
    # cannot corrupt the query string.
    base_url = wiki_url + urllib.parse.quote(word)

    continuation = {}
    while True:
        url = base_url
        if continuation:
            # Echo back every continuation parameter the API handed us;
            # urlencode() also escapes the pipes (|) these values contain.
            url += "&" + urllib.parse.urlencode(continuation)

        request = urllib.request.Request(url)
        # The context manager closes the connection even if decoding fails.
        with urllib.request.urlopen(request) as response:
            # read(), not readall(): HTTPResponse has no readall() on 3.5+.
            wiki_query = json.loads(response.read().decode("utf-8"))

        # No "continue" object means this is the final batch.
        continuation = wiki_query.get("continue", {})

        # The key under "pages" is the page id, which is not known ahead of
        # time, so take the first (only) entry instead of indexing by name.
        pages = wiki_query.get("query", {}).get("pages", {})
        first_page = next(iter(pages.values()), {})

        # Pages without a "categories" key are treated as having none.
        for category in first_page.get("categories", []):
            title = category["title"]
            print(title)
            lowered = title.lower()
            if any(banned in lowered for banned in banned_cats):
                print("SKIPPING")
                return True

        if not continuation:
            print("Not offensive")
            return False
Author
scazon
commented
Nov 1, 2015
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment