Created
September 4, 2014 07:20
-
-
Save mobilestack/a4e87153b9cd990be19b to your computer and use it in GitHub Desktop.
I use this to parse badwords files from several sources
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
badwords source: https://github.com/shutterstock/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/blob/master/en
badwords source 2: http://urbanoalvarez.es/blog/2008/04/04/bad-words-list/
""" | |
def _normalize_words(lines):
    """Return the cleaned, de-duplicated word list built from *lines*.

    Each entry is stripped of surrounding whitespace, has its inner spaces
    and hyphens removed, is dropped unless the result is purely alphabetic,
    and is then lower-cased. The first occurrence of each word wins, so the
    input order is preserved.
    """
    seen = set()  # O(1) membership test; `words` keeps the output order
    words = []
    for raw in lines:
        # Remove leading/trailing whitespace, then inner spaces and hyphens.
        word = raw.strip().replace(" ", "").replace("-", "")
        # Filter into a new list rather than calling list.remove() on the
        # list being iterated: removing during iteration skips the element
        # that follows each removed one, so some non-alphabetic entries
        # would survive a single pass.
        if not word.isalpha():
            continue
        word = word.lower()
        if word not in seen:
            seen.add(word)
            words.append(word)
    return words


def main(src="badwords.txt", dst="badwords2.txt"):
    """Read the raw word list from *src* and write the cleaned list to *dst*.

    The output file contains one lower-case, alphabetic-only word per line,
    without duplicates. The input is read completely before the output file
    is opened.
    """
    with open(src) as infile:
        words = _normalize_words(infile)
    with open(dst, "w") as outfile:
        outfile.writelines("%s\n" % word for word in words)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment