Last active
November 9, 2015 22:44
-
-
Save ArtOfCode-/8991b5aa7d1983dba51d to your computer and use it in GitHub Desktop.
An analysis script for bad flags on SO: it reads a list of bad flags, one per line, from bad_flags.dta and reports the word n-grams that occur most often.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
def get_list_index(list_obj, item, alternative=None):
    """Return the index of ``item`` in ``list_obj``, falling back to ``alternative``.

    Args:
        list_obj: Sequence searched through its ``index`` method.
        item: Value looked up first.
        alternative: Optional value looked up when ``item`` is absent.
            ``None`` means there is no fallback.

    Returns:
        The index of the first match, or ``None`` when neither value is
        present in ``list_obj``.
    """
    candidates = (item,) if alternative is None else (item, alternative)
    for candidate in candidates:
        try:
            return list_obj.index(candidate)
        except ValueError:
            # ``index`` reports "not found" with ValueError; any other
            # exception is a genuine error and is allowed to propagate.
            continue
    return None
def main():
    """Count the n-grams found in ``bad_flags.dta`` and print the frequent ones.

    Every line of the file is lower-cased, stripped with ``strip_line`` and
    split into n-grams of three words by ``get_n_grams``. Each n-gram whose
    count is at least the threshold is printed as ``'<gram>' : <count>``,
    most frequent first.

    The threshold defaults to 5 and can be overridden on the command line
    with ``-t N`` or ``--threshold N``. A flag given without a following
    value is ignored; a value that is not an integer ends the program with
    an error message.
    """
    from collections import Counter

    threshold = 5
    threshold_index = get_list_index(sys.argv, "-t", "--threshold")
    # Only read a value when something actually follows the flag.
    if threshold_index is not None and threshold_index + 1 < len(sys.argv):
        raw_threshold = sys.argv[threshold_index + 1]
        try:
            threshold = int(raw_threshold)
        except ValueError:
            sys.exit(
                "Threshold must be an integer, got '{0}'".format(raw_threshold)
            )

    size = 3
    n_grams = Counter()
    for line in read_file_lines("bad_flags.dta"):
        n_grams.update(get_n_grams(strip_line(line.lower()), size))

    for gram, count in n_grams.most_common():
        if count < threshold:
            # Counts are in descending order, so nothing later can qualify.
            break
        print("'{0}' : {1}".format(gram, count))
def strip_line(line):
    """Return ``line`` with newlines and basic punctuation removed.

    The characters dropped are the newline, ``.``, ``,``, ``?``, ``!``,
    ``:`` and ``;``. Every other character is kept in its original order.
    """
    unwanted = ("\n", ".", ",", "?", "!", ":", ";")
    kept = []
    for char in line:
        if char in unwanted:
            continue
        kept.append(char)
    return "".join(kept)
def read_file_lines(file):
    """Read the text file at path ``file`` and return its lines as a list.

    Line endings are kept on the returned strings, exactly as the open file
    yields them.
    """
    with open(file, "r") as handle:
        lines = list(handle)
    return lines
def get_n_grams(line, size=2):
    """Split ``line`` into hyphen-joined word groups of up to ``size`` words.

    The line is split on single spaces and then cut into consecutive chunks
    of ``size`` words, once for every starting offset from 0 to
    ``size - 1``. Together these passes cover every window of ``size``
    adjacent words. The last chunk of each pass may hold fewer than ``size``
    words and is still returned.

    Args:
        line: Text to split; words are separated by single spaces.
        size: Number of words per group. Must be at least 1.

    Returns:
        A list of strings, each the words of one chunk joined with ``-``.
        Chunks from offset 0 come first, then offset 1, and so on.

    Raises:
        ValueError: If ``size`` is smaller than 1.
    """
    if size < 1:
        raise ValueError("size must be a positive integer")

    words = line.split(" ")
    grams = []
    # One pass per offset, so any ``size`` is covered rather than only 2 or 3.
    for offset in range(size):
        for start in range(offset, len(words), size):
            grams.append("-".join(words[start:start + size]))
    return grams
# Run the analysis only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment