Skip to content

Instantly share code, notes, and snippets.

@ArtOfCode-
Last active November 9, 2015 22:44
Show Gist options
  • Save ArtOfCode-/8991b5aa7d1983dba51d to your computer and use it in GitHub Desktop.
Save ArtOfCode-/8991b5aa7d1983dba51d to your computer and use it in GitHub Desktop.
An analysis script for bad flags on SO, when given a list of bad flags one-per-line in bad_flags.dta.
import sys
def get_list_index(list_obj, item, alternative=None):
    """Return the position of ``item`` in ``list_obj``, or of ``alternative``.

    ``item`` is looked up first. If it is not present and ``alternative`` is
    not ``None``, ``alternative`` is looked up instead.

    Args:
        list_obj: The list to search.
        item: The preferred value to look for.
        alternative: Optional fallback value to look for when ``item`` is
            absent. ``None`` means "no fallback".

    Returns:
        The index of the first match, or ``None`` when neither value is in
        the list.
    """
    # Only ValueError means "not found"; anything else is a real error and
    # must not be silently swallowed.
    try:
        return list_obj.index(item)
    except ValueError:
        pass

    if alternative is None:
        return None

    try:
        return list_obj.index(alternative)
    except ValueError:
        return None
def main():
    """Count word n-grams in ``bad_flags.dta`` and print the frequent ones.

    The minimum count to report defaults to 5 and can be overridden with
    ``-t N`` or ``--threshold N`` on the command line. Every line of the
    input file is lower-cased, stripped of punctuation and split into
    3-word grams; grams seen at least ``threshold`` times are printed,
    most frequent first.
    """
    threshold = 5
    args = sys.argv
    if "-t" in args or "--threshold" in args:
        flag_pos = get_list_index(args, "-t", "--threshold")
        value_pos = flag_pos + 1
        # Only read a value if something actually follows the flag.
        if value_pos < len(args):
            threshold = int(args[value_pos])

    size = 3
    counts = {}
    for raw_line in read_file_lines("bad_flags.dta"):
        cleaned = strip_line(raw_line.lower())
        for gram in get_n_grams(cleaned, size):
            counts[gram] = counts.get(gram, 0) + 1

    ranked = list(counts.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    for gram, count in ranked:
        if count < threshold:
            continue
        print("'{0}' : {1}".format(gram, str(count)))
def strip_line(line):
    """Return ``line`` with newlines and basic punctuation removed.

    The characters dropped are the newline character and ``. , ? ! : ;``.
    All other characters are kept in their original order.
    """
    cleaned = line
    for unwanted in "\n.,?!:;":
        cleaned = cleaned.replace(unwanted, "")
    return cleaned
def read_file_lines(file):
    """Open ``file`` for reading in text mode and return its lines as a list.

    Each returned string keeps its line ending, if it had one.
    """
    with open(file, "r") as handle:
        contents = list(handle)
    return contents
def get_n_grams(line, size=2):
    """Split ``line`` into dash-joined word groups of ``size`` words.

    The line is split on single spaces. For every starting offset from 0 to
    ``size - 1`` the words from that offset onwards are cut into consecutive
    chunks of ``size`` words, and each chunk is joined with ``-``. Together
    the offsets cover every window of ``size`` adjacent words.

    A trailing chunk shorter than ``size`` is still emitted, so the result
    can contain groups with fewer than ``size`` words.

    Args:
        line: The text to split.
        size: Number of words per group; must be at least 1.

    Returns:
        A list of dash-joined groups, ordered by offset and then by position.

    Raises:
        ValueError: If ``size`` is smaller than 1.
    """
    if size < 1:
        raise ValueError("size must be a positive integer")

    words = line.split(" ")
    grams = []
    # One pass per offset instead of hard-coded passes for sizes 2 and 3,
    # so any size is fully covered and no window is counted twice.
    for offset in range(size):
        grams.extend(
            "-".join(words[start:start + size])
            for start in range(offset, len(words), size)
        )
    return grams
# Run the analysis only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment