Skip to content

Instantly share code, notes, and snippets.

@pinscript
Created November 17, 2011 19:08
Show Gist options
  • Select an option

  • Save pinscript/1374113 to your computer and use it in GitHub Desktop.

Select an option

Save pinscript/1374113 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
# -*- coding: utf-8 -*-
import re
# Profanity table: (regex pattern, score) pairs. get_score() tries the
# patterns in order with a case-insensitive re.search and returns the score
# of the first pattern found anywhere inside the word.
# The patterns are unicode literals so that non-ASCII entries such as
# u"jävla" can match decoded (unicode) words on Python 2; on Python 3 the
# u prefix changes nothing.
score_table = [
    (u"fitta", -300),
    (u"kuk", -300),
    (u"jävla", -20),
    (u"helvete", -70),
    (u"skit", -40),
    (u"satans", -20),
    (u"fan", -30),
]
# Swedish stop words; get_score() gives a word 0 points when its lower-cased
# form is in this list.
# The entries are unicode literals: on Python 2 a decoded (unicode) word never
# compares equal to a non-ASCII byte string, so entries such as u"där" or
# u"också" would otherwise never be filtered. On Python 3 the u prefix changes
# nothing.
stop_words = [
    u"alla", u"allt", u"alltså", u"andra", u"att", u"bara", u"bli", u"blir", u"borde", u"bra",
    u"mitt", u"ser", u"dem", u"den", u"denna", u"det", u"detta", u"dig", u"din", u"dock", u"dom", u"där",
    u"edit", u"efter", u"eftersom", u"eller", u"ett", u"fast", u"fel", u"fick", u"finns", u"fram", u"från",
    u"får", u"fått", u"för", u"första", u"genom", u"ger", u"går", u"gör", u"göra", u"hade", u"han", u"har",
    u"hela", u"helt", u"honom", u"hur", u"här", u"iaf", u"igen", u"ingen", u"inget", u"inte", u"jag", u"kan",
    u"kanske", u"kommer", u"lika", u"lite", u"man", u"med", u"men", u"mer", u"mig", u"min", u"mot", u"mycket",
    u"många", u"måste", u"nog", u"när", u"någon", u"något", u"några", u"nån", u"nåt", u"och", u"också", u"rätt",
    u"samma", u"sedan", u"sen", u"sig", u"sin", u"själv", u"ska", u"skulle", u"som", u"sätt", u"tar", u"till",
    u"tror", u"tycker", u"typ", u"upp", u"utan", u"vad", u"var", u"vara", u"vet", u"vid", u"vilket", u"vill",
    u"väl", u"är", u"även", u"över",
]
# Scoring parameters read by get_score(), listed in the order it consults them.

# Words with fewer characters than this score 0.
minimum_word_length = 3
# Words with strictly more characters than this earn a length-based score...
minimum_reward_length = 8
# ...equal to their length multiplied by this factor.
reward = 0.8
# Score of every other word that is counted.
good_word_score = 2
def get_score(word):
    """Return the score contribution of a single word.

    ``word`` may be UTF-8 encoded bytes (a Python 2 ``str``) or already
    decoded text. Surrounding whitespace is ignored. The rules are applied
    in this order and the first one that applies decides the result:

    1. fewer than ``minimum_word_length`` characters -> 0
    2. lower-cased word is in ``stop_words`` -> 0
    3. a ``score_table`` pattern occurs anywhere in the word
       (case-insensitive) -> that pattern's (negative) score
    4. no ASCII letter in the word -> 0
    5. more than ``minimum_reward_length`` characters ->
       ``len(word) * reward``; a line reporting the reward is printed
    6. otherwise -> ``good_word_score``
    """
    # Only bytes need decoding; calling .decode() on text fails on Python 3
    # and, for non-ASCII text, on Python 2 as well.
    if isinstance(word, bytes):
        word = word.decode('utf-8')
    word = word.strip()
    length = len(word)
    # words shorter than minimum_word_length characters are not counted
    if length < minimum_word_length:
        return 0
    # stop words are not counted
    if word.lower() in stop_words:
        return 0
    # bad words are rewarded with a negative score; the first matching
    # pattern wins. re.UNICODE makes the case-insensitive match cover
    # non-ASCII letters on Python 2 (it is already the default for text
    # patterns on Python 3).
    for pattern, penalty in score_table:
        if re.search(pattern, word, re.IGNORECASE | re.UNICODE):
            return penalty
    # words without a single ASCII letter are not counted; the class is
    # spelled A-Z (not A-z) so that [ \ ] ^ _ ` do not count as letters
    if re.search("[A-Za-z]", word) is None:
        return 0
    # reward longer (probably more specific and relevant) words with
    # `reward` times their length
    if length > minimum_reward_length:
        bonus = length * reward
        # single pre-formatted argument: prints the same line on Python 2 and 3
        print("rewarding  %s  with  %s" % (word, bonus))
        return bonus
    return good_word_score
# score = sum(-profanity) + sum(good_word)
def score_text(text):
    """Return the total score of ``text``: the sum of get_score() over its words.

    Words are separated by any run of whitespace (spaces, tabs, newlines),
    so the last word of one line is not glued to the first word of the
    next. ``text`` may be bytes or decoded text; the words are handed to
    get_score() in the same type. Returns 0 for text without words.
    """
    if isinstance(text, bytes):
        # Split bytes on ASCII whitespace explicitly so the result never
        # depends on the locale and multi-byte UTF-8 sequences stay intact.
        # Empty pieces at the edges are harmless: get_score() gives them 0.
        words = re.split(b"[ \t\n\r\x0b\x0c]+", text)
    else:
        words = text.split()
    return sum(get_score(word) for word in words)
# Score the contents of text.txt and print the total.
# The with-block closes the file even if reading it raises.
with open("text.txt", "r") as f:
    text = f.read()
score = score_text(text)
# Parenthesised single argument: prints the same on Python 2 and Python 3.
print(score)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment