reddit spam filter
Created November 18, 2012 15:11
description "train the spam filter" | |
instance $x | |
stop on reddit-stop or runlevel [016] | |
respawn | |
respawn limit 10 5 | |
nice 10 | |
script | |
. /etc/default/reddit | |
wrap-job paster run --proctitle learn_q$x $REDDIT_INI $REDDIT_ROOT/r2/lib/learn.py -c 'run()' | |
end script |
description "check spam for newly submitted links" | |
instance $x | |
stop on reddit-stop or runlevel [016] | |
respawn | |
respawn limit 10 5 | |
nice 10 | |
script | |
. /etc/default/reddit | |
wrap-job paster run --proctitle spam_q$x $REDDIT_INI $REDDIT_ROOT/r2/lib/spam.py -c 'run()' | |
end script |
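
Both jobs follow the same pattern: paster run loads the app from $REDDIT_INI, executes the named module, then evaluates the -c string. Conceptually each consumer instance boils down to this sketch (not how paster itself works internally):

# Rough equivalent of one spam_q consumer instance once paster has
# loaded the application: import the module and call run(), which
# blocks forever pulling messages off its queue.
from r2.lib import spam
spam.run()  # the learn_q job does the same with r2.lib.learn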
# This is my current declare_queues(). You must also add spam_q to the consumer-counts file.
def declare_queues():
    queues = Queues({
        "scraper_q": MessageQueue(),
        "newcomments_q": MessageQueue(),
        "commentstree_q": MessageQueue(),
        "commentstree_fastlane_q": MessageQueue(),
        "vote_link_q": MessageQueue(bind_to_self=True),
        "vote_comment_q": MessageQueue(bind_to_self=True),
        "vote_fastlane_q": MessageQueue(bind_to_self=True),
        "log_q": MessageQueue(bind_to_self=True),
        "usage_q": MessageQueue(bind_to_self=True, durable=False),
        "cloudsearch_changes": MessageQueue(bind_to_self=True),
        "update_promos_q": MessageQueue(bind_to_self=True),
        "spam_q": MessageQueue(),
        "learn_q": MessageQueue(),
    })

    queues.cloudsearch_changes << "search_changes"
    queues.scraper_q << "new_link"
    queues.newcomments_q << "new_comment"
    queues.commentstree_q << "new_comment"
    queues.commentstree_fastlane_q << "new_fastlane_comment"
    queues.spam_q << "new_link"
    queues.spam_q << "new_comment"
    queues.learn_q << "ban"
    queues.learn_q << "unban"

    return queues
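
Because of the << bindings, one published event can feed several queues: a single "new_link" message reaches both scraper_q and spam_q, and a "new_comment" reaches newcomments_q, commentstree_q, and spam_q. A sketch of the producing side, using the same amqp helper this gist uses elsewhere (the link variable is assumed to be a freshly submitted Link):

# One add_item() call is enough -- the broker delivers a copy to
# every queue bound to the "new_link" routing key above.
from r2.lib import amqp
amqp.add_item('new_link', link._fullname)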
# Changed: now uses amqp. Add this to the imports:
from r2.lib import amqp

# and this to spam(), after t._commit():
amqp.add_item('ban', t._fullname)

# and this to unspam():
amqp.add_item('unban', t._fullname)
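
For context, a hypothetical skeleton of where those calls land (the real spam()/unspam() in reddit's admintools take more arguments and do more bookkeeping; this only shows the placement):

from r2.lib import amqp

def spam(things):
    for t in things:                       # each t is a banned thing
        t._spam = True
        t._commit()
        amqp.add_item('ban', t._fullname)  # feeds the learn_q trainer

def unspam(things):
    for t in things:
        t._spam = False
        t._commit()
        amqp.add_item('unban', t._fullname)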
# r2/lib/learn.py
from pylons import g, config
from r2.models.link import Link, Comment
from r2.lib import amqp
from r2.lib.spam import classifyr_request, classify_link
from r2.lib.utils import TimeoutFunction, TimeoutFunctionException
import json
import traceback

def learn(link):
    msg = ""
    url = ""
    if hasattr(link, 'selftext'):
        # self post
        msg = link.selftext
    elif hasattr(link, 'body'):
        # comment
        msg = link.body
    if hasattr(link, 'url') and link.url[0:3] != "/r/":
        # not a self post
        url = link.url
    body = json.dumps({'message': msg,
                       'url': url,
                       'features': {'author:' + str(link.author_id): 1},
                       'category': 'spam' if link._spam else 'ham'})
    classifyr_request('/api/simple-spam/learn', body)

def run():
    def process_link(msg):
        def _process_link(fname):
            link = Link._by_fullname(fname, data=True)
            learn(link)
        fname = msg.body
        try:
            TimeoutFunction(_process_link, 30)(fname)
        except TimeoutFunctionException:
            print "Timed out on %s" % fname
        except KeyboardInterrupt:
            raise
        except:
            print "Error fetching %s" % fname
            print traceback.format_exc()

    amqp.consume_items('learn_q', process_link)
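
For reference, the request body learn() builds for a banned self post looks roughly like this (message text and author id are illustrative):

# What gets POSTed to /api/simple-spam/learn for a spammed self post
# by account id 1234; external links put their URL in 'url' instead.
{"message": "BUY CHEAP WATCHES",
 "url": "",
 "features": {"author:1234": 1},
 "category": "spam"}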
# r2/lib/spam.py
# Initial classifyr.com support
from pylons import g, config
from r2.models.link import Link, Comment
from r2.models.account import Account  # used by spam_if_classified()
from r2.lib import amqp
from hashlib import sha1
from r2.lib.db.queries import ban
import httplib, urllib, json
import traceback
from r2.lib.utils import TimeoutFunction, TimeoutFunctionException

def classifyr_request(endpoint, body):
    checksum = sha1(body + g.classifyr_key).hexdigest()
    headers = {"classifyr-api-checksum": checksum,
               "classifyr-api-user": g.classifyr_username}
    conn = httplib.HTTPConnection("classifyr.com")
    conn.request('POST', endpoint, body, headers)
    resp = conn.getresponse()
    return resp.read()

def classify_link(link):
    msg = ""
    url = ""
    if hasattr(link, 'selftext'):
        # self post
        msg = link.selftext
    elif hasattr(link, 'body'):
        # comment
        msg = link.body
    if hasattr(link, 'url') and link.url[0:3] != "/r/":
        # not a self post
        url = link.url
    body = json.dumps({'message': msg,
                       'url': url,
                       'features': {'author:' + str(link.author_id): 1}})
    return int(classifyr_request('/api/simple-spam/score', body))

def spam_if_classified(link):
    account = Account._byID(link.author_id)
    if account._spam:
        # author is banned: spam it without asking the classifier
        link._spam = True
        link._commit()
        ban(link)
        return True
    score = classify_link(link)
    multiplier = 1
    karma = account.link_karma + account.comment_karma
    if karma < 20:
        # low-karma accounts get less benefit of the doubt
        multiplier *= 1.5
    score *= multiplier
    if score >= int(g.spam_threshold):  # spam
        link._spam = True
        link._commit()
    if score >= int(g.blackhole_threshold):  # kill it with fire!
        ban(link)

def run():
    def process_link(msg):
        def _process_link(fname):
            link = Link._by_fullname(fname, data=True)
            spam_if_classified(link)
        fname = msg.body
        try:
            TimeoutFunction(_process_link, 30)(fname)
        except TimeoutFunctionException:
            print "Timed out on %s" % fname
        except KeyboardInterrupt:
            raise
        except:
            print "Error fetching %s" % fname
            print traceback.format_exc()

    amqp.consume_items('spam_q', process_link)
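
To make the threshold logic concrete, a worked example with illustrative config values (the real g.spam_threshold and g.blackhole_threshold come from the ini):

# A new account with 5 karma whose link scores a raw 8:
spam_threshold = 10
blackhole_threshold = 20
score = 8 * 1.5  # low-karma multiplier kicks in -> 12.0
# 12.0 >= spam_threshold      -> link marked spam and committed
# 12.0 <  blackhole_threshold -> not ban()ed, so mods can still review it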
# This is a script for learning everything in your reddit as spam or ham, depending on how you've already marked it.
from r2.lib.learn import learn
from r2.lib.spam import classify_link
from r2.models.link import Link, Comment

def learn_spam_range(start, finish):
    for i in xrange(start, finish):
        l = Link._byID(i)
        try:
            learn(l)
        except Exception:
            pass
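
Usage, from a paster/reddit shell (the id range is illustrative; ids that don't resolve are skipped by the except Exception handler):

learn_spam_range(1, 10000)  # train on the first ten thousand link ids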
# This is a script to run the filter on every link in a sub, or to ban/train everything in a sub that contains only spam.
from r2.models.link import Link
from r2.models.subreddit import Subreddit
from r2.lib.spam import spam_if_classified
from r2.lib.learn import learn

def check_links(s):
    s = Subreddit._by_name(s)
    ls = s.get_links('new', 'all')
    ls.fetch()
    # data=True so author_id and friends are loaded, as in run() above
    ll = [Link._by_fullname(x[0], data=True) for x in ls.data]
    for link in ll:
        spam_if_classified(link)

def spam_all(s):
    s = Subreddit._by_name(s)
    ls = s.get_links('new', 'all')
    ls.fetch()
    ll = [Link._by_fullname(x[0], data=True) for x in ls.data]
    for link in ll:
        link._spam = True
        link._commit()
        learn(link)
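
Usage, again from a paster/reddit shell (the subreddit name is illustrative):

check_links('suspicious_sub')  # re-score every link in the sub
spam_all('suspicious_sub')     # or: ban and train on all of it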
Thanks

Some fixes:

r2/lib/spam.py line 42: link._commit()

r2/lib/spam.py and r2/lib/learn.py should import traceback