reddit spam filter
Created November 18, 2012 15:11
description "train the spam filter" | |
instance $x | |
stop on reddit-stop or runlevel [016] | |
respawn | |
respawn limit 10 5 | |
nice 10 | |
script | |
. /etc/default/reddit | |
wrap-job paster run --proctitle learn_q$x $REDDIT_INI $REDDIT_ROOT/r2/lib/learn.py -c 'run()' | |
end script |
description "check spam for newly submitted links" | |
instance $x | |
stop on reddit-stop or runlevel [016] | |
respawn | |
respawn limit 10 5 | |
nice 10 | |
script | |
. /etc/default/reddit | |
wrap-job paster run --proctitle spam_q$x $REDDIT_INI $REDDIT_ROOT/r2/lib/spam.py -c 'run()' | |
end script |
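
Both jobs follow the same pattern: paster run loads the app from $REDDIT_INI, executes the named module, then evaluates the -c string. Conceptually each consumer instance boils down to this sketch (not how paster itself works internally):

# Rough equivalent of one spam_q consumer instance once paster has
# loaded the application: import the module and call run(), which
# blocks forever pulling messages off its queue.
from r2.lib import spam
spam.run()  # the learn_q job does the same with r2.lib.learn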
# This is my current declare_queues(). You must also add spam_q to the consumer-counts file.
def declare_queues():
    queues = Queues({
        "scraper_q": MessageQueue(),
        "newcomments_q": MessageQueue(),
        "commentstree_q": MessageQueue(),
        "commentstree_fastlane_q": MessageQueue(),
        "vote_link_q": MessageQueue(bind_to_self=True),
        "vote_comment_q": MessageQueue(bind_to_self=True),
        "vote_fastlane_q": MessageQueue(bind_to_self=True),
        "log_q": MessageQueue(bind_to_self=True),
        "usage_q": MessageQueue(bind_to_self=True, durable=False),
        "cloudsearch_changes": MessageQueue(bind_to_self=True),
        "update_promos_q": MessageQueue(bind_to_self=True),
        "spam_q": MessageQueue(),
        "learn_q": MessageQueue(),
    })

    queues.cloudsearch_changes << "search_changes"
    queues.scraper_q << "new_link"
    queues.newcomments_q << "new_comment"
    queues.commentstree_q << "new_comment"
    queues.commentstree_fastlane_q << "new_fastlane_comment"
    queues.spam_q << "new_link"
    queues.spam_q << "new_comment"
    queues.learn_q << "ban"
    queues.learn_q << "unban"

    return queues
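
Because of the << bindings, one published event can feed several queues: a single "new_link" message reaches both scraper_q and spam_q, and a "new_comment" reaches newcomments_q, commentstree_q, and spam_q. A sketch of the producing side, using the same amqp helper this gist uses elsewhere (the link variable is assumed to be a freshly submitted Link):

# One add_item() call is enough -- the broker delivers a copy to
# every queue bound to the "new_link" routing key above.
from r2.lib import amqp
amqp.add_item('new_link', link._fullname)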
# Changed: now uses amqp. Add this to the imports:
from r2.lib import amqp

# and this to spam(), after t._commit():
amqp.add_item('ban', t._fullname)

# and this to unspam():
amqp.add_item('unban', t._fullname)
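
For context, a hypothetical skeleton of where those calls land (the real spam()/unspam() in reddit's admintools take more arguments and do more bookkeeping; this only shows the placement):

from r2.lib import amqp

def spam(things):
    for t in things:                       # each t is a banned thing
        t._spam = True
        t._commit()
        amqp.add_item('ban', t._fullname)  # feeds the learn_q trainer

def unspam(things):
    for t in things:
        t._spam = False
        t._commit()
        amqp.add_item('unban', t._fullname)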
# r2/lib/learn.py
from pylons import g, config
from r2.models.link import Link, Comment
from r2.lib import amqp
from r2.lib.spam import classifyr_request, classify_link
from r2.lib.utils import TimeoutFunction, TimeoutFunctionException
import json
import traceback

def learn(link):
    msg = ""
    url = ""
    if hasattr(link, 'selftext'):
        # self post
        msg = link.selftext
    elif hasattr(link, 'body'):
        # comment
        msg = link.body
    if hasattr(link, 'url') and link.url[0:3] != "/r/":
        # not a self post
        url = link.url
    body = json.dumps({'message': msg,
                       'url': url,
                       'features': {'author:' + str(link.author_id): 1},
                       'category': 'spam' if link._spam else 'ham'})
    classifyr_request('/api/simple-spam/learn', body)

def run():
    def process_link(msg):
        def _process_link(fname):
            link = Link._by_fullname(fname, data=True)
            learn(link)
        fname = msg.body
        try:
            TimeoutFunction(_process_link, 30)(fname)
        except TimeoutFunctionException:
            print "Timed out on %s" % fname
        except KeyboardInterrupt:
            raise
        except:
            print "Error fetching %s" % fname
            print traceback.format_exc()

    amqp.consume_items('learn_q', process_link)
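
For reference, the request body learn() builds for a banned self post looks roughly like this (message text and author id are illustrative):

# What gets POSTed to /api/simple-spam/learn for a spammed self post
# by account id 1234; external links put their URL in 'url' instead.
{"message": "BUY CHEAP WATCHES",
 "url": "",
 "features": {"author:1234": 1},
 "category": "spam"}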
# r2/lib/spam.py
# Initial classifyr.com support
from pylons import g, config
from r2.models.link import Link, Comment
from r2.models.account import Account  # used by spam_if_classified()
from r2.lib import amqp
from hashlib import sha1
from r2.lib.db.queries import ban
import httplib, urllib, json
import traceback
from r2.lib.utils import TimeoutFunction, TimeoutFunctionException

def classifyr_request(endpoint, body):
    checksum = sha1(body + g.classifyr_key).hexdigest()
    headers = {"classifyr-api-checksum": checksum,
               "classifyr-api-user": g.classifyr_username}
    conn = httplib.HTTPConnection("classifyr.com")
    conn.request('POST', endpoint, body, headers)
    resp = conn.getresponse()
    return resp.read()

def classify_link(link):
    msg = ""
    url = ""
    if hasattr(link, 'selftext'):
        # self post
        msg = link.selftext
    elif hasattr(link, 'body'):
        # comment
        msg = link.body
    if hasattr(link, 'url') and link.url[0:3] != "/r/":
        # not a self post
        url = link.url
    body = json.dumps({'message': msg,
                       'url': url,
                       'features': {'author:' + str(link.author_id): 1}})
    return int(classifyr_request('/api/simple-spam/score', body))

def spam_if_classified(link):
    account = Account._byID(link.author_id)
    if account._spam:
        # author is banned: spam it without asking the classifier
        link._spam = True
        link._commit()
        ban(link)
        return True
    score = classify_link(link)
    multiplier = 1
    karma = account.link_karma + account.comment_karma
    if karma < 20:
        # low-karma accounts get less benefit of the doubt
        multiplier *= 1.5
    score *= multiplier
    if score >= int(g.spam_threshold):  # spam
        link._spam = True
        link._commit()
    if score >= int(g.blackhole_threshold):  # kill it with fire!
        ban(link)

def run():
    def process_link(msg):
        def _process_link(fname):
            link = Link._by_fullname(fname, data=True)
            spam_if_classified(link)
        fname = msg.body
        try:
            TimeoutFunction(_process_link, 30)(fname)
        except TimeoutFunctionException:
            print "Timed out on %s" % fname
        except KeyboardInterrupt:
            raise
        except:
            print "Error fetching %s" % fname
            print traceback.format_exc()

    amqp.consume_items('spam_q', process_link)
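
To make the threshold logic concrete, a worked example with illustrative config values (the real g.spam_threshold and g.blackhole_threshold come from the ini):

# A new account with 5 karma whose link scores a raw 8:
spam_threshold = 10
blackhole_threshold = 20
score = 8 * 1.5  # low-karma multiplier kicks in -> 12.0
# 12.0 >= spam_threshold      -> link marked spam and committed
# 12.0 <  blackhole_threshold -> not ban()ed, so mods can still review it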
# This is a script for learning everything in your reddit as spam or ham, depending on how you've already marked it.
from r2.lib.learn import learn
from r2.lib.spam import classify_link
from r2.models.link import Link, Comment

def learn_spam_range(start, finish):
    for i in xrange(start, finish):
        l = Link._byID(i)
        try:
            learn(l)
        except Exception:
            pass
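
Usage, from a paster/reddit shell (the id range is illustrative; ids that don't resolve are skipped by the except Exception handler):

learn_spam_range(1, 10000)  # train on the first ten thousand link ids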
# This is a script to run the filter on every link in a sub, or to ban/train everything in a sub that contains only spam.
from r2.models.link import Link
from r2.models.subreddit import Subreddit
from r2.lib.spam import spam_if_classified
from r2.lib.learn import learn

def check_links(s):
    s = Subreddit._by_name(s)
    ls = s.get_links('new', 'all')
    ls.fetch()
    # data=True so author_id and friends are loaded, as in run() above
    ll = [Link._by_fullname(x[0], data=True) for x in ls.data]
    for link in ll:
        spam_if_classified(link)

def spam_all(s):
    s = Subreddit._by_name(s)
    ls = s.get_links('new', 'all')
    ls.fetch()
    ll = [Link._by_fullname(x[0], data=True) for x in ls.data]
    for link in ll:
        link._spam = True
        link._commit()
        learn(link)
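
Usage, again from a paster/reddit shell (the subreddit name is illustrative):

check_links('suspicious_sub')  # re-score every link in the sub
spam_all('suspicious_sub')     # or: ban and train on all of it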
Thanks

Some fixes:

r2/lib/spam.py line 42: link._commit()

r2/lib/spam.py and r2/lib/learn.py should import traceback