Skip to content

Instantly share code, notes, and snippets.

@slaporte
Created July 13, 2015 02:30
Show Gist options
  • Select an option

  • Save slaporte/f9a3d8e7f1570e8065c1 to your computer and use it in GitHub Desktop.

Select an option

Save slaporte/f9a3d8e7f1570e8065c1 to your computer and use it in GitHub Desktop.
import os
import oursql
from .utils import find_hashtags
'''
Schema TODO:
- unique index on lang and rcid
- add update_date
'''
DEFAULT_HOURS = 1
HT_DB_HOST = 's1.labsdb' # enwiki db replica
HT_DB_NAME = 's52490__hashtags_p'
DB_CONFIG_PATH = os.path.expanduser('~/replica.my.cnf') # Available by default on Labs
class RecentChangeUpdater(object):
def __init__(self, lang):
self.lang = lang
self.ht_id_map = {}
self.htrc_id_map = []
def connect(self):
wiki_db_name = lang + 'wiki_p'
wiki_db_host = lang + 'wiki.labsdb'
self.wiki_connect = oursql.connect(db=wiki_db_name,
host=wiki_db_host,
read_default_file=DB_CONFIG_PATH,
charset=None)
self.ht_connect = oursql.connect(db=HT_DB_NAME,
host=HT_DB_HOST,
read_default_file=DB_CONFIG_PATH,
charset=None)
def _wiki_execute(self, query, params, as_dict=False):
if as_dict:
wiki_cursor = self.wiki_connect.cursor(oursql.DictCursor)
else:
wiki_cursor = self.wiki_connect.cursor()
return wiki_cursor.execute(query, params)
def _ht_execute(self, query, params, as_dict=False):
if as_dict:
ht_cursor = self.ht_connect.cursor(oursql.DictCursor)
else:
ht_cursor = self.ht_connect.cursor()
return ht_cursor.execute(query, params)
def update_recentchanges(self, hours=DEFAULT_HOURS):
rc_query = '''
SELECT *
FROM recentchanges
WHERE rc_type = 0
AND rc_timestamp > DATE_SUB(NOW(), INTERVAL ? HOUR)
AND rc_comment REGEXP ?
ORDER BY rc_id DESC'''
rc_params = (hours, '(^| )#[[:alpha:]]{2}[[:alnum:]]*[[:>:]]', limit)
cursor = self._wiki_execute(rc_query, rc_params)
changes = cursor.fetchall()
for change in changes:
hashtags = find_hashtags(rc['rc_comment']) # find rc_comment in the tuple
htrc_id = self.add_recentchange(change)
self.htrc_id_map[htrc_id] = hashtags
for hashtag in hashtags:
self.add_hashtag(hashtag)
for htrc_id, hashtags in self.htrc_id_map.items():
for hashtag in hashtags:
ht_id = self.ht_id_map[hashtag]
query = '''
INSERT INTO hashtag_recentchanges
VALUES (?, ?)'''
params = (ht_id, htrc_id)
cursor = self._ht_execute(query, params)
return
def add_recentchange(self, rc):
# 25 values
query = '''
INSERT INTO recentchanges
VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
'''
params = (self.lang,) + rc
cursor = run_query(query, query_params, HT_DB_NAME, HT_DB_HOST)
return cursor.lastrowid
def add_hashtag(self, hashtag):
if hashtag in self.ht_id_map:
return self.ht_id_map[hashtag]
hashtag_query = '''
SELECT ht_id
FROM hashtags
WHERE ht_text = ?
LIMIT 1'''
hashtags_params = (hashtag,)
cursor = self._ht_execute(hashtag_query, hashtags_params)
ht_res = cursor.fetchall()
if ht_res:
self.ht_id_map[hashtag] = ht_res[0][0]
return self.ht_id_map[hashtag]
hashtag_insert_query = '''
INSERT INTO hashtags
VALUES (?, NOW())'''
hashtag_insert_params = (hashtag,)
cursor = self._ht_execute(hashtag_insert_query, hashtag_insert_params)
self.ht_id_map[hashtag] = cursor.lastrowid
return self.ht_id_map[hashtag]
def main():
cursor = get_tagged_changes()
changes = cursor.fetchall()
for change in changes:
add_recentchange(hashtags, change)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment