Created
July 13, 2015 02:30
-
-
Save slaporte/f9a3d8e7f1570e8065c1 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os | |
| import oursql | |
| from .utils import find_hashtags | |
| ''' | |
| Schema TODO: | |
| - unique index on lang and rcid | |
| - add update_date | |
| ''' | |
| DEFAULT_HOURS = 1 | |
| HT_DB_HOST = 's1.labsdb' # enwiki db replica | |
| HT_DB_NAME = 's52490__hashtags_p' | |
| DB_CONFIG_PATH = os.path.expanduser('~/replica.my.cnf') # Available by default on Labs | |
| class RecentChangeUpdater(object): | |
| def __init__(self, lang): | |
| self.lang = lang | |
| self.ht_id_map = {} | |
| self.htrc_id_map = [] | |
| def connect(self): | |
| wiki_db_name = lang + 'wiki_p' | |
| wiki_db_host = lang + 'wiki.labsdb' | |
| self.wiki_connect = oursql.connect(db=wiki_db_name, | |
| host=wiki_db_host, | |
| read_default_file=DB_CONFIG_PATH, | |
| charset=None) | |
| self.ht_connect = oursql.connect(db=HT_DB_NAME, | |
| host=HT_DB_HOST, | |
| read_default_file=DB_CONFIG_PATH, | |
| charset=None) | |
| def _wiki_execute(self, query, params, as_dict=False): | |
| if as_dict: | |
| wiki_cursor = self.wiki_connect.cursor(oursql.DictCursor) | |
| else: | |
| wiki_cursor = self.wiki_connect.cursor() | |
| return wiki_cursor.execute(query, params) | |
| def _ht_execute(self, query, params, as_dict=False): | |
| if as_dict: | |
| ht_cursor = self.ht_connect.cursor(oursql.DictCursor) | |
| else: | |
| ht_cursor = self.ht_connect.cursor() | |
| return ht_cursor.execute(query, params) | |
| def update_recentchanges(self, hours=DEFAULT_HOURS): | |
| rc_query = ''' | |
| SELECT * | |
| FROM recentchanges | |
| WHERE rc_type = 0 | |
| AND rc_timestamp > DATE_SUB(NOW(), INTERVAL ? HOUR) | |
| AND rc_comment REGEXP ? | |
| ORDER BY rc_id DESC''' | |
| rc_params = (hours, '(^| )#[[:alpha:]]{2}[[:alnum:]]*[[:>:]]', limit) | |
| cursor = self._wiki_execute(rc_query, rc_params) | |
| changes = cursor.fetchall() | |
| for change in changes: | |
| hashtags = find_hashtags(rc['rc_comment']) # find rc_comment in the tuple | |
| htrc_id = self.add_recentchange(change) | |
| self.htrc_id_map[htrc_id] = hashtags | |
| for hashtag in hashtags: | |
| self.add_hashtag(hashtag) | |
| for htrc_id, hashtags in self.htrc_id_map.items(): | |
| for hashtag in hashtags: | |
| ht_id = self.ht_id_map[hashtag] | |
| query = ''' | |
| INSERT INTO hashtag_recentchanges | |
| VALUES (?, ?)''' | |
| params = (ht_id, htrc_id) | |
| cursor = self._ht_execute(query, params) | |
| return | |
| def add_recentchange(self, rc): | |
| # 25 values | |
| query = ''' | |
| INSERT INTO recentchanges | |
| VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?) | |
| ''' | |
| params = (self.lang,) + rc | |
| cursor = run_query(query, query_params, HT_DB_NAME, HT_DB_HOST) | |
| return cursor.lastrowid | |
| def add_hashtag(self, hashtag): | |
| if hashtag in self.ht_id_map: | |
| return self.ht_id_map[hashtag] | |
| hashtag_query = ''' | |
| SELECT ht_id | |
| FROM hashtags | |
| WHERE ht_text = ? | |
| LIMIT 1''' | |
| hashtags_params = (hashtag,) | |
| cursor = self._ht_execute(hashtag_query, hashtags_params) | |
| ht_res = cursor.fetchall() | |
| if ht_res: | |
| self.ht_id_map[hashtag] = ht_res[0][0] | |
| return self.ht_id_map[hashtag] | |
| hashtag_insert_query = ''' | |
| INSERT INTO hashtags | |
| VALUES (?, NOW())''' | |
| hashtag_insert_params = (hashtag,) | |
| cursor = self._ht_execute(hashtag_insert_query, hashtag_insert_params) | |
| self.ht_id_map[hashtag] = cursor.lastrowid | |
| return self.ht_id_map[hashtag] | |
| def main(): | |
| cursor = get_tagged_changes() | |
| changes = cursor.fetchall() | |
| for change in changes: | |
| add_recentchange(hashtags, change) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment