Created
November 29, 2016 05:17
-
-
Save milimetric/a8447734a6f650c73b7a16d44ec2574a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# NOTE: required for the following to work: | |
# !pip install pymysql\n", | |
# !git clone https://gerrit.wikimedia.org/r/p/operations/mediawiki-config\n", | |
# !cd mediawiki-config && git pull origin master" | |
import pymysql | |
import ipaddress | |
import os | |
connection = pymysql.connect( | |
host='analytics-store.eqiad.wmnet', | |
read_default_file='/etc/mysql/conf.d/research-client.cnf', | |
charset='utf8mb4', | |
db='commonswiki', | |
cursorclass=pymysql.cursors.DictCursor | |
) | |
LABS_NETWORKS = [ | |
ipaddress.IPv4Network(net) | |
for net in ['10.68.0.0/24', '10.68.16.0/21', '10.68.32.0/24', '10.68.48.0/24'] | |
] | |
""" | |
-- NOTE: couldn't get this python to run because I can't install pymysql without pip on stat1003 | |
-- So here's the alternative SQL: | |
set @start = '20161001000000'; | |
set @end = '20161001000100'; | |
select count(*) as total_edits, | |
sum(if( | |
inet_aton(cuc_ip) between inet_aton('10.68.0.0') and inet_aton('10.68.0.255') | |
or inet_aton(cuc_ip) between inet_aton('10.68.16.0') and inet_aton('10.68.16.255') | |
or inet_aton(cuc_ip) between inet_aton('10.68.17.0') and inet_aton('10.68.17.255') | |
or inet_aton(cuc_ip) between inet_aton('10.68.18.0') and inet_aton('10.68.18.255') | |
or inet_aton(cuc_ip) between inet_aton('10.68.32.0') and inet_aton('10.68.32.255') | |
or inet_aton(cuc_ip) between inet_aton('10.68.48.0') and inet_aton('10.68.48.255') | |
, 1, 0)) as labs_edits | |
from enwiki.cu_changes | |
where cuc_timestamp > @start and cuc_timestamp < @end | |
; | |
""" | |
def get_edit_counts(dbname, startts, endts): | |
""" | |
Returns number of labs edits (and total edits) for given date range in given db | |
""" | |
connection.ping(True) | |
labs_edits = 0 | |
total_edits = 0 | |
with connection.cursor() as cur: | |
cur.execute("USE %s" % dbname) | |
cur.execute(""" | |
SELECT cuc_ip FROM cu_changes | |
WHERE cuc_timestamp > %s AND cuc_timestamp < %s | |
""", (startts, endts)) | |
for row in cur: | |
total_edits += 1 | |
#if total_edits % 1000000 == 0: | |
#print('processed %s edits so far' % total_edits) | |
try: | |
ip = ipaddress.IPv4Address(row['cuc_ip'].decode('utf-8')) | |
except ipaddress.AddressValueError: | |
# IPV4 address | |
continue | |
for network in LABS_NETWORKS: | |
if ip in network: | |
labs_edits += 1 | |
continue | |
def get_public_open_wikis(): | |
""" | |
Return list of dbnames of public, non-closed wikis. | |
Requires a mediawiki-config git clone in current directory | |
""" | |
def get_dblist(name): | |
""" | |
Returns a set containing dbs in a given dblist | |
""" | |
path = os.path.join('mediawiki-config', 'dblists', name + '.dblist') | |
with open(path, encoding='utf-8') as f: | |
return set([l.strip() for l in f]) | |
return get_dblist('all') - get_dblist('closed') - get_dblist('private') | |
def calc_labs_edits(starttime, endttime): | |
""" | |
Calculate the number of all / labs edits for all open wikis in a given time period | |
Return a dict of 'dbname' => { 'all': <all-edits-count>, 'labs': <labs-edit-count>} | |
""" | |
stats = {} | |
for dbname in get_public_open_wikis(): | |
try: | |
stats[dbname] = get_edit_counts(dbname, starttime, endttime) | |
except pymysql.InternalError as e: | |
print('skipping %s' % (dbname)) | |
return stats | |
calc_labs_edits('20161001000000', '20161001000100') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment