Created
August 7, 2009 14:42
-
-
Save CaptSolo/163932 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
get-sioc-apps-list.py

Extracts a list of SIOC applications and their creation dates,
based on data at: http://wiki.sioc-project.org/index.php/ApplicationTimeline

Prints a TAB-delimited list of the number of applications created every month.

Author: Uldis Bojars, http://captsolo.net
""" | |
import urllib2 | |
def table_content(raw_text):
    """Extract table rows from MediaWiki table markup.

    raw_text is any iterable of text lines (an open file-like object or a
    list of strings).  This is a generator: each yielded row is a list of
    the row's cell texts with surrounding whitespace stripped.

    NOTE: rows from every table in the document are yielded one after
    another, regardless of whether they are in one table or many.

    Limitations: only cells written on a single line and delimited by
    ``||`` are understood (``| a || b || c``); several ``|`` lines in the
    same row are concatenated into one row.  Header lines (``!``) and
    caption lines (``|+``) are skipped, and cell attributes are not parsed.
    """
    in_table = False
    cells = []
    for raw_line in raw_text:
        line = raw_line.strip()
        if line.startswith("{|"):
            # Start of a table: discard anything left over from before it.
            in_table = True
            cells = []
            continue
        if not in_table:
            continue
        if line.startswith("|+"):
            # "|+" is the table caption, not a data cell; without this
            # check it would be merged into the first row.
            continue
        if line.startswith(("|}", "|-")):
            # Row separator or end of table: emit the row collected so far.
            if cells:
                yield cells
                # Rebind (do not clear) so the yielded list stays intact.
                cells = []
            if line.startswith("|}"):
                in_table = False
        elif line.startswith("|"):
            # Drop the leading "|" and split the remaining cells on "||".
            cells.extend(cell.strip() for cell in line[1:].split("||"))
    if in_table and cells:
        # Input ended without a closing "|}"; do not lose the pending row.
        yield cells
def filter_rows(rows):
    """Keep only the table rows that have 3 or more cells.

    Returns a lazy generator over *rows*; rows with fewer than 3 cells
    are dropped.
    """
    min_cells = 3
    return (row for row in rows if not len(row) < min_cells)
def extract_data(rows):
    """
    Reduce every row to a (origin, creation date) tuple, i.e. columns 2, 3:
    2 = origin of application ('D' from inside DERI; 'C' from the community)
    3 = creation date of application

    Returns a lazy generator of 2-tuples.
    """
    # Zero-based positions of columns 2 and 3.
    origin_col, date_col = 1, 2
    return ((cells[origin_col], cells[date_col]) for cells in rows)
def pivot_on_month(rows):
    """Count applications per month, split by origin.

    rows is an iterable of (origin, month) pairs.  The result is a
    dictionary mapping each month to a 2-item list
    [count of 'C', count of 'D'].  A month whose rows have some other
    origin still gets an entry, with both counts left untouched.
    """
    tally_by_month = {}
    for origin, month in rows:
        if month not in tally_by_month:
            tally_by_month[month] = [0, 0]
        tally = tally_by_month[month]
        if origin == 'C':
            tally[0] += 1
        elif origin == 'D':
            tally[1] += 1
    return tally_by_month
def print_month_summary(url):
    """Print a TAB-delimited list in form [month, no of 'C' entries, no of 'D' entries].

    Fetches *url*, parses the tables in it and prints one line per month,
    sorted by month string.
    """
    response = urllib2.urlopen(url)
    try:
        # pivot_on_month() consumes the whole lazy pipeline, so the
        # response is fully read before it is closed below.
        months = pivot_on_month(extract_data(filter_rows(table_content(response))))
    finally:
        # Release the connection even if parsing fails.
        response.close()
    for month, counts in sorted(months.items()):
        # Parenthesized single argument: same output with the print statement.
        print("%s\t%s\t%s" % (month, counts[0], counts[1]))
def print_table(url):
    """Print all table rows found at *url*, one row (a list of cells) per line."""
    response = urllib2.urlopen(url)
    try:
        for row in table_content(response):
            # Parenthesized single argument: same output with the print statement.
            print(row)
    finally:
        # Release the connection even if parsing or printing fails.
        response.close()
if __name__ == "__main__":
    # action=raw asks the wiki for the page's wikitext instead of HTML.
    timeline_url = "http://wiki.sioc-project.org/index.php/ApplicationTimeline?action=raw"
    print_month_summary(timeline_url)
    # To dump every parsed table row instead, use:
    # print_table(timeline_url)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment