Skip to content

Instantly share code, notes, and snippets.

@CaptSolo
Created August 7, 2009 14:42
Show Gist options
  • Save CaptSolo/163932 to your computer and use it in GitHub Desktop.
Save CaptSolo/163932 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
"""
get-sioc-apps-list.py
Extracts a list of SIOC applications and their creation dates,
based on data at: http://wiki.sioc-project.org/index.php/ApplicationTimeline
Print a TAB-delimited list of the number of applications created every month.
Author: Uldis Bojars, http://captsolo.net
"""
import urllib2
def table_content(raw_text):
    """Extract table rows from MediaWiki table markup.

    Walks the lines of ``raw_text`` and yields one list of cell strings for
    every non-empty table row it finds.

    Args:
        raw_text: iterable of lines, e.g. an open file or an HTTP response.
            Lines may be text or (as with Python 3 HTTP responses) UTF-8
            encoded bytes.

    Yields:
        list: the whitespace-stripped cells of one row, in document order.

    Limitations:
        * Rows of every table in the document are yielded as one stream,
          regardless of whether they come from one table or many.
        * Only cells written on lines starting with ``|`` and separated by
          ``||`` are recognised; lines starting with ``!`` (header cells)
          are ignored.
    """
    in_table = False
    cells = []
    for raw_line in raw_text:
        # On Python 2 ``bytes`` is ``str``, so this branch never fires there
        # and lines pass through untouched; on Python 3 it decodes the bytes
        # that urlopen() responses produce.
        if isinstance(raw_line, bytes) and not isinstance(raw_line, str):
            raw_line = raw_line.decode("utf-8", "replace")
        line = raw_line.strip()

        if line.startswith("{|"):
            # Table start: drop anything collected so far.
            in_table = True
            cells = []
            continue
        if not in_table:
            continue

        # Row/table terminators must be tested before the generic "|" cell
        # prefix, which they share.
        if line.startswith(("|}", "|-")):
            if cells:
                yield cells
            cells = []
            if line.startswith("|}"):
                in_table = False
        elif line.startswith("|"):
            # A row may be spread over several "|" lines; keep accumulating
            # until the next row separator.
            cells.extend([cell.strip() for cell in line[1:].split('||')])

    # Input ended inside a table that was never closed with "|}": emit the
    # pending row instead of silently losing it.
    if in_table and cells:
        yield cells
def filter_rows(rows, min_cells=3):
    """Keep only the table rows that have at least ``min_cells`` cells.

    Args:
        rows: iterable of rows, each a sized sequence of cells.
        min_cells: minimum number of cells a row needs to be kept
            (default 3).

    Returns:
        A lazy generator over the rows that pass, in their original order.
    """
    return (row for row in rows if len(row) >= min_cells)
def extract_data(rows):
    """Reduce each row to an ``(origin, creation date)`` pair.

    Takes columns 2 and 3 of every row (zero-based indices 1 and 2):
        2 = origin of application ('D' from inside DERI; 'C' from the community)
        3 = creation date of application

    Args:
        rows: iterable of indexable rows with at least three cells each.

    Returns:
        A lazy generator of ``(row[1], row[2])`` tuples. A row with fewer
        than three cells raises IndexError when the generator reaches it.
    """
    return ((row[1], row[2]) for row in rows)
def pivot_on_month(rows):
    """Count applications per month, split by origin.

    Args:
        rows: iterable of ``(origin, month)`` pairs.

    Returns:
        dict mapping each month to a two-item list
        ``[count of 'C', count of 'D']``. A month whose entries have any
        other origin still gets an entry, with both counts left untouched.
    """
    months = {}
    for origin, month in rows:
        counts = months.setdefault(month, [0, 0])
        # The two origins are mutually exclusive, hence if/elif.
        if origin == 'C':
            counts[0] += 1
        elif origin == 'D':
            counts[1] += 1
    return months
def print_month_summary(url):
    """Print a TAB-delimited list in form [month, no of 'C' entries, no of 'D' entries].

    Fetches the raw wiki markup at ``url``, extracts the table rows, counts
    the entries per month and prints one line per month in ascending month
    order.

    Args:
        url: address of the raw MediaWiki page to download.
    """
    response = urllib2.urlopen(url)
    try:
        # The pipeline is lazy, so it must be fully consumed (pivot_on_month
        # does that) before the response is closed.
        months = pivot_on_month(extract_data(filter_rows(table_content(response))))
    finally:
        response.close()

    for month, counts in sorted(months.items()):
        # Single parenthesised argument: prints the same text whether
        # ``print`` is a statement or a function.
        print("%s\t%s\t%s" % (month, counts[0], counts[1]))
def print_table(url):
    """Print all table rows found in the wiki markup at ``url``.

    Each row is printed on its own line as the list of its cells.

    Args:
        url: address of the raw MediaWiki page to download.
    """
    response = urllib2.urlopen(url)
    try:
        for row in table_content(response):
            # Single parenthesised argument: prints the same text whether
            # ``print`` is a statement or a function.
            print(row)
    finally:
        response.close()
if __name__ == "__main__":
    # Raw (un-rendered) wiki markup of the SIOC application timeline page.
    url = "http://wiki.sioc-project.org/index.php/ApplicationTimeline?action=raw"
    print_month_summary(url)
    # print_table(url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment