Skip to content

Instantly share code, notes, and snippets.

@thomaskelder
Created June 16, 2016 08:29
Show Gist options
  • Select an option

  • Save thomaskelder/8016e50dbf4508dc6b6a4d6a8f1c335c to your computer and use it in GitHub Desktop.

Select an option

Save thomaskelder/8016e50dbf4508dc6b6a4d6a8f1c335c to your computer and use it in GitHub Desktop.
Small Python script to create an input file for Gource (http://gource.io/) to visualize the WikiPathways (http://www.wikipathways.org) edit history.
## Get pathway edit history from wikipathways.org and export as gource log file
## Run using:
## python wikipathways2gource.py wikipathways.log
## View in gource using:
## gource -1280x720 -a 3 --seconds-per-day 0.01 --hide dirnames,filenames --user-filter MaintBot --key wikipathways.log
## Create video with gource using:
## gource -1280x720 -a 2 --seconds-per-day 0.005 --hide dirnames,filenames --user-filter MaintBot --key --stop-at-end -o - wikipathways.log | ffmpeg -y -r 30 -f image2pipe -vcodec ppm -i - -vcodec libvpx -b 10000K wikipathways.webm
## Known issues/shortcomings:
## - Pathway ontology terms can have multiple parents, this is ignored, only first is taken here to prevent pathways from showing multiple times in the visualization
## - Only works on pathways that are currently present; pathways that were deleted do not show up in the history (limitation of the WikiPathways webservice)
## - All log entries are recorded as "Modified" action, as the WikiPathways webservice does not provide info on type of edit (addition, modification, deletion)
## - Can't distinguish bot edits (MaintBot is filtered out, but some scripted edits remain; old GenMAPP users are also still included, causing a burst at the beginning)
import requests
import requests_cache
import json
import pandas
import time
import datetime
import sys
import collections
import matplotlib.cm as mplcm
import matplotlib.colors as mplcolors
## Install a requests_cache cache named 'wikipathways-gource-cache' as soon as the module loads.
## NOTE(review): presumably this lets repeated runs reuse earlier web service responses - confirm against the requests_cache docs.
requests_cache.install_cache('wikipathways-gource-cache')
## Base URL that every WikiPathways web service request in this script is built from.
wpUrl = "http://webservice.wikipathways.org/"
def main():
    """Export the WikiPathways edit history as a Gource log file.

    The output path is read from ``sys.argv[1]``. For every pathway returned
    by the web service, each history entry dated on or after ``fromDate`` is
    written as one pipe-separated line::

        <unix timestamp>|<user>|M|<pathway path>|<organism color>

    Lines are written in ascending timestamp order. Entries that share the
    same timestamp are all kept (in query order); they are not collapsed
    into a single line.

    Raises:
        IndexError: if no output path was given on the command line.
    """
    fromDate = '20080101000000'
    # The cutoff never changes, so convert it once instead of once per log item.
    fromUnix = wpTimestampToUnix(fromDate)
    ## Open output file; the with-block closes it even when a query fails
    with open(sys.argv[1], 'w') as f:
        ## Get all pathways
        print('Querying pathway list')
        pathways = getJSON(wpUrl + 'listPathways?format=json')
        ## Generate colors per organism
        organismColors = _getOrganismColors()
        ## Read pathway ontology hierarchy
        print('Reading pathway ontology hierarchy')
        ontologyTerms = _readOntologyTerms()
        ## Query and format history entries for each pathway
        print('Query history for pathways from date ' + fromDate)
        # A list of (date, line) pairs rather than a dict keyed by date:
        # a dict would silently drop all but one of the edits made in the
        # same second.
        entries = []
        for p in pathways['pathways']:
            print('Processing pathway ' + p['id'])
            pathwayPath = getPathwayPath(p, ontologyTerms) + '.' + p['species']
            color = mplcolors.rgb2hex(organismColors[p['species']]).replace('#', '').upper()
            for item in getPathwayLogItems(p['id'], fromDate):
                if item['date'] < fromUnix:
                    continue
                line = str(int(item['date'])) + '|' + item['user'] + '|M|' + pathwayPath + '|' + color
                # ':' is replaced everywhere in the line, user name included.
                entries.append((item['date'], line.replace(':', '_')))
        # list.sort is stable, so same-second edits keep their query order.
        entries.sort(key=lambda entry: entry[0])
        f.writelines(line + '\n' for _, line in entries)


def _getOrganismColors():
    """Return a dict mapping each organism name to an RGBA color from the 'hsv' colormap."""
    organisms = getJSON(wpUrl + 'listOrganisms?format=json')['organisms']
    colors = mplcolors.Normalize(vmin=0, vmax=len(organisms)-1)
    colormap = mplcm.ScalarMappable(norm=colors, cmap='hsv')
    return {o: colormap.to_rgba(i) for i, o in enumerate(organisms)}


def _ontologyIdFromUrl(url):
    """Turn an OBO class URL into a short term id by dropping the URL prefix and replacing '_' with ':'."""
    return url.replace('http://purl.obolibrary.org/obo/', '').replace('_', ':')


def _readOntologyTerms():
    """Download the Pathway Ontology CSV and return a dict of term id -> PathwayOntologyTerm.

    Each term is linked to the first entry of its 'Parents' column only;
    terms whose first parent is missing or unknown get ``parent = None``.
    """
    # NOTE(review): the API key is hard-coded in the download URL.
    pwo = pandas.read_csv('http://data.bioontology.org/ontologies/PW/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb&download_format=csv', compression = 'gzip')
    ontologyTerms = {}
    parentIds = {}
    for _, row in pwo.iterrows():
        termId = _ontologyIdFromUrl(row['Class ID'])
        ontologyTerms[termId] = PathwayOntologyTerm(termId, row['Preferred Label'], None)
        # An empty 'Parents' cell is read as NaN, which str() renders as 'nan'.
        parents = str(row['Parents'])
        if parents == 'nan':
            parents = ''
        parentIds[termId] = _ontologyIdFromUrl(parents.split('|')[0])
    # Parents are linked in a second pass because a parent row may come
    # after its child in the CSV.
    for termId, parentId in parentIds.items():
        ontologyTerms[termId].parent = ontologyTerms.get(parentId)
    return ontologyTerms
def getJSON(url, timeout=60):
    """Fetch ``url`` with an HTTP GET and return the decoded JSON body.

    Args:
        url: Full URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        The object produced by decoding the response body as JSON.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server does not respond within ``timeout``.
    """
    r = requests.get(url, timeout=timeout)
    # Fail here with the HTTP status instead of later with a confusing
    # JSON-decoding or missing-key error.
    r.raise_for_status()
    return r.json()
def wpTimestampToUnix(ts):
    """Convert a ``YYYYMMDDHHMMSS`` timestamp string to Unix seconds (float).

    The string is parsed as a naive datetime and handed to ``time.mktime``,
    which interprets it in the machine's local time zone.
    NOTE(review): if the web service reports UTC timestamps, the result is
    shifted by the local UTC offset - confirm.

    Raises:
        ValueError: if ``ts`` does not match the expected format.
    """
    parsed = datetime.datetime.strptime(ts, "%Y%m%d%H%M%S")
    localFields = parsed.timetuple()
    return time.mktime(localFields)
def getPathwayLogItems(p, fromDate):
    """Return the edit history of pathway ``p`` as a list of dicts.

    Args:
        p: Pathway identifier, inserted as the ``pwId`` query parameter.
        fromDate: Timestamp string inserted as the ``timestamp`` query parameter.

    Returns:
        One ``{'user': ..., 'date': ...}`` dict per history entry, in the
        order the web service returned them; ``date`` is the entry's
        timestamp converted with ``wpTimestampToUnix``.
    """
    def toItem(entry):
        when = wpTimestampToUnix(entry['timestamp'])
        return {'user': entry['user'], 'date': when}

    query = 'getPathwayHistory?pwId=' + p + '&timestamp=' + fromDate + '&format=json'
    response = getJSON(wpUrl + query)
    return [toItem(entry) for entry in response['history']['history']]
def getPathwayPath(p, ontologyTerms):
    """Build the directory-style path under which a pathway is shown.

    The path has the form ``<species>/<ontology path>/<pathway name>``. The
    ontology path lists the pathway's Pathway Ontology term and its
    ancestors, root first, each shortened to 10 characters plus ``'..'``.
    When no known Pathway Ontology term is assigned, the literal segment
    ``'no ontology assigned'`` is used instead.

    Args:
        p: Pathway dict; the keys ``'name'``, ``'species'`` and ``'id'`` are read.
        ontologyTerms: Dict mapping term ids to ``PathwayOntologyTerm`` objects.

    Returns:
        The path as a ``/``-separated string.
    """
    # '/' is the path separator, so it must not appear inside a segment.
    name = p['name'].replace('/', '-')
    species = p['species']
    oterms = getJSON(wpUrl + 'getOntologyTermsByPathway?pwId=' + p['id'] + '&format=json')
    termNames = []
    for t in oterms['terms']:
        if t['ontology'] != 'Pathway Ontology':
            continue
        term = ontologyTerms.get(t['id'].strip())
        if term:
            termNames = [x.name for x in term.getParentsRecursive([])]
            # Only take the first ontology assignment that resolves to a
            # known term, so a pathway is not shown multiple times.
            break
    ontologyPath = 'no ontology assigned'
    if termNames:
        # Sanitize ontology labels the same way as the pathway name: a '/'
        # inside a label would otherwise add a spurious directory level.
        cleaned = [s.replace('/', '-') for s in termNames]
        shortened = [s[:10] + ('..' if s[10:] else '') for s in cleaned]
        # getParentsRecursive lists the term first and the root last;
        # the path needs the root first.
        ontologyPath = '/'.join(reversed(shortened))
    return species + '/' + ontologyPath + '/' + name
class PathwayOntologyTerm:
    """A single ontology term with a link to at most one parent term."""

    def __init__(self, id, name, parent):
        """Store the term's id, its display name and its parent (or a falsy value for none)."""
        self.id, self.name, self.parent = id, name, parent

    def getParentsRecursive(self, result):
        """Append this term, then each ancestor in turn, to ``result``.

        The walk follows ``parent`` links and stops at the first term whose
        ``parent`` is falsy. ``result`` is modified in place and returned.
        """
        node = self
        while True:
            result.append(node)
            if not node.parent:
                return result
            node = node.parent
## Run the export only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment