Created
June 16, 2016 08:29
-
-
Save thomaskelder/8016e50dbf4508dc6b6a4d6a8f1c335c to your computer and use it in GitHub Desktop.
Small Python script to create an input file for Gource (http://gource.io/) to visualize the WikiPathways (http://www.wikipathways.org) edit history.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ## Get pathway edit history from wikipathways.org and export as gource log file | |
| ## Run using: | |
| ## python wikipathways2gource.py wikipathways.log | |
| ## View in Gource using: | |
| ## gource -1280x720 -a 3 --seconds-per-day 0.01 --hide dirnames,filenames --user-filter MaintBot --key wikipathways.log | |
| ## Create video with gource using: | |
| ## gource -1280x720 -a 2 --seconds-per-day 0.005 --hide dirnames,filenames --user-filter MaintBot --key --stop-at-end -o - wikipathways.log | ffmpeg -y -r 30 -f image2pipe -vcodec ppm -i - -vcodec libvpx -b 10000K wikipathways.webm | |
| ## Known issues/shortcomings: | |
| ## - Pathway ontology terms can have multiple parents, this is ignored, only first is taken here to prevent pathways from showing multiple times in the visualization | |
| ## - Only works on pathways that are currently present; pathways that were deleted do not show up in the history (limitation of the WikiPathways webservice) | |
| ## - All log entries are recorded as "Modified" action, as the WikiPathways webservice does not provide info on type of edit (addition, modification, deletion) | |
| ## - Can't distinguish bot edits (MaintBot is filtered out, but some scripted edits remain; old GenMAPP users are also still included, causing a burst at the beginning) | |
| import requests | |
| import requests_cache | |
| import json | |
| import pandas | |
| import time | |
| import datetime | |
| import sys | |
| import collections | |
| import matplotlib.cm as mplcm | |
| import matplotlib.colors as mplcolors | |
## Base URL of the WikiPathways webservice; endpoint names and query strings
## are appended to it by the functions below.
wpUrl = "http://webservice.wikipathways.org/"

## Enable requests_cache under the cache name 'wikipathways-gource-cache'.
requests_cache.install_cache('wikipathways-gource-cache')
def _termIdFromUri(uri):
    """Turn an OBO PURL such as '.../obo/PW_0000001' into the id 'PW:0000001'."""
    return uri.replace('http://purl.obolibrary.org/obo/', '').replace('_', ':')


def _buildOrganismColors(organisms):
    """Map each organism name to an RGBA color spread evenly over the 'hsv' colormap."""
    norm = mplcolors.Normalize(vmin=0, vmax=len(organisms) - 1)
    colormap = mplcm.ScalarMappable(norm=norm, cmap='hsv')
    return {organism: colormap.to_rgba(index) for index, organism in enumerate(organisms)}


def _loadOntologyTerms():
    """Download the Pathway Ontology CSV and return {term id: PathwayOntologyTerm}.

    Each term is linked to its FIRST listed parent only (see "Known issues"
    in the file header); terms without a known parent keep parent=None.
    """
    ## NOTE(review): the BioPortal API key is hard-coded in the URL below.
    pwo = pandas.read_csv('http://data.bioontology.org/ontologies/PW/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb&download_format=csv', compression = 'gzip')
    ontologyTerms = {}
    ## First pass: create every term, so that parents can be looked up below
    for _, row in pwo.iterrows():
        termId = _termIdFromUri(row['Class ID'])
        ontologyTerms[termId] = PathwayOntologyTerm(termId, row['Preferred Label'], None)
    ## Second pass: link each term to its first parent
    for _, row in pwo.iterrows():
        parents = str(row['Parents'])
        if parents == 'nan':
            ## pandas reads an empty 'Parents' cell as NaN
            parents = ''
        firstParentId = _termIdFromUri(parents.split('|')[0])
        ontologyTerms[_termIdFromUri(row['Class ID'])].parent = ontologyTerms.get(firstParentId)
    return ontologyTerms


def main():
    """Export the WikiPathways edit history as a Gource log file.

    The output path is taken from sys.argv[1]. One line is written per edit,
    in the form 'timestamp|user|M|path|color', sorted by timestamp. The path
    is built by getPathwayPath() with the species appended as an extension,
    and the color is the per-organism hex color without the leading '#'.

    The file is only opened once all data has been collected, so a failed
    run does not leave a truncated log behind.
    """
    if len(sys.argv) < 2:
        sys.exit('Usage: python wikipathways2gource.py <output log file>')
    outputPath = sys.argv[1]

    fromDate = '20080101000000'
    ## Loop-invariant: convert the cut-off date once instead of per log item
    fromUnix = wpTimestampToUnix(fromDate)

    ## Get all pathways
    print('Querying pathway list')
    pathways = getJSON(wpUrl + 'listPathways?format=json')

    ## Generate colors per organism
    organisms = getJSON(wpUrl + 'listOrganisms?format=json')['organisms']
    organismColors = _buildOrganismColors(organisms)

    ## Read pathway ontology hierarchy
    print('Reading pathway ontology hierarchy')
    ontologyTerms = _loadOntologyTerms()

    ## Query and format history entries for each pathway
    print('Query history for pathways from date ' + fromDate)
    ## Collected as (timestamp, line) pairs. A dict keyed by timestamp would
    ## silently drop all but one of the edits made in the same second.
    entries = []
    for p in pathways['pathways']:
        print('Processing pathway ' + p['id'])
        pathwayPath = getPathwayPath(p, ontologyTerms) + '.' + p['species']
        color = mplcolors.rgb2hex(organismColors[p['species']]).replace('#', '').upper()
        for item in getPathwayLogItems(p['id'], fromDate):
            if item['date'] < fromUnix:
                continue
            line = str(int(item['date'])) + '|' + item['user'] + '|M|' + pathwayPath + '|' + color
            ## Colons (e.g. from ontology ids or names) are replaced in the whole line
            entries.append((item['date'], line.replace(':', '_')))

    ## Stable sort by timestamp only: ties keep pathway processing order
    entries.sort(key=lambda entry: entry[0])

    ## Write output; the context manager closes the file even on error
    with open(outputPath, 'w') as f:
        for _, line in entries:
            f.write(line + '\n')
def getJSON(url):
    """Fetch `url` with an HTTP GET and return the decoded JSON body.

    Raises requests.exceptions.HTTPError for 4xx/5xx responses, so a server
    error is reported as such instead of surfacing later as a JSON decoding
    error or a missing key in the caller.
    """
    r = requests.get(url)
    r.raise_for_status()
    return r.json()
def wpTimestampToUnix(ts):
    """Convert a 'YYYYMMDDHHMMSS' timestamp string to Unix epoch seconds (float).

    The timestamp is interpreted as UTC, so the result does not depend on the
    timezone or DST rules of the machine running the script (time.mktime()
    would interpret it as local time).
    NOTE(review): assumes WikiPathways timestamps are UTC -- confirm.

    Raises ValueError if `ts` does not match the expected format.
    """
    parsed = datetime.datetime.strptime(ts, "%Y%m%d%H%M%S")
    return (parsed - datetime.datetime(1970, 1, 1)).total_seconds()
def getPathwayLogItems(p, fromDate):
    """Return the edit history of pathway id `p` as a list of dicts.

    `fromDate` is a 'YYYYMMDDHHMMSS' string passed to the webservice as the
    'timestamp' query parameter. Each returned dict has the keys 'user'
    (taken from the history entry) and 'date' (the entry's timestamp
    converted with wpTimestampToUnix).
    """
    ## The separator must be '&timestamp='; the '&times' prefix had been
    ## decoded as an HTML entity, so the parameter was never sent.
    url = wpUrl + 'getPathwayHistory?pwId=' + p + '&timestamp=' + fromDate + '&format=json'
    h = getJSON(url)
    return [
        {'user': hi['user'], 'date': wpTimestampToUnix(hi['timestamp'])}
        for hi in h['history']['history']
    ]
def _abbreviateLabel(label, maxLength):
    """Cut `label` to `maxLength` characters, appending '..' only if it was cut."""
    if len(label) <= maxLength:
        return label
    return label[:maxLength] + '..'


def getPathwayPath(p, ontologyTerms, maxLabelLength=10):
    """Build the '/'-separated path '<species>/<ontology path>/<name>' for a pathway.

    p: pathway dict with the keys 'id', 'name' and 'species'.
    ontologyTerms: dict mapping term id to PathwayOntologyTerm.
    maxLabelLength: ontology labels longer than this are abbreviated
        (default 10, the previously hard-coded value).

    The ontology path runs from the root term down to the assigned term. Only
    one 'Pathway Ontology' assignment is used: the first one whose id is
    present in `ontologyTerms`. If there is none, the ontology path is
    'no ontology assigned'. Slashes in the pathway name are replaced by '-'
    so the name stays a single path segment.
    """
    name = p['name'].replace('/', '-')
    species = p['species']
    oterms = getJSON(wpUrl + 'getOntologyTermsByPathway?pwId=' + p['id'] + '&format=json')

    labels = []
    for t in oterms['terms']:
        if t['ontology'] != 'Pathway Ontology':
            continue
        term = ontologyTerms.get(t['id'].strip())
        if term:
            ## Term first, then its ancestors up to the root
            labels = [x.name for x in term.getParentsRecursive([])]
            break  # Only take first ontology assignment

    ontologyPath = 'no ontology assigned'
    if labels:
        ## Reverse so the path reads root -> ... -> assigned term
        ontologyPath = '/'.join(_abbreviateLabel(s, maxLabelLength) for s in reversed(labels))
    return species + '/' + ontologyPath + '/' + name
class PathwayOntologyTerm:
    """A Pathway Ontology term holding an id, a name and a link to one parent term."""

    def __init__(self, id, name, parent):
        # parent is another PathwayOntologyTerm, or a falsy value for a root
        self.id, self.name, self.parent = id, name, parent

    def getParentsRecursive(self, result):
        """Append this term and then each ancestor in turn to `result`; return `result`.

        The walk follows the `parent` links and stops at the first falsy parent.
        """
        node = self
        while True:
            result.append(node)
            if not node.parent:
                return result
            node = node.parent
## Run the export only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment