thomaskelder · June 16, 2016 08:29
diff --git a/wikipathways2gource.py b/wikipathways2gource.py
 ## Get pathway edit history from wikipathways.org and export as gource log file
 ## Run using:
 ## python wikipathways2gource.py wikipathways.log
 ## View in gource using:
 ## gource -1280x720 -a 3 --seconds-per-day 0.01 --hide dirnames,filenames --user-filter MaintBot --key wikipathways.log
 ## Create video with gource using:
 ## gource -1280x720 -a 2 --seconds-per-day 0.005 --hide dirnames,filenames --user-filter MaintBot --key --stop-at-end -o - wikipathways.log | ffmpeg -y -r 30 -f image2pipe -vcodec ppm -i - -vcodec libvpx -b 10000K wikipathways.webm

 ## Known issues/shortcomings:
 ## - Pathway ontology terms can have multiple parents, this is ignored, only first is taken here to prevent pathways from showing multiple times in the visualization
 ## - Only works on pathways that are currently present, pathways that were deleted do not show up in history (limitation of WikiPathways webservice)
 ## - All log entries are recorded as "Modified" action, as the WikiPathways webservice does not provide info on type of edit (addition, modification, deletion)
 ## - Can't distinguish bot edits (MaintBot is filtered out, but some scripted edits are left; old GenMAPP users are also still in, causing a burst in the beginning)

 import requests
 import requests_cache
 import json
 import pandas
 import time
 import datetime
 import sys
 import collections
 import matplotlib.cm as mplcm
 import matplotlib.colors as mplcolors

 ## Install a global requests cache named 'wikipathways-gource-cache', so that
 ## repeated runs can reuse responses instead of querying the webservice again
 requests_cache.install_cache('wikipathways-gource-cache')

 ## Base URL of the WikiPathways webservice; method names and query strings are appended to it
 wpUrl = "http://webservice.wikipathways.org/"

 def main():
    """Export the WikiPathways edit history as a gource custom log file.

    The output path is read from ``sys.argv[1]``. For every pathway returned
    by the webservice, each history entry on or after ``fromDate`` becomes one
    ``timestamp|user|M|path|color`` line; lines are written sorted by
    timestamp.
    """
    fromDate = '20080101000000'

    if len(sys.argv) < 2:
        sys.exit('Usage: python wikipathways2gource.py <output.log>')

    ## Open the output file up front so an unwritable path fails before the
    ## slow webservice queries; 'with' closes it even when a query raises
    with open(sys.argv[1], 'w') as f:
        ## Get all pathways
        print('Querying pathway list')
        pathways = getJSON(wpUrl + 'listPathways?format=json')

        organismColors = _getOrganismColors()
        ontologyTerms = _readPathwayOntology()

        ## Query and format history entries for each pathway
        print('Query history for pathways from date ' + fromDate)
        cutoff = wpTimestampToUnix(fromDate)
        ## Collected as (date, line) pairs in a list: keying a dict by date
        ## would drop all but one of the edits that share the same second
        entries = []
        for p in pathways['pathways']:
            print('Processing pathway ' + p['id'])
            pathwayPath = getPathwayPath(p, ontologyTerms) + '.' + p['species']
            color = mplcolors.rgb2hex(organismColors[p['species']]).replace('#', '').upper()
            for item in getPathwayLogItems(p['id'], fromDate):
                if item['date'] < cutoff:
                    continue
                line = str(int(item['date'])) + '|' + item['user'] + '|M|' + pathwayPath + '|' + color
                ## ':' is replaced in the whole record (user and path included)
                entries.append((item['date'], line.replace(':', '_')))

        ## Stable sort on the date only, so same-second edits keep query order
        entries.sort(key=lambda entry: entry[0])
        f.writelines(line + '\n' for _, line in entries)

 def _getOrganismColors():
    """Return a dict mapping each organism name to an RGBA color from the 'hsv' colormap."""
    organisms = getJSON(wpUrl + 'listOrganisms?format=json')['organisms']
    norm = mplcolors.Normalize(vmin=0, vmax=len(organisms)-1)
    colormap = mplcm.ScalarMappable(norm=norm, cmap='hsv')
    return {o: colormap.to_rgba(i) for i, o in enumerate(organisms)}

 def _oboId(url):
    """Turn an OBO PURL (or an already short id) into the 'PREFIX:number' form."""
    return url.replace('http://purl.obolibrary.org/obo/', '').replace('_', ':')

 def _readPathwayOntology():
    """Download the Pathway Ontology CSV and return a dict of term id -> PathwayOntologyTerm.

    Terms can have several parents ('|'-separated); only the first one is
    linked, so every pathway ends up at a single place in the tree.
    """
    print('Reading pathway ontology hierarchy')
    ## NOTE(review): the BioPortal API key is embedded in this URL; consider
    ## reading it from the environment instead of keeping it in the source
    pwo = pandas.read_csv('http://data.bioontology.org/ontologies/PW/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb&download_format=csv', compression = 'gzip')

    ## First pass: create all terms, remembering the raw parents column
    ontologyTerms = {}
    records = []
    for _, row in pwo.iterrows():
        termId = _oboId(row['Class ID'])
        ontologyTerms[termId] = PathwayOntologyTerm(termId, row['Preferred Label'], None)
        records.append((termId, str(row['Parents'])))

    ## Second pass: link each term to its first parent (None when it has none
    ## or the parent is not part of the ontology file)
    for termId, parents in records:
        if parents == 'nan':
            parents = ''
        ontologyTerms[termId].parent = ontologyTerms.get(_oboId(parents.split('|')[0]))
    return ontologyTerms

 def getJSON(url, timeout=60):
    """Fetch *url* with an HTTP GET and return the decoded JSON body.

    url: full URL to request.
    timeout: seconds to wait for the server; without it requests waits
        indefinitely on a stalled connection.

    Raises requests.exceptions.HTTPError for 4xx/5xx responses instead of
    trying to parse an error page as data.
    """
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    return r.json()

 def wpTimestampToUnix(ts):
    """Convert a 'YYYYMMDDHHMMSS' timestamp string to Unix time in seconds (float).

    The timestamp is interpreted as UTC, so the result does not depend on the
    timezone or DST rules of the machine running the script.
    NOTE(review): assumes the webservice returns MediaWiki-style UTC
    timestamps - confirm against the WikiPathways webservice documentation.

    Raises ValueError when *ts* does not match the expected format.
    """
    parsed = datetime.datetime.strptime(ts, "%Y%m%d%H%M%S")
    return (parsed - datetime.datetime(1970, 1, 1)).total_seconds()

 def getPathwayLogItems(p, fromDate):
    """Return the history entries of pathway id *p* since *fromDate*.

    Each entry is a dict with the editing 'user' and the edit 'date'
    converted with wpTimestampToUnix.
    """
    query = 'getPathwayHistory?pwId=' + p + '&timestamp=' + fromDate + '&format=json'
    response = getJSON(wpUrl + query)
    return [
        {'user': entry['user'], 'date': wpTimestampToUnix(entry['timestamp'])}
        for entry in response['history']['history']
    ]

 def getPathwayPath(p, ontologyTerms):
    """Build the gource path 'species/<ontology levels>/name' for pathway *p*.

    p: pathway dict with at least 'id', 'name' and 'species'.
    ontologyTerms: dict of ontology term id -> PathwayOntologyTerm.

    The ontology levels come from the first 'Pathway Ontology' term of the
    pathway that is present in *ontologyTerms*, listed from the root down to
    that term; 'no ontology assigned' is used when there is none.
    """
    ## '/' separates path levels, so it must not occur inside a level
    name = p['name'].replace('/', '-')
    species = p['species']

    oterms = getJSON(wpUrl + 'getOntologyTermsByPathway?pwId=' + p['id'] + '&format=json')
    labels = []
    for t in oterms['terms']:
        if t['ontology'] != 'Pathway Ontology':
            continue
        term = ontologyTerms.get(t['id'].strip())
        if term:
            labels = [x.name for x in term.getParentsRecursive([])]
            break # Only take first ontology assignment

    ontologyPath = 'no ontology assigned'
    if labels:
        ## Shorten labels to 10 characters (marking the cut with '..') and
        ## sanitize '/' like the pathway name, so a label such as
        ## 'G1/S transition' cannot introduce an extra path level
        labels = [(s[:10] + (s[10:] and '..')).replace('/', '-') for s in labels]
        ontologyPath = '/'.join(reversed(labels))
    return species + '/' + ontologyPath + '/' + name

 class PathwayOntologyTerm:
    """A pathway ontology term: an id, a display name and at most one parent term."""

    def __init__(self, id, name, parent):
        self.id, self.name, self.parent = id, name, parent

    def getParentsRecursive(self, result):
        """Append this term followed by its chain of parents to *result* and return *result*.

        The walk stops at the first term whose parent is falsy.
        """
        term = self
        while True:
            result.append(term)
            if not term.parent:
                return result
            term = term.parent

 ## Run the export only when this file is executed as a script, not when it is imported
 if __name__ == '__main__':
    main()
	## Get pathway edit history from wikipathways.org and export as gource log file
	## Run using:
	## python wikipathways2gource.py wikipathways.log
	## View in gource using:
	## gource -1280x720 -a 3 --seconds-per-day 0.01 --hide dirnames,filenames --user-filter MaintBot --key wikipathways.log
	## Create video with gource using:
	## gource -1280x720 -a 2 --seconds-per-day 0.005 --hide dirnames,filenames --user-filter MaintBot --key --stop-at-end -o - wikipathways.log | ffmpeg -y -r 30 -f image2pipe -vcodec ppm -i - -vcodec libvpx -b 10000K wikipathways.webm

	## Known issues/shortcomings:
	## - Pathway ontology terms can have multiple parents, this is ignored, only first is taken here to prevent pathways from showing multiple times in the visualization
	## - Only works on pathways that are currently present, pathways that were deleted do not show up in history (limitation of WikiPathways webservice)
	## - All log entries are recorded as "Modified" action, as the WikiPathways webservice does not provide info on type of edit (addition, modification, deletion)
	## - Can't distinguish bot edits (MaintBot is filtered out, but some scripted edits are left; old GenMAPP users are also still in, causing a burst in the beginning)

	import requests
	import requests_cache
	import json
	import pandas
	import time
	import datetime
	import sys
	import collections
	import matplotlib.cm as mplcm
	import matplotlib.colors as mplcolors

	## Install a global requests cache named 'wikipathways-gource-cache', so that
	## repeated runs can reuse responses instead of querying the webservice again
	requests_cache.install_cache('wikipathways-gource-cache')

	## Base URL of the WikiPathways webservice; method names and query strings are appended to it
	wpUrl = "http://webservice.wikipathways.org/"

	def main():
	    """Export the WikiPathways edit history as a gource custom log file.

	    The output path is read from ``sys.argv[1]``. For every pathway returned
	    by the webservice, each history entry on or after ``fromDate`` becomes one
	    ``timestamp|user|M|path|color`` line; lines are written sorted by
	    timestamp.
	    """
	    fromDate = '20080101000000'

	    if len(sys.argv) < 2:
	        sys.exit('Usage: python wikipathways2gource.py <output.log>')

	    ## Open the output file up front so an unwritable path fails before the
	    ## slow webservice queries; 'with' closes it even when a query raises
	    with open(sys.argv[1], 'w') as f:
	        ## Get all pathways
	        print('Querying pathway list')
	        pathways = getJSON(wpUrl + 'listPathways?format=json')

	        organismColors = _getOrganismColors()
	        ontologyTerms = _readPathwayOntology()

	        ## Query and format history entries for each pathway
	        print('Query history for pathways from date ' + fromDate)
	        cutoff = wpTimestampToUnix(fromDate)
	        ## Collected as (date, line) pairs in a list: keying a dict by date
	        ## would drop all but one of the edits that share the same second
	        entries = []
	        for p in pathways['pathways']:
	            print('Processing pathway ' + p['id'])
	            pathwayPath = getPathwayPath(p, ontologyTerms) + '.' + p['species']
	            color = mplcolors.rgb2hex(organismColors[p['species']]).replace('#', '').upper()
	            for item in getPathwayLogItems(p['id'], fromDate):
	                if item['date'] < cutoff:
	                    continue
	                ## Fields are separated by a plain '|' (no backslash)
	                line = str(int(item['date'])) + '|' + item['user'] + '|M|' + pathwayPath + '|' + color
	                ## ':' is replaced in the whole record (user and path included)
	                entries.append((item['date'], line.replace(':', '_')))

	        ## Stable sort on the date only, so same-second edits keep query order
	        entries.sort(key=lambda entry: entry[0])
	        f.writelines(line + '\n' for _, line in entries)

	def _getOrganismColors():
	    """Return a dict mapping each organism name to an RGBA color from the 'hsv' colormap."""
	    organisms = getJSON(wpUrl + 'listOrganisms?format=json')['organisms']
	    norm = mplcolors.Normalize(vmin=0, vmax=len(organisms)-1)
	    colormap = mplcm.ScalarMappable(norm=norm, cmap='hsv')
	    return {o: colormap.to_rgba(i) for i, o in enumerate(organisms)}

	def _oboId(url):
	    """Turn an OBO PURL (or an already short id) into the 'PREFIX:number' form."""
	    return url.replace('http://purl.obolibrary.org/obo/', '').replace('_', ':')

	def _readPathwayOntology():
	    """Download the Pathway Ontology CSV and return a dict of term id -> PathwayOntologyTerm.

	    Terms can have several parents ('|'-separated); only the first one is
	    linked, so every pathway ends up at a single place in the tree.
	    """
	    print('Reading pathway ontology hierarchy')
	    ## NOTE(review): the BioPortal API key is embedded in this URL; consider
	    ## reading it from the environment instead of keeping it in the source
	    pwo = pandas.read_csv('http://data.bioontology.org/ontologies/PW/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb&download_format=csv', compression = 'gzip')

	    ## First pass: create all terms, remembering the raw parents column
	    ontologyTerms = {}
	    records = []
	    for _, row in pwo.iterrows():
	        termId = _oboId(row['Class ID'])
	        ontologyTerms[termId] = PathwayOntologyTerm(termId, row['Preferred Label'], None)
	        records.append((termId, str(row['Parents'])))

	    ## Second pass: link each term to its first parent (None when it has none
	    ## or the parent is not part of the ontology file)
	    for termId, parents in records:
	        if parents == 'nan':
	            parents = ''
	        ontologyTerms[termId].parent = ontologyTerms.get(_oboId(parents.split('|')[0]))
	    return ontologyTerms

	def getJSON(url, timeout=60):
	    """Fetch *url* with an HTTP GET and return the decoded JSON body.

	    url: full URL to request.
	    timeout: seconds to wait for the server; without it requests waits
	        indefinitely on a stalled connection.

	    Raises requests.exceptions.HTTPError for 4xx/5xx responses instead of
	    trying to parse an error page as data.
	    """
	    r = requests.get(url, timeout=timeout)
	    r.raise_for_status()
	    return r.json()

	def wpTimestampToUnix(ts):
	    """Convert a 'YYYYMMDDHHMMSS' timestamp string to Unix time in seconds (float).

	    The timestamp is interpreted as UTC, so the result does not depend on the
	    timezone or DST rules of the machine running the script.
	    NOTE(review): assumes the webservice returns MediaWiki-style UTC
	    timestamps - confirm against the WikiPathways webservice documentation.

	    Raises ValueError when *ts* does not match the expected format.
	    """
	    parsed = datetime.datetime.strptime(ts, "%Y%m%d%H%M%S")
	    return (parsed - datetime.datetime(1970, 1, 1)).total_seconds()

	def getPathwayLogItems(p, fromDate):
	    """Return the history entries of pathway id *p* since *fromDate*.

	    Each entry is a dict with the editing 'user' and the edit 'date'
	    converted with wpTimestampToUnix.
	    """
	    h = getJSON(wpUrl + 'getPathwayHistory?pwId=' + p + '&timestamp=' + fromDate + '&format=json')
	    items = []
	    for hi in h['history']['history']:
	        date = wpTimestampToUnix(hi['timestamp'])
	        items.append({'user': hi['user'], 'date': date})
	    return items

	def getPathwayPath(p, ontologyTerms):
	    """Build the gource path 'species/<ontology levels>/name' for pathway *p*.

	    p: pathway dict with at least 'id', 'name' and 'species'.
	    ontologyTerms: dict of ontology term id -> PathwayOntologyTerm.

	    The ontology levels come from the first 'Pathway Ontology' term of the
	    pathway that is present in *ontologyTerms*, listed from the root down to
	    that term; 'no ontology assigned' is used when there is none.
	    """
	    ## '/' separates path levels, so it must not occur inside a level
	    name = p['name'].replace('/', '-')
	    species = p['species']

	    oterms = getJSON(wpUrl + 'getOntologyTermsByPathway?pwId=' + p['id'] + '&format=json')
	    labels = []
	    for t in oterms['terms']:
	        if t['ontology'] != 'Pathway Ontology':
	            continue
	        term = ontologyTerms.get(t['id'].strip())
	        if term:
	            labels = [x.name for x in term.getParentsRecursive([])]
	            break # Only take first ontology assignment

	    ontologyPath = 'no ontology assigned'
	    if labels:
	        ## Shorten labels to 10 characters (marking the cut with '..') and
	        ## sanitize '/' like the pathway name, so a label such as
	        ## 'G1/S transition' cannot introduce an extra path level
	        labels = [(s[:10] + (s[10:] and '..')).replace('/', '-') for s in labels]
	        ontologyPath = '/'.join(reversed(labels))
	    return species + '/' + ontologyPath + '/' + name

	class PathwayOntologyTerm:
	    """A pathway ontology term: an id, a display name and at most one parent term."""

	    def __init__(self, id, name, parent):
	        self.id = id
	        self.name = name
	        self.parent = parent

	    def getParentsRecursive(self, result):
	        """Append this term followed by its chain of parents to *result* and return *result*.

	        The walk stops at the first term whose parent is falsy.
	        """
	        result.append(self)
	        if self.parent:
	            self.parent.getParentsRecursive(result)
	        return result

	## Run the export only when this file is executed as a script, not when it is imported
	if __name__ == '__main__':
	    main()
No results found