Skip to content

Instantly share code, notes, and snippets.

@DrLulz
Created April 12, 2015 19:06
Show Gist options
  • Save DrLulz/21ccc6407d0ee56a99e4 to your computer and use it in GitHub Desktop.
Parsing XML for Alfred
# encoding: utf-8
from __future__ import print_function, unicode_literals
from HTMLParser import HTMLParser
from urllib import quote
import sys
import hashlib
from xml.etree.cElementTree import fromstring as make_etree
from workflow import Workflow, ICON_WEB, web
# Merriam-Webster API key — must be filled in for dict_search() to work.
API_KEY = ''
# These don't really need to be bytes, but it's more correct, as
# we're going to replace {query} and {title} with bytestrings
API_URL = b'http://www.dictionaryapi.com/api/v1/references/medical/xml/{query}'
WEB_URL = b'http://www.merriam-webster.com/medical/{title}'
# Bound to `wf.logger` in the __main__ block before any function uses it.
log = None
def dict_search(query):
    """Fetch raw XML for `query` from the Merriam-Webster medical API.

    `query` is a unicode string; it is UTF-8 encoded and URL-quoted
    before being substituted into the endpoint URL. Returns the HTTP
    response body as unicode text; raises on an HTTP error status.
    """
    encoded_query = quote(query.encode('utf-8'))
    response = web.get(API_URL.format(query=encoded_query), {'key': API_KEY})
    response.raise_for_status()
    # Return `.text`, not `.content`: we want Unicode, not bytes.
    # `web.py` decodes using the HTTP headers, so it's usually a good
    # idea to let it do any decoding.
    return response.text
def sanitise_output(text, simplify=False):
    """Decode HTML entities in `text` to their characters.

    When `simplify` is True, also strip/replace a couple of non-ASCII
    characters that the API embeds in headwords.
    """
    decoded = HTMLParser().unescape(text)
    if not simplify:
        return decoded
    # Drop the middle-dot (·) syllable markers and turn en-dashes (–)
    # into plain hyphens.
    return decoded.replace('\xb7', '').replace('\u2013', '-')
def parse_response(xmlstr):
    """Parse the API's XML response into a list of result dicts.

    Each dict has the keys 'title', 'url' and 'definition'; entries
    without a title are discarded, and 'url'/'definition' may be None.

    NOTE(review): this relies on the module globals `wf` and `log`
    being bound by the __main__ block before it is called.
    """
    root = make_etree(xmlstr)
    entries = root.findall('entry')
    log.debug('{} entries'.format(len(entries)))

    results = []
    for entry in entries:
        data = {'title': None, 'url': None, 'definition': None}

        # Title comes from the <hw> (headword) element.
        headword = entry.find('hw')
        if headword is not None and headword.text is not None:
            title = sanitise_output(wf.decode(headword.text), True)
            data['title'] = title
            data['url'] = WEB_URL.format(title=quote(title.encode('utf-8')))

        # Definition text lives at def/sensb/sens/dt.
        dt = entry.find('def/sensb/sens/dt')
        if dt is not None and dt.text is not None:
            data['definition'] = sanitise_output(wf.decode(dt.text))

        log.debug(data)
        # Ignore results w/o title
        if data['title'] is not None:
            results.append(data)
    return results
def make_cache_key(query):
    """Return a filesystem-safe cache key for `query`.

    `query` may contain characters that are not allowed in filenames,
    so the MD5 hex digest of the query is used instead.

    Bug fix: `hashlib.md5()` requires bytes. The original passed the
    unicode `query` straight in, which raises for any non-ASCII query
    (UnicodeEncodeError on Python 2, TypeError on Python 3). Encode to
    UTF-8 first; ASCII queries produce the same digest as before, so
    existing cache files remain valid.
    """
    if isinstance(query, bytes):
        data = query
    else:
        data = query.encode('utf-8')
    return hashlib.md5(data).hexdigest()
def main(wf):
    """Workflow entry point: look up the query (with caching) and emit
    Alfred feedback items."""
    query = wf.args[0]

    def fetch():
        return dict_search(query)

    # We want a separate cache for each query, so the key is built from
    # an MD5 hash of `query` — the raw query might contain characters
    # that are not allowed in filenames.
    cache_key = 'search-{}'.format(make_cache_key(query))

    # During development, cache the raw XML rather than the parsed
    # results: the parsing code can then change and produce different
    # results in Alfred without hitting the API all the time.
    xmlstr = wf.cached_data(cache_key, fetch, max_age=600)

    # Compile results for Alfred
    for result in parse_response(xmlstr):
        wf.add_item(
            title=result['title'],
            subtitle=result['definition'],
            arg=result['definition'],
            valid=True,
            icon="icon.png")
    wf.send_feedback()
if __name__ == '__main__':
    wf = Workflow()
    # Bind the module-level `log` (used by parse_response) before running.
    log = wf.logger
    sys.exit(wf.run(main))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment