Created
April 12, 2015 19:06
-
-
Save DrLulz/21ccc6407d0ee56a99e4 to your computer and use it in GitHub Desktop.
Parsing XML for Alfred
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding: utf-8 | |
from __future__ import print_function, unicode_literals | |
from HTMLParser import HTMLParser | |
from urllib import quote | |
import sys | |
import hashlib | |
from xml.etree.cElementTree import fromstring as make_etree | |
from workflow import Workflow, ICON_WEB, web | |
API_KEY = '' | |
# These don't really need to be bytes, but it's more correct, as | |
# we're going to replace {query} and {title} with bytestrings | |
API_URL = b'http://www.dictionaryapi.com/api/v1/references/medical/xml/{query}' | |
WEB_URL = b'http://www.merriam-webster.com/medical/{title}' | |
log = None | |
def dict_search(query):
    """Fetch the raw XML response for `query` from the dictionary API.

    Returns the response body as Unicode text.
    """
    encoded = quote(query.encode('utf-8'))
    response = web.get(API_URL.format(query=encoded), {'key': API_KEY})
    # Fail loudly on HTTP errors instead of trying to parse an error page
    response.raise_for_status()
    # Use `.text` (Unicode), not `.content` (bytes): web.py knows the
    # encoding from the HTTP headers, so let it do the decoding
    return response.text
def sanitise_output(text, simplify=False):
    """Decode HTML entities in `text`.

    If `simplify` is True, additionally strip the middle-dot
    syllable marker and replace en-dashes with ASCII hyphens.
    """
    parser = HTMLParser()
    cleaned = parser.unescape(text)
    if not simplify:
        return cleaned
    # Drop the middle dot (syllable separator in headwords)
    cleaned = cleaned.replace('\xb7', '')
    # En-dash -> plain hyphen
    return cleaned.replace('\u2013', '-')
def parse_response(xmlstr):
    """Parse the API's XML response into a list of result dicts.

    Each dict has `title`, `url` and `definition` keys; entries
    without a headword are dropped.
    """
    root = make_etree(xmlstr)
    entries = root.findall('entry')
    log.debug('{} entries'.format(len(entries)))
    results = []
    for entry in entries:
        item = {'title': None, 'url': None, 'definition': None}
        # Headword -> title and web URL
        headword = entry.find('hw')
        if headword is not None and headword.text is not None:
            name = sanitise_output(wf.decode(headword.text), True)
            item['title'] = name
            item['url'] = WEB_URL.format(title=quote(name.encode('utf-8')))
        # Definition text for the first sense
        node = entry.find('def/sensb/sens/dt')
        if node is not None and node.text is not None:
            item['definition'] = sanitise_output(wf.decode(node.text))
        log.debug(item)
        # A result with no title is useless in Alfred — skip it
        if item['title'] is not None:
            results.append(item)
    return results
def make_cache_key(query):
    """Return a filesystem-safe cache key for `query`.

    The query is hashed because it may contain characters that are
    not allowed in filenames.

    Fix: `hashlib.md5()` requires bytes — hashing a Unicode query
    directly raises UnicodeEncodeError for non-ASCII input (and
    TypeError on Python 3), so encode to UTF-8 first. Byte strings
    are accepted unchanged for backward compatibility.
    """
    if isinstance(query, bytes):
        data = query
    else:
        data = query.encode('utf-8')
    return hashlib.md5(data).hexdigest()
def main(wf):
    """Workflow entry point: search the API (cached) and emit Alfred items."""
    query = wf.args[0]

    def fetch():
        return dict_search(query)

    # One cache per query; MD5 keeps the key safe for use in a
    # filename even when the query contains forbidden characters
    key = 'search-{}'.format(make_cache_key(query))
    # Cache the raw XML rather than parsed results: parsing changes
    # then show up in Alfred without re-hitting the API every time
    xmlstr = wf.cached_data(key, fetch, max_age=600)

    for result in parse_response(xmlstr):
        wf.add_item(title=result['title'],
                    subtitle=result['definition'],
                    arg=result['definition'],
                    valid=True,
                    icon="icon.png")
    wf.send_feedback()
if __name__ == '__main__':
    # Create the workflow object and expose its logger via the
    # module-level `log` that the other functions use
    wf = Workflow()
    log = wf.logger
    # Run `main` through the workflow wrapper and exit with its status
    sys.exit(wf.run(main))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment