Skip to content

Instantly share code, notes, and snippets.

@schwanksta
Created June 21, 2013 23:16
Show Gist options
  • Save schwanksta/5835021 to your computer and use it in GitHub Desktop.
Save schwanksta/5835021 to your computer and use it in GitHub Desktop.
Extract authors from byline string
import re
def _extract_authors(byline):
"""
Takes one of our bylines returned from the Oxygen API, and
tries to split it out into individual authors. Only works up
to a triple byline.
"""
fallback_byline = re.compile("([\-\w.&; ]+)")
single_byline = re.compile("By ([\-\w.&; ]+)")
double_byline = re.compile("By ([\-\w.&; ]+) and ([\-\w.&; ]+)")
triple_byline = re.compile("By ([\-\w.&; ]+), ([\-\w.&; ]+) and ([\-\w.&; ]+)")
byline_res = (
triple_byline,
double_byline,
single_byline,
fallback_byline
)
# Scan through from more complex to less complex. Take
# first match, because single_byline will match the first
# name from both double and triples.
for regex in byline_res:
matches = regex.search(byline)
if matches:
return matches.groups()
# If not matches, return full byline
return (byline,)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment