Created
November 11, 2013 11:54
-
-
Save Bouke/7412121 to your computer and use it in GitHub Desktop.
IANA Language Tag Parser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Parses the language tags provided by IANA_.

For a definition of the tag types, see
http://www.w3.org/International/questions/qa-choosing-language-tags

.. _IANA: http://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
""" | |
from collections import OrderedDict | |
from pprint import pprint | |
from urllib.request import urlopen | |
# Location of the IANA language subtag registry. Fetched over HTTPS so the
# registry contents cannot be tampered with in transit.
URL = "https://www.iana.org/assignments/language-subtag-registry/language" \
      "-subtag-registry"
def string_lines(iterator):
    """Decode raw registry lines and unfold continuation lines.

    Each item of *iterator* is decoded as UTF-8 (when it is ``bytes``) and
    stripped of its line ending.  A line that starts with a space continues
    the previous logical line: its first character is dropped and the
    remainder is appended to that line.  Empty lines are skipped.

    Args:
        iterator: Iterable of lines, as ``bytes`` or ``str``.

    Yields:
        str: One unfolded logical line at a time.
    """
    prev_line = None
    for line in iterator:
        if isinstance(line, bytes):
            line = line.decode(encoding='UTF-8')
        # Drop '\r' as well so CRLF input still produces a bare '%%'.
        line = line.replace('\n', '').rstrip('\r')
        if line.startswith(' '):
            # A continuation with nothing before it starts a new logical
            # line instead of failing on ``None += str``.
            if prev_line is None:
                prev_line = line[1:]
            else:
                prev_line += line[1:]
            continue
        if prev_line:
            yield prev_line
        prev_line = line
    if prev_line:
        yield prev_line
def locale_chunks(iterator):
    """Group logical lines into records delimited by ``%%`` lines.

    Every line of the form ``key: value`` becomes a ``[key, value]`` pair;
    only the first ``': '`` splits, so values may themselves contain
    ``': '``.  Lines without that separator are ignored.

    Args:
        iterator: Iterable of unfolded ``str`` lines.

    Yields:
        list: One record per chunk, as a list of ``[key, value]`` lists.
        A record is emitted at every ``%%`` line, and once more at the end
        of input if any pairs are still pending.
    """
    data = []
    for line in iterator:
        if line == '%%':
            yield data
            data = []
            continue
        key, separator, value = line.partition(': ')
        if separator:
            data.append([key, value])
    # The input need not end with '%%'; do not lose the last record.
    if data:
        yield data
def locale_dict(iterator):
    """Turn each record of key/value pairs into a dictionary.

    Args:
        iterator: Iterable of records, each an iterable of two-item
            ``(key, value)`` pairs.

    Yields:
        dict: One mapping per record.  When a key occurs more than once in
        a record, the last value wins.

    Raises:
        ValueError: If an item of a record is not a two-item pair.
    """
    for data in iterator:
        yield dict(data)
def language_filter(iterator):
    """Select language-like records and reduce them to (tag, description).

    Records whose ``Type`` is ``language`` or ``extlang`` are identified by
    their ``Subtag``; records of type ``redundant`` are identified by their
    ``Tag``.  Records of any other type, or without a ``Type``, are skipped.

    Args:
        iterator: Iterable of record dictionaries.

    Yields:
        tuple: ``(tag, description)`` for every selected record.

    Raises:
        KeyError: If a selected record lacks its identifying key or its
            ``Description``.
    """
    for locale in iterator:
        # Named ``kind`` so the builtin ``type`` is not shadowed.
        kind = locale.get('Type')
        if kind in ('language', 'extlang'):
            yield locale['Subtag'], locale['Description']
        elif kind == 'redundant':
            yield locale['Tag'], locale['Description']
# Download the registry, parse it and build a tag -> description mapping
# ordered by tag.  The response is closed as soon as parsing is finished.
with urlopen(URL) as response:
    language_iterator = language_filter(locale_dict(locale_chunks(
        string_lines(response))))
    # ``key`` belongs to ``sorted``; passed to ``OrderedDict`` it would be
    # stored as a spurious 'key' entry in the mapping.
    LANGUAGES = OrderedDict(
        sorted(language_iterator, key=lambda item: item[0]))
pprint(LANGUAGES)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Parses the language tags provided by IANA. For a definition of the tag types, see the W3C's explanation.