Created
May 9, 2019 17:13
-
-
Save JordanReiter/5f63c43d542ad0e6a78f17a733e5626a to your computer and use it in GitHub Desktop.
Given a block of text written in mixed languages, split it into individual sections by language.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
Split mixed-language text into per-language sections.

Requires langdetect, available on PyPI (pip install langdetect):
https://github.com/Mimino666/langdetect
'''
import langdetect | |
def split_by_language(content, delimiter='\n', joiner='\n',
                      languages=None, fail_silently=True):
    '''
    Split text written in 2 or more languages into one section per language.

    The content is cut into chunks, each chunk is passed to
    langdetect.detect(), and consecutive chunks are grouped under the
    detected language.

    Args:
        content: the text to split.
        delimiter: either a string, used with content.split(), or an
            object with a split(content) method such as a compiled regex.
            If the delimiter is a string, it is also used to re-join the
            chunks and any value given for joiner is ignored.
        joiner: string used to combine the chunks of a section when the
            delimiter is a regex.
        languages: optional sequence of accepted language codes. A chunk
            detected as any other language is assumed to be in the
            previous language (or, if there is none yet, in the first
            identifiable language that follows).
        fail_silently: when true, detection problems are papered over by
            re-using the previous language; when false they raise.

    Returns:
        A tuple (primary, first_lang, sections) where primary is the text
        of the first section, first_lang is its language, and sections
        maps every language found to its joined, stripped text.
        If nothing could be identified and languages was given,
        first_lang falls back to languages[0].

    Raises:
        langdetect.lang_detect_exception.LangDetectException: detection
            failed for a chunk and fail_silently is false.
        ValueError: fail_silently is false and a chunk is in a language
            outside `languages`, or a language shows up again after a
            different language has started.
    '''
    try:
        chunks = content.split(delimiter)
        # a plain-string delimiter is also what glues the chunks back
        joiner = delimiter
    except TypeError:
        # must not be a string, so must be a regex!
        chunks = delimiter.split(content)

    first_lang = None
    last_lang = None
    lang_sections = {}
    for chunk in chunks:
        try:
            current_lang = langdetect.detect(chunk)
        except langdetect.lang_detect_exception.LangDetectException:
            if not fail_silently:
                raise
            current_lang = last_lang
        if languages and current_lang not in languages:
            if not fail_silently:
                raise ValueError("Invalid language: {}".format(current_lang))
            current_lang = last_lang
        if not first_lang:
            first_lang = current_lang
        lang_sections.setdefault(current_lang, [])
        if current_lang != last_lang:
            if lang_sections[current_lang]:
                # language changed but this language already has content
                if not fail_silently:
                    raise ValueError(
                        "Language {} found in different locations".format(
                            current_lang
                        )
                    )
                # assume language didn't change & is just a detection
                # error; the running section simply continues, so no
                # terminator is added (one would insert a blank chunk in
                # the middle of the section)
                current_lang = last_lang
            else:
                # genuine switch to a language not seen before
                if None in lang_sections:
                    # there was an unidentified language, so put it here
                    lang_sections[current_lang] += lang_sections.pop(None)
                if last_lang in lang_sections:
                    # add empty element so it ends with joiner
                    lang_sections[last_lang].append('')
        lang_sections[current_lang].append(chunk)
        last_lang = current_lang

    # computed before the fallback below renames the None section
    primary = joiner.join(lang_sections.get(first_lang, [])).strip()
    if languages and not first_lang:
        # nothing was identifiable: file everything under the first
        # accepted language
        first_lang = languages[0]
        lang_sections[first_lang] = lang_sections.pop(None, None) or []
    return (
        primary,
        first_lang,
        {kk: joiner.join(vv).strip() for kk, vv in lang_sections.items()}
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment