import os
import collections

import langdetect

LANGUAGE_CODES = os.listdir(langdetect.PROFILES_DIRECTORY)


def detect_language(text, max_length=2):
    """ Make sure we return at most `max_length`-letter keys for languages. """
    shorter = {'zh-cn': 'cn', 'zh-tw': 'zh'}
    code = langdetect.detect(text)
    short_code = shorter.get(code, code) if len(code) > max_length else code
    return short_code[:max_length]
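
# A minimal illustration, using strings from the tests below (detection is
# statistical, so very short inputs can occasionally be misclassified):
#
#   detect_language('hello world')       # -> 'en'
#   detect_language('несколько текста')  # -> 'ru'
#   detect_language('中文也可以的')        # -> 'cn'  (langdetect says 'zh-cn')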


def split(text, sep='.:', ends=['\n', ':'], min_key_length=2, max_key_length=2,
          autodetect=True, paragraph_sep='\n\n', markdown=False, title=False):
    """
    Splits `text` by `sep` and merges chunks that share the same key (the part
    before the first of `ends`), as long as the key is between `min_key_length`
    and `max_key_length` characters long.

    Chunks without a key get their language auto-detected paragraph by
    paragraph; with `autodetect=False` they are assigned to the key None.
    Returns an OrderedDict keyed by language code.

    Tips:
        Pass markdown=True to get the result recombined back into the markup.
        Pass title=True to produce the single-line title form, using ':' as
        the key terminator instead of a newline.
    """
    result = collections.defaultdict(str)
    lang_seq = []

    for token in text.split(sep):

        if not token:
            continue

        name = None
        chunk = token

        if len(token[:max_key_length+1]) == max_key_length+1:
            for symbol in ends:
                pos = token[:max_key_length+1].find(symbol)
                if min_key_length <= pos <= max_key_length:
                    name, chunk = token[:pos], token[pos+1:]

        if not name:
            if autodetect:
                paragraphs = chunk.split(paragraph_sep)
                number_of_paragraphs = len(paragraphs)
                for i, paragraph in enumerate(paragraphs):
                    if not paragraph:
                        continue
                    name = detect_language(paragraph)
                    result[name] += paragraph
                    if i < number_of_paragraphs - 1:
                        result[name] += paragraph_sep
                    if name not in lang_seq:
                        lang_seq.append(name)
            else:
                result[name] += chunk
                if name not in lang_seq:
                    lang_seq.append(name)
        else:
            result[name] += chunk
            if name not in lang_seq:
                lang_seq.append(name)

    result = collections.OrderedDict(
        [(lang, result[lang]) for lang in lang_seq]
    )

    if markdown:
        text_md = ''
        for lang in lang_seq:
            text_md += '{sep}{lang}{end}{text}'.format(
                sep=sep,
                lang=lang,
                end=ends[0] if not title else ends[1],
                text=result[lang],
            )
        return text_md.strip()

    return result
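
# A quick doctest-style sketch (keys are given explicitly here, so no language
# detection is involved):
#
#   >>> split('.:en:hello world.:lt:smart world')
#   OrderedDict([('en', 'hello world'), ('lt', 'smart world')])
#
#   >>> split('.:en:hello world.:lt:smart world', markdown=True, title=True)
#   '.:en:hello world.:lt:smart world'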


def test_title():
    text = '.:en:hello world.:lt:smart world.:ja:今日は、世界'
    expect = collections.OrderedDict(
        [('en', 'hello world'),
         ('lt', 'smart world'),
         ('ja', '今日は、世界')]
    )
    assert split(text) == expect


def test_body():
    text = '''.:en
some text

which is good

.:ru
несколько текста

.:en
so want to try

.:lt
nieko sau, viskas gerai

.:cn
中文也可以的
'''
    expect = collections.OrderedDict(
        [('en', 'some text\n\nwhich is good\n\nso want to try\n\n'),
         ('ru', 'несколько текста\n\n'),
         ('lt', 'nieko sau, viskas gerai\n\n'),
         ('cn', '中文也可以的\n')]
    )
    assert split(text) == expect


def test_partial_autodetect():
    text = 'hello world.:lt:smart world.:ja:今日は、世界'
    expect = collections.OrderedDict(
        [('en', 'hello world'),
         ('lt', 'smart world'),
         ('ja', '今日は、世界')]
    )
    result = split(text)
    assert result == expect


def test_autodetect():
    text = '''some text
which is good

несколько текста

so want to try

šienpjovys džemas

中文也可以的
'''
    expect = collections.OrderedDict(
        [('en', 'some text\nwhich is good\n\nso want to try\n\n'),
         ('ru', 'несколько текста\n\n'),
         ('lt', 'šienpjovys džemas\n\n'),
         ('cn', '中文也可以的\n')]
    )
    result = split(text)
    assert result == expect


def test_markdown():
    text = '''中文也可以的

some text
which is good

несколько текста

so want to try

šienpjovys džemas'''
    expect = '''.:cn
中文也可以的

.:en
some text
which is good

so want to try

.:ru
несколько текста

.:lt
šienpjovys džemas'''
    result = split(text, markdown=True)
    assert result == expect


def test_markdown_title():
    text = '世界,你好.:lt:Sveikas, Pasauli'
    expect = '.:cn:世界,你好.:lt:Sveikas, Pasauli'
    result = split(text, markdown=True, title=True)
    assert result == expect


if __name__ == '__main__':
    test_title()
    test_body()
    test_partial_autodetect()
    test_autodetect()
    test_markdown()
    test_markdown_title()
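
# To try it out (a sketch; the file name below is illustrative, and langdetect
# needs to be installed first, e.g. `pip install langdetect`):
#
#   python langsplit.py        # runs the asserts above
#   pytest langsplit.py        # or let pytest collect the test_* functions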