Created
April 13, 2021 09:54
-
-
Save jelmervdl/27ac4b01c936bbe2c8ae7c485058d81e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| import site | |
| site.addsitedir('env/src/tmxutil') | |
| import sys | |
| import re | |
| from datetime import datetime | |
| from tmxutil import make_reader, TMXWriter | |
| from collections import defaultdict | |
| from logging import getLogger, ERROR | |
| getLogger().setLevel(ERROR) # Hide warnings while importing | |
def domain(url):
    """Return the domain part of *url*, or *url* unchanged if none is found.

    An optional leading ``http:``/``https:`` scheme and an optional ``//``
    are skipped; the domain is the run of non-``/`` characters that follows.
    """
    found = re.match(r'^(https?:)?(//)?(?P<domain>[^/]+)', url)
    if found is None:
        # Nothing domain-like at the start (e.g. empty string or a leading
        # single slash): fall back to the input itself.
        return url
    return found.group('domain')
# Copy the TMX read from stdin to stdout, trimming each translation's
# 'source-document' collection down to a single URL per domain.
with TMXWriter(sys.stdout, creation_date=datetime.now()) as writer:
    for unit in make_reader(sys.stdin.buffer):
        for translation in unit.translations.values():
            # Maps domain -> the smallest URL (by string comparison) seen
            # for that domain within this translation.
            smallest_by_domain = {}
            for candidate in translation['source-document']:
                key = domain(candidate)
                kept = smallest_by_domain.get(key)
                if kept is None or candidate < kept:
                    smallest_by_domain[key] = candidate
            translation['source-document'] = set(smallest_by_domain.values())
        writer.write(unit)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment