Skip to content

Instantly share code, notes, and snippets.

@jelmervdl
Created April 13, 2021 09:54
Show Gist options
  • Save jelmervdl/27ac4b01c936bbe2c8ae7c485058d81e to your computer and use it in GitHub Desktop.
Save jelmervdl/27ac4b01c936bbe2c8ae7c485058d81e to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import site
site.addsitedir('env/src/tmxutil')
import sys
import re
from datetime import datetime
from tmxutil import make_reader, TMXWriter
from collections import defaultdict
from logging import getLogger, ERROR
# Set the root logger's threshold to ERROR so that WARNING-level (and lower)
# log records are not emitted.
# NOTE(review): "importing" presumably refers to reading the TMX input via
# tmxutil rather than to Python imports (which already ran above) -- confirm.
getLogger().setLevel(ERROR) # Hide warnings while importing
def domain(url):
    """Return the host part of *url*, lower-cased, for use as a grouping key.

    An optional ``http:``/``https:`` scheme (matched case-insensitively) and
    an optional ``//`` prefix are skipped; the host is everything up to the
    first ``/``, ``?`` or ``#``. Any userinfo or port present in the URL is
    kept as part of the returned value.

    Args:
        url: The URL (or scheme-less ``host/path`` string) to inspect.

    Returns:
        The lower-cased host, or *url* unchanged when no host can be
        extracted (e.g. an empty string or a string starting with ``/``).
    """
    # Scheme and host are case-insensitive, so match the scheme with
    # IGNORECASE and normalise the host; otherwise "HTTP://Example.com/x"
    # would produce "HTTP:" as its domain.
    match = re.match(
        r'^(?:https?:)?(?://)?(?P<domain>[^/?#]+)', url, re.IGNORECASE
    )
    return match.group('domain').lower() if match else url
# Copy a TMX document from stdin to stdout, trimming each translation's
# 'source-document' collection to a single URL per domain.
with TMXWriter(sys.stdout, creation_date=datetime.now()) as writer:
    for unit in make_reader(sys.stdin.buffer):
        for translation in unit.translations.values():
            # Maps domain -> the smallest URL (by string comparison) seen
            # for that domain, so the surviving URL does not depend on
            # iteration order.
            smallest = {}
            for url in translation['source-document']:
                key = domain(url)
                current = smallest.get(key)
                if current is None or url < current:
                    smallest[key] = url
            translation['source-document'] = set(smallest.values())
        writer.write(unit)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment