Last active
September 20, 2019 15:34
-
-
Save pspaul/37f2a4cd1507c8515a46ac0fcbd1fd5c to your computer and use it in GitHub Desktop.
Unicode URL Shortener
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import re | |
import sys | |
import requests | |
from urllib.parse import quote | |
def download_table(): | |
r = requests.get('http://www.unicode.org/Public/idna/latest/IdnaMappingTable.txt') | |
return r.text | |
def read_table(): | |
# Source: http://www.unicode.org/Public/idna/latest/IdnaMappingTable.txt | |
with open('IdnaMappingTable.txt', 'rb') as f: | |
return f.read().decode('utf-8') | |
def int2unicode(i): | |
return chr(i) | |
def code2unicode(code): | |
return int2unicode(int(code, 16)) | |
def parse_table(table): | |
lines = table.split('\n') | |
# strip header | |
lines = lines[11:] | |
entries = {} | |
for line in lines: | |
match = re.match(r'^([0-9A-F]{4,}(?:\.\.[0-9A-F]{4,})?)\s+;\s+([^;#]+?)\s+(?:;((?: [0-9A-F]{4,})+)\s+)?#.+$', line) | |
if match is None: | |
continue | |
parts = match.groups() | |
chars = parts[0] | |
state = parts[1].strip() | |
if state != 'mapped': | |
continue | |
if '..' in chars: | |
start, end = [int(char, 16) for char in chars.split('..')] | |
chars = [] | |
for code in range(start, end + 1): | |
chars.append(int2unicode(code)) | |
else: | |
chars = [code2unicode(chars)] | |
mapping = parts[2].strip().split(' ') | |
for char in chars: | |
entries[char] = ''.join([code2unicode(code) for code in mapping]) | |
return entries | |
def create_lookup(parsed): | |
reverse = {} | |
for key, value in parsed.items(): | |
if value not in reverse: | |
reverse[value] = [] | |
reverse[value].append(key) | |
return reverse | |
def optimize(lookup): | |
# remove 1:1 mappings | |
optimized = {} | |
for long, shortcuts in lookup.items(): | |
for shortcut in shortcuts: | |
if len(shortcut) < len(long): | |
if (long not in optimized) or (len(shortcut) < len(optimized[long][0])): | |
# shortcut is the first or shorter | |
optimized[long] = [shortcut] | |
elif len(shortcut) == len(optimized[long][0]): | |
# shortcut is not longer than the previously found one | |
optimized[long].append(shortcut) | |
return optimized | |
class Node: | |
def __init__(self, value): | |
self.value = value | |
self.id = None | |
def __hash__(self): | |
return hash(self.id) | |
def __eq__(self, other): | |
return self.id == other.id | |
def __str__(self): | |
return str((self.id, self.value)) | |
def __repr__(self): | |
return self.__str__() | |
# Directed Acyclic Graph | |
class DAG: | |
def __init__(self): | |
self.nodes = [] | |
self.edges_to = {} | |
self.edges_from = {} | |
def add_node(self, value=None): | |
node = Node(value) | |
self.nodes.append(node) | |
node.id = len(self.nodes) - 1 | |
return node | |
def add_edge(self, node_from, node_to): | |
if node_from not in self.edges_from: | |
self.edges_from[node_from] = [] | |
self.edges_from[node_from].append(node_to) | |
if node_to not in self.edges_to: | |
self.edges_to[node_to] = [] | |
self.edges_to[node_to].append(node_from) | |
def predecessors(self, node): | |
if node in self.edges_to: | |
return self.edges_to[node][:] | |
else: | |
return [] | |
def successors(self, node): | |
if node in self.edges_from: | |
return self.edges_from[node][:] | |
else: | |
return [] | |
def shorten_optimal(lookup, string): | |
# create graph | |
g = DAG() | |
# insert starting point | |
start = g.add_node() | |
# insert all 'regular' nodes + edges | |
char_nodes = [] | |
last = start | |
for char in string: | |
n = g.add_node(char) | |
char_nodes.append(n) | |
g.add_edge(last, n) | |
last = n | |
# insert destination point | |
end = g.add_node() | |
g.add_edge(last, end) | |
# insert all shortcuts | |
for long, shortcuts in lookup.items(): | |
# select one of the possible shortest shortcuts | |
shortcut = shortcuts[0] | |
n = g.add_node(shortcut) | |
# check all occurrences | |
start_pos = 0 | |
while True: | |
start_pos = string.find(long, start_pos) | |
if start_pos == -1: | |
break | |
start_node = char_nodes[start_pos] | |
end_node = char_nodes[start_pos + len(long) - 1] | |
for from_node in g.predecessors(start_node): | |
for to_node in g.successors(end_node): | |
g.add_edge(from_node, n) | |
g.add_edge(n, to_node) | |
start_pos += 1 | |
# prepare | |
pred = {start: None} | |
dist = {start: 0} | |
# find shortest path | |
q = [start] | |
while len(q) > 0: | |
current = q.pop(0) | |
for succ in g.successors(current): | |
distance = dist[current] + 1 | |
if succ in dist: | |
if distance < dist[succ]: | |
dist[succ] = distance | |
pred[succ] = current | |
else: | |
dist[succ] = distance | |
pred[succ] = current | |
q.append(succ) | |
# reconstruct path | |
path = [] | |
current = end | |
while current is not None: | |
path.insert(0, current) | |
current = pred[current] | |
return ''.join(n.value for n in path[1:-1]) | |
def shorten_url(url): | |
# table = read_table() | |
table = download_table() | |
parsed = parse_table(table) | |
lookup = create_lookup(parsed) | |
optimized = optimize(lookup) | |
shortened = shorten_optimal(optimized, url) | |
print('[*] Shortened from {} to {} chars!'.format(len(url), len(shortened))) | |
print('[+] Result: ', shortened) | |
print('[+] Encoded:', quote(shortened)) | |
def main(): | |
if len(sys.argv) != 2: | |
print('Usage: {} <url>'.format(sys.argv[0])) | |
exit(1) | |
url = sys.argv[1] | |
shorten_url(url) | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment