Skip to content

Instantly share code, notes, and snippets.

@pspaul
Last active September 20, 2019 15:34
Show Gist options
  • Save pspaul/37f2a4cd1507c8515a46ac0fcbd1fd5c to your computer and use it in GitHub Desktop.
Save pspaul/37f2a4cd1507c8515a46ac0fcbd1fd5c to your computer and use it in GitHub Desktop.
Unicode URL Shortener
#!/usr/bin/env python3
import re
import sys
import requests
from urllib.parse import quote
def download_table():
r = requests.get('http://www.unicode.org/Public/idna/latest/IdnaMappingTable.txt')
return r.text
def read_table():
# Source: http://www.unicode.org/Public/idna/latest/IdnaMappingTable.txt
with open('IdnaMappingTable.txt', 'rb') as f:
return f.read().decode('utf-8')
def int2unicode(i):
return chr(i)
def code2unicode(code):
return int2unicode(int(code, 16))
def parse_table(table):
lines = table.split('\n')
# strip header
lines = lines[11:]
entries = {}
for line in lines:
match = re.match(r'^([0-9A-F]{4,}(?:\.\.[0-9A-F]{4,})?)\s+;\s+([^;#]+?)\s+(?:;((?: [0-9A-F]{4,})+)\s+)?#.+$', line)
if match is None:
continue
parts = match.groups()
chars = parts[0]
state = parts[1].strip()
if state != 'mapped':
continue
if '..' in chars:
start, end = [int(char, 16) for char in chars.split('..')]
chars = []
for code in range(start, end + 1):
chars.append(int2unicode(code))
else:
chars = [code2unicode(chars)]
mapping = parts[2].strip().split(' ')
for char in chars:
entries[char] = ''.join([code2unicode(code) for code in mapping])
return entries
def create_lookup(parsed):
reverse = {}
for key, value in parsed.items():
if value not in reverse:
reverse[value] = []
reverse[value].append(key)
return reverse
def optimize(lookup):
# remove 1:1 mappings
optimized = {}
for long, shortcuts in lookup.items():
for shortcut in shortcuts:
if len(shortcut) < len(long):
if (long not in optimized) or (len(shortcut) < len(optimized[long][0])):
# shortcut is the first or shorter
optimized[long] = [shortcut]
elif len(shortcut) == len(optimized[long][0]):
# shortcut is not longer than the previously found one
optimized[long].append(shortcut)
return optimized
class Node:
def __init__(self, value):
self.value = value
self.id = None
def __hash__(self):
return hash(self.id)
def __eq__(self, other):
return self.id == other.id
def __str__(self):
return str((self.id, self.value))
def __repr__(self):
return self.__str__()
# Directed Acyclic Graph
class DAG:
def __init__(self):
self.nodes = []
self.edges_to = {}
self.edges_from = {}
def add_node(self, value=None):
node = Node(value)
self.nodes.append(node)
node.id = len(self.nodes) - 1
return node
def add_edge(self, node_from, node_to):
if node_from not in self.edges_from:
self.edges_from[node_from] = []
self.edges_from[node_from].append(node_to)
if node_to not in self.edges_to:
self.edges_to[node_to] = []
self.edges_to[node_to].append(node_from)
def predecessors(self, node):
if node in self.edges_to:
return self.edges_to[node][:]
else:
return []
def successors(self, node):
if node in self.edges_from:
return self.edges_from[node][:]
else:
return []
def shorten_optimal(lookup, string):
# create graph
g = DAG()
# insert starting point
start = g.add_node()
# insert all 'regular' nodes + edges
char_nodes = []
last = start
for char in string:
n = g.add_node(char)
char_nodes.append(n)
g.add_edge(last, n)
last = n
# insert destination point
end = g.add_node()
g.add_edge(last, end)
# insert all shortcuts
for long, shortcuts in lookup.items():
# select one of the possible shortest shortcuts
shortcut = shortcuts[0]
n = g.add_node(shortcut)
# check all occurrences
start_pos = 0
while True:
start_pos = string.find(long, start_pos)
if start_pos == -1:
break
start_node = char_nodes[start_pos]
end_node = char_nodes[start_pos + len(long) - 1]
for from_node in g.predecessors(start_node):
for to_node in g.successors(end_node):
g.add_edge(from_node, n)
g.add_edge(n, to_node)
start_pos += 1
# prepare
pred = {start: None}
dist = {start: 0}
# find shortest path
q = [start]
while len(q) > 0:
current = q.pop(0)
for succ in g.successors(current):
distance = dist[current] + 1
if succ in dist:
if distance < dist[succ]:
dist[succ] = distance
pred[succ] = current
else:
dist[succ] = distance
pred[succ] = current
q.append(succ)
# reconstruct path
path = []
current = end
while current is not None:
path.insert(0, current)
current = pred[current]
return ''.join(n.value for n in path[1:-1])
def shorten_url(url):
# table = read_table()
table = download_table()
parsed = parse_table(table)
lookup = create_lookup(parsed)
optimized = optimize(lookup)
shortened = shorten_optimal(optimized, url)
print('[*] Shortened from {} to {} chars!'.format(len(url), len(shortened)))
print('[+] Result: ', shortened)
print('[+] Encoded:', quote(shortened))
def main():
if len(sys.argv) != 2:
print('Usage: {} <url>'.format(sys.argv[0]))
exit(1)
url = sys.argv[1]
shorten_url(url)
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment