Created
February 22, 2021 11:44
-
-
Save nuno-andre/52d19a3665c6ef2c67d81811ffb16bd0 to your computer and use it in GitHub Desktop.
Retrieve a URL's eTLD (effective top-level domain) and its operator from the Public Suffix List
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Retrieve a URL's eTLD (effective top-level domain) and its operator | |
from the Public Suffix List | |
https://wiki.mozilla.org/Public_Suffix_List | |
""" | |
from functools import cached_property | |
from urllib.parse import urlparse | |
from typing import Optional | |
from io import StringIO | |
import requests | |
import re | |
# Raw text of the Public Suffix List, fetched from the upstream GitHub repository.
SOURCE = 'https://raw.githubusercontent.com/publicsuffix/list/master/public_suffix_list.dat'

# Matches comment lines of the form "// <root> : <operator>" in the list.
REOPER = re.compile(
    r'^//\s'                # comment marker followed by one whitespace character
    r'(?P<root>.*?)'        # text before the " : " separator
    r'\s:\s'                # the separator itself
    r'(?:[\-0-9]*\s)?'      # optional run of digits/hyphens plus whitespace
                            # NOTE(review): presumably a date prefix -- confirm
    r'(?P<operator>.*?)'    # text after the separator: the operator
    r'\s*?$'                # tolerate trailing whitespace
)
class EtldParse:
    '''Split a URL's hostname into host and eTLD using the Public Suffix List.

    The list is downloaded from ``SOURCE`` the first time ``etlds`` is read
    and is then cached on the instance.

    Only literal suffix rules are matched: wildcard (``*.``) and exception
    (``!``) rules are stored verbatim and ``parse`` applies no special
    handling to them.
    '''

    @cached_property
    def etlds(self) -> dict[str, Optional[str]]:
        '''Dictionary {suffix: operator} built from the Public Suffix List.

        Every non-comment, non-blank line becomes a key. Its value is the
        operator captured from the most recent comment line matching
        ``REOPER``, or ``None`` if no such comment has been seen yet.

        Raises:
            requests.RequestException: if the download fails, times out or
                returns an HTTP error status.
        '''
        # Without a timeout a stalled connection would block the caller forever.
        response = requests.get(SOURCE, timeout=30)
        # Fail loudly instead of parsing an HTTP error page as suffix data.
        response.raise_for_status()

        etlds: dict[str, Optional[str]] = {}
        operator = None
        for line in response.text.splitlines():
            entry = line.strip()
            if not entry:
                # Blank or whitespace-only line: nothing to record.
                continue
            if entry.startswith('/'):
                # Comment line; it only matters when it names an operator.
                if (match := REOPER.match(entry)):
                    operator = match.group('operator')
            else:
                etlds[entry] = operator
        return etlds

    def parse(self, url: str) -> dict[str, Optional[str]]:
        '''Returns host, eTLD and the eTLD's operator for ``url``.

        The hostname is consumed label by label from the right for as long
        as the accumulated suffix is a key of ``etlds``; the longest such
        suffix is the eTLD and the remaining labels form the host.

        Args:
            url: URL to inspect. A bare ``host/path`` without a scheme is
                accepted as well.

        Returns:
            A dict with keys ``host``, ``etld`` and ``operator``. Each value
            is ``None`` when the corresponding part is empty or unknown; all
            three are ``None`` when no hostname can be extracted.
        '''
        hostname = urlparse(url).hostname
        if hostname is None and '://' not in url:
            # urlparse only recognises a network location after "//", so a
            # scheme-less "example.com/path" needs the prefix to be parsed.
            hostname = urlparse('//' + url).hostname
        # A fully-qualified name may end with the root dot; drop it so the
        # last label is the actual TLD.
        hostname = (hostname or '').rstrip('.')
        if not hostname:
            return dict(host=None, etld=None, operator=None)

        labels = hostname.split('.')
        suffixes = self.etlds
        operator = None
        split_at = len(labels)  # labels[split_at:] is the eTLD found so far
        while split_at > 0:
            candidate = '.'.join(labels[split_at - 1:])
            # Membership test, not truthiness: a suffix recorded without an
            # operator (None or '') is still a valid suffix.
            if candidate not in suffixes:
                break
            operator = suffixes[candidate]
            split_at -= 1

        return dict(host='.'.join(labels[:split_at]) or None,
                    etld='.'.join(labels[split_at:]) or None,
                    operator=operator)
if __name__ == '__main__':
    # Demo run: downloads the Public Suffix List once, then prints the
    # host/eTLD/operator split for a few sample URLs.
    tldp = EtldParse()

    # Expected output, one line per URL, as recorded by the original author:
    #   {'host': 'foo.bar', 'etld': 'gal', 'operator': 'Asociación puntoGAL'}
    #   {'host': 'foo.bar', 'etld': 'kouzushima.tokyo.jp', 'operator': 'https://en.wikipedia.org/wiki/.jp'}
    #   {'host': None, 'etld': 'kouzushima.tokyo.jp', 'operator': 'https://en.wikipedia.org/wiki/.jp'}
    for sample in (
        'http://foo.bar.gal/whatever',
        'http://foo.bar.kouzushima.tokyo.jp/whatever',
        'http://kouzushima.tokyo.jp/whatever',
    ):
        print(tldp.parse(sample))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment