Created
February 7, 2018 19:19
-
-
Save bash/015bc272fe46b7b8850cc3ba05d4ef19 to your computer and use it in GitHub Desktop.
URL Extractor
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
from typing import Generator, Optional, Tuple
from urllib.parse import urlparse
# (opener, closer) pairs that have to be balanced inside a URL candidate:
# round, square, curly and angle brackets, in that order.
BRACKETS = list(zip('([{<', ')]}>'))

# Marks the beginning of a URL candidate: an http:// or https:// prefix.
PROTOCOL_REGEX = re.compile('(http|https)://')

# A run of whitespace; used as the upper bound of a URL candidate.
WHITESPACE_REGEX = re.compile('[\\s]+')
def find_first_unbalanced(input: str, opener: str, closer: str) -> Optional[int]:
    """
    Finds the first index of an unbalanced bracket pair.

    Scans ``input`` from left to right, pairing every ``closer`` with the
    most recent still-open ``opener``.

    Args:
        input: The text to scan.
        opener: The opening bracket character, e.g. ``'('``.
        closer: The closing bracket character, e.g. ``')'``.

    Returns:
        The index of the first ``closer`` that has no matching ``opener``
        before it; otherwise the index of the earliest ``opener`` that is
        never closed; ``None`` when all brackets are balanced.
    """
    # indexes of openers that have not been closed yet
    open_positions = []
    for i, char in enumerate(input):
        if char == opener:
            open_positions.append(i)
        if char == closer:
            if not open_positions:
                # a closer with nothing left to close
                return i
            open_positions.pop()
    if open_positions:
        # the bottom of the stack is the earliest opener left unclosed
        return open_positions[0]
    return None
def find_urls(input: str) -> Generator[Tuple[int, int], None, None]:
    """
    Finds and yields the start and ending position of URLs in a given string.

    Args:
        input: The text to search for http:// and https:// URLs.

    Yields:
        ``(start, end)`` tuples of offsets into ``input``, in order of
        appearance, such that ``input[start:end]`` is the URL text.
    """
    pos = 0
    while True:
        # find the next occurrence of http:// or https://
        # which makes it a potential url candidate.
        # searching from ``pos`` keeps all offsets absolute and avoids
        # copying the remainder of the input on every iteration.
        next_url = PROTOCOL_REGEX.search(input, pos)
        if next_url is None:
            return
        start = next_url.start()

        # urls are not allowed to contain whitespaces which
        # makes whitespaces a pretty good upper limit
        whitespace = WHITESPACE_REGEX.search(input, start)
        end = len(input) if whitespace is None else whitespace.start()
        url = input[start:end]

        # urls are allowed to contain all sorts of brackets.
        # to prevent brackets that are part of the text to be mistaken as
        # part of the url, we require urls to have balanced brackets and cut
        # the url "candidate" at the first unbalanced bracket.
        unbalanced_indexes = [
            index
            for index in (
                find_first_unbalanced(url, opener, closer)
                for opener, closer in BRACKETS
            )
            if index is not None
        ]
        if unbalanced_indexes:
            url = url[:min(unbalanced_indexes)]

        # dots are also valid inside a URL even at trailing position.
        # this is inconvenient because dots also end sentences.
        # that's why we remove trailing dots.
        url = url.rstrip('.')

        # the candidate always keeps at least its "http://" prefix (which
        # contains neither brackets nor a trailing dot), so ``end`` is
        # strictly greater than ``start`` and the loop always advances.
        end = start + len(url)

        # only allow http and https schemes and require a hostname
        # which is a pretty good indicator that we have a valid URL.
        # urlparse raises ValueError for some malformed authorities; such
        # candidates are treated as "not a URL" instead of aborting the scan.
        try:
            parsed = urlparse(url)
            is_url = (
                parsed.scheme in ('http', 'https')
                and parsed.hostname is not None
            )
        except ValueError:
            is_url = False

        if is_url:
            yield (start, end)
        pos = end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment