Created
December 19, 2021 19:21
-
-
Save eliasdabbas/fc21e016e281b26ae52d427ab385008a to your computer and use it in GitHub Desktop.
Check if a given URL will be crawled or not given a set of conditions.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from urllib.parse import urlsplit, parse_qs | |
import re | |
def crawl_or_not(url,
                 exclude_url_params=None,
                 include_url_params=None,
                 include_url_pattern=None,
                 exclude_url_pattern=None):
    """Check if ``url`` will be crawled or not given the supplied conditions.

    Every supplied condition must hold for the URL to be crawled (they are
    combined with a logical AND). When no condition is supplied the result is
    ``True``.

    Note that these conditions only apply when discovering URLs by following
    links on pages. URLs provided to the ``url_list`` parameter of the
    ``crawl`` function will be crawled without taking into consideration any
    of these conditions.

    :param str url: The URL to check.
    :param list exclude_url_params: A list of URL parameters to exclude while
                                    crawling. URLs containing any of these
                                    parameters will not be crawled.
    :param list include_url_params: A list of URL parameters to include while
                                    crawling. Only URLs containing one or more
                                    of these parameters will be crawled.
    :param str include_url_pattern: A regular expression to include URLs. Only
                                    URLs matched by this regex (anywhere in
                                    the URL) will be crawled.
    :param str exclude_url_pattern: A regular expression to exclude URLs. If
                                    this regex matches the given URL it will
                                    not be crawled.
    :returns: ``True`` if ``url`` satisfies all supplied conditions, ``False``
              otherwise.
    :rtype: bool
    :raises ValueError: If the same parameter is listed in both
                        ``exclude_url_params`` and ``include_url_params``, or
                        if ``include_url_pattern`` and ``exclude_url_pattern``
                        are the same pattern.
    """
    if exclude_url_params is not None and include_url_params is not None:
        same_params = set(exclude_url_params).intersection(include_url_params)
        if same_params:
            # Sorted so the message is deterministic regardless of set order.
            raise ValueError("Please make sure you dont have the same parameters to exclude and include.\n"
                             f"Common parameters entered: {','.join(sorted(same_params))}")
    if include_url_pattern is not None and exclude_url_pattern is not None:
        # Compare the patterns themselves, not what they match in ``url``:
        # two different patterns that both fail to match would otherwise both
        # yield an empty result and be reported as "the same pattern".
        if include_url_pattern == exclude_url_pattern:
            raise ValueError("Please make sure you don't include and exclud the same pattern.\n"
                             f"You entered '{include_url_pattern}'")

    # keep_blank_values=True so a parameter that is present without a value
    # (e.g. "?color=") still counts as being in the URL.
    qs = parse_qs(urlsplit(url).query, keep_blank_values=True)

    supplied_conditions = []
    if exclude_url_params is not None:
        supplied_conditions.append(not set(exclude_url_params).intersection(qs))
    if include_url_params is not None:
        supplied_conditions.append(bool(set(include_url_params).intersection(qs)))
    if exclude_url_pattern is not None:
        supplied_conditions.append(re.search(exclude_url_pattern, url) is None)
    if include_url_pattern is not None:
        supplied_conditions.append(re.search(include_url_pattern, url) is not None)
    return all(supplied_conditions)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment