Created
October 23, 2019 11:11
-
-
Save pemagrg1/7cde994801455a4a056a287999c3ad60 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
regex based to search if a page is inner page or home page or category page. | |
""" | |
import re | |
def url_check(url):
    """Classify a URL as a home page, a category page, or a sub page.

    The URL is split on "/" and empty pieces are dropped. A leading
    "http:" or "https:" scheme piece (any letter case) is discarded, so
    the first remaining piece is treated as the host.

    Args:
        url: URL string, with or without a scheme.

    Returns:
        "homepage" when at most the host remains (including an empty or
        slash-only URL), "subpage" when the last path segment contains a
        digit or the text ".html", and "categorypage" otherwise.
    """
    segments = [part for part in url.split("/") if part]

    # Compare against the exact scheme piece: a substring test would also
    # throw away scheme-less hosts such as "httpbin.org", and indexing
    # without the emptiness check would fail on "" or "/".
    if segments and segments[0].lower() in ("http:", "https:"):
        segments.pop(0)

    # Nothing after the host means we are looking at the site root.
    if len(segments) <= 1:
        return "homepage"

    last_segment = segments[-1]
    # Article-like pages carry a number (date, id) or an .html file name
    # in their final path segment.
    if re.search(r"\d", last_segment) is not None or ".html" in last_segment:
        return "subpage"
    return "categorypage"
# Demo: classify one example URL of each kind and print the result.
for sample_url in (
    "https://edition.cnn.com/",
    "https://edition.cnn.com/health",
    "https://edition.cnn.com/2019/08/27/health/heart-disease-stroke-diabetes-death-rates-study/index.html",
):
    print(url_check(sample_url))
""" | |
RESULT: | |
homepage | |
categorypage | |
subpage | |
""" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment