Created
September 9, 2017 02:09
-
-
Save wetlife/c6a4784da5e6eee04c7fc7a7f6f36d56 to your computer and use it in GitHub Desktop.
Find every href in the file at file_path. Classify an href as absolute if it contains '://' and as relative otherwise, then report the counts of relative and absolute hrefs.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
from requests import request | |
from pprint import pprint | |
import re | |
# Count the relative and absolute hrefs found in a local HTML file.
#
# An href is treated as absolute when it contains the substring '://';
# every other href is treated as relative.
file_path = './index.html'

# Read the whole document up front so the file handle is released before parsing.
with open(file_path, encoding='utf8') as file_object:
    file_markup = file_object.read()

# The class is imported as `BeautifulSoup` (there is no `bs` alias in this
# file), so it must be called by that name; `bs(...)` raised NameError.
file_soup = BeautifulSoup(file_markup, 'lxml')

# Every tag that carries an href attribute, whatever the tag name is.
href_tags = file_soup.find_all(href=True)
all_hrefs = [tag['href'] for tag in href_tags]

# A plain substring test is enough: '://' has no regex metacharacters.
# Tuples are kept so the printed representation is the same as before.
absolute_hrefs = tuple(href for href in all_hrefs if '://' in href)
relative_hrefs = tuple(href for href in all_hrefs if '://' not in href)

print(f"absolute_hrefs: {absolute_hrefs}")
print(f"relative_hrefs: {relative_hrefs}")
print(f"{len(absolute_hrefs+relative_hrefs)} hrefs were found. {len(absolute_hrefs)} hrefs are absolute and {len(relative_hrefs)} hrefs are relative.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment