Skip to content

Instantly share code, notes, and snippets.

@DorHason
Created August 4, 2019 23:18
Show Gist options
  • Save DorHason/0d45dd77d7435c4a12076b9f948d1f67 to your computer and use it in GitHub Desktop.
Save DorHason/0d45dd77d7435c4a12076b9f948d1f67 to your computer and use it in GitHub Desktop.
"""
Page Scraper
an application which connects to a site and pulls out all links and images and prints them
"""
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
def get_links(url):
"""
returns a list with all the unique links from the url
url should start with 'http://'
if fails to open the url prints an error
"""
links = []
try:
# try to open the url
html_page = urlopen(url)
soup = BeautifulSoup(html_page, features="html.parser")
except:
print("Failed to open url")
else:
# extract only valid links
for link in soup.findAll('a', attrs={'href': re.compile("^http://")}):
links.append(link.get('href'))
return list(set(links))
def get_images(url):
"""
returns a list with all the unique images from the url
url should start with 'http://'
"""
images = []
try:
# try to open the url
html_page = urlopen(url)
soup = BeautifulSoup(html_page, features="html.parser")
except:
# does not print an error because one was already printed in get_links()
pass
else:
# find the base url
base_url = ''
for part in url.split('/')[:3]:
base_url += part + '/'
base_url = base_url[:len(base_url) - 1]
# extract only valid images
for image in soup.findAll('img'):
if 'http' in image.get('src'):
images.append(image.get('src'))
else:
images.append(base_url + image.get('src'))
return list(set(images))
def interface():
"""
The interface from which the program gets a valid url from the user
prints all the links and images from this url, and their number
"""
# asks for url until a valid url was entered
url = input("Please enter a url:\n")
while not re.match(r"^http://", url):
print("The url must start with 'http://', please try again")
url = input("Please enter a url:\n")
# extract
links = get_links(url)
images = get_images(url)
# check if the extraction succeeded, if so print the links and their number
if links:
print(f"l\nlinks (total {len(links)}):")
for link in links:
print(link)
# check if the extraction succeeded, if so print the images (= url's) and their number
if images:
print(f"\nimages (total {len(images)}):")
for image in images:
print(image)
if __name__ == '__main__':
interface()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment