from bs4 import BeautifulSoup
import requests
import regex
def get(url):
    """Download the page at `url` and return it as a parsed BeautifulSoup object."""
    mycontent = requests.get(url)
    soup = BeautifulSoup(mycontent.text, "html.parser")
    return soup
# the links we want to visit
my_list_of_links = ["https://www.nytimes.com/2019/07/02/dining/ice-cream-shops.html"]

# the links that we don't want to visit twice
already_visited = []
# while we still have links to visit, continue
while len(my_list_of_links) > 0:
    # get the first element of the list and remove it from the list
    current_link = my_list_of_links.pop(0)
    # add the link to the "already_visited" list, to avoid visiting it twice
    already_visited.append(current_link)
    # get the html content from the link (URL)
    html = get(current_link)
    # for tomorrow, find a way to fill all these variables from the "html" object
    # I recommend that you check out the BeautifulSoup documentation to find what you need
    # you also need to read the html content to find the right html tags to extract
    url = None
    title = None
    content = None
    writing_time = None
    author = None
    crawling_time = None
    links = None
    keywords = None
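    # a minimal sketch of how a few of these could be filled in -- the tag
    # names here are assumptions, check the actual page source first:
    #   url = current_link
    #   title = html.find('h1').get_text() if html.find('h1') else None
    #   crawling_time = datetime.now()  # needs "from datetime import datetime" at the top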
    base_path = regex.match(r'^(.*/)[^/]*', current_link)[1]
    domain_name = regex.match(r'^[a-z]+://[^/]*', current_link)[0]
    print("base_path", base_path)
    print("domain_name", domain_name)
    # to put everything in a database, we need to (see the sketch below):
    # 1) connect to the db at the top of the script
    # 2) insert the values in this loop each time
    # 3) don't forget to commit after the execute
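    # a minimal sketch with sqlite3 -- the database file, table name and
    # columns are assumptions, adapt them to your own schema:
    #   import sqlite3                      # 1) at the top of the script,
    #   db = sqlite3.connect("crawler.db")  #    connect once before the loop
    #   db.execute("INSERT INTO pages (url, title, author) VALUES (?, ?, ?)",
    #              (url, title, author))    # 2) insert inside the loop
    #   db.commit()                         # 3) commit after the execute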
    # retrieve all the <a> tags to get new links
    all_a_tags = html.find_all('a')
    for tag in all_a_tags:
        link = tag.get('href')  # should look like https://www.nytimes.com
        # skip <a> tags that carry no href attribute at all
        if link is None:
            continue
        # only add the link if it's from the same website
        # a bare substring check is not valid for relative links, so we turn
        # relative links into absolute ones first => use regular expressions
        # 1) match the link against the regular expression r'(([a-z]+)://([^/]*))?(.*)'
        match = regex.match(r'(([a-z]+)://([^/]*))?(.*)', link)
        # recall that match[0] is the whole string, match[2] is the protocol,
        # match[3] is the domain name, match[4] is the path
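        # example: for link = "https://www.nytimes.com/section/food",
        #   match[1] = "https://www.nytimes.com", match[2] = "https",
        #   match[3] = "www.nytimes.com", match[4] = "/section/food"
        # for a relative link like "recipe.html" (a hypothetical name),
        # match[1] to match[3] are None and match[4] is "recipe.html"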
        # 2) if everything is None except the path (match[4]), it's a relative link
        if match and not match[1] and match[4]:
            # 3) turn the relative link into an absolute one
            if match[4][0] == '/':  # it's relative to the root of the webserver
                link = domain_name + link
            else:  # it's relative to our current path
                # we need to prepend the base path of our current URL
                link = base_path + link
        print("link", link)
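        # with the base_path and domain_name computed above, this turns e.g.
        #   "/section/food" into "https://www.nytimes.com/section/food"
        #   "recipe.html"   into "https://www.nytimes.com/2019/07/02/dining/recipe.html"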
        # only add the link if it comes from the same website
        if "https://www.nytimes.com" in link:
            # and only if we didn't already visit it or queue it up
            if link not in already_visited and link not in my_list_of_links:
                my_list_of_links.append(link)
print("number of visited links", len(already_visited)) | |
print("number of links", len(my_list_of_links)) | |