Scrapes the page for links, filters them, and stores them as a CSV file
import csv
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

# Links from these domains are dropped
stop_links = ["flowingdata.com/", "facebook.com/", "twitter.com/", "eepurl.com/"]


def get_content(url):
    """Download the raw HTML of the page."""
    return requests.get(url).content


def get_soups(content):
    """Parse the HTML and return every anchor tag that has an href attribute."""
    return BeautifulSoup(content, 'lxml').find_all('a', href=True)


def WriteDictToCSV(dict_data):
    """Write the list of {link, title} dicts to links.csv."""
    csv_file = "links.csv"
    csv_columns = dict_data[0].keys()
    try:
        with open(csv_file, 'w', newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
            writer.writeheader()
            for data in dict_data:
                writer.writerow(data)
    except OSError:
        print("I/O error")


def filter_links(soups):
    """Keep only links whose domain is not in stop_links."""
    all_links = []
    for link in soups:
        url = link['href']
        domain = '{uri.netloc}/'.format(uri=urlparse(url))
        if domain not in stop_links:
            all_links.append(url)
    return all_links


def get_titles(links):
    """Fetch each link and record its <title>, skipping pages that cannot be reached."""
    data = []
    for link in links:
        try:
            r = requests.get(link).content
            title = str(BeautifulSoup(r, 'lxml').title.string)
            data.append({"link": link, "title": title})
        except Exception:
            print(link + ": Cannot be reached")
    return data


LINK_TO_SCRAPE = "http://flowingdata.com/2009/10/01/30-resources-to-find-the-data-you-need/"
WriteDictToCSV(get_titles(
    filter_links(get_soups(get_content(LINK_TO_SCRAPE)))))
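
The script depends on requests, beautifulsoup4, and lxml (the parser passed to BeautifulSoup), so those need to be installed before running it. It writes links.csv to the working directory with a link and a title column. A minimal sketch for checking the result, assuming links.csv has already been produced by the script above:

import csv

# Print every scraped link together with the page title recorded for it.
with open("links.csv", newline="") as f:
    for row in csv.DictReader(f):
        print(row["link"], "->", row["title"])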