Last active
May 30, 2021 06:24
-
-
Save revox/5dc8e037445db4590b81 to your computer and use it in GitHub Desktop.
Basic scrape and write to CSV example using BS4
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# basic scrape and write demonstration used in Goldsmiths digital sandbox 2014
import urllib # fetches raw web pages for us | |
import bs4 # turns raw web pages into object hierarchy and provides selectors (like CSS and Xpath does) | |
import csv # simplifies the process of writing data to Comma Separated Values in a file | |
# a list of URLs on YouTube that we want to scrape data from
pagesToScrape = ['http://www.youtube.com/watch?v=9hIQjrMHTv4',
                 'https://www.youtube.com/watch?v=Uk8x3V-sUgU']

# urlopen lives in urllib on Python 2 and in urllib.request on Python 3;
# try the Python 3 location first and fall back to the Python 2 one
try:
    from urllib.request import urlopen
except ImportError:
    from urllib import urlopen

# open a file in append mode to write into in the same directory where we ran this script from;
# the 'with' statement guarantees the file is closed even if scraping fails part-way through
# NOTE(review): on Python 3 the csv docs recommend open(..., newline=''); Python 2's open()
# has no such argument, so the portable two-argument call is kept here
with open('tubedata.csv', 'a') as csvfile:
    csvwriter = csv.writer(csvfile)

    # loop over our list of URLs one at a time
    for URL in pagesToScrape:
        # fetch webpage; a network failure skips this URL instead of ending the whole run
        try:
            webpage = urlopen(URL)
        except IOError as err:
            print('could not fetch %s: %s' % (URL, err))
            continue

        # make an object from the HTML, then release the connection whatever happens;
        # naming the parser keeps the result independent of which parsers are installed
        try:
            soup = bs4.BeautifulSoup(webpage, 'html.parser')
        finally:
            webpage.close()

        # extract info from soup; find() returns None when an element is missing, which
        # surfaces as AttributeError on the next attribute access, and int() raises
        # ValueError if the view count is not a plain number
        try:
            # remove excess white space around the title using the strip() function
            title = soup.find('span', {'id': 'eow-title'}).string.strip()
            uploader = soup.find('div', {'class': 'yt-user-info'}).a.string
            # drop the first 12 characters of the text (presumably a label such as
            # 'Uploaded on ' in front of the date; verify against the live page)
            date = soup.find('div', {'id': 'watch-uploader-info'}).strong.text[12:]
            # another way of using BS4 to select stuff
            views = soup.find("div", class_="watch-view-count").string
            # views is a number with commas in it, make it a proper number by removing the commas
            views = int(views.replace(',', ''))
        except (AttributeError, ValueError) as err:
            print('could not parse %s: %s' % (URL, err))
            continue

        # print info to screen (spacing matches what the original print statements produced)
        print('title:  %s' % title)
        print('uploader:  %s' % uploader)
        print('date:  %s' % date)
        print('views: %s' % views)

        csvwriter.writerow([title, uploader, date, views])  # write a row in the file
# FURTHER EXERCISES
# ruggedise your script by using the Python 'with' statement to open your file and the 'try-except' structure to handle errors
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment