Created
August 25, 2016 18:01
-
-
Save artlogic/61f54ba6bad505b81ecca94d1a08d376 to your computer and use it in GitHub Desktop.
GPS Scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Print the EXIF GPS tags of every image linked from a tinypic browse page.

The script fetches one browse page, follows each link inside its
``<div class="browse">`` to the individual image page, pulls the real image
URL out of that page's ``<div id="imgFrame">``, downloads the image, and
prints its GPS longitude/latitude EXIF tags (``None`` when a tag is absent).

The report goes to stdout; problems with individual pages go to stderr and
the affected entry is skipped instead of aborting the whole run.
"""
from io import BytesIO
import sys
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import exifread
import requests

# the url to rip from... you could easily use .format to replace 1
# with any page you like, or even loop through many pages
url = 'http://tinypic.com/images.php?page=1'

# seconds to wait on any single HTTP request; requests applies no timeout
# by default, so without this a stalled server would hang the script forever
REQUEST_TIMEOUT = 30

# (label printed, EXIF tag name) for each GPS value in the report, in order
GPS_FIELDS = (
    ('Long', 'GPS GPSLongitude'),
    ('Long Ref', 'GPS GPSLongitudeRef'),
    ('Lat', 'GPS GPSLatitude'),
    ('Lat Ref', 'GPS GPSLatitudeRef'),
)


def fetch(target):
    """Return the HTTP response for *target*, or None if it can't be fetched.

    Connection errors, timeouts and non-success HTTP statuses are all
    reported on stderr and turned into a None return value.
    """
    try:
        response = requests.get(target, timeout=REQUEST_TIMEOUT)
        # an error page would otherwise be parsed as if it were real content
        response.raise_for_status()
    except requests.RequestException as err:
        print('Skipping {}: {}'.format(target, err), file=sys.stderr)
        return None
    return response


# retrieve the browse page; nothing else can be done without it
summary_req = fetch(url)
if summary_req is None:
    sys.exit(1)

# parse the text of the request into HTML
summary_html = BeautifulSoup(summary_req.text, 'html.parser')

# find the <div class="browse">...</div>
browse_div = summary_html.find('div', class_='browse')
if browse_div is None:
    sys.exit('No <div class="browse"> found at {}'.format(url))

# find all the <a> tags inside the browse div that actually carry an href
browse_links = browse_div.find_all('a', href=True)

# loop over the browse links
for link in browse_links:
    # the URL for the individual image page (resolved in case it is relative)
    image_page = urljoin(url, link['href'])

    # retrieve the image page
    image_page_req = fetch(image_page)
    if image_page_req is None:
        continue

    # parse the image page HTML
    image_page_html = BeautifulSoup(image_page_req.text, 'html.parser')

    # find the <div id="imgFrame">...</div> and the first <a href> inside it
    img_frame_div = image_page_html.find('div', id='imgFrame')
    img_anchor = None
    if img_frame_div is not None:
        img_anchor = img_frame_div.find('a', href=True)
    if img_anchor is None:
        print('Skipping {}: no image link found'.format(image_page),
              file=sys.stderr)
        continue
    img_frame_link = urljoin(image_page, img_anchor['href'])

    # request the actual image (jpg)
    image_req = fetch(img_frame_link)
    if image_req is None:
        continue

    # convert the response into an in memory file-like object
    image_file = BytesIO(image_req.content)

    # look for EXIF information
    tags = exifread.process_file(image_file)

    # print the link to the image
    print(img_frame_link)

    # print the GPS info - we use tags.get('blah') as opposed to
    # tags['blah'] because if 'blah' doesn't exist, get returns None
    # instead of giving us an error
    for label, tag_name in GPS_FIELDS:
        print('{}: {}'.format(label, tags.get(tag_name)))

    # print a blank line before the next entry
    print()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment