Created
June 21, 2016 00:30
-
-
Save dreamlayers/3a065461c27c25539090530b8a892338 to your computer and use it in GitHub Desktop.
Download photo information for a user from Panoramio and save it to a CSV file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# Program for downloading photo information for a user from Panoramio | |
# and saving it to a CSV file. | |
# By Boris Gjenero <[email protected]> | |
# | |
# This program makes requests to Panoramio servers for every single photo. | |
# That's not very efficient. Don't run this often. It is meant for occasional | |
# personal backup purposes. | |
# | |
# Also see https://dreamlayers.blogspot.ca/2009/12/downloading-photos-coordinates-titles.html | |
# The get_panoramas interface mentioned there would be a lot more efficient, | |
# but it currently misses photos uploaded within the last few years. | |
# | |
# The intent here was to make this work with Python 2 and 3 but | |
# unicode errors on Python 2 suck. | |
from __future__ import print_function | |
import sys | |
import requests | |
import json | |
import csv | |
# Accumulated photo data: maps photo ID -> flat list of column values that
# will become one CSV row per photo.
photo_data = {}

def append_dictionaries(keys, dictionary_list):
    """Merge per-photo values from several dictionaries into photo_data.

    For each key, builds a flat list by concatenating the values found in
    each dictionary of dictionary_list, in the order the dictionaries
    appear.  Values may be scalars or lists.  Entries are pop()ed out of
    the source dictionaries, so anything left over afterwards was scraped
    but never merged, and a warning is printed for it.
    """
    for key in keys:
        merged = []
        # 'source' instead of 'dict' — avoid shadowing the builtin.
        for source in dictionary_list:
            value = source.pop(key)
            if isinstance(value, list):
                merged += value
            else:
                merged.append(value)
        if key in photo_data:
            print("Warning: duplicate data for ", key)
        photo_data[key] = merged
    for source in dictionary_list:
        if len(source) > 0:
            print("Warning: unused data: ", source)
# Get text from l, between before and after text,
# starting search at position x
def get_from_to(l, before, after, start=0):
    """Return (end_index, text) for the substring of l that sits between
    'before' and 'after', searching from position 'start'.  Returns None
    when either delimiter cannot be found."""
    begin = l.find(before, start)
    if begin < 0:
        return None
    payload_start = begin + len(before)
    end = l.find(after, payload_start)
    if end < begin:
        return None
    return (end, l[payload_start:end])
# Wrapper for requests, allowing logging and delays by editing code
def do_request(url):
    """Fetch url over HTTP and return the response body as text.

    Exists as a single hook point where logging or rate limiting can be
    added by editing the code."""
    response = requests.get(url)
    return response.text
# Each function for web scraping has a process_ and a get_ variant.
# The process_ function takes a string, so it can be used for debugging.
# The get_ function actually does the web request and passes text to the
# corresponding process_ function.
def process_counters(counter_text):
    """Parse the user_counter_snippet JSON and return {photo_id: views}.

    counter_text is JSON whose 'counter_views' member maps photo-ID
    strings to strings like "123 views".  A value with no space (e.g. an
    empty string) has no leading number and counts as 0 views.
    """
    res = {}
    counters = json.loads(counter_text)['counter_views']
    for key, value in counters.items():
        space_pos = value.find(' ')
        res[int(key)] = 0 if space_pos < 1 else int(value[:space_pos])
    return res
def get_counters(user_id, photo_id_list):
    """Fetch and parse view counters for the given photos of user_id."""
    ids = ','.join(str(photo_id) for photo_id in photo_id_list)
    url = ('http://www.panoramio.com/user_counter_snippet?user_id='
           + str(user_id) + '&photo_ids=' + ids)
    return process_counters(do_request(url))
def process_photo(page_text):
    """Scrape longitude, latitude and the 'Taken on' date from one photo
    page.

    Returns a 3-element list [longitude, latitude, taken_on]; any field
    that never appears on the page stays ''.  If a pattern matches on
    more than one line, the last match wins.
    """
    fields = {'longitude': '', 'latitude': '', 'takenon': ''}

    def capture(name, match):
        # get_from_to returns None on no match, else (end_index, text).
        if match:
            fields[name] = match[1]

    for line in page_text.splitlines():
        capture('takenon', get_from_to(line, '<li>Taken on ', '</li>'))
        capture('longitude',
                get_from_to(line, '<abbr class="longitude" title="', '">'))
        capture('latitude',
                get_from_to(line, '<abbr class="latitude" title="', '">'))
    return [fields['longitude'], fields['latitude'], fields['takenon']]
def get_photo(photo_id):
    """Download one photo page and scrape its coordinates and date."""
    url = 'http://www.panoramio.com/photo/' + str(photo_id)
    return process_photo(do_request(url))
def get_photo_pages(photo_id_list):
    """Scrape every photo page in photo_id_list.

    Returns {photo_id: [longitude, latitude, taken_on]}.  One web request
    per photo — slow, but the only way to get this data."""
    return {photo_id: get_photo(photo_id) for photo_id in photo_id_list}
# This is for photo index pages. It builds several separate dictionaries
# and then merges them while adding to photo_data, via append_dictionaries()
def process_index(panoramio_user, page_text):
    """Scrape one photo index page and merge its data into photo_data.

    Parses page_text line by line with a small state machine, then makes
    follow-up requests for view counters and individual photo pages.
    Returns True while more index pages remain, False on the last page.
    """
    # This function contains several different functions which can be called
    # for a line. Each function returns which function should be called for
    # the next line.
    # Construct list of photo IDs.
    # Redundant, but the data is already downloaded, so who cares.
    photo_ids = []
    # State entered after 'var photo_ids = [': one numeric ID per line,
    # optionally comma-terminated, until the closing '];'.
    def photo_id_line(l):
        if l == '];':
            return default_line
        else:
            if l.endswith(','):
                photo_ids.append(int(l[:-1]))
            elif l != "":
                photo_ids.append(int(l))
            return photo_id_line
    # Construct dictionary from photos to lists of tags for each photo
    # photos_with_tag_info remembers insertion order so tag lines can be
    # appended to the most recently seen photo ID.
    photos_with_tag_info = []
    photo_to_tag = {}
    # State entered after 'var tagsByPhoto = {': lines are either a
    # "'<id>':[" header starting a photo's tag list, or a "'<tag>',"
    # entry belonging to the most recent header, until the closing '};'.
    def photo_tag_line(l):
        if l == "};":
            return default_line
        else:
            if l.endswith("':["):
                photo_id = int(l[1:-3])
                photos_with_tag_info.append(photo_id)
                photo_to_tag[photo_id] = []
            elif l.endswith("',"):
                photo_to_tag[photos_with_tag_info[-1]].append(l[1:-2])
            return photo_tag_line
    # Extract data from the photos JavaScript variable
    photo_to_info = {}
    # Not a state: called once for the single long 'var photos = [...]'
    # line and loops over every UserPagePhoto record embedded in it.
    # found[0] carries the search position forward between delimiters.
    def var_photos_line(l):
        found = [0, '']
        while True:
            photodata = []
            # Each tuple is (text before field, text after field); the
            # fields extracted are photoId, title, placeId, placeName.
            for delimiter in ((', photoId: ', ', '),
                              (', title: "', '", snapped: '),
                              (', placeId: "', '", placeName: '),
                              (', placeName: "', '", approved: ')):
                found = get_from_to(l, delimiter[0], delimiter[1], found[0])
                if not found:
                    # No more photo records on this line; done.
                    return
                photodata.append(found[1])
            # 13 == len('", approved: '), so this reads the boolean value
            # that follows the last delimiter.
            if l[found[0] + 13:].startswith('true'):
                photodata.append(True)
            else:
                photodata.append(False)
            # Key by photoId; value is [title, placeId, placeName, approved].
            photo_to_info[int(photodata[0])] = photodata[1:]
    # Default function, called for lines not handled by functions above.
    # res is a one-element list so default_line (a closure) can mutate it:
    # res[0] stays True until the pager shows an inactive "Next" link,
    # which marks this as the last index page.
    res = [ True ]
    def default_line(l):
        if l.startswith('var photo_ids = ['):
            return photo_id_line
        elif l.startswith('var tagsByPhoto = {'):
            return photo_tag_line
        elif l.startswith('var photos = [new pano.acrylic.UserPagePhoto({'):
            var_photos_line(l)
        elif res[0] and l.startswith('<div class="pages">') and \
             '<span class="inactive">Next' in l:
            res[0] = False
        return default_line
    # Run the state machine: each handler returns the handler for the
    # next (stripped) line.
    line_func = default_line
    for l in page_text.splitlines():
        line_func = line_func(l.strip())
    # Counters require a separate web request
    photo_to_counter = get_counters(panoramio_user, photo_ids)
    # Coordinates and photo upload dates retrieved by requesting individual
    # photo pages. Inefficient!
    photo_to_coords = get_photo_pages(photo_ids)
    # Add all data for this page to photo_data dictionary
    append_dictionaries(photo_ids, [photo_to_info, photo_to_counter, \
                                    photo_to_coords, photo_to_tag])
    return res[0]
def get_index(panoramio_user, page):
    """Download and process one page of the user's photo index.

    Returns True while more index pages remain, False on the last one."""
    url = ('http://www.panoramio.com/user/' + str(panoramio_user)
           + '?show=all&comment_page=1&photo_page=' + str(page))
    return process_index(panoramio_user, do_request(url))
# This function does all the scraping
def scrape_all(panoramio_user):
    """Process the user's photo index pages in order, starting at page 1,
    until get_index() reports there are no more pages."""
    page = 1
    more_pages = True
    while more_pages:
        print("Processing page: " + str(page))
        more_pages = get_index(panoramio_user, page)
        page += 1
# Main code
if len(sys.argv) != 3:
    print('Usage: ' + sys.argv[0] + ' panoramio_user_number output_file_name.csv')
    sys.exit(-1)
# File opened first to try to make sure the scraping data
# won't be lost if output file opening fails.
# newline='' is required by the csv module; without it the writer emits
# blank rows on Windows (\r\r\n line endings).
with open(sys.argv[2], 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    scrape_all(int(sys.argv[1]))
    # One row per photo, sorted by photo ID.
    for key in sorted(photo_data):
        writer.writerow([key] + photo_data[key])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment