@dreamlayers
Created June 21, 2016 00:30
Download photo information for a user from Panoramio and save it to a CSV file
#!/usr/bin/env python3
# Program for downloading photo information for a user from Panoramio
# and saving it to a CSV file.
# By Boris Gjenero <[email protected]>
#
# This program makes requests to Panoramio servers for every single photo.
# That's not very efficient. Don't run this often. It is meant for occasional
# personal backup purposes.
#
# Also see https://dreamlayers.blogspot.ca/2009/12/downloading-photos-coordinates-titles.html
# The get_panoramas interface mentioned there would be a lot more efficient,
# but it currently misses photos uploaded within the last few years.
#
# The intent here was to make this work with Python 2 and 3 but
# unicode errors on Python 2 suck.
from __future__ import print_function
import sys
import requests
import json
import csv
photo_data = {}
# Takes a list of keys and a list of dictionaries containing those keys,
# and adds the merged result to photo_data. For each key, the value will
# be a list made by concatenating values popped from the dictionaries in
# dictionary_list, in the order the dictionaries appear in dictionary_list.
# Values can be individual values or lists.
def append_dictionaries(keys, dictionary_list):
    for key in keys:
        newval = []
        for d in dictionary_list:
            oldval = d.pop(key)
            if isinstance(oldval, list):
                newval += oldval
            else:
                newval.append(oldval)
        if key in photo_data:
            print("Warning: duplicate data for ", key)
        photo_data[key] = newval
    for d in dictionary_list:
        if len(d) > 0:
            print("Warning: unused data: ", d)
# Get text from l between the before and after strings, starting the
# search at position start. Returns a tuple of the index where after
# begins and the extracted text, or None if either string is not found.
def get_from_to(l, before, after, start=0):
    b4len = len(before)
    stidx = l.find(before, start)
    if stidx < 0:
        return None
    endidx = l.find(after, stidx + b4len)
    if endidx < 0:
        return None
    return (endidx, l[stidx + b4len : endidx])
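# For example, get_from_to('<b>hi</b>', '<b>', '</b>') returns (5, 'hi'),
# where 5 is the index at which the closing '</b>' begins.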
# Wrapper for requests, allowing logging and delays to be added by
# editing this one function.
def do_request(url):
    text = requests.get(url).text
    return text
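# A minimal throttling sketch, assuming a fixed one-second delay is polite
# enough; swap the body in above if desired:
#     import time
#     time.sleep(1)  # wait before each request
#     return requests.get(url).text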
# Each function for web scraping has a process_ and a get_ variant.
# The process_ function takes a string, so it can be used for debugging.
# The get_ function actually does the web request and passes text to the
# corresponding process_ function.
# Parse the view counter JSON into a {photo_id: view_count} dictionary.
def process_counters(counter_text):
    res = {}
    j = json.loads(counter_text)
    text_dict = j['counter_views']
    for key, value in text_dict.items():
        # Values are text with the count before the first space;
        # anything without a leading count becomes 0.
        space_pos = value.find(' ')
        if space_pos < 1:
            res[int(key)] = 0
        else:
            res[int(key)] = int(value[:space_pos])
    return res
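# The response is JSON along the lines of (hypothetical values):
#   {"counter_views": {"12345": "67 views", "12346": ""}}
# which this function reduces to {12345: 67, 12346: 0}.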
def get_counters(user_id, photo_id_list):
    return process_counters(do_request(
        'http://www.panoramio.com/user_counter_snippet?user_id=' +
        str(user_id) + '&photo_ids=' +
        ','.join(map(str, photo_id_list))))
# Scrape longitude, latitude and the taken-on date from one photo page.
def process_photo(page_text):
    longitude = [ '' ]
    latitude = [ '' ]
    takenon = [ '' ]
    # Single-element lists are used so setiffound() can mutate them.
    def setiffound(l, found):
        if found:
            l[0] = found[1]
    for l in page_text.splitlines():
        setiffound(takenon, get_from_to(l, '<li>Taken on ', '</li>'))
        setiffound(longitude, get_from_to(l, '<abbr class="longitude" title="', '">'))
        setiffound(latitude, get_from_to(l, '<abbr class="latitude" title="', '">'))
    return longitude + latitude + takenon
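# The photo page lines being matched look roughly like (hypothetical values):
#   <abbr class="longitude" title="-79.347015">...</abbr>
#   <abbr class="latitude" title="43.651070">...</abbr>
#   <li>Taken on 2016-06-21</li>
# giving ['-79.347015', '43.651070', '2016-06-21']; missing fields stay ''.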
def get_photo(photo_id):
    return process_photo(do_request('http://www.panoramio.com/photo/' +
                                    str(photo_id)))
def get_photo_pages(photo_id_list):
    res = {}
    for photo_id in photo_id_list:
        res[photo_id] = get_photo(photo_id)
    return res
# This handles photo index pages. It builds several separate dictionaries
# and then merges them into photo_data via append_dictionaries().
def process_index(panoramio_user, page_text):
    # This function contains several nested functions which can be called
    # for a line. Each function returns the function which should be called
    # for the next line.
    # Construct a list of photo IDs from the photo_ids JavaScript variable,
    # whose format is sketched after this function. Redundant, but the data
    # is already downloaded, so who cares.
    photo_ids = []
    def photo_id_line(l):
        if l == '];':
            return default_line
        else:
            if l.endswith(','):
                photo_ids.append(int(l[:-1]))
            elif l != "":
                photo_ids.append(int(l))
            return photo_id_line
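    # For reference, the scraped block looks roughly like (hypothetical IDs):
    #   var photo_ids = [
    #   12345,
    #   12346
    #   ];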
    # Construct a dictionary mapping each photo ID to its list of tags.
    # The tagsByPhoto format is sketched after this function.
    photos_with_tag_info = []
    photo_to_tag = {}
    def photo_tag_line(l):
        if l == "};":
            return default_line
        else:
            if l.endswith("':["):
                photo_id = int(l[1:-3])
                photos_with_tag_info.append(photo_id)
                photo_to_tag[photo_id] = []
            elif l.endswith("',"):
                photo_to_tag[photos_with_tag_info[-1]].append(l[1:-2])
            return photo_tag_line
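    # For reference, the scraped block looks roughly like (hypothetical):
    #   var tagsByPhoto = {
    #   '12345':[
    #   'beach',
    #   'sunset',
    #   };
    # yielding photo_to_tag[12345] == ['beach', 'sunset'].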
    # Extract data from the photos JavaScript variable.
    photo_to_info = {}
    def var_photos_line(l):
        found = [0, '']
        while True:
            photodata = []
            for delimiter in ((', photoId: ', ', '),
                              (', title: "', '", snapped: '),
                              (', placeId: "', '", placeName: '),
                              (', placeName: "', '", approved: ')):
                found = get_from_to(l, delimiter[0], delimiter[1], found[0])
                if not found:
                    return
                photodata.append(found[1])
            # Skip past '", approved: ' (13 characters) to read the
            # true/false approved flag.
            if l[found[0] + 13:].startswith('true'):
                photodata.append(True)
            else:
                photodata.append(False)
            photo_to_info[int(photodata[0])] = photodata[1:]
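    # For reference, each UserPagePhoto entry contains fields like
    # (hypothetical values):
    #   ..., photoId: 12345, title: "My photo", snapped: ...,
    #   placeId: "abc", placeName: "Toronto", approved: true, ...
    # yielding photo_to_info[12345] == ['My photo', 'abc', 'Toronto', True].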
    # Default function, called for lines not handled by the functions above.
    # res[0] tracks whether another index page remains to be fetched.
    res = [ True ]
    def default_line(l):
        if l.startswith('var photo_ids = ['):
            return photo_id_line
        elif l.startswith('var tagsByPhoto = {'):
            return photo_tag_line
        elif l.startswith('var photos = [new pano.acrylic.UserPagePhoto({'):
            var_photos_line(l)
        elif res[0] and l.startswith('<div class="pages">') and \
             '<span class="inactive">Next' in l:
            # An inactive Next link means this is the last page.
            res[0] = False
        return default_line
    line_func = default_line
    for l in page_text.splitlines():
        line_func = line_func(l.strip())
    # Counters require a separate web request.
    photo_to_counter = get_counters(panoramio_user, photo_ids)
    # Coordinates and taken-on dates are retrieved by requesting individual
    # photo pages. Inefficient!
    photo_to_coords = get_photo_pages(photo_ids)
    # Add all data for this page to the photo_data dictionary.
    append_dictionaries(photo_ids, [photo_to_info, photo_to_counter,
                                    photo_to_coords, photo_to_tag])
    # True if there are more pages to process.
    return res[0]
def get_index(panoramio_user, page):
    return process_index(panoramio_user,
                         do_request('http://www.panoramio.com/user/' +
                                    str(panoramio_user) +
                                    '?show=all&comment_page=1&photo_page=' +
                                    str(page)))
# This function does all the scraping, one index page at a time.
def scrape_all(panoramio_user):
    page = 1
    while True:
        print("Processing page: " + str(page))
        if not get_index(panoramio_user, page):
            break
        page += 1
# Main code
if len(sys.argv) != 3:
    print('Usage: ' + sys.argv[0] + ' panoramio_user_number output_file_name.csv')
    sys.exit(1)
# The output file is opened first to try to make sure scraped data
# won't be lost if opening the output file fails.
# newline='' is how the csv module expects files to be opened on Python 3.
with open(sys.argv[2], 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    scrape_all(int(sys.argv[1]))
    keys = list(photo_data)
    keys.sort()
    for key in keys:
        writer.writerow([key] + photo_data[key])
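# Example invocation (hypothetical user number and script name):
#   ./panoramio_csv.py 1234567 panoramio_backup.csv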