Created
June 21, 2016 00:30
-
-
Save dreamlayers/3a065461c27c25539090530b8a892338 to your computer and use it in GitHub Desktop.
Download photo information for a user from Panoramio and save it to a CSV file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# Program for downloading photo information for a user from Panoramio | |
# and saving it to a CSV file. | |
# By Boris Gjenero <[email protected]> | |
# | |
# This program makes requests to Panoramio servers for every single photo. | |
# That's not very efficient. Don't run this often. It is meant for occasional | |
# personal backup purposes. | |
# | |
# Also see https://dreamlayers.blogspot.ca/2009/12/downloading-photos-coordinates-titles.html | |
# The get_panoramas interface mentioned there would be a lot more efficient, | |
# but it currently misses photos uploaded within the last few years. | |
# | |
# The intent here was to make this work with Python 2 and 3 but | |
# unicode errors on Python 2 suck. | |
from __future__ import print_function | |
import sys | |
import requests | |
import json | |
import csv | |
# Accumulated photo data: maps photo ID -> flat list of column values that
# will become one CSV row per photo.
photo_data = {}

def append_dictionaries(keys, dictionary_list):
    """Merge per-photo values from several dictionaries into photo_data.

    For each key, builds a flat list by concatenating the values found in
    each dictionary of dictionary_list, in the order the dictionaries
    appear.  Values may be scalars or lists.  Entries are pop()ed out of
    the source dictionaries, so anything left over afterwards was scraped
    but never merged, and a warning is printed for it.
    """
    for key in keys:
        merged = []
        # 'source' instead of 'dict' — avoid shadowing the builtin.
        for source in dictionary_list:
            value = source.pop(key)
            if isinstance(value, list):
                merged += value
            else:
                merged.append(value)
        if key in photo_data:
            print("Warning: duplicate data for ", key)
        photo_data[key] = merged
    for source in dictionary_list:
        if len(source) > 0:
            print("Warning: unused data: ", source)
# Get text from l, between before and after text,
# starting search at position x
def get_from_to(l, before, after, start=0):
    """Return (end_index, text) for the substring of l that sits between
    'before' and 'after', searching from position 'start'.  Returns None
    when either delimiter cannot be found."""
    begin = l.find(before, start)
    if begin < 0:
        return None
    payload_start = begin + len(before)
    end = l.find(after, payload_start)
    if end < begin:
        return None
    return (end, l[payload_start:end])
# Wrapper for requests, allowing logging and delays by editing code
def do_request(url):
    """Fetch url over HTTP and return the response body as text.

    Exists as a single hook point where logging or rate limiting can be
    added by editing the code."""
    response = requests.get(url)
    return response.text
# Each function for web scraping has a process_ and a get_ variant.
# The process_ function takes a string, so it can be used for debugging.
# The get_ function actually does the web request and passes text to the
# corresponding process_ function.
def process_counters(counter_text):
    """Parse the user_counter_snippet JSON and return {photo_id: views}.

    counter_text is JSON whose 'counter_views' member maps photo-ID
    strings to strings like "123 views".  A value with no space (e.g. an
    empty string) has no leading number and counts as 0 views.
    """
    res = {}
    counters = json.loads(counter_text)['counter_views']
    for key, value in counters.items():
        space_pos = value.find(' ')
        res[int(key)] = 0 if space_pos < 1 else int(value[:space_pos])
    return res
def get_counters(user_id, photo_id_list):
    """Fetch and parse view counters for the given photos of user_id."""
    ids = ','.join(str(photo_id) for photo_id in photo_id_list)
    url = ('http://www.panoramio.com/user_counter_snippet?user_id='
           + str(user_id) + '&photo_ids=' + ids)
    return process_counters(do_request(url))
def process_photo(page_text):
    """Scrape longitude, latitude and the 'Taken on' date from one photo
    page.

    Returns a 3-element list [longitude, latitude, taken_on]; any field
    that never appears on the page stays ''.  If a pattern matches on
    more than one line, the last match wins.
    """
    fields = {'longitude': '', 'latitude': '', 'takenon': ''}

    def capture(name, match):
        # get_from_to returns None on no match, else (end_index, text).
        if match:
            fields[name] = match[1]

    for line in page_text.splitlines():
        capture('takenon', get_from_to(line, '<li>Taken on ', '</li>'))
        capture('longitude',
                get_from_to(line, '<abbr class="longitude" title="', '">'))
        capture('latitude',
                get_from_to(line, '<abbr class="latitude" title="', '">'))
    return [fields['longitude'], fields['latitude'], fields['takenon']]
def get_photo(photo_id):
    """Download one photo page and scrape its coordinates and date."""
    url = 'http://www.panoramio.com/photo/' + str(photo_id)
    return process_photo(do_request(url))
def get_photo_pages(photo_id_list):
    """Scrape every photo page in photo_id_list.

    Returns {photo_id: [longitude, latitude, taken_on]}.  One web request
    per photo — slow, but the only way to get this data."""
    return {photo_id: get_photo(photo_id) for photo_id in photo_id_list}
# This is for photo index pages. It builds several separate dictionaries
# and then merges them while adding to photo_data, via append_dictionaries()
def process_index(panoramio_user, page_text):
    """Scrape one photo index page and merge its data into photo_data.

    Parses page_text line by line with a small state machine, then makes
    follow-up requests for view counters and individual photo pages.
    Returns True while more index pages remain, False on the last page.
    """
    # This function contains several different functions which can be called
    # for a line. Each function returns which function should be called for
    # the next line.
    # Construct list of photo IDs.
    # Redundant, but the data is already downloaded, so who cares.
    photo_ids = []
    # State entered after 'var photo_ids = [': one numeric ID per line,
    # optionally comma-terminated, until the closing '];'.
    def photo_id_line(l):
        if l == '];':
            return default_line
        else:
            if l.endswith(','):
                photo_ids.append(int(l[:-1]))
            elif l != "":
                photo_ids.append(int(l))
            return photo_id_line
    # Construct dictionary from photos to lists of tags for each photo
    # photos_with_tag_info remembers insertion order so tag lines can be
    # appended to the most recently seen photo ID.
    photos_with_tag_info = []
    photo_to_tag = {}
    # State entered after 'var tagsByPhoto = {': lines are either a
    # "'<id>':[" header starting a photo's tag list, or a "'<tag>',"
    # entry belonging to the most recent header, until the closing '};'.
    def photo_tag_line(l):
        if l == "};":
            return default_line
        else:
            if l.endswith("':["):
                photo_id = int(l[1:-3])
                photos_with_tag_info.append(photo_id)
                photo_to_tag[photo_id] = []
            elif l.endswith("',"):
                photo_to_tag[photos_with_tag_info[-1]].append(l[1:-2])
            return photo_tag_line
    # Extract data from the photos JavaScript variable
    photo_to_info = {}
    # Not a state: called once for the single long 'var photos = [...]'
    # line and loops over every UserPagePhoto record embedded in it.
    # found[0] carries the search position forward between delimiters.
    def var_photos_line(l):
        found = [0, '']
        while True:
            photodata = []
            # Each tuple is (text before field, text after field); the
            # fields extracted are photoId, title, placeId, placeName.
            for delimiter in ((', photoId: ', ', '),
                              (', title: "', '", snapped: '),
                              (', placeId: "', '", placeName: '),
                              (', placeName: "', '", approved: ')):
                found = get_from_to(l, delimiter[0], delimiter[1], found[0])
                if not found:
                    # No more photo records on this line; done.
                    return
                photodata.append(found[1])
            # 13 == len('", approved: '), so this reads the boolean value
            # that follows the last delimiter.
            if l[found[0] + 13:].startswith('true'):
                photodata.append(True)
            else:
                photodata.append(False)
            # Key by photoId; value is [title, placeId, placeName, approved].
            photo_to_info[int(photodata[0])] = photodata[1:]
    # Default function, called for lines not handled by functions above.
    # res is a one-element list so default_line (a closure) can mutate it:
    # res[0] stays True until the pager shows an inactive "Next" link,
    # which marks this as the last index page.
    res = [ True ]
    def default_line(l):
        if l.startswith('var photo_ids = ['):
            return photo_id_line
        elif l.startswith('var tagsByPhoto = {'):
            return photo_tag_line
        elif l.startswith('var photos = [new pano.acrylic.UserPagePhoto({'):
            var_photos_line(l)
        elif res[0] and l.startswith('<div class="pages">') and \
             '<span class="inactive">Next' in l:
            res[0] = False
        return default_line
    # Run the state machine: each handler returns the handler for the
    # next (stripped) line.
    line_func = default_line
    for l in page_text.splitlines():
        line_func = line_func(l.strip())
    # Counters require a separate web request
    photo_to_counter = get_counters(panoramio_user, photo_ids)
    # Coordinates and photo upload dates retrieved by requesting individual
    # photo pages. Inefficient!
    photo_to_coords = get_photo_pages(photo_ids)
    # Add all data for this page to photo_data dictionary
    append_dictionaries(photo_ids, [photo_to_info, photo_to_counter, \
                                    photo_to_coords, photo_to_tag])
    return res[0]
def get_index(panoramio_user, page):
    """Download and process one page of the user's photo index.

    Returns True while more index pages remain, False on the last one."""
    url = ('http://www.panoramio.com/user/' + str(panoramio_user)
           + '?show=all&comment_page=1&photo_page=' + str(page))
    return process_index(panoramio_user, do_request(url))
# This function does all the scraping
def scrape_all(panoramio_user):
    """Process the user's photo index pages in order, starting at page 1,
    until get_index() reports there are no more pages."""
    page = 1
    more_pages = True
    while more_pages:
        print("Processing page: " + str(page))
        more_pages = get_index(panoramio_user, page)
        page += 1
# Main code
if len(sys.argv) != 3:
    print('Usage: ' + sys.argv[0] + ' panoramio_user_number output_file_name.csv')
    sys.exit(-1)
# File opened first to try to make sure the scraping data
# won't be lost if output file opening fails.
# newline='' is required by the csv module; without it the writer emits
# blank rows on Windows (\r\r\n line endings).
with open(sys.argv[2], 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    scrape_all(int(sys.argv[1]))
    # One row per photo, sorted by photo ID.
    for key in sorted(photo_data):
        writer.writerow([key] + photo_data[key])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment