# import dependencies
import urllib2, json, arcpy
import numpy as np
import pandas as pd
from django.utils.encoding import smart_str
import time
##################################################################################
# VARIABLES
# set working directory
workingPath = 'D:/Projects/Panoramio'
# set output file path/name
outFileName = workingPath + '/outputPhotoList.csv'
# set bounding x,y values
minX = -92.335981
minY = 46.630695
maxX = -91.946101
maxY = 46.804721
# set up panoramio query strings
# getPhotoCount(url)  # stray call left over from testing -- the function is defined below
url = 'https://api.instagram.com/v1/media/search?lat=34.048502&lng=-118.246008&distance=5000&access_token=1771051239.ab103e5.7e013b99ce924cb7a894ecd0dd030be5&callback=?&count=500'
initialGET = 'http://www.panoramio.com/map/get_panoramas.php?set=full&from=0&to=100&minx=%s&miny=%s&maxx=%s&maxy=%s&size=medium&mapfilter=true' % (minX, minY, maxX, maxY)
stringGET_1 = 'http://www.panoramio.com/map/get_panoramas.php?set=full&from='
stringGET_2 = '&to='
stringGET_3 = '&minx=%s&miny=%s&maxx=%s&maxy=%s&size=medium&mapfilter=true' % (minX, minY, maxX, maxY)
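# A minimal sketch (an assumption about intent, not part of the original script)
# of how the stringGET_* pieces above could be combined to page through
# Panoramio results in 100-record windows, matching the 0-100 range in initialGET:
pageStart = 0  # hypothetical paging offset
pagedGET = stringGET_1 + str(pageStart) + stringGET_2 + str(pageStart + 100) + stringGET_3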
# output field names
outFileHeaders = ['upload_date','owner_name','photo_id','longitude','height','width','photo_title','latitude','owner_url','photo_url','photo_file_url','owner_id']
# arcpy variables
inTable = workingPath + '/outputPhotoList.csv'
xCoords = 'longitude'
yCoords = 'latitude'
outLayer = 'pointLayer'
savedLayer = workingPath + '/photoPoints.shp'
spRef = r"C:\Users\Rdebbout\AppData\Roaming\ESRI\Desktop10.3\ArcMap\Coordinate Systems\GCS_WGS_1984.prj"  # raw string so the backslashes are not treated as escapes
##################################################################################
# FUNCTIONS
# grid spacing notes: 5 km spacing ~ 0.04491265 / 0.0449157 degrees; divided by 2: 0.022456325
def getPhotoCount(url):
    # query website, parse JSON, and return the list of records under 'data'
    urlResponse = urllib2.urlopen(url).read()
    parsedResponse = json.loads(urlResponse)
    queryCount = parsedResponse['data']
    # print 'Query count returned: ' + str(len(queryCount))
    return queryCount
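# A minimal usage sketch (not part of the original flow at this point): the
# Instagram media/search URL defined above returns its records under a
# top-level 'data' key, which is what getPhotoCount pulls out.
sampleRecords = getPhotoCount(url)  # sampleRecords is a hypothetical name
print 'records returned: ' + str(len(sampleRecords))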
def getRecords(data):
    # pull location/user fields out of each record and append them to the
    # module-level tbl DataFrame (tbl, klip, and cols are defined below)
    global tbl  # tbl is re-bound inside the function, so it must be declared global
    for loc in range(len(data)):
        if data[loc]['id'] in klip:
            print data[loc]['id']
        if data[loc]['id'] not in klip:
            ID = data[loc]['location']['id']
            latitude = data[loc]['location']['latitude']
            longitude = data[loc]['location']['longitude']
            title = smart_str(data[loc]['location']['name'])
            url = smart_str(data[loc]['images']['standard_resolution']['url'])
            username = smart_str(data[loc]['user']['username'])
            tbl = tbl.append(pd.DataFrame([[latitude, longitude, ID, username, title, url]], columns=cols), ignore_index=True)
            klip.append(data[loc]['id'])

# grid spacing reference:
# 5 km      0.0449157
# 2.5 km    0.02245785
# 1.25 km   0.011228925
# 0.625 km  0.0056144625
# scratch checks on the encoding of the location name:
# data[loc]['location']['name'].encode()
# type(data[loc]['location']['name'])
# 7 km spacing ex.
klip = []
cols = ['latitude', 'longitude', 'id', 'username', 'title', 'url']
tbl = pd.DataFrame()
len(np.arange(minX, maxX, 0.0056144625))
len(np.arange(minY, maxY, 0.0056144625))
for xcoord in np.arange(minX, maxX, 0.0056144625):
    for ycoord in np.arange(minY, maxY, 0.0056144625):
        url = 'https://api.instagram.com/v1/media/search?lat=%s&lng=%s&distance=625&access_token=1771051239.ab103e5.7e013b99ce924cb7a894ecd0dd030be5&callback=?&count=100' % (ycoord, xcoord)  # &callback=?&count=500
        data = getPhotoCount(url)
        if len(data) == 100:  # flag grid cells where the 100-record cap was hit
            print 'X: %s Y: %s has %s returns.' % (xcoord, ycoord, str(len(data)))
        for loc in range(len(data)):
            # if data[loc]['id'] in klip:
            #     print data[loc]['id']
            if data[loc]['id'] not in klip:
                ID = data[loc]['location']['id']
                latitude = data[loc]['location']['latitude']
                longitude = data[loc]['location']['longitude']
                title = smart_str(data[loc]['location']['name'])
                url = smart_str(data[loc]['images']['standard_resolution']['url'])
                username = smart_str(data[loc]['user']['username'])
                tbl = tbl.append(pd.DataFrame([[latitude, longitude, ID, username, title, url]], columns=cols), ignore_index=True)
                klip.append(data[loc]['id'])
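# outFileName is defined above but never used in the original script; a minimal
# sketch (an assumption about intent) of writing the accumulated table out to it:
tbl.to_csv(outFileName, index=False)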
# note: tbl2 is not built until further down; this write-out block appears
# out of order in the scratch file
tbl2.columns = ['latitude', 'longitude', 'insta_id', 'username', 'title', 'url']
tbl2.to_csv(workingPath + '/outputPhotoList_locID.csv', index=False)
t1 = pd.read_csv(workingPath + '/outputPhotoListCeck.csv')
t2 = pd.read_csv(workingPath + '/outputPhotoList.csv')
count = 0
t1.columns
for x in t2.insta_id.values:
    if x not in t1.insta_id.values:
        print x
        count += 1
print count
url = 'https://api.instagram.com/v1/users/1771051239/?access_token=1771051239.ab103e5.7e013b99ce924cb7a894ecd0dd030be5'
arcpy.MakeXYEventLayer_management(inTable, xCoords, yCoords, outLayer, spRef)
arcpy.CopyFeatures_management(outLayer, savedLayer)  # was arcpy.arcpy.CopyFeatures_management
arcpy.Delete_management(outLayer)
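# A small sketch (an assumption, not in the original) that could precede the
# export above so re-running the scratch file does not fail on an existing shapefile:
arcpy.env.overwriteOutput = True
if arcpy.Exists(savedLayer):
    arcpy.Delete_management(savedLayer)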
type(username)
import requests
results = requests.get(url)
type(results)
data = results.json()
type(data)
results.headers
data['data'][90]['images']['standard_resolution']['url']
data['data'][90]['videos']['standard_resolution']['url']
'videos' in data['data'][83]
payload = {'lat': '34.048502',
           'lng': '-118.246008',
           'distance': '250',
           'access_token': '1771051239.ab103e5.7e013b99ce924cb7a894ecd0dd030be5',
           'count': '100'}
url = 'https://api.instagram.com/v1/media/search'
# def getPhotoCount(url, payload):
#     # query website, parse JSON, and return photo count
#     r = requests.get(url, params=payload)
#     parsedResponse = r.json()
#     query = parsedResponse['data']
#     # print 'Query count returned: ' + str(len(query))
#     return query
url = 'https://api.instagram.com/v1/users/self/media/recent/?access_token=1771051239.ab103e5.7e013b99ce924cb7a894ecd0dd030be5'
data = getPhotoCount(url)
r.text
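# A working sketch of the requests-based variant commented out above
# (getPhotoCountRequests is a hypothetical name; behavior mirrors getPhotoCount):
def getPhotoCountRequests(url, payload):
    # query the endpoint with query-string params, parse JSON, return the 'data' list
    r = requests.get(url, params=payload)
    return r.json()['data']
# e.g. data = getPhotoCountRequests('https://api.instagram.com/v1/media/search', payload)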
klip = []
recirc = []
cols = ['latitude', 'longitude', 'id', 'username', 'title', 'tags', 'url']
tbl = pd.DataFrame()
token = '1771051239.ab103e5.7e013b99ce924cb7a894ecd0dd030be5'
# url = 'https://api.instagram.com/v1/media/search'
count = 0
for xcoord in np.arange(minX, maxX, 0.0449157):
    for ycoord in np.arange(minY, maxY, 0.0449157):
        url = 'https://api.instagram.com/v1/media/search?lat=%s&lng=%s&distance=5000&access_token=%s&callback=?&count=500' % (ycoord, xcoord, token)  # &callback=?&count=500
        data = getPhotoCount(url)
        count += 1
        print count
        if len(data) == 100:  # flag cells where the 100-record cap was hit
            print 'X: %s Y: %s has %s returns.' % (xcoord, ycoord, str(len(data)))
        for loc in range(len(data)):
            chk = data[loc]['id']
            if chk not in klip:
                ID = data[loc]['location']['id']
                if ID not in recirc:
                    recirc.append(ID)
                latitude = data[loc]['location']['latitude']
                longitude = data[loc]['location']['longitude']
                title = smart_str(data[loc]['location']['name'])
                if 'videos' in data[loc]:
                    # print 'video ' + str(loc)
                    url = data[loc]['videos']['standard_resolution']['url']
                else:
                    url = smart_str(data[loc]['images']['standard_resolution']['url'])
                username = smart_str(data[loc]['user']['username'])
                tags = smart_str(", ".join(data[loc]['tags']))  # was data[0]['tags'], which repeated the first record's tags
                tbl = tbl.append(pd.DataFrame([[latitude, longitude, ID, username, title, tags, url]], columns=cols), ignore_index=True)
                klip.append(chk)
print 'Recirc: %s' % str(len(recirc))
tbl2 = pd.DataFrame()
for rec in recirc:
    print '********'
    print rec
    url = 'https://api.instagram.com/v1/locations/%s/media/recent?access_token=%s&count=500' % (rec, token)
    try:
        data = getPhotoCount(url)
    except urllib2.HTTPError:
        # wait and retry once on HTTP errors (e.g. rate limiting)
        time.sleep(20)
        data = getPhotoCount(url)
    print '********'
    print len(data)
    for loc in range(len(data)):
        chk = data[loc]['id']
        if chk not in klip:
            ID = data[loc]['location']['id']
            print '********'
            print ID
            # recirc.append(ID)
            latitude = data[loc]['location']['latitude']
            longitude = data[loc]['location']['longitude']
            title = smart_str(data[loc]['location']['name'])
            if 'videos' in data[loc]:
                # print 'video ' + str(loc)
                url = data[loc]['videos']['standard_resolution']['url']
            else:
                url = smart_str(data[loc]['images']['standard_resolution']['url'])
            username = smart_str(data[loc]['user']['username'])
            tags = smart_str(", ".join(data[loc]['tags']))  # was data[0]['tags']
            tbl2 = tbl2.append(pd.DataFrame([[latitude, longitude, ID, username, title, tags, url]], columns=cols), ignore_index=True)
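# The try/except above retries a failed request exactly once after a 20 second
# pause; a small generalized sketch (fetchWithRetry is a hypothetical helper,
# not part of the original script):
def fetchWithRetry(url, tries=3, wait=20):
    # call getPhotoCount, sleeping and retrying on HTTP errors up to `tries` times
    for attempt in range(tries):
        try:
            return getPhotoCount(url)
        except urllib2.HTTPError:
            if attempt == tries - 1:
                raise
            time.sleep(wait)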
chktbl = pd.concat([tbl1_25, tbl2_5])
str([str(x) for x in ", ".join(data[0]['tags'])])
tags = smart_str(", ".join(data[0]['tags']))
ycoord = np.arange(minY, maxY, 0.0449157)[0]
xcoord = np.arange(minX, maxX, 0.0449157)[1]
minX = -92.335981
minY = 46.630695
maxX = -91.946101
maxY = 46.804721
from math import radians, sin, cos, sqrt, asin
def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # Earth radius in kilometers
    dLat = radians(lat2 - lat1)
    dLon = radians(lon2 - lon1)
    lat1 = radians(lat1)
    lat2 = radians(lat2)
    a = sin(dLat/2)**2 + cos(lat1)*cos(lat2)*sin(dLon/2)**2
    c = 2*asin(sqrt(a))
    return R * c
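# The grid-spacing constants used above (0.0449157 for 5 km, 0.02245785 for
# 2.5 km, 0.0056144625 for 0.625 km) are the target spacing divided by roughly
# 111.3 km per degree of latitude; a sketch of that conversion (kmToDegLat is a
# hypothetical helper, not part of the original script):
def kmToDegLat(km):
    # one degree of latitude spans roughly 111.32 km on a spherical Earth
    return km / 111.32
# e.g. kmToDegLat(5) ~ 0.0449, kmToDegLat(0.625) ~ 0.0056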
haversine(46.630695, -92.335981, 46.630695, -91.946101)
count = 0
for url in tbl.url.values:
    if url in tbl2.url.values:
        count += 1
chks = []
flop = []
for x in range(20):
    data = getPhotoCount(url)
    print len(data)
    for loc in range(len(data)):
        chk = data[loc]['id']
        ID = data[loc]['location']['id']
        chks.append(chk)
        flop.append(ID)
len(set(chks))
import requests
results = requests.get(url)