Last active
April 11, 2018 02:01
-
-
Save uluQulu/f38ee4dfe06b5a1be3260ab5e9147d6c to your computer and use it in GitHub Desktop.
location link grabber function for @dimahadghi
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_links_for_location(browser,
                           location,
                           amount,
                           logger,
                           media=None,
                           skip_top_posts=True):
    """Collect up to `amount` post links from an Instagram location page.

    Opens ``https://www.instagram.com/explore/locations/<location>``, reads
    the links currently present via ``get_links`` and keeps scrolling the
    page until enough unique links were gathered or the page stops
    yielding new ones.

    Args:
        browser: driver object; must provide ``get``, ``execute_script``,
            ``find_element_by_xpath`` and ``find_element_by_tag_name``
            (presumably a Selenium WebDriver -- confirm against the caller).
        location: location path fragment appended verbatim to the
            explore/locations URL.
        amount: maximum number of links to return.
        logger: object with an ``info`` method used for progress messages.
        media: ``None`` selects ``['', 'Post', 'Video']``; ``'Photo'``
            selects ``['', 'Post']``; any other value is wrapped in a
            one-element list. The resulting list is forwarded to
            ``get_links``.
        skip_top_posts: when true, links are read from
            ``//main/article/div[2]``; otherwise from the whole ``<main>``
            element.

    Returns:
        A list holding at most `amount` links. If the very first
        ``get_links`` call yields nothing, no scrolling is attempted and an
        empty list is returned.
    """
    if media is None:
        # All known media types
        media = ['', 'Post', 'Video']
    elif media == 'Photo':
        # Include posts with multiple images in it
        media = ['', 'Post']
    else:
        # Make it an array to use it in the following part
        media = [media]

    browser.get('https://www.instagram.com/explore/locations/' + location)
    # update server calls
    update_activity()
    sleep(2)

    # Only the lookup is kept (its result was never used): it preserves the
    # original behaviour of failing right here when the article is missing.
    browser.find_element_by_xpath('//main/article/div[1]')
    sleep(1)

    if skip_top_posts:
        main_elem = browser.find_element_by_xpath('//main/article/div[2]')
    else:
        main_elem = browser.find_element_by_tag_name('main')

    link_elems = main_elem.find_elements_by_tag_name('a')
    sleep(1)

    if not link_elems:
        # this location does not have `Top Posts` or it really is empty..
        main_elem = browser.find_element_by_xpath('//main/article/div[1]')
    sleep(2)

    # Get links
    links = get_links(browser, location, logger, media, main_elem)
    filtered_links = len(links)  # unique-link count after the last progress
    try_again = 0                # consecutive scroll rounds without new links
    sc_rolled = 0                # scroll commands sent since the last long rest
    nap = 1.5                    # pause between scroll commands, in seconds
    put_sleep = 0                # number of long rest + reload recoveries used

    # Plain comparison instead of `in range(1, amount)`: same result for
    # integers without building a range object on every iteration.
    while 1 <= filtered_links < amount:
        if sc_rolled > 100:
            logger.info("Scrolled too much! ~ sleeping a bit :>")
            sleep(600)
            sc_rolled = 0

        for _ in range(3):
            browser.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")
            sc_rolled += 1
            update_activity()
            # if not slept, and internet speed is low, instagram will only
            # scroll one time, instead of many times you sent scroll
            # command...
            sleep(nap)
        sleep(3)

        links.extend(get_links(browser, location, logger, media, main_elem))

        # uniqify links while preserving order
        seen = set()
        unique_links = []
        for link in links:
            if link not in seen:
                seen.add(link)
                unique_links.append(link)
        links = unique_links

        if len(links) != filtered_links:
            # Progress was made: record it and reset the back-off state.
            filtered_links = len(links)
            try_again = 0
            nap = 1.5
            continue

        # No new links showed up in this round: back off and retry.
        try_again += 1
        nap = 3 if try_again == 1 else 5
        logger.info(
            "Insufficient amount of links ~ trying again: {}".format(
                try_again))
        sleep(3)

        # you can try again as much as you want by changing this number
        if try_again > 2:
            if put_sleep < 1 and filtered_links <= 21:
                # One-time recovery: long rest, then reload the page and
                # locate the container element again.
                logger.info(
                    "Cor! Did you send too many requests? ~ let's rest some")
                sleep(600)
                put_sleep += 1

                browser.execute_script("location.reload()")
                try_again = 0
                sleep(10)

                if not link_elems:
                    main_elem = browser.find_element_by_xpath(
                        '//main/article/div[1]')
                elif skip_top_posts:
                    main_elem = browser.find_element_by_xpath(
                        '//main/article/div[2]')
                else:
                    main_elem = browser.find_element_by_tag_name('main')
            else:
                logger.info(
                    "'{}' location POSSIBLY has less images than "
                    "desired...".format(location))
                break

    sleep(4)

    return links[:amount]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment