-
-
Save jslim89/9b0eebaefffcbad10285921cdd8aec70 to your computer and use it in GitHub Desktop.
The Instagram internal API
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import re
import time
from hashlib import md5

import requests
# Base URL that every request path in this script is appended to.
INSTAGRAM_URL = "https://www.instagram.com"

# Browser identification sent with each request; it is also one of the
# components hashed by get_ig_gis().
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/65.0.3325.181 Safari/537.36"
)

# Placeholder credentials passed to login() by crawl(); replace before use.
INSTAGRAM_USER = "youruname"
INSTAGRAM_PASS = "yoursecret"
def get_first_page(hashtag):
    """Fetch the explore page for ``hashtag``.

    Sends a GET request to ``INSTAGRAM_URL + "/explore/tags/<hashtag>/"``
    with ``USER_AGENT`` as the ``user-agent`` header and returns the
    ``requests`` response object as-is (no status check is performed).
    """
    tag_url = INSTAGRAM_URL + "/explore/tags/{}/".format(hashtag)
    request_headers = {"user-agent": USER_AGENT}
    return requests.get(tag_url, headers=request_headers)
def get_csrf_token(cookies):
    """Return the value stored under the ``"csrftoken"`` key of ``cookies``.

    ``cookies`` only needs to provide a ``get`` method; the result is
    whatever that lookup yields (``None`` for a plain dict without the key).
    """
    token = cookies.get("csrftoken")
    return token
def get_query_id(html):
    """Extract the pagination ``queryId`` for the tag page.

    Finds the first ``/static...TagPageContainer.js/....js`` path in
    ``html``, downloads that script from ``INSTAGRAM_URL`` and returns the
    first ``queryId`` captured by the pagination pattern in its text.

    Raises:
        AttributeError: if ``html`` contains no matching script path.
        IndexError: if the downloaded script contains no matching queryId.
    """
    bundle_match = re.search(r'/static(.*)TagPageContainer\.js/(.*).js', html)
    # group(0) is the whole matched path, not one of the capture groups.
    bundle_path = bundle_match.group(0)
    bundle_source = requests.get(INSTAGRAM_URL + bundle_path).text
    query_ids = re.findall(
        'return e.tagMedia.byTagName.get\\(t\\).pagination},queryId:"([^"]*)"',
        bundle_source,
    )
    return query_ids[0]
def get_rhx_gis(html):
    """Return the first ``rhx_gis`` string value embedded in ``html``.

    Raises:
        AttributeError: if ``html`` contains no ``rhx_gis":"..."`` fragment.
    """
    found = re.search(r'rhx_gis":"([^"]*)"', html)
    return found.group(1)
def get_end_cursor(html):
    """Return the first quoted ``end_cursor`` value in ``html``, if any.

    Returns ``None`` when ``html`` has no ``end_cursor":"..."`` fragment
    (for example when the value is ``null``), so that a caller looping on
    ``while end_cursor`` simply stops instead of hitting an
    ``AttributeError`` on a failed match.
    """
    match = re.search(r'end_cursor":"([^"]*)"', html)
    if match is None:
        return None
    return match.group(1)
def get_params(hashtag, end_cursor):
    """Build the JSON ``variables`` string for one page request.

    The result has the keys ``tag_name``, ``first`` (always 200) and
    ``after``, in that order and with no whitespace, e.g.
    ``{"tag_name":"x","first":200,"after":"y"}``.

    The string is produced with ``json.dumps`` rather than string
    formatting so that quotes, backslashes or control characters in either
    argument are escaped and the result is always valid JSON. Compact
    separators and ``ensure_ascii=False`` keep the output identical to the
    previous hand-formatted string for ordinary inputs.
    """
    variables = {"tag_name": hashtag, "first": 200, "after": end_cursor}
    return json.dumps(variables, separators=(",", ":"), ensure_ascii=False)
def get_ig_gis(rhx_gis, csrf_token, params):
    """Return the hex MD5 digest used as the request signature.

    The digest is taken over the UTF-8 encoding of
    ``rhx_gis:csrf_token:USER_AGENT:params`` (colon-separated, in that
    order).
    """
    payload = ":".join([rhx_gis, csrf_token, USER_AGENT, params])
    digest = md5(payload.encode('utf-8'))
    return digest.hexdigest()
def get_next_page(new_cookies, ig_gis, query_id, params):
    """Fetch one page of hashtag media from the GraphQL endpoint.

    Sleeps 10 seconds, then GETs ``/graphql/query/`` with ``query_id`` as
    ``query_hash`` and ``params`` as ``variables``, sending ``ig_gis`` in
    the ``x-instagram-gis`` header together with ``new_cookies``.

    Returns:
        A ``(nodes, end_cursor)`` tuple: the ``node`` of every edge under
        ``data.hashtag.edge_hashtag_to_media`` and that object's
        ``page_info.end_cursor``.
    """
    query_url = INSTAGRAM_URL + "/graphql/query/?query_hash={}&variables={}".format(query_id, params)
    request_headers = {
        "x-instagram-gis": ig_gis,
        "x-requested-with": "XMLHttpRequest",
        "user-agent": USER_AGENT,
    }

    # Fixed pause before every page request.
    time.sleep(10)
    response = requests.get(query_url, headers=request_headers, cookies=new_cookies)

    media = response.json()['data']['hashtag']['edge_hashtag_to_media']
    next_cursor = media['page_info']['end_cursor']
    nodes = [edge['node'] for edge in media['edges']]
    return nodes, next_cursor
def crawl(hashtag, max_count):
    """Collect up to ``max_count`` media nodes for ``hashtag``.

    Fetches the hashtag's first page, logs in with ``INSTAGRAM_USER`` /
    ``INSTAGRAM_PASS``, then follows ``end_cursor`` page by page through
    ``get_next_page`` until there is no further cursor or ``max_count``
    nodes have been gathered. Each fetched post's link is printed.

    Args:
        hashtag: Tag name to crawl (without the leading ``#``).
        max_count: Maximum number of nodes to return.

    Returns:
        A list of at most ``max_count`` node dicts, or ``None`` when the
        login fails (after printing ``'Not logged in'``).
    """
    first_page = get_first_page(hashtag)
    is_logged_in, cookies = login(INSTAGRAM_USER, INSTAGRAM_PASS)
    if not is_logged_in:
        print('Not logged in')
        return None

    csrf_token = get_csrf_token(cookies)
    query_id = get_query_id(first_page.text)
    rhx_gis = get_rhx_gis(first_page.text)
    end_cursor = get_end_cursor(first_page.text)

    results = []
    # Stop as soon as the limit is reached so no extra page is requested.
    while end_cursor and len(results) < max_count:
        # The signature covers the request variables, so it has to be
        # recomputed for every cursor.
        params = get_params(hashtag, end_cursor)
        ig_gis = get_ig_gis(rhx_gis, csrf_token, params)
        data, end_cursor = get_next_page(cookies, ig_gis, query_id, params)

        # for testing purpose only, to print out all posts link
        for node in data:
            print('https://www.instagram.com/p/%s' % node['shortcode'])

        results.extend(data)

    # The last page may overshoot the limit; trim to the requested maximum.
    return results[:max_count]
def login(username, password):
    """Log in through the AJAX login endpoint.

    Opens a session, loads ``INSTAGRAM_URL`` to obtain a ``csrftoken``
    cookie, then POSTs ``username`` and ``password`` to
    ``/accounts/login/ajax/`` with that token in the ``X-CSRFToken`` header.

    Returns:
        ``(True, cookies)`` with the cookies of the login response when the
        JSON reply has a truthy ``authenticated`` field, otherwise
        ``(False, None)``.

    Raises:
        KeyError: if the initial page load sets no ``csrftoken`` cookie.
        ValueError: if the login reply is not valid JSON.
    """
    login_url = INSTAGRAM_URL + '/accounts/login/ajax/'

    # The context manager closes the session's connections when done; the
    # response and its cookie jar remain usable afterwards.
    with requests.Session() as session:
        session.headers = {"user-agent": USER_AGENT}
        session.headers.update({'Referer': INSTAGRAM_URL})

        landing = session.get(INSTAGRAM_URL)
        session.headers.update({'X-CSRFToken': landing.cookies['csrftoken']})

        response = session.post(
            login_url,
            data={'username': username, 'password': password},
            allow_redirects=True,
        )
        result = response.json()

    # A missing or falsy "authenticated" field counts as a failed login.
    if not result.get('authenticated'):
        return False, None
    return True, response.cookies
# Script entry: crawl the "antarctica" hashtag, asking for at most 5000 posts.
crawl(hashtag='antarctica', max_count=5000)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment