-
-
Save marcoqu/e17e1c4414f8d18e6672976d941161fa to your computer and use it in GitHub Desktop.
# pylint: skip-file | |
import hashlib
import json
import md5
import re
import time

import requests
# Base URL for all Instagram requests.
INSTAGRAM_URL = "https://www.instagram.com"
# GraphQL endpoint template: filled with (query_hash, variables-JSON).
HASHTAG_ENDPOINT = "/graphql/query/?query_hash={}&variables={}"
# Desktop Chrome UA string -- presumably needed so Instagram serves the
# desktop markup this scraper parses; TODO confirm.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
def get_first_page(hashtag):
    """GET the public /explore/tags/ HTML page for *hashtag*."""
    url = "{}/explore/tags/{}/".format(INSTAGRAM_URL, hashtag)
    return requests.get(url, headers={"user-agent": USER_AGENT})
def get_csrf_token(cookies):
    """Extract the CSRF token from a response cookie jar (None if absent)."""
    token = cookies.get("csrftoken")
    return token
def get_query_id(html):
    # Find the path of the TagPageContainer JS bundle referenced by the page.
    # NOTE(review): group(0) deliberately returns the WHOLE matched path;
    # the two capture groups are never used individually.
    script_path = re.search(r'/static(.*)TagPageContainer\.js/(.*).js', html).group(0)
    # Download the bundle and pull out the GraphQL query id that sits next to
    # the tag-pagination code inside it.
    script_req = requests.get(INSTAGRAM_URL + script_path)
    return re.findall('return e.tagMedia.byTagName.get\\(t\\).pagination},queryId:"([^"]*)"', script_req.text)[0]
def get_rhx_gis(html):
    """Pull the rhx_gis signing seed embedded in the page's shared-data JSON."""
    match = re.search(r'rhx_gis":"([^"]*)"', html)
    return match.group(1)
def get_end_cursor_from_html(html):
    """Read the first pagination end_cursor embedded in the page HTML."""
    match = re.search(r'end_cursor":"([^"]*)"', html)
    return match.group(1)
def get_end_cursor_from_json(json_obj):
    """Return the pagination cursor from a GraphQL response payload."""
    page_info = json_obj['data']['hashtag']['edge_hashtag_to_media']['page_info']
    return page_info['end_cursor']
def get_params(hashtag, end_cursor):
    """Build the raw JSON 'variables' string for the GraphQL tag query."""
    template = '{{"tag_name":"{}","first":50,"after":"{}"}}'
    return template.format(hashtag, end_cursor)
def get_ig_gis(rhx_gis, params):
    """Compute the x-instagram-gis request signature.

    The signature is the hex MD5 digest of "<rhx_gis>:<params>".

    Fix: the original used the Python-2-only ``md5`` module (removed in
    Python 3); ``hashlib.md5`` behaves identically on both versions. The
    payload is encoded as UTF-8 explicitly, which hashlib requires on
    Python 3 and which is a no-op change on Python 2 ASCII input.
    """
    payload = (rhx_gis + ":" + params).encode("utf-8")
    return hashlib.md5(payload).hexdigest()
def get_posts_from_json(json_obj):
    """Return the list of post nodes from a hashtag media payload."""
    media = json_obj['hashtag']['edge_hashtag_to_media']
    return [edge['node'] for edge in media['edges']]
def get_posts_from_html(html):
    """Parse the posts embedded in the first page's window._sharedData blob."""
    match = re.search(r'window._sharedData = (.*);</script>', html)
    shared_data = json.loads(match.group(1))
    graphql = shared_data["entry_data"]["TagPage"][0]["graphql"]
    # Same extraction as get_posts_from_json, applied to the embedded payload.
    edges = graphql['hashtag']['edge_hashtag_to_media']['edges']
    return [edge['node'] for edge in edges]
def make_cookies(csrf_token):
    """Cookies Instagram expects on paginated GraphQL requests."""
    cookies = {"ig_pr": "2"}
    cookies["csrftoken"] = csrf_token
    return cookies
def make_headers(ig_gis):
    """Request headers carrying the signed x-instagram-gis value."""
    headers = {"user-agent": USER_AGENT}
    headers["x-requested-with"] = "XMLHttpRequest"
    headers["x-instagram-gis"] = ig_gis
    return headers
def get_next_page(csrf_token, ig_gis, query_id, params):
    """Fetch one page of results from the GraphQL hashtag endpoint.

    Returns a (posts, end_cursor) tuple; raises on non-2xx responses.
    """
    url = INSTAGRAM_URL + HASHTAG_ENDPOINT.format(query_id, params)
    response = requests.get(
        url,
        headers=make_headers(ig_gis),
        cookies=make_cookies(csrf_token),
    )
    response.raise_for_status()
    payload = response.json()
    posts = get_posts_from_json(payload['data'])
    return posts, get_end_cursor_from_json(payload)
def scrape_hashtag(hashtag, sleep=3):
    """
    Yields scraped posts, one by one
    """
    first_page = get_first_page(hashtag)
    html = first_page.text
    csrf_token = get_csrf_token(first_page.cookies)
    query_id = get_query_id(html)
    rhx_gis = get_rhx_gis(html)
    end_cursor = get_end_cursor_from_html(html)
    # Posts embedded directly in the first (HTML) page.
    for post in get_posts_from_html(html):
        yield post
    # Follow the pagination cursor until Instagram reports no next page.
    while end_cursor is not None:
        params = get_params(hashtag, end_cursor)
        ig_gis = get_ig_gis(rhx_gis, params)
        posts, end_cursor = get_next_page(csrf_token, ig_gis, query_id, params)
        for post in posts:
            yield post
        # Throttle to reduce the chance of 429 rate-limit responses.
        time.sleep(sleep)
# main
# Fix: use the function form of print (valid on Python 2 and 3; the original
# Python-2-only print statement is a syntax error on Python 3) and guard the
# scrape behind __main__ so importing this module does not start a crawl.
if __name__ == "__main__":
    for post in scrape_hashtag("summer"):
        print(post['id'])
        # do stuff
No, you just need to wait a while when you get a 429, or slow down your requests
Trying to run this errors out with simplejson.scanner.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
. Adding print(next_page.status_code)
to line 48 shows that IG is returning a 403 error
@cyrian-1756 +1
I think the generation of x-instagram-gis has changed.
When I try to build my x-instagram-gis with rhx_gis + ":" + csrf_token + ":" + user_agent + ":" + params,
the resulting md5 doesn't match the correct one.
It works if you remove ":" + user_agent
Updated, thanks @devauxa
@marcoqu: Do you have any idea how to access https://www.instagram.com/<username>/?__a=1
endpoint?
@kuldeepaggarwal: Haven't tried yet, sorry.
@kuldeepaggarwal need to use just path, so for https://www.instagram.com/<username>/?__a=1
param should be /<username>/
updated: now x-instagram-gis
is just rhx_gis + ":" + params
first
value has to be at most 50.
@marcoqu cool, thanks
Use "https://www.instagram.com/explore/tags/cat/?__a=1" API to return json data.
@Kingson that would be only for the first page, right?
Thanks a ton, saved a lot of time.
I modified it slightly to scrape instagram user's public timeline photos link.
Could you please explain "ig_pr":2
in cookies, I don't find it necessary, thanks.
Also you could change while True:
to while end_cursor != None:
in case the scraper reaches the very last page (where end_cursor is None).
@ketankr9 thanks for the suggestions. ig_pr
was needed at some point, so I'm going to leave it there just in case..
I also integrated the loading of posts from the home, thanks about that.
Is anybody else getting a 500 status after roughly 70 posts? The same script worked fine 2 weeks ago ... Maybe they tightened their temporary IP blocking?!?
Has somebody had problems with the scraper?
Basically the cookies no longer contain the csrf token
, so the scraper breaks
Thanks for the great updated script.
I've tried and managed to get around 120 results in total, then encounter an issue with the rate limit and 429 status
This is the response header.
Any solution for this?