# pylint: skip-file
import time
import re
import hashlib
import json

import requests

INSTAGRAM_URL = "https://www.instagram.com"
HASHTAG_ENDPOINT = "/graphql/query/?query_hash={}&variables={}"
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"


def get_first_page(hashtag):
    """Fetch the HTML of the hashtag explore page."""
    return requests.get(INSTAGRAM_URL + "/explore/tags/{}/".format(hashtag), headers={"user-agent": USER_AGENT})


def get_csrf_token(cookies):
    return cookies.get("csrftoken")


def get_query_id(html):
    """Extract the GraphQL query id from the TagPageContainer script."""
    script_path = re.search(r'/static(.*)TagPageContainer\.js/(.*)\.js', html).group(0)
    script_req = requests.get(INSTAGRAM_URL + script_path)
    return re.findall(r'return e.tagMedia.byTagName.get\(t\).pagination},queryId:"([^"]*)"', script_req.text)[0]


def get_rhx_gis(html):
    return re.search(r'rhx_gis":"([^"]*)"', html).group(1)


def get_end_cursor_from_html(html):
    return re.search(r'end_cursor":"([^"]*)"', html).group(1)


def get_end_cursor_from_json(json_obj):
    return json_obj['data']['hashtag']['edge_hashtag_to_media']['page_info']['end_cursor']


def get_params(hashtag, end_cursor):
    return '{{"tag_name":"{}","first":50,"after":"{}"}}'.format(hashtag, end_cursor)


def get_ig_gis(rhx_gis, params):
    # The x-instagram-gis header is the md5 of "<rhx_gis>:<params>"
    return hashlib.md5((rhx_gis + ":" + params).encode("utf-8")).hexdigest()


def get_posts_from_json(json_obj):
    edges = json_obj['hashtag']['edge_hashtag_to_media']['edges']
    return [o['node'] for o in edges]


def get_posts_from_html(html):
    # Posts for the first page are embedded in the HTML as window._sharedData
    json_str = re.search(r'window._sharedData = (.*);</script>', html).group(1)
    json_obj = json.loads(json_str)
    graphql = json_obj["entry_data"]["TagPage"][0]["graphql"]
    return get_posts_from_json(graphql)


def make_cookies(csrf_token):
    return {
        "ig_pr": "2",
        "csrftoken": csrf_token,
    }


def make_headers(ig_gis):
    return {
        "x-instagram-gis": ig_gis,
        "x-requested-with": "XMLHttpRequest",
        "user-agent": USER_AGENT,
    }


def get_next_page(csrf_token, ig_gis, query_id, params):
    cookies = make_cookies(csrf_token)
    headers = make_headers(ig_gis)
    url = INSTAGRAM_URL + HASHTAG_ENDPOINT.format(query_id, params)
    req = requests.get(url, headers=headers, cookies=cookies)
    req.raise_for_status()
    json_obj = req.json()
    end_cursor = get_end_cursor_from_json(json_obj)
    posts = get_posts_from_json(json_obj['data'])
    return posts, end_cursor


def scrape_hashtag(hashtag, sleep=3):
    """
    Yields scraped posts, one by one.
    """
    first_page = get_first_page(hashtag)
    csrf_token = get_csrf_token(first_page.cookies)
    query_id = get_query_id(first_page.text)
    rhx_gis = get_rhx_gis(first_page.text)
    end_cursor = get_end_cursor_from_html(first_page.text)
    # Yield the posts embedded in the first HTML page
    home_posts = get_posts_from_html(first_page.text)
    for post in home_posts:
        yield post
    # Then paginate through the GraphQL endpoint until there is no next cursor
    while end_cursor is not None:
        params = get_params(hashtag, end_cursor)
        ig_gis = get_ig_gis(rhx_gis, params)
        posts, end_cursor = get_next_page(csrf_token, ig_gis, query_id, params)
        for post in posts:
            yield post
        time.sleep(sleep)


# main
for post in scrape_hashtag("summer"):
    print(post['id'])
    # do stuff
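If you only want a bounded sample rather than streaming posts forever, the generator can simply be sliced. The snippet below is only illustrative; the 500-post cap and the output filename are arbitrary choices, not part of the gist:

```python
import itertools
import json

# Take at most 500 posts from the generator and dump them to a JSON file.
posts = list(itertools.islice(scrape_hashtag("summer"), 500))
with open("summer_posts.json", "w") as fp:
    json.dump(posts, fp)
```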
Thanks a ton, saved a lot of time.
I modified it slightly to scrape the photo links from an Instagram user's public timeline.
Could you please explain `"ig_pr": "2"` in the cookies? I don't find it necessary, thanks.
Also, you could change `while True:` to `while end_cursor != None:` in case the scraper reaches the very last page (where `end_cursor` is None).
@ketankr9 thanks for the suggestions. `ig_pr` was needed at some point, so I'm going to leave it there just in case.
I also integrated the loading of posts from the home page, thanks for that.
Is anybody else getting a 500 status after roughly 70 posts? The same script worked fine two weeks ago... maybe they tightened their temporary IP blocking?
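If the 500s really are transient rate limiting, one possible (untested) workaround is to retry the GraphQL call with a growing delay instead of giving up on the first error. `fetch_with_retry` below is a hypothetical wrapper around the gist's `get_next_page`; the retry count and delays are guesses:

```python
import time
import requests

def fetch_with_retry(csrf_token, ig_gis, query_id, params, retries=5, base_delay=60):
    # Retry the paginated request a few times, backing off 60s, 120s, 180s, ...
    for attempt in range(retries):
        try:
            return get_next_page(csrf_token, ig_gis, query_id, params)
        except requests.HTTPError:
            if attempt == retries - 1:
                raise  # give up after the last attempt
            time.sleep(base_delay * (attempt + 1))
```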
Has anybody else had problems with the scraper?
Basically, the cookies no longer contain the csrf token, so it breaks.
@Kingson that would be only for the first page, right?
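If the cookie really is missing, a possible fallback (just an assumption on my part, Instagram's markup may differ) is to read the token from the embedded `window._sharedData` JSON on the first page, in the same spirit as the gist's `get_rhx_gis`:

```python
import re

def get_csrf_token_from_html(html):
    # Hypothetical fallback: look for "csrf_token":"..." in the first page's
    # embedded JSON; returns None if the pattern is not present.
    match = re.search(r'csrf_token":"([^"]*)"', html)
    return match.group(1) if match else None
```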