{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "import json\n",
    "from IPython.display import display, Image as IM, HTML\n",
    "import random, time\n",
    "import os\n",
    "import aria2p\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Public JSON endpoint for a hashtag's media; the page cursor is appended after max_id=.\n",
    "base_url = 'https://www.instagram.com/explore/tags/{}/?__a=1&max_id='\n",
    "insta_posts = {}\n",
    "hashtag = 'beachfashion'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "download = True\n",
    "\n",
    "# To download images in parallel, start the aria2 RPC daemon first:\n",
    "#   aria2c --enable-rpc --rpc-listen-all\n",
    "if download:\n",
    "    directory = os.path.abspath('./' + hashtag)\n",
    "    os.makedirs(directory, exist_ok=True)\n",
    "    aria2 = aria2p.API(\n",
    "        aria2p.Client(host=\"http://localhost\", port=6800, secret=\"\"))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "17159 / 476700\r"
     ]
    }
   ],
   "source": [
    "sess = requests.Session()\n",
    "\n",
    "\n",
    "def parse(response):\n",
    "    \"\"\"Extract non-video posts from one page of the hashtag feed.\"\"\"\n",
    "    posts = {}\n",
    "    media = response['graphql']['hashtag']['edge_hashtag_to_media']\n",
    "    page_status = media['page_info']\n",
    "    nodes = media['edges']\n",
    "    total = media['count']\n",
    "\n",
    "    for n in nodes:\n",
    "        if not n['node']['is_video']:\n",
    "            captions = n['node']['edge_media_to_caption']['edges']\n",
    "            payload = {'display_url': n['node']['display_url'],\n",
    "                       'likes': n['node']['edge_liked_by']['count'],\n",
    "                       'comments': n['node']['edge_media_to_comment']['count'],\n",
    "                       'is_video': n['node']['is_video'],\n",
    "                       'thumbnail': n['node']['thumbnail_src'],\n",
    "                       'time': n['node']['taken_at_timestamp'],\n",
    "                       'id': n['node']['id'],\n",
    "                       'caption': captions[0]['node']['text'] if captions else ''\n",
    "                       }\n",
    "            posts[n['node']['id']] = payload\n",
    "    return posts, page_status.get('has_next_page', None), page_status.get('end_cursor', None), total\n",
    "\n",
    "\n",
    "total = float('inf')\n",
    "has_next = True\n",
    "cursor = None\n",
    "ids = []\n",
    "while has_next:\n",
    "    cur_text = cursor if cursor else ''\n",
    "    try:\n",
    "        res = sess.get(base_url.format(hashtag) + cur_text).json()\n",
    "        posts, has_next, cursor, total = parse(res)\n",
    "    except Exception as e:\n",
    "        print('request failed ({}); sleeping for 5 seconds before retrying'.format(e))\n",
    "        time.sleep(5)\n",
    "    else:\n",
    "        for p in posts:\n",
    "            insta_posts[p] = posts[p]\n",
    "            # Queue each image with aria2 so downloads run in parallel with scraping.\n",
    "            if download:\n",
    "                aria2.add_uris([posts[p]['display_url']],\n",
    "                               dict(dir=directory, auto_file_renaming=False,\n",
    "                                    out='{}.jpg'.format(posts[p]['id'])))\n",
    "        for p in posts:\n",
    "            ids.append(p)\n",
    "        print(len(insta_posts), '/', total, end=\"\\r\")\n",
    "        time.sleep(random.random() * 5)  # jitter between requests\n",
    "        # Checkpoint the scraped metadata after every page.\n",
    "        with open(hashtag + '.json', 'w') as f:\n",
    "            json.dump(insta_posts, f)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(len(insta_posts))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Final write of everything scraped so far.\n",
    "with open(hashtag + '.json', 'w') as f:\n",
    "    json.dump(insta_posts, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview a single scraped post.\n",
    "for post in insta_posts.values():\n",
    "    print(post)\n",
    "    break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "python 3 tf2",
   "language": "python",
   "name": "venv"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
Hello, I seem to be getting an error when the script is run. Is there any workaround for this?
HTTPConnectionPool(host='localhost', port=6800): Max retries exceeded with url: /jsonrpc (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x1037ad0f0>: Failed to establish a new connection: [Errno 61] Connection refused'))
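This error usually means the aria2 JSON-RPC daemon isn't running: the notebook expects it to be started separately with `aria2c --enable-rpc --rpc-listen-all` (the command noted in the third cell) before the download cell is executed. Below is a minimal sketch of how one might check for the daemon before enabling downloads; it assumes the default port 6800, no RPC secret, and that aria2p's `Client.get_version()` wrapper for the `aria2.getVersion` RPC call is available. The helper name `aria2_is_running` is purely illustrative.

```python
import aria2p

def aria2_is_running(host="http://localhost", port=6800, secret=""):
    """Return True if the aria2 RPC daemon answers on the given host and port."""
    client = aria2p.Client(host=host, port=port, secret=secret)
    try:
        client.get_version()  # any cheap RPC call works; raises if the daemon is unreachable
        return True
    except Exception:
        return False

# Only enable downloads when the daemon is actually reachable;
# otherwise fall back to metadata-only scraping.
download = aria2_is_running()
```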