Skip to content

Instantly share code, notes, and snippets.

@ankitshekhawat
Created September 16, 2019 13:07
Show Gist options
  • Save ankitshekhawat/c6a5fcfe93c42d6d0ae0fc476385f9cf to your computer and use it in GitHub Desktop.
Save ankitshekhawat/c6a5fcfe93c42d6d0ae0fc476385f9cf to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import requests \n",
"import json\n",
"from IPython.display import display, Image as IM, HTML\n",
"import random, time\n",
"import os\n",
"import aria2p\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"base_url='https://www.instagram.com/explore/tags/{}/?__a=1&max_id='\n",
"insta_posts = {}\n",
"hashtag = 'beachfashion'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Start the aria2 RPC daemon in a separate terminal before running this cell:\n",
"#     aria2c --enable-rpc --rpc-listen-all\n",
"download = True\n",
"\n",
"if download:\n",
"    # Target folder for the downloads, named after the hashtag.\n",
"    directory = os.path.abspath('./'+hashtag)\n",
"    os.makedirs(directory, exist_ok=True)\n",
"    aria2 = aria2p.API(aria2p.Client(host=\"http://localhost\", port=6800, secret=\"\"))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"17159 / 476700\r"
]
}
],
"source": [
"# One session is shared by every page request made in this cell.\n",
"# No request is issued here: the paging loop fetches the first page itself\n",
"# (empty cursor), so an extra up-front GET would only duplicate that call.\n",
"sess = requests.Session()\n",
"\n",
"\n",
"def parse(response):\n",
"    \"\"\"Extract the non-video posts and the paging state from one hashtag page.\n",
"\n",
"    Args:\n",
"        response: Decoded JSON body of the hashtag endpoint. It must contain\n",
"            ``graphql -> hashtag -> edge_hashtag_to_media``.\n",
"\n",
"    Returns:\n",
"        A tuple ``(posts, has_next_page, end_cursor, total)``. ``posts`` maps a\n",
"        post id to its metadata dict; ``has_next_page`` and ``end_cursor`` are\n",
"        ``None`` when ``page_info`` does not carry them; ``total`` is the\n",
"        ``count`` field reported for the hashtag.\n",
"\n",
"    Raises:\n",
"        KeyError: If the payload lacks one of the expected keys.\n",
"    \"\"\"\n",
"    # Work only on the argument; nothing is read from notebook globals.\n",
"    media = response['graphql']['hashtag']['edge_hashtag_to_media']\n",
"    page_status = media['page_info']\n",
"    total = media['count']\n",
"\n",
"    posts = {}\n",
"    for edge in media['edges']:\n",
"        node = edge['node']\n",
"        if node['is_video']:\n",
"            continue\n",
"        captions = node['edge_media_to_caption']['edges']\n",
"        posts[node['id']] = {\n",
"            'display_url': node['display_url'],\n",
"            'likes': node['edge_liked_by']['count'],\n",
"            'comments': node['edge_media_to_comment']['count'],\n",
"            'is_video': node['is_video'],\n",
"            'thumbnail': node['thumbnail_src'],\n",
"            'time': node['taken_at_timestamp'],\n",
"            'id': node['id'],\n",
"            # Posts without a caption have an empty edge list.\n",
"            'caption': captions[0]['node']['text'] if captions else '',\n",
"        }\n",
"    return posts, page_status.get('has_next_page'), page_status.get('end_cursor'), total\n",
"\n",
"\n",
"MAX_RETRIES = 10  # consecutive failed page fetches tolerated before giving up\n",
"page_url = base_url.format(hashtag)\n",
"\n",
"total = float('inf')\n",
"has_next = True\n",
"cursor = None\n",
"ids = []\n",
"failures = 0\n",
"try:\n",
"    while has_next:\n",
"        try:\n",
"            res = sess.get(page_url + (cursor or '')).json()\n",
"            posts, has_next, cursor, total = parse(res)\n",
"        except (requests.RequestException, ValueError, KeyError, TypeError) as e:\n",
"            # A failed request, a non-JSON body or a payload without the\n",
"            # expected keys all land here; back off, but never retry forever.\n",
"            failures += 1\n",
"            if failures >= MAX_RETRIES:\n",
"                raise\n",
"            print('request failed:', repr(e))\n",
"            print('sleeping for 5 seconds before retrying')\n",
"            time.sleep(5)\n",
"        else:\n",
"            failures = 0\n",
"            for post_id, post in posts.items():\n",
"                insta_posts[post_id] = post\n",
"                ids.append(post_id)\n",
"                if download:\n",
"                    aria2.add_uris([post['display_url']], dict(dir=directory, auto_file_renaming=False, out='{}.jpg'.format(post['id'])))\n",
"            print(len(insta_posts),'/',total, end=\"\\r\")\n",
"            # Random pause between pages.\n",
"            time.sleep(random.random()*5)\n",
"finally:\n",
"    # Save whatever was collected, including when the run is interrupted or fails.\n",
"    with open(hashtag+'.json','w') as f:\n",
"        json.dump(insta_posts, f)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(len(insta_posts))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with open(hashtag+'.json','w') as f:\n",
" json.dump(insta_posts, f)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Peek at one scraped record. `insta_posts` is the only collection this\n",
"# notebook builds; it maps a post id to that post's metadata dict.\n",
"for post in insta_posts.values():\n",
"    print(post)\n",
"    break"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "python 3 tf2",
"language": "python",
"name": "venv"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
@jangrzeg
Copy link

Hello, I seem to be getting an error when the script is run. Is there any workaround for this?

HTTPConnectionPool(host='localhost', port=6800): Max retries exceeded with url: /jsonrpc (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x1037ad0f0>: Failed to establish a new connection: [Errno 61] Connection refused'))

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment