{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "import json\n",
    "from IPython.display import display, Image as IM, HTML\n",
    "import random, time\n",
    "import os\n",
    "import aria2p\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Public JSON endpoint for a hashtag's media; the page cursor is appended after max_id=.\n",
    "base_url = 'https://www.instagram.com/explore/tags/{}/?__a=1&max_id='\n",
    "insta_posts = {}\n",
    "hashtag = 'beachfashion'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "download = True\n",
    "\n",
    "# To download images in parallel, start the aria2 RPC daemon first:\n",
    "#   aria2c --enable-rpc --rpc-listen-all\n",
    "if download:\n",
    "    directory = os.path.abspath('./' + hashtag)\n",
    "    os.makedirs(directory, exist_ok=True)\n",
    "    aria2 = aria2p.API(\n",
    "        aria2p.Client(host=\"http://localhost\", port=6800, secret=\"\"))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "17159 / 476700\r"
     ]
    }
   ],
   "source": [
    "sess = requests.Session()\n",
    "\n",
    "\n",
    "def parse(response):\n",
    "    \"\"\"Extract non-video posts from one page of the hashtag feed.\"\"\"\n",
    "    posts = {}\n",
    "    media = response['graphql']['hashtag']['edge_hashtag_to_media']\n",
    "    page_status = media['page_info']\n",
    "    nodes = media['edges']\n",
    "    total = media['count']\n",
    "\n",
    "    for n in nodes:\n",
    "        if not n['node']['is_video']:\n",
    "            captions = n['node']['edge_media_to_caption']['edges']\n",
    "            payload = {'display_url': n['node']['display_url'],\n",
    "                       'likes': n['node']['edge_liked_by']['count'],\n",
    "                       'comments': n['node']['edge_media_to_comment']['count'],\n",
    "                       'is_video': n['node']['is_video'],\n",
    "                       'thumbnail': n['node']['thumbnail_src'],\n",
    "                       'time': n['node']['taken_at_timestamp'],\n",
    "                       'id': n['node']['id'],\n",
    "                       'caption': captions[0]['node']['text'] if captions else ''\n",
    "                       }\n",
    "            posts[n['node']['id']] = payload\n",
    "    return posts, page_status.get('has_next_page', None), page_status.get('end_cursor', None), total\n",
    "\n",
    "\n",
    "total = float('inf')\n",
    "has_next = True\n",
    "cursor = None\n",
    "ids = []\n",
    "while has_next:\n",
    "    cur_text = cursor if cursor else ''\n",
    "    try:\n",
    "        res = sess.get(base_url.format(hashtag) + cur_text).json()\n",
    "        posts, has_next, cursor, total = parse(res)\n",
    "    except Exception as e:\n",
    "        print('request failed ({}); sleeping for 5 seconds before retrying'.format(e))\n",
    "        time.sleep(5)\n",
    "    else:\n",
    "        for p in posts:\n",
    "            insta_posts[p] = posts[p]\n",
    "            # Queue each image with aria2 so downloads run in parallel with scraping.\n",
    "            if download:\n",
    "                aria2.add_uris([posts[p]['display_url']],\n",
    "                               dict(dir=directory, auto_file_renaming=False,\n",
    "                                    out='{}.jpg'.format(posts[p]['id'])))\n",
    "        for p in posts:\n",
    "            ids.append(p)\n",
    "        print(len(insta_posts), '/', total, end=\"\\r\")\n",
    "        time.sleep(random.random() * 5)  # jitter between requests\n",
    "        # Checkpoint the scraped metadata after every page.\n",
    "        with open(hashtag + '.json', 'w') as f:\n",
    "            json.dump(insta_posts, f)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(len(insta_posts))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Final write of everything scraped so far.\n",
    "with open(hashtag + '.json', 'w') as f:\n",
    "    json.dump(insta_posts, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview a single scraped post.\n",
    "for post in insta_posts.values():\n",
    "    print(post)\n",
    "    break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "python 3 tf2",
   "language": "python",
   "name": "venv"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
Hello, I seem to be getting an error when the script is run. Is there any workaround for this?
HTTPConnectionPool(host='localhost', port=6800): Max retries exceeded with url: /jsonrpc (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x1037ad0f0>: Failed to establish a new connection: [Errno 61] Connection refused'))
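This error usually means the aria2 JSON-RPC daemon isn't running: the notebook expects it to be started separately with `aria2c --enable-rpc --rpc-listen-all` (the command noted in the third cell) before the download cell is executed. Below is a minimal sketch of how one might check for the daemon before enabling downloads; it assumes the default port 6800, no RPC secret, and that aria2p's `Client.get_version()` wrapper for the `aria2.getVersion` RPC call is available. The helper name `aria2_is_running` is purely illustrative.

```python
import aria2p

def aria2_is_running(host="http://localhost", port=6800, secret=""):
    """Return True if the aria2 RPC daemon answers on the given host and port."""
    client = aria2p.Client(host=host, port=port, secret=secret)
    try:
        client.get_version()  # any cheap RPC call works; raises if the daemon is unreachable
        return True
    except Exception:
        return False

# Only enable downloads when the daemon is actually reachable;
# otherwise fall back to metadata-only scraping.
download = aria2_is_running()
```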