amaarora · June 29, 2021 06:18
diff --git a/Untitled.ipynb b/Untitled.ipynb
 {
  "cells": [
    {
      "metadata": {
        "trusted": true
      },
      "id": "415dc37f",
      "cell_type": "code",
      "source": "import fastbook\nfrom fastai.vision.all import * \nfrom tqdm import tqdm",
      "execution_count": 1,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "def search_images_ddg(term, max_images=200, max_retries=3):\n    \"\"\"Search DuckDuckGo for `term` and return an `L` of unique image URLs.\n\n    Result pages are fetched until at least `max_images` URLs are collected or no further\n    page is offered, so more than `max_images` URLs may be returned. Paging also stops,\n    returning the URLs gathered so far, once more than `max_retries` consecutive requests\n    fail. `max_images` must be below 1000.\n    \"\"\"\n    assert max_images<1000, 'max_images must be below 1000'\n    url = 'https://duckduckgo.com/'\n    res = urlread(url,data={'q':term}, decode=False).decode()\n    # the search page carries a `vqd` token that is passed on to the image endpoint\n    searchObj = re.search(r'vqd=([\\d-]+)\\&', res)\n    assert searchObj, 'no `vqd` token found in the DuckDuckGo response'\n    requestUrl = url + 'i.js'\n    params = dict(l='us-en', o='json', q=term, vqd=searchObj.group(1), f=',,,', p='1', v7exp='a')\n    urls,failures = set(),0\n    while len(urls)<max_images and requestUrl is not None:\n        try:\n            data = urljson(requestUrl,data=params)\n        except (URLError,HTTPError):\n            # a page that keeps failing would otherwise be retried forever\n            failures += 1\n            if failures > max_retries: break\n        else:\n            failures = 0\n            urls.update(L(data.get('results', [])).itemgot('image'))\n            # the last page has no `next` link: stop instead of raising KeyError\n            requestUrl = url + data['next'] if 'next' in data else None\n        time.sleep(0.2)\n    return L(urls)",
      "execution_count": 2,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "urls = search_images_ddg('grizzly bear', max_images=100)\nurls",
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 4,
          "data": {
            "text/plain": "(#100) ['https://wallpapertag.com/wallpaper/full/e/b/8/437151-grizzly-bear-backgrounds-2560x1600-for-tablet.jpg','http://www.fws.gov/mountain-prairie/pressrel/images/01222015_grizzlybear.jpg','https://cdn0.wideopenspaces.com/wp-content/uploads/2020/04/whatdogrizzlybearseat4-scaled.jpg','https://www.campbellriver.travel/media/2020/02/bear-ears1.jpg','http://cdn.roaring.earth/wp-content/uploads/2016/03/Scratching-head-main-text.jpg','https://ourplnt.com/wp-content/uploads/2020/12/grizzly-bear.jpg','http://www.wyofile.com/wp-content/uploads/2017/11/grizzly-pic-e1509641980482.jpg','https://keyassets.timeincuk.net/inspirewp/live/wp-content/uploads/sites/2/2019/06/GettyImages-525103104.jpg','https://devdenverzoo.com/wp-content/uploads/2018/09/Grizzly-Bear_04.jpg','http://i.huffpost.com/gen/1747973/images/o-GRIZZLY-BEAR-RAINFOREST-facebook.jpg'...]"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "Next, to download the images we call `download_images`, passing it the destination directory `dest` (where to save the images) and `urls` (which images to download)."
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "Let's say we are trying to download images for `grizzly bear`, `teddy bear` and `black bear`. This looks like:"
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "bear_types = ['grizzly', 'teddy', 'black']\n\n# path where to download the images\npath = Path('./bear')\npath.mkdir(exist_ok=True)\n\n# download each bear type into its own sub-folder; a sub-folder that already holds\n# files is skipped, so re-running after an interruption only fetches what is missing\nfor o in tqdm(bear_types):\n    dest = path/o\n    if dest.exists() and any(dest.iterdir()): continue\n    results = search_images_ddg(f'{o} bear', max_images=100)\n    download_images(dest, urls=results)",
      "execution_count": 7,
      "outputs": []
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "This downloads the images into the `bear` folder in the current directory, with one sub-folder per bear type. See below."
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "!tree ./bear -d",
      "execution_count": 9,
      "outputs": [
        {
          "output_type": "stream",
          "text": "\u001b[01;34m./bear\u001b[00m\r\n├── \u001b[01;34mblack\u001b[00m\r\n├── \u001b[01;34mgrizzly\u001b[00m\r\n└── \u001b[01;34mteddy\u001b[00m\r\n\r\n3 directories\r\n",
          "name": "stdout"
        }
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "name": "base",
      "display_name": "Python[base]",
      "language": "python"
    },
    "language_info": {
      "name": "python",
      "version": "3.7.10",
      "mimetype": "text/x-python",
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "pygments_lexer": "ipython3",
      "nbconvert_exporter": "python",
      "file_extension": ".py"
    },
    "toc": {
      "nav_menu": {},
      "number_sections": true,
      "sideBar": true,
      "skip_h1_title": false,
      "base_numbering": 1,
      "title_cell": "Table of Contents",
      "title_sidebar": "Contents",
      "toc_cell": false,
      "toc_position": {},
      "toc_section_display": true,
      "toc_window_display": false
    },
    "gist": {
      "id": "",
      "data": {
        "description": "git_repos/fastbook/create_custom_datasetipynb",
        "public": true
      }
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
 }
	{
	"cells": [
	{
	"metadata": {
	"trusted": true
	},
	"id": "415dc37f",
	"cell_type": "code",
	"source": "import fastbook\nfrom fastai.vision.all import * \nfrom tqdm import tqdm",
	"execution_count": 1,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "def search_images_ddg(term, max_images=200, max_retries=3):\n    \"\"\"Search DuckDuckGo for `term` and return an `L` of unique image URLs.\n\n    Result pages are fetched until at least `max_images` URLs are collected or no further\n    page is offered, so more than `max_images` URLs may be returned. Paging also stops,\n    returning the URLs gathered so far, once more than `max_retries` consecutive requests\n    fail. `max_images` must be below 1000.\n    \"\"\"\n    assert max_images<1000, 'max_images must be below 1000'\n    url = 'https://duckduckgo.com/'\n    res = urlread(url,data={'q':term}, decode=False).decode()\n    # the search page carries a `vqd` token that is passed on to the image endpoint\n    searchObj = re.search(r'vqd=([\\d-]+)\\&', res)\n    assert searchObj, 'no `vqd` token found in the DuckDuckGo response'\n    requestUrl = url + 'i.js'\n    params = dict(l='us-en', o='json', q=term, vqd=searchObj.group(1), f=',,,', p='1', v7exp='a')\n    urls,failures = set(),0\n    while len(urls)<max_images and requestUrl is not None:\n        try:\n            data = urljson(requestUrl,data=params)\n        except (URLError,HTTPError):\n            # a page that keeps failing would otherwise be retried forever\n            failures += 1\n            if failures > max_retries: break\n        else:\n            failures = 0\n            urls.update(L(data.get('results', [])).itemgot('image'))\n            # the last page has no `next` link: stop instead of raising KeyError\n            requestUrl = url + data['next'] if 'next' in data else None\n        time.sleep(0.2)\n    return L(urls)",
	"execution_count": 2,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "urls = search_images_ddg('grizzly bear', max_images=100)\nurls",
	"execution_count": 4,
	"outputs": [
	{
	"output_type": "execute_result",
	"execution_count": 4,
	"data": {
	"text/plain": "(#100) ['https://wallpapertag.com/wallpaper/full/e/b/8/437151-grizzly-bear-backgrounds-2560x1600-for-tablet.jpg','http://www.fws.gov/mountain-prairie/pressrel/images/01222015_grizzlybear.jpg','https://cdn0.wideopenspaces.com/wp-content/uploads/2020/04/whatdogrizzlybearseat4-scaled.jpg','https://www.campbellriver.travel/media/2020/02/bear-ears1.jpg','http://cdn.roaring.earth/wp-content/uploads/2016/03/Scratching-head-main-text.jpg','https://ourplnt.com/wp-content/uploads/2020/12/grizzly-bear.jpg','http://www.wyofile.com/wp-content/uploads/2017/11/grizzly-pic-e1509641980482.jpg','https://keyassets.timeincuk.net/inspirewp/live/wp-content/uploads/sites/2/2019/06/GettyImages-525103104.jpg','https://devdenverzoo.com/wp-content/uploads/2018/09/Grizzly-Bear_04.jpg','http://i.huffpost.com/gen/1747973/images/o-GRIZZLY-BEAR-RAINFOREST-facebook.jpg'...]"
	},
	"metadata": {}
	}
	]
	},
	{
	"metadata": {},
	"cell_type": "markdown",
	"source": "Next, to download the images we call `download_images`, passing it the destination directory `dest` (where to save the images) and `urls` (which images to download)."
	},
	{
	"metadata": {},
	"cell_type": "markdown",
	"source": "Let's say we are trying to download images for `grizzly bear`, `teddy bear` and `black bear`. This looks like:"
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "bear_types = ['grizzly', 'teddy', 'black']\n\n# path where to download the images\npath = Path('./bear')\npath.mkdir(exist_ok=True)\n\n# download each bear type into its own sub-folder; a sub-folder that already holds\n# files is skipped, so re-running after an interruption only fetches what is missing\nfor o in tqdm(bear_types):\n    dest = path/o\n    if dest.exists() and any(dest.iterdir()): continue\n    results = search_images_ddg(f'{o} bear', max_images=100)\n    download_images(dest, urls=results)",
	"execution_count": 7,
	"outputs": []
	},
	{
	"metadata": {},
	"cell_type": "markdown",
	"source": "This downloads the images into the `bear` folder in the current directory, with one sub-folder per bear type. See below."
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "!tree ./bear -d",
	"execution_count": 9,
	"outputs": [
	{
	"output_type": "stream",
	"text": "\u001b[01;34m./bear\u001b[00m\r\n├── \u001b[01;34mblack\u001b[00m\r\n├── \u001b[01;34mgrizzly\u001b[00m\r\n└── \u001b[01;34mteddy\u001b[00m\r\n\r\n3 directories\r\n",
	"name": "stdout"
	}
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"name": "base",
	"display_name": "Python[base]",
	"language": "python"
	},
	"language_info": {
	"name": "python",
	"version": "3.7.10",
	"mimetype": "text/x-python",
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"pygments_lexer": "ipython3",
	"nbconvert_exporter": "python",
	"file_extension": ".py"
	},
	"toc": {
	"nav_menu": {},
	"number_sections": true,
	"sideBar": true,
	"skip_h1_title": false,
	"base_numbering": 1,
	"title_cell": "Table of Contents",
	"title_sidebar": "Contents",
	"toc_cell": false,
	"toc_position": {},
	"toc_section_display": true,
	"toc_window_display": false
	},
	"gist": {
	"id": "",
	"data": {
	"description": "git_repos/fastbook/create_custom_datasetipynb",
	"public": true
	}
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}