Created
June 29, 2021 06:18
-
-
Save amaarora/97166d3d2beeff59814907dbdf3bd4fc to your computer and use it in GitHub Desktop.
git_repos/fastbook/create_custom_datasetipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"id": "415dc37f", | |
"cell_type": "code", | |
"source": "import fastbook\nfrom fastai.vision.all import * \nfrom tqdm import tqdm", | |
"execution_count": 1, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "def search_images_ddg(term, max_images=200):\n \"Search for `term` with DuckDuckGo and return a unique urls of about `max_images` images\"\n assert max_images<1000\n url = 'https://duckduckgo.com/'\n res = urlread(url,data={'q':term}, decode=False).decode()\n searchObj = re.search(r'vqd=([\\d-]+)\\&', res)\n assert searchObj\n requestUrl = url + 'i.js'\n params = dict(l='us-en', o='json', q=term, vqd=searchObj.group(1), f=',,,', p='1', v7exp='a')\n urls,data = set(),{'next':1}\n while len(urls)<max_images and 'next' in data:\n try:\n data = urljson(requestUrl,data=params)\n urls.update(L(data['results']).itemgot('image'))\n requestUrl = url + data['next']\n except (URLError,HTTPError): pass\n time.sleep(0.2)\n return L(urls)", | |
"execution_count": 2, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "urls = search_images_ddg('grizzly bear', max_images=100)\nurls", | |
"execution_count": 4, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 4, | |
"data": { | |
"text/plain": "(#100) ['https://wallpapertag.com/wallpaper/full/e/b/8/437151-grizzly-bear-backgrounds-2560x1600-for-tablet.jpg','http://www.fws.gov/mountain-prairie/pressrel/images/01222015_grizzlybear.jpg','https://cdn0.wideopenspaces.com/wp-content/uploads/2020/04/whatdogrizzlybearseat4-scaled.jpg','https://www.campbellriver.travel/media/2020/02/bear-ears1.jpg','http://cdn.roaring.earth/wp-content/uploads/2016/03/Scratching-head-main-text.jpg','https://ourplnt.com/wp-content/uploads/2020/12/grizzly-bear.jpg','http://www.wyofile.com/wp-content/uploads/2017/11/grizzly-pic-e1509641980482.jpg','https://keyassets.timeincuk.net/inspirewp/live/wp-content/uploads/sites/2/2019/06/GettyImages-525103104.jpg','https://devdenverzoo.com/wp-content/uploads/2018/09/Grizzly-Bear_04.jpg','http://i.huffpost.com/gen/1747973/images/o-GRIZZLY-BEAR-RAINFOREST-facebook.jpg'...]" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "Next, to download images we need to pass the `directory` (where to download the images) and `urls` (which images to download)." | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "Let's say we are trying to download images for `grizzly bear`, `teddy bear` and `black bear`. This looks like:" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "bear_types = ['grizzly', 'teddy', 'black']\n\n# path where to download the images\npath = Path('./bear')\n\n# first create folder in `path` and then download the images\nif not path.exists():\n path.mkdir()\n for o in tqdm(bear_types):\n dest = (path/o)\n results = search_images_ddg(f'{o} bear', max_images=100) \n images = download_images(dest, urls=results)", | |
"execution_count": 7, | |
"outputs": [] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "This will download the images in `bear` folder in current directory. See below. " | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "!tree ./bear -d", | |
"execution_count": 9, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "\u001b[01;34m./bear\u001b[00m\r\n├── \u001b[01;34mblack\u001b[00m\r\n├── \u001b[01;34mgrizzly\u001b[00m\r\n└── \u001b[01;34mteddy\u001b[00m\r\n\r\n3 directories\r\n", | |
"name": "stdout" | |
} | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"name": "base", | |
"display_name": "Python[base]", | |
"language": "python" | |
}, | |
"language_info": { | |
"name": "python", | |
"version": "3.7.10", | |
"mimetype": "text/x-python", | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"pygments_lexer": "ipython3", | |
"nbconvert_exporter": "python", | |
"file_extension": ".py" | |
}, | |
"toc": { | |
"nav_menu": {}, | |
"number_sections": true, | |
"sideBar": true, | |
"skip_h1_title": false, | |
"base_numbering": 1, | |
"title_cell": "Table of Contents", | |
"title_sidebar": "Contents", | |
"toc_cell": false, | |
"toc_position": {}, | |
"toc_section_display": true, | |
"toc_window_display": false | |
}, | |
"gist": { | |
"id": "", | |
"data": { | |
"description": "git_repos/fastbook/create_custom_datasetipynb", | |
"public": true | |
} | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment