Created
March 2, 2019 18:58
-
-
Save limsammy/fa06e2bfbc50e61e00030fb86a1b57f9 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import re | |
from urllib.request import * | |
import urllib.request | |
import argparse | |
import requests | |
import os | |
import cv2 | |
from imutils import paths | |
# ap = argparse.ArgumentParser() | |
# ap.add_argument("-o", "--output", required=True, | |
# help="path to training-set directory") | |
# args = vars(ap.parse_args()) | |
# given the gun category url, group sub-categories into classifications
# iterate over each classification's sub categories | |
# iterate over each article | |
# grab all images on article | |
# save to classification dir with unique name | |
# define classifications and their respective categories
# maps output class name (also the dataset sub-directory name) to the
# list of IMFDB sub-category titles that should be filed under it
CLASSES = {
    'rifle': [
        'AR Derivatives',
        'Assault Rifle',
        'Battle Rifle',
        'Bullpup',
        'Carbine',
        'Rifle',
        'Sniper Rifle',
        'Submachine Gun',
        'Machine Gun',
        'Muzzleloader'
    ],
    'handgun': [
        'Flare Gun',
        'Machine Pistol',
        'Pistol',
        'Revolver',
    ],
    # no IMFDB category maps to 'knife'; the class exists so the
    # dataset directory layout stays stable -- TODO confirm intent
    'knife': [],
    'shotgun': ['Shotgun'],
    'non-lethal': ['Less-Than Lethal'],
    'explosive': [
        'Flamethrower',
        'Missile Launcher',
        'Grenade',
        'Grenade Launcher',
        'Mine',
        'Mortar',
    ]
}
# sub-category titles to skip entirely when scraping
IGNORE = ['Fictional Firearm', 'MANPADS', 'STANAG', 'UBGL', 'Underwater Firearm']
# initialize urls dictionary: sub-category title -> relative wiki URL
urls = {}

# base url of the site being scraped
BASE_URL = 'http://www.imfdb.org'

# open main category page
print("[INFO] navigating to url...")
main_category_html = urlopen(BASE_URL + '/wiki/Category:Gun')

# parse html w beautiful soup
print("[INFO] parsing document...")
bs = BeautifulSoup(main_category_html, 'html.parser')

# grab each sub category url; titles look like "Category:Pistol",
# so split(':')[1] yields the bare sub-category name
print("[INFO] finding all sub-category links...")
subcategories_div = bs.find('div', id='mw-subcategories')
subcategories = subcategories_div.find_all('a')
counter = 0
for idx, link in enumerate(subcategories, start=1):
    name = link.get('title').split(':')[1]
    print(name)
    urls[name] = link.get('href')
    print("[INFO] grabbed link #" + str(idx))

# invert CLASSES once (title -> classification) instead of re-scanning
# the whole dict for every single image
category_to_class = {
    category: classification
    for classification, categories in CLASSES.items()
    for category in categories
}

# compile the regexes once, outside the loops; the dot is escaped --
# a bare '.jpg' pattern would also match e.g. 'ajpg'
JPG_RE = re.compile(r'\.jpg')
NAME_RE = re.compile(r'(?:[^/][\d\w\.]+)$(?<=(?:\.jpg))')

for title, url in urls.items():
    if title in IGNORE:
        print("[INFO] ignoring {}".format(title))
        continue

    # skip titles that map to no classification -- previously the stale
    # `classification` from the prior iteration was silently reused
    # (or NameError was raised on the very first unmatched title)
    classification = category_to_class.get(title)
    if classification is None:
        print("[INFO] no classification for {}, skipping".format(title))
        continue

    # make sure the target directory exists before any download
    out_dir = os.path.sep.join(["datasets", classification])
    os.makedirs(out_dir, exist_ok=True)

    sub_category_html = urlopen(BASE_URL + url)
    bs = BeautifulSoup(sub_category_html, 'html.parser')
    # get all article links in this sub-category to iterate
    pages = bs.find('div', {"class": 'mw-content-ltr'})
    pages = pages.find_all('a')
    for page in pages:
        page_html = urlopen(BASE_URL + page.get('href'))
        page_bs = BeautifulSoup(page_html, 'html.parser')
        images = page_bs.find_all('img', {'src': JPG_RE})
        for image in images:
            # rebuild the full-size image URL from the thumbnail src:
            # keep '/images' ([:7]), drop the '/thumb' segment ([13:]),
            # and cut the path off right after '.jpg'
            # NOTE(review): assumes MediaWiki thumb paths of the form
            # /images/thumb/<hash>/<file>.jpg/<size>px-... -- verify
            src = image['src']
            image_url = BASE_URL + src[:7] + src[13:JPG_RE.search(src).end()]
            # download to correct dir
            try:
                r = requests.get(image_url, timeout=60)
                name = NAME_RE.findall(image_url)[0]
                p = os.path.sep.join([out_dir, "{}.jpg".format(name)])
                # context manager guarantees the handle is closed even
                # if the write fails
                with open(p, "wb") as f:
                    f.write(r.content)
                counter += 1
                print("[INFO] Downloaded image {}. So far we have scraped {} images...".format(
                    name, counter))
            except requests.exceptions.RequestException as e:
                print("[INFO] error: {}".format(e))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment