mattvonrocketstein · January 7, 2015 09:28
diff --git a/grays_anatomy_downloader b/grays_anatomy_downloader
 #
 # this code downloads all the grays anatomy images on wikimedia into ./grays_anatomy_plates
 #
 import time
 import os, urllib2
 from BeautifulSoup import BeautifulSoup

 plates_url = 'http://commons.wikimedia.org/wiki/Gray%27s_Anatomy_plates'
 base = 'https://commons.wikimedia.org'
 save_dir = 'grays_anatomy_plates'
 assert os.path.exists(save_dir)
 print 'getting index..'
 tmp = BeautifulSoup(urllib2.urlopen(plates_url).read())
 links = tmp.findAll('li',attrs={'class':'gallerybox'})
 links = [l.find('a').get('href') for l in links]

 count = 0
 for link in links:
    count += 1
    tmp = base + link
    print tmp
    tmp = urllib2.urlopen(tmp).read()
    tmp = BeautifulSoup(tmp)
    img = tmp.find('div', attrs={'class':'fullMedia'}).find('a').get('href')
    img = img[1:] if img.startswith('//') else img
    img = img[1:] if img.startswith('/') else img
    fname = img.split('/')[-1].lower()
    fext=os.path.splitext(fname)[-1]
    if fname.startswith('gray{0}'.format(count)):
        fname = fext
    else:
        fname = '_'+fname
    print img
    u3 = img
    newf = 'gray{0}{1}'.format(count, fname)
    os.system('cd {0} && wget -4 {1} -O {2}'.format(
        save_dir, 'http://'+u3,
        newf))
    print
    time.sleep(2)
	#
	# this code downloads all the grays anatomy images on wikimedia into ./grays_anatomy_plates
	#
	import time
	import os, urllib2
	from BeautifulSoup import BeautifulSoup

	plates_url = 'http://commons.wikimedia.org/wiki/Gray%27s_Anatomy_plates'
	base = 'https://commons.wikimedia.org'
	save_dir = 'grays_anatomy_plates'
	assert os.path.exists(save_dir)
	print 'getting index..'
	tmp = BeautifulSoup(urllib2.urlopen(plates_url).read())
	links = tmp.findAll('li',attrs={'class':'gallerybox'})
	links = [l.find('a').get('href') for l in links]

	count = 0
	for link in links:
	count += 1
	tmp = base + link
	print tmp
	tmp = urllib2.urlopen(tmp).read()
	tmp = BeautifulSoup(tmp)
	img = tmp.find('div', attrs={'class':'fullMedia'}).find('a').get('href')
	img = img[1:] if img.startswith('//') else img
	img = img[1:] if img.startswith('/') else img
	fname = img.split('/')[-1].lower()
	fext=os.path.splitext(fname)[-1]
	if fname.startswith('gray{0}'.format(count)):
	fname = fext
	else:
	fname = '_'+fname
	print img
	u3 = img
	newf = 'gray{0}{1}'.format(count, fname)
	os.system('cd {0} && wget -4 {1} -O {2}'.format(
	save_dir, 'http://'+u3,
	newf))
	print
	time.sleep(2)
No results found