Scrape images from a website
import os
import shutil
import requests
import time
import random

# pip install requests-html
from requests_html import HTMLSession

session = HTMLSession()


def scrape_images(keyword, n_pages=3):
    # Create an output directory for this keyword
    os.makedirs(f"output/{keyword}", exist_ok=True)

    # Get each results page and render its JavaScript content
    for page in range(n_pages):
        count = 0
        url = f"http://clipart-library.com/search1/?q={keyword}#gsc.tab=1&gsc.q={keyword}&gsc.page={page}"
        print(url)

        r = session.get(url)
        r.html.render()
        time.sleep(random.randint(2, 7))

        # Extract src attributes from img tags
        img_list = r.html.find("img")

        src_list = []
        for each_img in img_list:
            try:
                src_list.append(each_img.attrs['src'])
            except KeyError:
                # Skip img tags without a src attribute
                pass

        # Download each unique image and store it in a file
        for each_src in list(set(src_list)):
            if '..' not in each_src:
                print(each_src)

                response = requests.get(each_src, stream=True)
                with open(f"output/{keyword}/{page}-{count + 1}.png", 'wb') as out_file:
                    shutil.copyfileobj(response.raw, out_file)
                del response

                count += 1


if __name__ == '__main__':
    scrape_images(keyword="apple")

    # with open("keywords.lst") as keyword_file:
    #     keywords = keyword_file.read().splitlines()
    #     print(len(keywords))
    #
    #     for keyword in keywords:
    #         print(keyword)
    #         scrape_images(keyword)
    #         time.sleep(random.randint(2, 7))
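The '..' check above simply skips relative src values. As a minimal sketch, not part of the original gist, urllib.parse.urljoin could instead resolve such relative paths against the page URL so those images can be downloaded too; resolve_src is a hypothetical helper name.

from urllib.parse import urljoin

def resolve_src(page_url, src):
    # urljoin leaves absolute URLs unchanged and resolves relative ones
    # against the page that referenced them.
    return urljoin(page_url, src)

# Example (hypothetical values):
# resolve_src("http://clipart-library.com/search1/?q=apple", "../images/apple-1.png")
# -> "http://clipart-library.com/images/apple-1.png"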