Scrape images from a website
import os
import shutil
import requests
import time
import random

# pip install requests-html
from requests_html import HTMLSession

session = HTMLSession()


def scrape_images(keyword, n_pages=3):
    # Create an output directory for this keyword
    os.makedirs(f"output/{keyword}", exist_ok=True)

    # Get each results page and render its JavaScript content
    for page in range(n_pages):
        count = 0
        url = f"http://clipart-library.com/search1/?q={keyword}#gsc.tab=1&gsc.q={keyword}&gsc.page={page}"
        print(url)

        r = session.get(url)
        r.html.render()
        time.sleep(random.randint(2, 7))

        # Extract src attributes from img tags
        img_list = r.html.find("img")

        src_list = []
        for each_img in img_list:
            try:
                src_list.append(each_img.attrs['src'])
            except KeyError:
                # Skip img tags without a src attribute
                pass

        # Download each unique image and store it in a file
        for each_src in list(set(src_list)):
            if '..' not in each_src:
                print(each_src)

                response = requests.get(each_src, stream=True)
                with open(f"output/{keyword}/{page}-{count + 1}.png", 'wb') as out_file:
                    shutil.copyfileobj(response.raw, out_file)
                del response

                count += 1


if __name__ == '__main__':
    scrape_images(keyword="apple")

    # with open("keywords.lst") as keyword_file:
    #     keywords = keyword_file.read().splitlines()
    #     print(len(keywords))
    #
    #     for keyword in keywords:
    #         print(keyword)
    #         scrape_images(keyword)
    #         time.sleep(random.randint(2, 7))
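The '..' check above simply skips relative src values. As a minimal sketch, not part of the original gist, urllib.parse.urljoin could instead resolve such relative paths against the page URL so those images can be downloaded too; resolve_src is a hypothetical helper name.

from urllib.parse import urljoin

def resolve_src(page_url, src):
    # urljoin leaves absolute URLs unchanged and resolves relative ones
    # against the page that referenced them.
    return urljoin(page_url, src)

# Example (hypothetical values):
# resolve_src("http://clipart-library.com/search1/?q=apple", "../images/apple-1.png")
# -> "http://clipart-library.com/images/apple-1.png"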