Last active
June 14, 2021 12:46
-
-
Save Nannigalaxy/789f5c008ed264edb7c186d4c1adc49c to your computer and use it in GitHub Desktop.
Image scraping script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
Image scraping script
Install libraries
$ pip install requests bs4
'''
# import required modules | |
import requests # for get requests | |
from bs4 import BeautifulSoup as bs # for scraping | |
import os # for creating dirs & writing files | |
# Search terms to scrape; each term gets its own sub-folder under images/.
Images = ['sedan', "suv", "hatchback", "sports car"]

# Seconds to wait on any single HTTP request, so one stalled server
# cannot hang the whole run.
REQUEST_TIMEOUT = 10

for img in Images:
    print("Image: ", img)
    search_url = 'https://unsplash.com/search/photos/' + img
    file_path = os.path.join('images', img)

    # exist_ok avoids the check-then-create race and the duplicated check.
    os.makedirs(file_path, exist_ok=True)

    # Download the search results page; skip this term if it cannot be fetched.
    try:
        page = requests.get(search_url, timeout=REQUEST_TIMEOUT)
    except requests.exceptions.RequestException as err:
        print("Could not fetch search page:", err)
        continue

    # Collect every <img> element on the page.
    soup = bs(page.text, 'html.parser')
    image_tags = soup.find_all('img')
    print("Image tags found: ", len(image_tags))

    saved = 0  # number of images written so far; used as the file-name suffix
    for i, image in enumerate(image_tags):
        src = image.get('src')  # None when the tag carries no src attribute
        if src:
            try:
                # Fetch once and reuse the response body for the file write.
                response = requests.get(src, timeout=REQUEST_TIMEOUT)
                if response.status_code == 200:
                    # Write via the full path instead of chdir-ing in and out,
                    # so the working directory never changes.
                    target = os.path.join(file_path, img + '-' + str(saved) + '.jpg')
                    with open(target, 'wb') as f:
                        f.write(response.content)
                    saved += 1
            except (requests.exceptions.RequestException, OSError):
                # Best effort: an unreachable/invalid URL or a failed write
                # skips this image and moves on to the next one.
                pass
        print(i + 1, end=" ")  # progress counter over all tags, saved or not
    print()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment