Instagram Hashtag Crawler
"""Instagram image crawler by hashtag.
Prerequisites:
- `pip install selenium shortuuid`
Usage:
- `python scripts/crawl_instagram.py -u $ID -p $PASSWORD -ht selfie -n 6000`
"""
import argparse
import os
import random
import time
import urllib.request

import shortuuid
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webdriver import WebDriver as RemoteWebDriver

parser = argparse.ArgumentParser(description="Instagram hashtag crawler")
parser.add_argument("-u", "--username", required=True, help="Instagram user name")
parser.add_argument("-p", "--password", required=True, help="Instagram password")
parser.add_argument("-ht", "--hashtag", required=True, help="Instagram hastag")
parser.add_argument("-n", "--number", type=int, required=True, help="number of images to crawl")
args = parser.parse_args()


def login(driver: RemoteWebDriver) -> None:
    """Log in to Instagram with the credentials passed on the command line."""
    driver.get("https://www.instagram.com/accounts/login/")
    time.sleep(3)  # wait for the login form to render
    username = driver.find_element(By.NAME, "username")
    password = driver.find_element(By.NAME, "password")
    username.send_keys(args.username)
    password.send_keys(args.password)
    login_button = driver.find_element(By.XPATH, "//button[@type='submit']")
    login_button.click()
    time.sleep(5)  # wait for the post-login redirect


def search_hashtag(driver: RemoteWebDriver) -> None:
    """Open the explore page for the requested hashtag."""
    driver.get(f"https://www.instagram.com/explore/tags/{args.hashtag}/")
    time.sleep(5)  # wait for the first batch of posts to load


def scroll_down(driver: RemoteWebDriver) -> None:
    """Scroll to the bottom of the page so Instagram lazy-loads more posts."""
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(random.randint(2, 4))  # random pause to look less like a bot


def get_images(driver: RemoteWebDriver, directory: str) -> None:
    """Download images from the hashtag page until `args.number` have been saved."""
    uuids_seen = set()
    image_count = 0
    while image_count < args.number:
        images = driver.find_elements(By.CSS_SELECTOR, "div._aagv img")
        for image in images:
            try:
                image_url = image.get_attribute("src")
                # Derive a deterministic id from the URL so duplicates are skipped.
                uuid = shortuuid.uuid(name=image_url)
                if uuid in uuids_seen:
                    continue
                image_name = os.path.join(directory, f"{uuid}.jpg")
                urllib.request.urlretrieve(image_url, image_name)
                print(f"Saved an image as {image_name}")
                uuids_seen.add(uuid)
                image_count += 1
            except Exception as e:
                print(f"Failed to process an image due to {e}")
        scroll_down(driver)  # load the next batch of posts
if __name__ == "__main__":
directory = os.path.join("crawled", f"{args.hashtag}_{time.strftime('%Y%m%d%H%M%S')}")
os.makedirs(directory, exist_ok=True)
driver = webdriver.Chrome()
login(driver)
search_hashtag(driver)
get_images(driver, directory)
driver.quit()
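
If you prefer to crawl without a visible browser window, the driver can be constructed with Chrome options instead. This is a minimal sketch, not part of the original script; it assumes Selenium 4 and a Chrome build that supports the `--headless=new` flag:

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument("--headless=new")  # run Chrome without opening a window
driver = webdriver.Chrome(options=options)  # drop-in replacement for webdriver.Chrome() above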