@thetafferboy
Last active May 3, 2024 20:39
hreflang flag getter for pi
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
import os

from flag_recognition import recognize_flag, is_flag
from config import CHROMEDRIVER_PATH, STARTING_URL

# Configure WebDriver for Chrome (headless, so no browser window is opened).
# Note: `executable_path` is deprecated in Selenium 4; newer releases expect a Service object instead.
options = webdriver.ChromeOptions()
options.add_argument('headless')
driver = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH, options=options)


def crawl_website(url):
    """Render the page, collect its absolute links, and download any images found."""
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    links = {a['href'] for a in soup.find_all('a', href=True) if a['href'].startswith('http')}
    # Map each absolute image URL to the page it was found on
    images = {img['src']: url for img in soup.find_all('img', src=True) if img['src'].startswith('http')}
    download_images(images)
    return links


def download_images(image_urls):
    """Download each image and, if it looks like a flag, report the country and its ISO code."""
    for image_url, source_url in image_urls.items():
        try:
            response = requests.get(image_url)
            if response.status_code == 200:
                image_path = f'images/img_{os.path.basename(image_url)}'
                with open(image_path, 'wb') as f:
                    f.write(response.content)
                if is_flag(image_path):
                    country_name = recognize_flag(image_path)
                    if country_name:
                        iso_code = fetch_iso_code(country_name)
                        print(f"Flag of {country_name} identified: ISO code is {iso_code}. Found at {source_url}")
        except requests.exceptions.RequestException as e:
            print(f"Failed to download {image_url}: {e}")


def fetch_iso_code(country_name):
    """Look up the country's ISO code in Wikipedia's ISO 3166-1 table."""
    url = "https://en.wikipedia.org/wiki/ISO_3166-1"
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    table = soup.find('table', {'class': 'wikitable sortable'})
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        if cells and country_name in cells[0].text:
            return cells[1].text.strip()  # Assuming the ISO code is in the second column


def main():
    if not os.path.exists('images'):
        os.mkdir('images')
    links = crawl_website(STARTING_URL)  # links are collected but not crawled further in this script
    driver.quit()  # close the headless browser when done


if __name__ == "__main__":
    main()
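The script imports two local modules that are not included in the gist: config (providing CHROMEDRIVER_PATH and STARTING_URL) and flag_recognition (providing is_flag and recognize_flag). Below is a minimal sketch of what those files might look like, purely so the script can be run end to end; the chromedriver path, the starting URL, and the trivial recognition logic are assumptions, not part of the original.

# config.py -- placeholder values; adjust for your own environment
CHROMEDRIVER_PATH = '/usr/lib/chromium-browser/chromedriver'  # assumed path, common on Raspberry Pi OS
STARTING_URL = 'https://example.com/'  # assumed placeholder; replace with the site to crawl

# flag_recognition.py -- hypothetical stub standing in for the real module
def is_flag(image_path):
    """Return True if the image appears to be a national flag.
    The real implementation is not part of this gist; this stub always returns False."""
    return False

def recognize_flag(image_path):
    """Return the country name for a recognised flag image, or None.
    Stub only; the original module's logic is unknown."""
    return None

With those two files alongside the script, it can be run directly with the Python interpreter and will write downloaded images into an images/ directory next to it.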