Last active
May 3, 2024 20:39
-
-
Save thetafferboy/4cf28743cdc1d82b43da706a41fcb0d9 to your computer and use it in GitHub Desktop.
hreflang flag getter for pi
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

from config import CHROMEDRIVER_PATH, STARTING_URL
from flag_recognition import recognize_flag, is_flag
# Configure the headless Chrome WebDriver shared by every function in this module.
options = webdriver.ChromeOptions()
options.add_argument('headless')
# Selenium 4 deprecated the ``executable_path`` keyword and later removed it;
# the chromedriver location is now supplied through a Service object.
driver = webdriver.Chrome(service=Service(CHROMEDRIVER_PATH), options=options)
def crawl_website(url):
    """Load *url* in the shared WebDriver and process its links and images.

    Every ``<img src>`` on the page is handed to ``download_images`` as a
    mapping of image URL to the page it was found on (*url*).

    Args:
        url: Address of the page to load.

    Returns:
        The set of absolute http(s) URLs taken from the page's ``<a href>``
        attributes.
    """
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    def absolute(reference):
        # Resolve relative ("/img/de.png") and protocol-relative
        # ("//cdn.example/de.png") references against the page URL so they
        # are not silently discarded; absolute URLs come back unchanged.
        return urljoin(url, reference.strip())

    def is_http(candidate):
        # Drops mailto:, javascript:, data: and similar non-fetchable schemes.
        return candidate.startswith(('http://', 'https://'))

    hrefs = (absolute(a['href']) for a in soup.find_all('a', href=True))
    links = {href for href in hrefs if is_http(href)}

    sources = (absolute(img['src']) for img in soup.find_all('img', src=True))
    images = {src: url for src in sources if is_http(src)}

    download_images(images)
    return links
# Upper bound, in seconds, on how long a single image request may block.
_DOWNLOAD_TIMEOUT_SECONDS = 10


def _image_path_for(image_url):
    """Return the path under ``images/`` where *image_url* is stored."""
    # Only the URL's path component names the file; query strings and
    # fragments are dropped because they are not part of the file name and
    # may contain characters that are invalid on disk.
    name = os.path.basename(urlparse(image_url).path) or 'unnamed'
    return f'images/img_{name}'


def download_images(image_urls):
    """Download each image, and report the ones recognised as flags.

    Each image that downloads with HTTP 200 is written under ``images/``.
    If ``is_flag`` accepts the saved file and ``recognize_flag`` returns a
    country name, the ISO code from ``fetch_iso_code`` is printed together
    with the page the image was found on. A failure on one image is printed
    and does not stop the remaining images from being processed.

    Args:
        image_urls: Mapping of image URL to the URL of the page it came from.
    """
    for image_url, source_url in image_urls.items():
        try:
            # Without a timeout a stalled server would block the crawl forever.
            response = requests.get(image_url, timeout=_DOWNLOAD_TIMEOUT_SECONDS)
        except requests.exceptions.RequestException as e:
            print(f"Failed to download {image_url}: {e}")
            continue
        if response.status_code != 200:
            continue

        image_path = _image_path_for(image_url)
        try:
            with open(image_path, 'wb') as f:
                f.write(response.content)
        except OSError as e:
            print(f"Failed to save {image_url} to {image_path}: {e}")
            continue

        if not is_flag(image_path):
            continue
        country_name = recognize_flag(image_path)
        if country_name:
            iso_code = fetch_iso_code(country_name)
            print(f"Flag of {country_name} identified: ISO code is {iso_code}. Found at {source_url}")
def fetch_iso_code(country_name):
    """Look up *country_name* in the table on Wikipedia's ISO 3166-1 page.

    The page is loaded with the shared WebDriver, which therefore navigates
    away from whatever page it was showing. Rows of the first table whose
    class is ``wikitable sortable`` are scanned; the first cell is compared
    with *country_name* and the second cell is returned.

    Args:
        country_name: Country name to search for.

    Returns:
        The stripped text of the matching row's second cell (assumed to hold
        the ISO code), or ``None`` when the table or a matching row is absent.
    """
    url = "https://en.wikipedia.org/wiki/ISO_3166-1"
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    table = soup.find('table', {'class': 'wikitable sortable'})
    if table is None:
        # Page layout changed or the load failed; nothing to search.
        return None

    wanted = country_name.strip().casefold()
    first_partial = None
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) < 2:
            # Header rows (only <th>) and malformed rows carry no code cell.
            continue
        name_text = cells[0].text
        code = cells[1].text.strip()  # Assuming the ISO code is in the second column
        if name_text.strip().casefold() == wanted:
            return code
        # Remember the first substring hit as a fallback only: an earlier row
        # can merely contain the name (e.g. "Guinea" in "Equatorial Guinea").
        if first_partial is None and country_name in name_text:
            first_partial = code
    return first_partial
def main():
    """Create the ``images`` directory, crawl ``STARTING_URL``, then close the browser."""
    # exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
    os.makedirs('images', exist_ok=True)
    try:
        crawl_website(STARTING_URL)
    finally:
        # Always end the WebDriver session so that no headless Chrome process
        # is left running after the script exits, even when the crawl fails.
        driver.quit()


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment