Created
September 10, 2025 16:39
-
-
Save FarisHijazi/5435d5455aa3e766023a49c9bb68a95c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "896312bf", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "%pip install ipython requests tqdm joblib backoff python-dotenv -qqq" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "cf917995", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "from tabnanny import verbose\n", | |
| "import requests\n", | |
| "import time\n", | |
| "from IPython.display import display, HTML\n", | |
| "from concurrent.futures import ThreadPoolExecutor\n", | |
| "from multiprocessing.pool import ThreadPool\n", | |
| "import joblib\n", | |
| "from tqdm.auto import tqdm\n", | |
| "import os\n", | |
| "import backoff\n", | |
| "import json\n", | |
| "from dotenv import load_dotenv\n", | |
| "\n", | |
# Load settings from a .env file before anything reads os.environ.
# An explicit check replaces `assert load_dotenv()`: assert statements are
# removed under `python -O`, which would also remove the load_dotenv() call
# itself and leave the environment unpopulated without any error.
if not load_dotenv():
    raise RuntimeError(
        "load_dotenv() did not load any variables; "
        "create a .env file defining the Google API credentials"
    )

# On-disk cache used by the search functions (via @memory.cache).
memory = joblib.Memory(location='./cache', verbose=0)
| "\n", | |
def duckduckgo_proxify(url: str) -> str:
    """Return a DuckDuckGo image-proxy URL that points at ``url``.

    The target is percent-encoded in full (``safe=""``), so characters such
    as ``/``, ``?``, ``&`` and ``'`` in ``url`` cannot be mistaken for part of
    the proxy URL's own query string.

    Args:
        url: The image URL to route through the proxy.

    Returns:
        The proxy URL with ``url`` embedded as the ``u`` query parameter.
    """
    # Local import keeps this block self-contained; the stdlib function is
    # used directly instead of going through the ``requests.utils`` re-export.
    from urllib.parse import quote

    return f"https://proxy.duckduckgo.com/iu/?u={quote(url, safe='')}&f=1"
| "\n", | |
| "\n", | |
def search_google_images(
    search_query,
    cx=None,
    api_key=None,
    num_images=10,
    params=None,
    gif=False,
):
    """
    Search Google Custom Search for images and return the raw result items.

    Results are requested in pages of up to 10 items, one HTTP request per
    page; the pages are fetched concurrently from a thread pool and then
    flattened in page order.

    Args:
        search_query: The search term.
        cx: Custom Search Engine ID. Defaults to the
            ``GOOGLE_CUSTOM_SEARCH_ENGINE_ID`` environment variable.
        api_key: Google API key. Defaults to the ``GOOGLE_API_KEY``
            environment variable.
        num_images: Number of results wanted (capped at 100).
        params: Extra query parameters; they override the defaults.
        gif: If True, ``" filetype:gif"`` is appended to the query.

    Returns:
        A flat list of result dicts (the ``items`` of each response). Empty
        when ``num_images`` is not positive.

    Raises:
        KeyError: A credential was not passed and its environment variable
            is not set.
        requests.exceptions.HTTPError: A response had an error status.

    how one result looks like:

    {
        'kind': 'customsearch#result',
        'title': '...',
        'htmlTitle': '...',
        'link': 'https://www.....gif',
        'displayLink': '....com',
        'snippet': 'a cute cat with a...',
        'htmlSnippet': '<b>Cats</b> A cute cat with a ....',
        'mime': 'image/gif',
        'fileFormat': 'image/gif',
        'image': {
            'contextLink': 'https://www.cats.com',
            'height': 248,
            'width': 500,
            'byteSize': 483604,
            'thumbnailLink': 'https://encrypted-tbn0.gstatic.com/images?q=tbn:-9rsb-KU&s',
            'thumbnailHeight': 64,
            'thumbnailWidth': 130
        }
    }
    """
    if cx is None:
        cx = os.environ["GOOGLE_CUSTOM_SEARCH_ENGINE_ID"]
    if api_key is None:
        api_key = os.environ["GOOGLE_API_KEY"]
    if params is None:
        params = {}
    BATCH_SIZE = 10  # results per request
    MAX_RESULTS = 100  # NOTE: documented ceiling of the Custom Search JSON API
    REQUEST_TIMEOUT = 30  # seconds

    # Asking past the API ceiling only produces failing requests, so cap it.
    num_images = min(num_images, MAX_RESULTS)
    if num_images <= 0:
        return []

    search_url = "https://www.googleapis.com/customsearch/v1"
    params_ = {
        "q": search_query + (" filetype:gif" if gif else ""),
        "cx": cx,
        "searchType": "image",
        "key": api_key,
    }
    params_.update(params)
    # 1-based index of the first result of every page.
    start_indices = range(1, num_images + 1, BATCH_SIZE)

    @memory.cache
    @backoff.on_exception(
        backoff.expo,
        (requests.exceptions.Timeout, requests.exceptions.ConnectionError),
        max_time=60,
    )
    def fetch_images_chunk(search_url, params, start_index, num_images):
        params["start"] = start_index
        params["num"] = num_images

        # Never echo the API key: notebook output is routinely saved and shared.
        loggable = {k: ("***" if k == "key" else v) for k, v in params.items()}
        print(f"{search_url=}", f"params={loggable}")
        # An explicit timeout is required for the Timeout retry above to ever
        # trigger; without one the request can block indefinitely.
        response = requests.get(search_url, params=params, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        return response.json().get("items", [])

    # Each page gets its own copy of the parameters because the fetcher
    # mutates the dict it receives. The last page may be shorter.
    chunks = [
        (search_url, params_.copy(), start, min(BATCH_SIZE, num_images - start + 1))
        for start in start_indices
    ]

    print("about to run threadpool")
    with ThreadPool() as pool:
        pages = pool.starmap(fetch_images_chunk, chunks)

    # Flatten the list of pages
    return [item for page in pages for item in page]
| "\n", | |
| "\n", | |
def display_image_grid(urls: "list[str] | list[dict]", thumbnailLinks=None):
    """Render images as an HTML table, three per row, in the notebook.

    Each image links to its own URL. If it fails to load, the browser
    retries with the DuckDuckGo-proxied copy of the URL, and after that with
    the thumbnail link when one is available.

    Args:
        urls: Either image URLs, or search-result dicts carrying the image
            URL under ``"link"`` and optionally a thumbnail under
            ``["image"]["thumbnailLink"]``.
        thumbnailLinks: Optional thumbnail URLs parallel to ``urls``; only
            consulted when ``urls`` holds strings. Missing entries are
            treated as "no thumbnail".

    Returns:
        None. Nothing is displayed when ``urls`` is empty.
    """
    from html import escape

    if not urls:
        return

    thumbs = list(thumbnailLinks or [])
    if not isinstance(urls[0], str):
        thumbs = [result.get("image", {}).get("thumbnailLink", "") for result in urls]
        urls = [result["link"] for result in urls]
    # Pad so that a short thumbnail list never hides images.
    thumbs += [""] * (len(urls) - len(thumbs))

    # Display size of each image, in pixels.
    width = 200
    height = 200
    per_row = 3

    cells = []
    for i, (url, thumb) in enumerate(zip(urls, thumbs), start=1):
        # json.dumps produces a JavaScript string literal, so quotes or
        # backslashes inside a URL cannot break out of the handler code.
        if thumb:
            last_resort = (
                "this.onerror=function(){this.onerror=null;"
                f"this.src={json.dumps(thumb)};}};"
            )
        else:
            # Clearing the handler stops a failing fallback from re-firing it.
            last_resort = "this.onerror=null;"
        onerror = f"{last_resort}this.src={json.dumps(duckduckgo_proxify(url))};"

        safe_url = escape(url, quote=True)
        cells.append(
            f'<td><a href="{safe_url}"><img src="{safe_url}" height="{height}" '
            f'style="max-width: {width}px; object-fit: contain; margin: 10px;" '
            f'onerror="{escape(onerror, quote=True)}"></a></td>'
        )
        if i % per_row == 0:
            cells.append("</tr><tr>")

    display(HTML("<table><tr>" + "".join(cells) + "</tr></table>"))
| "\n", | |
| "\n", | |
| "\n", | |
def download_single_image(args: tuple[dict, int, str]) -> None:
    """
    Download a single image from the result dictionary and save it to a directory.

    The file is written as ``image_<idx><ext>`` inside ``save_dir``. Failures
    are reported with ``print`` instead of raised, so one bad URL does not
    abort a batch.

    Args:
        args (tuple): (result dict, index, save_dir). The image URL is read
            from ``result["link"]``; a missing or empty link is skipped.
    """
    from urllib.parse import urlsplit

    result, idx, save_dir = args
    url = result.get("link")
    if not url:
        return
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        # Use the URL *path* only: a query string ("a.png?w=100") or a bare
        # host ("https://example.com") must not be mistaken for an extension.
        ext = os.path.splitext(urlsplit(url).path)[1]
        if not ext or len(ext) > 5:
            ext = ".jpg"
        filename = f"image_{idx}{ext}"
        filepath = os.path.join(save_dir, filename)
        with open(filepath, "wb") as f:
            f.write(response.content)
        print(f"Downloaded {filename}")
    except (requests.exceptions.RequestException, OSError) as e:
        # Network/HTTP problems and file-system errors are expected here.
        print(f"Failed to download image {idx} from {url}: {e}")
| "\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 21, | |
| "id": "378be0ee", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
def search_google_pdfs(
    search_query: str,
    cx: str = None,
    api_key: str = None,
    num_results: int = 10,
    params: dict = None,
) -> list[dict]:
    """
    Search for PDF files using Google Custom Search API.

    ``filetype:pdf`` is appended to the query. Results are requested in pages
    of up to 10 items, fetched concurrently from a thread pool and flattened
    in page order.

    Args:
        search_query: The search term
        cx: Custom Search Engine ID (defaults to the
            ``GOOGLE_CUSTOM_SEARCH_ENGINE_ID`` environment variable)
        api_key: Google API key (defaults to the ``GOOGLE_API_KEY``
            environment variable)
        num_results: Number of PDF results to return (capped at 100)
        params: Additional search parameters; they override the defaults

    Returns:
        List of search result dictionaries containing PDF URLs and metadata.
        Empty when ``num_results`` is not positive.

    Raises:
        KeyError: A credential was not passed and its environment variable
            is not set.
        requests.exceptions.HTTPError: A response had an error status.

    Example result:
    {
        'kind': 'customsearch#result',
        'title': 'Document Title',
        'htmlTitle': 'Document Title',
        'link': 'https://example.com/document.pdf',
        'displayLink': 'example.com',
        'snippet': 'Document description...',
        'htmlSnippet': 'Document description...',
        'fileFormat': 'PDF/Adobe Acrobat',
        'mime': 'application/pdf'
    }
    """
    if cx is None:
        cx = os.environ["GOOGLE_CUSTOM_SEARCH_ENGINE_ID"]
    if api_key is None:
        api_key = os.environ["GOOGLE_API_KEY"]
    if params is None:
        params = {}

    BATCH_SIZE = 10  # results per request
    MAX_RESULTS = 100  # NOTE: documented ceiling of the Custom Search JSON API
    REQUEST_TIMEOUT = 30  # seconds

    # Asking past the API ceiling only produces failing requests, so cap it.
    num_results = min(num_results, MAX_RESULTS)
    if num_results <= 0:
        return []

    # Add filetype:pdf to the search query
    search_url = "https://www.googleapis.com/customsearch/v1"
    search_params = {
        "q": f"{search_query} filetype:pdf",
        "cx": cx,
        "key": api_key,
    }
    search_params.update(params)

    # 1-based index of the first result of every page.
    start_indices = range(1, num_results + 1, BATCH_SIZE)

    @memory.cache
    @backoff.on_exception(
        backoff.expo,
        (requests.exceptions.Timeout, requests.exceptions.ConnectionError),
        max_time=60,
    )
    def fetch_pdfs_chunk(search_url: str, params: dict, start_index: int, num_results: int) -> list[dict]:
        """Fetch a chunk of PDF search results."""
        params["start"] = start_index
        params["num"] = min(BATCH_SIZE, num_results)

        # Never echo the API key: notebook output is routinely saved and shared.
        loggable = {k: ("***" if k == "key" else v) for k, v in params.items()}
        print(f"Searching PDFs: {search_url} with params: {loggable}")
        # An explicit timeout is required for the Timeout retry above to ever
        # trigger; without one the request can block indefinitely.
        response = requests.get(search_url, params=params, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        return response.json().get("items", [])

    # Prepare chunks for parallel processing. Each page gets its own copy of
    # the parameters because the fetcher mutates the dict it receives.
    chunks = [
        (search_url, search_params.copy(), start, min(BATCH_SIZE, num_results - start + 1))
        for start in start_indices
    ]

    print("Starting PDF search with ThreadPool...")
    with ThreadPool() as pool:
        pages = pool.starmap(fetch_pdfs_chunk, chunks)

    # Flatten the list of pages
    return [item for page in pages for item in page]
| "\n", | |
| "\n", | |
def display_pdf_results(results: list[dict]) -> None:
    """Display PDF search results in a formatted table.

    One row is rendered per result, showing its position, title, a link to
    the PDF, the source site and the snippet. Every value taken from a
    result is HTML-escaped before insertion, because it comes from
    third-party search results and must not be interpreted as markup.

    Args:
        results: Search result dicts; the keys ``title``, ``link``,
            ``displayLink`` and ``snippet`` are read, each with a
            placeholder when absent.

    Returns:
        None. Prints a notice instead of rendering when ``results`` is empty.
    """
    from html import escape

    if not results:
        print("No PDF results found.")
        return

    table_head = """
    <style>
        .pdf-results {
            font-family: Arial, sans-serif;
            border-collapse: collapse;
            width: 100%;
        }
        .pdf-results th, .pdf-results td {
            border: 1px solid #ddd;
            padding: 8px;
            text-align: left;
        }
        .pdf-results th {
            background-color: #f2f2f2;
            font-weight: bold;
        }
        .pdf-results tr:nth-child(even) {
            background-color: #f9f9f9;
        }
        .pdf-link {
            color: #0066cc;
            text-decoration: none;
        }
        .pdf-link:hover {
            text-decoration: underline;
        }
    </style>
    <table class="pdf-results">
        <thead>
            <tr>
                <th>#</th>
                <th>Title</th>
                <th>URL</th>
                <th>Source</th>
                <th>Snippet</th>
            </tr>
        </thead>
        <tbody>
    """

    rows = []
    for i, result in enumerate(results, 1):
        # str() keeps non-string values renderable, as plain formatting would.
        title = escape(str(result.get('title', 'No title')))
        link = escape(str(result.get('link', '#')), quote=True)
        display_link = escape(str(result.get('displayLink', 'Unknown')))
        snippet = escape(str(result.get('snippet', 'No description')))

        rows.append(f"""
            <tr>
                <td>{i}</td>
                <td>{title}</td>
                <td><a href="{link}" class="pdf-link" target="_blank">Open PDF</a></td>
                <td>{display_link}</td>
                <td>{snippet}</td>
            </tr>
        """)

    table_tail = """
        </tbody>
    </table>
    """

    display(HTML(table_head + "".join(rows) + table_tail))
| "\n", | |
| "\n", | |
def get_pdf_urls(results: list[dict]) -> list[str]:
    """Collect the non-empty ``link`` values of the given search results.

    Results without a ``link`` key, or whose link is falsy, are left out;
    the order of the remaining links follows ``results``.
    """
    urls = []
    for entry in results:
        link = entry.get('link')
        if link:
            urls.append(link)
    return urls
| "\n", | |
| "\n", | |
def save_pdf_results(results: list[dict], filename: str = "pdf_results.json") -> None:
    """Write the search results to ``filename`` as indented UTF-8 JSON.

    Non-ASCII text is stored as-is rather than as ``\\u`` escapes. A
    confirmation line naming the file is printed afterwards.
    """
    with open(filename, mode='w', encoding='utf-8') as out_file:
        json.dump(results, out_file, ensure_ascii=False, indent=2)
    print(f"Results saved to {filename}")
| "\n", | |
| "\n", | |
| "# # Example usage\n", | |
| "# print(\"PDF Search functions loaded successfully!\")\n", | |
| "# print(\"Available functions:\")\n", | |
| "# print(\"- search_google_pdfs(query, num_results=10)\")\n", | |
| "# print(\"- display_pdf_results(results)\")\n", | |
| "# print(\"- get_pdf_urls(results)\")\n", | |
| "# print(\"- save_pdf_results(results, filename)\")\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 22, | |
| "id": "37ebcff8", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
def download_single_pdf(args: tuple[dict, int, str]) -> None:
    """
    Download a single PDF from the result dictionary and save it to a directory.

    The file name is built from the result title (reduced to letters, digits,
    ``-`` and ``_``, at most 50 characters) followed by the index. Responses
    that neither declare a PDF content type nor come from a ``.pdf`` URL are
    skipped. Failures are reported with ``print`` instead of raised, so one
    bad URL does not abort a batch.

    Args:
        args (tuple): (result dict, index, save_dir). The URL is read from
            ``result["link"]``; a missing or empty link is skipped.
    """
    from urllib.parse import urlsplit

    result, idx, save_dir = args
    url = result.get("link")
    if not url:
        return

    try:
        # Set headers to mimic a browser request
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        response = requests.get(url, timeout=30, headers=headers)
        response.raise_for_status()

        # Check if the response is actually a PDF. The URL path is checked
        # as well as the full URL so that "file.pdf?download=1" still counts.
        content_type = response.headers.get('content-type', '').lower()
        url_says_pdf = (
            url.lower().endswith('.pdf')
            or urlsplit(url).path.lower().endswith('.pdf')
        )
        if 'pdf' not in content_type and not url_says_pdf:
            print(f"Skipping {url} - not a PDF file (content-type: {content_type})")
            return

        # Generate filename from title; fall back when the title is missing,
        # None or empty so the name never starts with a bare underscore.
        title = str(result.get('title') or f'pdf_{idx}')
        # Clean title for filename
        clean_title = "".join(c for c in title if c.isalnum() or c in (' ', '-', '_')).rstrip()
        clean_title = clean_title.replace(' ', '_')[:50] or "pdf"  # Limit length

        filename = f"{clean_title}_{idx}.pdf"
        filepath = os.path.join(save_dir, filename)

        with open(filepath, "wb") as f:
            f.write(response.content)

        print(f"Downloaded PDF: {filename}")

    except requests.exceptions.RequestException as e:
        print(f"Failed to download PDF {idx} from {url}: {e}")
    except OSError as e:
        # File-system problems (missing directory, invalid name, disk full).
        print(f"Unexpected error downloading PDF {idx}: {e}")
| "\n", | |
| "\n", | |
def download_pdfs(results: list[dict], search_query: str, force_download: bool = True) -> None:
    """
    Download all PDFs from search results to a directory named after the search query.

    The result metadata is written to ``pdf_results.json`` in that directory
    first; the PDFs are then downloaded concurrently from a thread pool with
    a progress bar.

    Args:
        results: List of PDF search results
        search_query: The search term used (for directory naming)
        force_download: If True, download without prompting. If False, the
            user is asked first and any answer starting with "n" cancels.
    """
    if not results:
        print("No PDF results to download.")
        return

    # NOTE: intended to be the same directory the image downloads use.
    save_dir = f"downloaded_images/{search_query}"
    print(f"PDFs will be saved to: {save_dir}")

    if not force_download:
        # strip() so an answer such as " n" is still understood as "no".
        answer = input('Download PDFs? (Y/n): ').strip().lower()
        if answer.startswith('n'):
            # Leave early so no completion message follows a cancelled run.
            print("PDF download skipped.")
            return

    os.makedirs(save_dir, exist_ok=True)

    # Save PDF results metadata
    pdf_metadata_file = os.path.join(save_dir, "pdf_results.json")
    with open(pdf_metadata_file, "w", encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"PDF metadata saved to: {pdf_metadata_file}")

    # Prepare download arguments; indices are 1-based and end up in file names.
    args_list = [(result, idx, save_dir) for idx, result in enumerate(results, start=1)]

    print(f"Starting download of {len(args_list)} PDFs...")
    with ThreadPool() as pool:
        # list() drains the lazy imap iterator so every download runs and
        # the progress bar advances as each one finishes.
        list(tqdm(pool.imap(download_single_pdf, args_list),
                  desc="Downloading PDFs",
                  total=len(args_list)))

    print(f"PDF download completed! Check the directory: {save_dir}")
| "\n", | |
| "\n", | |
| "# # Updated example with PDF downloading\n", | |
| "# print(\"PDF download functions loaded successfully!\")\n", | |
| "# print(\"Available functions:\")\n", | |
| "# print(\"- download_single_pdf(args)\")\n", | |
| "# print(\"- download_pdfs(results, search_query, force_download=True)\")\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "f20b37b1", | |
| "metadata": {}, | |
| "source": [ | |
| "## Set `query` and `num_images`" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 26, | |
| "id": "c7d0c96d", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "document_search_terms = [\n", | |
| " \"IBAN Confirmation Letter\",\n", | |
| " \"خطاب اثبات ايبان\",\n", | |
| " \"Bank Account Statement (SNB)\",\n", | |
| " \"كشف حساب البنك الأهلي السعودي\",\n", | |
| " \"Credit Card Statement (Al Rajhi)\",\n", | |
| " \"كشف حساب بطاقة ائتمان الراجحي\",\n", | |
| " \"Salary Transfer Certificate\",\n", | |
| " \"شهادة تحويل راتب\",\n", | |
| " \"Personal Loan Agreement\",\n", | |
| " \"عقد تمويل شخصي\",\n", | |
| " \"Proof of Funds Letter\",\n", | |
| " \"شهادة إثبات رصيد بنكي\",\n", | |
| " \"Bank Guarantee Document\",\n", | |
| " \"خطاب ضمان بنكي\",\n", | |
| " \"Local Purchase Order (LPO)\",\n", | |
| " \"أمر شراء محلي\",\n", | |
| " \"Point of Sale (Mada) Merchant Receipt\",\n", | |
| " \"إيصال نقاط بيع مدى\",\n", | |
| " \"Bank Transfer Confirmation Slip\",\n", | |
| " \"إشعار حوالة بنكية\",\n", | |
| " \"Riyad Bank Account Opening Form\",\n", | |
| " \"نموذج فتح حساب بنك الرياض\",\n", | |
| " \"Standing Instruction Form\",\n", | |
| " \"نموذج تعليمات مستديمة\",\n", | |
| " \"Articles of Association\",\n", | |
| " \"عقد تأسيس شركة\",\n", | |
| " \"Memorandum of Association\",\n", | |
| " \"النظام الأساسي للشركة\",\n", | |
| " \"Commercial Registration Certificate (CR)\",\n", | |
| " \"شهادة السجل التجاري\",\n", | |
| " \"Chamber of Commerce Membership\",\n", | |
| " \"شهادة عضوية الغرفة التجارية\",\n", | |
| " \"General Power of Attorney (POA)\",\n", | |
| " \"وكالة شرعية عامة\",\n", | |
| " \"Board of Directors Resolution\",\n", | |
| " \"قرار مجلس الإدارة\",\n", | |
| " \"Company Meeting Minutes\",\n", | |
| " \"محضر اجتماع شركة\",\n", | |
| " \"Commercial License (Baladi)\",\n", | |
| " \"رخصة تجارية بلدية\",\n", | |
| " \"GOSI Certificate (General Organization for Social Insurance)\",\n", | |
| " \"شهادة التأمينات الاجتماعية\",\n", | |
| " \"Mudad Platform Salary Sheet\",\n", | |
| " \"مسير رواتب منصة مدد\",\n", | |
| " \"Zakat and Tax Certificate\",\n", | |
| " \"شهادة الزكاة والضريبة والدخل\",\n", | |
| " \"Industrial License\",\n", | |
| " \"رخصة صناعية\",\n", | |
| " \"Legal Lawsuit Document\",\n", | |
| " \"صحيفة دعوى قضائية\",\n", | |
| " \"Absher Platform Document Screenshot\",\n", | |
| " \"صورة وثيقة من منصة أبشر\",\n", | |
| " \"Tawakkalna Services Document\",\n", | |
| " \"وثيقة من تطبيق توكلنا خدمات\",\n", | |
| " \"National Address Proof\",\n", | |
| " \"إثبات العنوان الوطني\",\n", | |
| " \"Vehicle Ownership Transfer Form\",\n", | |
| " \"مبايعة سيارة تم\",\n", | |
| " \"Building Permit\",\n", | |
| " \"رخصة بناء\",\n", | |
| " \"Vehicle Technical Inspection Certificate (Fahas)\",\n", | |
| " \"شهادة الفحص الفني الدوري للسيارات\",\n", | |
| " \"Civil Defense Certificate\",\n", | |
| " \"شهادة سلامة من الدفاع المدني\",\n", | |
| " \"Exit/Re-entry Visa Document\",\n", | |
| " \"تأشيرة خروج وعودة\",\n", | |
| " \"Qiwa Platform Employment Contract\",\n", | |
| " \"عقد عمل منصة قوى\",\n", | |
| " \"Saudi Council of Engineers Membership\",\n", | |
| " \"عضوية الهيئة السعودية للمهندسين\",\n", | |
| " \"UAE Emirates ID Card\",\n", | |
| " \"بطاقة الهوية الإماراتية\",\n", | |
| " \"Egyptian National ID Card\",\n", | |
| " \"بطاقة الرقم القومي المصرية\",\n", | |
| " \"Kuwaiti Civil ID\",\n", | |
| " \"البطاقة المدنية الكويتية\",\n", | |
| " \"Jordanian Passport\",\n", | |
| " \"جواز سفر أردني\",\n", | |
| " \"Oman Resident Card\",\n", | |
| " \"بطاقة مقيم عمان\",\n", | |
| " \"Qatari ID Card (QID)\",\n", | |
| " \"البطاقة الشخصية القطرية\",\n", | |
| " \"Bahraini CPR Card\",\n", | |
| " \"بطاقة الهوية البحرينية\",\n", | |
| " \"Egyptian Birth Certificate\",\n", | |
| " \"شهادة ميلاد مصرية\",\n", | |
| " \"Lebanese Identity Card\",\n", | |
| " \"بطاقة الهوية اللبنانية\",\n", | |
| " \"Moroccan National Identity Card\",\n", | |
| " \"بطاقة التعريف الوطنية المغربية\",\n", | |
| " \"Saudi Electricity Company (SEC) Bill\",\n", | |
| " \"فاتورة الشركة السعودية للكهرباء\",\n", | |
| " \"National Water Company (NWC) Bill\",\n", | |
| " \"فاتورة شركة المياه الوطنية\",\n", | |
| " \"STC Postpaid Bill\",\n", | |
| " \"فاتورة جوال stc مفوتر\",\n", | |
| " \"Mobily Fiber Bill\",\n", | |
| " \"فاتورة موبايلي فايبر\",\n", | |
| " \"Zain Invoice\",\n", | |
| " \"فاتورة زين\",\n", | |
| " \"Jarir Bookstore Receipt\",\n", | |
| " \"فاتورة مكتبة جرير\",\n", | |
| " \"Panda Hypermarket Receipt\",\n", | |
| " \"فاتورة بنده\",\n", | |
| " \"SACO Hardware Store Invoice\",\n", | |
| " \"فاتورة ساكو\",\n", | |
| " \"Gas Cylinder Refill Receipt\",\n", | |
| " \"إيصال تعبئة غاز\",\n", | |
| " \"Bill of Lading\",\n", | |
| " \"بوليصة شحن بحري\",\n", | |
| " \"Air Waybill (AWB)\",\n", | |
| " \"بوليصة شحن جوي\",\n", | |
| " \"Certificate of Origin\",\n", | |
| " \"شهادة منشأ\",\n", | |
| " \"Customs Declaration Form\",\n", | |
| " \"نموذج بيان جمركي\",\n", | |
| " \"Commercial Invoice for Shipment\",\n", | |
| " \"فاتورة تجارية للشحن\",\n", | |
| " \"Packing List\",\n", | |
| " \"قائمة تعبئة\",\n", | |
| " \"Aramex Shipment Receipt\",\n", | |
| " \"إيصال شحنة أرامكس\",\n", | |
| " \"Saudi Post (SPL) Label\",\n", | |
| " \"ملصق شحنة سبل\",\n", | |
| " \"Delivery Note\",\n", | |
| " \"سند استلام بضاعة\",\n", | |
| " \"Medical Report\",\n", | |
| " \"تقرير طبي\",\n", | |
| " \"Doctor's Prescription\",\n", | |
| " \"وصفة طبية\",\n", | |
| " \"Laboratory Test Results\",\n", | |
| " \"نتائج تحاليل مخبرية\",\n", | |
| " \"Bupa Medical Insurance Card\",\n", | |
| " \"بطاقة تأمين بوبا\",\n", | |
| " \"Tawuniya Vehicle Insurance Policy\",\n", | |
| " \"وثيقة تأمين مركبة التعاونية\",\n", | |
| " \"Medical Insurance Claim Form\",\n", | |
| " \"نموذج مطالبة تأمين طبي\",\n", | |
| " \"Hospital Discharge Summary\",\n", | |
| " \"ملخص خروج من المستشفى\",\n", | |
| " \"Vaccination Certificate\",\n", | |
| " \"شهادة تطعيم\",\n", | |
| " \"University Admission Letter\",\n", | |
| " \"خطاب قبول جامعي\",\n", | |
| " \"Academic Transcript\",\n", | |
| " \"كشف درجات أكاديمي\",\n", | |
| " \"Attested Degree Certificate\",\n", | |
| " \"شهادة جامعية مصدقة\",\n", | |
| " \"Experience Certificate\",\n", | |
| " \"شهادة خبرة\",\n", | |
| " \"Letter of Recommendation\",\n", | |
| " \"خطاب توصية\",\n", | |
| " \"Professional Training Certificate\",\n", | |
| " \"شهادة دورة تدريبية\",\n", | |
| " \"School Report Card\",\n", | |
| " \"شهادة درجات مدرسية\",\n", | |
| " \"Resignation Letter\",\n", | |
| " \"خطاب استقالة\",\n", | |
| " \"Professional CV/Resume\",\n", | |
| " \"سيرة ذاتية احترافية بالعربي\",\n", | |
| " \"Ejar Tenancy Contract\",\n", | |
| " \"عقد إيجار منصة إيجار\",\n", | |
| " \"Real Estate Title Deed (Sak)\",\n", | |
| " \"صك ملكية عقاري\",\n", | |
| " \"Property Sale and Purchase Agreement\",\n", | |
| " \"عقد بيع وشراء عقار\",\n", | |
| " \"Municipality Fee Receipt\",\n", | |
| " \"إيصال رسوم بلدية\",\n", | |
| " \"Property Handover Form\",\n", | |
| " \"محضر استلام عقار\",\n", | |
| " \"Real Estate Evaluation Report\",\n", | |
| " \"تقرير تقييم عقاري\",\n", | |
| " \"Police Report\",\n", | |
| " \"محضر شرطة\",\n", | |
| " \"Official Government Form\",\n", | |
| " \"نموذج تعبئة حكومي\",\n", | |
| " \"Product Warranty Card\",\n", | |
| " \"بطاقة ضمان منتج\",\n", | |
| " \"Car Rental Agreement\",\n", | |
| " \"عقد تأجير سيارة\",\n", | |
| " \"Hotel Booking Confirmation\",\n", | |
| " \"تأكيد حجز فندق\",\n", | |
| " \"Flight Ticket Itinerary\",\n", | |
| " \"تذكرة طيران\",\n", | |
| " \"Event Entry Ticket\",\n", | |
| " \"تذكرة دخول فعالية\",\n", | |
| " \"Government Tender Document\",\n", | |
| " \"وثيقة مناقصة حكومية\",\n", | |
| " \"Official Gazette Publication\",\n", | |
| " \"منشور في الجريدة الرسمية\",\n", | |
| " \"Scanned Application Form\",\n", | |
| " \"نموذج طلب مسح ضوئي\",\n", | |
| " \"Official Receipt Voucher\",\n", | |
| " \"سند قبض رسمي\",\n", | |
| " \"Payment Voucher\",\n", | |
| " \"سند صرف\",\n", | |
| " \"Old Typed Letter\",\n", | |
| " \"خطاب قديم مطبوع بالآلة الكاتبة\",\n", | |
| " \"Manual Entry Logbook Page\",\n", | |
| " \"صفحة سجل يدوي\",\n", | |
| "]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "0b5d108f", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "about to run threadpool\n", | |
| "Starting PDF search with ThreadPool...\n", | |
| "\n", | |
| "Found 50 PDF results\n", | |
| "Please respond to the prompt to download images to downloaded_images/article of association\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "application/vnd.jupyter.widget-view+json": { | |
| "model_id": "e390add0f3ad45daaf47557432730927", | |
| "version_major": 2, | |
| "version_minor": 0 | |
| }, | |
| "text/plain": [ | |
| "Downloading images: 0%| | 0/50 [00:00<?, ?it/s]" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Failed to download image 2 from https://www.investopedia.com/thmb/mNsZ7GpPb9dSU0174WO74XajR7I=/1500x0/filters:no_upscale():max_bytes(150000):strip_icc()/TermDefinitions_ArticlesofAssociation_3-2-10e5a0c7fc704412be23d75c5bff5a0e.jpg: 460 Client Error: for url: https://www.investopedia.com/thmb/mNsZ7GpPb9dSU0174WO74XajR7I=/1500x0/filters:no_upscale():max_bytes(150000):strip_icc()/TermDefinitions_ArticlesofAssociation_3-2-10e5a0c7fc704412be23d75c5bff5a0e.jpg\n", | |
| "Downloaded image_5.png\n", | |
| "Downloaded image_8.png\n", | |
| "Downloaded image_7.png\n", | |
| "Failed to download image 6 from https://www.sec.gov/Archives/edgar/data/1836517/000110465920139241/tm2038331d2_ex3-1img002.jpg: 403 Client Error: Forbidden for url: https://www.sec.gov/Archives/edgar/data/1836517/000110465920139241/tm2038331d2_ex3-1img002.jpg\n", | |
| "Failed to download image 11 from https://www.researchgate.net/publication/270505380/figure/fig4/AS:392032309792785@1470479194399/Front-piece-of-the-original-Memorandum-and-Articles-of-Association-that-created-the.png: 403 Client Error: Forbidden for url: https://www.researchgate.net/publication/270505380/figure/fig4/AS:392032309792785@1470479194399/Front-piece-of-the-original-Memorandum-and-Articles-of-Association-that-created-the.png\n", | |
| "Downloaded image_9.webp\n", | |
| "Failed to download image 12 from https://cdn.corporatefinanceinstitute.com/assets/articles-of-association-1-1024x684.jpeg: 451 Client Error: for url: https://cdn.corporatefinanceinstitute.com/assets/articles-of-association-1-1024x684.jpeg\n", | |
| "Downloaded image_3.jpg\n", | |
| "Downloaded image_10.png\n", | |
| "Failed to download image 16 from https://cdn.corporatefinanceinstitute.com/assets/articles-of-association-1.jpeg: 451 Client Error: for url: https://cdn.corporatefinanceinstitute.com/assets/articles-of-association-1.jpeg\n", | |
| "Downloaded image_1.png\n", | |
| "Downloaded image_18.jpg\n", | |
| "Downloaded image_13.jpg\n", | |
| "Downloaded image_14.jpg\n", | |
| "Downloaded image_17.jpg\n", | |
| "Downloaded image_15.jpg\n", | |
| "Failed to download image 21 from https://www.sec.gov/Archives/edgar/data/1116134/000119312508209835/g90587ex1_5pg3.jpg: 403 Client Error: Forbidden for url: https://www.sec.gov/Archives/edgar/data/1116134/000119312508209835/g90587ex1_5pg3.jpg\n", | |
| "Failed to download image 23 from http://hlpoa.org/wp-content/uploads/2014/12/Incorporation_004.jpeg: 406 Client Error: Not Acceptable for url: http://hlpoa.org/wp-content/uploads/2014/12/Incorporation_004.jpeg\n", | |
| "Downloaded image_19.gif\n", | |
| "Downloaded image_20.png\n", | |
| "Downloaded image_24.jpg\n", | |
| "Failed to download image 27 from https://www.studypool.com/documents/preview/131862844: 403 Client Error: Forbidden for url: https://www.studypool.com/documents/preview/131862844\n", | |
| "Downloaded image_4.jpg\n", | |
| "Downloaded image_26.jpg\n", | |
| "Downloaded image_25.png\n", | |
| "Failed to download image 31 from https://www.sec.gov/Archives/edgar/data/1026662/000110465911066853/g305921kei001.gif: 403 Client Error: Forbidden for url: https://www.sec.gov/Archives/edgar/data/1026662/000110465911066853/g305921kei001.gif\n", | |
| "Failed to download image 33 from https://cdn.educba.com/academy/wp-content/uploads/2021/11/Article-of-Association-1.png: 403 Client Error: Forbidden for url: https://cdn.educba.com/academy/wp-content/uploads/2021/11/Article-of-Association-1.png\n", | |
| "Downloaded image_35.jpg\n", | |
| "Downloaded image_32.jpg\n", | |
| "Downloaded image_29.jpg\n", | |
| "Failed to download image 38 from https://www.sec.gov/Archives/edgar/data/769218/000119312511143496/g187586g69u17.jpg: 403 Client Error: Forbidden for url: https://www.sec.gov/Archives/edgar/data/769218/000119312511143496/g187586g69u17.jpg\n", | |
| "Downloaded image_36.jpg\n", | |
| "Downloaded image_41.jpg\n", | |
| "Downloaded image_34.png\n", | |
| "Downloaded image_30.jpg\n", | |
| "Downloaded image_37.webp\n", | |
| "Failed to download image 40 from https://www.sec.gov/Archives/edgar/data/701818/000104746918003902/g124353lg01i002.gif: 403 Client Error: Forbidden for url: https://www.sec.gov/Archives/edgar/data/701818/000104746918003902/g124353lg01i002.gif\n", | |
| "Failed to download image 43 from https://www.sec.gov/Archives/edgar/data/1116134/000119312504164097/g46257new.jpg: 403 Client Error: Forbidden for url: https://www.sec.gov/Archives/edgar/data/1116134/000119312504164097/g46257new.jpg\n", | |
| "Failed to download image 45 from https://www.sec.gov/Archives/edgar/data/1116134/000119312508209835/g90587ex1_5pg4.jpg: 403 Client Error: Forbidden for url: https://www.sec.gov/Archives/edgar/data/1116134/000119312508209835/g90587ex1_5pg4.jpg\n", | |
| "Downloaded image_28.jpg\n", | |
| "Downloaded image_46.jpg\n", | |
| "Downloaded image_49.jpg\n", | |
| "Downloaded image_42.jpg\n", | |
| "Downloaded image_47.jpg\n", | |
| "Failed to download image 44 from https://bnwjournal.com/wp-content/uploads/2020/05/Articles-of-Association-1024x1024.png: 406 Client Error: Not Acceptable for url: https://bnwjournal.com/wp-content/uploads/2020/05/Articles-of-Association-1024x1024.png\n", | |
| "Downloaded image_39.jpg\n", | |
| "Downloaded image_50.jpg\n", | |
| "Downloaded image_48.png\n", | |
| "Downloaded image_22.jpg\n", | |
| "Results saved to pdf_search_article_of_association.json\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "application/vnd.jupyter.widget-view+json": { | |
| "model_id": "28b6833975064d31b9815e13326abff4", | |
| "version_major": 2, | |
| "version_minor": 0 | |
| }, | |
| "text/plain": [ | |
| "Downloading PDFs: 0%| | 0/50 [00:00<?, ?it/s]" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Failed to download PDF 2 from https://www.scc.virginia.gov/media/sccvirginiagov-home/business-home/start-a-new-business/business-types/llc1011.pdf: 403 Client Error: Forbidden for url: https://www.scc.virginia.gov/media/sccvirginiagov-home/business-home/start-a-new-business/business-types/llc1011.pdf\n", | |
| "Failed to download PDF 3 from https://sos.ga.gov/sites/default/files/forms/Filing%20Template%20-%20Articles%20of%20Organization%20for%20LLC%20%28CD%20030%29.pdf: 403 Client Error: Forbidden for url: https://sos.ga.gov/sites/default/files/forms/Filing%20Template%20-%20Articles%20of%20Organization%20for%20LLC%20%28CD%20030%29.pdf\n", | |
| "Failed to download PDF 5 from https://www.azcc.gov/docs/default-source/corps-files/forms/l010-articles-of-organization.pdf: HTTPSConnectionPool(host='www.azcc.gov', port=443): Max retries exceeded with url: /docs/default-source/corps-files/forms/l010-articles-of-organization.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x10728a720>: Failed to establish a new connection: [Errno 61] Connection refused'))\n", | |
| "Downloaded PDF: Articles_of_Association_of_the_Financial_Stability_1.pdf\n", | |
| "Failed to download PDF 9 from https://www.ohiosos.gov/globalassets/business/forms/610_web.pdf: 403 Client Error: Forbidden for url: https://www.ohiosos.gov/globalassets/business/forms/610_web.pdf\n", | |
| "Downloaded PDF: INSTRUCTIONS_ARTICLES_OF_INCORPORATION_OF_A_PROFIT_12.pdf\n", | |
| "Downloaded PDF: ARTICLES_OF_ORGANIZATION_4.pdf\n", | |
| "Downloaded PDF: INSTRUCTIONS_FOR_FILING_ARTICLES_OF_INCORPORATION_10.pdf\n", | |
| "Downloaded PDF: INSTRUCTIONS_FOR_FILING_ARTICLES_OF_ORGANIZATION_8.pdf\n", | |
| "Downloaded PDF: BUSINESS_REGISTRATION_DIVISION_PO_BOX_29622_Raleig_7.pdf\n", | |
| "Downloaded PDF: DOS-1336_11.pdf\n", | |
| "Downloaded PDF: TRANSMITTAL_INFORMATION_For_All_Business_Filings_P_14.pdf\n", | |
| "Downloaded PDF: This_is_a_translation_into_English_of_the_original_16.pdf\n", | |
| "Downloaded PDF: ARTICLES_OF_ASSOCIATION_ASTRAZENECA_PLC_20.pdf\n", | |
| "Downloaded PDF: Certificate_of_Incorporation_17.pdf\n", | |
| "Failed to download PDF 6 from https://dfi.wi.gov/Documents/BusinessServices/BusinessEntities/Forms/CORP502.pdf: HTTPSConnectionPool(host='dfi.wi.gov', port=443): Max retries exceeded with url: /Documents/BusinessServices/BusinessEntities/Forms/CORP502.pdf (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x107289c10>: Failed to resolve 'dfi.wi.gov' ([Errno 8] nodename nor servname provided, or not known)\"))\n", | |
| "Downloaded PDF: LLC_1_-_Articles_of_Organization_21.pdf\n", | |
| "Downloaded PDF: Articles_of_Organization_-_Limited_Liability_Compa_13.pdf\n", | |
| "Downloaded PDF: Office_of_the_Minnesota_Secretary_of_State_Minneso_18.pdf\n", | |
| "Downloaded PDF: DC-1_Articles_of_Incorporation_7-2010_15.pdf\n", | |
| "Downloaded PDF: Instructions__Articles_of_Association_22.pdf\n", | |
| "Downloaded PDF: THE_COMPANIES_ACT_2006_COMPANY_LIMITED_BY_GUARANTE_25.pdf\n", | |
| "Failed to download PDF 26 from https://www.ilsos.gov/publications/pdf_publications/nfp10210.pdf: 403 Client Error: Forbidden for url: https://www.ilsos.gov/publications/pdf_publications/nfp10210.pdf\n", | |
| "Downloaded PDF: Articles_of_Association_of_Sulzer_Ltd_Winterthur_24.pdf\n", | |
| "Downloaded PDF: articles_of_association_-_InterContinental_Hotels__28.pdf\n", | |
| "Failed to download PDF 31 from https://forms.in.gov/download.aspx?id=5674: 403 Client Error: Forbidden for url: https://forms.in.gov/download.aspx?id=5674\n", | |
| "Downloaded PDF: Articles_of_Association_of_Straumann_Holding_AG_in_32.pdf\n", | |
| "Downloaded PDF: Limited_Liability_Company_-_Articles_of_Organizati_29.pdf\n", | |
| "Downloaded PDF: Articles_of_Association_27.pdf\n", | |
| "Downloaded PDF: Memorandum_and_Articles_of_Association_35.pdf\n", | |
| "Downloaded PDF: Articles_of_Incorporation__Nonprofit_30.pdf\n", | |
| "Downloaded PDF: CORP_52_-_Articles_of_Incorporation_of_a_Nonprofit_34.pdf\n", | |
| "Downloaded PDF: LLC-Articles_of_Organization_19.pdf\n", | |
| "Downloaded PDF: Articles_of_Association_37.pdf\n", | |
| "Downloaded PDF: Articles_of_Organization_36.pdf\n", | |
| "Downloaded PDF: ARTICLES_OF_ASSOCIATION_Partners_Group_Holding_AG_38.pdf\n", | |
| "Downloaded PDF: Articles_of_Association_of_Volkswagen_Aktiengesell_42.pdf\n", | |
| "Downloaded PDF: Companies_Act_and_Articles_of_Association_Guidance_44.pdf\n", | |
| "Downloaded PDF: Sample_Memorandum__Articles_of_Association_for_Pri_43.pdf\n", | |
| "Downloaded PDF: ARTICLES_OF_ASSOCIATION_46.pdf\n", | |
| "Downloaded PDF: Articles_of_Association_of_Sonova_Holding_AG_I_Gen_47.pdf\n", | |
| "Downloaded PDF: Articles_of_Association_for_Aktiebolaget_Volvo_45.pdf\n", | |
| "Downloaded PDF: international_code_of_conduct_for_private_security_39.pdf\n", | |
| "Downloaded PDF: ARTICLES_OF_ASSOCIATION_OF_DAVIDE_CAMPARI-MILANO_N_49.pdf\n", | |
| "Downloaded PDF: Licensing_Manual_Articles_of_Association_Charter_a_40.pdf\n", | |
| "Downloaded PDF: Heineken_NV_2020_Articles_of_Association_50.pdf\n", | |
| "Failed to download PDF 23 from https://businessfilings.sc.gov/BusinessFiling/Entity/DownloadForm?formName=F0006&entityType=2&filingType=Articles%20of%20Organization: HTTPSConnectionPool(host='businessfilings.sc.gov', port=443): Max retries exceeded with url: /BusinessFiling/Entity/DownloadForm?formName=F0006&entityType=2&filingType=Articles%20of%20Organization (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x10728c140>, 'Connection to businessfilings.sc.gov timed out. (connect timeout=30)'))\n", | |
| "Failed to download PDF 33 from https://apps.dfi.wi.gov/r/Content/CorpForm2: HTTPSConnectionPool(host='apps.dfi.wi.gov', port=443): Max retries exceeded with url: /r/Content/CorpForm2 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x10727bcb0>, 'Connection to apps.dfi.wi.gov timed out. (connect timeout=30)'))\n", | |
| "Failed to download PDF 41 from https://investors.st.com/static-files/0380956b-4951-406e-8dbd-02bd68105662: HTTPSConnectionPool(host='investors.st.com', port=443): Read timed out. (read timeout=30)\n", | |
| "Failed to download PDF 48 from https://www.hud.gov/sites/dfiles/OCHCO/documents/fha1402.pdf: HTTPSConnectionPool(host='www.hud.gov', port=443): Max retries exceeded with url: /sites/dfiles/OCHCO/documents/fha1402.pdf (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x1086f0740>, 'Connection to www.hud.gov timed out. (connect timeout=30)'))\n", | |
| "Downloaded 50 images and 50 PDFs in 50.74 seconds\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "\n", | |
| "\n", | |
| "\n", | |
| "query = \"article of association\"\n", | |
| "pdf_query = query\n", | |
| "num_images = 50\n", | |
| "num_pdf_results = 50\n", | |
| "skip_show_and_just_force_download = True\n", | |
| "\n", | |
| "\n", | |
| "results = search_google_images(\n", | |
| " query,\n", | |
| " num_images=num_images,\n", | |
| " gif=False,\n", | |
| ")\n", | |
| "\n", | |
| "\n", | |
| "pdf_results = search_google_pdfs(\n", | |
| " search_query=pdf_query,\n", | |
| " num_results=num_pdf_results\n", | |
| ")\n", | |
| "\n", | |
| "print(f\"\\nFound {len(pdf_results)} PDF results\")\n", | |
| "\n", | |
| "\n", | |
| "if not skip_show_and_just_force_download:\n", | |
| " # Display the PDF results in a nice table, then the image grid\n", | |
| " display_pdf_results(pdf_results)\n", | |
| " # display_image_grid([result['image']['thumbnailLink'] for result in results])\n", | |
| " display_image_grid(\n", | |
| " [result[\"link\"] for result in results],\n", | |
| " [result[\"image\"][\"thumbnailLink\"] for result in results],\n", | |
| " )\n", | |
| "\n", | |
| "\n", | |
| "save_dir = f\"downloaded_images/{query}\"\n", | |
| "print(f\"Please respond to the prompt to download images to {save_dir}\")\n", | |
| "if skip_show_and_just_force_download or input('Download images? (Y/n): ').lower()[:1] != 'n':\n", | |
| " tic = time.time()\n", | |
| " os.makedirs(save_dir, exist_ok=True)\n", | |
| "\n", | |
| " with open(os.path.join(save_dir, \"results.json\"), \"w\") as f:\n", | |
| " json.dump(results, f)\n", | |
| "\n", | |
| " args_list = [(result, idx, save_dir) for idx, result in enumerate(results, start=1)]\n", | |
| " with ThreadPool() as pool:\n", | |
| " list(tqdm(pool.imap(download_single_image, args_list), desc=\"Downloading images\", total=len(args_list)))\n", | |
| "\n", | |
| "\n", | |
| " # Get just the URLs (NOTE: pdf_urls is not referenced again in this cell)\n", | |
| " pdf_urls = get_pdf_urls(pdf_results)\n", | |
| "\n", | |
| " # Save results to file\n", | |
| " save_pdf_results(pdf_results, f\"pdf_search_{pdf_query.replace(' ', '_')}.json\")\n", | |
| "\n", | |
| " # Download PDFs using ThreadPool\n", | |
| " pdf_args_list = [(result, idx, save_dir) for idx, result in enumerate(pdf_results, start=1)]\n", | |
| " with ThreadPool() as pool:\n", | |
| " list(tqdm(pool.imap(download_single_pdf, pdf_args_list), desc=\"Downloading PDFs\", total=len(pdf_args_list)))\n", | |
| " toc = time.time()\n", | |
| " print(f\"Downloaded {len(results)} images and {len(pdf_results)} PDFs in {toc - tic:.2f} seconds\")\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "6ab4877f", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "0e512076", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "py312", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.12.11" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment