Last active: March 5, 2025, 01:00
Selenium in Google Colab: crawling SPA pages with Selenium and BeautifulSoup
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b45e2481",
   "metadata": {},
   "outputs": [],
   "source": [
| "%%shell\n", | |
| "sudo apt -y update\n", | |
| "sudo apt install -y wget curl unzip\n", | |
| "wget http://archive.ubuntu.com/ubuntu/pool/main/libu/libu2f-host/libu2f-udev_1.1.4-1_all.deb\n", | |
| "dpkg -i libu2f-udev_1.1.4-1_all.deb\n", | |
| "wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb\n", | |
| "dpkg -i google-chrome-stable_current_amd64.deb\n", | |
| "CHROME_DRIVER_VERSION=$(curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE)\n", | |
| "wget -N https://chromedriver.storage.googleapis.com/$CHROME_DRIVER_VERSION/chromedriver_linux64.zip -P /tmp/\n", | |
| "unzip -o /tmp/chromedriver_linux64.zip -d /tmp/\n", | |
| "chmod +x /tmp/chromedriver\n", | |
| "mv /tmp/chromedriver /usr/local/bin/chromedriver\n", | |
| "pip install selenium" | |
   ]
  },
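  {
   "cell_type": "markdown",
   "id": "sanity-md",
   "metadata": {},
   "source": [
    "A quick sanity check, assuming the setup cell above completed without errors: Selenium will only start a session when the Chrome and chromedriver major versions match."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "sanity-check",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Confirm both binaries are installed and on PATH; major versions should match.\n",
    "!google-chrome --version\n",
    "!chromedriver --version\n",
    "!which chromedriver"
   ]
  },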
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c5ebc1e1",
   "metadata": {},
   "outputs": [],
   "source": [
| "!pip install chromedriver-autoinstaller\n", | |
| "import sys\n", | |
| "sys.path.insert(0, '/usr/lib/chromium-browser/chromedriver')\n", | |
| "\n", | |
| "import requests\n", | |
| "from bs4 import BeautifulSoup\n", | |
| "import time\n", | |
| "from collections import deque\n", | |
| "from selenium import webdriver\n", | |
| "from selenium.webdriver.chrome.service import Service\n", | |
| "from selenium.webdriver.chrome.options import Options\n", | |
| "import chromedriver_autoinstaller\n", | |
| "\n", | |
| "def get_rendered_html(url):\n", | |
| " options = Options()\n", | |
| " options.add_argument('--headless') # Run in headless mode\n", | |
| " options.add_argument('--no-sandbox')\n", | |
| " options.add_argument('--disable-dev-shm-usage')\n", | |
| " chromedriver_autoinstaller.install()\n", | |
| " driver = webdriver.Chrome(options=options)\n", | |
| " driver.get(url)\n", | |
| " html = driver.page_source\n", | |
| " driver.quit()\n", | |
| " return html\n", | |
| "\n", | |
| "def dfs_crawl(start_link, max_depth):\n", | |
| " visited = set()\n", | |
| " stack = [(start_link, 0)]\n", | |
| " start_time = time.time()\n", | |
| " while stack:\n", | |
| " url, depth = stack.pop()\n", | |
| " if depth > max_depth:\n", | |
| " continue\n", | |
| " if url in visited:\n", | |
| " continue\n", | |
| " visited.add(url)\n", | |
| " try:\n", | |
| " html = get_rendered_html(url)\n", | |
| " soup = BeautifulSoup(html, 'html.parser')\n", | |
| " print(f\"DFS - Crawling: {url} at depth {depth}\")\n", | |
| " for link in soup.find_all('a', href=True):\n", | |
| " absolute_url = link['href']\n", | |
| " if absolute_url.startswith('http'):\n", | |
| " stack.append((absolute_url, depth + 1))\n", | |
| " except Exception as e:\n", | |
| " print(f\"DFS - Error crawling {url}: {e}\")\n", | |
| " continue\n", | |
| " end_time = time.time()\n", | |
| " print(f\"DFS - Total execution time: {end_time - start_time:.2f} seconds\")\n", | |
| " print(f\"DFS - Total URLs crawled: {len(visited)}\")\n", | |
| "\n", | |
| "def bfs_crawl(start_link, max_depth):\n", | |
| " visited = set()\n", | |
| " queue = deque([(start_link, 0)])\n", | |
| " start_time = time.time()\n", | |
| " while queue:\n", | |
| " url, depth = queue.popleft()\n", | |
| " if depth > max_depth:\n", | |
| " continue\n", | |
| " if url in visited:\n", | |
| " continue\n", | |
| " visited.add(url)\n", | |
| " try:\n", | |
| " html = get_rendered_html(url)\n", | |
| " soup = BeautifulSoup(html, 'html.parser')\n", | |
| " print(f\"BFS - Crawling: {url} at depth {depth}\")\n", | |
| " for link in soup.find_all('a', href=True):\n", | |
| " absolute_url = link['href']\n", | |
| " if absolute_url.startswith('http'):\n", | |
| " queue.append((absolute_url, depth + 1))\n", | |
| " except Exception as e:\n", | |
| " print(f\"BFS - Error crawling {url}: {e}\")\n", | |
| " continue\n", | |
| " end_time = time.time()\n", | |
| " print(f\"BFS - Total execution time: {end_time - start_time:.2f} seconds\")\n", | |
| " print(f\"BFS - Total URLs crawled: {len(visited)}\")\n", | |
| "\n", | |
| "# Example usage\n", | |
| "start_url = \"__URL__\" # Replace with your desired starting URL\n", | |
| "max_depth_val = 2\n", | |
| "\n", | |
| "print(\"DFS Crawling:\")\n", | |
| "dfs_crawl(start_url, max_depth_val)\n", | |
| "\n", | |
| "print(\"\\nBFS Crawling:\")\n", | |
| "bfs_crawl(start_url, max_depth_val)" | |
   ]
  },
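  {
   "cell_type": "markdown",
   "id": "same-domain-md",
   "metadata": {},
   "source": [
    "A minimal follow-up sketch, not part of the original crawlers: the functions above launch a fresh Chrome process for every page and follow absolute links to any site. The variant below reuses a single headless browser for the whole crawl, resolves relative hrefs with urljoin, and stays on the start URL's domain."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "same-domain-code",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: same-domain BFS crawl reusing one headless Chrome instance.\n",
    "# Relies on imports from the cell above (deque, BeautifulSoup, webdriver, Options).\n",
    "from urllib.parse import urljoin, urlparse\n",
    "\n",
    "def bfs_crawl_same_domain(start_link, max_depth):\n",
    "    options = Options()\n",
    "    options.add_argument('--headless')\n",
    "    options.add_argument('--no-sandbox')\n",
    "    options.add_argument('--disable-dev-shm-usage')\n",
    "    chromedriver_autoinstaller.install()\n",
    "    driver = webdriver.Chrome(options=options)  # one browser for the whole crawl\n",
    "    domain = urlparse(start_link).netloc\n",
    "    visited = set()\n",
    "    queue = deque([(start_link, 0)])\n",
    "    try:\n",
    "        while queue:\n",
    "            url, depth = queue.popleft()\n",
    "            if depth > max_depth or url in visited:\n",
    "                continue\n",
    "            visited.add(url)\n",
    "            try:\n",
    "                driver.get(url)\n",
    "                soup = BeautifulSoup(driver.page_source, 'html.parser')\n",
    "                print(f\"Crawling: {url} at depth {depth}\")\n",
    "                for link in soup.find_all('a', href=True):\n",
    "                    absolute_url = urljoin(url, link['href'])  # resolves relative hrefs too\n",
    "                    if urlparse(absolute_url).netloc == domain:\n",
    "                        queue.append((absolute_url, depth + 1))\n",
    "            except Exception as e:\n",
    "                print(f\"Error crawling {url}: {e}\")\n",
    "    finally:\n",
    "        driver.quit()\n",
    "    return visited\n",
    "\n",
    "# Example with a hypothetical URL:\n",
    "# bfs_crawl_same_domain('https://example.com', 1)"
   ]
  }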
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}