Khuirul-Huda · March 5, 2025 01:00
diff --git a/Selenium in google colab, crawling SPA pages using Selenium and BeautifulSoup.ipynb b/Selenium in google colab, crawling SPA pages using Selenium and BeautifulSoup.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b45e2481",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%shell\n",
    "sudo apt -y update\n",
    "sudo apt install -y wget curl unzip\n",
    "wget http://archive.ubuntu.com/ubuntu/pool/main/libu/libu2f-host/libu2f-udev_1.1.4-1_all.deb\n",
    "dpkg -i libu2f-udev_1.1.4-1_all.deb\n",
    "wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb\n",
    "dpkg -i google-chrome-stable_current_amd64.deb\n",
    "CHROME_DRIVER_VERSION=$(curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE)\n",
    "wget -N https://chromedriver.storage.googleapis.com/$CHROME_DRIVER_VERSION/chromedriver_linux64.zip -P /tmp/\n",
    "unzip -o /tmp/chromedriver_linux64.zip -d /tmp/\n",
    "chmod +x /tmp/chromedriver\n",
    "mv /tmp/chromedriver /usr/local/bin/chromedriver\n",
    "pip install selenium"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c5ebc1e1",
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install chromedriver-autoinstaller\n",
    "import sys\n",
    "sys.path.insert(0, '/usr/lib/chromium-browser/chromedriver')\n",
    "\n",
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "import time\n",
    "from collections import deque\n",
    "from selenium import webdriver\n",
    "from selenium.webdriver.chrome.service import Service\n",
    "from selenium.webdriver.chrome.options import Options\n",
    "import chromedriver_autoinstaller\n",
    "\n",
    "def get_rendered_html(url):\n",
    "    options = Options()\n",
    "    options.add_argument('--headless')  # Run in headless mode\n",
    "    options.add_argument('--no-sandbox')\n",
    "    options.add_argument('--disable-dev-shm-usage')\n",
    "    chromedriver_autoinstaller.install()\n",
    "    driver = webdriver.Chrome(options=options)\n",
    "    driver.get(url)\n",
    "    html = driver.page_source\n",
    "    driver.quit()\n",
    "    return html\n",
    "\n",
    "def dfs_crawl(start_link, max_depth):\n",
    "    visited = set()\n",
    "    stack = [(start_link, 0)]\n",
    "    start_time = time.time()\n",
    "    while stack:\n",
    "        url, depth = stack.pop()\n",
    "        if depth > max_depth:\n",
    "            continue\n",
    "        if url in visited:\n",
    "            continue\n",
    "        visited.add(url)\n",
    "        try:\n",
    "            html = get_rendered_html(url)\n",
    "            soup = BeautifulSoup(html, 'html.parser')\n",
    "            print(f\"DFS - Crawling: {url} at depth {depth}\")\n",
    "            for link in soup.find_all('a', href=True):\n",
    "                absolute_url = link['href']\n",
    "                if absolute_url.startswith('http'):\n",
    "                    stack.append((absolute_url, depth + 1))\n",
    "        except Exception as e:\n",
    "            print(f\"DFS - Error crawling {url}: {e}\")\n",
    "            continue\n",
    "    end_time = time.time()\n",
    "    print(f\"DFS - Total execution time: {end_time - start_time:.2f} seconds\")\n",
    "    print(f\"DFS - Total URLs crawled: {len(visited)}\")\n",
    "\n",
    "def bfs_crawl(start_link, max_depth):\n",
    "    visited = set()\n",
    "    queue = deque([(start_link, 0)])\n",
    "    start_time = time.time()\n",
    "    while queue:\n",
    "        url, depth = queue.popleft()\n",
    "        if depth > max_depth:\n",
    "            continue\n",
    "        if url in visited:\n",
    "            continue\n",
    "        visited.add(url)\n",
    "        try:\n",
    "            html = get_rendered_html(url)\n",
    "            soup = BeautifulSoup(html, 'html.parser')\n",
    "            print(f\"BFS - Crawling: {url} at depth {depth}\")\n",
    "            for link in soup.find_all('a', href=True):\n",
    "                absolute_url = link['href']\n",
    "                if absolute_url.startswith('http'):\n",
    "                    queue.append((absolute_url, depth + 1))\n",
    "        except Exception as e:\n",
    "            print(f\"BFS - Error crawling {url}: {e}\")\n",
    "            continue\n",
    "    end_time = time.time()\n",
    "    print(f\"BFS - Total execution time: {end_time - start_time:.2f} seconds\")\n",
    "    print(f\"BFS - Total URLs crawled: {len(visited)}\")\n",
    "\n",
    "# Example usage\n",
    "start_url = \"__URL__\"  # Replace with your desired starting URL\n",
    "max_depth_val = 2\n",
    "\n",
    "print(\"DFS Crawling:\")\n",
    "dfs_crawl(start_url, max_depth_val)\n",
    "\n",
    "print(\"\\nBFS Crawling:\")\n",
    "bfs_crawl(start_url, max_depth_val)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "b45e2481",
	"metadata": {},
	"outputs": [],
	"source": [
	"%%shell\n",
	"sudo apt -y update\n",
	"sudo apt install -y wget curl unzip\n",
	"wget http://archive.ubuntu.com/ubuntu/pool/main/libu/libu2f-host/libu2f-udev_1.1.4-1_all.deb\n",
	"dpkg -i libu2f-udev_1.1.4-1_all.deb\n",
	"wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb\n",
	"dpkg -i google-chrome-stable_current_amd64.deb\n",
	"CHROME_DRIVER_VERSION=$(curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE)\n",
	"wget -N https://chromedriver.storage.googleapis.com/$CHROME_DRIVER_VERSION/chromedriver_linux64.zip -P /tmp/\n",
	"unzip -o /tmp/chromedriver_linux64.zip -d /tmp/\n",
	"chmod +x /tmp/chromedriver\n",
	"mv /tmp/chromedriver /usr/local/bin/chromedriver\n",
	"pip install selenium"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "c5ebc1e1",
	"metadata": {},
	"outputs": [],
	"source": [
	"!pip install chromedriver-autoinstaller\n",
	"import sys\n",
	"sys.path.insert(0, '/usr/lib/chromium-browser/chromedriver')\n",
	"\n",
	"import requests\n",
	"from bs4 import BeautifulSoup\n",
	"import time\n",
	"from collections import deque\n",
	"from selenium import webdriver\n",
	"from selenium.webdriver.chrome.service import Service\n",
	"from selenium.webdriver.chrome.options import Options\n",
	"import chromedriver_autoinstaller\n",
	"\n",
	"def get_rendered_html(url):\n",
	" options = Options()\n",
	" options.add_argument('--headless') # Run in headless mode\n",
	" options.add_argument('--no-sandbox')\n",
	" options.add_argument('--disable-dev-shm-usage')\n",
	" chromedriver_autoinstaller.install()\n",
	" driver = webdriver.Chrome(options=options)\n",
	" driver.get(url)\n",
	" html = driver.page_source\n",
	" driver.quit()\n",
	" return html\n",
	"\n",
	"def dfs_crawl(start_link, max_depth):\n",
	" visited = set()\n",
	" stack = [(start_link, 0)]\n",
	" start_time = time.time()\n",
	" while stack:\n",
	" url, depth = stack.pop()\n",
	" if depth > max_depth:\n",
	" continue\n",
	" if url in visited:\n",
	" continue\n",
	" visited.add(url)\n",
	" try:\n",
	" html = get_rendered_html(url)\n",
	" soup = BeautifulSoup(html, 'html.parser')\n",
	" print(f\"DFS - Crawling: {url} at depth {depth}\")\n",
	" for link in soup.find_all('a', href=True):\n",
	" absolute_url = link['href']\n",
	" if absolute_url.startswith('http'):\n",
	" stack.append((absolute_url, depth + 1))\n",
	" except Exception as e:\n",
	" print(f\"DFS - Error crawling {url}: {e}\")\n",
	" continue\n",
	" end_time = time.time()\n",
	" print(f\"DFS - Total execution time: {end_time - start_time:.2f} seconds\")\n",
	" print(f\"DFS - Total URLs crawled: {len(visited)}\")\n",
	"\n",
	"def bfs_crawl(start_link, max_depth):\n",
	" visited = set()\n",
	" queue = deque([(start_link, 0)])\n",
	" start_time = time.time()\n",
	" while queue:\n",
	" url, depth = queue.popleft()\n",
	" if depth > max_depth:\n",
	" continue\n",
	" if url in visited:\n",
	" continue\n",
	" visited.add(url)\n",
	" try:\n",
	" html = get_rendered_html(url)\n",
	" soup = BeautifulSoup(html, 'html.parser')\n",
	" print(f\"BFS - Crawling: {url} at depth {depth}\")\n",
	" for link in soup.find_all('a', href=True):\n",
	" absolute_url = link['href']\n",
	" if absolute_url.startswith('http'):\n",
	" queue.append((absolute_url, depth + 1))\n",
	" except Exception as e:\n",
	" print(f\"BFS - Error crawling {url}: {e}\")\n",
	" continue\n",
	" end_time = time.time()\n",
	" print(f\"BFS - Total execution time: {end_time - start_time:.2f} seconds\")\n",
	" print(f\"BFS - Total URLs crawled: {len(visited)}\")\n",
	"\n",
	"# Example usage\n",
	"start_url = \"__URL__\" # Replace with your desired starting URL\n",
	"max_depth_val = 2\n",
	"\n",
	"print(\"DFS Crawling:\")\n",
	"dfs_crawl(start_url, max_depth_val)\n",
	"\n",
	"print(\"\\nBFS Crawling:\")\n",
	"bfs_crawl(start_url, max_depth_val)"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.8.10"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}