Skip to content

Instantly share code, notes, and snippets.

@steve-kasica
Last active April 22, 2024 19:33
Show Gist options
  • Save steve-kasica/971e57b26a32cdcd4b465eb9664fd0bd to your computer and use it in GitHub Desktop.
Save steve-kasica/971e57b26a32cdcd4b465eb9664fd0bd to your computer and use it in GitHub Desktop.
How to build an interactive scraper in a Jupyter notebook
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"id": "4be886d2",
"metadata": {},
"source": [
"# Interactive webpage scraper in Jupyter notebooks\n",
"\n",
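"This notebook shows how to build an interactive scraper in a Jupyter notebook: an `ipywidgets` text box feeds a search query to a scraping function, and the scraped results are printed below it. This proof-of-concept uses [Outside's search feature](https://www.outsideonline.com/?s=outside)."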
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "1ca5f35f",
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"from bs4 import BeautifulSoup \n",
"from IPython.display import display, Markdown\n",
"import ipywidgets as widgets\n",
"import re\n",
"import json"
]
},
{
"cell_type": "code",
"execution_count": 57,
"id": "68547a37",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "916d7a80aee14e908fd17bf37301c60e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"VBox(children=(Text(value='Bigfoot', description='Search query', placeholder='Bigfoot'), Output()))"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def scrapeSearchResults(query):\n",
"    \"\"\"Scrape Outside's search results page for ``query``.\n",
"\n",
"    Requests ``https://www.outsideonline.com/?s=<query>`` and returns a list\n",
"    with one dict per ``div.c-block`` element. Each dict has the keys\n",
"    ``title``, ``dek``, ``href``, ``img``, ``original-date``,\n",
"    ``updated-date`` and ``author``; a field is ``None`` when the markup it\n",
"    is read from is absent from that block.\n",
"\n",
"    Raises ``requests.RequestException`` on connection errors, timeouts, or\n",
"    a 4xx/5xx response.\n",
"    \"\"\"\n",
"    def attr_of(node, name):\n",
"        # Attribute value, or None if the tag or the attribute is missing.\n",
"        return node.get(name) if node is not None else None\n",
"\n",
"    def datetime_of(node):\n",
"        # The date spans wrap a <time datetime=...> element.\n",
"        return attr_of(node.find(\"time\"), \"datetime\") if node is not None else None\n",
"\n",
"    # Spaces still become dashes, as before; passing the query through\n",
"    # `params` percent-encodes everything else (e.g. \"&\" or \"#\").\n",
"    r = requests.get(\n",
"        \"https://www.outsideonline.com/\",\n",
"        params={\"s\": query.replace(\" \", \"-\")},\n",
"        timeout=10,\n",
"    )\n",
"    r.raise_for_status()\n",
"\n",
"    # Name the parser explicitly so results do not depend on which optional\n",
"    # parsers happen to be installed.\n",
"    soup = BeautifulSoup(r.content, \"html.parser\")\n",
"\n",
"    articles = []\n",
"    for block in soup.find_all(\"div\", {\"class\": \"c-block\"}):\n",
"        img = block.find(\"img\", {\"class\": \"o-image\"})\n",
"        author = block.find(\"span\", {\"class\": \"o-meta__author\"})\n",
"        heading = block.find(\"a\", {\"class\": \"o-heading__link\"})\n",
"        dek = block.find(\"div\", {\"class\": \"o-dek\"})\n",
"        dateOg = block.find(\"span\", {\"class\": \"o-meta__date--original\"})\n",
"        dateUpdate = block.find(\"span\", {\"class\": \"o-meta__date--updated\"})\n",
"\n",
"        # Collapse layout whitespace/newlines in the heading without gluing\n",
"        # the words of the title together.\n",
"        title = \" \".join(heading.get_text().split()) if heading is not None else None\n",
"        dekText = dek.p.text if dek is not None and dek.p is not None else None\n",
"        authorName = (\n",
"            author.strong.text\n",
"            if author is not None and author.strong is not None\n",
"            else None\n",
"        )\n",
"\n",
"        articles.append({\n",
"            \"title\": title,\n",
"            \"dek\": dekText,\n",
"            \"href\": attr_of(heading, \"href\"),\n",
"            \"img\": attr_of(img, \"data-src\"),\n",
"            \"original-date\": datetime_of(dateOg),\n",
"            \"updated-date\": datetime_of(dateUpdate),\n",
"            \"author\": authorName,\n",
"        })\n",
"\n",
"    return articles\n",
"\n",
"\n",
"def displayResults(query):\n",
"    \"\"\"Scrape the search results for ``query`` and print them as JSON (indent=1).\"\"\"\n",
"    articles = scrapeSearchResults(query)\n",
"    print(json.dumps(articles, indent=1))\n",
" \n",
"# Text box for the search term. `continuous_update=False` defers the callback\n",
"# until Enter is pressed or the box loses focus, so the site receives one\n",
"# request per submitted query instead of one per keystroke.\n",
"searchWidget = widgets.Text(\n",
"    value=\"Bigfoot\",\n",
"    placeholder=\"Bigfoot\",\n",
"    description=\"Search query\",\n",
"    disabled=False,\n",
"    continuous_update=False,\n",
")\n",
"\n",
"# Re-run displayResults whenever the widget value changes; whatever it prints\n",
"# is captured in the `out` Output widget.\n",
"out = widgets.interactive_output(displayResults, {\"query\": searchWidget})\n",
"\n",
"# Stack the input box above the captured output.\n",
"widgets.VBox([searchWidget, out])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ebb5865b",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment