Skip to content

Instantly share code, notes, and snippets.

@intellectronica
Created December 14, 2024 21:42
Show Gist options
  • Save intellectronica/e6fcbba87a051f61a1e8c632b9a86983 to your computer and use it in GitHub Desktop.
Save intellectronica/e6fcbba87a051f61a1e8c632b9a86983 to your computer and use it in GitHub Desktop.
overcoming-bias-anthology-md.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"private_outputs": true,
"provenance": [],
"authorship_tag": "ABX9TyNc7fKPdBKHKm9MXz/hd0J5",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/intellectronica/e6fcbba87a051f61a1e8c632b9a86983/overcoming-bias-anthology-md.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "m46DdbX1-gVJ"
},
"outputs": [],
"source": [
"# Install the libraries this notebook uses into the kernel's environment,\n",
"# then clear the installer's log output from the cell.\n",
"%pip install requests beautifulsoup4 markdown\n",
"from IPython.display import clear_output\n",
"clear_output()\n"
]
},
{
"cell_type": "code",
"source": [
"import requests\n",
"from bs4 import BeautifulSoup\n",
"import time"
],
"metadata": {
"id": "V-yvwBYaGoU0"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Download the anthology index page.\n",
"# The timeout keeps a stalled connection from hanging the notebook forever, and\n",
"# raise_for_status() stops an HTTP error page from being parsed as the index.\n",
"anthology_response = requests.get('https://overcoming-bias-anthology.com/', timeout=30)\n",
"anthology_response.raise_for_status()\n",
"anthology_html = anthology_response.text"
],
"metadata": {
"id": "zZI7cSgO-4Qq"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Crawl the index page into a nested structure:\n",
"#   anthology_index -> sections -> subsections -> items (title, url, html).\n",
"# Every <li> link found under a subsection <div> is downloaded in full.\n",
"REQUEST_TIMEOUT_SECONDS = 30  # give up on a stalled connection instead of hanging\n",
"CRAWL_DELAY_SECONDS = 3.45    # pause after each article download\n",
"\n",
"anthology_index = []\n",
"\n",
"anthology_soup = BeautifulSoup(anthology_html, 'html.parser')\n",
"for section in anthology_soup.find_all('section'):\n",
"    idx_subsections = []\n",
"    for subsection in section.find_all('div'):\n",
"        idx_items = []\n",
"        for item in subsection.find_all('li'):\n",
"            link = item.find('a')  # look the anchor up once per item\n",
"            response = requests.get(link['href'], timeout=REQUEST_TIMEOUT_SECONDS)\n",
"            # Fail here rather than storing an HTTP error page as an article body.\n",
"            response.raise_for_status()\n",
"            idx_items.append({\n",
"                'title': link.text,\n",
"                'url': link['href'],\n",
"                'html': response.text,\n",
"            })\n",
"            time.sleep(CRAWL_DELAY_SECONDS)\n",
"        idx_subsections.append({\n",
"            'title': subsection.find('h2').text,\n",
"            'items': idx_items,\n",
"        })\n",
"    # Drop whatever precedes the first ': ' in the section heading.\n",
"    # maxsplit=1 keeps any later ': ' inside the title, and [-1] falls back to\n",
"    # the whole heading when it contains no ': ' at all.\n",
"    heading = section.find('h1').text\n",
"    anthology_index.append({\n",
"        'title': heading.split(': ', 1)[-1],\n",
"        'subsections': idx_subsections,\n",
"    })"
],
"metadata": {
"id": "lI_6E9hY_RVn"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from markdown import markdown\n",
"from IPython.display import display, Markdown\n",
"\n",
"# Assemble the whole anthology as one document: a heading per section,\n",
"# subsection and article, with each article heading followed by its body.\n",
"# Pieces are collected in a list and joined once instead of growing a string\n",
"# with += inside three nested loops.\n",
"md_parts = ['# Overcoming Bias Anthology\\n\\n']\n",
"\n",
"for section in anthology_index:\n",
"    md_parts.append(f'## {section[\"title\"]}\\n\\n')\n",
"    for subsection in section['subsections']:\n",
"        md_parts.append(f'### {subsection[\"title\"]}\\n\\n')\n",
"        for item in subsection['items']:\n",
"            md_parts.append(f'#### {item[\"title\"]}\\n\\n')\n",
"            item_soup = BeautifulSoup(item['html'], 'html.parser')\n",
"            content = item_soup.find('div', class_='available-content')\n",
"            if content is None:\n",
"                # No article body on this page: keep the heading, report the\n",
"                # URL, and carry on instead of failing with AttributeError.\n",
"                print(f'warning: no available-content div in {item[\"url\"]}')\n",
"            else:\n",
"                # NOTE(review): markdown() renders Markdown *to* HTML, and the\n",
"                # input here is already HTML, so article bodies presumably stay\n",
"                # HTML inside the .md file - confirm this is intended.\n",
"                md_parts.append(markdown(content.decode_contents()))\n",
"            md_parts.append('\\n\\n')\n",
"\n",
"anthology_md = ''.join(md_parts)\n",
"\n",
"# Use an explicit encoding so the file does not depend on the platform's\n",
"# default locale encoding.\n",
"with open('overcoming-bias-anthology.md', 'w', encoding='utf-8') as f:\n",
"    f.write(anthology_md)\n",
"\n",
"# Read the file back and render it, so what is displayed is what was saved.\n",
"with open('overcoming-bias-anthology.md', 'r', encoding='utf-8') as f:\n",
"    anthology_md = f.read()\n",
"    display(Markdown(anthology_md))"
],
"metadata": {
"id": "GtNPNl1lACS_"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "hvUVZ3edCfjr"
},
"execution_count": null,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment