Skip to content

Instantly share code, notes, and snippets.

@yungwarlock
Created July 4, 2025 15:38
Show Gist options
  • Save yungwarlock/9465b560383f7dd6e4e6b0892c2b8e00 to your computer and use it in GitHub Desktop.
Save yungwarlock/9465b560383f7dd6e4e6b0892c2b8e00 to your computer and use it in GitHub Desktop.
Channels News Scraper.ipynb
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/yungwarlock/9465b560383f7dd6e4e6b0892c2b8e00/channels-news-scraper.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "wifot5drBDPc"
},
"source": [
"## Setup project"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "-K0AYQKIUu4N"
},
"outputs": [],
"source": [
"%pip install -qU cloudscraper trafilatura git+https://github.com/erikriver/opengraph@master tenacity beautifulsoup4 duckdb==0.9.2 jupysql duckdb-engine pandas-gbq"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "086de4YcBHsJ"
},
"source": [
"## Extract all links"
]
},
{
"cell_type": "code",
"source": [
"import cloudscraper\n",
"\n",
"# Root of the site's WordPress REST API (v2); endpoint paths are appended to this.\n",
"BASE_URL = \"https://www.channelstv.com/wp-json/wp/v2\"\n",
"# Shared cloudscraper client; the functions in later cells send their requests through it.\n",
"scraper = cloudscraper.create_scraper()"
],
"metadata": {
"id": "ERBnX1bUeMPm"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"CATEGORIES_CACHE = {}\n",
"\n",
"\n",
"def get_category_info(category_id):\n",
"    \"\"\"Look up a category through the REST API, including its chain of parents.\n",
"\n",
"    Args:\n",
"        category_id: Category id to resolve. A falsy id (0 or None) means\n",
"            \"no category\" and yields None without sending a request.\n",
"            Top-level categories presumably report ``parent: 0`` - TODO confirm.\n",
"\n",
"    Returns:\n",
"        A dict with the keys ``id``, ``name``, ``slug`` and ``parent`` (the\n",
"        parent's record, or None), or None when the response cannot be turned\n",
"        into a record. Successful lookups are memoised in CATEGORIES_CACHE.\n",
"\n",
"    Raises:\n",
"        Whatever ``scraper.get(...).json()`` raises for the requested category\n",
"        itself; only failures while building the record are swallowed.\n",
"    \"\"\"\n",
"    if not category_id:\n",
"        return None\n",
"    if category_id in CATEGORIES_CACHE:\n",
"        return CATEGORIES_CACHE[category_id]\n",
"\n",
"    url = BASE_URL + f\"/categories/{category_id}\"\n",
"    data = scraper.get(url).json()\n",
"    try:\n",
"        res = {\n",
"            \"id\": data[\"id\"],\n",
"            \"name\": data[\"name\"],\n",
"            # Previously filled from data[\"name\"], which stored the display name twice.\n",
"            \"slug\": data[\"slug\"],\n",
"            \"parent\": get_category_info(data[\"parent\"]),\n",
"        }\n",
"    except Exception as e:\n",
"        # Best effort: an unresolvable category must not abort the scrape,\n",
"        # but it should at least be visible in the cell output.\n",
"        print(f\"Could not resolve category {category_id}: {e}\")\n",
"        return None\n",
"\n",
"    CATEGORIES_CACHE[category_id] = res\n",
"    return res"
],
"metadata": {
"id": "HAzsfB3Wer__"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def extract_data(data):\n",
"    \"\"\"Reduce one raw post object from the REST API to the fields the scraper keeps.\n",
"\n",
"    Title and description are read from the post's ``yoast_head_json`` block;\n",
"    every id in ``categories`` is expanded through get_category_info().\n",
"\n",
"    Returns:\n",
"        A dict with the keys ``id``, ``title``, ``description``, ``date``,\n",
"        ``url`` and ``categories``.\n",
"    \"\"\"\n",
"    post_id = data[\"id\"]\n",
"    headline = data[\"yoast_head_json\"][\"og_title\"]\n",
"    summary = data[\"yoast_head_json\"][\"og_description\"]\n",
"    published = data[\"date\"]\n",
"    permalink = data[\"link\"]\n",
"\n",
"    resolved = []\n",
"    for category_id in data[\"categories\"]:\n",
"        resolved.append(get_category_info(category_id))\n",
"\n",
"    return {\n",
"        \"id\": post_id,\n",
"        \"title\": headline,\n",
"        \"description\": summary,\n",
"        \"date\": published,\n",
"        \"url\": permalink,\n",
"        \"categories\": resolved,\n",
"    }"
],
"metadata": {
"id": "_3XTvi8jYODm"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import re\n",
"import os\n",
"import json\n",
"import threading\n",
"\n",
"from multiprocessing.pool import ThreadPool\n",
"\n",
"import time\n",
"import tenacity\n",
"import cloudscraper\n",
"from tqdm import tqdm\n",
"\n",
"\n",
"# @tenacity.retry(stop=tenacity.stop_after_attempt(3), wait=tenacity.wait_fixed(2))\n",
"def extract_links(i):\n",
"    \"\"\"Fetch one page of the posts listing and store it as a JSON Lines file.\n",
"\n",
"    Args:\n",
"        i: Page number sent as the ``page`` query parameter of ``/posts``.\n",
"\n",
"    A page whose output file already exists is skipped, so the cell can be\n",
"    re-run to resume. Errors are printed instead of raised so that one bad\n",
"    page does not stop the thread pool.\n",
"    \"\"\"\n",
"    output_file = f\"/content/drive/MyDrive/Colab Notebooks/channel_news/data{i}.jsonl\"\n",
"    url = f\"{BASE_URL}/posts?page={i}\"\n",
"\n",
"    if os.path.exists(output_file):\n",
"        return\n",
"\n",
"    try:\n",
"        response = scraper.get(url)\n",
"        response.raise_for_status()\n",
"\n",
"        # Serialise every post before the file is created. If a post fails\n",
"        # part-way, no truncated file is left behind for the exists() check\n",
"        # above to mistake for a finished page.\n",
"        lines = [json.dumps(extract_data(item)) + \"\\n\" for item in response.json()]\n",
"\n",
"        with open(output_file, \"w\") as f:\n",
"            f.writelines(lines)\n",
"    except Exception as e:\n",
"        print(f\"Error processing page {i}: {e}\")\n",
"\n",
"# Half-open range of listing pages handled by this run: [start, end).\n",
"page_range_start = 10001\n",
"page_range_end = 20000\n",
"\n",
"print(f\"Starting data extraction for pages {page_range_start} to {page_range_end - 1}...\")\n",
"\n",
"# 20 worker threads; the results are discarded, the loop only drives the progress bar.\n",
"with ThreadPool(20) as pool:\n",
"    finished_pages = pool.imap_unordered(extract_links, range(page_range_start, page_range_end))\n",
"    progress = tqdm(finished_pages, total=page_range_end - page_range_start, desc=\"Processing pages\")\n",
"    for _ in progress:\n",
"        pass\n",
"\n",
"print(\"Data extraction complete!\")"
],
"metadata": {
"id": "R51yF9VSnOzH"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## Extract with Trafilatura"
],
"metadata": {
"id": "UUqQlUaHfqkL"
}
},
{
"cell_type": "code",
"source": [
"import re\n",
"import os\n",
"import json\n",
"import time\n",
"import threading\n",
"from multiprocessing.pool import ThreadPool\n",
"\n",
"import tenacity\n",
"import cloudscraper\n",
"from trafilatura import extract\n",
"from tqdm import tqdm\n",
"\n",
"# page_range_start = 10001\n",
"# page_range_end = 20000\n",
"\n",
"input_folder = \"/content/drive/MyDrive/Colab Notebooks/channel_news\"\n",
"results_folder = \"/content/drive/MyDrive/Colab Notebooks/channel_news_results\"\n",
"\n",
"os.makedirs(results_folder, exist_ok=True)\n",
"\n",
"\n",
"def format_results_data(data):\n",
"    \"\"\"Flatten one scraped link record for the final dataset.\n",
"\n",
"    Args:\n",
"        data: A record with the keys ``id``, ``title``, ``description``,\n",
"            ``date``, ``url`` and ``categories`` (a list of category dicts).\n",
"\n",
"    Returns:\n",
"        A new dict with the same scalar fields and ``categories`` reduced to a\n",
"        list of category names. Null category entries (get_category_info()\n",
"        returns None when a lookup fails) are skipped instead of raising\n",
"        TypeError and losing the whole article.\n",
"    \"\"\"\n",
"    return {\n",
"        \"id\": data[\"id\"],\n",
"        \"title\": data[\"title\"],\n",
"        \"description\": data[\"description\"],\n",
"        \"date\": data[\"date\"],\n",
"        \"url\": data[\"url\"],\n",
"        \"categories\": [x[\"name\"] for x in data[\"categories\"] if x],\n",
"    }\n",
"\n",
"# @tenacity.retry(stop=tenacity.stop_after_attempt(3), wait=tenacity.wait_fixed(2)) # Example tenacity usage\n",
"def extract_text(i):\n",
"    \"\"\"Download every article listed for page ``i`` and store its extracted text.\n",
"\n",
"    Reads ``data{i}.jsonl`` from ``input_folder``, fetches each record's URL,\n",
"    runs trafilatura's ``extract`` on the returned HTML and writes the enriched\n",
"    records to a file of the same name in ``results_folder``.\n",
"\n",
"    Nothing is written when the input file cannot be read or when no article\n",
"    could be processed. The page is then attempted again on the next run\n",
"    instead of being skipped by the exists() check below.\n",
"\n",
"    Args:\n",
"        i: Page number used to build the input and output file names.\n",
"    \"\"\"\n",
"    input_jsonl = f\"{input_folder}/data{i}.jsonl\"\n",
"    basename = os.path.basename(input_jsonl)\n",
"    output_file = results_folder + \"/\" + basename\n",
"\n",
"    if os.path.exists(output_file):\n",
"        return\n",
"\n",
"    try:\n",
"        with open(input_jsonl, \"r\") as input_fd:\n",
"            lines = input_fd.readlines()\n",
"    except Exception as e:\n",
"        print(f\"Failed to read {input_jsonl}: {e}\")\n",
"        return\n",
"\n",
"    results = []\n",
"    for line_no, line in enumerate(lines):\n",
"        try:\n",
"            data = json.loads(line)\n",
"            response = scraper.get(data[\"url\"])\n",
"            response.raise_for_status()\n",
"            new_data = format_results_data(data)\n",
"            new_data[\"text\"] = extract(response.text)\n",
"            results.append(json.dumps(new_data) + \"\\n\")\n",
"        except Exception as e:\n",
"            # Best effort per article: report it and carry on with the rest of the page.\n",
"            print(f\"Error processing line {line_no} in {basename}: {e}\")\n",
"\n",
"    if not results:\n",
"        return\n",
"\n",
"    # Create the output only once the page has been processed.\n",
"    with open(output_file, \"w\") as output_fd:\n",
"        output_fd.writelines(results)\n",
"\n",
"# Half-open range of pages whose link files are processed in this run: [start, end).\n",
"page_range_start = 8990\n",
"page_range_end = 18001\n",
"\n",
"print(f\"Starting data extraction for pages {page_range_start} to {page_range_end - 1}...\")\n",
"\n",
"# 20 worker threads; the results are discarded, the loop only drives the progress bar.\n",
"with ThreadPool(20) as pool:\n",
"    finished_pages = pool.imap_unordered(extract_text, range(page_range_start, page_range_end))\n",
"    progress = tqdm(finished_pages, total=page_range_end - page_range_start, desc=\"Processing pages\")\n",
"    for _ in progress:\n",
"        pass\n",
"\n",
"print(\"Data extraction complete!\")"
],
"metadata": {
"id": "casaoIs4iwJB"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import glob\n",
"# Spot-check: print the full contents of one extracted results file.\n",
"result_paths = glob.glob(\"/content/drive/MyDrive/Colab Notebooks/channel_news_results/*\")\n",
"sample_result_path = result_paths[999]\n",
"with open(sample_result_path, \"r\") as fd:\n",
"    print(fd.read())"
],
"metadata": {
"id": "WzzkRfpAntp-"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!du -sh \"/content/drive/MyDrive/Colab Notebooks/channel_news_results\""
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "qbAYOmEupMhV",
"outputId": "9da6eb09-361b-422c-977b-7cf1391d8ee7"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"433M\t/content/drive/MyDrive/Colab Notebooks/channel_news_results\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import glob\n",
"# Spot-check: print the full contents of one scraped links file.\n",
"link_paths = glob.glob(\"/content/drive/MyDrive/Colab Notebooks/channel_news/*\")\n",
"sample_link_path = link_paths[0]\n",
"with open(sample_link_path, \"r\") as fd:\n",
"    print(fd.read())"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "KjJFslJG-rLT",
"outputId": "bc47648b-11ce-45c0-c585-f23b3e744bdd"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"{\"id\": 59089, \"title\": \"New York State Senate, Washington Mayor Honour Chinua Achebe\", \"description\": \"A resolution honouring Professor Chinua Achebe has been adopted by the New York senate in Albany as part of its tradition of paying tribute to &hellip; Continue reading New York State Senate, Washington Mayor Honour Chinua Achebe\", \"date\": \"2013-04-18T08:49:09\", \"url\": \"https://www.channelstv.com/2013/04/18/new-york-state-senate-washington-mayor-honour-chinua-achebe/\", \"categories\": [{\"id\": 508, \"name\": \"Arts &amp; Culture\", \"slug\": \"Arts &amp; Culture\", \"parent\": null}, {\"id\": 22978, \"name\": \"Channels Book Club\", \"slug\": \"Channels Book Club\", \"parent\": {\"id\": 13, \"name\": \"Programs\", \"slug\": \"Programs\", \"parent\": null}}]}\n",
"{\"id\": 59096, \"title\": \"Japanese Students Make World\\u2019s Biggest Roll Cake At 130 Metres Long\", \"description\": \"You wouldn&#8217;t want to be the one left doing the washing up. Confectionery college school students celebrated as they made the world&#8217;s longest roll cake &hellip; Continue reading Japanese Students Make World\\u2019s Biggest Roll Cake At 130 Metres Long\", \"date\": \"2013-04-18T08:48:19\", \"url\": \"https://www.channelstv.com/2013/04/18/japanese-students-make-worlds-biggest-roll-cake-at-130-metres-long/\", \"categories\": [{\"id\": 182, \"name\": \"Lifestyle\", \"slug\": \"Lifestyle\", \"parent\": null}]}\n",
"{\"id\": 59085, \"title\": \"Nigeria's Inflation Rate Drops To 8.6%\", \"description\": \"Nigeria\\u2019s inflation rate fell to the lowest in almost five years in March as the impact of higher fuel prices a year ago fell out &hellip; Continue reading Nigeria&#8217;s Inflation Rate Drops To 8.6%\", \"date\": \"2013-04-18T08:39:29\", \"url\": \"https://www.channelstv.com/2013/04/18/nigerias-inflation-rate-drops-to-8-6/\", \"categories\": [{\"id\": 5, \"name\": \"Business\", \"slug\": \"Business\", \"parent\": null}]}\n",
"{\"id\": 59070, \"title\": \"Two Women Arrested For Stealing 3-Day Old Babies\", \"description\": \"The Imo state police command has nabbed a 55 year old woman, Mrs Patience Aibangbe from Edo State alongside two of her accomplices for allegedly &hellip; Continue reading Two Women Arrested For Stealing 3-Day Old Babies\", \"date\": \"2013-04-18T08:37:23\", \"url\": \"https://www.channelstv.com/2013/04/18/police-nabs-55-year-old-kidnapper-of-babies/\", \"categories\": [{\"id\": 3, \"name\": \"Local\", \"slug\": \"Local\", \"parent\": null}]}\n",
"{\"id\": 59081, \"title\": \"Babies As Young As Six Months Victims Of Rape In War: UN Envoy\", \"description\": \"In her first seven months as U.N. envoy on sexual violence in conflict, Zainab Hawa Bangura has visited a Congolese district where rebels raped babies, &hellip; Continue reading Babies As Young As Six Months Victims Of Rape In War: UN Envoy\", \"date\": \"2013-04-18T07:55:01\", \"url\": \"https://www.channelstv.com/2013/04/18/babies-as-young-as-six-months-victims-of-rape-in-war-un-envoy/\", \"categories\": [{\"id\": 3, \"name\": \"Local\", \"slug\": \"Local\", \"parent\": null}]}\n",
"{\"id\": 59077, \"title\": \"Son Of Senegal's Ex-president Charged With Corruption\", \"description\": \"Senegalese prosecutors on Wednesday formally charged Karim Wade, the son of the West African nation&#8217;s former president, with corruption and ordered him to be detained &hellip; Continue reading Son Of Senegal&#8217;s Ex-president Charged With Corruption\", \"date\": \"2013-04-18T07:51:15\", \"url\": \"https://www.channelstv.com/2013/04/18/son-of-senegals-ex-president-charged-with-corruption/\", \"categories\": [{\"id\": 3, \"name\": \"Local\", \"slug\": \"Local\", \"parent\": null}]}\n",
"{\"id\": 59073, \"title\": \"Scores Feared Dead As Explosion, Fire Rip Through Texas Fertilizer Plant\", \"description\": \"A deadly explosion and fire tore through a fertilizer plant in a small Texas town late on Wednesday, injuring more than 100 people, leveling dozens &hellip; Continue reading Scores Feared Dead As Explosion, Fire Rip Through Texas Fertilizer Plant\", \"date\": \"2013-04-18T07:41:55\", \"url\": \"https://www.channelstv.com/2013/04/18/deadly-explosion-fire-rip-through-texas-fertilizer-plant/\", \"categories\": [{\"id\": 12, \"name\": \"World News\", \"slug\": \"World News\", \"parent\": null}]}\n",
"{\"id\": 59069, \"title\": \"Pakistani Court Orders Arrest Of Pervez Musharraf\", \"description\": \"A Pakistani court has ordered the arrest of Pakistan&#8217;s ex-military ruler Pervez Musharraf over moves to impose house arrest on judges in March 2007. Mr &hellip; Continue reading Pakistani Court Orders Arrest Of Pervez Musharraf\", \"date\": \"2013-04-18T07:40:13\", \"url\": \"https://www.channelstv.com/2013/04/18/pakistani-court-orders-arrest-of-pervez-musharraf/\", \"categories\": [{\"id\": 12, \"name\": \"World News\", \"slug\": \"World News\", \"parent\": null}]}\n",
"{\"id\": 59066, \"title\": \"North Korea demands end of sanctions if U.S. wants dialogue\", \"description\": \"North Korea offered the United States and South Korea a list of conditions for talks, including the lifting of U.N. sanctions, signaling a possible end &hellip; Continue reading North Korea demands end of sanctions if U.S. wants dialogue\", \"date\": \"2013-04-18T07:35:22\", \"url\": \"https://www.channelstv.com/2013/04/18/north-korea-demands-end-of-sanctions-if-u-s-wants-dialogue/\", \"categories\": [{\"id\": 12, \"name\": \"World News\", \"slug\": \"World News\", \"parent\": null}]}\n",
"{\"id\": 59063, \"title\": \"Oando To Purchase ConocoPhillips' Asset\", \"description\": \"Oando is close to securing funds to buy ConocoPhillips&#8217; Nigerian assets, the company&#8217;s chief executive said on Wednesday, as he looked to allay fears it &hellip; Continue reading Oando To Purchase ConocoPhillips&#8217; Asset\", \"date\": \"2013-04-18T07:31:56\", \"url\": \"https://www.channelstv.com/2013/04/18/oando-to-purchase-conocophillips-asset/\", \"categories\": [{\"id\": 5, \"name\": \"Business\", \"slug\": \"Business\", \"parent\": null}]}\n",
"\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"## Push to Hugging Face Hub"
],
"metadata": {
"id": "s7ItuSCFCviI"
}
},
{
"cell_type": "code",
"source": [
"import glob\n",
"import pandas as pd\n",
"from datasets import Dataset\n",
"\n",
"# Sorted so that the row order of the dataset does not depend on the order\n",
"# in which glob happens to list the files.\n",
"json_files = sorted(glob.glob(\"/content/drive/MyDrive/Colab Notebooks/channel_news_results/*\"))\n",
"if not json_files:\n",
"    # pd.concat([]) would fail with a message that does not name the real problem.\n",
"    raise FileNotFoundError(\"No result files found in channel_news_results\")\n",
"\n",
"# One DataFrame per JSON Lines file, concatenated once at the end.\n",
"dfs = [pd.read_json(file, lines=True) for file in json_files]\n",
"\n",
"combined_df = pd.concat(dfs, ignore_index=True)\n",
"hf_dataset = Dataset.from_pandas(combined_df)"
],
"metadata": {
"id": "IubzRRN7tLM1"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Upload the combined dataset to the Hub repository named below.\n",
"# NOTE(review): no login/token step is visible in this notebook - confirm how credentials are supplied.\n",
"hf_dataset.push_to_hub(\"raven-consult/channels_news\")"
],
"metadata": {
"id": "iIsOZ4Y_2Us8"
},
"execution_count": null,
"outputs": []
}
],
"metadata": {
"colab": {
"provenance": [],
"collapsed_sections": [
"086de4YcBHsJ",
"UUqQlUaHfqkL",
"s7ItuSCFCviI",
"fNN7bq86fkAo"
],
"mount_file_id": "1qHC6b_J3CBL-G2FBVBPcUyzbrY_CVnhe",
"authorship_tag": "ABX9TyPcOsuMMG4vWd7oAnlH11p/",
"include_colab_link": true
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment