Created
July 4, 2025 15:38
-
-
Save yungwarlock/9465b560383f7dd6e4e6b0892c2b8e00 to your computer and use it in GitHub Desktop.
Channels News Scraper.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/yungwarlock/9465b560383f7dd6e4e6b0892c2b8e00/channels-news-scraper.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "wifot5drBDPc" | |
}, | |
"source": [ | |
"## Setup project" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "-K0AYQKIUu4N" | |
}, | |
"outputs": [], | |
"source": [ | |
"# tqdm and datasets are imported by later cells, so they are installed here\n", | |
"# explicitly instead of relying on whatever the runtime image ships with.\n", | |
"%pip install -qU cloudscraper trafilatura git+https://github.com/erikriver/opengraph@master tenacity beautifulsoup4 duckdb==0.9.2 jupysql duckdb-engine pandas-gbq tqdm datasets" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "086de4YcBHsJ" | |
}, | |
"source": [ | |
"## Extract all links" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"import cloudscraper\n", | |
"\n", | |
"# Root of the site's REST API; endpoint paths such as \"/categories/<id>\" are appended to it.\n", | |
"BASE_URL = \"https://www.channelstv.com/wp-json/wp/v2\"\n", | |
"# Single scraper object shared by every request made in the cells below.\n", | |
"scraper = cloudscraper.create_scraper()" | |
], | |
"metadata": { | |
"id": "ERBnX1bUeMPm" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Category records that were resolved successfully, keyed by category id, so a\n", | |
"# resolved id is not requested from the API again.\n", | |
"CATEGORIES_CACHE = {}\n", | |
"\n", | |
"\n", | |
"def get_category_info(category_id):\n", | |
"    \"\"\"\n", | |
"    Resolve a category id to a nested record that includes its chain of parents.\n", | |
"\n", | |
"    Args:\n", | |
"        category_id: Category id as found in a post's \"categories\" list or in a\n", | |
"            category's \"parent\" field.\n", | |
"\n", | |
"    Returns:\n", | |
"        A dict with \"id\", \"name\", \"slug\" and \"parent\" (the parent's record,\n", | |
"        resolved recursively, or None), or None when the id is falsy or the\n", | |
"        API payload cannot be turned into a record.\n", | |
"    \"\"\"\n", | |
"    # The WordPress REST API reports parent == 0 for top-level categories.\n", | |
"    # There is no category 0 to fetch, so answer None without a request.\n", | |
"    if not category_id:\n", | |
"        return None\n", | |
"    if category_id in CATEGORIES_CACHE:\n", | |
"        return CATEGORIES_CACHE[category_id]\n", | |
"\n", | |
"    url = BASE_URL + f\"/categories/{category_id}\"\n", | |
"    data = scraper.get(url).json()\n", | |
"    try:\n", | |
"        res = {\n", | |
"            \"id\": data[\"id\"],\n", | |
"            \"name\": data[\"name\"],\n", | |
"            # Previously read data[\"name\"], which stored the display name as the slug.\n", | |
"            \"slug\": data[\"slug\"],\n", | |
"            \"parent\": get_category_info(data[\"parent\"]),\n", | |
"        }\n", | |
"    except Exception:\n", | |
"        # Best effort: a payload without the expected fields yields None and is\n", | |
"        # not cached, so a later call for the same id tries again.\n", | |
"        return None\n", | |
"    CATEGORIES_CACHE[category_id] = res\n", | |
"    return res" | |
], | |
"metadata": { | |
"id": "HAzsfB3Wer__" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"def extract_data(data):\n", | |
"    \"\"\"\n", | |
"    Reduce one post object from the API to the fields this notebook keeps.\n", | |
"\n", | |
"    Title and description are read from the post's \"yoast_head_json\" entry,\n", | |
"    and every id in \"categories\" is expanded with get_category_info().\n", | |
"    \"\"\"\n", | |
"    record = {\"id\": data[\"id\"]}\n", | |
"\n", | |
"    seo_meta = data[\"yoast_head_json\"]\n", | |
"    record[\"title\"] = seo_meta[\"og_title\"]\n", | |
"    record[\"description\"] = seo_meta[\"og_description\"]\n", | |
"\n", | |
"    record[\"date\"] = data[\"date\"]\n", | |
"    record[\"url\"] = data[\"link\"]\n", | |
"    # Categories are resolved last, after all plain fields have been read.\n", | |
"    record[\"categories\"] = list(map(get_category_info, data[\"categories\"]))\n", | |
"    return record" | |
], | |
"metadata": { | |
"id": "_3XTvi8jYODm" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"import re\n", | |
"import os\n", | |
"import json\n", | |
"import threading\n", | |
"\n", | |
"from multiprocessing.pool import ThreadPool\n", | |
"\n", | |
"import time\n", | |
"import tenacity\n", | |
"import cloudscraper\n", | |
"from tqdm import tqdm\n", | |
"\n", | |
"\n", | |
"# @tenacity.retry(stop=tenacity.stop_after_attempt(3), wait=tenacity.wait_fixed(2))\n", | |
"def extract_links(i):\n", | |
"    \"\"\"\n", | |
"    Fetch one page of posts from the API and save the extracted records as JSONL.\n", | |
"\n", | |
"    Args:\n", | |
"        i: Page number sent as the \"page\" query parameter; also part of the\n", | |
"            output file name.\n", | |
"\n", | |
"    Returns:\n", | |
"        None. A page whose output file already exists is skipped. Any error is\n", | |
"        printed instead of raised.\n", | |
"    \"\"\"\n", | |
"\n", | |
"    output_file = f\"/content/drive/MyDrive/Colab Notebooks/channel_news/data{i}.jsonl\"\n", | |
"    url = f\"{BASE_URL}/posts?page={i}\"\n", | |
"\n", | |
"    if os.path.exists(output_file):\n", | |
"        return\n", | |
"\n", | |
"    try:\n", | |
"        response = scraper.get(url)\n", | |
"        response.raise_for_status()\n", | |
"\n", | |
"        # Build every line before opening the file: extract_data() can raise,\n", | |
"        # and a half-written file would make the exists-check above skip this\n", | |
"        # page on every later run.\n", | |
"        lines = [json.dumps(extract_data(item)) + \"\\n\" for item in response.json()]\n", | |
"\n", | |
"        with open(output_file, \"w\") as f:\n", | |
"            f.writelines(lines)\n", | |
"    except Exception as e:\n", | |
"        print(f\"Error processing page {i}: {e}\")\n", | |
"\n", | |
"page_range_start = 10001\n", | |
"page_range_end = 20000  # exclusive: range() below stops at page_range_end - 1\n", | |
"\n", | |
"# extract_links() writes into this folder but does not create it, so make sure\n", | |
"# it exists before the workers start.\n", | |
"os.makedirs(\"/content/drive/MyDrive/Colab Notebooks/channel_news\", exist_ok=True)\n", | |
"\n", | |
"print(f\"Starting data extraction for pages {page_range_start} to {page_range_end - 1}...\")\n", | |
"\n", | |
"# Each page is handled by one of 20 worker threads; list() drains the lazy\n", | |
"# iterator so tqdm advances as pages finish.\n", | |
"with ThreadPool(20) as pool:\n", | |
"    list(tqdm(pool.imap_unordered(extract_links, range(page_range_start, page_range_end)),\n", | |
"              total=page_range_end - page_range_start,\n", | |
"              desc=\"Processing pages\"))\n", | |
"\n", | |
"print(\"Data extraction complete!\")" | |
], | |
"metadata": { | |
"id": "R51yF9VSnOzH" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"## Extract with Trafilatura" | |
], | |
"metadata": { | |
"id": "UUqQlUaHfqkL" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"import re\n", | |
"import os\n", | |
"import json\n", | |
"import time\n", | |
"import threading\n", | |
"from multiprocessing.pool import ThreadPool\n", | |
"\n", | |
"import tenacity\n", | |
"import cloudscraper\n", | |
"from trafilatura import extract\n", | |
"from tqdm import tqdm\n", | |
"\n", | |
"# Input and output locations for the text-extraction step.\n", | |
"\n", | |
"# Folder holding the data<page>.jsonl link files that extract_text() reads.\n", | |
"input_folder = \"/content/drive/MyDrive/Colab Notebooks/channel_news\"\n", | |
"# Folder where extract_text() writes one result file per input file, under the same file name.\n", | |
"results_folder = \"/content/drive/MyDrive/Colab Notebooks/channel_news_results\"\n", | |
"\n", | |
"# Create the results folder up front; an existing folder is left as is.\n", | |
"os.makedirs(results_folder, exist_ok=True)\n", | |
"\n", | |
"\n", | |
"def format_results_data(data):\n", | |
"    \"\"\"\n", | |
"    Flatten a saved link record into the layout used for the results files.\n", | |
"\n", | |
"    Args:\n", | |
"        data: Record with \"id\", \"title\", \"description\", \"date\", \"url\" and\n", | |
"            \"categories\"; the latter is a list of category dicts and may\n", | |
"            contain None for ids that could not be resolved.\n", | |
"\n", | |
"    Returns:\n", | |
"        A new dict with the same scalar fields and \"categories\" reduced to a\n", | |
"        list of category names; unresolved (None) entries are left out.\n", | |
"    \"\"\"\n", | |
"    return {\n", | |
"        \"id\": data[\"id\"],\n", | |
"        \"title\": data[\"title\"],\n", | |
"        \"description\": data[\"description\"],\n", | |
"        \"date\": data[\"date\"],\n", | |
"        \"url\": data[\"url\"],\n", | |
"        # get_category_info() returns None on failure; indexing None would\n", | |
"        # raise TypeError and drop the whole article, so skip such entries.\n", | |
"        \"categories\": [x[\"name\"] for x in data[\"categories\"] if x],\n", | |
"    }\n", | |
"\n", | |
"# @tenacity.retry(stop=tenacity.stop_after_attempt(3), wait=tenacity.wait_fixed(2)) # Example tenacity usage\n", | |
"def extract_text(i):\n", | |
"    \"\"\"\n", | |
"    Download every article listed in one links file and save it with its text.\n", | |
"\n", | |
"    Args:\n", | |
"        i: Page number; selects data{i}.jsonl inside input_folder.\n", | |
"\n", | |
"    Returns:\n", | |
"        None. A page whose result file already exists is skipped. If the links\n", | |
"        file cannot be read, nothing is written. Articles that fail are\n", | |
"        reported and left out of the result file.\n", | |
"    \"\"\"\n", | |
"    input_jsonl = f\"{input_folder}/data{i}.jsonl\"\n", | |
"    basename = os.path.basename(input_jsonl)\n", | |
"\n", | |
"    output_file = results_folder + \"/\" + basename\n", | |
"\n", | |
"    if os.path.exists(output_file):\n", | |
"        return\n", | |
"\n", | |
"    # Read the input before touching the output: creating the result file first\n", | |
"    # would leave an empty file behind for a missing input, and the exists-check\n", | |
"    # above would then skip this page on every later run.\n", | |
"    try:\n", | |
"        with open(input_jsonl, \"r\") as input_fd:\n", | |
"            lines = input_fd.readlines()\n", | |
"    except Exception as e:\n", | |
"        print(\"Failed to open file\", input_jsonl, \"-\", e)\n", | |
"        return\n", | |
"\n", | |
"    records = []\n", | |
"    for line_no, line in enumerate(lines):\n", | |
"        try:\n", | |
"            data = json.loads(line)\n", | |
"            response = scraper.get(data[\"url\"])\n", | |
"            response.raise_for_status()\n", | |
"            new_data = format_results_data(data)\n", | |
"            new_data[\"text\"] = extract(response.text)\n", | |
"            records.append(json.dumps(new_data) + \"\\n\")\n", | |
"        except Exception as e:\n", | |
"            # One bad article must not abort the rest of the page.\n", | |
"            print(f\"Error processing line {line_no} in {basename}: {e}\")\n", | |
"\n", | |
"    # Written in one go after all downloads, so an interrupted run does not\n", | |
"    # leave a partial file that later runs would treat as finished.\n", | |
"    with open(output_file, \"w\") as output_fd:\n", | |
"        output_fd.writelines(records)\n", | |
"\n", | |
"# Page numbers to process; range() below excludes page_range_end.\n", | |
"# NOTE(review): this range differs from the 10001-20000 range used by the\n", | |
"# link-extraction cell - confirm that links files exist for all of these pages.\n", | |
"page_range_start = 8990\n", | |
"page_range_end = 18001\n", | |
"\n", | |
"print(f\"Starting data extraction for pages {page_range_start} to {page_range_end - 1}...\")\n", | |
"\n", | |
"# Each page is handled by one of 20 worker threads; list() drains the lazy\n", | |
"# iterator so tqdm advances as pages finish.\n", | |
"with ThreadPool(20) as pool:\n", | |
" list(tqdm(pool.imap_unordered(extract_text, range(page_range_start, page_range_end)),\n", | |
" total=page_range_end - page_range_start,\n", | |
" desc=\"Processing pages\"))\n", | |
"\n", | |
"print(\"Data extraction complete!\")" | |
], | |
"metadata": { | |
"id": "casaoIs4iwJB" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"import glob\n", | |
"\n", | |
"# Peek at one results file. glob returns paths in arbitrary order, so sort for\n", | |
"# a repeatable pick, and clamp the index so the cell also works when fewer\n", | |
"# than 1000 files exist.\n", | |
"result_files = sorted(glob.glob(\"/content/drive/MyDrive/Colab Notebooks/channel_news_results/*\"))\n", | |
"if result_files:\n", | |
"    with open(result_files[min(999, len(result_files) - 1)], \"r\") as fd:\n", | |
"        print(fd.read())" | |
], | |
"metadata": { | |
"id": "WzzkRfpAntp-" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Report the total on-disk size of the results folder.\n", | |
"!du -sh \"/content/drive/MyDrive/Colab Notebooks/channel_news_results\"" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "qbAYOmEupMhV", | |
"outputId": "9da6eb09-361b-422c-977b-7cf1391d8ee7" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"433M\t/content/drive/MyDrive/Colab Notebooks/channel_news_results\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"import glob\n", | |
"# Print one links file to inspect the record layout. glob returns paths in\n", | |
"# arbitrary order, so index 0 is not necessarily the lowest page number.\n", | |
"with open(glob.glob(\"/content/drive/MyDrive/Colab Notebooks/channel_news/*\")[0], \"r\") as fd:\n", | |
" print(fd.read())" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "KjJFslJG-rLT", | |
"outputId": "bc47648b-11ce-45c0-c585-f23b3e744bdd" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"{\"id\": 59089, \"title\": \"New York State Senate, Washington Mayor Honour Chinua Achebe\", \"description\": \"A resolution honouring Professor Chinua Achebe has been adopted by the New York senate in Albany as part of its tradition of paying tribute to … Continue reading New York State Senate, Washington Mayor Honour Chinua Achebe\", \"date\": \"2013-04-18T08:49:09\", \"url\": \"https://www.channelstv.com/2013/04/18/new-york-state-senate-washington-mayor-honour-chinua-achebe/\", \"categories\": [{\"id\": 508, \"name\": \"Arts & Culture\", \"slug\": \"Arts & Culture\", \"parent\": null}, {\"id\": 22978, \"name\": \"Channels Book Club\", \"slug\": \"Channels Book Club\", \"parent\": {\"id\": 13, \"name\": \"Programs\", \"slug\": \"Programs\", \"parent\": null}}]}\n", | |
"{\"id\": 59096, \"title\": \"Japanese Students Make World\\u2019s Biggest Roll Cake At 130 Metres Long\", \"description\": \"You wouldn’t want to be the one left doing the washing up. Confectionery college school students celebrated as they made the world’s longest roll cake … Continue reading Japanese Students Make World\\u2019s Biggest Roll Cake At 130 Metres Long\", \"date\": \"2013-04-18T08:48:19\", \"url\": \"https://www.channelstv.com/2013/04/18/japanese-students-make-worlds-biggest-roll-cake-at-130-metres-long/\", \"categories\": [{\"id\": 182, \"name\": \"Lifestyle\", \"slug\": \"Lifestyle\", \"parent\": null}]}\n", | |
"{\"id\": 59085, \"title\": \"Nigeria's Inflation Rate Drops To 8.6%\", \"description\": \"Nigeria\\u2019s inflation rate fell to the lowest in almost five years in March as the impact of higher fuel prices a year ago fell out … Continue reading Nigeria’s Inflation Rate Drops To 8.6%\", \"date\": \"2013-04-18T08:39:29\", \"url\": \"https://www.channelstv.com/2013/04/18/nigerias-inflation-rate-drops-to-8-6/\", \"categories\": [{\"id\": 5, \"name\": \"Business\", \"slug\": \"Business\", \"parent\": null}]}\n", | |
"{\"id\": 59070, \"title\": \"Two Women Arrested For Stealing 3-Day Old Babies\", \"description\": \"The Imo state police command has nabbed a 55 year old woman, Mrs Patience Aibangbe from Edo State alongside two of her accomplices for allegedly … Continue reading Two Women Arrested For Stealing 3-Day Old Babies\", \"date\": \"2013-04-18T08:37:23\", \"url\": \"https://www.channelstv.com/2013/04/18/police-nabs-55-year-old-kidnapper-of-babies/\", \"categories\": [{\"id\": 3, \"name\": \"Local\", \"slug\": \"Local\", \"parent\": null}]}\n", | |
"{\"id\": 59081, \"title\": \"Babies As Young As Six Months Victims Of Rape In War: UN Envoy\", \"description\": \"In her first seven months as U.N. envoy on sexual violence in conflict, Zainab Hawa Bangura has visited a Congolese district where rebels raped babies, … Continue reading Babies As Young As Six Months Victims Of Rape In War: UN Envoy\", \"date\": \"2013-04-18T07:55:01\", \"url\": \"https://www.channelstv.com/2013/04/18/babies-as-young-as-six-months-victims-of-rape-in-war-un-envoy/\", \"categories\": [{\"id\": 3, \"name\": \"Local\", \"slug\": \"Local\", \"parent\": null}]}\n", | |
"{\"id\": 59077, \"title\": \"Son Of Senegal's Ex-president Charged With Corruption\", \"description\": \"Senegalese prosecutors on Wednesday formally charged Karim Wade, the son of the West African nation’s former president, with corruption and ordered him to be detained … Continue reading Son Of Senegal’s Ex-president Charged With Corruption\", \"date\": \"2013-04-18T07:51:15\", \"url\": \"https://www.channelstv.com/2013/04/18/son-of-senegals-ex-president-charged-with-corruption/\", \"categories\": [{\"id\": 3, \"name\": \"Local\", \"slug\": \"Local\", \"parent\": null}]}\n", | |
"{\"id\": 59073, \"title\": \"Scores Feared Dead As Explosion, Fire Rip Through Texas Fertilizer Plant\", \"description\": \"A deadly explosion and fire tore through a fertilizer plant in a small Texas town late on Wednesday, injuring more than 100 people, leveling dozens … Continue reading Scores Feared Dead As Explosion, Fire Rip Through Texas Fertilizer Plant\", \"date\": \"2013-04-18T07:41:55\", \"url\": \"https://www.channelstv.com/2013/04/18/deadly-explosion-fire-rip-through-texas-fertilizer-plant/\", \"categories\": [{\"id\": 12, \"name\": \"World News\", \"slug\": \"World News\", \"parent\": null}]}\n", | |
"{\"id\": 59069, \"title\": \"Pakistani Court Orders Arrest Of Pervez Musharraf\", \"description\": \"A Pakistani court has ordered the arrest of Pakistan’s ex-military ruler Pervez Musharraf over moves to impose house arrest on judges in March 2007. Mr … Continue reading Pakistani Court Orders Arrest Of Pervez Musharraf\", \"date\": \"2013-04-18T07:40:13\", \"url\": \"https://www.channelstv.com/2013/04/18/pakistani-court-orders-arrest-of-pervez-musharraf/\", \"categories\": [{\"id\": 12, \"name\": \"World News\", \"slug\": \"World News\", \"parent\": null}]}\n", | |
"{\"id\": 59066, \"title\": \"North Korea demands end of sanctions if U.S. wants dialogue\", \"description\": \"North Korea offered the United States and South Korea a list of conditions for talks, including the lifting of U.N. sanctions, signaling a possible end … Continue reading North Korea demands end of sanctions if U.S. wants dialogue\", \"date\": \"2013-04-18T07:35:22\", \"url\": \"https://www.channelstv.com/2013/04/18/north-korea-demands-end-of-sanctions-if-u-s-wants-dialogue/\", \"categories\": [{\"id\": 12, \"name\": \"World News\", \"slug\": \"World News\", \"parent\": null}]}\n", | |
"{\"id\": 59063, \"title\": \"Oando To Purchase ConocoPhillips' Asset\", \"description\": \"Oando is close to securing funds to buy ConocoPhillips’ Nigerian assets, the company’s chief executive said on Wednesday, as he looked to allay fears it … Continue reading Oando To Purchase ConocoPhillips’ Asset\", \"date\": \"2013-04-18T07:31:56\", \"url\": \"https://www.channelstv.com/2013/04/18/oando-to-purchase-conocophillips-asset/\", \"categories\": [{\"id\": 5, \"name\": \"Business\", \"slug\": \"Business\", \"parent\": null}]}\n", | |
"\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"## Push to Hugging Face Hub" | |
], | |
"metadata": { | |
"id": "s7ItuSCFCviI" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"import glob\n", | |
"import os\n", | |
"\n", | |
"import pandas as pd\n", | |
"from datasets import Dataset\n", | |
"\n", | |
"# Sorted so the row order of the combined dataset is the same on every run.\n", | |
"json_files = sorted(glob.glob(\"/content/drive/MyDrive/Colab Notebooks/channel_news_results/*\"))\n", | |
"\n", | |
"# Zero-byte files carry no records (extract_text() writes one when every\n", | |
"# article of a page failed), so they are skipped instead of being parsed.\n", | |
"dfs = [\n", | |
"    pd.read_json(file, lines=True)\n", | |
"    for file in json_files\n", | |
"    if os.path.getsize(file) > 0\n", | |
"]\n", | |
"\n", | |
"combined_df = pd.concat(dfs, ignore_index=True)\n", | |
"hf_dataset = Dataset.from_pandas(combined_df)" | |
], | |
"metadata": { | |
"id": "IubzRRN7tLM1" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Upload the combined dataset to the \"raven-consult/channels_news\" repository.\n", | |
"# NOTE(review): presumably requires being logged in with write access to that repository - confirm.\n", | |
"hf_dataset.push_to_hub(\"raven-consult/channels_news\")" | |
], | |
"metadata": { | |
"id": "iIsOZ4Y_2Us8" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
} | |
], | |
"metadata": { | |
"colab": { | |
"provenance": [], | |
"collapsed_sections": [ | |
"086de4YcBHsJ", | |
"UUqQlUaHfqkL", | |
"s7ItuSCFCviI", | |
"fNN7bq86fkAo" | |
], | |
"mount_file_id": "1qHC6b_J3CBL-G2FBVBPcUyzbrY_CVnhe", | |
"authorship_tag": "ABX9TyPcOsuMMG4vWd7oAnlH11p/", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"display_name": "Python 3", | |
"name": "python3" | |
}, | |
"language_info": { | |
"name": "python" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment