Skip to content

Instantly share code, notes, and snippets.

@AWeirdDev
Created March 2, 2024 14:34
Show Gist options
  • Save AWeirdDev/b294e7fda8fd1330d8fac5ea20447090 to your computer and use it in GitHub Desktop.
Save AWeirdDev/b294e7fda8fd1330d8fac5ea20447090 to your computer and use it in GitHub Desktop.
from typing import List, Tuple
import requests
from datasets import Dataset
from selectolax.lexbor import LexborHTMLParser
# Number of news listing pages the crawler iterates over.
N_NEWS_ITERS: int = 100

# User-Agent header value sent with every HTTP request made by this script.
user_agent: str = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 OPR/107.0.0.0"
)

# Listing-page URL template; "%i" is filled with the page number.
base_url: str = "https://news.pts.org.tw/dailynews?page=%i"
def read_article(link: str, timeout: float = 30.0) -> Tuple[str, List[str]]:
    """Download one article page and extract its text.

    Args:
        link: URL of the article page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        A ``(conclusion, contents)`` tuple. ``conclusion`` is the text of the
        first element matching ``.articleimg``, or an empty string when the
        page has no such element. ``contents`` holds the text of each
        ``.post-article p`` element, in the order the selector returns them.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    r = requests.get(link, headers={"User-Agent": user_agent}, timeout=timeout)
    r.raise_for_status()
    parser = LexborHTMLParser(r.text)

    # css_first() returns None when nothing matches; fall back to an empty
    # string so one article without this element does not abort the caller.
    conclusion_node = parser.css_first(".articleimg")
    conclusion = conclusion_node.text() if conclusion_node is not None else ""

    contents = [
        paragraph.text(separator=" ", strip=True)
        for paragraph in parser.css(".post-article p")
    ]
    return (conclusion, contents)
def dataset_generator(timeout: float = 30.0):
    """Yield one record per article found on the daily-news listing pages.

    Fetches ``N_NEWS_ITERS`` listing pages built from ``base_url``, then
    downloads every article linked from each page via ``read_article``.

    Args:
        timeout: Seconds to wait for each listing-page response. Without a
            timeout, ``requests`` can block indefinitely on a stalled
            connection.

    Yields:
        dict: A record with the keys ``image``, ``title``, ``conclusion``,
        ``content`` (article paragraphs joined by newlines), ``timestamp``,
        ``category`` (``None`` when the listing entry has no category link)
        and ``link``.

    Raises:
        requests.HTTPError: If a listing page answers with an error status.
    """
    headers = {"User-Agent": user_agent}

    # NOTE(review): pages are requested as ?page=0 .. ?page=N_NEWS_ITERS-1.
    # Confirm the site treats page 0 as a distinct page; if it mirrors
    # page 1, the first page is scraped twice.
    for i in range(N_NEWS_ITERS):
        r = requests.get(base_url % i, headers=headers, timeout=timeout)
        r.raise_for_status()
        parser = LexborHTMLParser(r.text)

        articles = parser.css(".break-news-container ul.news-list-update li.d-flex")
        for article in articles:
            link = article.css_first("a").attributes['href']
            image = article.css_first("img").attributes['src']
            title = article.css_first("h2").text()

            info = article.css_first(".news-info")
            timestamp = info.css_first("time").text(strip=True)

            # The category link is optional; css_first() returns None when
            # it is absent.
            category_node = info.css_first("a")
            category = (
                category_node.text(strip=True)
                if category_node is not None
                else None
            )

            # NOTE(review): an earlier revision collected tag texts from
            # 'ul[x-data="articleTags"] li a' here but never emitted them.
            # The unused extraction was dropped; add a "tags" field to the
            # record below if tags are wanted in the dataset.

            conclusion, contents = read_article(link)
            yield {
                "image": image,
                "title": title,
                "conclusion": conclusion,
                "content": "\n".join(contents),
                "timestamp": timestamp,
                "category": category,
                "link": link
            }
# Materialize every scraped record into a Dataset, then write it to a
# directory whose name ends with the number of rows collected.
dataset = Dataset.from_generator(dataset_generator)
output_dir = f"zh-tw-pts-news-{dataset.num_rows}"
dataset.save_to_disk(output_dir)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment