@RazhanHameed
A minimal script to scrape rudaw.net
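
The script depends on playwright (with the Chromium browser installed via `playwright install chromium`), trafilatura, and tqdm. Before running the full pipeline you can sanity-check the text-extraction step on a single article, independent of the browser automation. This is a minimal sketch; the article URL below is only a placeholder:

# quick sanity check of trafilatura extraction (the URL is a placeholder, not a real article)
from trafilatura import fetch_url, extract

html = fetch_url("https://www.rudaw.net/sorani/kurdistan/some-article")
text = extract(html) if html else None
print(text[:500] if text else "no text extracted")

The full script follows: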
"""
This script can scrape rudaw.net in two stages
1. `python rudaw.py links` collect links for each category
2. `python rudaw.py content` collect content for each link and writes it to rudaw.csv
"""
import sys
import os
import csv
import asyncio
from playwright.async_api import async_playwright, Playwright
from trafilatura import extract
from tqdm import tqdm

URLs = [
    "https://www.rudaw.net/sorani/news?CategoryID=412602",
    "https://www.rudaw.net/sorani/news?CategoryID=412608",
    "https://www.rudaw.net/sorani/news?CategoryID=412614",
    "https://www.rudaw.net/sorani/news?CategoryID=412616",
    "https://www.rudaw.net/sorani/news?CategoryID=412617",
    "https://www.rudaw.net/sorani/news?CategoryID=412625",
    "https://www.rudaw.net/sorani/news?CategoryID=412626",
    "https://www.rudaw.net/sorani/news?CategoryID=412627",
    "https://www.rudaw.net/sorani/news?CategoryID=412628",
    "https://www.rudaw.net/sorani/news?CategoryID=414583",
    "https://www.rudaw.net/sorani/news?CategoryID=414584",
    "https://www.rudaw.net/sorani/news?CategoryID=412631",
    "https://www.rudaw.net/sorani/news?CategoryID=412632",
    "https://www.rudaw.net/sorani/onair/tv/episodes?PID=67",
    "https://www.rudaw.net/sorani/onair/tv/episodes?PID=55",
]

# category names, one per URL above; also used as the link-file names
category_names = [
    "کوردستان",  # Kurdistan
    "عێراق",  # Iraq
    "تورکیا",  # Turkey
    "ئێران",  # Iran
    "سووریا",  # Syria
    "جیهان",  # World
    "ئابووری",  # Economy
    "هەڤپەیڤین",  # Interviews
    "بیروڕا",  # Opinion
    "کولتوور و ستایڵ",  # Culture & Style
    "ئەدەب",  # Literature
    "تەندروستی",  # Health
    "وەرزش",  # Sports
    "دۆکیۆمێنتاری",  # Documentaries
    "فیچەر",  # Features
]

# use list slicing if you want to skip some categories
# URLs = URLs[14:]
# category_names = category_names[14:]

SAVE_DIR = "rudaw_links"
OUTPUT_FILE = "rudaw.csv"

# make sure SAVE_DIR exists
if not os.path.exists(SAVE_DIR):
    os.makedirs(SAVE_DIR)

async def run(playwright: Playwright):
    browser = await playwright.chromium.launch(headless=False)
    for url, category_name in zip(URLs, category_names):
        print("Scraping " + category_name)
        page = await browser.new_page()
        await page.goto(url)
        links = set()
        last_height = 0
        # collect the links that are visible before any scrolling
        new_links = await page.evaluate(
            """
            () => Array.from(document.querySelectorAll('a.article__link')).map(a => a.href)
            """
        )
        links.update(new_links)
        print(f"Total unique links found: {len(links)}")
        while True:
            # scroll down to trigger the infinite-scroll listing
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            # wait for new content to load
            await page.wait_for_timeout(3000)
            # get all article links currently in the DOM
            new_links = await page.evaluate(
                """
                () => Array.from(document.querySelectorAll('a.article__link')).map(a => a.href)
                """
            )
            # add new links to the set
            links.update(new_links)
            # save links to file after every scroll so progress is not lost
            with open(SAVE_DIR + f"/{category_name}.txt", "w") as f:
                for link in links:
                    f.write(f"{link}\n")
            print(f"Total unique links found: {len(links)}")
            # check if we've reached the end of the page
            new_height = await page.evaluate("document.body.scrollHeight")
            # the "search archive" link only appears once the listing is exhausted;
            # get_by_role() and locator() return Locators synchronously, so no await here
            archive_btn = page.get_by_role("link", name="ئه‌رشیفی گه‌ڕان")
            loader = page.locator("#ajaxLoaderNewsListing")
            if (
                new_height == last_height
                and not await loader.is_visible()
                and await archive_btn.is_visible()
            ):
                print("Reached the end of the page")
                break
            last_height = new_height
        # close this category's tab before moving on to the next one
        await page.close()
    await browser.close()

async def get_content(playwright):
    browser = await playwright.chromium.launch()
    # read the per-category link files written by the `links` stage
    files = os.listdir(SAVE_DIR)
    # read urls already in rudaw.csv so we don't scrape them again
    rudaw_urls = set()
    if os.path.exists(OUTPUT_FILE):
        with open(OUTPUT_FILE, "r") as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                rudaw_urls.add(row["url"])
    # read URLs and categories from the link files
    urls_and_categories = []
    for file in tqdm(files):
        category = file.split(".")[0]
        with open(SAVE_DIR + "/" + file, "r") as f:
            urls = f.read().splitlines()
        urls_and_categories.extend([(url, category) for url in urls])
    # filter out URLs that are already in rudaw.csv
    urls_and_categories = [
        (url, category)
        for url, category in urls_and_categories
        if url not in rudaw_urls
    ]
    # write the header row if the output file does not exist yet
    if not os.path.exists(OUTPUT_FILE):
        with open(OUTPUT_FILE, "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(["text", "category", "url"])
    # append the extracted articles to the csv file
    with open(OUTPUT_FILE, "a", newline="") as f:
        writer = csv.writer(f)
        context = await browser.new_context()
        page = await context.new_page()
        for url, category in tqdm(urls_and_categories, desc="Extracting text.."):
            await page.goto(url, timeout=60000)
            content = await page.content()
            text = extract(content)
            writer.writerow([text, category, url])
        await context.close()
    await browser.close()

# guard against missing or invalid arguments
if len(sys.argv) < 2 or sys.argv[1] not in ("links", "content"):
    print("Usage: python rudaw.py <links|content>")
    sys.exit(1)

async def main(stage: str = "links"):
    async with async_playwright() as playwright:
        if stage == "links":
            await run(playwright)
        elif stage == "content":
            await get_content(playwright)

asyncio.run(main(sys.argv[1]))
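
Once the content stage has finished, the resulting rudaw.csv can be inspected with the standard library alone. A minimal sketch that counts collected articles per category, assuming the text/category/url columns written by the script above:

# count how many articles were collected per category (reads the rudaw.csv produced above)
import csv
from collections import Counter

csv.field_size_limit(10**7)  # article texts can be long; raise the default field limit

counts = Counter()
with open("rudaw.csv", newline="") as f:
    for row in csv.DictReader(f):
        counts[row["category"]] += 1

for category, n in counts.most_common():
    print(category, n)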