A minimal script to scrape rudaw.net
""" | |
This script can scrape rudaw.net in two stages | |
1. `python rudaw.py links` collect links for each category | |
2. `python rudaw.py content` collect content for each link and writes it to rudaw.csv | |
""" | |
import sys
import os
import csv
import asyncio

from playwright.async_api import async_playwright, Playwright
from trafilatura import extract
from tqdm import tqdm
URLs = [
    "https://www.rudaw.net/sorani/news?CategoryID=412602",
    "https://www.rudaw.net/sorani/news?CategoryID=412608",
    "https://www.rudaw.net/sorani/news?CategoryID=412614",
    "https://www.rudaw.net/sorani/news?CategoryID=412616",
    "https://www.rudaw.net/sorani/news?CategoryID=412617",
    "https://www.rudaw.net/sorani/news?CategoryID=412625",
    "https://www.rudaw.net/sorani/news?CategoryID=412626",
    "https://www.rudaw.net/sorani/news?CategoryID=412627",
    "https://www.rudaw.net/sorani/news?CategoryID=412628",
    "https://www.rudaw.net/sorani/news?CategoryID=414583",
    "https://www.rudaw.net/sorani/news?CategoryID=414584",
    "https://www.rudaw.net/sorani/news?CategoryID=412631",
    "https://www.rudaw.net/sorani/news?CategoryID=412632",
    "https://www.rudaw.net/sorani/onair/tv/episodes?PID=67",
    "https://www.rudaw.net/sorani/onair/tv/episodes?PID=55",
]
category_names = [
    "کوردستان",  # Kurdistan
    "عێراق",  # Iraq
    "تورکیا",  # Turkey
    "ئێران",  # Iran
    "سووریا",  # Syria
    "جیهان",  # World
    "ئابووری",  # Economy
    "هەڤپەیڤین",  # Interview
    "بیروڕا",  # Opinion
    "کولتوور و ستایڵ",  # Culture and Style
    "ئەدەب",  # Literature
    "تەندروستی",  # Health
    "وەرزش",  # Sports
    "دۆکیۆمێنتاری",  # Documentary
    "فیچەر",  # Feature
]
# use list slicing if you want to skip some categories
# URLs = URLs[14:]
# category_names = category_names[14:]

SAVE_DIR = "rudaw_links"
OUTPUT_FILE = "rudaw.csv"

# make sure SAVE_DIR exists
if not os.path.exists(SAVE_DIR):
    os.makedirs(SAVE_DIR)
async def run(playwright: Playwright):
    browser = await playwright.chromium.launch(headless=False)
    for url, category_name in zip(URLs, category_names):
        print("Scraping " + category_name)
        page = await browser.new_page()
        await page.goto(url)
        links = set()
        last_height = 0
        # Collect the links that are already visible before scrolling
        new_links = await page.evaluate(
            """
            () => Array.from(document.querySelectorAll('a.article__link')).map(a => a.href)
            """
        )
        # Add new links to set
        links.update(new_links)
        print(f"Total unique links found: {len(links)}")

        while True:
            # Scroll down
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            # Wait for new content to load
            await page.wait_for_timeout(3000)
            # Get all links
            new_links = await page.evaluate(
                """
                () => Array.from(document.querySelectorAll('a.article__link')).map(a => a.href)
                """
            )
            # Add new links to set
            links.update(new_links)

            # Save links to file
            with open(SAVE_DIR + f"/{category_name}.txt", "w") as f:
                for link in links:
                    f.write(f"{link}\n")
            print(f"Total unique links found: {len(links)}")

            # Check if we've reached the end of the page
            new_height = await page.evaluate("document.body.scrollHeight")
            # Get the archive link and the AJAX loader, used to detect the end of the listing
            archive_btn = page.get_by_role("link", name="ئهرشیفی گهڕان")
            loader = page.locator("#ajaxLoaderNewsListing")
            if (
                new_height == last_height
                and not await loader.is_visible()
                and await archive_btn.is_visible()
            ):
                print("Reached the end of the page")
                break
            last_height = new_height

    await browser.close()
async def get_content(playwright):
    browser = await playwright.chromium.launch()
    # read the per-category link files from SAVE_DIR
    files = os.listdir(SAVE_DIR)
    # read URLs from rudaw.csv (if it already exists) so we don't scrape them again
    rudaw_urls = set()
    if os.path.exists(OUTPUT_FILE):
        with open(OUTPUT_FILE, "r") as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                rudaw_urls.add(row["url"])
    # read URLs and categories from files
    urls_and_categories = []
    for file in tqdm(files):
        category = file.split(".")[0]
        with open(SAVE_DIR + "/" + file, "r") as f:
            urls = f.read().splitlines()
        urls_and_categories.extend([(url, category) for url in urls])

    # Filter out URLs that are already in rudaw.csv
    urls_and_categories = [
        (url, category)
        for url, category in urls_and_categories
        if url not in rudaw_urls
    ]

    # write results to a csv file using the csv writer
    if not os.path.exists(OUTPUT_FILE):
        with open(OUTPUT_FILE, "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(["text", "category", "url"])

    with open(OUTPUT_FILE, "a", newline="") as f:
        writer = csv.writer(f)
        context = await browser.new_context()
        page = await context.new_page()
        for url, category in tqdm(urls_and_categories, desc="Extracting text.."):
            await page.goto(url, timeout=60000)
            content = await page.content()
            text = extract(content)
            writer.writerow([text, category, url])
        await context.close()

    await browser.close()
# setup guard for arguments
if len(sys.argv) < 2:
    print("Usage: python rudaw.py <links|content>")
    sys.exit(1)


async def main(stage: str = "links"):
    async with async_playwright() as playwright:
        if stage == "links":
            await run(playwright)
        elif stage == "content":
            await get_content(playwright)


asyncio.run(main(sys.argv[1]))
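# A minimal sketch (an addition, not part of the original script) for inspecting the
# output, assuming the "text", "category", and "url" columns written above:
#   import csv
#   with open("rudaw.csv", newline="") as f:
#       for row in csv.DictReader(f):
#           print(row["category"], row["url"])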