A minimal script to scrape rudaw.net
""" | |
This script can scrape rudaw.net in two stages | |
1. `python rudaw.py links` collect links for each category | |
2. `python rudaw.py content` collect content for each link and writes it to rudaw.csv | |
""" | |
import sys
import os
import csv
import asyncio

from playwright.async_api import async_playwright, Playwright
from trafilatura import extract
from tqdm import tqdm
URLs = [
    "https://www.rudaw.net/sorani/news?CategoryID=412602",
    "https://www.rudaw.net/sorani/news?CategoryID=412608",
    "https://www.rudaw.net/sorani/news?CategoryID=412614",
    "https://www.rudaw.net/sorani/news?CategoryID=412616",
    "https://www.rudaw.net/sorani/news?CategoryID=412617",
    "https://www.rudaw.net/sorani/news?CategoryID=412625",
    "https://www.rudaw.net/sorani/news?CategoryID=412626",
    "https://www.rudaw.net/sorani/news?CategoryID=412627",
    "https://www.rudaw.net/sorani/news?CategoryID=412628",
    "https://www.rudaw.net/sorani/news?CategoryID=414583",
    "https://www.rudaw.net/sorani/news?CategoryID=414584",
    "https://www.rudaw.net/sorani/news?CategoryID=412631",
    "https://www.rudaw.net/sorani/news?CategoryID=412632",
    "https://www.rudaw.net/sorani/onair/tv/episodes?PID=67",
    "https://www.rudaw.net/sorani/onair/tv/episodes?PID=55",
]
category_names = [
    "کوردستان",  # Kurdistan
    "عێراق",  # Iraq
    "تورکیا",  # Turkey
    "ئێران",  # Iran
    "سووریا",  # Syria
    "جیهان",  # World
    "ئابووری",  # Economy
    "هەڤپەیڤین",  # Interview
    "بیروڕا",  # Opinion
    "کولتوور و ستایڵ",  # Culture and Style
    "ئەدەب",  # Literature
    "تەندروستی",  # Health
    "وەرزش",  # Sports
    "دۆکیۆمێنتاری",  # Documentary
    "فیچەر",  # Feature
]
# use list slicing if you want to skip some categories
# URLs = URLs[14:]
# category_names = category_names[14:]

SAVE_DIR = "rudaw_links"
OUTPUT_FILE = "rudaw.csv"

# make sure SAVE_DIR exists
if not os.path.exists(SAVE_DIR):
    os.makedirs(SAVE_DIR)
async def run(playwright: Playwright):
    browser = await playwright.chromium.launch(headless=False)
    for url, category_name in zip(URLs, category_names):
        print("Scraping " + category_name)
        page = await browser.new_page()
        await page.goto(url)
        links = set()
        last_height = 0
        # Collect the links that are already visible before scrolling
        new_links = await page.evaluate(
            """
            () => Array.from(document.querySelectorAll('a.article__link')).map(a => a.href)
            """
        )
        # Add new links to set
        links.update(new_links)
        print(f"Total unique links found: {len(links)}")

        while True:
            # Scroll down
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            # Wait for new content to load
            await page.wait_for_timeout(3000)
            # Get all links
            new_links = await page.evaluate(
                """
                () => Array.from(document.querySelectorAll('a.article__link')).map(a => a.href)
                """
            )
            # Add new links to set
            links.update(new_links)

            # Save links to file
            with open(SAVE_DIR + f"/{category_name}.txt", "w") as f:
                for link in links:
                    f.write(f"{link}\n")
            print(f"Total unique links found: {len(links)}")

            # Check if we've reached the end of the page
            new_height = await page.evaluate("document.body.scrollHeight")
            # Get the archive link and the AJAX loader, used to detect the end of the listing
            archive_btn = page.get_by_role("link", name="ئهرشیفی گهڕان")
            loader = page.locator("#ajaxLoaderNewsListing")
            if (
                new_height == last_height
                and not await loader.is_visible()
                and await archive_btn.is_visible()
            ):
                print("Reached the end of the page")
                break
            last_height = new_height

    await browser.close()
async def get_content(playwright):
    browser = await playwright.chromium.launch()
    # read the per-category link files from SAVE_DIR
    files = os.listdir(SAVE_DIR)
    # read URLs from rudaw.csv (if it already exists) so we don't scrape them again
    rudaw_urls = set()
    if os.path.exists(OUTPUT_FILE):
        with open(OUTPUT_FILE, "r") as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                rudaw_urls.add(row["url"])
    # read URLs and categories from files
    urls_and_categories = []
    for file in tqdm(files):
        category = file.split(".")[0]
        with open(SAVE_DIR + "/" + file, "r") as f:
            urls = f.read().splitlines()
        urls_and_categories.extend([(url, category) for url in urls])

    # Filter out URLs that are already in rudaw.csv
    urls_and_categories = [
        (url, category)
        for url, category in urls_and_categories
        if url not in rudaw_urls
    ]

    # write results to a csv file using the csv writer
    if not os.path.exists(OUTPUT_FILE):
        with open(OUTPUT_FILE, "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(["text", "category", "url"])

    with open(OUTPUT_FILE, "a", newline="") as f:
        writer = csv.writer(f)
        context = await browser.new_context()
        page = await context.new_page()
        for url, category in tqdm(urls_and_categories, desc="Extracting text.."):
            await page.goto(url, timeout=60000)
            content = await page.content()
            text = extract(content)
            writer.writerow([text, category, url])
        await context.close()

    await browser.close()
# setup guard for arguments
if len(sys.argv) < 2:
    print("Usage: python rudaw.py <links|content>")
    sys.exit(1)


async def main(stage: str = "links"):
    async with async_playwright() as playwright:
        if stage == "links":
            await run(playwright)
        elif stage == "content":
            await get_content(playwright)


asyncio.run(main(sys.argv[1]))
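# A minimal sketch (an addition, not part of the original script) for inspecting the
# output, assuming the "text", "category", and "url" columns written above:
#   import csv
#   with open("rudaw.csv", newline="") as f:
#       for row in csv.DictReader(f):
#           print(row["category"], row["url"])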