Skip to content

Instantly share code, notes, and snippets.

@smzm
Created July 17, 2024 08:58
Show Gist options
  • Save smzm/f41ee308da75ecbcf6f96643892c171b to your computer and use it in GitHub Desktop.
Save smzm/f41ee308da75ecbcf6f96643892c171b to your computer and use it in GitHub Desktop.
Udemy Course Content Scraper
from playwright.sync_api import sync_playwright
# Scrape the curriculum (section titles, lecture names and durations) of a
# Udemy course page and save it as a tab-separated file named after the course.

UdemyCourse = "https://www.udemy.com/course/"
# The label is used both in the URL and as the output file name, so drop
# surrounding whitespace and slashes that would make open() fail.
courseLable = input("Enter the course URL label: ").strip().strip("/")

with sync_playwright() as playwright:
    browser = playwright.chromium.launch(headless=True)
    try:
        context = browser.new_context()
        page = context.new_page()
        page.goto(UdemyCourse + courseLable)
        # Expand every section before reading the lecture rows.
        # NOTE(review): presumably the lecture <li> elements are only present
        # once a section is expanded - confirm against the live page.
        page.get_by_text("Expand all sections").click()
        # One element per curriculum section. Locator.all() does not wait for
        # matches; it returns whatever is in the DOM at the time of the call.
        node = page.locator(
            "xpath=//div[@data-purpose='course-curriculum']/div[last()]/div"
        ).all()

        # Explicit UTF-8 so non-ASCII titles do not depend on the OS locale.
        with open(f"{courseLable}.csv", "w", encoding="utf-8") as file:
            # First line tells spreadsheet tools which separator is used.
            file.write("sep=\t \n")
            for i, season in enumerate(node, start=1):
                header = season.locator(
                    "xpath=/div[1]//span[@class='section--section-title--svpHP']"
                ).inner_text()
                section_lists = season.locator("xpath=/div[2]//ul/li").all()
                # The numbered header shares its row with the first lecture.
                file.write(f"{i}. {header}")
                if not section_lists:
                    # No lecture row will terminate the line; end it here so
                    # the next section header starts on its own row.
                    file.write("\n")
                for section in section_lists:
                    content = section.locator(
                        "xpath=/div/div/div/div//span"
                    ).inner_text()
                    duration = section.locator("xpath=/div/div/span[2]").inner_text()
                    file.write(f" \t{content}\t {duration}\n")
    finally:
        # Release the browser even if scraping fails part-way.
        browser.close()
@smzm
Copy link
Author

smzm commented Jul 17, 2024

CBT Nuggets variant:

import re

from playwright.sync_api import sync_playwright

# Scrape the skill list of a CBT Nuggets course page (skill titles, video names
# and durations) and save it as a tab-separated file.

URL = "https://www.cbtnuggets.com/it-training/python/python-pandas"

with sync_playwright() as playwright:
    browser = playwright.chromium.launch(headless=False)
    try:
        context = browser.new_context()
        page = context.new_page()
        page.goto(URL)
        # One <li> per skill. Locator.all() does not wait for matches; it
        # returns whatever is in the DOM at the time of the call.
        list_of_li = page.locator(
            "xpath=//ul[@class='StyledSkillList-sc-pqcd25-0 hPGOBd']/li"
        ).all()

        # Explicit UTF-8 so non-ASCII titles do not depend on the OS locale.
        with open("cbtnuggets-sylabus.csv", "w", encoding="utf-8") as file:
            # First line tells spreadsheet tools which separator is used.
            file.write("sep=\t \n")
            for i, li in enumerate(list_of_li, start=1):
                # text_content() may return None; fall back to an empty string
                # so re.sub() does not raise TypeError.
                full_header = li.locator("xpath=/div/div[2]").text_content() or ""
                # Replace a leading "Skill:" prefix with the running number.
                header = re.sub(r"^Skill:", f"{i}. ", full_header)
                # The header shares its row with the first video entry.
                file.write(header)
                # Click the skill before reading its nested list.
                # NOTE(review): presumably this expands the skill so its
                # videos are rendered - confirm against the live page.
                li.click()
                contents = li.locator("xpath=/ul/li").all()
                if not contents:
                    # No video row will terminate the line; end it here so the
                    # next skill header starts on its own row.
                    file.write("\n")
                for content in contents:
                    name = content.locator("xpath=/div[2]/span").inner_text()
                    duration = content.locator("xpath=/div[2]/div").inner_text()
                    file.write(f"\t{name} \t {duration}\n")
    finally:
        # Release the browser even if scraping fails part-way.
        browser.close()

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment