Created
July 17, 2024 08:58
-
-
Save smzm/f41ee308da75ecbcf6f96643892c171b to your computer and use it in GitHub Desktop.
Udemy Course Content Scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Udemy course content scraper.
#
# Prompts for a course URL label (the slug after ``/course/``), opens the
# public course page in headless Chromium, expands the curriculum and writes
# every section with its lectures and durations to ``<label>.csv`` as
# tab-separated text.
from playwright.sync_api import sync_playwright

UdemyCourse = "https://www.udemy.com/course/"
courseLable = input("Enter the course URL label: ")

with sync_playwright() as playwright:
    browser = playwright.chromium.launch(headless=True)
    try:
        context = browser.new_context()
        page = context.new_page()
        page.goto(UdemyCourse + courseLable)

        # Lecture rows are only looked up after the curriculum is expanded.
        page.get_by_text("Expand all sections").click()

        # One locator per curriculum section panel.
        sections = page.locator(
            "xpath=//div[@data-purpose='course-curriculum']/div[last()]/div"
        ).all()

        # Explicit UTF-8 so non-ASCII course titles do not fail (or get
        # mangled) on platforms whose default locale encoding is not UTF-8.
        with open(f"{courseLable}.csv", "w", encoding="utf-8") as file:
            # First line is a "sep=" delimiter hint for spreadsheet apps.
            file.write("sep=\t \n")

            for index, section in enumerate(sections, start=1):
                # NOTE(review): the hashed class name below is copied from
                # the page markup and will break when Udemy rebuilds its CSS.
                header = section.locator(
                    "xpath=/div[1]//span[@class='section--section-title--svpHP']"
                ).inner_text()
                lectures = section.locator("xpath=/div[2]//ul/li").all()

                # The header shares its row with the first lecture: the
                # header text is column 1 and the lecture row supplies the
                # remaining columns plus the line terminator.
                file.write(f"{index}. {header}")
                if not lectures:
                    # No lecture row will terminate the line, so end it here;
                    # otherwise the next section header would be glued on.
                    file.write("\n")

                for lecture in lectures:
                    content = lecture.locator(
                        "xpath=/div/div/div/div//span"
                    ).inner_text()
                    duration = lecture.locator(
                        "xpath=/div/div/span[2]"
                    ).inner_text()
                    file.write(f" \t{content}\t {duration}\n")
    finally:
        # Shut the browser down even if a locator lookup or write fails.
        browser.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
CBTNUGGETS :