Created
March 18, 2020 13:25
-
-
Save simonseo/56a794138a08164f7e0a052ed7d4f854 to your computer and use it in GitHub Desktop.
패스트캠퍼스 강좌를 다운로드 받기 위한 스크레이퍼
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @File Name: scrapper.py
# @Created: 2020-03-18 02:57:12 Simon Myunggun Seo ([email protected])
# @Updated: 2020-03-18 17:08:14 Simon Seo ([email protected])
import json
import os
import sys
import time
from contextlib import contextmanager

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
# Login credentials typed into the sign-in form by authenticate().
# They can be supplied through the environment so that real secrets never
# have to be written into this file; the literals are only fallbacks.
USERNAME = os.environ.get("FASTCAMPUS_USERNAME", "[email protected]")
PASSWORD = os.environ.get("FASTCAMPUS_PASSWORD", "mypassword")
# Page that run() opens first.
COURSE_URL = "https://www.fastcampus.co.kr/courses/200328/clips/4591"
# Number of clips run() downloads (one "next" click per clip).
VIDEO_COUNT = 5
@contextmanager
def headlessDriver():
    """Yield a Chrome WebDriver and shut it down when the ``with`` block ends.

    Despite the name, the browser currently opens a visible window because
    the ``headless`` argument below is commented out.

    Yields:
        webdriver.Chrome: a driver with a 10-second implicit wait configured.
    """
    options = webdriver.ChromeOptions()
    # Uncomment to run without a visible browser window.
    # options.add_argument('headless')
    # ``options=`` is the supported keyword; ``chrome_options=`` was
    # deprecated by Selenium and is rejected by newer releases.
    driver = webdriver.Chrome(options=options)
    try:
        driver.implicitly_wait(10)
        yield driver
    finally:
        # quit() ends the whole session (browser and driver process), whereas
        # close() only closes the current window. Running it in ``finally``
        # guarantees cleanup even when the body of the ``with`` block raises.
        driver.quit()
@contextmanager
def point_to(driver, url):
    """Temporarily navigate ``driver`` to ``url`` and go back on exit.

    Args:
        driver: WebDriver (or compatible object) to navigate.
        url: Address to load for the duration of the ``with`` block.

    Yields:
        The same ``driver`` object, after ``url`` has been requested.
    """
    driver.get(url)
    # NOTE: implicitly_wait() does not pause execution; it sets how long
    # element lookups keep retrying, and the value persists after this block.
    driver.implicitly_wait(4)
    try:
        yield driver
    finally:
        # Navigate back even if the body raised, so the caller's driver is
        # never left on ``url``.
        driver.back()
        driver.implicitly_wait(4)
def authenticate(driver):
    """Log in through the e-mail/password form on the current page.

    Fills the inputs named ``email`` and ``password`` with USERNAME and
    PASSWORD, submits the form with the Return key, then sleeps five seconds
    before returning.

    Args:
        driver: WebDriver currently showing the login form.
    """
    # Normal Auth
    print("Authenticating Fastcampus")
    # find_element(By.NAME, ...) works on both old and new Selenium releases;
    # the find_element_by_name() shortcut no longer exists in current ones.
    username_box = driver.find_element(By.NAME, "email")
    username_box.clear()
    username_box.send_keys(USERNAME)
    password_box = driver.find_element(By.NAME, "password")
    password_box.clear()
    password_box.send_keys(PASSWORD)
    password_box.send_keys(Keys.RETURN)
    # Fixed pause after submitting; presumably gives the post-login page
    # time to load.
    time.sleep(5)
def get_video_info(driver: webdriver.Chrome, timeout: float = 60.0):
    """Return the title and video source URL of the clip shown in the player.

    Args:
        driver: Driver positioned on a lecture page.
        timeout: Maximum number of seconds to wait for the header to stop
            showing the "no lecture is playing" placeholder text.

    Returns:
        tuple: ``(title, video_src)`` where ``title`` is the text of the last
        descendant of the header title element and ``video_src`` is the
        ``src`` attribute of the ``kollus_player_html5_api`` element found on
        the player iframe's page.

    Raises:
        TimeoutError: if the placeholder title is still shown after
            ``timeout`` seconds.
    """
    print("Retrieving Video Info from {}".format(driver.current_url))

    def read_title():
        # The title text lives in the last descendant of the header element.
        header = driver.find_elements(By.CLASS_NAME, 'fco-lecture-hall-header__title')[0]
        return header.find_elements(By.CSS_SELECTOR, "*")[-1].text

    # get title here
    deadline = time.monotonic() + timeout
    title = read_title()
    # The placeholder below is Korean for "No lecture is currently playing."
    while title == "재생중인 강의가 없습니다.":
        if time.monotonic() >= deadline:
            raise TimeoutError(
                "no lecture started playing within {} seconds".format(timeout)
            )
        # implicitly_wait() would not pause here (it only configures lookup
        # retries), so sleep explicitly between polls instead of spinning.
        time.sleep(1)
        title = read_title()

    iframe = driver.find_elements(By.CLASS_NAME, "fco-kollus-video__viewer")[0]
    iframe_src = iframe.get_attribute('src')
    print(iframe_src)
    # Open the iframe's own URL to reach the <video> element, then go back.
    with point_to(driver, iframe_src):
        player = driver.find_element(By.ID, 'kollus_player_html5_api')
        print(player)
        video_src = player.get_attribute('src')
    return title, video_src
def load_next_video(driver):
    """Click the last control inside the player's play-control container.

    The target is the last descendant of the first element with class
    ``fco-video-controller__play-control``; the original author treats it as
    the "next video" button.

    Args:
        driver: WebDriver positioned on a lecture page.
    """
    # find_elements(By..., ...) replaces the find_elements_by_* shortcuts,
    # which current Selenium releases no longer provide.
    controls = driver.find_elements(By.CLASS_NAME, "fco-video-controller__play-control")[0]
    next_button = controls.find_elements(By.CSS_SELECTOR, "*")[-1]
    # click button even if it's hidden
    webdriver.ActionChains(driver).move_to_element(next_button).click(next_button).perform()
    # NOTE: this sets the element-lookup timeout to 4 seconds; it does not
    # pause execution.
    driver.implicitly_wait(4)
def download(file_name, link):
    """Stream ``link`` into ``file_name`` while drawing a progress bar.

    Args:
        file_name: Path of the file to create (overwritten if it exists).
        link: URL to download.

    Raises:
        requests.HTTPError: if the server answers with an error status. The
            status is checked before the file is created, so an error page
            is never saved under ``file_name``.
    """
    print("Downloading video \"{}\"".format(file_name))
    # Using the response as a context manager releases the connection even
    # if writing fails part-way through.
    with requests.get(link, stream=True) as response:
        response.raise_for_status()
        total_length = response.headers.get('content-length')
        # 0 means "size unknown": the body is still streamed chunk by chunk
        # (never buffered whole in memory), just without a progress bar.
        total_length = int(total_length) if total_length is not None else 0
        dl = 0
        with open(file_name, "wb") as f:
            for data in response.iter_content(chunk_size=4096):
                dl += len(data)
                f.write(data)
                if total_length > 0:
                    # Clamp to the bar width in case more bytes arrive than
                    # the header announced (e.g. a compressed transfer).
                    done = min(50, int(50 * dl / total_length))
                    sys.stdout.write("\r[%s%s]" % ('=' * done, ' ' * (50 - done)))
                    sys.stdout.flush()
    print("\nFinished Downloading video \"{}\"".format(file_name))
def _safe_filename(title):
    """Return ``title`` with characters that are unsafe in file names replaced by "_"."""
    forbidden = '\\/:*?"<>|'
    cleaned = "".join(
        "_" if ch in forbidden or ord(ch) < 32 else ch for ch in title
    ).strip()
    return cleaned or "untitled"


def run():
    """Open COURSE_URL, log in if needed, and download VIDEO_COUNT clips.

    For each clip the "next" control is clicked, the title and video URL are
    read from the page, and the video is saved as ``<title>.mp4`` in the
    current working directory.

    Raises:
        Exception: any error from the browser or the download is re-raised
            after the browser window has been closed.
    """
    with headlessDriver() as driver:
        try:
            # log in
            driver.get(COURSE_URL)
            # The marker is the site's name in Korean ("Fastcampus");
            # presumably its presence in the title means the login page
            # was served -- verify against the site.
            if "패스트캠퍼스" in driver.title:
                authenticate(driver)
            # download
            for _ in range(VIDEO_COUNT):
                load_next_video(driver)
                title, video_src = get_video_info(driver)
                # Titles are free text and may contain "/" or ":" etc.,
                # which would make open() fail or write somewhere else.
                download(_safe_filename(title) + ".mp4", video_src)
        except Exception:
            # Close the window before propagating the error.
            driver.close()
            # Bare raise keeps the original traceback intact.
            raise
# Script entry point: start scraping only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    run()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment