fix download for edx.org in edx-dl (2020-02-21)
# -*- coding: utf-8 -*-

"""
Parsing and extraction functions
"""

import re
import json

from datetime import timedelta, datetime

from six.moves import html_parser
from bs4 import BeautifulSoup as BeautifulSoup_

from .common import Course, Section, SubSection, Unit, Video

# Force use of bs4 with html.parser
BeautifulSoup = lambda page: BeautifulSoup_(page, 'html.parser')
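# Note: html.parser is Python's built-in parser, so BeautifulSoup needs no
# third-party backend (such as lxml) to be installed for this module.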

def edx_json2srt(o):
    """
    Transform the dict 'o' into the srt subtitles format
    """
    if o == {}:
        return ''

    base_time = datetime(1, 1, 1)
    output = []

    for i, (s, e, t) in enumerate(zip(o['start'], o['end'], o['text'])):
        if t == '':
            continue
        output.append(str(i) + '\n')
        s = base_time + timedelta(seconds=s/1000.)
        e = base_time + timedelta(seconds=e/1000.)
        time_range = "%02d:%02d:%02d,%03d --> %02d:%02d:%02d,%03d\n" % \
                     (s.hour, s.minute, s.second, s.microsecond/1000,
                      e.hour, e.minute, e.second, e.microsecond/1000)
        output.append(time_range)
        output.append(t + "\n\n")

    return ''.join(output)
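# Illustrative example (not part of the original module): the transcript dicts
# consumed here carry millisecond 'start'/'end' lists plus a 'text' list, e.g.
#
#   edx_json2srt({'start': [0, 2500], 'end': [2500, 5000],
#                 'text': ['Hello', 'World']})
#
# yields two SRT cues:
#
#   0
#   00:00:00,000 --> 00:00:02,500
#   Hello
#
#   1
#   00:00:02,500 --> 00:00:05,000
#   World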

class PageExtractor(object):
    """
    Base class for PageExtractor

    Every subclass can represent a different layout for an OpenEdX site.
    They should implement the given methods.

    Usage:

        >>> import parsing
        >>> d = parsing.SubclassFromPageExtractor()
        >>> units = d.extract_units_from_html(page, BASE_URL)
        >>> ...
    """

    def extract_units_from_html(self, page, BASE_URL, file_formats):
        """
        Method to extract the resources (units) from the given page
        """
        raise NotImplementedError("Subclasses should implement this")

    def extract_sections_from_html(self, page, BASE_URL):
        """
        Method to extract the sections (and subsections) from an html page
        """
        raise NotImplementedError("Subclasses should implement this")

    def extract_courses_from_html(self, page, BASE_URL):
        """
        Method to extract the courses from an html page
        """
        raise NotImplementedError("Subclasses should implement this")

class ClassicEdXPageExtractor(PageExtractor):

    def extract_units_from_html(self, page, BASE_URL, file_formats):
        """
        Extract Units from the html of a subsection webpage as a list of
        resources
        """
        # in this function we avoid using beautifulsoup for performance reasons
        # parsing html with regular expressions is really nasty, don't do this if
        # you don't need to!
        re_units = re.compile('(<div?[^>]id="seq_contents_\d+".*?>.*?<\/div>)',
                              re.DOTALL)
        units = []
        for unit_html in re_units.findall(page):
            unit = self.extract_unit(unit_html, BASE_URL, file_formats)
            if len(unit.videos) > 0 or len(unit.resources_urls) > 0:
                units.append(unit)
        return units

    def extract_unit(self, text, BASE_URL, file_formats):
        """
        Parses the <div> of each unit and extracts the urls of its resources
        """
        video_youtube_url = self.extract_video_youtube_url(text)
        available_subs_url, sub_template_url = self.extract_subtitle_urls(text, BASE_URL)
        mp4_urls = self.extract_mp4_urls(text)
        videos = [Video(video_youtube_url=video_youtube_url,
                        available_subs_url=available_subs_url,
                        sub_template_url=sub_template_url,
                        mp4_urls=mp4_urls)]

        resources_urls = self.extract_resources_urls(text, BASE_URL,
                                                     file_formats)
        return Unit(videos=videos, resources_urls=resources_urls)

    def extract_video_youtube_url(self, text):
        re_video_youtube_url = re.compile(r'data-streams=".*?1.0\d+\:(?:.*?)(.{11})')
        video_youtube_url = None
        match_video_youtube_url = re_video_youtube_url.search(text)
        if match_video_youtube_url is None:
            re_video_youtube_url = re.compile(r'https://www.youtube.com/embed/(.{11})\?rel=')
            match_video_youtube_url = re_video_youtube_url.search(text)
        if match_video_youtube_url is not None:
            video_id = match_video_youtube_url.group(1)
            video_youtube_url = 'https://youtube.com/watch?v=' + video_id
        return video_youtube_url
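    # Illustrative note (assumption about the classic edX markup): data-streams
    # appears to hold speed:YouTube-id pairs, something like
    #   data-streams="0.75:abc123def45,1.00:dQw4w9WgXcQ,1.25:..."
    # and the first regex above picks out the 11-character YouTube id attached
    # to the normal-speed entry; the second regex is the fallback for pages
    # that embed the player via an iframe URL instead.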

    def extract_subtitle_urls(self, text, BASE_URL):
        # quotes around these attribute values may appear either HTML-escaped
        # (&#34;) or literal, so both forms are accepted
        re_sub_template_url = re.compile(r'data-transcript-translation-url=(?:&#34;|")([^"&]*)(?:&#34;|")')
        re_available_subs_url = re.compile(r'data-transcript-available-translations-url=(?:&#34;|")([^"&]*)(?:&#34;|")')
        available_subs_url = None
        sub_template_url = None
        match_subs = re_sub_template_url.search(text)

        if match_subs:
            match_available_subs = re_available_subs_url.search(text)
            if match_available_subs:
                available_subs_url = BASE_URL + match_available_subs.group(1)
                sub_template_url = BASE_URL + match_subs.group(1) + "/%s"
        else:
            re_available_subs_url = re.compile(r'href=(?:&#34;|")([^"&]+)(?:&#34;|")>Download transcript<')
            match_available_subs = re_available_subs_url.search(text)
            if match_available_subs:
                sub_template_url = BASE_URL + match_available_subs.group(1)
                available_subs_url = None

        return available_subs_url, sub_template_url
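    # Illustrative note: the sub_template_url returned above keeps a '%s'
    # placeholder for the language code, so a caller can build, e.g.,
    # sub_template_url % 'en' to request the English transcript (assumption:
    # this mirrors how the rest of edx-dl consumes the value).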

    def extract_mp4_urls(self, text):
        """
        Looks for available links to the mp4 version of the videos
        """
        # mp4 urls may appear in two places: in the data-sources attribute and
        # as <a> refs. This regex tries to match all of those appearances; we
        # exclude the ';' character from the urls, since it is used to separate
        # multiple urls in one string. ';' is technically a valid url
        # character, but it is not really common.
        re_mp4_urls = re.compile(r'(?:(https?://[^;]*?\.mp4))')
        mp4_urls = list(set(re_mp4_urls.findall(text)))
        return mp4_urls
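    # Illustrative note (assumed markup): a data-sources attribute typically
    # lists alternative encodings separated by ';', for example
    #   data-sources="https://cdn.example.org/video.mp4;https://cdn.example.org/video.webm"
    # of which only the .mp4 entries are collected here.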

    def extract_resources_urls(self, text, BASE_URL, file_formats):
        """
        Extract resources looking for <a> references in the webpage and
        matching the given file formats
        """
        formats = '|'.join(file_formats)
        re_resources_urls = re.compile(r'<a href=(?:&#34;|")([^"&]*.(?:' + formats + '))(?:&#34;|")')
        resources_urls = []
        for url in re_resources_urls.findall(text):
            if url.startswith('http') or url.startswith('https'):
                resources_urls.append(url)
            elif url.startswith('//'):
                resources_urls.append('https:' + url)
            else:
                resources_urls.append(BASE_URL + url)

        # we match links to youtube videos as <a href> and add them to the
        # download list
        re_youtube_links = re.compile(r'<a href=(?:&#34;|")(https?\:\/\/(?:www\.)?(?:youtube\.com|youtu\.?be)\/.*?)(?:&#34;|")')
        youtube_links = re_youtube_links.findall(text)
        resources_urls += youtube_links

        return resources_urls

    def extract_sections_from_html(self, page, BASE_URL):
        """
        Extract sections (Section->SubSection) from the html page
        """
        def _make_url(section_soup):  # FIXME: Extract from here and test
            try:
                return BASE_URL + section_soup.ul.a['href']
            except AttributeError:
                # Section might be empty and contain no links
                return None

        def _get_section_name(section_soup):  # FIXME: Extract from here and test
            try:
                return section_soup.h3.a.string.strip()
            except AttributeError:
                return None

        def _make_subsections(section_soup):
            try:
                subsections_soup = section_soup.ul.find_all("li")
            except AttributeError:
                return []
            # FIXME correct extraction of subsection.name (unicode)
            subsections = [SubSection(position=i,
                                      url=BASE_URL + s.a['href'],
                                      name=s.p.get_text().replace('current section',''))
                           for i, s in enumerate(subsections_soup, 1)]
            return subsections

        soup = BeautifulSoup(page)
        sections_soup = soup.find_all('div', attrs={'class': 'chapter'})

        sections = [Section(position=i,
                            name=_get_section_name(section_soup),
                            url=_make_url(section_soup),
                            subsections=_make_subsections(section_soup))
                    for i, section_soup in enumerate(sections_soup, 1)]
        # Filter out those sections for which name or url could not be parsed
        sections = [section for section in sections
                    if section.name and section.url]

        return sections

    def extract_courses_from_html(self, page, BASE_URL):
        """
        Extracts courses (Course) from the html page
        """
        soup = BeautifulSoup(page)

        # First, try with the new course structure (as of December 2017). If
        # that doesn't work, we fall back to an older course structure
        # (released with version 0.1.6). If even that doesn't work, then we
        # try with the oldest course structure (that was current before
        # version 0.1.6).
        #
        # rbrito---This code is ugly.
        courses_soup = soup.find_all('article', 'course')

        if len(courses_soup) == 0:
            courses_soup = soup.find_all('div', 'course')

        if len(courses_soup) == 0:
            courses_soup = soup.find_all('div', 'course audit')

        courses = []
        for course_soup in courses_soup:
            course_id = None
            course_name = course_soup.h3.text.strip()
            course_url = None
            course_state = 'Not yet'
            try:
                # started courses include the course link in the href attribute
                course_url = BASE_URL + course_soup.a['href']
                if course_url.endswith('info') or course_url.endswith('info/') or course_url.endswith('course') or course_url.endswith('course/'):
                    course_state = 'Started'
                # The id of a course in edX is composed of the path
                # {organization}/{course_number}/{course_run}
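                # Illustrative note: for a classic dashboard link of the form
                # '/courses/{organization}/{course_number}/{course_run}/info',
                # the slice below drops the 9-character '/courses/' prefix and
                # the 5-character '/info' suffix, leaving just the course id.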
                course_id = course_soup.a['href'][9:-5]
            except KeyError:
                pass

            courses.append(Course(id=course_id,
                                  name=course_name,
                                  url=course_url,
                                  state=course_state))

        return courses

class CurrentEdXPageExtractor(ClassicEdXPageExtractor):
    """
    A new page extractor for the recent changes in layout of edx
    """
    def extract_unit(self, text, BASE_URL, file_formats):
        # the data-metadata attribute holds HTML-escaped JSON delimited by
        # HTML-escaped single quotes (&#39;)
        re_metadata = re.compile(r'data-metadata=&#39;(.*?)&#39;')

        videos = []
        match_metadatas = re_metadata.findall(text)
        for match_metadata in match_metadatas:
            metadata = html_parser.HTMLParser().unescape(match_metadata)
            metadata = json.loads(html_parser.HTMLParser().unescape(metadata))
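            # Illustrative note: the attribute value is HTML-escaped (in the
            # pages observed, apparently twice), hence the two unescape passes
            # before json.loads. The resulting dict is expected to contain at
            # least the keys read below: 'streams', 'sources',
            # 'transcriptAvailableTranslationsUrl' and 'transcriptTranslationUrl'.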
            video_youtube_url = None
            re_video_speed = re.compile(r'1.0\d+\:(?:.*?)(.{11})')
            match_video_youtube_url = re_video_speed.search(metadata['streams'])
            if match_video_youtube_url is not None:
                video_id = match_video_youtube_url.group(1)
                video_youtube_url = 'https://youtube.com/watch?v=' + video_id

            # notice that the concrete languages now come in
            # metadata['transcriptLanguages'], so we could eventually build the
            # full urls here:
            # subtitles_download_urls = {sub_lang:
            #                            BASE_URL + metadata['transcriptTranslationUrl'].replace('__lang__', sub_lang)
            #                            for sub_lang in metadata['transcriptLanguages'].keys()}
            available_subs_url = BASE_URL + metadata['transcriptAvailableTranslationsUrl']
            sub_template_url = BASE_URL + metadata['transcriptTranslationUrl'].replace('__lang__', '%s')

            mp4_urls = [url for url in metadata['sources'] if url.endswith('.mp4')]
            videos.append(Video(video_youtube_url=video_youtube_url,
                                available_subs_url=available_subs_url,
                                sub_template_url=sub_template_url,
                                mp4_urls=mp4_urls))

        resources_urls = self.extract_resources_urls(text, BASE_URL,
                                                     file_formats)
        return Unit(videos=videos, resources_urls=resources_urls)

    def extract_sections_from_html(self, page, BASE_URL):
        """
        Extract sections (Section->SubSection) from the html page
        """
        def _make_url(section_soup):  # FIXME: Extract from here and test
            try:
                return BASE_URL + section_soup.div.div.a['href']
            except AttributeError:
                # Section might be empty and contain no links
                return None

        def _get_section_name(section_soup):  # FIXME: Extract from here and test
            try:
                return section_soup['aria-label'][:-8]  # -8 cuts the submenu word
            except AttributeError:
                return None

        def _make_subsections(section_soup):
            try:
                subsections_soup = section_soup.find_all('div', attrs={'class': 'menu-item'})
            except AttributeError:
                return []
            # FIXME correct extraction of subsection.name (unicode)
            subsections = [SubSection(position=i,
                                      url=BASE_URL + s.a['href'],
                                      name=s.p.string)
                           for i, s in enumerate(subsections_soup, 1)]
            return subsections

        soup = BeautifulSoup(page)
        sections_soup = soup.find_all('div', attrs={'class': 'chapter-content-container'})

        sections = [Section(position=i,
                            name=_get_section_name(section_soup),
                            url=_make_url(section_soup),
                            subsections=_make_subsections(section_soup))
                    for i, section_soup in enumerate(sections_soup, 1)]
        # Filter out those sections for which name or url could not be parsed
        sections = [section for section in sections
                    if section.name and section.url]

        return sections

class NewEdXPageExtractor(CurrentEdXPageExtractor):
    """
    A new page extractor for the latest changes in layout of edx
    """

    def extract_sections_from_html(self, page, BASE_URL):
        """
        Extract sections (Section->SubSection) from the html page
        """
        def _make_url(section_soup):  # FIXME: Extract from here and test
            try:
                return section_soup.a['href']
            except AttributeError:
                # Section might be empty and contain no links
                return None

        def _get_section_name(section_soup):  # FIXME: Extract from here and test
            try:
                return section_soup.a.h4.string.strip()
            except AttributeError:
                return None

        def _make_subsections(section_soup):
            # try:
            #     subsections_soup = section_soup.find_all('li', class_=['subsection'])
            # except AttributeError:
            #     return []
            subsections_soup = section_soup.find_all('li', class_=['subsection'])
            # FIXME correct extraction of subsection.name (unicode)
            subsections = [SubSection(position=i,
                                      url=s.a['href'],
                                      name=s.a.h4.string.strip())
                           for i, s in enumerate(subsections_soup, 1)]
            return subsections

        soup = BeautifulSoup(page)
        sections_soup = soup.find_all('li', class_=['outline-item section'])

        sections = [Section(position=i,
                            name=_get_section_name(section_soup),
                            url=_make_url(section_soup),
                            subsections=_make_subsections(section_soup))
                    for i, section_soup in enumerate(sections_soup, 1)]
        # Filter out those sections for which name could not be parsed
        sections = [section for section in sections
                    if section.name]

        return sections

def get_page_extractor(url):
    """
    factory method for page extractors
    """
    if (
            url.startswith('https://courses.edx.org') or
            url.startswith('https://mitxpro.mit.edu')
    ):
        return NewEdXPageExtractor()
    elif (
            url.startswith('https://edge.edx.org') or
            url.startswith('https://lagunita.stanford.edu') or
            url.startswith('https://www.fun-mooc.fr')
    ):
        return NewEdXPageExtractor()
    else:
        return ClassicEdXPageExtractor()
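# Illustrative usage (hypothetical URL; not part of the original file),
# sketching how a caller could combine the factory with an extractor:
#
#   extractor = get_page_extractor('https://courses.edx.org/dashboard')
#   sections = extractor.extract_sections_from_html(course_page_html,
#                                                   'https://courses.edx.org')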

def is_youtube_url(url):
    re_youtube_url = re.compile(r'(https?\:\/\/(?:www\.)?(?:youtube\.com|youtu\.?be)\/.*?)')
    return re_youtube_url.match(url)
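# Note: re.match returns a match object (truthy) or None, so callers can treat
# the result as a boolean, e.g. is_youtube_url('https://youtu.be/dQw4w9WgXcQ')
# is truthy while is_youtube_url('https://example.org/video.mp4') is not.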