This has been released as a Python package: https://pypi.org/project/vyoma-download/
pip3 install -U vyoma-download
| #!/usr/bin/env python3 | |
| # -*- coding: utf-8 -*- | |
| """ | |
| Created on Mon Sep 11 17:25:53 2019 | |
| Updated on Sun Aug 30 17:23:43 2020 | |
| @author: Hrishikesh Terdalkar | |
| Requirements: | |
| pip install requests | |
| pip install beautifulsoup4 | |
| pip install requests-downloader | |
| """ | |
| import re | |
| import os | |
| import sys | |
| import stat | |
| import getpass | |
| import argparse | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from requests_downloader.downloader import download as download_file | |
| ############################################################################### | |
# Host name of the Vyoma "Sanskrit from Home" website; the Vyoma class builds
# its home and login URLs from this value.
SERVER = 'www.sanskritfromhome.in'
| ############################################################################### | |
def extract_course_id(course_url):
    '''
    Extract the course ID from a course URL or from a bare course ID.

    Parameters
    ----------
    course_url : str
        Either a course URL of the form
        ``[http(s)://][www.]sanskritfromhome.in/course/COURSE_ID[...]``
        or the course ID itself (any string that contains no ``/``).

    Returns
    -------
    course_id : str or None
        The course ID with any ``#fragment`` or ``?query`` part removed.
        None if the input is not a string, is not a recognised course URL
        or yields an empty ID; in that case "Invalid course URL." is printed.
    '''
    if not isinstance(course_url, str):
        print("Invalid course URL.")
        return None

    course_url = course_url.strip()
    if '/' not in course_url:
        # no slash at all: treat the input as a bare course ID
        course_id = course_url
    else:
        # raw string with escaped dots so that '.' only matches a literal dot
        sfh_pattern = (
            r'(?:https?://)?(?:www\.)?sanskritfromhome\.in/course/([^/]*)'
        )
        sfh_match = re.match(sfh_pattern, course_url)
        if sfh_match is None:
            print("Invalid course URL.")
            return None
        course_id = sfh_match.group(1)

    # drop URL fragment and query string, neither is part of the ID
    course_id = course_id.split('#')[0].split('?')[0]
    if not course_id:
        print("Invalid course URL.")
        return None
    return course_id
| ############################################################################### | |
class Vyoma():
    '''
    Downloader for course content hosted on sanskritfromhome.in.

    Keeps a single `requests` session (so the sign-in cookies are reused),
    caches the links and descriptions of the courses it has looked at and
    stores downloaded content under `download_dir`.
    '''
    home_url = f'https://{SERVER}'
    login_url = f'https://{SERVER}/wp-admin/admin-ajax.php'

    def __init__(self, username, password, download_dir=None):
        '''
        Vyoma downloader class

        Parameters
        ----------
        username : str
            Username.
        password : str
            Password.
        download_dir : str, optional
            Location in which the course content will be downloaded.
            The default is None, which means ``~/vyoma/<username>``.
            The directory is created if it does not exist.
        '''
        self.session = requests.Session()
        self.username = username
        self.password = password
        self.logged_in = False
        # per-course caches, keyed by course ID
        self.links = {}
        self.descriptions = {}

        # download directory
        if not download_dir:
            home_dir = os.path.expanduser('~')
            download_dir = os.path.join(home_dir, 'vyoma', username)
        self.download_dir = download_dir
        os.makedirs(self.download_dir, exist_ok=True)

    @staticmethod
    def _is_signed_in(html):
        '''Tell from a page's HTML whether the session is signed in'''
        return 'Sign Out' in html and 'Sign In' not in html

    @staticmethod
    def _hrefs(anchors):
        '''Collect the non-empty `href` values of the given anchor tags'''
        return [a.get('href') for a in anchors if a.get('href')]

    @staticmethod
    def _write_lines(path, lines):
        '''Write the lines to a UTF-8 text file, separated by newlines'''
        with open(path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(lines))

    def get_course_url(self, course_id):
        '''Build course URL from course ID'''
        return f'{self.home_url}/course/{course_id}'

    def login(self):
        '''
        Login to sanskritfromhome.in

        Returns
        -------
        logged_in : bool
            True if the session is signed in after the call.
            False if the sign-in failed or if the home page did not contain
            the `nonce` form field required for the sign-in request.
        '''
        r = self.get(self.home_url)

        # check if already logged in
        if self._is_signed_in(r.text):
            self.logged_in = True
            return self.logged_in

        # the sign-in request must carry the nonce embedded in the home page
        soup = BeautifulSoup(r.text, 'html.parser')
        nonce_input = soup.find('input', attrs={'name': 'nonce'})
        nonce = nonce_input.get('value') if nonce_input is not None else None
        if nonce is None:
            self.logged_in = False
            return self.logged_in

        data = {
            'user_login': self.username,
            'user_password': self.password,
            'user_action': 'login_user',
            'action': 'themex_update_user',
            'nonce': nonce
        }
        self.post(self.login_url, data=data)

        # check if it succeeded
        r = self.get(self.home_url)
        self.logged_in = self._is_signed_in(r.text)
        return self.logged_in

    def fetch_course_page(self, course_id):
        '''Get contents of the course page

        Parameters
        ----------
        course_id : str
            Course ID.

        Raises
        ------
        RuntimeError
            If called without a valid login.

        Returns
        -------
        html
            HTML content of the course page.
        '''
        self.login()
        if not self.logged_in:
            raise RuntimeError("Requires a sign-in.")

        r = self.get(self.get_course_url(course_id))
        return r.text

    def fetch_course_links(self, course_id, html=''):
        '''
        Fetch all links of downloadable content for a course.

        The result is also cached in `self.links[course_id]`.

        Parameters
        ----------
        course_id : str
            Course ID.
        html : str, optional
            A pre-downloaded HTML of the course page.
            If empty, the course page is fetched using the course ID.
            The default is ''.

        Returns
        -------
        links : dict
            Anchor tags of audio, video and documents from the course,
            under the keys 'audio', 'video' and 'document'.
        '''
        if not html:
            html = self.fetch_course_page(course_id)

        soup = BeautifulSoup(html, 'html.parser')
        links = {}
        links['audio'] = soup.find_all('a', attrs={'class': 'audio'})
        links['video'] = soup.find_all('a', attrs={'class': 'video'})
        links['document'] = soup.find_all('a', attrs={'class': 'document'})

        self.links[course_id] = links
        return self.links[course_id]

    def fetch_course_description(self, course_id, html=''):
        '''
        Fetch course description

        The result is also cached in `self.descriptions[course_id]`.

        Parameters
        ----------
        course_id : str
            Course ID.
        html : str, optional
            A pre-downloaded HTML of the course page.
            If empty, the course page is fetched using the course ID.
            The default is ''.

        Returns
        -------
        description : str
            HTML description of the course.
            Empty if the page has no 'course-description' element.
        '''
        if not html:
            html = self.fetch_course_page(course_id)

        soup = BeautifulSoup(html, 'html.parser')
        description_div = soup.find('div', class_='course-description')
        # avoid storing the literal string 'None' when the div is missing
        description = '' if description_div is None else str(description_div)

        self.descriptions[course_id] = description
        return self.descriptions[course_id]

    def download_course_content(self, course_id,
                                document=True, audio=True, video=True):
        '''
        Download course content (audios, documents and video-links)

        Everything is stored in `<download_dir>/<course_id>`: the course
        description, one `*_links.txt` file per requested kind of content,
        the downloaded documents and audios in their own sub-directories
        and, if any download failed, `skipped_links.txt`.

        Parameters
        ----------
        course_id : str
            Course ID.
        document : bool, optional
            If True, download document links.
            The default is True.
        audio : bool, optional
            If True, download audio links.
            The default is True.
        video : bool, optional
            If True, download video links.
            The default is True.

        Raises
        ------
        RuntimeError
            If the sign-in fails.

        Returns
        -------
        status : bool
            True if the download function completed successfully.
            Does not mean that all the files were downloaded successfully.
        '''
        html = self.fetch_course_page(course_id)

        course_dir = os.path.join(self.download_dir, course_id)
        os.makedirs(course_dir, exist_ok=True)

        if course_id not in self.descriptions:
            self.fetch_course_description(course_id, html=html)

        # explicit UTF-8, the page content is not limited to ASCII
        description_file = os.path.join(course_dir, 'description.html')
        with open(description_file, 'w', encoding='utf-8') as f:
            f.write(self.descriptions[course_id])
        print("Saved course description.")

        if course_id not in self.links:
            self.fetch_course_links(course_id, html=html)
        links = self.links[course_id]

        # videos are not downloaded, only their links are saved
        if video:
            video_file = os.path.join(course_dir, 'video_links.txt')
            self._write_lines(video_file, self._hrefs(links['video']))

        download = []
        if document:
            download.append('document')
        if audio:
            download.append('audio')

        all_skipped_links = []
        for dl in download:
            dl_links = self._hrefs(links[dl])

            # save links
            dl_links_file = os.path.join(course_dir, f'{dl}_links.txt')
            self._write_lines(dl_links_file, dl_links)

            dl_dir = os.path.join(course_dir, dl)
            os.makedirs(dl_dir, exist_ok=True)

            skipped_links = []
            print(f"Total {dl.title()}s:", len(dl_links))
            for href in dl_links:
                try:
                    download_file(href, download_dir=dl_dir,
                                  session=self.session, verbose=True)
                except Exception as e:
                    # best-effort: one failed file must not abort the rest
                    print(e)
                    skipped_links.append(href)
                    print(f"Skipping {href}")

            downloaded = len(dl_links) - len(skipped_links)
            skipped = len(skipped_links)
            print(f"Successfully downloaded {downloaded} {dl} files.")
            if skipped:
                print(f"Could not download {skipped} {dl} files.")
                print(f"Skipped {dl.title()} URLs: {skipped_links}")
                all_skipped_links.extend(skipped_links)

        if all_skipped_links:
            skipped_file = os.path.join(course_dir, 'skipped_links.txt')
            self._write_lines(skipped_file, all_skipped_links)

        return True

    def download_course_audios(self, course_id):
        '''Wrapper to download only audio links'''
        return self.download_course_content(course_id, audio=True,
                                            document=False, video=False)

    def download_course_video_links(self, course_id):
        '''Wrapper to download only video links'''
        return self.download_course_content(course_id, video=True,
                                            document=False, audio=False)

    def download_course_documents(self, course_id):
        '''Wrapper to download only document links'''
        return self.download_course_content(course_id, document=True,
                                            audio=False, video=False)

    def set_user(self, username):
        '''Set the username used by subsequent sign-in attempts'''
        self.username = username

    def set_pass(self, password):
        '''Set the password used by subsequent sign-in attempts'''
        self.password = password

    def get(self, *args, **kwargs):
        '''Send a GET request through the shared session'''
        return self.session.get(*args, **kwargs)

    def post(self, *args, **kwargs):
        '''Send a POST request through the shared session'''
        return self.session.post(*args, **kwargs)

    def __repr__(self):
        return f"Vyoma(username={self.username}, logged_in={self.logged_in})"
| ############################################################################### | |
def main():
    '''
    Command-line entry point: sign in and download the content of a course.

    Credentials are taken, in this order of preference, from the config file
    `~/.vyoma.cnf`, the environment variables `VYOMA_USER` / `VYOMA_PASS`
    and the command-line options; whatever is still missing is asked for
    interactively. After a successful sign-in with interactively entered
    credentials, the user is offered to save them in the config file.

    Returns
    -------
    status : int
        0 on success, 1 if the sign-in failed or the course URL is invalid.
    '''
    desc = "Download course contents from 'sanskritfromhome.in'."
    p = argparse.ArgumentParser(description=desc)
    p.add_argument("course-url", help="URL of the relevant course")
    p.add_argument("-v", "--video", action='store_true',
                   help="Download video links only")
    p.add_argument("-a", "--audio", action='store_true',
                   help="Download audios only")
    p.add_argument("-d", "--document", action='store_true',
                   help="Download documents only")
    p.add_argument("-o", "--output", default=None,
                   help="Path of the directory to download content to.")
    p.add_argument("-u", "--username", default=None)
    p.add_argument("-p", "--password", default=None)
    args = vars(p.parse_args())

    # download options
    course_url = args['course-url']
    audio = args['audio']
    video = args['video']
    document = args['document']
    output = args['output']

    # credentials
    config = {}
    home_dir = os.path.expanduser('~')
    config_file = os.path.join(home_dir, '.vyoma.cnf')
    if os.path.isfile(config_file):
        with open(config_file) as f:
            for line in f:
                # split on the first '=' only, the value may contain '='
                key, separator, value = line.partition('=')
                if separator and key.strip():
                    config[key.strip()] = value.strip()

    # NOTE(review): the config file takes precedence over the command-line
    # options; this is the existing behaviour and is kept as is.
    username = (
        config.get('username') or
        os.environ.get('VYOMA_USER') or
        args['username']
    )
    password = (
        config.get('password') or
        os.environ.get('VYOMA_PASS') or
        args['password']
    )

    manual = not (username and password)
    if not username:
        username = input('Username: ')
    if not password:
        password = getpass.getpass('Password: ')
    username = username.strip()
    password = password.strip()

    # action
    vyoma = Vyoma(username=username, password=password, download_dir=output)
    if not vyoma.login():
        print("Error: could not sign in.")
        return 1

    if manual:
        answer = input("Save credentials for future use? (Y/n)").strip()
        if not answer or answer.lower().startswith('y'):
            owner_only = stat.S_IREAD | stat.S_IWRITE
            # create the file readable by the owner only from the start,
            # the password must never be exposed with default permissions
            fd = os.open(config_file,
                         os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
                         owner_only)
            with os.fdopen(fd, 'w') as f:
                f.write(f"username = {username}\n"
                        f"password = {password}")
            # a pre-existing file keeps its old mode, so tighten it as well
            os.chmod(config_file, owner_only)
            print("Credentials saved!")

    course_id = extract_course_id(course_url)
    if not course_id:
        # extract_course_id() has already reported the invalid URL
        return 1

    # no selection means everything
    if not any([audio, video, document]):
        audio = video = document = True

    vyoma.download_course_content(course_id,
                                  document=document,
                                  audio=audio,
                                  video=video)
    return 0
| ############################################################################### | |
# Run the command-line interface only when executed as a script and use the
# return value of main() as the exit status of the process.
if __name__ == '__main__':
    sys.exit(main())
| ############################################################################### |
Released as a pip-installable package.
Added support for resuming partial downloads and ignoring the files that have already been downloaded.
Now downloads the course description too.
Can now accept various forms of URLs (with or without http, www, etc.), or simply the course ID (the part after
/course/ in the URL).