Quick and dirty link scrape from a Wistia video
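Roughly: paste the embed snippet from the course page into tag_content, point path at an existing folder, make sure requests and beautifulsoup4 are available to the interpreter named in the shebang, then run the script and confirm the download at the prompt.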
#!/Users/username/.conda/envs/py3/bin/python
# Remove the first line if you don't want to use a conda Python env ("py3" in this case)
import requests
from bs4 import BeautifulSoup
import re
import urllib.parse
import urllib.request
import ntpath, sys
import time, datetime
# Paste the embed snippet copied from the Wistia player here (right click on the video, then copy the link)
tag_content = '''
<p><a href="https://website.com/full/link/to/lectures/11891982?wvideo=Aqn3c4exu7"><img src="https://embed-ssl.wistia.com/deliveries/6f4b905a9C6TC60fdac7752077be458b.jpg?image_crop_resized=800x450&image_play_button_size=2x&image_play_button=1&image_play_button_color=ff9a83e0" width="400" height="225" style="width: 400px; height: 225px;"></a></p><p><a href="https://website.com/link/to/lectures/11891982?wvideo=Aqn3c4exu7">Title of the Course | Category</a></p>
'''

# Path where the video will be saved
path = '/Users/username/where/to/save/Videos/'
def get_video_info():
    soup = BeautifulSoup(tag_content, features="html.parser")
    urls = soup.find_all('a', href=True)
    url = urls[-1]
    href = url['href']
    params = urllib.parse.urlparse(href)
    link = urllib.parse.parse_qs(params.query)
    return (link['wvideo'][0], url.get_text())

(video_id, link_text) = get_video_info()
print('VideoID: %s' % video_id)
print('Title: %s' % link_text)
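# For illustration, with the sample snippet above: urlparse() splits the last href and
# parse_qs('wvideo=Aqn3c4exu7') returns {'wvideo': ['Aqn3c4exu7']}, so video_id is
# 'Aqn3c4exu7' and link_text is 'Title of the Course | Category'.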
| url = "http://fast.wistia.net/embed/iframe/"+video_id | |
| name = re.sub(r'[\\/*?:"<>|]','_', link_text.split('|')[0].strip()) # remove unwanted chars | |
| response = requests.get(url) | |
| soup = BeautifulSoup(response.content, features="html.parser") | |
| script = soup.find_all('script')[-1] | |
| # grab the last script | |
| page = str(script.contents[0]) | |
def sizeof_file(num, suffix='B'):
    for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, 'Yi', suffix)
def size_on_disk(filename):
    with open(filename, 'rb') as f:
        return len(f.read())
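# Note: reading the whole file just to measure it is heavy for large videos; a lighter
# alternative (a sketch, same result in bytes) would be:
#     import os
#     os.path.getsize(filename)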
# Report hook for download progress
def reporthook(blocknum, blocksize, totalsize):
    global start_time
    global filesize
    if blocknum == 0:
        start_time = time.time()
        filesize = sizeof_file(totalsize)
        return
    duration = max(time.time() - start_time, 1e-6)  # avoid division by zero right after the first block
    elapsed_time = str(datetime.timedelta(seconds=int(duration)))
    readsofar = int(blocknum * blocksize)
    speed = max(int(readsofar / (1024 * duration)), 1)  # KB/s, at least 1 so the ETA stays finite
    estimated_seconds = int(totalsize / (speed * 1024))
    estimated_duration = str(datetime.timedelta(seconds=estimated_seconds))
    if totalsize > 0:
        percent = readsofar * 1e2 / totalsize
        s = "\r%5.1f%% %s / %s, %d KB/s, %s estimated, %s elapsed " % (
            percent, sizeof_file(readsofar), filesize, speed, estimated_duration, elapsed_time)
        sys.stderr.write(s)
        if readsofar >= totalsize:  # near the end
            sys.stderr.write("\n")
    else:  # total size is unknown
        sys.stderr.write("read %d\n" % (readsofar,))
if 'embed.wistia.com/deliveries' in page:
    urls = re.findall(r'((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w_]*)?\??(?:[-\+=&;%@.\w_]*)#?(?:[\w]*))?)', page)
    if len(urls) > 0:
        print('Found: %d links.' % len(urls))
        fileurl = str(urls[0][0])
        print(fileurl.replace('.bin', '.mp4'))
        network_obj = urllib.request.urlopen(fileurl)
        filesize = sizeof_file(int(network_obj.info()['Content-Length']))
        print('File size: %s' % filesize)
        sys.stdout.write('Do you want to proceed with the download (y/n)? ')
        if input().strip().lower() in ['y', 'yes']:
            # Download the file
            # filepath = path + ntpath.basename(fileurl.replace('.bin', '.mp4'))
            filepath = path + name + '.mp4'
            urllib.request.urlretrieve(fileurl, filepath, reporthook)
            print('Saved to: %s' % filepath)
            print('File size on disk: %s' % sizeof_file(size_on_disk(filepath)))
        else:
            print('Download aborted.')
    else:
        print('Could not find any URL.')
else:
    print("Page doesn't contain any URL:")
    print(page)