Skip to content

Instantly share code, notes, and snippets.

@Pragith
Last active September 28, 2019 00:16
Show Gist options
  • Save Pragith/d51300845f090bcfcb06d225b08cc6f0 to your computer and use it in GitHub Desktop.
Save Pragith/d51300845f090bcfcb06d225b08cc6f0 to your computer and use it in GitHub Desktop.
Generates a CSV listing the videos in a Lynda course
import argparse
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
# Read the course URL and the output filename from the command line.
# Both options are mandatory for the script to do anything useful, so they are
# marked required: argparse then stops with a usage message instead of handing
# back None, which would only fail later inside requests.get() or when the
# '.csv' suffix is concatenated onto the output filename.
parser = argparse.ArgumentParser()
parser.add_argument(
    "-url", "--url",
    help="Enter Lynda course URL",
    type=str,
    required=True,
)
parser.add_argument(
    "-o", "--output_file",
    help="Enter the output filename",
    type=str,
    required=True,
)
# Plain dict keyed by option name ('url', 'output_file').
args = vars(parser.parse_args())
def duration_to_seconds(T):
    """Convert a duration label such as ``'3m 12s'`` into whole seconds.

    Args:
        T: Duration text made of integer components suffixed with ``h``,
            ``m`` or ``s`` (for example ``'1h 2m 3s'``, ``'5m'``, ``'45s'``).
            Components may be separated by any amount of whitespace, or none.

    Returns:
        The total number of seconds as an ``int``. Text that contains no
        recognizable component (including the empty string) yields ``0``.
    """
    unit_seconds = {'h': 3600, 'm': 60, 's': 1}
    # Scan for every "<digits><unit>" pair rather than splitting on a single
    # space: this tolerates extra or missing whitespace, supports an hours
    # component, and avoids re-binding T to a list halfway through parsing.
    return sum(
        int(amount) * unit_seconds[unit]
        for amount, unit in re.findall(r'(\d+)\s*([hms])', T)
    )
# Scrape the course page given on the command line and write two CSV reports:
#   <output_file>.csv            one row per video
#   <output_file>_highlevel.csv  one row per chapter (summed durations)
page = requests.get(args['url'], timeout=30)
# Fail on HTTP errors instead of trying to parse an error page.
page.raise_for_status()
soup = bs(page.text, 'html.parser')
course_title = soup.title.text.strip()

toc = soup.find('ul', {'class': 'course-toc'})
if toc is None:
    raise SystemExit('No course table of contents found at ' + args['url'])
chapters = toc.find_all('li', {'role': 'presentation'})

# Collect plain dict rows and build the DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call.
video_rows = []
for chapter in chapters:
    heading = chapter.find('h4')
    if heading is None:
        # List items without an <h4> heading are not chapters.
        continue
    chapter_title = heading.text.strip()
    # The 'Introduction' and 'Conclusion' chapters are left out of the report.
    if chapter_title in ('Introduction', 'Conclusion'):
        continue
    print('\nChapter:', chapter_title)
    for video in chapter.find_all('li', {'class': 'toc-video-item'}):
        video_title = video.find('a', {'class': 'video-name'}).text.strip()
        video_duration = video.find('span', {'class': 'video-duration'}).text.strip()
        seconds = duration_to_seconds(video_duration)
        print('> Video:', video_title)
        video_rows.append({
            'course': course_title,
            'chapter': chapter_title,
            'video': video_title,
            'duration': video_duration,
            'seconds': seconds,
            'minutes': round(seconds / 60, 1),
        })

if not video_rows:
    raise SystemExit('No videos found at ' + args['url'])

course_df = pd.DataFrame(
    video_rows,
    columns=['course', 'chapter', 'video', 'duration', 'seconds', 'minutes'],
)
total_duration = int(course_df['seconds'].sum())


def _percent_of_total(seconds):
    """Return *seconds* as a percentage of the course total, rounded to 2 places."""
    if not total_duration:
        # Every parsed duration was zero; avoid dividing by zero.
        return 0.0
    return round(seconds * 100 / total_duration, 2)


# Per-video report: 1-based id first, then the scraped columns and the share
# of the total course duration.
course_df['duration_percentage'] = course_df['seconds'].apply(_percent_of_total)
course_df.insert(0, 'id', range(1, len(course_df) + 1))
course_df.to_csv(args['output_file'] + '.csv', index=False)

# Per-chapter report. sort=False keeps chapters in course order; the default
# lexical sort would place a chapter titled "10. ..." before "2. ...".
course_high_level_df = (
    course_df
    .groupby(['course', 'chapter'], sort=False)
    .agg({'seconds': 'sum', 'minutes': 'sum'})
    .reset_index()
)
course_high_level_df['duration_percentage'] = (
    course_high_level_df['seconds'].apply(_percent_of_total)
)
course_high_level_df.insert(0, 'id', range(1, len(course_high_level_df) + 1))
course_high_level_df.to_csv(args['output_file'] + '_highlevel.csv', index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment