Pragith · September 28, 2019 00:16
diff --git a/lynda_to_csv.py b/lynda_to_csv.py
 from bs4 import BeautifulSoup as bs
 import argparse, requests, pandas as pd

 # Command-line interface: the Lynda course URL to scrape and the base name
 # used for the CSV reports. Both options are required so that a missing one
 # is reported by argparse immediately, instead of surfacing later as a None
 # passed to requests.get() or concatenated with '.csv'.
 parser = argparse.ArgumentParser()
 parser.add_argument("-url","--url", help="Enter Lynda course URL", type=str, required=True)
 parser.add_argument("-o","--output_file", help="Enter the output filename", type=str, required=True)
 # vars() turns the Namespace into a plain dict keyed by 'url' and 'output_file'.
 args = vars(parser.parse_args())

 def duration_to_seconds(T):
     """Convert a duration label such as ``"3m 20s"`` into whole seconds.

     Every ``<number><unit>`` pair found in *T* is added up, where the unit
     is ``h`` (hours), ``m`` (minutes) or ``s`` (seconds). Matching is
     case-insensitive and the whitespace between pairs is optional, so
     ``"5m"``, ``"45s"``, ``"3m20s"`` and ``"1h 2m 3s"`` are all accepted.

     Args:
         T: Duration text.

     Returns:
         Total number of seconds as an int; 0 when no pair is found.
     """
     # Imported locally so the function does not depend on a file-level import.
     import re

     unit_seconds = {'h': 3600, 'm': 60, 's': 1}
     return sum(
         int(amount) * unit_seconds[unit]
         for amount, unit in re.findall(r'(\d+)\s*([hms])', T.lower())
     )


 # Download the course page and parse its HTML.
 page = requests.get(args['url'])
 soup = bs(page.text, 'html.parser')
 course_title = soup.title.text.strip()
 toc = soup.find('ul', {'class':'course-toc'})
 if toc is None:
     # find() returns None when nothing matches; stop with a readable message
     # rather than an AttributeError on the find_all() call below.
     raise SystemExit('Could not find the course table of contents (ul.course-toc) on the page')
 chapters = toc.find_all('li', {'role':'presentation'})

 # One dict per video. The frame is built once after the loop because
 # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
 # appending row by row copies the whole frame on every iteration.
 video_rows = []

 for chapter in chapters:
     chapter_title = chapter.find('h4')
     # Entries without an <h4> heading are skipped.
     if not chapter_title:
         continue
     chapter_title = chapter_title.text.strip()
     # Chapters titled exactly 'Introduction' or 'Conclusion' are left out.
     if chapter_title in ['Introduction','Conclusion']:
         continue
     print('\nChapter:',chapter_title)
     for video in chapter.find_all('li', {'class':'toc-video-item'}):
         video_title = video.find('a', {'class':'video-name'}).text.strip()
         video_duration = video.find('span', {'class':'video-duration'}).text.strip()
         seconds = duration_to_seconds(video_duration)
         minutes = round(seconds/60,1)
         print('> Video:',video_title)
         video_rows.append({
             'course':course_title,
             'chapter':chapter_title,
             'video':video_title,
             'duration':video_duration,
             'seconds':seconds,
             'minutes':minutes,
         })

 # Passing the column list keeps the columns present (and ordered) even when
 # no videos were collected.
 course_df = pd.DataFrame(
     video_rows,
     columns=['course','chapter','video','duration','seconds','minutes'],
 )

 # ---- Per-video report ---------------------------------------------------
 # Course length in seconds; the denominator for every percentage below.
 total_duration = sum(course_df['seconds'])


 def _percent_of_course(s):
     """Return s as a percentage of total_duration, rounded to 2 decimals."""
     return round(s*100/total_duration, 2)


 course_df['duration_percentage'] = course_df['seconds'].apply(_percent_of_course)
 # Number the rows from 1 and make that 'id' column the first one.
 course_df.insert(0, 'id', range(1,len(course_df)+1))
 course_df.to_csv(args['output_file'] + '.csv', index=False)

 # ---- Per-chapter report -------------------------------------------------
 # Sum seconds and minutes for each (course, chapter) pair.
 chapter_totals = course_df.groupby(['course','chapter']).agg({'seconds':'sum','minutes':'sum'})
 course_high_level_df = chapter_totals.reset_index()
 course_high_level_df['duration_percentage'] = course_high_level_df['seconds'].apply(_percent_of_course)
 course_high_level_df.insert(0, 'id', range(1,len(course_high_level_df)+1))
 course_high_level_df.to_csv(args['output_file'] + '_highlevel.csv', index=False)
	from bs4 import BeautifulSoup as bs
	import argparse, requests, pandas as pd

	# Command-line interface: the Lynda course URL to scrape and the base name
	# used for the CSV reports. Both options are required so that a missing one
	# is reported by argparse immediately, instead of surfacing later as a None
	# passed to requests.get() or concatenated with '.csv'.
	parser = argparse.ArgumentParser()
	parser.add_argument("-url","--url", help="Enter Lynda course URL", type=str, required=True)
	parser.add_argument("-o","--output_file", help="Enter the output filename", type=str, required=True)
	# vars() turns the Namespace into a plain dict keyed by 'url' and 'output_file'.
	args = vars(parser.parse_args())

	def duration_to_seconds(T):
		"""Convert a duration label such as ``"3m 20s"`` into whole seconds.

		Every ``<number><unit>`` pair found in *T* is added up, where the unit
		is ``h`` (hours), ``m`` (minutes) or ``s`` (seconds). Matching is
		case-insensitive and the whitespace between pairs is optional, so
		``"5m"``, ``"45s"``, ``"3m20s"`` and ``"1h 2m 3s"`` are all accepted.

		Args:
			T: Duration text.

		Returns:
			Total number of seconds as an int; 0 when no pair is found.
		"""
		# Imported locally so the function does not depend on a file-level import.
		import re

		unit_seconds = {'h': 3600, 'm': 60, 's': 1}
		return sum(
			int(amount) * unit_seconds[unit]
			for amount, unit in re.findall(r'(\d+)\s*([hms])', T.lower())
		)


	# Download the course page and parse its HTML.
	page = requests.get(args['url'])
	soup = bs(page.text, 'html.parser')
	course_title = soup.title.text.strip()
	toc = soup.find('ul', {'class':'course-toc'})
	if toc is None:
		# find() returns None when nothing matches; stop with a readable message
		# rather than an AttributeError on the find_all() call below.
		raise SystemExit('Could not find the course table of contents (ul.course-toc) on the page')
	chapters = toc.find_all('li', {'role':'presentation'})

	# One dict per video. The frame is built once after the loop because
	# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
	# appending row by row copies the whole frame on every iteration.
	video_rows = []

	for chapter in chapters:
		chapter_title = chapter.find('h4')
		# Entries without an <h4> heading are skipped.
		if not chapter_title:
			continue
		chapter_title = chapter_title.text.strip()
		# Chapters titled exactly 'Introduction' or 'Conclusion' are left out.
		if chapter_title in ['Introduction','Conclusion']:
			continue
		print('\nChapter:',chapter_title)
		for video in chapter.find_all('li', {'class':'toc-video-item'}):
			video_title = video.find('a', {'class':'video-name'}).text.strip()
			video_duration = video.find('span', {'class':'video-duration'}).text.strip()
			seconds = duration_to_seconds(video_duration)
			minutes = round(seconds/60,1)
			print('> Video:',video_title)
			video_rows.append({
				'course':course_title,
				'chapter':chapter_title,
				'video':video_title,
				'duration':video_duration,
				'seconds':seconds,
				'minutes':minutes,
			})

	# Passing the column list keeps the columns present (and ordered) even when
	# no videos were collected.
	course_df = pd.DataFrame(
		video_rows,
		columns=['course','chapter','video','duration','seconds','minutes'],
	)

	# ---- Per-video report ---------------------------------------------------
	# Course length in seconds; the denominator for every percentage below.
	total_duration = sum(course_df['seconds'])


	def _percent_of_course(s):
		"""Return s as a percentage of total_duration, rounded to 2 decimals."""
		return round(s*100/total_duration, 2)


	course_df['duration_percentage'] = course_df['seconds'].apply(_percent_of_course)
	# Number the rows from 1 and make that 'id' column the first one.
	course_df.insert(0, 'id', range(1,len(course_df)+1))
	course_df.to_csv(args['output_file'] + '.csv', index=False)

	# ---- Per-chapter report -------------------------------------------------
	# Sum seconds and minutes for each (course, chapter) pair.
	chapter_totals = course_df.groupby(['course','chapter']).agg({'seconds':'sum','minutes':'sum'})
	course_high_level_df = chapter_totals.reset_index()
	course_high_level_df['duration_percentage'] = course_high_level_df['seconds'].apply(_percent_of_course)
	course_high_level_df.insert(0, 'id', range(1,len(course_high_level_df)+1))
	course_high_level_df.to_csv(args['output_file'] + '_highlevel.csv', index=False)