Skip to content

Instantly share code, notes, and snippets.

@Phoenix-Effect
Last active February 11, 2022 22:00
Show Gist options
  • Save Phoenix-Effect/fbc8e54af71424d7ce4e86d54182f8b1 to your computer and use it in GitHub Desktop.
Save Phoenix-Effect/fbc8e54af71424d7ce4e86d54182f8b1 to your computer and use it in GitHub Desktop.
Scrape video data from YouTube and push it to Airtable. By default the list of videos is read from videos.txt, but a different file can be specified with an argument.
import glob
import os
import re
import subprocess
import sys
import unicodedata
from pathlib import Path

import pafy
import requests
from airtable import Airtable
from airtable.auth import AirtableAuth
from webvtt import WebVTT
# Airtable connection settings. The values below are placeholders and must be
# replaced with real ones before running the script. They are passed to
# Airtable(baseurl, tableid, apikey) inside push_to_airtable.
# API key used to authenticate against Airtable.
apikey = "airtable api key"
# First argument given to Airtable(...).
# NOTE(review): presumably the base ID/key rather than a full URL -- confirm
# against the airtable wrapper's constructor.
baseurl = "base url of airtable"
# Second argument given to Airtable(...); identifies the target table.
tableid = "table id"
# Takes in a string and tells you if it's a number or not
def is_number(s):
    """Return True if *s* can be interpreted as a number, False otherwise.

    Two checks are tried in order:

    1. ``float(s)`` -- accepts ints, floats and numeric strings (including
       forms such as ``"1e3"``, ``"inf"`` and ``"nan"``).
    2. ``unicodedata.numeric(s)`` -- accepts a single Unicode character that
       carries a numeric value, such as the vulgar fraction one half.

    Values of an unsupported type (``None``, a list, ...) yield False instead
    of raising ``TypeError``.
    """
    try:
        float(s)
    except (TypeError, ValueError):
        # Not float-parsable; fall through to the Unicode check.
        pass
    else:
        return True

    try:
        unicodedata.numeric(s)
    except (TypeError, ValueError):
        # TypeError: not a one-character string; ValueError: no numeric value.
        return False
    return True
# Strips markup tags out of a line of text
def cleanhtml(raw_html):
    """Return *raw_html* with every ``<...>`` span removed.

    The pattern is non-greedy, so each tag is removed on its own and the text
    between tags is left in place.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', raw_html)
# Recognised YouTube URL shapes; group 6 captures the 11-character video ID.
# Every fragment is a raw string so that "\." and "\?" reach the regex engine
# unchanged instead of being treated as (invalid) string escapes.
_YOUTUBE_URL_RE = re.compile(
    r'(https?://)?(www\.)?'
    r'(youtube|youtu|youtube-nocookie)\.(com|be)/'
    r'(watch\?v=|embed/|v/|.+\?v=)?([^&=%\?]{11})'
)


# Takes in a youtube url and returns video ID
def url_to_video_id(url):
    """Extract the 11-character video ID from a YouTube URL.

    The URL is matched from its start; the scheme and ``www.`` prefix are
    optional, and the ``watch?v=``, ``embed/``, ``v/`` and short ``youtu.be/``
    forms are accepted.

    Args:
        url: The URL string to inspect.

    Returns:
        The video ID string, or ``None`` when *url* does not look like a
        YouTube video URL.
    """
    match = _YOUTUBE_URL_RE.match(url)
    if match is None:
        return None
    return match.group(6)
# Optionally takes in a file name and returns all lines in an array
def videos_array(filename="videos.txt"):
    """Read *filename* and return its lines as a list of strings.

    Only the trailing newline of each line is removed; blank lines are kept
    as empty strings.

    Args:
        filename: Path of the text file to read. Defaults to ``videos.txt``.

    Returns:
        A list with one entry per line of the file, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the handle even if reading fails part-way.
    with open(filename) as handle:
        return [line.rstrip('\n') for line in handle]
# Takes video url, downloads subtitles and returns its filename
def download_subs(video_url, lang="en"):
    """Download the auto-generated subtitles of a video with ``youtube-dl``.

    ``youtube-dl`` is run in the current working directory with the video
    download itself skipped. Afterwards the directory is searched for a file
    whose name contains the video ID followed by an extension.

    Args:
        video_url: URL of the YouTube video.
        lang: Subtitle language code passed to ``--sub-lang``.

    Returns:
        The name of the first matching file, or ``None`` when the URL has no
        recognisable video ID or no matching file exists.
    """
    cmd = [
        "youtube-dl",
        "--skip-download",
        "--write-auto-sub",
        "--sub-lang",
        lang,
        video_url,
    ]
    # Run the argument list directly rather than through a shell: URLs
    # routinely contain "&" and "?", which a shell would interpret.
    try:
        subprocess.run(cmd, check=False)
    except OSError:
        # youtube-dl is missing or not executable. Stay best-effort: the
        # lookup below then simply finds no subtitle file.
        pass

    video_id = url_to_video_id(video_url)
    if video_id is None:
        return None

    # glob.escape keeps characters such as "*" or "[" in the ID literal.
    matches = glob.glob('*' + glob.escape(video_id) + '.*')
    return matches[0] if matches else None
def filetotranscript(filename):
    """Turn a WebVTT subtitle file into a single transcript string.

    When *filename* is not an existing file the literal ``"No captions"`` is
    returned. Otherwise the file is parsed, the text of every third caption
    (indices 0, 3, 6, ...) is right-stripped and followed by one space, the
    pieces are concatenated, and the subtitle file is deleted.
    """
    if not os.path.isfile(filename):
        return "No captions"

    captions = WebVTT().read(filename)
    # NOTE(review): only every third cue is kept -- presumably because
    # auto-generated subtitles repeat text across neighbouring cues; confirm.
    pieces = [
        caption.text.rstrip() + " "
        for index, caption in enumerate(captions)
        if index % 3 == 0
    ]
    # The subtitle file is only a temporary artefact of the download step.
    os.remove(filename)
    return "".join(pieces)
def youtube_info_object(url):
    """Collect metadata and a transcript for the video at *url*.

    Returns the attribute dictionary of the pafy object for the video, with
    the extra keys ``transcript``, ``description`` and ``duration`` added.
    """
    video = pafy.new(url)
    # vars() hands back the object's own attribute dict, not a copy, so the
    # keys written below also become attributes of `video`.
    record = vars(video)

    subs_file = download_subs(url)
    # A name that is not an existing file makes filetotranscript return its
    # "No captions" placeholder.
    record['transcript'] = filetotranscript(" " if subs_file is None else subs_file)

    # NOTE(review): reading these properties presumably also makes pafy fill
    # in lazily fetched private fields (e.g. `_description`) that
    # push_to_airtable reads later -- keep the order and confirm against pafy.
    record['description'] = video.description
    record['duration'] = video.duration
    return record
def push_to_airtable(line):
    """Insert one video record into the configured Airtable table.

    Args:
        line: Mapping describing a video, as produced by
            ``youtube_info_object``. The keys read are ``watchv_url``,
            ``videoid``, ``_title``, ``transcript``, ``_keywords``,
            ``_username``, ``_description``, ``_published`` and ``_length``.

    A title of the form ``"<title>|<speaker>"`` is split on ``|`` into the
    title and the creator; without a ``|`` the whole string is the title and
    the creator defaults to "Stephen Stearns".

    Raises:
        KeyError: If *line* lacks one of the keys listed above.
    """
    airtable = Airtable(baseurl, tableid, apikey)

    url = str(line['watchv_url'])
    video_id = str(line['videoid'])
    custom = str(line['_title'])
    transcript = str(line['transcript'])
    thumbnail = 'https://img.youtube.com/vi/' + video_id + '/hqdefault.jpg'
    keywords = ", ".join(line['_keywords'])
    channel_name = str(line['_username'])
    description = str(line['_description'])
    # Keep only the text before the first "-" of the publish date (the year
    # for a YYYY-MM-DD style value).
    published = str(line['_published']).split('-')[0]
    duration = line['_length']

    # `hosting` used to be referenced without ever being defined, which
    # raised NameError on every call. This script only handles YouTube URLs.
    # NOTE(review): confirm the exact value expected by the "Hosting site"
    # column.
    hosting = "YouTube"

    title_parts = custom.split('|')
    if len(title_parts) > 1:
        title, speaker = title_parts[0], title_parts[1]
    else:
        title = custom
        speaker = "Stephen Stearns"

    airtable.insert({
        "sys.yt_id": video_id,
        "URL": url,
        "Title": title,
        "sys.yt_transcript": transcript,
        "KeyWords": keywords,
        "Length": duration,
        "Creator_FirstLast": speaker,
        "Sponsor": channel_name,
        "Abstract/Description": description,
        "Hosting site": hosting,
        "Year": published,
        "Kind of resource": ["Video"],
        "Kind tex": "Video",
        "sys.approved": False,
        "Image": [{"url": thumbnail}],
    })
# MAIN program
def main(argv=None):
    """Scrape every video listed in a text file and push it to Airtable.

    Args:
        argv: Command-line arguments without the program name. Defaults to
            ``sys.argv[1:]``. The first argument, when present, is the path
            of the file listing one video URL per line; otherwise
            ``videos.txt`` is used.
    """
    args = sys.argv[1:] if argv is None else argv
    filename = args[0] if args else "videos.txt"

    for video in videos_array(filename):
        # Blank lines (e.g. a trailing newline in the list) are not URLs.
        if not video.strip():
            continue
        push_to_airtable(youtube_info_object(video))


# Run only when executed as a script, so importing the module has no side
# effects.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment