Last active
February 11, 2022 22:00
-
-
Save Phoenix-Effect/fbc8e54af71424d7ce4e86d54182f8b1 to your computer and use it in GitHub Desktop.
Scrape video data from YouTube and push it to Airtable. By default it reads from videos.txt, but a different file can be specified with an argument.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pafy | |
import os | |
import glob | |
import re | |
from airtable import Airtable | |
from airtable.auth import AirtableAuth | |
import requests | |
from pathlib import Path | |
from webvtt import WebVTT | |
# Airtable connection settings. The values below are placeholders and must be
# replaced before running; they are passed to Airtable(baseurl, tableid, apikey).
apikey = "airtable api key"  # Airtable API key (third constructor argument)
baseurl = "base url of airtable"  # first constructor argument
tableid = "table id"  # second constructor argument
def is_number(s):
    """Return True if ``s`` can be interpreted as a number, else False.

    Two checks are tried in order:

    1. ``float(s)`` -- accepts ints, floats and numeric strings such as
       ``"12.5"`` or ``"-3"``.
    2. ``unicodedata.numeric(s)`` -- accepts a single Unicode character
       that carries a numeric value but is rejected by ``float``
       (for example ``"½"``).

    Input of an unsupported type (for example ``None``) yields False rather
    than raising, so both checks treat bad types the same way.
    """
    import unicodedata

    try:
        float(s)
        return True
    except (TypeError, ValueError):
        # Not parseable as a float; fall through to the Unicode check.
        pass

    try:
        unicodedata.numeric(s)
        return True
    except (TypeError, ValueError):
        # TypeError: not a single character; ValueError: no numeric value.
        pass

    return False
def cleanhtml(raw_html):
    """Return ``raw_html`` with every ``<...>`` span removed.

    The match is non-greedy, so each tag is stripped on its own and the
    text between tags is kept. ``.`` does not match a newline here, so a
    tag that spans several lines is left untouched.
    """
    return re.sub('<.*?>', '', raw_html)
def url_to_video_id(url):
    """Extract the 11-character video ID from a YouTube URL.

    The pattern is anchored at the start of ``url`` and accepts an optional
    scheme and ``www.`` prefix, the ``youtube`` / ``youtu`` /
    ``youtube-nocookie`` hosts with a ``.com`` or ``.be`` suffix, and an
    optional ``watch?v=``, ``embed/``, ``v/`` or ``...?v=`` path prefix.

    Returns the ID string, or ``None`` when ``url`` does not match.
    """
    # Every fragment is a raw string: the pattern contains ``\.`` and ``\?``,
    # which are invalid escape sequences in ordinary string literals.
    youtube_regex = (
        r'(https?://)?(www\.)?'
        r'(youtube|youtu|youtube-nocookie)\.(com|be)/'
        r'(watch\?v=|embed/|v/|.+\?v=)?([^&=%\?]{11})')
    match = re.match(youtube_regex, url)
    if match:
        # Group 6 is the 11-character ID that follows the optional path prefix.
        return match.group(6)
    return None
def videos_array(filename="videos.txt"):
    """Read ``filename`` and return its lines as a list of strings.

    Only the trailing newline of each line is stripped; other whitespace and
    blank lines are returned unchanged. The file is closed before returning.

    Raises ``OSError`` (e.g. ``FileNotFoundError``) if the file cannot be
    opened.
    """
    with open(filename) as handle:
        return [line.rstrip('\n') for line in handle]
def download_subs(video_url, lang="en"):
    """Download auto-generated subtitles for a video and return the file name.

    Runs ``youtube-dl --skip-download --write-auto-sub --sub-lang <lang>``
    for ``video_url`` in the current working directory, then looks for a
    file whose name contains the video ID followed by a dot.

    Args:
        video_url: YouTube URL of the video.
        lang: Subtitle language code passed to ``--sub-lang``.

    Returns:
        The first matching file name, or ``None`` when the URL has no
        recognisable video ID or no matching file exists.
    """
    import subprocess

    cmd = [
        "youtube-dl",
        "--skip-download",
        "--write-auto-sub",
        "--sub-lang",
        lang,
        video_url,
    ]
    # Pass the argument list directly instead of building a shell string:
    # URLs routinely contain '&' and '?', which a shell would interpret.
    try:
        subprocess.run(cmd, check=False)
    except OSError:
        # youtube-dl is missing or not executable. Stay best-effort and fall
        # through to the file lookup, which then normally finds nothing.
        pass

    video_id = url_to_video_id(video_url)
    if video_id is None:
        # No ID to search for; avoid concatenating None into the pattern.
        return None

    matches = glob.glob('*' + glob.escape(video_id) + '.*')
    return matches[0] if matches else None
def filetotranscript(filename):
    """Turn a WebVTT subtitle file into one transcript string.

    If ``filename`` is not an existing file, the literal "No captions" is
    returned. Otherwise the file is parsed, the text of every third caption
    (indices 0, 3, 6, ...) is right-stripped and followed by a single space,
    the pieces are concatenated, and the subtitle file is deleted.
    """
    if not os.path.isfile(filename):
        return "No captions"

    captions = WebVTT().read(filename)
    # Only every third caption is kept -- presumably because auto-generated
    # subtitles repeat each line across consecutive cues (TODO confirm).
    pieces = [
        caption.text.rstrip() + " "
        for index, caption in enumerate(captions)
        if index % 3 == 0
    ]
    os.remove(filename)
    return "".join(pieces)
def youtube_info_object(url):
    """Collect metadata and a transcript for the video at ``url``.

    Creates a pafy object for the URL and returns its attribute dictionary
    (``vars`` of the object), extended with three keys: ``transcript``,
    ``description`` and ``duration``.
    """
    video = pafy.new(url)
    # vars() returns the object's own attribute dict, so the keys added below
    # are stored on the pafy object as well.
    record = vars(video)

    subs_file = download_subs(url)
    # With no subtitle file, pass a single-space path; filetotranscript then
    # takes its "file does not exist" branch (assuming no file is named " ").
    record['transcript'] = filetotranscript(" " if subs_file is None else subs_file)
    record['description'] = video.description
    record['duration'] = video.duration
    return record
def push_to_airtable(line):
    """Insert one video record into the configured Airtable table.

    Args:
        line: Mapping describing a video. The keys read are ``watchv_url``,
            ``videoid``, ``_title``, ``transcript``, ``_keywords``,
            ``_username``, ``_description``, ``_published`` and ``_length``.

    The title may have the form ``"<title>|<speaker>"``; when no ``|`` is
    present the whole string is the title and a default speaker is used.

    Raises:
        KeyError: if one of the keys listed above is missing from ``line``.
    """
    airtable = Airtable(baseurl, tableid, apikey)

    url = str(line['watchv_url'])
    video_id = str(line['videoid'])
    custom = str(line['_title'])
    transcript = str(line['transcript'])
    thumbnail = 'https://img.youtube.com/vi/' + video_id + '/hqdefault.jpg'
    keywords = ", ".join(line['_keywords'])
    channel_name = str(line['_username'])
    description = str(line['_description'])
    # Keep only the text before the first '-' (the year, if the value is a
    # YYYY-MM-DD style date -- TODO confirm the format supplied by pafy).
    published = str(line['_published']).split('-')[0]
    duration = line['_length']

    # NOTE(review): the original referenced an undefined name ``hosting``
    # (NameError on every call). Every record handled here comes from
    # YouTube, so that is used as the hosting site -- confirm the exact value
    # expected by the "Hosting site" column.
    hosting = "YouTube"

    title_parts = custom.split('|')
    if len(title_parts) > 1:
        title = title_parts[0]
        speaker = title_parts[1]
    else:
        title = custom
        speaker = "Stephen Stearns"

    airtable.insert({
        "sys.yt_id": video_id,
        "URL": url,
        "Title": title,
        "sys.yt_transcript": transcript,
        "KeyWords": keywords,
        "Length": duration,
        "Creator_FirstLast": speaker,
        "Sponsor": channel_name,
        "Abstract/Description": description,
        "Hosting site": hosting,
        "Year": published,
        "Kind of resource": ["Video"],
        "Kind tex": "Video",
        "sys.approved": False,
        "Image": [{"url": thumbnail}],
    })
# MAIN program: for each URL in the default list file (videos.txt), build the
# video record and insert it into Airtable, one video at a time.
for video_url in videos_array():
    push_to_airtable(youtube_info_object(video_url))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment