Last active
December 14, 2022 19:43
-
-
Save dbreunig/36dcf8f33035341a1b247a026ed0c0cc to your computer and use it in GitHub Desktop.
Given a podcast RSS feed, download the episodes, transcribe them, and load the transcriptions into a sqlite3 database.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sqlite3

import feedparser
import requests
import whisper
# Connect to the database and create the tables.
#
# The DDL lives in module-level constants and uses IF NOT EXISTS so the script
# can be run repeatedly against the same podscribe.db without failing on
# "table already exists".
EPISODES_SCHEMA = """
    CREATE TABLE IF NOT EXISTS episodes(
        id TEXT PRIMARY KEY,
        title TEXT,
        pub_date TEXT,
        link TEXT,
        summary TEXT,
        audio_link TEXT,
        processed INTEGER DEFAULT 0
    )
"""

# episode_id is TEXT so that it has the same type as the episodes.id key it
# references (feed ids are strings, typically GUIDs or URLs).
SEGMENTS_SCHEMA = """
    CREATE TABLE IF NOT EXISTS segments(
        episode_id TEXT,
        seek REAL,
        start REAL,
        end REAL,
        text TEXT,
        FOREIGN KEY (episode_id)
            REFERENCES episodes (id)
            ON DELETE CASCADE
            ON UPDATE NO ACTION
    )
"""

con = sqlite3.connect("podscribe.db")
# SQLite parses FOREIGN KEY clauses but only enforces them (including the
# ON DELETE CASCADE above) when this pragma is enabled on the connection.
con.execute("PRAGMA foreign_keys = ON")
cur = con.cursor()
cur.execute(EPISODES_SCHEMA)
cur.execute(SEGMENTS_SCHEMA)
# Load the Whisper model used to transcribe the audio
model = whisper.load_model("tiny")

# Download and parse the RSS feed
feed_url = "https://feed.xml"  # DUMMY FEED, REPLACE WITH YOUR OWN
feed = feedparser.parse(feed_url)

# feedparser reports fetch/parse problems through the `bozo` flag rather than
# by raising, so without this check a bad URL just looks like an empty feed.
if feed.get("bozo"):
    print(f"Warning: problem reading {feed_url}: {feed.get('bozo_exception')}")
if not feed.entries:
    print(f"Warning: no episodes found in {feed_url}")
# Load the episodes into the database.
#
# `episodes` ends up holding one 7-tuple per *new* episode, in the column
# order of the episodes table:
# (id, title, pub_date, link, summary, audio_link, processed).

# Ids stored by an earlier run (or seen earlier in this feed) are skipped so
# that re-running the script does not violate the primary key.
known_ids = {row[0] for row in cur.execute("SELECT id FROM episodes")}

episodes = []
for e in feed.entries:
    # Find the audio link. It is reset for every entry so an entry without an
    # enclosure can never inherit the previous entry's audio.
    audio_link = None
    for link in e.get("links", []):
        if link.get("rel") == "enclosure":
            audio_link = link.get("href")

    if not audio_link:
        print(f"Skipping {e.get('title', '(untitled)')}: no audio enclosure")
        continue

    # Fall back to the audio URL when the feed supplies no id for the entry
    episode_id = e.get("id") or audio_link
    if episode_id in known_ids:
        continue
    known_ids.add(episode_id)

    # Load the metadata we need; optional fields missing from the feed are
    # stored as NULL instead of raising KeyError.
    episodes.append((
        episode_id,
        e.get("title"),
        e.get("published"),
        e.get("link"),
        e.get("summary"),
        audio_link,
        0,
    ))

cur.executemany("INSERT INTO episodes VALUES(?, ?, ?, ?, ?, ?, ?)", episodes)
con.commit()
# Kick off the transcription of each episode.
#
# For every episode tuple: download the audio next to the script, transcribe
# it with the loaded model, store the segments, and flag the episode as
# processed.
for e in episodes:
    episode_id, title, audio_url = e[0], e[1], e[5]

    # Skip anything already flagged as transcribed in the database
    row = cur.execute(
        "SELECT processed FROM episodes WHERE id = ?", (episode_id,)
    ).fetchone()
    if row and row[0]:
        continue

    print(f"Starting {title}")

    # Download. The query string / fragment is dropped from the local file
    # name (tracking parameters are common in podcast URLs); the name is
    # otherwise built exactly as before.
    basename = audio_url.split('#')[0].split('?')[0].split('/')[-1]
    filename = f"{basename}.mp3"
    try:
        # Stream to disk in chunks so a long episode is never held in memory,
        # and use a timeout so a stalled server cannot hang the run forever.
        with requests.get(audio_url, stream=True, timeout=60) as response:
            # Do not hand an HTML error page to the transcriber
            response.raise_for_status()
            with open(filename, "wb") as audio_file:
                for chunk in response.iter_content(chunk_size=65536):
                    audio_file.write(chunk)
    except requests.RequestException as err:
        # One bad download should not abort the remaining episodes
        print(f"Skipping {title}: download failed ({err})")
        continue

    # Transcribe
    transcription = model.transcribe(filename)

    # Load
    segments = [
        (episode_id, s['seek'], s['start'], s['end'], s['text'])
        for s in transcription['segments']
    ]
    cur.executemany("INSERT INTO segments VALUES(?, ?, ?, ?, ?)", segments)
    # Record completion in the same transaction as the segments
    cur.execute("UPDATE episodes SET processed = 1 WHERE id = ?", (episode_id,))
    con.commit()

    # Puts result
    print(f"Loaded {title}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment