Skip to content

Instantly share code, notes, and snippets.

@dbreunig
Last active December 14, 2022 19:43
Show Gist options
  • Save dbreunig/36dcf8f33035341a1b247a026ed0c0cc to your computer and use it in GitHub Desktop.
Save dbreunig/36dcf8f33035341a1b247a026ed0c0cc to your computer and use it in GitHub Desktop.
Given a podcast RSS feed, download the episodes, transcribe them, and load the transcriptions into a sqlite3 database.
import sqlite3
from pathlib import PurePosixPath
from urllib.parse import urlsplit

import feedparser
import requests
import whisper
# Connect to the database and create the tables
con = sqlite3.connect("podscribe.db")
# SQLite ignores FOREIGN KEY clauses (including the ON DELETE CASCADE declared
# on segments) unless enforcement is switched on for the connection.
con.execute("PRAGMA foreign_keys = ON")
cur = con.cursor()
# IF NOT EXISTS lets the script be started against an existing podscribe.db
# instead of failing with "table episodes already exists".
cur.execute("""
CREATE TABLE IF NOT EXISTS episodes(
id TEXT PRIMARY KEY,
title TEXT,
pub_date TEXT,
link TEXT,
summary TEXT,
audio_link TEXT,
processed INTEGER DEFAULT 0
)
""")
# episode_id is TEXT so it has the same type as the episodes.id it references.
cur.execute("""
CREATE TABLE IF NOT EXISTS segments(
episode_id TEXT,
seek REAL,
start REAL,
end REAL,
text TEXT,
FOREIGN KEY (episode_id)
REFERENCES episodes (id)
ON DELETE CASCADE
ON UPDATE NO ACTION
)
""")
# Load the model ("tiny" Whisper checkpoint)
model = whisper.load_model("tiny")
# Download an RSS feed
feed_url = "https://feed.xml" # DUMMY FEED, REPLACE WITH YOUR OWN
feed = feedparser.parse(feed_url)
# feedparser does not raise when the feed cannot be fetched or parsed; it sets
# the bozo flag and returns no entries. Report that case, because otherwise
# the rest of the script runs over an empty list without any output.
if feed.get('bozo') and not feed.get('entries'):
    print(f"Could not read feed {feed_url}: {feed.get('bozo_exception')}")
# Load the episodes into the database
# Ids already present in the episodes table. Entries carrying one of these ids
# (or repeating an id within the feed) are skipped, so the INSERT below cannot
# violate the PRIMARY KEY on episodes.id.
known_ids = {row[0] for row in cur.execute("SELECT id FROM episodes")}
episodes = []
for entry in feed.entries:
    # Find the audio link. It is reset for every entry so that an entry
    # without an enclosure cannot reuse the previous episode's audio URL
    # (or raise NameError when it is the first entry).
    audio_link = None
    for link in entry.get('links', []):
        if link.get('rel') == 'enclosure':
            audio_link = link.get('href')
    if not audio_link:
        print(f"Skipping {entry.get('title')}: no audio enclosure")
        continue
    # Feeds may omit the id; the audio URL is used as the key in that case.
    episode_id = entry.get('id') or audio_link
    if episode_id in known_ids:
        continue
    known_ids.add(episode_id)
    # Load the metadata we need. Optional fields missing from the feed are
    # stored as NULL instead of raising KeyError.
    episodes.append((
        episode_id,
        entry.get('title'),
        entry.get('published'),
        entry.get('link'),
        entry.get('summary'),
        audio_link,
        0,
    ))
cur.executemany("INSERT INTO episodes VALUES(?, ?, ?, ?, ?, ?, ?)", episodes)
con.commit()
# Kick off the transcription of each
# Every item of `episodes` is the 7-tuple inserted into the episodes table:
# (id, title, pub_date, link, summary, audio_link, processed).
for episode_id, title, _, _, _, audio_link, _ in episodes:
    print(f"Starting {title}")
    # Download
    # Name the file after the last path segment of the URL only, so a query
    # string (e.g. "?source=rss") does not end up in the filename, and do not
    # append ".mp3" a second time when the URL already ends with it.
    filename = PurePosixPath(urlsplit(audio_link).path).name or "episode"
    if not filename.lower().endswith(".mp3"):
        filename = f"{filename}.mp3"
    # Stream to disk in chunks instead of holding the whole file in memory.
    # raise_for_status() stops an HTTP error page from being saved and handed
    # to Whisper as if it were audio; the timeout prevents hanging forever.
    with requests.get(audio_link, stream=True, timeout=60) as response:
        response.raise_for_status()
        with open(filename, "wb") as audio_file:
            for chunk in response.iter_content(chunk_size=1 << 20):
                audio_file.write(chunk)
    # Transcribe
    transcription = model.transcribe(filename)
    # Load
    segments = [
        (episode_id, s['seek'], s['start'], s['end'], s['text'])
        for s in transcription['segments']
    ]
    cur.executemany("INSERT INTO segments VALUES(?, ?, ?, ?, ?)", segments)
    # Flag the episode as processed in the same transaction as its segments.
    cur.execute("UPDATE episodes SET processed = 1 WHERE id = ?", (episode_id,))
    con.commit()
    # Puts result
    print(f"Loaded {title}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment