dbreunig · February 15, 2023 19:05
diff --git a/podcast-to-transcript-to-sqlite.py b/podcast-to-transcript-to-sqlite.py
 import feedparser
 import whisper
 import sqlite3
 import requests

 # Pipeline: read the podcast RSS feed, store episode metadata in SQLite,
 # transcribe the first 10 episodes with Whisper and store the segments.
 podcast_feed_url = "https://feeds.libsyn.com/92106/rss"
 db_name = "podcast.db"

 # Create the database and its tables. IF NOT EXISTS keeps a second run from
 # failing on tables created by an earlier run.
 con = sqlite3.connect(db_name)
 # SQLite ignores FOREIGN KEY clauses (including ON DELETE CASCADE) unless
 # enforcement is switched on for the connection.
 con.execute("PRAGMA foreign_keys = ON")
 cur = con.cursor()
 cur.execute("""
     CREATE TABLE IF NOT EXISTS episodes(
         id TEXT PRIMARY KEY,
         title TEXT,
         pub_date TEXT,
         link TEXT,
         summary TEXT,
         audio_link TEXT,
         processed INTEGER DEFAULT 0
     )
     """)
 # episode_id is TEXT so that it has the same type as episodes.id.
 cur.execute("""
     CREATE TABLE IF NOT EXISTS segments(
         episode_id TEXT,
         seek REAL,
         start REAL,
         end REAL,
         text TEXT,
         FOREIGN KEY (episode_id)
             REFERENCES episodes (id)
                 ON DELETE CASCADE
                 ON UPDATE NO ACTION
     )
     """)

 # Load whisper model [tiny, base, small, medium, or large]
 model = whisper.load_model("small")

 # Load the podcast feed url
 feed = feedparser.parse(podcast_feed_url)

 # Prep the podcast feed data into an array prepared for sql insertion
 episodes = []
 for e in feed.entries:
     # Find the audio link. It is reset for every entry so that an entry
     # without an enclosure is stored with NULL instead of silently reusing
     # the previous episode's audio file.
     audio_link = None
     for link in e.get('links', []):
         if link.get('rel') == 'enclosure':
             audio_link = link['href']
     # Load the metadata we need (same column order as the episodes table)
     episodes.append((
         e['id'],
         e['title'],
         e['published'],
         e['link'],
         e['summary'],
         audio_link,
         0,
     ))

 # Insert the episodes into the db. OR IGNORE leaves rows from an earlier run
 # (and their processed flag) untouched instead of raising on the primary key.
 cur.executemany(
     "INSERT OR IGNORE INTO episodes VALUES(?, ?, ?, ?, ?, ?, ?)", episodes
 )
 con.commit()

 # Episodes whose segments were already stored by an earlier run.
 already_processed = {
     row[0] for row in cur.execute("SELECT id FROM episodes WHERE processed = 1")
 }

 # Transcribe the first 10 episodes (adjust the subarray values to transcribe fewer or more)
 for e in episodes[0:10]:
     episode_id, title, audio_link = e[0], e[1], e[5]
     if audio_link is None or episode_id in already_processed:
         # Nothing to download, or the transcript is already in the database.
         continue
     print(f"Starting {title}")
     # Download. Any query string is dropped from the last URL segment so it
     # does not end up in the local file name.
     filename = f"{audio_link.split('/')[-1].split('?')[0]}.mp3"
     response = requests.get(audio_link, timeout=60)
     # Fail here rather than saving an HTTP error page and transcribing it.
     response.raise_for_status()
     with open(filename, "wb") as audio_file:
         audio_file.write(response.content)
     # Transcribe
     transcription = model.transcribe(filename)
     # Load the segments and flag the episode as done in one transaction
     segments = [
         (episode_id, s['seek'], s['start'], s['end'], s['text'])
         for s in transcription['segments']
     ]
     cur.executemany("INSERT INTO segments VALUES(?, ?, ?, ?, ?)", segments)
     cur.execute("UPDATE episodes SET processed = 1 WHERE id = ?", (episode_id,))
     con.commit()
     # Report progress
     print(f"Loaded {title}")
	import feedparser
	import whisper
	import sqlite3
	import requests

	# Pipeline: read the podcast RSS feed, store episode metadata in SQLite,
	# transcribe the first 10 episodes with Whisper and store the segments.
	podcast_feed_url = "https://feeds.libsyn.com/92106/rss"
	db_name = "podcast.db"

	# Create the database and its tables. IF NOT EXISTS keeps a second run from
	# failing on tables created by an earlier run.
	con = sqlite3.connect(db_name)
	# SQLite ignores FOREIGN KEY clauses (including ON DELETE CASCADE) unless
	# enforcement is switched on for the connection.
	con.execute("PRAGMA foreign_keys = ON")
	cur = con.cursor()
	cur.execute("""
	    CREATE TABLE IF NOT EXISTS episodes(
	        id TEXT PRIMARY KEY,
	        title TEXT,
	        pub_date TEXT,
	        link TEXT,
	        summary TEXT,
	        audio_link TEXT,
	        processed INTEGER DEFAULT 0
	    )
	    """)
	# episode_id is TEXT so that it has the same type as episodes.id.
	cur.execute("""
	    CREATE TABLE IF NOT EXISTS segments(
	        episode_id TEXT,
	        seek REAL,
	        start REAL,
	        end REAL,
	        text TEXT,
	        FOREIGN KEY (episode_id)
	            REFERENCES episodes (id)
	                ON DELETE CASCADE
	                ON UPDATE NO ACTION
	    )
	    """)

	# Load whisper model [tiny, base, small, medium, or large]
	model = whisper.load_model("small")

	# Load the podcast feed url
	feed = feedparser.parse(podcast_feed_url)

	# Prep the podcast feed data into an array prepared for sql insertion
	episodes = []
	for e in feed.entries:
	    # Find the audio link. It is reset for every entry so that an entry
	    # without an enclosure is stored with NULL instead of silently reusing
	    # the previous episode's audio file.
	    audio_link = None
	    for link in e.get('links', []):
	        if link.get('rel') == 'enclosure':
	            audio_link = link['href']
	    # Load the metadata we need (same column order as the episodes table)
	    episodes.append((
	        e['id'],
	        e['title'],
	        e['published'],
	        e['link'],
	        e['summary'],
	        audio_link,
	        0,
	    ))

	# Insert the episodes into the db. OR IGNORE leaves rows from an earlier run
	# (and their processed flag) untouched instead of raising on the primary key.
	cur.executemany(
	    "INSERT OR IGNORE INTO episodes VALUES(?, ?, ?, ?, ?, ?, ?)", episodes
	)
	con.commit()

	# Episodes whose segments were already stored by an earlier run.
	already_processed = {
	    row[0] for row in cur.execute("SELECT id FROM episodes WHERE processed = 1")
	}

	# Transcribe the first 10 episodes (adjust the subarray values to transcribe fewer or more)
	for e in episodes[0:10]:
	    episode_id, title, audio_link = e[0], e[1], e[5]
	    if audio_link is None or episode_id in already_processed:
	        # Nothing to download, or the transcript is already in the database.
	        continue
	    print(f"Starting {title}")
	    # Download. Any query string is dropped from the last URL segment so it
	    # does not end up in the local file name.
	    filename = f"{audio_link.split('/')[-1].split('?')[0]}.mp3"
	    response = requests.get(audio_link, timeout=60)
	    # Fail here rather than saving an HTTP error page and transcribing it.
	    response.raise_for_status()
	    with open(filename, "wb") as audio_file:
	        audio_file.write(response.content)
	    # Transcribe
	    transcription = model.transcribe(filename)
	    # Load the segments and flag the episode as done in one transaction
	    segments = [
	        (episode_id, s['seek'], s['start'], s['end'], s['text'])
	        for s in transcription['segments']
	    ]
	    cur.executemany("INSERT INTO segments VALUES(?, ?, ?, ?, ?)", segments)
	    cur.execute("UPDATE episodes SET processed = 1 WHERE id = ?", (episode_id,))
	    con.commit()
	    # Report progress
	    print(f"Loaded {title}")