Skip to content

Instantly share code, notes, and snippets.

@jatrost
Last active November 2, 2024 02:42
Show Gist options
  • Save jatrost/be0cfba5764ca403975a93259ac554a0 to your computer and use it in GitHub Desktop.
Save jatrost/be0cfba5764ca403975a93259ac554a0 to your computer and use it in GitHub Desktop.
Various scripts / tools for extracting text from podcasts and youtube videos
#!/usr/bin/env python3
import getpodcast
import re
import requests
def get_podcast_url_apple(podcast_url):
    """Resolve an Apple Podcasts page URL to the podcast's feed URL.

    The numeric podcast id is taken from the ``/id<digits>`` part of
    ``podcast_url`` and sent to the iTunes lookup endpoint; the ``feedUrl``
    entry of the first result is returned.

    Args:
        podcast_url: Apple Podcasts URL, e.g.
            ``https://podcasts.apple.com/us/podcast/<slug>/id1505372978``.

    Returns:
        The ``feedUrl`` value of the first lookup result, or ``None`` when
        the URL has no ``/id<digits>`` segment, the lookup returns no
        results, or the first result has no ``feedUrl`` key.

    Raises:
        requests.HTTPError: if the lookup answers with an error status.
        requests.RequestException: on connection failure or timeout.
    """
    # The previous pattern r'[^w]+/id(\d+)' demanded a non-"w" character
    # directly before "/id", so any slug ending in "w" (".../some-show/id1")
    # was never recognised. Only the "/id<digits>" segment matters.
    match = re.search(r'/id(\d+)', podcast_url)
    if match is None:
        return None

    podcast_id = match.group(1)  # avoid shadowing the builtin ``id``
    response = requests.get(
        "https://itunes.apple.com/lookup",
        params={"id": podcast_id, "entity": "podcast"},
        timeout=30,  # without a timeout a stalled connection blocks forever
    )
    response.raise_for_status()

    results = response.json().get('results', [])
    if not results:
        return None
    return results[0].get('feedUrl')
# Options passed to getpodcast: the date_from cutoff and the root directory.
opt = getpodcast.options(root_dir='./podcast', date_from='2024-01-05')

# Podcast name -> feed URL (or Apple Podcasts page URL, resolved below).
# Uncomment an entry to include it.
podcasts = {
    # "Big Technology Podcast": "https://podcasts.apple.com/us/podcast/big-technology-podcast/id1522960417",
    # "Modern Wisdom": "https://feeds.megaphone.fm/SIXMSB5088139739",
    # "Hard Fork": "https://feeds.simplecast.com/l2i9YnTd",
    # "The AI Breakdown": "https://podcasts.apple.com/us/podcast/the-ai-breakdown-daily-artificial-intelligence-news/id1680633614",
    # "The Cognitive Revolution": "https://podcasts.apple.com/us/podcast/the-cognitive-revolution-ai-builders-researchers-and/id1669813431",
    "MLOps.community": "https://podcasts.apple.com/us/podcast/mlops-community/id1505372978",
}

# Swap every Apple Podcasts page URL for the feed URL found via lookup.
# Entries that cannot be resolved keep their original URL and are reported.
for name in list(podcasts):
    page_url = podcasts[name]
    if not page_url.startswith('https://podcasts.apple.com'):
        continue

    feed_url = get_podcast_url_apple(page_url)
    if not feed_url:
        print(f'WARNING: could not determine URL for apple podcast: {name}, {page_url}')
        continue

    print(f'INFO: Found XML URL for apple podcast: {name}: {feed_url}')
    podcasts[name] = feed_url

getpodcast.getpodcast(podcasts, opt)
from youtube_transcript_api import YouTubeTranscriptApi
import sys
import json
# Command-line entry point: print the transcript of one YouTube video.
#   VIDEO_ID        -> the whole transcript as a single JSON document
#   VIDEO_ID -t     -> only the 'text' field of each record, one per line
# Any other argument combination prints a usage line and exits with status 2
# (previously the script exited silently without doing anything).
# NOTE(review): newer youtube-transcript-api releases appear to replace
# get_transcript() with an instance-level fetch() API — confirm against the
# installed version.
cli_args = sys.argv[1:]
text_only = '-t' in cli_args
if text_only:
    cli_args.remove('-t')  # operates on the copy; sys.argv stays untouched

if len(cli_args) != 1:
    print(f'usage: {sys.argv[0]} VIDEO_ID [-t]', file=sys.stderr)
    sys.exit(2)

transcript = YouTubeTranscriptApi.get_transcript(cli_args[0])
if text_only:
    for rec in transcript:
        print(rec['text'])
else:
    print(json.dumps(transcript))

Setup

pip install -r requirements.txt

Configuration

Add your podcast URLs to download_podcast.py

podcasts = {
   "MLOps.community": "https://podcasts.apple.com/us/podcast/mlops-community/id1505372978"
}

Running

Download Podcasts

python download_podcast.py --run

Audio to Text

This will output a file named file.json.

time whisper file.mp3 --model base --language en --output_format json 

YouTube Transcript

Input is a YouTube video ID (the value after "v=" in the video URL). Add -t to print plain text instead of JSON.

python download_youtube_transcript.py slIVToAG98M -t > transcript.txt
youtube-transcript-api
openai-whisper
getpodcast
requests
@nusliew
Copy link

nusliew commented Nov 2, 2024

May I know is it possible to support downloading specific episode of Podcast? Thanks.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment