import re
import time
import json
import html
import requests
import feedparser
from bs4 import BeautifulSoup
from urllib.parse import urljoin, parse_qs, urlparse
CHANNEL_ID = "" # aCYGczpunIkljwZ83O1z1o0k
CHANNEL_HANDLE_URL = "https://www.youtube.com/@ChannelName"
RSS_FEED = f"https://www.youtube.com/feeds/videos.xml?channel_id={CHANNEL_ID}"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; pytube-bot/1.0; +https://example.com/bot)"
}
VIDEO_ID_RE = re.compile(r"(?:v=|/embed/|/watch\?v=|/v/|youtu\.be/)([A-Za-z0-9_-]{11})")
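# The pattern above covers the common URL shapes (watch?v=, /embed/, /v/, youtu.be/).
# Illustrative check with a hypothetical URL; video ids are always 11 chars of [A-Za-z0-9_-]:
#   >>> VIDEO_ID_RE.search("https://youtu.be/dQw4w9WgXcQ").group(1)
#   'dQw4w9WgXcQ'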
# ---------- 1) RSS fetch (public uploads only) ----------
def get_rss_videos(channel_id):
    feed_url = f"https://www.youtube.com/feeds/videos.xml?channel_id={channel_id}"
    feed = feedparser.parse(feed_url)
    videos = []
    for e in feed.entries:
        # feedparser maps <yt:videoId> to yt_videoid; fall back to parsing the entry link
        vid = getattr(e, "yt_videoid", None)
        if not vid:
            m = VIDEO_ID_RE.search(e.link or "")
            vid = m.group(1) if m else None
        if not vid:
            continue
        videos.append({
            "video_id": vid,
            "title": e.title,
            "link": e.link,
            "published": e.published,
        })
    return videos
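# Sketch of the shape returned above (field names follow feedparser's mapping of the
# Atom feed; the values here are hypothetical):
#   >>> get_rss_videos("UCxxxxxxxxxxxxxxxxxxxxxx")[0]
#   {'video_id': 'dQw4w9WgXcQ', 'title': 'Example upload',
#    'link': 'https://www.youtube.com/watch?v=dQw4w9WgXcQ',
#    'published': '2026-01-01T00:00:00+00:00'}
# Note: the uploads feed only exposes roughly the 15 most recent public videos.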
# ---------- 2) Scrape channel page for ytInitialData / embedded IDs ----------
def get_videoids_from_channel_page(handle_url):
    r = requests.get(handle_url, headers=HEADERS, timeout=15)
    # Decode basic HTML escapes so ids hidden inside escaped JSON still match
    text_unesc = html.unescape(r.text)
    # 1) Try to find the ytInitialData blob (assigned inline as
    #    "var ytInitialData = {...};") and pull "videoId" values out of it
    m = re.search(r"ytInitialData\s*=\s*(\{.*?\});", text_unesc, flags=re.DOTALL)
    if m:
        vids = set(re.findall(r'"videoId"\s*:\s*"([A-Za-z0-9_-]{11})"', m.group(1)))
        if vids:
            return list(vids)
    # 2) Fallback: regex search for any watch/embed-style video ids in the markup
    vids = set(VIDEO_ID_RE.findall(text_unesc))
    return list(vids)
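# The /videos tab is usually a richer source of ids than the channel root, since its
# ytInitialData embeds the uploads grid. A minimal sketch, reusing CHANNEL_HANDLE_URL:
#   >>> ids = get_videoids_from_channel_page(CHANNEL_HANDLE_URL.rstrip("/") + "/videos")
#   >>> len(ids)   # hypothetical result; only videos present in the initial page load
#   30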
# ---------- 3) Find public playlists from the /playlists page and extract videos ----------
def get_public_playlists_for_channel(handle_url):
    playlists = set()
    playlists_page = handle_url.rstrip("/") + "/playlists"
    r = requests.get(playlists_page, headers=HEADERS, timeout=15)
    soup = BeautifulSoup(r.text, "html.parser")
    # Collect playlist ids from any anchor that carries a ?list= parameter
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if "list=" in href:
            q = parse_qs(urlparse(href).query)
            if "list" in q:
                playlists.add(q["list"][0])
    return list(playlists)
def get_videos_from_playlist_page(playlist_id):
    url = f"https://www.youtube.com/playlist?list={playlist_id}"
    r = requests.get(url, headers=HEADERS, timeout=15)
    # Older playlist markup exposes ids as data-video-id attributes; newer pages embed
    # them in JSON, so also fall back to the generic URL pattern
    vids = set(re.findall(r'data-video-id="([A-Za-z0-9_-]{11})"', r.text))
    vids.update(VIDEO_ID_RE.findall(r.text))
    return list(vids)
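# Combining the two playlist helpers; a minimal sketch assuming the channel exposes at
# least one public playlist on its /playlists tab:
#   >>> for pl in get_public_playlists_for_channel(CHANNEL_HANDLE_URL):
#   ...     print(pl, len(get_videos_from_playlist_page(pl)))
# Only videos rendered into the initial HTML are found; long playlists come back
# truncated because continuation data is loaded by JavaScript.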
# ---------- 4) Optionally enrich via YouTube Data API (if you supply API key) ----------
def enrich_with_youtube_api(video_ids, api_key):
    # Uses the official API client if available:
    #   pip install google-api-python-client
    from googleapiclient.discovery import build
    youtube = build("youtube", "v3", developerKey=api_key)
    out = []
    # videos().list accepts at most 50 ids per request, so chunk the lookups
    for i in range(0, len(video_ids), 50):
        chunk = video_ids[i:i + 50]
        resp = youtube.videos().list(
            part="snippet,contentDetails,status",
            id=",".join(chunk)
        ).execute()
        for item in resp.get("items", []):
            out.append({
                "id": item["id"],
                "title": item["snippet"]["title"],
                "status": item.get("status", {}),
                "publishedAt": item["snippet"].get("publishedAt"),
            })
    return out
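# Usage sketch (requires an API key with the YouTube Data API v3 enabled; the key
# below is a placeholder):
#   >>> meta = enrich_with_youtube_api(["dQw4w9WgXcQ"], api_key="YOUR_API_KEY")
#   >>> meta[0]["status"].get("privacyStatus")   # e.g. 'public', 'unlisted' or 'private'
# Note: videos().list only returns items the caller is allowed to see, so other
# people's private videos simply drop out of the response.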
# ---------- 5) Crawl an external list of pages to find embedded ids ----------
def find_videoids_in_pages(urls):
    found = set()
    for u in urls:
        try:
            r = requests.get(u, headers=HEADERS, timeout=12)
            found.update(VIDEO_ID_RE.findall(r.text))
        except Exception:
            continue
    return list(found)
# ---------- 6) Check video watch page for availability (quick heuristic) ----------
def check_video_status(video_id):
    watch_url = f"https://www.youtube.com/watch?v={video_id}"
    r = requests.get(watch_url, headers=HEADERS, timeout=12)
    text = r.text
    # String heuristics against the rendered watch page:
    if "This video is private" in text or "is private" in text:
        return "private"
    if "has been removed" in text or "Video unavailable" in text:
        return "unavailable"
    # If the page loads and contains a player, treat it as accessible (public or unlisted)
    if "player" in text or "ytplayer" in text:
        return "accessible"
    return "unknown"
# ---------- Orchestrator: try multiple methods and merge results ----------
def find_possible_videos(channel_id, handle_url, extra_pages=None, api_key=None):
    extra_pages = extra_pages or []
    results = {}
    # RSS
    try:
        for v in get_rss_videos(channel_id):
            results[v["video_id"]] = {"source": "rss", **v}
    except Exception:
        pass
    # Channel page scrape
    try:
        for vid in get_videoids_from_channel_page(handle_url):
            results.setdefault(vid, {})["source_channel_page"] = True
    except Exception:
        pass
    # Public playlists
    try:
        for pl in get_public_playlists_for_channel(handle_url):
            for vid in get_videos_from_playlist_page(pl):
                results.setdefault(vid, {})["source_playlist"] = pl
    except Exception:
        pass
    # External pages
    if extra_pages:
        for vid in find_videoids_in_pages(extra_pages):
            results.setdefault(vid, {})["source_external"] = True
    # Optional API enrichment
    if api_key and results:
        try:
            for m in enrich_with_youtube_api(list(results.keys()), api_key):
                results.setdefault(m["id"], {})["api_meta"] = m
        except Exception as e:
            results["_api_error"] = str(e)
    # Quick status check
    for vid in list(results.keys()):
        # skip bookkeeping keys and accidental channel-id (UC...) matches
        if vid.startswith("_") or vid.startswith("UC"):
            continue
        try:
            results[vid]["status_check"] = check_video_status(vid)
        except Exception:
            results[vid]["status_check"] = "error"
    return results
# ---------- Example usage ----------
if __name__ == "__main__":
extra = [
# add URLs of blogs, Twitter/X posts, GitHub READMEs, or other pages to scan
# "https://example.com/possible-embed",
]
res = find_possible_videos(CHANNEL_ID, CHANNEL_HANDLE_URL, extra_pages=extra, api_key=None)
print("Discovered video candidates:", list(res.keys()))
print(json.dumps(res, indent=2))