import re
import time
import json
import html
import requests
import feedparser
from bs4 import BeautifulSoup
from urllib.parse import urljoin, parse_qs, urlparse

CHANNEL_ID = ""  # aCYGczpunIkljwZ83O1z1o0k
CHANNEL_HANDLE_URL = "https://www.youtube.com/@ChannelName"
RSS_FEED = f"https://www.youtube.com/feeds/videos.xml?channel_id={CHANNEL_ID}"

HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; pytube-bot/1.0; +https://example.com/bot)"
}

VIDEO_ID_RE = re.compile(r"(?:v=|/embed/|/watch\?v=|/v/|youtu\.be/)([A-Za-z0-9_-]{11})")
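# For reference, the pattern pulls the 11-character id out of the common URL shapes,
# e.g. with a placeholder id:
#   VIDEO_ID_RE.search("https://youtu.be/AAAAAAAAAAA").group(1)  ->  "AAAAAAAAAAA"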

# ---------- 1) RSS fetch (public uploads only) ----------
def get_rss_videos(channel_id):
    feed_url = f"https://www.youtube.com/feeds/videos.xml?channel_id={channel_id}"
    feed = feedparser.parse(feed_url)
    videos = []
    for e in feed.entries:
        # feedparser usually exposes yt_videoid for YouTube feeds; fall back to
        # extracting the id from the link, guarding against a non-matching URL
        id_match = VIDEO_ID_RE.search(e.link)
        videos.append({
            "video_id": getattr(e, "yt_videoid", None) or (id_match.group(1) if id_match else None),
            "title": e.title,
            "link": e.link,
            "published": e.published,
        })
    return videos
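
# Hedged usage sketch (not part of the original gist; the channel id below is a
# placeholder): the uploads feed only exposes roughly the 15 most recent public
# videos, so treat it as a sanity check rather than a complete listing.
def _demo_rss(channel_id="UCxxxxxxxxxxxxxxxxxxxxxx"):
    for v in get_rss_videos(channel_id):
        print(v["published"], v["video_id"], v["title"])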

# ---------- 2) Scrape channel page for ytInitialData / embedded IDs ----------
def get_videoids_from_channel_page(handle_url):
    r = requests.get(handle_url, headers=HEADERS, timeout=15)
    # Decode basic HTML escapes
    text_unesc = html.unescape(r.text)
    # 1) Try to isolate the ytInitialData blob and scan it for videoId values.
    #    The captured blob is often not valid JSON on its own, so a regex scan is
    #    more robust here than json.loads().
    m = re.search(
        r"ytInitialData\"\s*:\s*(\{.*?\})\s*,\s*\"ytInitialPlayerResponse",
        text_unesc,
        flags=re.DOTALL,
    )
    if m:
        vids = set(re.findall(r'"videoId"\s*:\s*"([A-Za-z0-9_-]{11})"', m.group(1)))
        if vids:
            return list(vids)
    # 2) Fallback: regex search for any video ids anywhere in the markup
    vids = set(VIDEO_ID_RE.findall(text_unesc))
    return list(vids)
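
# Hedged alternative (an assumption about current page markup, not part of the
# original gist): channel pages commonly inline the blob as
# "var ytInitialData = {...};", so a looser pattern can be tried when the stricter
# regex above finds nothing.
def get_videoids_from_initial_data_var(handle_url):
    r = requests.get(handle_url, headers=HEADERS, timeout=15)
    m = re.search(r"var ytInitialData\s*=\s*(\{.+?\})\s*;", r.text, flags=re.DOTALL)
    if not m:
        return []
    return list(set(re.findall(r'"videoId"\s*:\s*"([A-Za-z0-9_-]{11})"', m.group(1))))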

# ---------- 3) Find public playlists from the /playlists page and extract videos ----------
def get_public_playlists_for_channel(handle_url):
    playlists = set()
    playlists_page = handle_url.rstrip("/") + "/playlists"
    r = requests.get(playlists_page, headers=HEADERS, timeout=15)
    soup = BeautifulSoup(r.text, "html.parser")
    # find playlist links in anchor tags
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if "list=" in href:
            parsed = urlparse(href)
            q = parse_qs(parsed.query)
            if "list" in q:
                playlists.add(q["list"][0])
    return list(playlists)

def get_videos_from_playlist_page(playlist_id):
    url = f"https://www.youtube.com/playlist?list={playlist_id}"
    r = requests.get(url, headers=HEADERS, timeout=15)
    # many playlist pages include the video ids in the HTML as "data-video-id" or in JSON
    vids = set(re.findall(r'data-video-id="([A-Za-z0-9_-]{11})"', r.text))
    # fallback to other patterns
    vids.update(VIDEO_ID_RE.findall(r.text))
    return list(vids)
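
# Hedged usage sketch (uses the placeholder handle URL above; not part of the
# original gist):
def _demo_playlists(handle_url=CHANNEL_HANDLE_URL):
    for pl in get_public_playlists_for_channel(handle_url):
        vids = get_videos_from_playlist_page(pl)
        print(f"playlist {pl}: {len(vids)} video id(s) found")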

# ---------- 4) Optionally enrich via YouTube Data API (if you supply API key) ----------
def enrich_with_youtube_api(video_ids, api_key):
    # This function uses the official API client if available.
    # You must pip install google-api-python-client and pass your API key.
    from googleapiclient.discovery import build
    youtube = build("youtube", "v3", developerKey=api_key)
    out = []
    # chunk requests to 50 ids per call (the API's maximum per request)
    for i in range(0, len(video_ids), 50):
        chunk = video_ids[i:i + 50]
        resp = youtube.videos().list(part="snippet,contentDetails,status", id=",".join(chunk)).execute()
        for item in resp.get("items", []):
            out.append({
                "id": item["id"],
                "title": item["snippet"]["title"],
                "status": item.get("status", {}),
                "publishedAt": item["snippet"].get("publishedAt"),
            })
    return out
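
# Hedged usage sketch: requires `pip install google-api-python-client` and a real
# Data API key (the value below is a placeholder). Note that videos.list only
# returns items the API can see, so other people's private videos simply do not
# appear in the response.
def _demo_api_enrichment(video_ids, api_key="YOUR_API_KEY"):
    for meta in enrich_with_youtube_api(video_ids, api_key):
        print(meta["id"], meta["status"].get("privacyStatus"), meta["title"])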

# ---------- 5) Crawl an external list of pages to find embedded ids ----------
def find_videoids_in_pages(urls):
    found = set()
    for u in urls:
        try:
            r = requests.get(u, headers=HEADERS, timeout=12)
            found.update(VIDEO_ID_RE.findall(r.text))
        except Exception:
            continue
    return list(found)
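
# Hedged usage sketch (placeholder URL): scan pages you are allowed to crawl for
# embedded or linked YouTube ids, e.g.
#   ids = find_videoids_in_pages(["https://example.com/blog-post"])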

# ---------- 6) Check video watch page for availability (quick heuristic) ----------
def check_video_status(video_id):
    watch_url = f"https://www.youtube.com/watch?v={video_id}"
    r = requests.get(watch_url, headers=HEADERS, timeout=12)
    text = r.text
    # heuristics:
    if "This video is private" in text or "is private" in text:
        return "private"
    if "has been removed" in text or "Video unavailable" in text:
        return "unavailable"
    # If page loads and contains player, treat as accessible (public or unlisted)
    if "player" in text or "ytplayer" in text:
        return "accessible"
    return "unknown"

# ---------- Orchestrator: try multiple methods and merge results ----------
def find_possible_videos(channel_id, handle_url, extra_pages=None, api_key=None):
    extra_pages = extra_pages or []
    results = {}
    # RSS
    try:
        rss = get_rss_videos(channel_id)
        for v in rss:
            results[v["video_id"]] = {"source": "rss", **v}
    except Exception:
        pass
    # channel page scrape
    try:
        vids = get_videoids_from_channel_page(handle_url)
        for vid in vids:
            results.setdefault(vid, {})["source_channel_page"] = True
    except Exception:
        pass
    # playlists
    try:
        pls = get_public_playlists_for_channel(handle_url)
        for pl in pls:
            vids = get_videos_from_playlist_page(pl)
            for vid in vids:
                results.setdefault(vid, {})["source_playlist"] = pl
    except Exception:
        pass
    # external pages
    if extra_pages:
        found = find_videoids_in_pages(extra_pages)
        for vid in found:
            results.setdefault(vid, {})["source_external"] = True
    # optional API enrichment
    if api_key and results:
        vids = list(results.keys())
        try:
            meta = enrich_with_youtube_api(vids, api_key)
            for m in meta:
                vid = m["id"]
                results.setdefault(vid, {})["api_meta"] = m
        except Exception as e:
            results["_api_error"] = str(e)
    # quick status check
    for vid in list(results.keys()):
        # skip the "_api_error" entry and accidental channel-id matches;
        # real video ids are exactly 11 characters
        if not vid or len(vid) != 11 or vid.startswith("UC"):
            continue
        try:
            results[vid]["status_check"] = check_video_status(vid)
        except Exception:
            results[vid]["status_check"] = "error"
    return results

# ---------- Example usage ----------
if __name__ == "__main__":
    extra = [
        # add URLs of blogs, Twitter/X posts, GitHub READMEs, or other pages to scan
        # "https://example.com/possible-embed",
    ]
    res = find_possible_videos(CHANNEL_ID, CHANNEL_HANDLE_URL, extra_pages=extra, api_key=None)
    print("Discovered video candidates:", list(res.keys()))
    print(json.dumps(res, indent=2))