import re
import time
import json
import html
import requests
import feedparser
from bs4 import BeautifulSoup
from urllib.parse import urljoin, parse_qs, urlparse

CHANNEL_ID = ""  # aCYGczpunIkljwZ83O1z1o0k
CHANNEL_HANDLE_URL = "https://www.youtube.com/@ChannelName"
RSS_FEED = f"https://www.youtube.com/feeds/videos.xml?channel_id={CHANNEL_ID}"

HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; pytube-bot/1.0; +https://example.com/bot)"
}

VIDEO_ID_RE = re.compile(r"(?:v=|/embed/|/watch\?v=|/v/|youtu\.be/)([A-Za-z0-9_-]{11})")
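# For reference, the pattern pulls the 11-character id out of the common URL shapes,
# e.g. with a placeholder id:
#   VIDEO_ID_RE.search("https://youtu.be/AAAAAAAAAAA").group(1)  ->  "AAAAAAAAAAA"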

# ---------- 1) RSS fetch (public uploads only) ----------
def get_rss_videos(channel_id):
    feed_url = f"https://www.youtube.com/feeds/videos.xml?channel_id={channel_id}"
    feed = feedparser.parse(feed_url)
    videos = []
    for e in feed.entries:
        # feedparser usually exposes yt_videoid for YouTube feeds; fall back to
        # extracting the id from the link, guarding against a non-matching URL
        id_match = VIDEO_ID_RE.search(e.link)
        videos.append({
            "video_id": getattr(e, "yt_videoid", None) or (id_match.group(1) if id_match else None),
            "title": e.title,
            "link": e.link,
            "published": e.published,
        })
    return videos
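
# Hedged usage sketch (not part of the original gist; the channel id below is a
# placeholder): the uploads feed only exposes roughly the 15 most recent public
# videos, so treat it as a sanity check rather than a complete listing.
def _demo_rss(channel_id="UCxxxxxxxxxxxxxxxxxxxxxx"):
    for v in get_rss_videos(channel_id):
        print(v["published"], v["video_id"], v["title"])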

# ---------- 2) Scrape channel page for ytInitialData / embedded IDs ----------
def get_videoids_from_channel_page(handle_url):
    r = requests.get(handle_url, headers=HEADERS, timeout=15)
    # Decode basic HTML escapes
    text_unesc = html.unescape(r.text)
    # 1) Try to isolate the ytInitialData blob and scan it for videoId values.
    #    The captured blob is often not valid JSON on its own, so a regex scan is
    #    more robust here than json.loads().
    m = re.search(
        r"ytInitialData\"\s*:\s*(\{.*?\})\s*,\s*\"ytInitialPlayerResponse",
        text_unesc,
        flags=re.DOTALL,
    )
    if m:
        vids = set(re.findall(r'"videoId"\s*:\s*"([A-Za-z0-9_-]{11})"', m.group(1)))
        if vids:
            return list(vids)
    # 2) Fallback: regex search for any video ids anywhere in the markup
    vids = set(VIDEO_ID_RE.findall(text_unesc))
    return list(vids)
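
# Hedged alternative (an assumption about current page markup, not part of the
# original gist): channel pages commonly inline the blob as
# "var ytInitialData = {...};", so a looser pattern can be tried when the stricter
# regex above finds nothing.
def get_videoids_from_initial_data_var(handle_url):
    r = requests.get(handle_url, headers=HEADERS, timeout=15)
    m = re.search(r"var ytInitialData\s*=\s*(\{.+?\})\s*;", r.text, flags=re.DOTALL)
    if not m:
        return []
    return list(set(re.findall(r'"videoId"\s*:\s*"([A-Za-z0-9_-]{11})"', m.group(1))))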

# ---------- 3) Find public playlists from the /playlists page and extract videos ----------
def get_public_playlists_for_channel(handle_url):
    playlists = set()
    playlists_page = handle_url.rstrip("/") + "/playlists"
    r = requests.get(playlists_page, headers=HEADERS, timeout=15)
    soup = BeautifulSoup(r.text, "html.parser")
    # find playlist links in anchor tags
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if "list=" in href:
            parsed = urlparse(href)
            q = parse_qs(parsed.query)
            if "list" in q:
                playlists.add(q["list"][0])
    return list(playlists)

def get_videos_from_playlist_page(playlist_id):
    url = f"https://www.youtube.com/playlist?list={playlist_id}"
    r = requests.get(url, headers=HEADERS, timeout=15)
    # many playlist pages include the video ids in the HTML as "data-video-id" or in JSON
    vids = set(re.findall(r'data-video-id="([A-Za-z0-9_-]{11})"', r.text))
    # fallback to other patterns
    vids.update(VIDEO_ID_RE.findall(r.text))
    return list(vids)
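
# Hedged usage sketch (uses the placeholder handle URL above; not part of the
# original gist):
def _demo_playlists(handle_url=CHANNEL_HANDLE_URL):
    for pl in get_public_playlists_for_channel(handle_url):
        vids = get_videos_from_playlist_page(pl)
        print(f"playlist {pl}: {len(vids)} video id(s) found")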

# ---------- 4) Optionally enrich via YouTube Data API (if you supply API key) ----------
def enrich_with_youtube_api(video_ids, api_key):
    # This function uses the official API client if available.
    # You must pip install google-api-python-client and pass your API key.
    from googleapiclient.discovery import build
    youtube = build("youtube", "v3", developerKey=api_key)
    out = []
    # chunk requests to 50 ids per call (the API's maximum per request)
    for i in range(0, len(video_ids), 50):
        chunk = video_ids[i:i + 50]
        resp = youtube.videos().list(part="snippet,contentDetails,status", id=",".join(chunk)).execute()
        for item in resp.get("items", []):
            out.append({
                "id": item["id"],
                "title": item["snippet"]["title"],
                "status": item.get("status", {}),
                "publishedAt": item["snippet"].get("publishedAt"),
            })
    return out
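
# Hedged usage sketch: requires `pip install google-api-python-client` and a real
# Data API key (the value below is a placeholder). Note that videos.list only
# returns items the API can see, so other people's private videos simply do not
# appear in the response.
def _demo_api_enrichment(video_ids, api_key="YOUR_API_KEY"):
    for meta in enrich_with_youtube_api(video_ids, api_key):
        print(meta["id"], meta["status"].get("privacyStatus"), meta["title"])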

# ---------- 5) Crawl an external list of pages to find embedded ids ----------
def find_videoids_in_pages(urls):
    found = set()
    for u in urls:
        try:
            r = requests.get(u, headers=HEADERS, timeout=12)
            found.update(VIDEO_ID_RE.findall(r.text))
        except Exception:
            continue
    return list(found)
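
# Hedged usage sketch (placeholder URL): scan pages you are allowed to crawl for
# embedded or linked YouTube ids, e.g.
#   ids = find_videoids_in_pages(["https://example.com/blog-post"])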

# ---------- 6) Check video watch page for availability (quick heuristic) ----------
def check_video_status(video_id):
    watch_url = f"https://www.youtube.com/watch?v={video_id}"
    r = requests.get(watch_url, headers=HEADERS, timeout=12)
    text = r.text
    # heuristics:
    if "This video is private" in text or "is private" in text:
        return "private"
    if "has been removed" in text or "Video unavailable" in text:
        return "unavailable"
    # If page loads and contains player, treat as accessible (public or unlisted)
    if "player" in text or "ytplayer" in text:
        return "accessible"
    return "unknown"

# ---------- Orchestrator: try multiple methods and merge results ----------
def find_possible_videos(channel_id, handle_url, extra_pages=None, api_key=None):
    extra_pages = extra_pages or []
    results = {}
    # RSS
    try:
        rss = get_rss_videos(channel_id)
        for v in rss:
            results[v["video_id"]] = {"source": "rss", **v}
    except Exception:
        pass
    # channel page scrape
    try:
        vids = get_videoids_from_channel_page(handle_url)
        for vid in vids:
            results.setdefault(vid, {})["source_channel_page"] = True
    except Exception:
        pass
    # playlists
    try:
        pls = get_public_playlists_for_channel(handle_url)
        for pl in pls:
            vids = get_videos_from_playlist_page(pl)
            for vid in vids:
                results.setdefault(vid, {})["source_playlist"] = pl
    except Exception:
        pass
    # external pages
    if extra_pages:
        found = find_videoids_in_pages(extra_pages)
        for vid in found:
            results.setdefault(vid, {})["source_external"] = True
    # optional API enrichment
    if api_key and results:
        vids = list(results.keys())
        try:
            meta = enrich_with_youtube_api(vids, api_key)
            for m in meta:
                vid = m["id"]
                results.setdefault(vid, {})["api_meta"] = m
        except Exception as e:
            results["_api_error"] = str(e)
    # quick status check
    for vid in list(results.keys()):
        # skip the "_api_error" entry and accidental channel-id matches;
        # real video ids are exactly 11 characters
        if not vid or len(vid) != 11 or vid.startswith("UC"):
            continue
        try:
            results[vid]["status_check"] = check_video_status(vid)
        except Exception:
            results[vid]["status_check"] = "error"
    return results

# ---------- Example usage ----------
if __name__ == "__main__":
    extra = [
        # add URLs of blogs, Twitter/X posts, GitHub READMEs, or other pages to scan
        # "https://example.com/possible-embed",
    ]
    res = find_possible_videos(CHANNEL_ID, CHANNEL_HANDLE_URL, extra_pages=extra, api_key=None)
    print("Discovered video candidates:", list(res.keys()))
    print(json.dumps(res, indent=2))