zircote · March 20, 2026 16:05
diff --git a/README.md b/README.md
diff --git a/msteams-transcripts.py b/msteams-transcripts.py
 #!/usr/bin/env python3
 """
 teams-transcripts — download Microsoft Teams meeting transcripts from OneDrive

 Setup (one-time):
  1. Go to https://portal.azure.com > App registrations > New registration
  2. Name it anything, set account type to "Accounts in this org directory only"
     (or "any org" if multi-tenant), redirect URI = http://localhost (Public client)
  3. Under "Authentication" > enable "Allow public client flows"
  4. Under "API permissions" > Add > Microsoft Graph > Delegated:
       Files.Read   OnlineMeetings.Read
  5. Copy the Application (client) ID
  6. export TEAMS_CLIENT_ID=<your-client-id>
     export TEAMS_TENANT_ID=<your-tenant-id>   # find in Azure AD > Overview

 Usage:
  python msteams-transcripts.py            # list + download all transcripts
  python msteams-transcripts.py --list     # list only, no download
  python msteams-transcripts.py --days 30  # only transcripts from last N days
  python msteams-transcripts.py --out ~/Downloads/transcripts
  python msteams-transcripts.py --format docx  # download .docx instead of .vtt
 """

 import argparse
 import json
 import os
 import sys
 import time
 from datetime import datetime, timedelta, timezone
 from pathlib import Path

 # msal and requests are third-party packages. If either is missing, print an
 # install hint and exit with status 1 rather than show an ImportError traceback.
 try:
    import msal
    import requests
 except ImportError:
    print("Missing dependencies. Run: pip install msal requests")
    sys.exit(1)

 # ── Config ────────────────────────────────────────────────────────────────────

 # Application (client) ID from the environment; None when the variable is unset.
 CLIENT_ID  = os.environ.get("TEAMS_CLIENT_ID")
 # Tenant used in the login authority URL; falls back to "common" when unset.
 TENANT_ID  = os.environ.get("TEAMS_TENANT_ID", "common")
 # Delegated Microsoft Graph scopes requested during sign-in.
 SCOPES     = ["https://graph.microsoft.com/Files.Read",
              "https://graph.microsoft.com/OnlineMeetings.Read"]
 # Location of the serialized MSAL token cache in the user's home directory.
 CACHE_FILE = Path.home() / ".teams-transcripts-token-cache.json"
 # Root URL prefixed to the Graph request URLs this script builds.
 GRAPH_BASE = "https://graph.microsoft.com/v1.0"

 # ── Auth ──────────────────────────────────────────────────────────────────────

 def build_app():
    """
    Create the MSAL public-client application backed by the on-disk token cache.

    Exits with status 1 when TEAMS_CLIENT_ID is not set. Returns an
    ``(app, cache)`` tuple so the caller can persist the cache afterwards.
    """
    if not CLIENT_ID:
        print("ERROR: TEAMS_CLIENT_ID is not set. See setup instructions at top of script.")
        sys.exit(1)

    token_cache = msal.SerializableTokenCache()
    # Reuse whatever an earlier run saved, if a cache file is present.
    if CACHE_FILE.exists():
        saved_state = CACHE_FILE.read_text()
        token_cache.deserialize(saved_state)

    authority_url = f"https://login.microsoftonline.com/{TENANT_ID}"
    client = msal.PublicClientApplication(
        CLIENT_ID,
        authority=authority_url,
        token_cache=token_cache,
    )
    return client, token_cache


 def get_token():
    """
    Return a Microsoft Graph access token for the signed-in user.

    Tries the first cached account silently, then falls back to the
    interactive device-code flow. Exits with status 1 if the device flow
    cannot be started or no access token comes back. On success the token
    cache is persisted to CACHE_FILE with owner-only permissions.
    """
    app, cache = build_app()
    accounts = app.get_accounts()
    result = None

    if accounts:
        result = app.acquire_token_silent(SCOPES, account=accounts[0])

    if not result:
        flow = app.initiate_device_flow(scopes=SCOPES)
        if "user_code" not in flow:
            print(f"Device flow failed: {flow}")
            sys.exit(1)
        print(f"\n{flow['message']}\n")
        result = app.acquire_token_by_device_flow(flow)

    if "access_token" not in result:
        print(f"Auth error: {result.get('error_description', result)}")
        sys.exit(1)

    # Rewrite the cache only when MSAL reports a change, and create the file
    # with mode 0600 from the start: write_text() followed by chmod() leaves
    # the tokens readable under the default umask for a short window.
    if cache.has_state_changed or not CACHE_FILE.exists():
        fd = os.open(CACHE_FILE, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        with os.fdopen(fd, "w") as handle:
            handle.write(cache.serialize())
    # os.open() does not change the mode of a pre-existing file, so tighten it too.
    CACHE_FILE.chmod(0o600)
    return result["access_token"]

 # ── Graph helpers ─────────────────────────────────────────────────────────────

 def graph_get(token, url, params=None, max_retries=5, timeout=30):
    """
    GET a Microsoft Graph URL and return the decoded JSON body.

    Args:
        token: Bearer access token.
        url: Absolute request URL.
        params: Optional query parameters.
        max_retries: How many times a throttled (HTTP 429) request is retried
            before giving up.
        timeout: Seconds to wait on the server before requests raises.

    Raises:
        requests.HTTPError: For any error status, including a 429 that
            persists after ``max_retries`` retries.
    """
    headers = {"Authorization": f"Bearer {token}"}
    attempt = 0
    # Loop instead of recursing so persistent throttling cannot exhaust the stack.
    while True:
        resp = requests.get(url, headers=headers, params=params, timeout=timeout)
        if resp.status_code != 429 or attempt >= max_retries:
            break
        attempt += 1
        try:
            wait = max(int(resp.headers.get("Retry-After", 5)), 0)
        except ValueError:
            # Retry-After may also be an HTTP-date; fall back to the default delay.
            wait = 5
        print(f"  rate limited — waiting {wait}s...")
        time.sleep(wait)
    resp.raise_for_status()
    return resp.json()


 def graph_get_all(token, url, params=None):
    """
    Collect the ``value`` entries of every page of a Graph collection.

    Starts at ``url`` and keeps requesting the ``@odata.nextLink`` of each
    response until a page has none. ``params`` is sent with the first
    request only.
    """
    collected = []
    next_url, query = url, params
    while next_url:
        page = graph_get(token, next_url, query)
        collected += page.get("value", [])
        # Later requests use the returned link without the initial params.
        query = None
        next_url = page.get("@odata.nextLink")
    return collected

 # ── Transcript discovery ──────────────────────────────────────────────────────

 def find_transcripts_onedrive(token, days=None, fmt="vtt"):
    """
    Search OneDrive for Teams transcript files.
    Teams stores them under: /Recording/ or /Microsoft Teams Data/
    as <meeting-title>.vtt and <meeting-title>.docx

    Args:
        token: Graph access token.
        days: When truthy, keep only items modified within the last ``days``
            days; None (or 0) applies no date filter.
        fmt: File extension to search for, without the leading dot.

    Returns:
        List of drive-item dicts whose name ends with the extension and whose
        parent path contains one of the Teams folder markers.
    """
    ext = f".{fmt}"
    url = f"{GRAPH_BASE}/me/drive/search(q='{ext}')"
    params = {"$top": 200, "$orderby": "lastModifiedDateTime desc",
              "$select": "id,name,lastModifiedDateTime,parentReference,@microsoft.graph.downloadUrl,size"}
    items = graph_get_all(token, url, params)

    cutoff = None
    if days:
        cutoff = datetime.now(timezone.utc) - timedelta(days=days)

    suffix = ext.lower()
    markers = ("Recording", "Transcripts", "Microsoft Teams")
    results = []
    for item in items:
        # Compare case-insensitively so a name such as "Standup.VTT" still matches.
        if not item["name"].lower().endswith(suffix):
            continue
        # Filter to Teams transcript paths; tolerate a null parentReference.
        path = (item.get("parentReference") or {}).get("path", "")
        if not any(marker in path for marker in markers):
            continue
        modified = datetime.fromisoformat(item["lastModifiedDateTime"].replace("Z", "+00:00"))
        if cutoff and modified < cutoff:
            continue
        results.append(item)

    return results

 # ── Download ──────────────────────────────────────────────────────────────────

 def sanitize(name):
    """Return ``name`` with every character that is not alphanumeric, a space, '.', '_' or '-' replaced by '_'."""
    safe_punct = " ._-"
    cleaned = []
    for ch in name:
        keep = ch.isalnum() or ch in safe_punct
        cleaned.append(ch if keep else "_")
    return "".join(cleaned)


 def download_transcript(token, item, output_dir, dry_run=False, timeout=60):
    """
    Download one transcript drive item into ``output_dir``.

    The local file name is the first ten characters of the item's
    lastModifiedDateTime followed by the sanitized item name. Existing files
    are never overwritten.

    Args:
        token: Graph access token, used only for the content-endpoint fallback.
        item: Drive-item dict from the search results.
        output_dir: Destination directory (a Path).
        dry_run: When true, report what would be downloaded and write nothing.
        timeout: Seconds to wait on the server before requests raises.

    Returns:
        True if a new file was written, False if the item was skipped.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    modified = item["lastModifiedDateTime"][:10]
    name = sanitize(item["name"])
    filename = f"{modified}_{name}"
    dest = output_dir / filename
    size_kb = item.get("size", 0) // 1024

    if dest.exists():
        print(f"  skip  {filename}  (already exists)")
        return False

    if dry_run:
        print(f"  would download  {filename}  ({size_kb} KB)")
        return False

    url = item.get("@microsoft.graph.downloadUrl")
    headers = None
    if not url:
        # Fall back to the authenticated content endpoint.
        file_id = item["id"]
        drive_id = (item.get("parentReference") or {}).get("driveId")
        if drive_id:
            url = f"{GRAPH_BASE}/drives/{drive_id}/items/{file_id}/content"
        else:
            # An empty drive id would give a malformed /drives//items/ URL.
            # NOTE(review): assumes the item lives in the user's own drive - confirm.
            url = f"{GRAPH_BASE}/me/drive/items/{file_id}/content"
        headers = {"Authorization": f"Bearer {token}"}

    # Stream into a temporary sibling and rename at the end, so an interrupted
    # download never leaves a truncated file that later runs would skip.
    partial = dest.with_name(dest.name + ".part")
    try:
        with requests.get(url, headers=headers, allow_redirects=True,
                          stream=True, timeout=timeout) as resp:
            resp.raise_for_status()
            with open(partial, "wb") as out:
                for chunk in resp.iter_content(chunk_size=65536):
                    out.write(chunk)
        partial.replace(dest)
    finally:
        # Only present here if the download or rename failed.
        if partial.exists():
            partial.unlink()
    print(f"  ✓  {filename}  ({size_kb} KB)")
    return True

 # ── Main ──────────────────────────────────────────────────────────────────────

 def main():
    """
    Command-line entry point.

    Parses the options, optionally clears the cached credentials, then
    authenticates, lists the matching transcripts and downloads them unless
    --list was given. Exits with status 1 if the search fails or any
    download fails.
    """
    parser = argparse.ArgumentParser(description="Download Teams meeting transcripts")
    parser.add_argument("--list",   action="store_true", help="List only, no download")
    parser.add_argument("--days",   type=int, default=None, metavar="N",
                        help="Only transcripts from last N days")
    parser.add_argument("--out",    type=Path, default=Path.home() / "teams-transcripts",
                        help="Output directory (default: ~/teams-transcripts)")
    parser.add_argument("--format", choices=["vtt", "docx"], default="vtt",
                        help="File format to download (default: vtt)")
    parser.add_argument("--logout", action="store_true", help="Clear cached credentials")
    args = parser.parse_args()

    # Reject a non-positive window up front: 0 would silently mean "all time"
    # and a negative value would put the cutoff in the future.
    if args.days is not None and args.days <= 0:
        parser.error("--days must be a positive integer")

    if args.logout:
        if CACHE_FILE.exists():
            CACHE_FILE.unlink()
            print("Credentials cleared.")
        else:
            print("No cached credentials found.")
        return

    print("Authenticating with Microsoft...")
    token = get_token()
    print("Authenticated.\n")

    label = f"last {args.days} days" if args.days else "all time"
    print(f"Searching OneDrive for .{args.format} transcripts ({label})...")
    try:
        items = find_transcripts_onedrive(token, days=args.days, fmt=args.format)
    except requests.RequestException as exc:
        # Report a failed search as a message rather than a traceback.
        print(f"Search failed: {exc}")
        sys.exit(1)

    if not items:
        print("No transcripts found.")
        return

    print(f"Found {len(items)} transcript(s):\n")
    for item in items:
        size_kb = item.get("size", 0) // 1024
        modified = item["lastModifiedDateTime"][:10]
        path = item.get("parentReference", {}).get("path", "")
        print(f"  {modified}  {item['name']}  ({size_kb} KB)")
        print(f"           {path}")

    if args.list:
        return

    args.out.mkdir(parents=True, exist_ok=True)
    print(f"\nDownloading to {args.out}/\n")
    downloaded = 0
    failed = 0
    for item in items:
        try:
            if download_transcript(token, item, args.out):
                downloaded += 1
        except (requests.RequestException, OSError) as exc:
            # One bad file should not abort the remaining downloads.
            failed += 1
            print(f"  ✗  {item['name']}  ({exc})")

    print(f"\nDone. {downloaded} new file(s) downloaded.")
    if failed:
        print(f"{failed} download(s) failed.")
        sys.exit(1)


 # Run the command-line interface only when executed as a script, not on import.
 if __name__ == "__main__":
    main()
Option	Default	Description
`--days N`	None (all time)	Limit to transcripts modified in the last N days
`--out PATH`	`~/teams-transcripts`	Directory to save downloaded files
`--format vtt\|docx`	`vtt`	File format to download
`--list`	off	Print transcript list, skip download
`--logout`	off	Clear the cached token and exit
Variable	Required	Description
`TEAMS_CLIENT_ID`	Yes	Application (client) ID from Azure app registration
`TEAMS_TENANT_ID`	No (default: `common`)	Directory (tenant) ID from Azure AD Overview
Permission	Type	Purpose
`Files.Read`	Delegated	Search and download files from the signed-in user's OneDrive
`OnlineMeetings.Read`	Delegated	Read meeting metadata
Package	Version	Purpose
`msal`	>= 1.20	Microsoft Authentication Library for token acquisition and caching
`requests`	>= 2.28	HTTP client for Graph API calls and file downloads
Variable	Example	Use
`{date}`	`2024-03-15`	Anchor the report to a specific day
`{team}`	`Platform Engineering`	Focus extraction on team-relevant items
`{output_format}`	`Markdown`, `plain text`	Control rendering
`{max_length}`	`500 words`	Cap verbose output
Model	Best for
`claude-haiku-4-5`	Fast, cheap batch processing of many short meetings
`claude-sonnet-4-5`	General use, good balance of quality and cost
`claude-opus-4-6`	Long or complex transcripts where nuance matters
	#!/usr/bin/env python3
	"""
	teams-transcripts — download Microsoft Teams meeting transcripts from OneDrive

	Setup (one-time):
	1. Go to https://portal.azure.com > App registrations > New registration
	2. Name it anything, set account type to "Accounts in this org directory only"
	(or "any org" if multi-tenant), redirect URI = http://localhost (Public client)
	3. Under "Authentication" > enable "Allow public client flows"
	4. Under "API permissions" > Add > Microsoft Graph > Delegated:
	Files.Read OnlineMeetings.Read
	5. Copy the Application (client) ID
	6. export TEAMS_CLIENT_ID=<your-client-id>
	export TEAMS_TENANT_ID=<your-tenant-id> # find in Azure AD > Overview

	Usage:
	python msteams-transcripts.py # list + download all transcripts
	python msteams-transcripts.py --list # list only, no download
	python msteams-transcripts.py --days 30 # only transcripts from last N days
	python msteams-transcripts.py --out ~/Downloads/transcripts
	python msteams-transcripts.py --format docx # download .docx instead of .vtt
	"""

	import argparse
	import json
	import os
	import sys
	import time
	from datetime import datetime, timedelta, timezone
	from pathlib import Path

	# msal and requests are third-party packages. If either is missing, print an
	# install hint and exit with status 1 rather than show an ImportError traceback.
	try:
	    import msal
	    import requests
	except ImportError:
	    print("Missing dependencies. Run: pip install msal requests")
	    sys.exit(1)

	# ── Config ────────────────────────────────────────────────────────────────────

	# Application (client) ID from the environment; None when the variable is unset.
	CLIENT_ID = os.environ.get("TEAMS_CLIENT_ID")
	# Tenant used in the login authority URL; falls back to "common" when unset.
	TENANT_ID = os.environ.get("TEAMS_TENANT_ID", "common")
	# Delegated Microsoft Graph scopes requested during sign-in.
	SCOPES = ["https://graph.microsoft.com/Files.Read",
	"https://graph.microsoft.com/OnlineMeetings.Read"]
	# Location of the serialized MSAL token cache in the user's home directory.
	CACHE_FILE = Path.home() / ".teams-transcripts-token-cache.json"
	# Root URL prefixed to the Graph request URLs this script builds.
	GRAPH_BASE = "https://graph.microsoft.com/v1.0"

	# ── Auth ──────────────────────────────────────────────────────────────────────

	def build_app():
	    """
	    Create the MSAL public-client application backed by the on-disk token cache.

	    Exits with status 1 when TEAMS_CLIENT_ID is not set. Returns an
	    ``(app, cache)`` tuple so the caller can persist the cache afterwards.
	    """
	    if not CLIENT_ID:
	        print("ERROR: TEAMS_CLIENT_ID is not set. See setup instructions at top of script.")
	        sys.exit(1)
	    cache = msal.SerializableTokenCache()
	    # Reuse whatever an earlier run saved, if a cache file is present.
	    if CACHE_FILE.exists():
	        cache.deserialize(CACHE_FILE.read_text())
	    app = msal.PublicClientApplication(
	        CLIENT_ID,
	        authority=f"https://login.microsoftonline.com/{TENANT_ID}",
	        token_cache=cache,
	    )
	    return app, cache


	def get_token():
	    """
	    Return a Microsoft Graph access token for the signed-in user.

	    Tries the first cached account silently, then falls back to the
	    interactive device-code flow. Exits with status 1 if the device flow
	    cannot be started or no access token comes back. On success the token
	    cache is persisted to CACHE_FILE with owner-only permissions.
	    """
	    app, cache = build_app()
	    accounts = app.get_accounts()
	    result = None

	    if accounts:
	        result = app.acquire_token_silent(SCOPES, account=accounts[0])

	    if not result:
	        flow = app.initiate_device_flow(scopes=SCOPES)
	        if "user_code" not in flow:
	            print(f"Device flow failed: {flow}")
	            sys.exit(1)
	        print(f"\n{flow['message']}\n")
	        result = app.acquire_token_by_device_flow(flow)

	    if "access_token" not in result:
	        print(f"Auth error: {result.get('error_description', result)}")
	        sys.exit(1)

	    # Rewrite the cache only when MSAL reports a change, and create the file
	    # with mode 0600 from the start: write_text() followed by chmod() leaves
	    # the tokens readable under the default umask for a short window.
	    if cache.has_state_changed or not CACHE_FILE.exists():
	        fd = os.open(CACHE_FILE, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
	        with os.fdopen(fd, "w") as handle:
	            handle.write(cache.serialize())
	    # os.open() does not change the mode of a pre-existing file, so tighten it too.
	    CACHE_FILE.chmod(0o600)
	    return result["access_token"]

	# ── Graph helpers ─────────────────────────────────────────────────────────────

	def graph_get(token, url, params=None, max_retries=5, timeout=30):
	    """
	    GET a Microsoft Graph URL and return the decoded JSON body.

	    Args:
	        token: Bearer access token.
	        url: Absolute request URL.
	        params: Optional query parameters.
	        max_retries: How many times a throttled (HTTP 429) request is retried
	            before giving up.
	        timeout: Seconds to wait on the server before requests raises.

	    Raises:
	        requests.HTTPError: For any error status, including a 429 that
	            persists after ``max_retries`` retries.
	    """
	    headers = {"Authorization": f"Bearer {token}"}
	    attempt = 0
	    # Loop instead of recursing so persistent throttling cannot exhaust the stack.
	    while True:
	        resp = requests.get(url, headers=headers, params=params, timeout=timeout)
	        if resp.status_code != 429 or attempt >= max_retries:
	            break
	        attempt += 1
	        try:
	            wait = max(int(resp.headers.get("Retry-After", 5)), 0)
	        except ValueError:
	            # Retry-After may also be an HTTP-date; fall back to the default delay.
	            wait = 5
	        print(f" rate limited — waiting {wait}s...")
	        time.sleep(wait)
	    resp.raise_for_status()
	    return resp.json()


	def graph_get_all(token, url, params=None):
	    """
	    Collect the ``value`` entries of every page of a Graph collection.

	    Starts at ``url`` and keeps requesting the ``@odata.nextLink`` of each
	    response until a page has none. ``params`` is sent with the first
	    request only.
	    """
	    results = []
	    while url:
	        data = graph_get(token, url, params)
	        results.extend(data.get("value", []))
	        url = data.get("@odata.nextLink")
	        params = None  # only on first call
	    return results

	# ── Transcript discovery ──────────────────────────────────────────────────────

	def find_transcripts_onedrive(token, days=None, fmt="vtt"):
	    """
	    Search OneDrive for Teams transcript files.
	    Teams stores them under: /Recording/ or /Microsoft Teams Data/
	    as <meeting-title>.vtt and <meeting-title>.docx

	    Args:
	        token: Graph access token.
	        days: When truthy, keep only items modified within the last ``days``
	            days; None (or 0) applies no date filter.
	        fmt: File extension to search for, without the leading dot.

	    Returns:
	        List of drive-item dicts whose name ends with the extension and whose
	        parent path contains one of the Teams folder markers.
	    """
	    ext = f".{fmt}"
	    url = f"{GRAPH_BASE}/me/drive/search(q='{ext}')"
	    params = {"$top": 200, "$orderby": "lastModifiedDateTime desc",
	              "$select": "id,name,lastModifiedDateTime,parentReference,@microsoft.graph.downloadUrl,size"}
	    items = graph_get_all(token, url, params)

	    cutoff = None
	    if days:
	        cutoff = datetime.now(timezone.utc) - timedelta(days=days)

	    suffix = ext.lower()
	    markers = ("Recording", "Transcripts", "Microsoft Teams")
	    results = []
	    for item in items:
	        # Compare case-insensitively so a name such as "Standup.VTT" still matches.
	        if not item["name"].lower().endswith(suffix):
	            continue
	        # Filter to Teams transcript paths; tolerate a null parentReference.
	        path = (item.get("parentReference") or {}).get("path", "")
	        if not any(marker in path for marker in markers):
	            continue
	        modified = datetime.fromisoformat(item["lastModifiedDateTime"].replace("Z", "+00:00"))
	        if cutoff and modified < cutoff:
	            continue
	        results.append(item)

	    return results

	# ── Download ──────────────────────────────────────────────────────────────────

	def sanitize(name):
	    """Return ``name`` with every character that is not alphanumeric, a space, '.', '_' or '-' replaced by '_'."""
	    return "".join(c if c.isalnum() or c in " ._-" else "_" for c in name)


	def download_transcript(token, item, output_dir, dry_run=False, timeout=60):
	    """
	    Download one transcript drive item into ``output_dir``.

	    The local file name is the first ten characters of the item's
	    lastModifiedDateTime followed by the sanitized item name. Existing files
	    are never overwritten.

	    Args:
	        token: Graph access token, used only for the content-endpoint fallback.
	        item: Drive-item dict from the search results.
	        output_dir: Destination directory (a Path).
	        dry_run: When true, report what would be downloaded and write nothing.
	        timeout: Seconds to wait on the server before requests raises.

	    Returns:
	        True if a new file was written, False if the item was skipped.

	    Raises:
	        requests.HTTPError: If the server answers with an error status.
	    """
	    modified = item["lastModifiedDateTime"][:10]
	    name = sanitize(item["name"])
	    filename = f"{modified}_{name}"
	    dest = output_dir / filename
	    size_kb = item.get("size", 0) // 1024

	    if dest.exists():
	        print(f" skip {filename} (already exists)")
	        return False

	    if dry_run:
	        print(f" would download {filename} ({size_kb} KB)")
	        return False

	    url = item.get("@microsoft.graph.downloadUrl")
	    headers = None
	    if not url:
	        # Fall back to the authenticated content endpoint.
	        file_id = item["id"]
	        drive_id = (item.get("parentReference") or {}).get("driveId")
	        if drive_id:
	            url = f"{GRAPH_BASE}/drives/{drive_id}/items/{file_id}/content"
	        else:
	            # An empty drive id would give a malformed /drives//items/ URL.
	            # NOTE(review): assumes the item lives in the user's own drive - confirm.
	            url = f"{GRAPH_BASE}/me/drive/items/{file_id}/content"
	        headers = {"Authorization": f"Bearer {token}"}

	    # Stream into a temporary sibling and rename at the end, so an interrupted
	    # download never leaves a truncated file that later runs would skip.
	    partial = dest.with_name(dest.name + ".part")
	    try:
	        with requests.get(url, headers=headers, allow_redirects=True,
	                          stream=True, timeout=timeout) as resp:
	            resp.raise_for_status()
	            with open(partial, "wb") as out:
	                for chunk in resp.iter_content(chunk_size=65536):
	                    out.write(chunk)
	        partial.replace(dest)
	    finally:
	        # Only present here if the download or rename failed.
	        if partial.exists():
	            partial.unlink()
	    print(f" ✓ {filename} ({size_kb} KB)")
	    return True

	# ── Main ──────────────────────────────────────────────────────────────────────

	def main():
	    """
	    Command-line entry point.

	    Parses the options, optionally clears the cached credentials, then
	    authenticates, lists the matching transcripts and downloads them unless
	    --list was given. Exits with status 1 if the search fails or any
	    download fails.
	    """
	    parser = argparse.ArgumentParser(description="Download Teams meeting transcripts")
	    parser.add_argument("--list", action="store_true", help="List only, no download")
	    parser.add_argument("--days", type=int, default=None, metavar="N",
	                        help="Only transcripts from last N days")
	    parser.add_argument("--out", type=Path, default=Path.home() / "teams-transcripts",
	                        help="Output directory (default: ~/teams-transcripts)")
	    parser.add_argument("--format", choices=["vtt", "docx"], default="vtt",
	                        help="File format to download (default: vtt)")
	    parser.add_argument("--logout", action="store_true", help="Clear cached credentials")
	    args = parser.parse_args()

	    # Reject a non-positive window up front: 0 would silently mean "all time"
	    # and a negative value would put the cutoff in the future.
	    if args.days is not None and args.days <= 0:
	        parser.error("--days must be a positive integer")

	    if args.logout:
	        if CACHE_FILE.exists():
	            CACHE_FILE.unlink()
	            print("Credentials cleared.")
	        else:
	            print("No cached credentials found.")
	        return

	    print("Authenticating with Microsoft...")
	    token = get_token()
	    print("Authenticated.\n")

	    label = f"last {args.days} days" if args.days else "all time"
	    print(f"Searching OneDrive for .{args.format} transcripts ({label})...")
	    try:
	        items = find_transcripts_onedrive(token, days=args.days, fmt=args.format)
	    except requests.RequestException as exc:
	        # Report a failed search as a message rather than a traceback.
	        print(f"Search failed: {exc}")
	        sys.exit(1)

	    if not items:
	        print("No transcripts found.")
	        return

	    print(f"Found {len(items)} transcript(s):\n")
	    for item in items:
	        size_kb = item.get("size", 0) // 1024
	        modified = item["lastModifiedDateTime"][:10]
	        path = item.get("parentReference", {}).get("path", "")
	        print(f" {modified} {item['name']} ({size_kb} KB)")
	        print(f" {path}")

	    if args.list:
	        return

	    args.out.mkdir(parents=True, exist_ok=True)
	    print(f"\nDownloading to {args.out}/\n")
	    downloaded = 0
	    failed = 0
	    for item in items:
	        try:
	            if download_transcript(token, item, args.out):
	                downloaded += 1
	        except (requests.RequestException, OSError) as exc:
	            # One bad file should not abort the remaining downloads.
	            failed += 1
	            print(f" ✗ {item['name']} ({exc})")

	    print(f"\nDone. {downloaded} new file(s) downloaded.")
	    if failed:
	        print(f"{failed} download(s) failed.")
	        sys.exit(1)


	# Run the command-line interface only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()