Skip to content

Instantly share code, notes, and snippets.

@softyoda
Created February 5, 2024 20:09
Show Gist options
  • Save softyoda/939a3ba825894d157726a248de536f2b to your computer and use it in GitHub Desktop.
Save softyoda/939a3ba825894d157726a248de536f2b to your computer and use it in GitHub Desktop.
Scrape the SRT subtitles, description, and title of every video of a YouTube channel.
import requests
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import SRTFormatter
import json
from datetime import timedelta
def download_channel_videos(api_key, channel_id):
    """Download metadata and French subtitles for every video of a YouTube channel.

    Results are stored in 'channel_videos.json'. If that file already exists it
    is loaded first and previously saved videos are skipped, so an interrupted
    run can be resumed.

    Args:
        api_key: YouTube Data API v3 key.
        channel_id: ID of the YouTube channel to scrape.
    """
    base_url = "https://www.googleapis.com/youtube/v3"
    search_url = f"{base_url}/search?key={api_key}&channelId={channel_id}&part=snippet,id&order=date&maxResults=50"
    video_details_list = []
    try:
        # Load the existing JSON if present (enables resuming a previous run).
        with open('channel_videos.json', 'r', encoding='utf-8') as existing_file:
            video_details_list = json.load(existing_file)
    except FileNotFoundError:
        pass
    total_videos = len(video_details_list)  # Number of videos already saved.
    videos_url = search_url
    while True:
        response = requests.get(videos_url)
        videos = response.json()
        for video in videos.get('items', []):
            # Search results also contain playlists/channels; keep only videos.
            if video['id']['kind'] != "youtube#video":
                continue
            video_id = video['id']['videoId']
            # Skip videos already recorded in the existing JSON.
            existing_video = next((v for v in video_details_list if v['video_id'] == video_id), None)
            if existing_video:
                print(f"Video ID: {video_id} already exists in JSON.")
                print(f"({total_videos}/{videos['pageInfo']['totalResults']})")
                continue
            video_data = {
                'video_id': video_id,
                'video_full_url': f"https://www.youtube.com/watch?v={video_id}"
            }
            # Fetch the video's metadata.
            video_details_url = f"{base_url}/videos?id={video_id}&key={api_key}&part=snippet,statistics,contentDetails"
            video_details_response = requests.get(video_details_url)
            video_details = video_details_response.json()
            if video_details['items']:
                item = video_details['items'][0]
                snippet = item['snippet']
                statistics = item['statistics']
                content_details = item['contentDetails']
                video_data['video_title'] = snippet['title']
                video_data['video_description'] = snippet['description']
                video_data['video_view_count'] = statistics['viewCount']
                # Video duration, kept in ISO 8601 format (e.g. "PT4M13S").
                video_data['video_length'] = content_details['duration']
                # Publication date of the video.
                video_data['release_date'] = snippet['publishedAt']
            # Fetch subtitles (best-effort: log and continue on any failure).
            try:
                transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
                transcript = transcript_list.find_transcript(['fr'])
                # BUGFIX: fetch once and reuse — the original called
                # transcript.fetch() twice (one extra network round-trip
                # per video).
                entries = transcript.fetch()
                # Full text without timestamps.
                video_data['video_subtitle_text'] = ' '.join([i['text'] for i in entries])
                # Subtitles in SRT format.
                formatter = SRTFormatter()
                video_data['video_subtitle_srt'] = formatter.format_transcript(entries)
            except Exception as e:
                print(f"Error while fetching subtitles for video ID {video_id}: {e}")
            video_details_list.append(video_data)
            total_videos += 1
            print(f"Downloaded details for video ID: {video_id} ({total_videos}/{videos['pageInfo']['totalResults']})")
        if 'nextPageToken' in videos:
            # BUGFIX: rebuild from the base search URL each time. The
            # original appended pageToken to the *previous* URL, so stale
            # pageToken parameters accumulated one per page and the API
            # could keep serving the wrong page.
            videos_url = f"{search_url}&pageToken={videos['nextPageToken']}"
        else:
            break
    with open('channel_videos.json', 'w', encoding='utf-8') as f:
        json.dump(video_details_list, f, ensure_ascii=False, indent=4)
    print("All video details downloaded and saved to 'channel_videos.json'.")
if __name__ == "__main__":
    # Guarded so importing this module does not trigger network calls.
    api_key = 'API_TOKEN'  # Replace with your own API key.
    # Replace with the channel's ID (for aliased channels, the ID is in the
    # page source of the YouTube channel page).
    channel_id = 'CHANNEL_ID'
    download_channel_videos(api_key, channel_id)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment