Skip to content

Instantly share code, notes, and snippets.

@softyoda
Created February 5, 2024 20:09
Show Gist options
  • Save softyoda/939a3ba825894d157726a248de536f2b to your computer and use it in GitHub Desktop.
Save softyoda/939a3ba825894d157726a248de536f2b to your computer and use it in GitHub Desktop.
Scrape the SRT subtitles, description, and title of every video of a YouTube channel.
import requests
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import SRTFormatter
import json
from datetime import timedelta
def download_channel_videos(api_key, channel_id):
    """Download metadata and French subtitles for every video of a YouTube channel.

    Results are stored in 'channel_videos.json'. If that file already exists it
    is loaded first and previously saved videos are skipped, so an interrupted
    run can be resumed.

    Args:
        api_key: YouTube Data API v3 key.
        channel_id: ID of the YouTube channel to scrape.
    """
    base_url = "https://www.googleapis.com/youtube/v3"
    search_url = f"{base_url}/search?key={api_key}&channelId={channel_id}&part=snippet,id&order=date&maxResults=50"
    video_details_list = []
    try:
        # Load the existing JSON if present (enables resuming a previous run).
        with open('channel_videos.json', 'r', encoding='utf-8') as existing_file:
            video_details_list = json.load(existing_file)
    except FileNotFoundError:
        pass
    total_videos = len(video_details_list)  # Number of videos already saved.
    videos_url = search_url
    while True:
        response = requests.get(videos_url)
        videos = response.json()
        for video in videos.get('items', []):
            # Search results also contain playlists/channels; keep only videos.
            if video['id']['kind'] != "youtube#video":
                continue
            video_id = video['id']['videoId']
            # Skip videos already recorded in the existing JSON.
            existing_video = next((v for v in video_details_list if v['video_id'] == video_id), None)
            if existing_video:
                print(f"Video ID: {video_id} already exists in JSON.")
                print(f"({total_videos}/{videos['pageInfo']['totalResults']})")
                continue
            video_data = {
                'video_id': video_id,
                'video_full_url': f"https://www.youtube.com/watch?v={video_id}"
            }
            # Fetch the video's metadata.
            video_details_url = f"{base_url}/videos?id={video_id}&key={api_key}&part=snippet,statistics,contentDetails"
            video_details_response = requests.get(video_details_url)
            video_details = video_details_response.json()
            if video_details['items']:
                item = video_details['items'][0]
                snippet = item['snippet']
                statistics = item['statistics']
                content_details = item['contentDetails']
                video_data['video_title'] = snippet['title']
                video_data['video_description'] = snippet['description']
                video_data['video_view_count'] = statistics['viewCount']
                # Video duration, kept in ISO 8601 format (e.g. "PT4M13S").
                video_data['video_length'] = content_details['duration']
                # Publication date of the video.
                video_data['release_date'] = snippet['publishedAt']
            # Fetch subtitles (best-effort: log and continue on any failure).
            try:
                transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
                transcript = transcript_list.find_transcript(['fr'])
                # BUGFIX: fetch once and reuse — the original called
                # transcript.fetch() twice (one extra network round-trip
                # per video).
                entries = transcript.fetch()
                # Full text without timestamps.
                video_data['video_subtitle_text'] = ' '.join([i['text'] for i in entries])
                # Subtitles in SRT format.
                formatter = SRTFormatter()
                video_data['video_subtitle_srt'] = formatter.format_transcript(entries)
            except Exception as e:
                print(f"Error while fetching subtitles for video ID {video_id}: {e}")
            video_details_list.append(video_data)
            total_videos += 1
            print(f"Downloaded details for video ID: {video_id} ({total_videos}/{videos['pageInfo']['totalResults']})")
        if 'nextPageToken' in videos:
            # BUGFIX: rebuild from the base search URL each time. The
            # original appended pageToken to the *previous* URL, so stale
            # pageToken parameters accumulated one per page and the API
            # could keep serving the wrong page.
            videos_url = f"{search_url}&pageToken={videos['nextPageToken']}"
        else:
            break
    with open('channel_videos.json', 'w', encoding='utf-8') as f:
        json.dump(video_details_list, f, ensure_ascii=False, indent=4)
    print("All video details downloaded and saved to 'channel_videos.json'.")
if __name__ == "__main__":
    # Guarded so importing this module does not trigger network calls.
    api_key = 'API_TOKEN'  # Replace with your own API key.
    # Replace with the channel's ID (for aliased channels, the ID is in the
    # page source of the YouTube channel page).
    channel_id = 'CHANNEL_ID'
    download_channel_videos(api_key, channel_id)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment