Skip to content

Instantly share code, notes, and snippets.

@dirumahrafif
Last active May 24, 2025 22:38
Show Gist options
  • Save dirumahrafif/e1eb94b296b19f614ecbdbd13b2f035b to your computer and use it in GitHub Desktop.
Save dirumahrafif/e1eb94b296b19f614ecbdbd13b2f035b to your computer and use it in GitHub Desktop.
Scraping Data Komentar YouTube

1. Instal Library

pip install google-api-python-client pandas

2. Proses Ambil Satu Data

API key didapatkan dari Google Cloud Console (aktifkan YouTube Data API v3, lalu buat API key)

import os

# YouTube Data API v3 key created in Google Cloud Console.
# Prefer the YOUTUBE_API_KEY environment variable so the real key never has
# to be written into (and accidentally shared with) this script; 'xxx' stays
# as the placeholder fallback and can still be replaced by hand.
def_api_key = os.environ.get('YOUTUBE_API_KEY', 'xxx')

Tambahkan library

from googleapiclient.discovery import build
import pandas as pd
import time

Jalankan Script

# Build the YouTube Data API v3 client, authenticated with the developer key.
youtube = build(serviceName='youtube', version='v3', developerKey=def_api_key)

# ID of the video whose comments will be fetched (sample video).
video_id = 'yQItJNFbkpc'

def _fetch_all_replies(api, parent_id):
    """Return every reply resource posted under the comment *parent_id*."""
    replies = []
    request = api.comments().list(
        part='snippet',
        parentId=parent_id,
        maxResults=100,
        textFormat='plainText'
    )
    while request is not None:
        response = request.execute()
        replies.extend(response.get('items', []))
        request = api.comments().list_next(request, response)
    return replies


def get_comments(video_id, client=None):
    """Collect the top-level comments and replies of one video.

    Args:
        video_id: ID of the video whose comment threads are listed.
        client: Optional YouTube Data API client. When omitted, the
            module-level ``youtube`` client is used.

    Returns:
        A list of dicts with the keys ``author``, ``comment``, ``likes`` and
        ``is_reply``. Each top-level comment is followed by its replies.

    Note:
        ``commentThreads.list`` embeds only some of a thread's replies. When
        ``totalReplyCount`` says more exist, the complete reply list is
        fetched with ``comments.list``, which costs extra API requests.
    """
    api = youtube if client is None else client
    comments = []

    request = api.commentThreads().list(
        part='snippet,replies',
        videoId=video_id,
        maxResults=100,
        textFormat='plainText'
    )

    # list_next() returns None once the last page has been consumed.
    while request is not None:
        response = request.execute()
        for item in response.get('items', []):
            thread = item['snippet']
            top_level = thread['topLevelComment']
            top_comment = top_level['snippet']
            comments.append({
                'author': top_comment['authorDisplayName'],
                'comment': top_comment['textDisplay'],
                'likes': top_comment['likeCount'],
                'is_reply': False
            })

            # Replies embedded in the thread resource may be incomplete.
            replies = item.get('replies', {}).get('comments', [])
            if thread.get('totalReplyCount', 0) > len(replies):
                fetched = _fetch_all_replies(api, top_level['id'])
                # Never end up with fewer replies than were already embedded.
                if len(fetched) >= len(replies):
                    replies = fetched

            for reply in replies:
                reply_snippet = reply['snippet']
                comments.append({
                    'author': reply_snippet['authorDisplayName'],
                    'comment': reply_snippet['textDisplay'],
                    'likes': reply_snippet['likeCount'],
                    'is_reply': True
                })

        request = api.commentThreads().list_next(request, response)

    return comments

# Fetch every comment of the sample video.
all_comments = get_comments(video_id)

# Load the rows into a DataFrame and preview the first five of them.
df = pd.DataFrame(data=all_comments)
print(df.head(n=5))

# Persist the result as CSV (optional).
df.to_csv("youtube_comments.csv", index=False)

3. Ambil Playlist Tersembunyi

# Initialise the YouTube Data API v3 client.
youtube = build(serviceName='youtube', version='v3', developerKey=def_api_key)

# Replace with the channel ID (not the username).
channel_id = 'xxx'  # e.g. the Google Developers channel

def get_uploads_playlist_id(channel_id):
    """Look up the "uploads" playlist ID of a channel.

    Queries ``channels.list`` through the module-level ``youtube`` client.
    Prints a message and returns ``None`` when the response holds no channel.
    """
    request = youtube.channels().list(part='contentDetails', id=channel_id)
    response = request.execute()

    # A missing or empty 'items' list means the channel ID matched nothing.
    items = response.get('items')
    if not items:
        print("Channel tidak ditemukan.")
        return None

    related_playlists = items[0]['contentDetails']['relatedPlaylists']
    return related_playlists['uploads']

# Resolve the uploads playlist of the configured channel and show its ID.
uploads_playlist_id = get_uploads_playlist_id(channel_id=channel_id)
print(f"Playlist uploads ID: {uploads_playlist_id}")

Dapatkan playlist ID

# "Hidden" playlist: reuse the uploads playlist ID resolved earlier as the
# playlist to read videos from.
def_playlist_id = uploads_playlist_id

4. Masukkan Data Video Ke DataFrame

Jalankan script

# Initialise the YouTube Data API v3 client with the developer key.
youtube = build(serviceName='youtube', version='v3', developerKey=def_api_key)

def get_videos_from_playlist(playlist_id):
    """Return the video IDs of every item in a playlist.

    Pages through ``playlistItems.list`` (50 items per request) using the
    module-level ``youtube`` client, pausing 0.1 s between pages.
    """
    collected = []
    page_token = None

    while True:
        request = youtube.playlistItems().list(
            part='contentDetails',
            playlistId=playlist_id,
            maxResults=50,
            pageToken=page_token
        )
        page = request.execute()
        collected.extend(
            entry['contentDetails']['videoId'] for entry in page['items']
        )

        # No nextPageToken in the response means this was the last page.
        page_token = page.get('nextPageToken')
        if page_token:
            time.sleep(0.1)
            continue
        return collected

def get_video_details(video_ids):
    """Fetch title, publish date and counters for the given video IDs.

    The IDs are sent to ``videos.list`` in chunks of 50 through the
    module-level ``youtube`` client. Returns a list of dicts with the keys
    ``videoId``, ``title``, ``publishedAt``, ``viewCount`` and
    ``commentCount``; a counter missing from the statistics is reported as 0.
    """
    chunk_size = 50
    rows = []

    for start in range(0, len(video_ids), chunk_size):
        chunk = video_ids[start:start + chunk_size]
        request = youtube.videos().list(
            part='snippet,statistics',
            id=','.join(chunk)
        )
        response = request.execute()

        for video in response['items']:
            rows.append({
                'videoId': video['id'],
                'title': video['snippet']['title'],
                'publishedAt': video['snippet']['publishedAt'],
                'viewCount': int(video['statistics'].get('viewCount', 0)),
                'commentCount': int(video['statistics'].get('commentCount', 0)),
            })

    return rows

# Collect every video ID in the playlist, then look up each video's details.
video_ids = get_videos_from_playlist(def_playlist_id)
video_details = get_video_details(video_ids)

# Tabulate the details and preview the first five rows.
df = pd.DataFrame(data=video_details)
print(df.head(n=5))

# Save the table as CSV (optional).
df.to_csv("video_playlist.csv", index=False)

5. Ambil Komentar dari Semua Video

youtube = build(serviceName='youtube', version='v3', developerKey=def_api_key)

# Author name of the channel owner.
def_id_pemilik = "@dirumahrafif"
# Number of top videos to process.
def_jumlah_video = 80

# Load the video table from video_playlist.csv.
df_videos = pd.read_csv('video_playlist.csv')

# Rank by commentCount, highest first, and keep only the top videos.
df_videos_sorted = (
    df_videos
    .sort_values(by='commentCount', ascending=False)
    .head(def_jumlah_video)
)

# Video IDs in ranked order, limited to the selected top videos.
video_ids = df_videos_sorted['videoId'].tolist()

# Lookup table from videoId to title for the selected videos.
video_title_map = {
    vid: title
    for vid, title in zip(df_videos_sorted['videoId'], df_videos_sorted['title'])
}

def _fetch_all_replies(api, parent_id):
    """Return every reply resource posted under the comment *parent_id*."""
    replies = []
    request = api.comments().list(
        part='snippet',
        parentId=parent_id,
        maxResults=100,
        textFormat='plainText'
    )
    while request is not None:
        response = request.execute()
        replies.extend(response.get('items', []))
        request = api.comments().list_next(request, response)
    return replies


def get_comments(video_id, client=None, owner=None):
    """Collect a video's comments and replies, leaving out the owner's own.

    Args:
        video_id: ID of the video whose comment threads are listed.
        client: Optional YouTube Data API client. When omitted, the
            module-level ``youtube`` client is used.
        owner: Author display name to exclude. When omitted, the module-level
            ``def_id_pemilik`` is used. An empty string disables filtering.

    Returns:
        A list of dicts with the keys ``author``, ``comment``, ``likeCount``
        and ``isReply``.

    Note:
        Only comments written by the owner are skipped. Replies that other
        users posted under an owner comment are kept.
        ``commentThreads.list`` embeds only some of a thread's replies. When
        ``totalReplyCount`` says more exist, the complete reply list is
        fetched with ``comments.list``, which costs extra API requests.
    """
    api = youtube if client is None else client
    owner_name = def_id_pemilik if owner is None else owner

    def written_by_owner(snippet):
        return owner_name != '' and snippet['authorDisplayName'] == owner_name

    comments = []
    request = api.commentThreads().list(
        part='snippet,replies',
        videoId=video_id,
        maxResults=100,
        textFormat='plainText'
    )

    # list_next() returns None once the last page has been consumed.
    while request is not None:
        response = request.execute()
        for item in response.get('items', []):
            thread = item['snippet']
            top_level = thread['topLevelComment']
            top_comment = top_level['snippet']
            # Skip only the owner's comment itself, not the whole thread.
            if not written_by_owner(top_comment):
                comments.append({
                    'author': top_comment['authorDisplayName'],
                    'comment': top_comment['textDisplay'],
                    'likeCount': top_comment['likeCount'],
                    'isReply': False
                })

            # Replies embedded in the thread resource may be incomplete.
            replies = item.get('replies', {}).get('comments', [])
            if thread.get('totalReplyCount', 0) > len(replies):
                fetched = _fetch_all_replies(api, top_level['id'])
                # Never end up with fewer replies than were already embedded.
                if len(fetched) >= len(replies):
                    replies = fetched

            for reply in replies:
                reply_snippet = reply['snippet']
                if written_by_owner(reply_snippet):
                    continue
                comments.append({
                    'author': reply_snippet['authorDisplayName'],
                    'comment': reply_snippet['textDisplay'],
                    'likeCount': reply_snippet['likeCount'],
                    'isReply': True
                })

        request = api.commentThreads().list_next(request, response)
        # Pause between pages only; no need to wait after the last one.
        if request is not None:
            time.sleep(0.1)

    return comments

def get_video_title(video_id):
    """Return the title stored for *video_id* in ``video_title_map``.

    Falls back to ``'Unknown Title'`` when the ID is not in the map.
    """
    if video_id in video_title_map:
        return video_title_map[video_id]
    return 'Unknown Title'

from googleapiclient.errors import HttpError

# One row per comment, across all selected videos.
all_comment_data = []

# Videos are processed in batches, with a long pause between batches.
batch_size = 10
for i in range(0, len(video_ids), batch_size):
    batch = video_ids[i:i+batch_size]
    print(f"Processing batch videos {i+1} to {i+len(batch)}")

    for video_id in batch:
        print(f"  Mengambil komentar video {video_id}")
        title = get_video_title(video_id)
        try:
            comments = get_comments(video_id)
        except HttpError as err:
            # A single failing video (e.g. comments disabled) must not abort
            # the run and throw away everything collected so far.
            print(f"  Gagal mengambil komentar video {video_id}: {err}")
            continue

        all_comment_data.extend(
            {
                'videoId': video_id,
                'title': title,
                'author': c['author'],
                'comment': c['comment'],
                'likeCount': c['likeCount'],
                'isReply': c['isReply']
            }
            for c in comments
        )

    # Pause between batches only; waiting after the final batch is wasted time.
    if i + batch_size < len(video_ids):
        time.sleep(10)

# Explicit columns keep the CSV header intact even when no comment was found.
df_comments = pd.DataFrame(
    all_comment_data,
    columns=['videoId', 'title', 'author', 'comment', 'likeCount', 'isReply']
)
print(df_comments.head())
df_comments.to_csv('video_comments.csv', index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment