Skip to content

Instantly share code, notes, and snippets.

@awforsythe
Created August 20, 2013 03:01
Show Gist options
  • Save awforsythe/6276657 to your computer and use it in GitHub Desktop.
Save awforsythe/6276657 to your computer and use it in GitHub Desktop.
Retrieves commenter usernames (up to the first 1000 comments) from a YouTube video using the v2 API.
'''
retrieve_commenters.py
by Alex Forsythe
19 August 2013
Uses the YouTube API v2 to generate a list of the users who have commented on a
specific video. Due to the limitations of this API, the script only examines
the first 1000 comments.
1. Place this script in the desired directory.
2. Set the value of kVideoId by replacing VIDEO_ID_HERE with your video id.
3. Open a terminal and browse to the directory containing this script.
4. Run 'python retrieve_commenters.py' and wait.
5. Names will be written to commenters.csv in the same directory.
'''
import csv
import math
import urllib2
import xml.etree.ElementTree as ET
# ID of the YouTube video whose commenters are retrieved. Replace the
# VIDEO_ID_HERE placeholder before running the script.
kVideoId = r'VIDEO_ID_HERE'
# NOTE(review): not referenced by any function visible in this file;
# presumably records that comment indices start at 1 -- confirm before removing.
kStartIndex = 1
# Number of comments requested per page. Also used to compute the page count
# and the index of the first comment on each page.
kMaxResults = 25
# Base URL that generate_comments_url prepends to the comments-feed path.
kApiUrl = 'https://gdata.youtube.com/feeds/api/'
def generate_video_url(videoId):
    '''
    Builds the URL that requests the XML node holding the metadata of a single
    video.

    videoId -- ID of the video; substituted into the URL as-is (no escaping).

    Returns the request URL as a string.
    '''
    url_template = 'https://gdata.youtube.com/feeds/api/videos/%s'
    return url_template % videoId
def generate_comments_url(videoId, i, n):
    '''
    Builds the URL that requests an XML feed of comments for one video.

    videoId -- ID of the video whose comments are requested.
    i       -- value of the start-index query parameter (the first comment
               to return).
    n       -- value of the max-results query parameter (how many comments
               to return).

    Returns the request URL as a string, rooted at kApiUrl.
    '''
    feed_path = 'videos/%s/comments?start-index=%d&max-results=%d'
    # '%' binds tighter than '+', so only the path template is formatted.
    return kApiUrl + feed_path % (videoId, i, n)
def retrieve_xml_response(url):
    '''
    Initiates an HTTP request and returns the resulting XML data, parsed into
    an ElementTree hierarchy.

    url -- the URL to request.

    Returns the root Element of the parsed response body. Any error raised by
    urllib2.urlopen or by the XML parser propagates to the caller.
    '''
    res = urllib2.urlopen(url)
    try:
        return ET.fromstring(res.read())
    finally:
        # Always release the connection, even if reading or parsing fails;
        # this function is called once per page of comments.
        res.close()
def retrieve_num_comments(videoId):
    '''
    Looks up how many comments the specified video has.

    Requests a feed containing a single comment and reads the count from its
    OpenSearch totalResults element.

    videoId -- ID of the video.

    Returns the comment count as an int, capped at 1000. Raises Exception if
    the feed has no totalResults element.
    '''
    feed = retrieve_xml_response(generate_comments_url(videoId, 1, 1))
    total_tag = '{http://a9.com/-/spec/opensearchrss/1.0/}totalResults'
    total_element = feed.find(total_tag)
    if total_element is None:
        raise Exception('Could not find totalResults element for video %s.' % videoId)
    # Never report more than 1000 comments, whatever the feed claims.
    return min(int(total_element.text), 1000)
def calculate_num_pages(numComments):
    '''
    Computes how many pages of kMaxResults comments each must be fetched to
    cover a video with the given number of comments.

    numComments -- total number of comments to cover.

    Returns the page count as an int; a partially filled last page counts as
    a whole page.
    '''
    # Float division so that a remainder survives to be rounded up.
    fractional_pages = float(numComments) / kMaxResults
    return int(math.ceil(fractional_pages))
def calculate_start_index(pageIndex):
    '''
    Computes the index of the first comment on a page.

    pageIndex -- index of the page; page 0 starts at comment 1.

    Returns the index of the first comment on that page.
    '''
    comments_before_page = pageIndex * kMaxResults
    return comments_before_page + 1
def retrieve_commenters_from_page(videoId, i):
    '''
    Retrieves one page of comments from the given video and returns a set
    containing the (username, realname) tuples of all the users whose comments
    are contained on that page.

    videoId -- ID of the video.
    i       -- index of the first comment on the page, as produced by
               calculate_start_index. This is a comment index, not a page
               number; kMaxResults comments are requested starting there.

    The username is the last path segment of the author's uri element and the
    realname is the text of the author's name element. Entries with no author
    element, or whose author has no uri text, are skipped instead of aborting
    the whole retrieval; a missing or empty name yields an empty realname.
    '''
    atom = '{http://www.w3.org/2005/Atom}'
    xml = retrieve_xml_response(generate_comments_url(videoId, i, kMaxResults))
    commenters = set()
    for entry in xml.findall(atom + 'entry'):
        author = entry.find(atom + 'author')
        if author is None:
            continue
        uri = author.find(atom + 'uri')
        if uri is None or not uri.text:
            # Without a uri there is no username to record.
            continue
        name = author.find(atom + 'name')
        if name is not None and name.text:
            realname = name.text
        else:
            realname = ''
        username = uri.text.split('/')[-1]
        commenters.add((username, realname))
    return commenters
def retrieve_commenters(videoId, verbose):
    '''
    Collects the (username, realname) pairs of every user who commented on the
    specified video, fetching the comments one page at a time.

    videoId -- ID of the video.
    verbose -- if True, progress messages are printed to the console.

    Returns a set of (username, realname) tuples.
    '''
    def log(message):
        '''
        Prints the message only when retrieve_commenters was called with
        verbose mode enabled; otherwise does nothing.
        '''
        if verbose:
            print(message)

    log('Retrieving commenters for video with ID < %s >...' % videoId)
    total_comments = retrieve_num_comments(videoId)
    total_pages = calculate_num_pages(total_comments)
    log('Video has %d comments spanning %d pages (%d per page).' % (
        total_comments, total_pages, kMaxResults))

    found = set()
    # Page numbers are 1-based for display; the start index is computed from
    # the equivalent 0-based page index.
    for page_number in range(1, total_pages + 1):
        first_comment = calculate_start_index(page_number - 1)
        log('Getting comments from page %d (starting at comment %d)...' % (
            page_number, first_comment))
        on_page = retrieve_commenters_from_page(videoId, first_comment)
        found.update(on_page)
        log('...got %d commenters from page %d. Total: %d' % (
            len(on_page), page_number, len(found)))
    return found
def write_commenters(commenters, filename, verbose):
    '''
    Writes the given set of commenter name pairs to a CSV file at the given
    filename (including extension).

    commenters -- iterable of (username, realname) pairs.
    filename   -- path of the file to write; an existing file is overwritten.
    verbose    -- if True, prints a summary line once the file is written.

    The file starts with a USERNAME,REALNAME header row. Fields are UTF-8
    encoded and quoted by the csv module where needed, so names containing
    commas, quotes or newlines no longer corrupt the row structure. Rows are
    written in sorted order so repeated runs produce the same file.
    '''
    def encoded(value):
        # A missing name (None) is written as an empty field.
        return (value or u'').encode('utf-8')

    # This script targets Python 2, where the csv module expects a file
    # opened in binary mode and emits the line terminator itself.
    with open(filename, 'wb') as fp:
        # Keep the '\n' row terminator the hand-written output used.
        writer = csv.writer(fp, lineterminator='\n')
        writer.writerow(['USERNAME', 'REALNAME'])
        for username, realname in sorted(commenters):
            writer.writerow([encoded(username), encoded(realname)])
    if verbose:
        print('Wrote %d names to %s.' % (len(commenters), filename))
if __name__ == '__main__':
    # The placeholder is matched as two overlapping fragments rather than as
    # the full literal -- presumably so that a search-and-replace of the
    # placeholder text does not also rewrite this check (TODO confirm).
    if not kVideoId.startswith('VIDEO_ID_HER') or not kVideoId.endswith('IDEO_ID_HERE'):
        # A real video ID has been filled in: fetch the commenters verbosely
        # and write them to commenters.csv.
        write_commenters(
            retrieve_commenters(kVideoId, True),
            'commenters.csv',
            True)
    else:
        print('Set the value of kVideoId on line 21!')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment