awforsythe · August 20, 2013 03:01
diff --git a/retrieve_commenters.py b/retrieve_commenters.py
 '''
 retrieve_commenters.py
 by Alex Forsythe
 19 August 2013

 Uses the YouTube API v2 to generate a list of the users who have commented on a
 specific video. Thanks to the limitations of this API, only examines the first
 1000 comments.

 1. Place this script in the desired directory.
 2. Set the value of kVideoId by replacing VIDEO_ID_HERE with your video id.
 3. Open a terminal and browse to the directory containing this script.
 4. Run 'python retrieve_commenters.py' and wait.
 5. Names will be written to commenters.csv in the same directory.
 '''

 import math
 import urllib2
 import xml.etree.ElementTree as ET

 kVideoId = r'VIDEO_ID_HERE'  # video to scan; replace the placeholder first

 # NOTE(review): not referenced anywhere else in the visible code;
 # calculate_start_index uses a literal 1 instead.
 kStartIndex = 1
 # Page size: sent as max-results and used to compute page counts/offsets.
 kMaxResults = 25

 # Base URL that generate_comments_url prepends to the feed path.
 kApiUrl = 'https://gdata.youtube.com/feeds/api/'

 def generate_video_url(videoId):
     '''
     Builds the URL that requests the XML metadata node for one video.

     videoId -- the video's ID; it is substituted into the URL with %s.

     Returns the complete URL as a string.
     '''
     url_template = 'https://gdata.youtube.com/feeds/api/videos/%s'
     return url_template % videoId

 def generate_comments_url(videoId, i, n):
     '''
     Builds the URL that requests one slice of a video's comment feed.

     videoId -- the video's ID.
     i       -- index of the first comment to request (sent as start-index).
     n       -- number of comments to request (sent as max-results).

     Returns kApiUrl followed by the comments path and query string.
     '''
     feed_path = 'videos/%s/comments?start-index=%d&max-results=%d' % (
         videoId, i, n)
     return kApiUrl + feed_path

 def retrieve_xml_response(url):
     '''
     Requests the given URL over HTTP and parses the response body as XML.

     url -- the address to request.

     Returns the root element of the parsed document, as produced by
     ET.fromstring. Errors raised by urllib2.urlopen or by the XML parser are
     not caught here and propagate to the caller.
     '''
     # Imported locally so the function carries its own dependency.
     from contextlib import closing

     # closing() guarantees the response is closed even if read() raises,
     # whether or not the response object supports 'with' on its own.
     with closing(urllib2.urlopen(url)) as res:
         body = res.read()
     return ET.fromstring(body)

 def retrieve_num_comments(videoId):
     '''
     Looks up how many comments the given video has, capped at 1000.

     Requests a one-comment slice of the video's comment feed and reads the
     count from its totalResults element.

     Raises Exception if the feed contains no totalResults element.
     '''
     feed = retrieve_xml_response(generate_comments_url(videoId, 1, 1))
     total_node = feed.find(
         '{http://a9.com/-/spec/opensearchrss/1.0/}totalResults')
     if total_node is None:
         raise Exception('Could not find totalResults element for video %s.' % videoId)
     # Never report more than 1000 comments.
     return min(int(total_node.text), 1000)

 def calculate_num_pages(numComments):
     '''
     Works out how many pages of kMaxResults comments are needed to cover
     numComments comments. A partially filled last page counts as a whole
     page.

     Returns the page count as an int.
     '''
     # Float division so the remainder survives until it is rounded up.
     pages_exact = float(numComments) / kMaxResults
     return int(math.ceil(pages_exact))

 def calculate_start_index(pageIndex):
     '''
     Converts a zero-based page index into the index of the first comment on
     that page. Page 0 starts at comment 1; each further page starts
     kMaxResults comments later.
     '''
     comments_before_page = pageIndex * kMaxResults
     return comments_before_page + 1

 def retrieve_commenters_from_page(videoId, i):
     '''
     Fetches one page of comments for the given video and collects its
     authors.

     videoId -- the video's ID.
     i       -- index of the first comment to request (it is sent as the
                feed's start-index, so it is a comment index rather than a
                page number). Up to kMaxResults comments are requested.

     Returns a set of (username, realname) tuples, one per distinct author
     on the page. username is the last path segment of the author's uri
     element; realname is the text of the author's name element.
     '''
     atom = '{http://www.w3.org/2005/Atom}'
     feed = retrieve_xml_response(generate_comments_url(videoId, i, kMaxResults))

     found = set()
     for entry in feed.findall(atom + 'entry'):
         author = entry.find(atom + 'author')
         # Keep only whatever follows the final slash of the author URI.
         username = author.find(atom + 'uri').text.split('/')[-1]
         realname = author.find(atom + 'name').text
         found.add((username, realname))
     return found

 def retrieve_commenters(videoId, verbose):
     '''
     Collects the (username, realname) pairs of everyone who commented on the
     given video, fetching the comment feed one page at a time.

     videoId -- the video's ID.
     verbose -- if true, progress messages are printed to the console.

     Returns a set of (username, realname) tuples.
     '''

     def log(message):
         '''
         Prints message when the enclosing call was made with verbose
         enabled; otherwise does nothing.
         '''
         if verbose:
             print(message)

     log('Retrieving commenters for video with ID < %s >...' % videoId)

     total_comments = retrieve_num_comments(videoId)
     total_pages = calculate_num_pages(total_comments)
     log('Video has %d comments spanning %d pages (%d per page).' % (
         total_comments, total_pages, kMaxResults))

     everyone = set()
     for page_index in range(total_pages):
         # Pages are numbered from 1 in the progress messages.
         page_number = page_index + 1
         first_comment = calculate_start_index(page_index)
         log('Getting comments from page %d (starting at comment %d)...' % (
             page_number, first_comment))

         on_this_page = retrieve_commenters_from_page(videoId, first_comment)
         everyone.update(on_this_page)
         log('...got %d commenters from page %d. Total: %d' % (
             len(on_this_page), page_number, len(everyone)))

     return everyone

 def write_commenters(commenters, filename, verbose):
     '''
     Writes the given commenter name pairs to a CSV file.

     commenters -- collection of (username, realname) pairs. Both values are
                   encoded as UTF-8 before being written.
     filename   -- path of the file to write (including extension).
     verbose    -- if true, prints a one-line summary after writing.

     The file starts with a USERNAME,REALNAME header row. Values containing
     a comma, a double quote or a newline are quoted by the csv module so
     that every pair still occupies exactly two columns.
     '''
     # Imported locally so the function carries its own dependency.
     import csv

     with open(filename, 'w') as fp:
         # A plain newline terminator keeps rows without special characters
         # identical to what simple string formatting would produce.
         writer = csv.writer(fp, lineterminator='\n')
         writer.writerow(['USERNAME', 'REALNAME'])
         for username, realname in commenters:
             writer.writerow([
                 username.encode('utf-8'),
                 realname.encode('utf-8')])

     if verbose:
         print('Wrote %d names to %s.' % (len(commenters), filename))

 if __name__ == '__main__':

     # The placeholder is recognised by its prefix and suffix rather than by
     # equality (presumably so that a global find/replace of the placeholder
     # text leaves this check intact -- confirm with the author).
     placeholder_untouched = (
         kVideoId.startswith('VIDEO_ID_HER') and
         kVideoId.endswith('IDEO_ID_HERE'))

     if not placeholder_untouched:
         # Fetch every commenter verbosely and save them next to the script.
         write_commenters(retrieve_commenters(kVideoId, True), 'commenters.csv', True)
     else:
         print('Set the value of kVideoId on line 21!')
	'''
	retrieve_commenters.py
	by Alex Forsythe
	19 August 2013

	Uses the YouTube API v2 to generate a list of the users who have commented on a
	specific video. Thanks to the limitations of this API, only examines the first
	1000 comments.

	1. Place this script in the desired directory.
	2. Set the value of kVideoId by replacing VIDEO_ID_HERE with your video id.
	3. Open a terminal and browse to the directory containing this script.
	4. Run 'python retrieve_commenters.py' and wait.
	5. Names will be written to commenters.csv in the same directory.
	'''

	import math
	import urllib2
	import xml.etree.ElementTree as ET

	kVideoId = r'VIDEO_ID_HERE'  # video to scan; replace the placeholder first

	# NOTE(review): not referenced anywhere else in the visible code;
	# calculate_start_index uses a literal 1 instead.
	kStartIndex = 1
	# Page size: sent as max-results and used to compute page counts/offsets.
	kMaxResults = 25

	# Base URL that generate_comments_url prepends to the feed path.
	kApiUrl = 'https://gdata.youtube.com/feeds/api/'

	def generate_video_url(videoId):
		'''
		Builds the URL that requests the XML metadata node for one video.

		videoId -- the video's ID; it is substituted into the URL with %s.

		Returns the complete URL as a string.
		'''
		url_template = 'https://gdata.youtube.com/feeds/api/videos/%s'
		return url_template % videoId

	def generate_comments_url(videoId, i, n):
		'''
		Builds the URL that requests one slice of a video's comment feed.

		videoId -- the video's ID.
		i       -- index of the first comment to request (sent as start-index).
		n       -- number of comments to request (sent as max-results).

		Returns kApiUrl followed by the comments path and query string.
		'''
		feed_path = 'videos/%s/comments?start-index=%d&max-results=%d' % (
			videoId, i, n)
		return kApiUrl + feed_path

	def retrieve_xml_response(url):
		'''
		Requests the given URL over HTTP and parses the response body as XML.

		url -- the address to request.

		Returns the root element of the parsed document, as produced by
		ET.fromstring. Errors raised by urllib2.urlopen or by the XML parser
		are not caught here and propagate to the caller.
		'''
		# Imported locally so the function carries its own dependency.
		from contextlib import closing

		# closing() guarantees the response is closed even if read() raises,
		# whether or not the response object supports 'with' on its own.
		with closing(urllib2.urlopen(url)) as res:
			body = res.read()
		return ET.fromstring(body)

	def retrieve_num_comments(videoId):
		'''
		Looks up how many comments the given video has, capped at 1000.

		Requests a one-comment slice of the video's comment feed and reads the
		count from its totalResults element.

		Raises Exception if the feed contains no totalResults element.
		'''
		feed = retrieve_xml_response(generate_comments_url(videoId, 1, 1))
		total_node = feed.find(
			'{http://a9.com/-/spec/opensearchrss/1.0/}totalResults')
		if total_node is None:
			raise Exception('Could not find totalResults element for video %s.' % videoId)
		# Never report more than 1000 comments.
		return min(int(total_node.text), 1000)

	def calculate_num_pages(numComments):
		'''
		Works out how many pages of kMaxResults comments are needed to cover
		numComments comments. A partially filled last page counts as a whole
		page.

		Returns the page count as an int.
		'''
		# Float division so the remainder survives until it is rounded up.
		pages_exact = float(numComments) / kMaxResults
		return int(math.ceil(pages_exact))

	def calculate_start_index(pageIndex):
		'''
		Converts a zero-based page index into the index of the first comment
		on that page. Page 0 starts at comment 1; each further page starts
		kMaxResults comments later.
		'''
		comments_before_page = pageIndex * kMaxResults
		return comments_before_page + 1

	def retrieve_commenters_from_page(videoId, i):
		'''
		Fetches one page of comments for the given video and collects its
		authors.

		videoId -- the video's ID.
		i       -- index of the first comment to request (it is sent as the
		           feed's start-index, so it is a comment index rather than a
		           page number). Up to kMaxResults comments are requested.

		Returns a set of (username, realname) tuples, one per distinct author
		on the page. username is the last path segment of the author's uri
		element; realname is the text of the author's name element.
		'''
		atom = '{http://www.w3.org/2005/Atom}'
		feed = retrieve_xml_response(generate_comments_url(videoId, i, kMaxResults))

		found = set()
		for entry in feed.findall(atom + 'entry'):
			author = entry.find(atom + 'author')
			# Keep only whatever follows the final slash of the author URI.
			username = author.find(atom + 'uri').text.split('/')[-1]
			realname = author.find(atom + 'name').text
			found.add((username, realname))
		return found

	def retrieve_commenters(videoId, verbose):
		'''
		Collects the (username, realname) pairs of everyone who commented on
		the given video, fetching the comment feed one page at a time.

		videoId -- the video's ID.
		verbose -- if true, progress messages are printed to the console.

		Returns a set of (username, realname) tuples.
		'''

		def log(message):
			'''
			Prints message when the enclosing call was made with verbose
			enabled; otherwise does nothing.
			'''
			if verbose:
				print(message)

		log('Retrieving commenters for video with ID < %s >...' % videoId)

		total_comments = retrieve_num_comments(videoId)
		total_pages = calculate_num_pages(total_comments)
		log('Video has %d comments spanning %d pages (%d per page).' % (
			total_comments, total_pages, kMaxResults))

		everyone = set()
		for page_index in range(total_pages):
			# Pages are numbered from 1 in the progress messages.
			page_number = page_index + 1
			first_comment = calculate_start_index(page_index)
			log('Getting comments from page %d (starting at comment %d)...' % (
				page_number, first_comment))

			on_this_page = retrieve_commenters_from_page(videoId, first_comment)
			# Merge this page's authors into the running set.
			everyone.update(on_this_page)
			log('...got %d commenters from page %d. Total: %d' % (
				len(on_this_page), page_number, len(everyone)))

		return everyone

	def write_commenters(commenters, filename, verbose):
		'''
		Writes the given commenter name pairs to a CSV file.

		commenters -- collection of (username, realname) pairs. Both values
		              are encoded as UTF-8 before being written.
		filename   -- path of the file to write (including extension).
		verbose    -- if true, prints a one-line summary after writing.

		The file starts with a USERNAME,REALNAME header row. Values containing
		a comma, a double quote or a newline are quoted by the csv module so
		that every pair still occupies exactly two columns.
		'''
		# Imported locally so the function carries its own dependency.
		import csv

		with open(filename, 'w') as fp:
			# A plain newline terminator keeps rows without special characters
			# identical to what simple string formatting would produce.
			writer = csv.writer(fp, lineterminator='\n')
			writer.writerow(['USERNAME', 'REALNAME'])
			for username, realname in commenters:
				writer.writerow([
					username.encode('utf-8'),
					realname.encode('utf-8')])

		if verbose:
			print('Wrote %d names to %s.' % (len(commenters), filename))

	if __name__ == '__main__':

		# The placeholder is recognised by its prefix and suffix rather than
		# by equality (presumably so that a global find/replace of the
		# placeholder text leaves this check intact -- confirm with the
		# author).
		if kVideoId.startswith('VIDEO_ID_HER') and kVideoId.endswith('IDEO_ID_HERE'):
			print('Set the value of kVideoId on line 21!')
		else:
			# Fetch every commenter verbosely and save them next to the script.
			write_commenters(retrieve_commenters(kVideoId, True), 'commenters.csv', True)