Created
August 20, 2013 03:01
-
-
Save awforsythe/6276657 to your computer and use it in GitHub Desktop.
Retrieves commenter usernames (up to the first 1000 comments) from a YouTube video using the v2 API.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
retrieve_commenters.py
by Alex Forsythe
19 August 2013

Uses the YouTube API v2 to generate a list of the users who have commented on a
specific video. Due to the limitations of this API, only the first 1000
comments are examined.

1. Place this script in the desired directory.
2. Set the value of kVideoId by replacing VIDEO_ID_HERE with your video id.
3. Open a terminal and browse to the directory containing this script.
4. Run 'python retrieve_commenters.py' and wait.
5. Names will be written to commenters.csv in the same directory.
'''
import math | |
import urllib2 | |
import xml.etree.ElementTree as ET | |
kVideoId = 'VIDEO_ID_HERE'  # Replace VIDEO_ID_HERE with the id of the video to scan.
kStartIndex = 1  # Not referenced by any function visible in this file.
kMaxResults = 25  # Comments requested per page; also the page size for index math.
kApiUrl = 'https://gdata.youtube.com/feeds/api/'  # Prefix of the comment feed URLs.
def generate_video_url(videoId):
    '''
    Builds the URL whose response is an XML node holding the metadata of the
    video identified by videoId.
    '''
    url_template = 'https://gdata.youtube.com/feeds/api/videos/%s'
    return url_template % videoId
def generate_comments_url(videoId, i, n):
    '''
    Builds the URL of the XML comment feed for the given video. The feed
    starts at comment index i (sent as start-index) and holds up to n
    comments (sent as max-results).
    '''
    feed_path = 'videos/%s/comments' % (videoId,)
    paging = 'start-index=%d&max-results=%d' % (i, n)
    return kApiUrl + feed_path + '?' + paging
def retrieve_xml_response(url):
    '''
    Initiates an HTTP request for url and returns the resulting XML data,
    parsed into an ElementTree hierarchy (the root Element).

    The response object is always closed before returning, including when
    reading or parsing fails. Errors raised by urllib2.urlopen or by the XML
    parser are not caught here and propagate to the caller.
    '''
    res = urllib2.urlopen(url)
    try:
        return ET.fromstring(res.read())
    finally:
        # urllib2 responses are not context managers in Python 2, so release
        # the underlying connection explicitly.
        res.close()
def retrieve_num_comments(videoId):
    '''
    Returns the comment count reported for the given video, capped at 1000.

    The count is read from the totalResults element of a one-comment feed
    request. Raises Exception if that element is absent from the response.
    '''
    feed = retrieve_xml_response(generate_comments_url(videoId, 1, 1))
    total_node = feed.find('{http://a9.com/-/spec/opensearchrss/1.0/}totalResults')
    if total_node is not None:
        return min(int(total_node.text), 1000)
    raise Exception('Could not find totalResults element for video %s.' % videoId)
def calculate_num_pages(numComments):
    '''
    Returns how many pages of kMaxResults comments each are needed to cover
    numComments comments (the quotient rounded up to a whole page).
    '''
    pages = math.ceil(float(numComments) / kMaxResults)
    return int(pages)
def calculate_start_index(pageIndex):
    '''
    Returns the 1-based index of the first comment on the page with the given
    0-based index, with kMaxResults comments per page.
    '''
    comments_before_page = kMaxResults * pageIndex
    return comments_before_page + 1
def retrieve_commenters_from_page(videoId, i):
    '''
    Fetches one page of up to kMaxResults comments for the given video,
    starting at comment index i, and returns a set of (username, realname)
    tuples for the authors of those comments.

    The username is the last path segment of the author's uri element and the
    realname is the text of the author's name element.
    '''
    atom = '{http://www.w3.org/2005/Atom}'
    feed = retrieve_xml_response(generate_comments_url(videoId, i, kMaxResults))
    authors = (entry.find(atom + 'author') for entry in feed.findall(atom + 'entry'))
    return set(
        (author.find(atom + 'uri').text.split('/')[-1],
         author.find(atom + 'name').text)
        for author in authors)
def retrieve_commenters(videoId, verbose):
    '''
    Collects the (username, realname) pairs of every user who commented on
    the given video and returns them as a set. Pages are fetched one after
    another; when verbose is True a progress line is printed for each step.
    '''
    def log(message):
        '''
        Echoes message to the console, but only when the enclosing call was
        made with verbose enabled.
        '''
        if not verbose:
            return
        print(message)

    log('Retrieving commenters for video with ID < %s >...' % videoId)

    total = retrieve_num_comments(videoId)
    page_count = calculate_num_pages(total)
    log('Video has %d comments spanning %d pages (%d per page).' % (
        total, page_count, kMaxResults))

    found = set()
    for page_number in range(1, page_count + 1):
        # Pages are numbered from 1 for display but indexed from 0 for the math.
        first_comment = calculate_start_index(page_number - 1)
        log('Getting comments from page %d (starting at comment %d)...' % (
            page_number, first_comment))

        on_page = retrieve_commenters_from_page(videoId, first_comment)
        found.update(on_page)
        log('...got %d commenters from page %d. Total: %d' % (
            len(on_page), page_number, len(found)))

    return found
def write_commenters(commenters, filename, verbose):
    '''
    Writes the given collection of (username, realname) pairs to a UTF-8
    encoded CSV file at the given filename (including extension), preceded by
    a USERNAME,REALNAME header row.

    Fields are written through the csv module, so a name containing a comma,
    a double quote or a line break is quoted instead of corrupting the row.
    A name that is None is written as an empty field. If verbose is True, a
    summary line is printed once the file has been written.
    '''
    import csv

    if str is bytes:
        # Python 2: csv works on byte strings, so open the file in binary
        # mode and hand it UTF-8 encoded fields.
        fp = open(filename, 'wb')

        def prepare(field):
            return None if field is None else field.encode('utf-8')
    else:
        # Python 3: csv works on text; the file object does the encoding.
        fp = open(filename, 'w', newline='', encoding='utf-8')

        def prepare(field):
            return field

    with fp:
        # Keep the bare '\n' row terminator of the original hand-built output.
        writer = csv.writer(fp, lineterminator='\n')
        writer.writerow(['USERNAME', 'REALNAME'])
        for username, realname in commenters:
            writer.writerow([prepare(username), prepare(realname)])

    if verbose:
        print('Wrote %d names to %s.' % (len(commenters), filename))
if __name__ == '__main__':
    # kVideoId counts as still unset while it both starts with 'VIDEO_ID_HER'
    # and ends with 'IDEO_ID_HERE'.
    placeholder_left = (kVideoId.startswith('VIDEO_ID_HER')
                        and kVideoId.endswith('IDEO_ID_HERE'))
    if not placeholder_left:
        write_commenters(retrieve_commenters(kVideoId, True), 'commenters.csv', True)
    else:
        print('Set the value of kVideoId on line 21!')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment