Last active
August 29, 2015 14:00
-
-
Save jrodbx/11202614 to your computer and use it in GitHub Desktop.
Google I/O 2014 YouTube video annotation easter egg crawler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import com.google.api.client.googleapis.json.GoogleJsonResponseException; | |
import com.google.api.client.http.HttpRequest; | |
import com.google.api.client.http.HttpRequestInitializer; | |
import com.google.api.client.http.HttpTransport; | |
import com.google.api.client.http.javanet.NetHttpTransport; | |
import com.google.api.client.json.JsonFactory; | |
import com.google.api.client.json.jackson2.JacksonFactory; | |
import com.google.api.client.util.DateTime; | |
import com.google.api.services.youtube.YouTube; | |
import com.google.api.services.youtube.model.PageInfo; | |
import com.google.api.services.youtube.model.ResourceId; | |
import com.google.api.services.youtube.model.SearchListResponse; | |
import com.google.api.services.youtube.model.SearchResult; | |
import org.w3c.dom.Document; | |
import org.w3c.dom.Node; | |
import org.w3c.dom.NodeList; | |
import javax.xml.parsers.DocumentBuilder; | |
import javax.xml.parsers.DocumentBuilderFactory; | |
import java.io.IOException; | |
import java.io.InputStream; | |
import java.net.URL; | |
import java.util.Iterator; | |
import java.util.List; | |
import java.util.Properties; | |
public class YouTubeCrawler { | |
public static final HttpTransport HTTP_TRANSPORT = new NetHttpTransport(); | |
public static final JsonFactory JSON_FACTORY = new JacksonFactory(); | |
private static final String PROPERTIES_FILENAME = "youtube.properties"; | |
private static final long NUMBER_OF_VIDEOS_RETURNED = 50; | |
private static YouTube youtube; | |
private static String ANNOTATION_URI = "https://www.youtube.com/annotations_invideo?features=1&legacy=0&video_id="; | |
private static int counter = 0; | |
public static void main(String[] args) { | |
// YouTube api requests limit to 50 results per page, 500 results total | |
// therefore, if a query results in more than 500 results, we'll need to | |
// run multiple queries updating the publishedBefore search parameter each time | |
DateTime lastPublished = new DateTime("2014-04-18T20:53:24.000Z"); | |
//DateTime lastPublished = new DateTime("2013-09-19T17:00:15.000Z"); | |
//DateTime lastPublished = new DateTime("2013-05-18T19:00:27.000Z"); | |
//DateTime lastPublished = new DateTime("2013-01-29T22:18:50.000Z"); | |
//DateTime lastPublished = new DateTime("2012-06-21T21:32:19.000Z"); | |
//DateTime lastPublished = new DateTime("2009-08-11T22:46:53.000Z"); | |
//DateTime lastPublished = new DateTime("2007-12-21T13:18:39.000Z"); | |
//DateTime lastPublished = new DateTime("2007-08-23T17:44:51.000Z"); | |
Properties properties = new Properties(); | |
try { | |
InputStream in = YouTube.Search.class.getResourceAsStream("/" + PROPERTIES_FILENAME); | |
properties.load(in); | |
} catch (IOException e) { | |
System.err.println("There was an error reading " + PROPERTIES_FILENAME + ": " + e.getCause() | |
+ " : " + e.getMessage()); | |
System.exit(1); | |
} | |
String apiKey = properties.getProperty("youtube.apikey"); | |
String appName = properties.getProperty("app.name"); | |
try { | |
youtube = new YouTube.Builder(HTTP_TRANSPORT, JSON_FACTORY, new HttpRequestInitializer() { | |
public void initialize(HttpRequest request) throws IOException { | |
return; | |
} | |
}).setApplicationName(appName).build(); | |
String prevPageToken = null; | |
String nextPageToken = null; | |
while (true) { | |
YouTube.Search.List search = youtube.search().list("id,snippet"); | |
// Set your developer key from the Google Developers Console for non-authenticated requests. | |
// See: https://cloud.google.com/console | |
search.setKey(apiKey); | |
search.setOrder("date"); | |
search.setChannelId("UC_x5XG1OV2P6uZZ5FSM9Ttw"); | |
search.setPageToken(nextPageToken); | |
search.setPublishedBefore(lastPublished); | |
search.setType("video"); | |
//search.setFields("items(id/kind,id/videoId,snippet/title),nextPageToken,pageInfo(totalResults)"); | |
search.setFields("items(id/kind,id/videoId,snippet/publishedAt),nextPageToken,pageInfo(totalResults)"); | |
search.setMaxResults(NUMBER_OF_VIDEOS_RETURNED); | |
SearchListResponse searchResponse = search.execute(); | |
PageInfo pageInfo = searchResponse.getPageInfo(); | |
if (pageInfo != null && nextPageToken == null) { | |
int totalResults = pageInfo.getTotalResults(); | |
System.out.println("\n============================================================="); | |
System.out.println(" " + totalResults + " videos for search on \"Google Developers\"."); | |
System.out.println("=============================================================\n"); | |
} | |
prevPageToken = nextPageToken; | |
nextPageToken = searchResponse.getNextPageToken(); | |
//System.out.println("next page: " + nextPageToken); | |
List<SearchResult> searchResultList = searchResponse.getItems(); | |
if (searchResultList != null) { | |
prettyPrint(searchResultList.iterator()); | |
} | |
if (nextPageToken == null) { | |
DateTime finalPub = null; | |
if(searchResultList != null) { | |
finalPub = searchResultList.get(searchResultList.size()-1).getSnippet().getPublishedAt(); | |
} | |
System.out.println("no more next page, pubAt = " + finalPub + ", counter = " + counter + ", prev = " + prevPageToken + ", next = " + nextPageToken); | |
break; | |
} | |
} | |
} catch (GoogleJsonResponseException e) { | |
System.err.println("There was a service error: " + e.getDetails().getCode() + " : " + e.getDetails().getMessage()); | |
} catch (IOException e) { | |
System.err.println("There was an IO error: " + e.getCause() + " : " + e.getMessage()); | |
} catch (Throwable t) { | |
t.printStackTrace(); | |
} | |
} | |
private static void prettyPrint(Iterator<SearchResult> iteratorSearchResults) { | |
while (iteratorSearchResults.hasNext()) { | |
SearchResult singleVideo = iteratorSearchResults.next(); | |
ResourceId rId = singleVideo.getId(); | |
counter++; | |
if (rId.getKind().equals("youtube#video")) { | |
//System.out.println(" Title: " + singleVideo.getSnippet().getTitle()); | |
printAnnotation(rId.getVideoId(), singleVideo.getSnippet().getPublishedAt()); | |
//System.out.println(" Video Id: " + rId.getVideoId()); | |
//System.out.println(); | |
} | |
} | |
} | |
private static void printAnnotation(String videoId, DateTime publishedAt) { | |
try { | |
DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance(); | |
DocumentBuilder dBuilder = dbFactory.newDocumentBuilder(); | |
Document doc = dBuilder.parse(new URL(ANNOTATION_URI + videoId).openStream()); | |
doc.getDocumentElement().normalize(); | |
NodeList annotationsList = doc.getElementsByTagName("TEXT"); | |
for (int i = 0; i < annotationsList.getLength(); i++) { | |
Node annotationNode = annotationsList.item(i); | |
String annotation = annotationNode.getTextContent(); | |
if (annotation.contains("goo.gl")) { | |
System.out.println(annotation + " , " + publishedAt + " , " + counter); | |
} | |
} | |
} catch (Exception e) { | |
e.printStackTrace(); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment