Skip to content

Instantly share code, notes, and snippets.

Created December 18, 2014 08:31
Show Gist options
  • Save anonymous/84449fe7860e292e904f to your computer and use it in GitHub Desktop.
Save anonymous/84449fe7860e292e904f to your computer and use it in GitHub Desktop.
NY Times Top Books - hacked together, sorry
import java.util.List;
import org.apache.commons.lang3.text.WordUtils;
import org.openqa.selenium.By;
import org.openqa.selenium.NoSuchElementException;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.firefox.FirefoxDriver;
/**
 * Scrapes a New York Times notable-books article with a Firefox WebDriver and
 * prints every category heading and numbered book entry to standard output.
 *
 * <p>The parsing is tied to the markup of one specific article: every element
 * matching {@code .story-body-text} is treated either as a book entry (it
 * contains both a {@code <strong>} title and an {@code <em>} author) or as a
 * category heading (anything else).
 */
public class NytimesScraper {

    /** Article that is scraped. */
    private static final String NOTABLE_BOOKS_URL =
            "http://www.nytimes.com/2014/12/07/books/review/100-notable-books-of-2014.html";

    /** CSS selector matching the paragraphs of the article body. */
    private static final String PARAGRAPH_SELECTOR = ".story-body-text";

    /** Number of leading characters dropped from the author text. */
    private static final int AUTHOR_PREFIX_LENGTH = 3;

    /** Number of trailing characters dropped from the author text. */
    private static final int AUTHOR_SUFFIX_LENGTH = 2;

    /**
     * Opens the article in Firefox, walks its body paragraphs and prints them.
     *
     * <p>Book entries are printed as {@code N. 'Title' by Author}; numbering is
     * 1-based and restarts after every category heading. Category headings are
     * printed as {@code Category: <text>}. The first matched paragraph is
     * skipped because it only holds introductory details.
     *
     * <p>The browser is always closed, even when loading or parsing fails.
     */
    public static void getBooks() {
        WebDriver driver = new FirefoxDriver();
        try {
            driver.get(NOTABLE_BOOKS_URL);
            List<WebElement> paragraphs = driver.findElements(By.cssSelector(PARAGRAPH_SELECTOR));

            int count = 1;
            // Start at index 1: the first paragraph is just details, not a book.
            // Iterating by index also copes with an empty result list.
            for (int i = 1; i < paragraphs.size(); i++) {
                WebElement paragraph = paragraphs.get(i);

                WebElement titleElement = firstByTag(paragraph, "strong");
                // Only look for the author when a title exists.
                WebElement authorElement =
                        titleElement == null ? null : firstByTag(paragraph, "em");

                if (titleElement == null || authorElement == null) {
                    // Paragraphs lacking a title or an author are category headings.
                    System.out.println("Category: " + paragraph.getText() + "\n");
                    count = 1;
                    continue;
                }

                String title = cleanTitle(titleElement.getText());
                String author = cleanAuthor(authorElement.getText());
                System.out.println(
                        count + ". '" + WordUtils.capitalizeFully(title) + "' by " + author + "\n");
                count++;
            }
        } finally {
            // Always release the browser process, including on failure.
            driver.quit();
        }
    }

    /**
     * Returns the first descendant of {@code parent} with the given tag name,
     * or {@code null} when there is none. Uses {@code findElements} so that a
     * missing element is reported without throwing.
     */
    private static WebElement firstByTag(WebElement parent, String tagName) {
        List<WebElement> matches = parent.findElements(By.tagName(tagName));
        return matches.isEmpty() ? null : matches.get(0);
    }

    /**
     * Drops the final character of the raw title text (presumably trailing
     * punctuation in the article markup - confirm against the page). Empty
     * text is returned unchanged instead of throwing.
     */
    private static String cleanTitle(String rawTitle) {
        if (rawTitle.isEmpty()) {
            return rawTitle;
        }
        return rawTitle.substring(0, rawTitle.length() - 1);
    }

    /**
     * Removes parenthesised fragments from the raw author text, then drops the
     * fixed-width prefix and suffix surrounding the name (presumably a leading
     * "By "-style prefix and trailing punctuation - confirm against the page).
     * Text too short to trim is returned with only the parentheses removed.
     */
    private static String cleanAuthor(String rawAuthor) {
        String author = rawAuthor.replaceAll("\\(.*?\\)", "");
        if (author.length() < AUTHOR_PREFIX_LENGTH + AUTHOR_SUFFIX_LENGTH) {
            return author;
        }
        return author.substring(AUTHOR_PREFIX_LENGTH, author.length() - AUTHOR_SUFFIX_LENGTH);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment