Created
February 22, 2014 08:49
-
-
Save fiskurgit/9150628 to your computer and use it in GitHub Desktop.
Parse hippocamp.net catalogue from html using JSoup
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package hctojson; | |
import java.io.IOException; | |
import org.jsoup.Jsoup; | |
import org.jsoup.nodes.Document; | |
import org.jsoup.nodes.Element; | |
import org.jsoup.select.Elements; | |
public class hctojson { | |
public static void main(String[] args) throws IOException { | |
Document doc = Jsoup.connect("http://hippocamp.net/index.php").get(); | |
Elements releases = doc.getElementsByClass("box"); | |
l("Number of releases: " + releases.size()); | |
for(Element release : releases){ | |
//Get the catalogue number: | |
String catNum = release.getElementsByClass("cellheadlabel").first().text(); | |
//Get the release title and artist name: | |
String titleHTML = release.getElementsByTag("h4").first().html(); | |
String title = titleHTML.substring(0, titleHTML.indexOf("<br />")); | |
String artist = titleHTML.substring(titleHTML.indexOf("<br />") + 6, titleHTML.length()); | |
l("cat: " + catNum + " title: " + title + " artist: " + artist); | |
//Get the tracks: | |
Elements tracks = release.getElementsByTag("li"); | |
for(Element track : tracks){ | |
String trackTitle = track.text(); | |
String trackUrl = track.getElementsByTag("a").first().attr("href").toString(); | |
l("track: " + trackTitle + " url: " + trackUrl); | |
} | |
} | |
} | |
private static void l(String message){ | |
System.out.println(message); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment