Last active
October 20, 2015 01:33
-
-
Save AnEmortalKid/db109459c0f05959b2dd to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.anemortalid.essex.whatson.scrape; | |
import java.util.ArrayList; | |
import java.util.List; | |
import org.jsoup.Jsoup; | |
import org.jsoup.nodes.Document; | |
import org.jsoup.nodes.Element; | |
import org.jsoup.select.Elements; | |
/** | |
* Scrapes the domain for some info | |
* | |
*/ | |
public class Scraper { | |
private static final String ESSEX_DOMAIN = "http://www.essexstudent.com"; | |
public static void main(String[] args) { | |
List<EssexEvent> events = new Scraper().scrubWhatson(); | |
System.out.println("There are " + events.size() + " events."); | |
for (EssexEvent essexEvent : events) { | |
System.out.println(essexEvent); | |
} | |
} | |
public List<EssexEvent> scrubWhatson() { | |
List<EssexEvent> events = new ArrayList<Scraper.EssexEvent>(); | |
try { | |
Document document = Jsoup.connect(ESSEX_DOMAIN + "/whatson/").get(); | |
// Get events only, they most likely all have this class | |
Elements eventItems = document.getElementsByClass("event_item"); | |
// extract the info, we don't really need to visit their links | |
for (Element eventItem : eventItems) { | |
EssexEvent event = new EssexEvent(); | |
Elements eventNames = eventItem.select("a.msl_event_name"); | |
Element aTag = eventNames.get(0); | |
String link = aTag.attr("href"); | |
String title = aTag.text(); | |
event.setTitle(title); | |
event.setEventLink(link); | |
// These other tags have some more info, they're in a | |
// div->dl->dd section | |
Elements eventTimes = eventItem.getElementsByClass("msl_event_time"); | |
String eventTime = eventTimes.text(); | |
event.setTime(eventTime); | |
Elements eventLocations = eventItem.getElementsByClass("msl_event_location"); | |
String eventLocation = eventLocations.text(); | |
event.setLocation(eventLocation); | |
Elements eventDescriptions = eventItem.getElementsByClass("msl_event_description"); | |
String eventDescription = eventDescriptions.text(); | |
event.setDescription(eventDescription); | |
// find the image at the end, not all event items have a | |
// span>msl_event_image | |
Elements eventImageElems = eventItem.select("span.msl_event_image"); | |
if (eventImageElems.size() > 0) { | |
Element spanTag = eventImageElems.first(); | |
Elements spanChildren = spanTag.children(); | |
Element imgTag = spanChildren.first(); | |
String imgSource = imgTag.attr("src"); | |
event.setImgSrc(imgSource); | |
} | |
events.add(event); | |
} | |
} catch (Exception e) { | |
// yolo | |
e.printStackTrace(); | |
} | |
return events; | |
} | |
/** | |
* Represents an Essex event | |
* | |
*/ | |
private class EssexEvent { | |
private String title; | |
private String eventLink; | |
private String imgSrc; | |
private String time; | |
private String location; | |
private String description; | |
public String getTitle() { | |
return title; | |
} | |
public void setTitle(String title) { | |
this.title = title; | |
} | |
public String getEventLink() { | |
return eventLink; | |
} | |
public void setEventLink(String eventLink) { | |
this.eventLink = eventLink; | |
} | |
public String getImgSrc() { | |
return imgSrc; | |
} | |
public void setImgSrc(String imgSrc) { | |
this.imgSrc = imgSrc; | |
} | |
public String getTime() { | |
return time; | |
} | |
public void setTime(String time) { | |
this.time = time; | |
} | |
public String getLocation() { | |
return location; | |
} | |
public void setLocation(String location) { | |
this.location = location; | |
} | |
public String getDescription() { | |
return description; | |
} | |
public void setDescription(String description) { | |
this.description = description; | |
} | |
public String getImageLocationLink() { | |
if (imgSrc == null) { | |
return "NO_IMAGE"; | |
} | |
int questionMark = imgSrc.indexOf("?"); | |
return ESSEX_DOMAIN + imgSrc.substring(0, questionMark); | |
} | |
@Override | |
public String toString() { | |
StringBuilder builder = new StringBuilder(); | |
builder.append("EssexEvent [title="); | |
builder.append(title); | |
builder.append(", eventLink="); | |
builder.append(eventLink); | |
builder.append(", imgSrc="); | |
builder.append(imgSrc); | |
builder.append(", imgOnlyLink="); | |
builder.append(getImageLocationLink()); | |
builder.append(", time="); | |
builder.append(time); | |
builder.append(", location="); | |
builder.append(location); | |
builder.append(", description="); | |
builder.append(description); | |
builder.append("]"); | |
return builder.toString(); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment