Created
February 11, 2012 16:04
-
-
Save kritzikratzi/1801406 to your computer and use it in GitHub Desktop.
Parsing wikipedia is funky ...
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package jobs; | |
import static utils.MWUtils.fetch; | |
import static utils.MWUtils.findReferences; | |
import static utils.MWUtils.getTagParameter; | |
import static utils.MWUtils.removeLinks; | |
import static utils.MWUtils.clean; | |
import static utils.MWUtils.findHeadlines; | |
import java.util.ArrayList; | |
import java.util.HashMap; | |
import java.util.List; | |
import java.util.Locale; | |
import java.util.TreeMap; | |
import java.util.regex.Matcher; | |
import java.util.regex.Pattern; | |
import org.joda.time.DateTime; | |
import org.joda.time.format.DateTimeFormat; | |
import org.joda.time.format.DateTimeFormatter; | |
import controllers.Admin; | |
import models.Episode; | |
import models.Series; | |
import play.Logger; | |
import play.db.jpa.JPA; | |
import play.jobs.Every; | |
import play.jobs.Job; | |
import utils.MWUtils; | |
@Every("1min") | |
public class RefreshJob extends Job<RefreshJob.Result>{ | |
// We need those to parse the dates ... | |
private final static Pattern shortPattern = Pattern.compile( "Start date\\|([0-9]{4}\\|[0-9]{1,2}\\|[0-9]{1,2})", Pattern.CASE_INSENSITIVE ); | |
private final static Pattern longPattern = Pattern.compile( "([a-z]+\\s+[0-9]+,\\s+[0-9]+)", Pattern.CASE_INSENSITIVE ); | |
private final static DateTimeFormatter shortParser = DateTimeFormat.forPattern( "yyyy|MM|dd" ); | |
private final static DateTimeFormatter longParser = DateTimeFormat.forPattern( "MMMM dd, yyyy" ).withLocale( Locale.ENGLISH ); | |
private final static Pattern numberPattern = Pattern.compile( "([0-9]+)" ); | |
private final static Pattern episodeNumberPattern = Pattern.compile( "([0-9]+)\\.([0-9]+)" ); | |
private final static Pattern prodCodePattern = Pattern.compile( "#([0-9]+)\\.([0-9]+)" ); | |
private final static Pattern articleSeasonPattern = Pattern.compile( ".*\\((Season|Series) ([0-9]+)\\)$", Pattern.CASE_INSENSITIVE ); | |
private final static Pattern seasonHeadlinePattern = Pattern.compile( "(Season|Series) ([0-9]+).*", Pattern.CASE_INSENSITIVE ); | |
private final static Pattern weirdSeasonHeadlinePattern = Pattern.compile( "(Season|Series) ([A-Z]+).*", Pattern.CASE_INSENSITIVE ); | |
private final Long seriesId; | |
/**
 * Creates a job without a fixed target; {@code seriesId} stays null.
 */
public RefreshJob(){
    this( null );
}

/**
 * Creates a job bound to one series.
 *
 * @param seriesId id of the series to refresh, may be null
 */
public RefreshJob( Long seriesId ){
    this.seriesId = seriesId;
}
@Override | |
public Result doJobWithResult() throws Exception { | |
Series series; | |
if( seriesId == null ) series = Series.find( "order by lastRefresh asc" ).first(); | |
else series = Series.findById( seriesId ); | |
if( series == null ){ | |
Logger.info( "Background Job: no such series: ", seriesId ); | |
return new Result( null, null, null ); | |
} | |
series.lastRefresh = new DateTime(); | |
series.save(); | |
JPA.em().getTransaction().commit(); | |
JPA.em().getTransaction().begin(); | |
Logger.info( "Background Job: refreshing %s", series.name ); | |
ArrayList<Episode> episodes = fetchEpisodes( series ); | |
ArrayList<Episode> added = new ArrayList<Episode>(); | |
for( Episode episode : episodes ){ | |
try{ | |
// Does this already exist? | |
Episode old = Episode.find( "series = ? and episodeNumber = ?", series, episode.episodeNumber ).first(); | |
if( old == null ){ | |
episode.save(); | |
added.add( episode ); | |
} | |
else if( !old.equals( episode ) ){ | |
// update the interesting fields .. | |
old.title = episode.title; | |
old.airDate = episode.airDate; | |
old.episodeNumber = episode.episodeNumber; | |
old.episodeNumberString = episode.episodeNumberString; | |
old.episodeNumberInSeason = episode.episodeNumberInSeason; | |
old.episodeNumberInSeasonString = episode.episodeNumberInSeasonString; | |
old.season = episode.season; | |
old.save(); | |
} | |
} | |
catch( Exception e ){ | |
Logger.info( "obj= %s", episode.toString() ); | |
e.printStackTrace(); | |
} | |
} | |
series.lastSuccess = new DateTime(); | |
series.save(); | |
return new Result( series, episodes, added ); | |
} | |
public static class Result{ | |
public Series series; | |
public List<Episode> episodes; | |
public List<Episode> added; | |
public Result( Series series, List<Episode> episodes, List<Episode> added ){ | |
this.series = series; | |
this.episodes = episodes; | |
this.added = added; | |
} | |
} | |
public static ArrayList<Episode> fetchEpisodes( Series series ){ | |
// grab the main page ... | |
TreeMap<String, String> articleContents = new TreeMap<String, String>(); | |
String content = MWUtils.fetch( series.wikiEpisodeUrl ); | |
articleContents.put( series.wikiEpisodeUrl, content ); | |
ArrayList<String> refs = findReferences( ":", content ); | |
for( String ref : refs ){ | |
String articleTitle = ref.substring( 3, ref.length() - 2 ); | |
Logger.info( "Fetching %s", articleTitle ); | |
articleContents.put( articleTitle, fetch( articleTitle ) ); | |
} | |
// Now grab all {{Episode ...}} tags | |
ArrayList<Episode> episodes = new ArrayList<Episode>( refs.size() ); | |
for( String articleTitle : articleContents.keySet() ){ | |
Matcher matcher; | |
String articleContent = articleContents.get( articleTitle ); | |
Logger.info( "Scanning %s", articleTitle ); | |
refs = MWUtils.findReferences( "Episode", articleContent ); | |
for( String ref : refs ){ | |
Episode episode = new Episode(); | |
String num = clean( getTagParameter( "EpisodeNumber", ref, "-1" ) ); | |
episode.series = series; | |
episode.title = clean( getTagParameter( "Title", ref, "?" ) ); | |
String dateStr = getTagParameter( "OriginalAirDate", ref, "" ); | |
// Fucked up ting number 1: episodeNumber might be of the format "SS.EE" | |
if( numberPattern.matcher( num ).matches() ){ | |
episode.episodeNumber = firstInt( clean( getTagParameter( "EpisodeNumber", ref, "-1" ) ) ); | |
episode.episodeNumberInSeason = firstInt( clean( getTagParameter( "EpisodeNumber2", ref, getTagParameter( "EpisodeNumber", ref, "-1" ) ) ) ); | |
episode.episodeNumberString = clean( getTagParameter( "EpisodeNumber", ref, "-1" ) ); | |
episode.episodeNumberInSeasonString = clean( getTagParameter( "EpisodeNumber2", ref, getTagParameter( "EpisodeNumber", ref, "-1" ) ) ); | |
} | |
else if( ( matcher = episodeNumberPattern.matcher( num ) ).matches() ){ | |
int S = firstInt( matcher.group( 1 ) ); | |
int E = firstInt( matcher.group( 2 ) ); | |
episode.episodeNumber = S*100 + E; | |
episode.episodeNumberString = episode.episodeNumber + ""; | |
episode.episodeNumberInSeason = E; | |
episode.episodeNumberInSeasonString = E + ""; | |
episode.season = S; | |
} | |
else{ | |
Logger.error( "The world is not going to stand tomorrow. EpisodeNumber was formatted as %s", num ); | |
episode.episodeNumber = firstInt( clean( getTagParameter( "EpisodeNumber", ref, "-1" ) ) ); | |
episode.episodeNumberInSeason = firstInt( clean( getTagParameter( "EpisodeNumber2", ref, getTagParameter( "EpisodeNumber", ref, "-1" ) ) ) ); | |
episode.episodeNumberString = clean( getTagParameter( "EpisodeNumber", ref, "-1" ) ); | |
episode.episodeNumberInSeasonString = clean( getTagParameter( "EpisodeNumber2", ref, getTagParameter( "EpisodeNumber", ref, "-1" ) ) ); | |
} | |
// More fuck ups: | |
// Sometimes epNum2 and epNum are reversed ... | |
if( episode.episodeNumber > 0 && episode.episodeNumberInSeason > 0 && episode.episodeNumber < episode.episodeNumberInSeason ){ | |
// swap | |
int tmpNum = episode.episodeNumber; | |
String tmpStr = episode.episodeNumberString; | |
episode.episodeNumber = episode.episodeNumberInSeason; | |
episode.episodeNumberString = episode.episodeNumberInSeasonString; | |
episode.episodeNumberInSeason = tmpNum; | |
episode.episodeNumberInSeasonString = tmpStr; | |
} | |
String prodCode = getTagParameter( "ProdCode", ref, null ); | |
// Let's see how we can _properly_ figure out infos about season/episode | |
// we only give a shit if season was not yet found ... | |
if( prodCode != null && episode.season <= 0 ){ | |
if( prodCode.toLowerCase().equals( "pilot" ) ){ | |
episode.season = 0; | |
episode.episodeNumber = 0; | |
episode.episodeNumberInSeason = 0; | |
episode.episodeNumberString = "Pilot"; | |
episode.episodeNumberInSeasonString = "Pilot"; | |
} | |
else if( ( matcher = prodCodePattern.matcher( prodCode ) ).matches() ){ | |
episode.season = firstInt( matcher.group( 1 ) ); | |
episode.episodeNumberInSeasonString = matcher.group( 2 ); | |
episode.episodeNumberInSeason = firstInt( matcher.group( 2 ) ); | |
} | |
else{ | |
//Logger.info( "> Unknown prod code format: %s; maybe an episode. maybe not. dunno. ", prodCode ); | |
/*episode.season = -1; | |
episode.episodeNumberInSeason = -1; | |
episode.episodeNumberInSeasonString = prodCode;*/ | |
} | |
} | |
// Still no season number? Maybe we can guess it correctly from the title | |
if( episode.season <= 0 && ( matcher = articleSeasonPattern.matcher( articleTitle ) ).matches() ){ | |
episode.season = firstInt( matcher.group( 1 ) ); | |
} | |
// OMG, still no season? | |
// maybe we get a clue in a headline ... | |
if( episode.season <= 0 ){ | |
// this is somewhat unefficient, but i just don't give a shit ... | |
String substr = articleContent.substring( 0, articleContent.indexOf( ref ) ); | |
ArrayList<String> headlines = findHeadlines( substr ); | |
int N = headlines.size(); | |
if( N > 0 && ( N = getSeasonByHeadline( headlines.get( N-1 ) ) ) > 0 ){ | |
// holy shit, this worked? | |
episode.season = N; | |
} | |
} | |
if( ( matcher = shortPattern.matcher( dateStr ) ).find() ) | |
episode.airDate = shortParser.parseLocalDate( matcher.group( 1 ) ); | |
else if( ( matcher = longPattern.matcher( dateStr ) ).find() ) | |
episode.airDate = longParser.parseLocalDate( matcher.group( 1 ) ); | |
else | |
episode.airDate = null; | |
episodes.add( episode ); | |
} | |
} | |
// All done, do we have major problems identifying seasons? | |
boolean majorProblems = true; | |
for( Episode e : episodes ) majorProblems &= e.season <= 0; | |
if( majorProblems ){ | |
// let's do it brutally, should be fine though! | |
Logger.info( "> Had major trouble identifying season. Just assigning shit randomly now" ); | |
int season = 1; | |
int oldEpisode = -1; | |
for( Episode e : episodes ){ | |
if( e.episodeNumberInSeason < oldEpisode ) season ++; | |
e.season = season; | |
oldEpisode = e.episodeNumberInSeason; | |
} | |
} | |
// Continue .. do we have major problems identifying episodeNumberInSeason? | |
// omg, the hacks get worse and worse | |
majorProblems = false; | |
int prevSeason = 1, prevEpInS = 1, newNum = 1; | |
for( Episode e : episodes ){ | |
if( majorProblems ){ | |
newNum ++; | |
} | |
if( e.season > prevSeason && e.episodeNumberInSeason > prevEpInS ){ | |
// this CAN NOT BE RIGHT! | |
newNum = 1; | |
majorProblems = true; | |
} | |
if( e.season > 0 ){ | |
prevSeason = e.season; | |
prevEpInS = e.episodeNumberInSeason; | |
} | |
if( majorProblems ){ | |
e.episodeNumberInSeason = newNum; | |
e.episodeNumberInSeasonString = newNum + ""; | |
} | |
} | |
for( Episode e : episodes ){ | |
Logger.info( "> Found %s", e.toString() ); | |
} | |
return episodes; | |
} | |
private static int firstInt( String str ){ | |
Matcher matcher = numberPattern.matcher( str ); | |
if( matcher.find() ){ | |
return Integer.parseInt( matcher.group( 1 ) ); | |
} | |
return -1; | |
} | |
/** | |
* FUCK YOU, WIKIPEDIA! :) | |
* | |
* tries to parse the season numbers from headlines as | |
* "season 1 (2009)" | |
* "season 1" | |
* "season one" | |
* @param string | |
* | |
* @return the parsed season number, or -1 | |
*/ | |
private static int getSeasonByHeadline( String text ){ | |
Matcher matcher; | |
if( ( matcher = seasonHeadlinePattern.matcher( text ) ).matches() ){ | |
return firstInt( matcher.group( 2 ) ); | |
} | |
else if( ( matcher = weirdSeasonHeadlinePattern.matcher( text ) ).matches() ){ | |
String num = matcher.group( 2 ).toLowerCase(); | |
if( num.equals( "one" ) ) return 1; | |
if( num.equals( "two" ) ) return 2; | |
if( num.equals( "three" ) ) return 3; | |
if( num.equals( "four" ) ) return 4; | |
if( num.equals( "five" ) ) return 5; | |
if( num.equals( "six" ) ) return 6; | |
if( num.equals( "seven" ) ) return 7; | |
if( num.equals( "eight" ) ) return 8; | |
if( num.equals( "nine" ) ) return 9; | |
if( num.equals( "ten" ) ) return 10; | |
if( num.equals( "eleven" ) ) return 11; | |
if( num.equals( "twelve" ) ) return 12; | |
} | |
return -1; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment