Last active
April 14, 2020 18:30
-
-
Save courville/d02cdc241f826a8b2fbd6dacf9fef401 to your computer and use it in GitHub Desktop.
Test scraper code for Nova Video Player
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
// TODO: add MoviePathMatcher
// TODO: add TvShowPathMatcher
// TODO: add other tvShow
/**
 * Standalone harness for experimenting with file-name "denoising": reads one file name per line
 * from stdin, runs it through the title/year extraction pipeline of {@link #getMatch(String)} and
 * prints every intermediate step.
 */
public class TestScraper {

    // ( whitespace | punctuation), matches dots, spaces, brackets etc
    private static final String NON_CHARACTER = "[\\s\\p{Punct}]";
    // matches "19XX and 20XX" - capture group
    private static final String YEAR_GROUP = "((?:19|20)\\d{2})";
    // matches "[space or punctuation/brackets etc]year" not followed by another digit, year is group 1
    private static final Pattern YEAR_PATTERN = Pattern.compile(NON_CHARACTER + YEAR_GROUP + "(?!\\d)");

    // Separators allowed after a leading number. Opening brackets are deliberately excluded:
    // otherwise "100. [DVD]Foo" is stripped down to "DVD]Foo" and the bracket filter that runs
    // afterwards can no longer recognise and remove "[DVD]".
    private static final String NUMBERING_SEPARATOR = "[\\s\\p{Punct}&&[^<({\\[]]";
    private static final String NUMBERING_PUNCT = "[\\p{Punct}&&[^<({\\[]]";
    // Matches "1. ", "1) ", "1 - ", "1.-.", "1._"... but not "1.Foo" or "1-Foo" ..
    private static final Pattern LEADING_NUMBERING = Pattern.compile(
            "^(\\d+([.)]" + NUMBERING_SEPARATOR + "+|\\s+" + NUMBERING_PUNCT + NUMBERING_SEPARATOR + "*))*");

    // everything in brackets <[{( .. )})>, most of the time teams names, etc
    private static final Pattern BRACKETS = Pattern.compile("[<({\\[].+?[>)}\\]]");

    // besides the plain ' there are the typographic quotes U+2019 and U+2018 (the latter is actually
    // not an apostrophe); written as escapes so the class does not depend on the source encoding
    private static final char[] ALTERNATE_APOSTROPHES = {'\u2019', '\u2018'};

    // Matches dots in between Uppercase letters e.g. in "E.T.", "S.H.I.E.L.D." but not a "a.b.c."
    // Last dot is kept "a.F.O.O.is.foo" => "a.FOO.is.foo"
    private static final Pattern ACRONYM_DOTS =
            Pattern.compile("(?<=(\\b|[._])\\p{Lu})[.](?=\\p{Lu}([.]|$))");

    // ( whitespace | punctuation)+ except the apostrophe, matches dots, spaces, brackets etc
    private static final Pattern MULTI_NON_CHARACTER_PATTERN = Pattern.compile("[\\s\\p{Punct}&&[^']]+");

    // Most of the common garbage in movies name we want to strip out
    // (they can be part of the name or correspond to extensions as well).
    // Entries are matched after all separators have been collapsed to single spaces, so every entry
    // is wrapped in spaces and must not contain punctuation (such an entry could never match;
    // e.g. "hdr10+" is already caught as " hdr10 " and "dts-x" / "dts-hd.ma" as " dts ").
    private static final String[] GARBAGE_LOWERCASE = {
            " dvdrip ", " dvd rip ", " dvdscreener ", " dvdscr ", " dvd scr ",
            " brrip ", " br rip ", " bdrip ", " bd rip ", " blu ray ", " bluray ",
            " hddvd ", " hd dvd ", " hdrip ", " hd rip ", " hdlight ", " minibdrip ",
            " webrip ", " web rip ",
            " 720p ", " 1080p ", " 1080i ", " 720 ", " 1080 ", " 480i ", " 2160p ", " 4k ", " 480p ", " 576p ", " 576i ", " 240p ", " 360p ", " 4320p ", " 8k ",
            " hdtv ", " sdtv ", " m hd ", " ultrahd ", " mhd ",
            " h264 ", " x264 ", " aac ", " ac3 ", " ogm ", " dts ", " hevc ", " x265 ", " av1 ",
            " avi ", " mkv ", " xvid ", " divx ", " wmv ", " mpg ", " mpeg ", " flv ", " f4v ",
            " asf ", " vob ", " mp4 ", " mov ",
            " directors cut ", " dircut ", " readnfo ", " read nfo ", " repack ", " rerip ", " multi ", " remastered ",
            " truefrench ", " srt ", " extended cut ",
            " sbs ", " hsbs ", " side by side ", " sidebyside ", /* Side-By-Side 3d stuff */
            " 3d ", " h sbs ", " h tb ", " tb ", " htb ", " top bot ", " topbot ", " top bottom ", " topbottom ", " tab ", " htab ", /* Top-Bottom 3d stuff */
            " anaglyph ", " anaglyphe ", /* Anaglyph 3d stuff */
            " truehd ", " atmos ", " uhd ", " hdr10 ", " hdr ", " dolby ",
            " hfr ",
    };

    // stuff that could be present in real names is matched with tight case sensitive syntax
    // strings here will only match if separated by any of " .-_"
    // (each entry is used as a regex fragment, see "UNRATED[ ._-]DC")
    private static final String[] GARBAGE_CASESENSITIVE = {
            "FRENCH", "TRUEFRENCH", "DUAL", "MULTISUBS", "MULTI", "MULTi", "SUBFORCED", "SUBFORCES", "UNRATED", "UNRATED[ ._-]DC", "EXTENDED", "IMAX",
            "COMPLETE", "PROPER", "iNTERNAL", "INTERNAL",
            "SUBBED", "ANiME", "LIMITED", "REMUX", "DCPRip",
            "TS", "TC", "REAL", "HD", "DDR", "WEB",
            "EN", "ENG", "FR", "ES", "IT", "NL", "VFQ", "VF", "VO", "VOSTFR", "Eng",
            "VOST", "VFF", "VF2", "VFI", "VFSTFR",
    };

    private static final Pattern[] GARBAGE_CASESENSITIVE_PATTERNS = new Pattern[GARBAGE_CASESENSITIVE.length];

    static {
        for (int i = 0; i < GARBAGE_CASESENSITIVE.length; i++) {
            // case sensitive string wrapped in "space or . or _ or -", in the end either separator or end of line
            // end of line is important since .foo.bar. could be stripped to .foo and that would no longer match .foo.
            GARBAGE_CASESENSITIVE_PATTERNS[i] = Pattern.compile("[ ._-]" + GARBAGE_CASESENSITIVE[i] + "(?:[ ._-]|$)");
        }
    }

    /**
     * Reads file names (or paths) line by line from stdin and prints the denoising trace of each.
     *
     * @param args ignored
     * @throws IOException if reading stdin fails
     */
    public static void main(String[] args) throws IOException {
        // for file input, replace the InputStreamReader with: new FileReader("input.txt")
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(System.in))) {
            String line;
            while ((line = reader.readLine()) != null) {
                getMatch(getFileNameWithoutExtension(line));
                println();
            }
        }
    }

    /**
     * Matches everything. Tries to strip away all junk, not very reliable.
     * Prints the intermediate value after every step and finally the resulting title and year.
     * <p>
     * Process is as follows:
     * <ul>
     * <li> Start with filename without extension: "100. [DVD]Starship_Troopers_1995.-HDrip--IT"
     * <li> Extract last year if any: "100. [DVD]Starship_Troopers_.-HDrip--IT"
     * <li> Remove potential starting numbering of collections: "[DVD]Starship_Troopers_.-HDrip--IT"
     * <li> Remove anything in brackets: "Starship_Troopers_.-HDrip--IT"
     * <li> Assume from here on that the title is first followed by junk
     * <li> Trim CasE sensitive junk: "Starship_Troopers_.-HDrip-" ("it" could be part of the movie name, "IT" probably not)
     * <li> Unify apostrophes and collapse dotted acronyms ("S.H.I.E.L.D." becomes "SHIELD.")
     * <li> Remove separators: "Starship Troopers HDrip"
     * <li> Trim junk case insensitive: "Starship Troopers"
     * </ul>
     *
     * @param input file name without extension
     */
    private static void getMatch(String input) {
        // TODO test 3rd party denoise pattern
        // denoise filter Default = @"(([\(\{\[]|\b)((576|720|1080)[pi]|dir(ectors )?cut|dvd([r59]|rip|scr(eener)?)|(avc)?hd|wmv|ntsc|pal|mpeg|dsr|r[1-5]|bd[59]|dts|ac3|blu(-)?ray|[hp]dtv|stv|hddvd|xvid|divx|x264|dxva|(?-i)FEST[Ii]VAL|L[iI]M[iI]TED|[WF]S|PROPER|REPACK|RER[Ii]P|REAL|RETA[Ii]L|EXTENDED|REMASTERED|UNRATED|CHRONO|THEATR[Ii]CAL|DC|SE|UNCUT|[Ii]NTERNAL|[DS]UBBED)([\]\)\}]|\b)(-[^\s]+$)?)")]
        String name = input;
        println("input : " + name);

        // extract the last year from the string: iterate over all matches, keep the last one
        String year = null;
        Matcher yearMatcher = YEAR_PATTERN.matcher(name);
        int yearStart = -1;
        int yearEnd = -1;
        while (yearMatcher.find()) {
            yearStart = yearMatcher.start(1);
            yearEnd = yearMatcher.end(1);
        }
        if (yearStart >= 0) {
            year = name.substring(yearStart, yearEnd);
            // only the digits are removed, the separator in front of the year stays
            name = name.substring(0, yearStart) + name.substring(yearEnd);
        }
        println("release year : %s year:%s", name, year);

        // Strip out starting numbering for collections
        name = replaceAll(name, "", LEADING_NUMBERING);
        println("remove numbering : " + name);

        // Strip out everything else in brackets <[{( .. )})>, most of the time teams names, etc
        name = replaceAll(name, "", BRACKETS);
        println("brackets : " + name);

        // strip away known case sensitive garbage
        name = cutOffBeforeFirstMatch(name, GARBAGE_CASESENSITIVE_PATTERNS);
        println("CaSe junk : " + name);

        // The next three steps remove all punctuation besides ' and do apostrophe / acronym replacement.
        // replaces alternative apostrophes with a simple '
        name = replaceAllChars(name, ALTERNATE_APOSTROPHES, '\'');
        println("apostrophes : " + name);

        // replaces "S.H.I.E.L.D." with "SHIELD.", only uppercase letters
        name = replaceAll(name, "", ACRONYM_DOTS);
        println("acronyms : " + name);

        // replace all remaining whitespace & punctuation with a single space
        name = replaceAll(name, " ", MULTI_NON_CHARACTER_PATTERN).trim();
        println("separators : " + name);

        // append a " " to aid next step
        // > "Foo bar 1080p AC3 " to find e.g. " AC3 "
        name = name + " ";
        // try to remove more garbage, this time " garbage " syntax
        // method will compare with lowercase name automatically
        name = cutOffBeforeFirstMatch(name, GARBAGE_LOWERCASE);
        println("lowercase junk : " + name);

        name = name.trim();
        println("RESULT : %s year:%s", name, year);
    }

    /**
     * assumes title is always first
     * <p>
     * Applies the patterns one after the other; whenever one matches, everything from the start of
     * that match on is dropped before the next pattern is tried.
     *
     * @param input    string to shorten
     * @param patterns garbage patterns to look for
     * @return the prefix of input that is left after all patterns have been applied
     */
    private static String cutOffBeforeFirstMatch(String input, Pattern[] patterns) {
        String remaining = input;
        for (Pattern pattern : patterns) {
            if (remaining.isEmpty()) {
                return "";
            }
            Matcher matcher = pattern.matcher(remaining);
            if (matcher.find()) {
                remaining = remaining.substring(0, matcher.start());
            }
        }
        return remaining;
    }

    /**
     * assumes title is always first
     * <p>
     * The search is case insensitive with respect to the input. Lower-casing is done character by
     * character so that indices found in the lower-cased copy are valid in the original string
     * ({@code String.toLowerCase} may change the length, e.g. for U+0130).
     *
     * @param input          string to shorten, original case is kept in the result
     * @param garbageStrings lower case strings
     * @return substring from start to first finding of any garbage string
     */
    public static final String cutOffBeforeFirstMatch(String input, String[] garbageStrings) {
        String inputLowerCased = toLowerCasePreservingLength(input);
        int firstGarbage = input.length();
        for (String garbage : garbageStrings) {
            int garbageIndex = inputLowerCased.indexOf(garbage);
            // if found, shrink to 0..index
            if (garbageIndex > -1 && garbageIndex < firstGarbage) {
                firstGarbage = garbageIndex;
            }
        }
        // return substring from input -> keep case
        return input.substring(0, firstGarbage);
    }

    /** Lower-cases every char individually, so the result always has the same length as the input. */
    private static String toLowerCasePreservingLength(String input) {
        char[] chars = input.toCharArray();
        for (int i = 0; i < chars.length; i++) {
            chars[i] = Character.toLowerCase(chars[i]);
        }
        return new String(chars);
    }

    /**
     * Replaces every occurrence of any of the given characters with a single replacement character.
     *
     * @param input    string to process
     * @param badChars characters to replace, {@code null} or empty means nothing to do
     * @param newChar  replacement character
     * @return the input itself if nothing was replaced, otherwise a new string
     */
    public static String replaceAllChars(String input, char[] badChars, char newChar) {
        if (badChars == null || badChars.length == 0) {
            return input;
        }
        char[] buffer = input.toCharArray();
        boolean modified = false;
        for (int i = 0; i < buffer.length; i++) {
            for (char bad : badChars) {
                if (buffer[i] == bad) {
                    buffer[i] = newChar;
                    modified = true;
                    break;
                }
            }
        }
        return modified ? new String(buffer) : input;
    }

    /**
     * Replaces every match of the pattern in the input.
     *
     * @param input       string to process
     * @param replacement replacement, interpreted as in {@link Matcher#replaceAll(String)}
     * @param pattern     compiled pattern to look for
     * @return the input with all matches replaced
     */
    public static String replaceAll(String input, String replacement, Pattern pattern) {
        return pattern.matcher(input).replaceAll(replacement);
    }

    /** Prints an empty line to stdout. */
    private static void println() {
        System.out.println();
    }

    /** Prints the string as is (no format processing) to stdout. */
    private static void println(String in) {
        System.out.println(in);
    }

    /** Prints the {@link String#format(String, Object...)} result of the arguments to stdout. */
    private static void println(String in, Object... args) {
        System.out.println(String.format(in, args));
    }

    /**
     * Returns the last path segment of the input with everything from the last dot on removed.
     * A dot at position 0 (e.g. ".hidden") is not treated as the start of an extension.
     *
     * @param input file name or path
     * @return file name without extension
     */
    private static String getFileNameWithoutExtension(String input) {
        String name = new File(input).getName();
        int dotPos = name.lastIndexOf('.');
        return dotPos > 0 ? name.substring(0, dotPos) : name;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This standalone code can be used to check and enhance Nova Video Player scraper code.
You can run it in an online IDE such as ideone, e.g. https://ideone.com/fHZk6C (thanks to seppel for the initial contribution).
Documented pull requests are welcome.