Last active
August 29, 2015 14:02
-
-
Save kfeoktistoff/efdf9daa04301dcd0b44 to your computer and use it in GitHub Desktop.
Using java.util.Scanner with strings bigger than 1024
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Applies a transformation to every lexeme of an arbitrarily long string with the help of
 * {@code java.util.Scanner}.
 *
 * <p>{@code java.util.Scanner} is a great tool for parsing, but it works through an inner buffer
 * of 1024 characters whose size cannot be configured, and the class is final, so its methods
 * cannot be overridden. This implementation relates every token back to the input through the
 * offsets reported by {@code Scanner.match()}, so it must never hand the scanner more text than
 * fits into that buffer.
 *
 * <p>The solution drafted here splits the whole text into parts, each one ending on the lexeme
 * boundary nearest to (and not after) the next {@code SCANNER_INNER_BUFFER - 1} characters, and
 * scans every part separately. In this example every lexeme found is replaced with its upper-case
 * form enclosed in {@code "<>"}; override {@link #doSmthWithLexeme(String)} to change that.
 *
 * <p>JDK types are referenced by their fully-qualified names so that the class compiles without
 * relying on an import block.
 */
public class LargeStringScanner {

    /** Size of the inner buffer of {@code java.util.Scanner}; every scanned part is kept shorter. */
    public static int SCANNER_INNER_BUFFER = 1024;

    /** Lexeme delimiter regex; the default {@code \b} is a zero-width word boundary. */
    public static String DELIMITER = "\\b";

    /**
     * Returns a copy of {@code body} in which every lexeme (token between {@link #DELIMITER}
     * matches) is replaced by the result of {@link #doSmthWithLexeme(String)}. Text matched by the
     * delimiter itself is copied through unchanged.
     *
     * @param body text to process; may be {@code null} or empty
     * @return the processed text; {@code body} itself when it is {@code null} or empty
     */
    public String enhance(String body) {
        if (body == null || body.isEmpty()) {
            return body;
        }

        // Compile once per call: DELIMITER is a mutable static, so it cannot be cached for good.
        final java.util.regex.Pattern delimiter = java.util.regex.Pattern.compile(DELIMITER);
        // Strictly below the buffer size, and at least 1 so that the loop always advances.
        final int maxPartLength = Math.max(1, SCANNER_INNER_BUFFER - 1);
        final StringBuilder enhancedBody = new StringBuilder(body.length());

        int partStart = 0;
        while (partStart < body.length()) {
            int partEnd;
            if (body.length() - partStart > maxPartLength) {
                // Not the last part: move the cut back to a lexeme boundary when there is one.
                partEnd = findLastDelimiterInSubString(
                        partStart, partStart + maxPartLength, body, delimiter);
            } else {
                // The remainder fits into a single part, so no split point is needed.
                partEnd = body.length();
            }
            enhancePart(body.substring(partStart, partEnd), delimiter, enhancedBody);
            partStart = partEnd;
        }
        return enhancedBody.toString();
    }

    /**
     * Scans one part (shorter than the scanner buffer) and appends its processed form to
     * {@code out}.
     */
    private void enhancePart(String part, java.util.regex.Pattern delimiter, StringBuilder out) {
        int charsRead = 0;
        try (java.util.Scanner scanner = new java.util.Scanner(part).useDelimiter(delimiter)) {
            while (scanner.hasNext()) {
                String lexeme = scanner.next();
                java.util.regex.MatchResult match = scanner.match();
                // Copy the delimiter text that was skipped before this lexeme.
                out.append(part, charsRead, match.start());
                out.append(doSmthWithLexeme(lexeme));
                charsRead = match.end();
            }
        }
        // Copy whatever follows the last lexeme (trailing delimiter text).
        out.append(part, charsRead, part.length());
    }

    /**
     * Finds the split position closest to {@code endPosition} at which the delimiter matches.
     *
     * <p>The search runs backwards from {@code endPosition} and never returns
     * {@code startPosition}, so the resulting part is never empty. The matcher uses transparent,
     * non-anchoring bounds so that zero-width delimiters such as {@code \b} can look at the
     * characters on both sides of the candidate position.
     *
     * @param startPosition index in {@code largeString} where the current part starts
     * @param endPosition   exclusive upper limit for the end of the current part
     * @param largeString   the complete text being split
     * @param delimiter     compiled lexeme delimiter
     * @return the index at which to end the part, or {@code endPosition} if the delimiter does
     *         not match anywhere inside the part
     */
    private int findLastDelimiterInSubString(int startPosition, int endPosition, String largeString,
            java.util.regex.Pattern delimiter) {
        java.util.regex.Matcher matcher = delimiter.matcher(largeString);
        matcher.useTransparentBounds(true);
        matcher.useAnchoringBounds(false);
        for (int i = endPosition; i > startPosition; i--) {
            matcher.region(i, endPosition);
            if (matcher.lookingAt()) {
                return i;
            }
        }
        return endPosition;
    }

    /**
     * Transforms a single lexeme. This example upper-cases it and encloses it in {@code "<>"}.
     *
     * @param lexeme token produced by the scanner
     * @return replacement text for the lexeme
     */
    protected String doSmthWithLexeme(String lexeme) {
        // Locale.ROOT keeps the result independent of the default locale of the JVM.
        return "<" + lexeme.toUpperCase(java.util.Locale.ROOT) + ">";
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment