kfeoktistoff · August 29, 2015 14:02
diff --git a/LargeStringScanner.java b/LargeStringScanner.java
 /**
  * Applies a transformation to every lexeme of an arbitrarily long string with the help of
  * java.util.Scanner.
  *
  * <p>java.util.Scanner is a great tool for parsing, but it has some disadvantages. One of them is
  * its inner buffer of 1024 chars: the offsets reported by {@code Scanner.match()} are only used
  * here as indexes into the scanned text while that text is shorter than the buffer. Also the
  * java.util.Scanner class is final, so overriding its methods is not available.
  *
  * <p>This is a draft implementation of a solution where the whole text is split into parts, each
  * shorter than the buffer and cut at the delimiter nearest to the end of the part, and every part
  * is scanned separately. In this example each found lexeme is replaced with its upper-case form
  * enclosed in {@code "<>"}.
  */
 public class LargeStringScanner {
   /** Size of the inner buffer of java.util.Scanner; every scanned part is kept shorter. */
   public static int SCANNER_INNER_BUFFER = 1024;

   /** Lexeme delimiter regex. */
   public static String DELIMITER = "\\b";

   /**
    * Rebuilds {@code body} with every lexeme replaced by {@link #doSmthWithLexeme(String)}.
    * Text matched by {@link #DELIMITER} between lexemes is copied through unchanged.
    *
    * @param body text to process; may be {@code null} or empty
    * @return the processed text; {@code body} itself when it is {@code null} or empty
    */
   public String enhance(String body) {
     if (body == null || body.isEmpty()) {
       return body;
     }

     java.util.regex.Pattern delimiter = java.util.regex.Pattern.compile(DELIMITER);
     // Every part must be strictly shorter than the scanner buffer, yet never empty,
     // otherwise the loop below would not advance.
     int maxPartLength = Math.max(1, SCANNER_INNER_BUFFER - 1);
     StringBuilder enhancedBody = new StringBuilder(body.length());

     int bodyPartStartPosition = 0;
     while (bodyPartStartPosition < body.length()) {
       int bodyPartEndPosition;
       if (body.length() - bodyPartStartPosition <= maxPartLength) {
         // The remainder fits into a single part: no need to look for a cut point.
         bodyPartEndPosition = body.length();
       } else {
         bodyPartEndPosition = findLastDelimiterInSubString(
             bodyPartStartPosition, bodyPartStartPosition + maxPartLength, body, delimiter);
       }

       enhancePart(
           body.substring(bodyPartStartPosition, bodyPartEndPosition), delimiter, enhancedBody);
       bodyPartStartPosition = bodyPartEndPosition;
     }

     return enhancedBody.toString();
   }

   /** Scans one part with java.util.Scanner and appends its enhanced form to {@code out}. */
   private void enhancePart(String part, java.util.regex.Pattern delimiter, StringBuilder out) {
     java.util.Scanner scanner = new java.util.Scanner(part).useDelimiter(delimiter);
     try {
       int charsRead = 0;
       while (scanner.hasNext()) {
         String lexeme = scanner.next();
         java.util.regex.MatchResult match = scanner.match();

         // Copy whatever was skipped (delimiters) between the previous lexeme and this one.
         out.append(part, charsRead, match.start());
         out.append(doSmthWithLexeme(lexeme));
         charsRead = match.end();
       }
       // Trailing delimiters after the last lexeme.
       out.append(part, charsRead, part.length());
     } finally {
       scanner.close();
     }
   }

   /**
    * Finds the last position inside {@code (startPosition, endPosition)} at which the delimiter
    * matches, so that a part can be cut there instead of in the middle of a lexeme.
    *
    * @return the position of the last delimiter match, or {@code endPosition} when there is none
    *     (the part is then cut at {@code endPosition}, possibly inside a lexeme)
    */
   private int findLastDelimiterInSubString(
       int startPosition, int endPosition, String largeString, java.util.regex.Pattern delimiter) {
     java.util.regex.Matcher matcher = delimiter.matcher(largeString);
     // Zero-width delimiters such as \b must see the characters around the region start.
     matcher.useTransparentBounds(true);
     matcher.useAnchoringBounds(false);

     for (int i = endPosition - 1; i > startPosition; i--) {
       matcher.region(i, endPosition);
       if (matcher.lookingAt()) {
         return i;
       }
     }

     return endPosition;
   }

   /**
    * Transformation applied to every lexeme; subclasses may override it.
    *
    * @param lexeme a token found between two delimiter matches
    * @return the lexeme in upper case (locale-independent) enclosed in {@code "<>"}
    */
   protected String doSmthWithLexeme(String lexeme) {
     return "<" + lexeme.toUpperCase(java.util.Locale.ROOT) + ">";
   }
 }
	/**
	 * Applies a transformation to every lexeme of an arbitrarily long string with the help of
	 * java.util.Scanner.
	 *
	 * <p>java.util.Scanner is a great tool for parsing, but it has some disadvantages. One of them is
	 * its inner buffer of 1024 chars: the offsets reported by {@code Scanner.match()} are only used
	 * here as indexes into the scanned text while that text is shorter than the buffer. Also the
	 * java.util.Scanner class is final, so overriding its methods is not available.
	 *
	 * <p>This is a draft implementation of a solution where the whole text is split into parts, each
	 * shorter than the buffer and cut at the delimiter nearest to the end of the part, and every part
	 * is scanned separately. In this example each found lexeme is replaced with its upper-case form
	 * enclosed in {@code "<>"}.
	 */
	public class LargeStringScanner {
	  /** Size of the inner buffer of java.util.Scanner; every scanned part is kept shorter. */
	  public static int SCANNER_INNER_BUFFER = 1024;

	  /** Lexeme delimiter regex. */
	  public static String DELIMITER = "\\b";

	  /**
	   * Rebuilds {@code body} with every lexeme replaced by {@link #doSmthWithLexeme(String)}.
	   * Text matched by {@link #DELIMITER} between lexemes is copied through unchanged.
	   *
	   * @param body text to process; may be {@code null} or empty
	   * @return the processed text; {@code body} itself when it is {@code null} or empty
	   */
	  public String enhance(String body) {
	    if (body == null || body.isEmpty()) {
	      return body;
	    }

	    java.util.regex.Pattern delimiter = java.util.regex.Pattern.compile(DELIMITER);
	    // Every part must be strictly shorter than the scanner buffer, yet never empty,
	    // otherwise the loop below would not advance.
	    int maxPartLength = Math.max(1, SCANNER_INNER_BUFFER - 1);
	    StringBuilder enhancedBody = new StringBuilder(body.length());

	    int bodyPartStartPosition = 0;
	    while (bodyPartStartPosition < body.length()) {
	      int bodyPartEndPosition;
	      if (body.length() - bodyPartStartPosition <= maxPartLength) {
	        // The remainder fits into a single part: no need to look for a cut point.
	        bodyPartEndPosition = body.length();
	      } else {
	        bodyPartEndPosition = findLastDelimiterInSubString(
	            bodyPartStartPosition, bodyPartStartPosition + maxPartLength, body, delimiter);
	      }

	      enhancePart(
	          body.substring(bodyPartStartPosition, bodyPartEndPosition), delimiter, enhancedBody);
	      bodyPartStartPosition = bodyPartEndPosition;
	    }

	    return enhancedBody.toString();
	  }

	  /** Scans one part with java.util.Scanner and appends its enhanced form to {@code out}. */
	  private void enhancePart(String part, java.util.regex.Pattern delimiter, StringBuilder out) {
	    java.util.Scanner scanner = new java.util.Scanner(part).useDelimiter(delimiter);
	    try {
	      int charsRead = 0;
	      while (scanner.hasNext()) {
	        String lexeme = scanner.next();
	        java.util.regex.MatchResult match = scanner.match();

	        // Copy whatever was skipped (delimiters) between the previous lexeme and this one.
	        out.append(part, charsRead, match.start());
	        out.append(doSmthWithLexeme(lexeme));
	        charsRead = match.end();
	      }
	      // Trailing delimiters after the last lexeme.
	      out.append(part, charsRead, part.length());
	    } finally {
	      scanner.close();
	    }
	  }

	  /**
	   * Finds the last position inside {@code (startPosition, endPosition)} at which the delimiter
	   * matches, so that a part can be cut there instead of in the middle of a lexeme.
	   *
	   * @return the position of the last delimiter match, or {@code endPosition} when there is none
	   *     (the part is then cut at {@code endPosition}, possibly inside a lexeme)
	   */
	  private int findLastDelimiterInSubString(
	      int startPosition, int endPosition, String largeString, java.util.regex.Pattern delimiter) {
	    java.util.regex.Matcher matcher = delimiter.matcher(largeString);
	    // Zero-width delimiters such as \b must see the characters around the region start.
	    matcher.useTransparentBounds(true);
	    matcher.useAnchoringBounds(false);

	    for (int i = endPosition - 1; i > startPosition; i--) {
	      matcher.region(i, endPosition);
	      if (matcher.lookingAt()) {
	        return i;
	      }
	    }

	    return endPosition;
	  }

	  /**
	   * Transformation applied to every lexeme; subclasses may override it.
	   *
	   * @param lexeme a token found between two delimiter matches
	   * @return the lexeme in upper case (locale-independent) enclosed in {@code "<>"}
	   */
	  protected String doSmthWithLexeme(String lexeme) {
	    return "<" + lexeme.toUpperCase(java.util.Locale.ROOT) + ">";
	  }
	}
No results found