Skip to content

Instantly share code, notes, and snippets.

@logicmd
Created July 31, 2012 08:59
Show Gist options
  • Save logicmd/3215212 to your computer and use it in GitHub Desktop.
Save logicmd/3215212 to your computer and use it in GitHub Desktop.
Text Normalizer
/*
* Text Normalizer
* Replaces punctuation and every other non-letter character with a space,
* collapses extra spacing, and converts the text to lower case.
*/
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Locale;
import java.util.regex.Pattern;
/**
 * Normalizes a text file line by line.
 *
 * <p>Every run of characters that are not ASCII letters (punctuation, digits,
 * whitespace, non-ASCII characters) is replaced by a single space and the
 * remaining letters are lower-cased. The result is written next to the input
 * file, with {@code -normalized} inserted before the file extension.
 */
public class TextNormalizer {

    /** Matches one or more consecutive characters that are not ASCII letters. */
    private static final Pattern NON_LETTER_RUN = Pattern.compile("[^a-zA-Z]+");

    /** Inserted before the extension of the input name to build the output name. */
    private static final String OUTPUT_SUFFIX = "-normalized";

    /** Input processed by {@link #main(String[])} when no arguments are given. */
    private static final String DEFAULT_INPUT = "data/ADanceWithDragons.txt";

    /**
     * Reads the file at {@code path}, normalizes each line and writes the lines,
     * each terminated by {@code '\n'}, to the sibling output file (for example
     * {@code book.txt} becomes {@code book-normalized.txt}). An existing output
     * file is overwritten, because {@link FileWriter} truncates it on open.
     *
     * <p>I/O failures are reported on standard error and are not propagated;
     * both streams are closed whether or not a failure occurs.
     *
     * @param path path of the text file to normalize
     * @throws NullPointerException if {@code path} is {@code null}
     */
    public void Process(String path) {
        File target = normalizedFileFor(path);
        // The reader is opened first, so a missing input never creates or
        // truncates the output file. try-with-resources closes (and thereby
        // flushes) both streams even when reading or writing fails midway.
        try (BufferedReader reader = new BufferedReader(new FileReader(path));
             BufferedWriter writer = new BufferedWriter(new FileWriter(target))) {
            String line;
            while ((line = reader.readLine()) != null) {
                writer.write(normalizeLine(line));
                // Always '\n', independent of the platform line separator.
                writer.write('\n');
            }
        } catch (IOException e) {
            // Best effort by design: report the failure and return normally.
            // FileNotFoundException is an IOException, so it is handled here too.
            System.err.println("Failed to normalize " + path + " into " + target);
            e.printStackTrace();
        }
    }

    /**
     * Builds the output file for {@code path} by inserting {@code -normalized}
     * before the extension. The extension starts at the last {@code '.'} that
     * follows the last path separator; when the file name contains no dot, the
     * suffix is appended to the whole path.
     */
    private static File normalizedFileFor(String path) {
        // Accept both separator styles so a dot inside a directory name
        // (e.g. "v1.2/README") is never mistaken for an extension.
        int lastSeparator = Math.max(path.lastIndexOf('/'), path.lastIndexOf('\\'));
        int dot = path.lastIndexOf('.');
        if (dot > lastSeparator) {
            return new File(path.substring(0, dot) + OUTPUT_SUFFIX + path.substring(dot));
        }
        return new File(path + OUTPUT_SUFFIX);
    }

    /**
     * Normalizes a single line: each run of non-letter characters becomes one
     * space and the letters are lower-cased. A run at the very start or end of
     * the line is also turned into a single space rather than removed.
     */
    private static String normalizeLine(String str) {
        // Locale.ROOT keeps the result independent of the default locale
        // (a Turkish locale would otherwise map 'I' to a dotless 'ı').
        return NON_LETTER_RUN.matcher(str).replaceAll(" ").toLowerCase(Locale.ROOT);
    }

    /**
     * Normalizes every file named on the command line, or the bundled default
     * input when no arguments are given.
     *
     * @param args paths of the files to normalize; may be empty
     */
    public static void main(String[] args) {
        TextNormalizer tn = new TextNormalizer();
        if (args.length == 0) {
            tn.Process(DEFAULT_INPUT);
            return;
        }
        for (String path : args) {
            tn.Process(path);
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment