Created
July 31, 2012 08:59
-
-
Save logicmd/3215212 to your computer and use it in GitHub Desktop.
Text Normalizer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Text Normalizer
 * Replaces every non-letter character with a space, collapses extra spacing,
 * and converts the text to lower case.
 */
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Locale;
import java.util.regex.Pattern;
/**
 * Normalizes a text file line by line: every character that is not an ASCII
 * letter is replaced by a space, runs of whitespace are collapsed to a single
 * space, and the result is lower-cased.
 *
 * <p>The normalized text is written next to the input file, with
 * {@code -normalized} inserted before the file extension
 * (e.g. {@code data/book.txt} becomes {@code data/book-normalized.txt}).
 */
public class TextNormalizer {

    /** Matches any single character that is not an ASCII letter. */
    private static final Pattern NON_LETTER = Pattern.compile("[^a-zA-Z]");

    /** Matches a run of two or more whitespace characters. */
    private static final Pattern EXTRA_WHITESPACE = Pattern.compile("\\s{2,}");

    /** Inserted before the extension of the input path to name the output file. */
    private static final String OUTPUT_SUFFIX = "-normalized";

    /**
     * Reads the file at {@code path}, normalizes each line and writes the
     * result to the sibling "-normalized" file, overwriting it if it exists.
     * Each output line is terminated with {@code "\n"}.
     *
     * <p>I/O failures are reported on standard error and are not propagated
     * to the caller.
     *
     * @param path path of the text file to normalize; must not be {@code null}
     * @throws NullPointerException if {@code path} is {@code null}
     */
    public void Process(String path) {
        if (path == null) {
            throw new NullPointerException("path");
        }
        File f = new File(outputPathFor(path));

        // try-with-resources closes both streams even when reading or writing
        // fails part-way; closing the writer also flushes it. The reader is
        // opened first so a missing input never truncates an existing output.
        try (BufferedReader br = new BufferedReader(new FileReader(path));
             BufferedWriter bw = new BufferedWriter(new FileWriter(f))) {
            String line;
            while ((line = br.readLine()) != null) {
                bw.write(Normalizer(line));
                bw.write('\n');
            }
        } catch (IOException e) {
            // FileNotFoundException is an IOException, so this covers both a
            // missing input and read/write failures.
            System.err.println("TextNormalizer: failed to normalize '" + path
                    + "' into '" + f.getPath() + "'");
            e.printStackTrace();
        }
    }

    /**
     * Builds the output path by inserting {@link #OUTPUT_SUFFIX} before the
     * extension of the file name, or appending it when the file name has no
     * extension. Only a dot located after the last path separator counts as
     * the start of an extension.
     */
    private static String outputPathFor(String path) {
        int lastSeparator = Math.max(path.lastIndexOf('/'), path.lastIndexOf('\\'));
        int lastDot = path.lastIndexOf('.');
        if (lastDot > lastSeparator) {
            return path.substring(0, lastDot) + OUTPUT_SUFFIX + path.substring(lastDot);
        }
        return path + OUTPUT_SUFFIX;
    }

    /**
     * Normalizes a single line.
     *
     * <p>Leading or trailing non-letters leave a single leading or trailing
     * space; the line is not trimmed.
     *
     * @param str the line to normalize
     * @return the line with non-letters replaced by spaces, whitespace runs
     *         collapsed to one space, and all letters lower-cased
     */
    private String Normalizer(String str) {
        /*
         * An earlier attempt used "[^(\\w|\\s)]"; inside a character class the
         * parentheses and '|' are literal characters, so that class was not the
         * complement of "word or space". "[^a-zA-Z]" states the intent directly.
         */
        String lettersOnly = NON_LETTER.matcher(str).replaceAll(" ");
        String singleSpaced = EXTRA_WHITESPACE.matcher(lettersOnly).replaceAll(" ");
        // Locale.ROOT keeps the result in plain a-z regardless of the default
        // locale (e.g. Turkish would otherwise map 'I' to dotless 'ı').
        return singleSpaced.toLowerCase(Locale.ROOT);
    }

    /** Normalizes the sample text shipped under {@code data/}. */
    public static void main(String[] args) {
        TextNormalizer tn = new TextNormalizer();
        tn.Process("data/ADanceWithDragons.txt");
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment