Created
January 13, 2011 06:34
-
-
Save osima/777493 to your computer and use it in GitHub Desktop.
Fix (R) and (TM) mojibake
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // | |
| // Fix (R) and (TM) mojibake | |
| // | |
/**
 * Repairs garbled registered-trademark and trademark signs in a string.
 *
 * A character is rewritten only when the character immediately before it
 * (in the original input) is an ASCII letter or '>':
 *   U+30A3 (katakana small I) becomes U+00AE (registered sign),
 *   U+30A7 (katakana small E) becomes U+2122 (trade mark sign).
 * Every other character is copied through unchanged. Each replacement is
 * reported on standard output together with up to ten characters of
 * preceding context.
 *
 * @param str the text to repair; must not be null
 * @return a copy of str with the garbled signs replaced
 */
String replace(String str){
    // Code points that stand in for the lost signs in the damaged text.
    final int brokenRegistered = 0x30A3
    final int brokenTrademark = 0x30A7

    // Up to ten characters that precede position `end`, used for log output.
    def context = { String text, int end ->
        text.substring(Math.max(0, end - 10), end)
    }

    def sb = new StringBuilder(str.length())
    for (int i = 0; i < str.length(); i++) {
        char current = str.charAt(i)
        String fixed = null
        // The first character has no predecessor, so it is never rewritten.
        if (i > 0 && str.substring(i - 1, i) ==~ /[a-zA-Z>]/) {
            int code = (int) current
            if (code == brokenRegistered) {
                println "found (R) -> (${context(str, i)})"
                fixed = '\u00ae'
            } else if (code == brokenTrademark) {
                println "found TM -> (${context(str, i)})"
                // Replace the garbled character; the old code appended the
                // sign and then also kept the garbled character after it.
                fixed = '\u2122'
            }
        }
        if (fixed != null) {
            sb.append(fixed)
        } else {
            sb.append(current)
        }
    }
    sb.toString()
}
// Script entry point: rewrites the file named by the first argument in place,
// reading and writing it with the MS932 charset.
if (args.length < 1) {
    System.err.println 'usage: pass the path of the file to fix as the first argument'
    System.exit(0)
}

def ENC = 'MS932'
def inf = new File(args[0])
def outf = inf   // output deliberately overwrites the input file

println "--- ${inf.name} ---"

// Build the repaired text BEFORE opening the writer: opening it truncates the
// file, so a failure during the repair must not be able to leave it empty.
def fixedText = replace(inf.getText(ENC))

// withWriter closes the writer even if the write throws.
outf.withWriter(ENC) { writer ->
    writer.write(fixedText)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment