Last active
January 13, 2022 13:09
-
-
Save mscalora/71ef68b804dd523c867087fe2300dd76 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env node | |
// Credit to Lenny Domnitser for the logic of this tool; see https://domnit.org/blog/2007/07/fix-encoding.html
//
// This tool fixes encoding errors in text files. In my case, a SQL backup file contained Latin-1 character
// sequences that appeared like: The “magic†of ’THings’
//
// Usage: fix_encodings.js <in-file> [<out-file>]
//
// Reverse lookup for the Windows-1252 characters that occupy 0x80–0x9F:
// key   = the Unicode character produced when that byte is decoded,
// value = the original byte, expressed as a single Latin-1 character.
const win2byte = {
  '\u20AC': '\x80', // euro sign
  '\u201A': '\x82', // single low-9 quotation mark
  '\u0192': '\x83', // latin small f with hook
  '\u201E': '\x84', // double low-9 quotation mark
  '\u2026': '\x85', // horizontal ellipsis
  '\u2020': '\x86', // dagger
  '\u2021': '\x87', // double dagger
  '\u02C6': '\x88', // modifier circumflex accent
  '\u2030': '\x89', // per mille sign
  '\u0160': '\x8A', // S with caron
  '\u2039': '\x8B', // single left angle quotation mark
  '\u0152': '\x8C', // ligature OE
  '\u017D': '\x8E', // Z with caron
  '\u2018': '\x91', // left single quotation mark
  '\u2019': '\x92', // right single quotation mark
  '\u201C': '\x93', // left double quotation mark
  '\u201D': '\x94', // right double quotation mark
  '\u2022': '\x95', // bullet
  '\u2013': '\x96', // en dash
  '\u2014': '\x97', // em dash
  '\u02DC': '\x98', // small tilde
  '\u2122': '\x99', // trade mark sign
  '\u0161': '\x9A', // s with caron
  '\u203A': '\x9B', // single right angle quotation mark
  '\u0153': '\x9C', // ligature oe
  '\u017E': '\x9E', // z with caron
  '\u0178': '\x9F', // Y with diaeresis
};
/**
 * Translate one character through the win2byte table.
 *
 * @param {string} s - the character to look up
 * @returns {string} the mapped value from win2byte when one exists,
 *   otherwise `s` itself, unchanged
 */
function getbyte(s) {
  return win2byte[s] || s;
}
/**
 * Stand-in for the legacy `[f(k) for (k in obj)]` array comprehension:
 * applies `pred` to every own enumerable key of `list` and collects the
 * results in key order.
 *
 * For a string, the keys are its indices as strings ("0", "1", ...).
 * Only own keys are visited, so enumerable properties added to a prototype
 * cannot leak into the result.
 *
 * @param {Object|string|null|undefined} list - value whose keys are visited
 * @param {function(string): *} pred - called with each key (and nothing else)
 * @returns {Array} the values returned by `pred`; empty for null/undefined
 */
function comp_in(list, pred) {
  if (list == null) {
    return [];
  }
  // Wrap pred so it receives only the key, not map()'s index/array extras.
  return Object.keys(list).map((key) => pred(key));
}
// Regex fragment for one trailing character of a mis-decoded sequence:
// either a raw character in U+0080–U+00BF or any key of win2byte.
const codes = `(?:[\\x80-\\xBF]|${comp_in(win2byte, (code) => code).join('|')})`;

// A lead character followed by the number of trailing characters it calls
// for: C2–DF takes one, E0–EF takes two, F0–F4 takes three.
const pat = new RegExp(
  [
    `[\\xC2-\\xDF]${codes}`,
    `[\\xE0-\\xEF]${codes}{2}`,
    `[\\xF0-\\xF4]${codes}{3}`,
  ].join('|'),
  'g'
);
/**
 * Repair one matched sequence: map every character after the first through
 * getbyte(), then reinterpret the resulting character codes as UTF-8 bytes.
 *
 * @param {string} s - the matched text (a lead character plus its trailing
 *   characters); extra arguments supplied by String.prototype.replace are
 *   ignored
 * @returns {string} the decoded text, or `s` unchanged when the
 *   reconstructed bytes are not valid UTF-8
 */
function sub(s) {
  const bytes = s[0] + Array.from(s.slice(1), (ch) => getbyte(ch)).join('');
  try {
    // escape() percent-encodes each code unit; decodeURIComponent() then
    // reads those percent-escapes as a UTF-8 byte sequence.
    return decodeURIComponent(escape(bytes));
  } catch (err) {
    // The pattern is looser than UTF-8 itself, so some matches cannot be
    // decoded. Leave that text alone instead of aborting the whole run.
    if (err instanceof URIError) {
      return s;
    }
    throw err;
  }
}
/**
 * Run the repair over a whole string: every match of `pat` is replaced by
 * the result of passing it to `sub`.
 *
 * @param {string} s - text to process
 * @returns {string} the text with all matches replaced
 */
function fix(s) {
  return s.replace(pat, sub);
}
const fs = require('fs');

// ANSI escape sequences for terminal text attributes.
const Reset = "\x1b[0m";
const Bright = "\x1b[1m";
const Dim = "\x1b[2m";
const Underscore = "\x1b[4m";
const Reverse = "\x1b[7m";

// ANSI escape sequences for foreground colours.
const FgBlack = "\x1b[30m";
const FgRed = "\x1b[31m";
const FgGreen = "\x1b[32m";
const FgYellow = "\x1b[33m";
const FgBlue = "\x1b[34m";
const FgMagenta = "\x1b[35m";
const FgCyan = "\x1b[36m";
const FgWhite = "\x1b[37m";
// ---- Command-line driver ----
// Reads <in-file>, runs fix() over its contents, and writes the result to
// <out-file> when given, otherwise to stdout. All status and error messages
// go to stderr so that stdout carries only the repaired text.

/**
 * Print a red error message on stderr and terminate with exit status 1.
 *
 * @param {string} message - text shown after the "Error: " prefix
 */
function fail(message) {
  process.stderr.write(`${FgRed}Error: ${message}\n${Reset}`);
  process.exit(1);
}

const infile = process.argv[2];
const outfile = process.argv[3];
let content = '';

if (!infile) {
  fail('expecting input file name');
}
if (!fs.existsSync(infile)) {
  fail(`input file "${infile}" does not exist`);
}

try {
  content = fs.readFileSync(infile, 'utf8');
} catch (err) {
  // readFileSync reports failure by throwing (permissions, directory, ...);
  // an empty string only means the file is empty, which is valid input.
  fail(`unable to read input file ${infile}: ${err.message}`);
}
process.stderr.write(`${FgCyan}Read ${content.length} characters\n${Reset}`);

const fixed = fix(content);

if (outfile) {
  try {
    fs.writeFileSync(outfile, fixed, 'utf8');
  } catch (err) {
    fail(`unable to write output file ${outfile}: ${err.message}`);
  }
} else {
  process.stdout.write(fixed);
}

// End with a newline and a colour reset so the terminal is not left green.
process.stderr.write(`${FgGreen}Success: ${fixed.length} characters written\n${Reset}`);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment