Skip to content

Instantly share code, notes, and snippets.

@fabiolimace
Last active May 10, 2025 12:45
Show Gist options
  • Save fabiolimace/495bd143e9add13a4656a52228d5bfe3 to your computer and use it in GitHub Desktop.
Save fabiolimace/495bd143e9add13a4656a52228d5bfe3 to your computer and use it in GitHub Desktop.
Corretor de caracteres especiais - ISO-8859-1
<!DOCTYPE html>
<html>
<!-- Yes, I know the jokes about long if-else chains. I just don't feel like using a hash table here. -->
<head>
<title>Corretor de Caracteres Especiais - v20220721</title>
<meta charset="UTF-8">
<meta name="author" content="Fabio Lima">
<meta name="keywords" content="Codificação, Caracteres, ISO-8859-1, UTF-8, WINDOWS-1252">
<meta name="description" content="Substitui caracteres inválidos por caracteres equivalentes">
<script>
// Lookup table of [named HTML entity, replacement character] pairs.
// replaceEntities() compares each named entity found in the text with the
// first element of a pair and substitutes the second element for it.
var entities = [["&amp;", "\u0026"], ["&lt;", "\u003C"], ["&gt;", "\u003E"], ["&nbsp;", "\u00A0"], ["&iexcl;", "\u00A1"], ["&cent;", "\u00A2"], ["&pound;", "\u00A3"], ["&curren;", "\u00A4"], ["&yen;", "\u00A5"], ["&brvbar;", "\u00A6"], ["&sect;", "\u00A7"], ["&uml;", "\u00A8"], ["&copy;", "\u00A9"], ["&ordf;", "\u00AA"], ["&laquo;", "\u00AB"], ["&not;", "\u00AC"], ["&shy;", "\u00AD"], ["&reg;", "\u00AE"], ["&macr;", "\u00AF"], ["&deg;", "\u00B0"], ["&plusmn;", "\u00B1"], ["&sup2;", "\u00B2"], ["&sup3;", "\u00B3"], ["&acute;", "\u00B4"], ["&micro;", "\u00B5"], ["&para;", "\u00B6"], ["&middot;", "\u00B7"], ["&cedil;", "\u00B8"], ["&sup1;", "\u00B9"], ["&ordm;", "\u00BA"], ["&raquo;", "\u00BB"], ["&frac14;", "\u00BC"], ["&frac12;", "\u00BD"], ["&frac34;", "\u00BE"], ["&iquest;", "\u00BF"], ["&Agrave;", "\u00C0"], ["&Aacute;", "\u00C1"], ["&Acirc;", "\u00C2"], ["&Atilde;", "\u00C3"], ["&Auml;", "\u00C4"], ["&Aring;", "\u00C5"], ["&AElig;", "\u00C6"], ["&Ccedil;", "\u00C7"], ["&Egrave;", "\u00C8"], ["&Eacute;", "\u00C9"], ["&Ecirc;", "\u00CA"], ["&Euml;", "\u00CB"], ["&Igrave;", "\u00CC"], ["&Iacute;", "\u00CD"], ["&Icirc;", "\u00CE"], ["&Iuml;", "\u00CF"], ["&ETH;", "\u00D0"], ["&Ntilde;", "\u00D1"], ["&Ograve;", "\u00D2"], ["&Oacute;", "\u00D3"], ["&Ocirc;", "\u00D4"], ["&Otilde;", "\u00D5"], ["&Ouml;", "\u00D6"], ["&times;", "\u00D7"], ["&Oslash;", "\u00D8"], ["&Ugrave;", "\u00D9"], ["&Uacute;", "\u00DA"], ["&Ucirc;", "\u00DB"], ["&Uuml;", "\u00DC"], ["&Yacute;", "\u00DD"], ["&THORN;", "\u00DE"], ["&szlig;", "\u00DF"], ["&agrave;", "\u00E0"], ["&aacute;", "\u00E1"], ["&acirc;", "\u00E2"], ["&atilde;", "\u00E3"], ["&auml;", "\u00E4"], ["&aring;", "\u00E5"], ["&aelig;", "\u00E6"], ["&ccedil;", "\u00E7"], ["&egrave;", "\u00E8"], ["&eacute;", "\u00E9"], ["&ecirc;", "\u00EA"], ["&euml;", "\u00EB"], ["&igrave;", "\u00EC"], ["&iacute;", "\u00ED"], ["&icirc;", "\u00EE"], ["&iuml;", "\u00EF"], ["&eth;", "\u00F0"], ["&ntilde;", "\u00F1"], ["&ograve;", "\u00F2"], ["&oacute;", 
"\u00F3"], ["&ocirc;", "\u00F4"], ["&otilde;", "\u00F5"], ["&ouml;", "\u00F6"], ["&divide;", "\u00F7"], ["&oslash;", "\u00F8"], ["&ugrave;", "\u00F9"], ["&uacute;", "\u00FA"], ["&ucirc;", "\u00FB"], ["&uuml;", "\u00FC"], ["&yacute;", "\u00FD"], ["&thorn;", "\u00FE"], ["&yuml;", "\u00FF"], ["&fnof;", "\u0192"], ["&Alpha;", "\u0391"], ["&Beta;", "\u0392"], ["&Gamma;", "\u0393"], ["&Delta;", "\u0394"], ["&Epsilon;", "\u0395"], ["&Zeta;", "\u0396"], ["&Eta;", "\u0397"], ["&Theta;", "\u0398"], ["&Iota;", "\u0399"], ["&Kappa;", "\u039A"], ["&Lambda;", "\u039B"], ["&Mu;", "\u039C"], ["&Nu;", "\u039D"], ["&Xi;", "\u039E"], ["&Omicron;", "\u039F"], ["&Pi;", "\u03A0"], ["&Rho;", "\u03A1"], ["&Sigma;", "\u03A3"], ["&Tau;", "\u03A4"], ["&Upsilon;", "\u03A5"], ["&Phi;", "\u03A6"], ["&Chi;", "\u03A7"], ["&Psi;", "\u03A8"], ["&Omega;", "\u03A9"], ["&alpha;", "\u03B1"], ["&beta;", "\u03B2"], ["&gamma;", "\u03B3"], ["&delta;", "\u03B4"], ["&epsilon;", "\u03B5"], ["&zeta;", "\u03B6"], ["&eta;", "\u03B7"], ["&theta;", "\u03B8"], ["&iota;", "\u03B9"], ["&kappa;", "\u03BA"], ["&lambda;", "\u03BB"], ["&mu;", "\u03BC"], ["&nu;", "\u03BD"], ["&xi;", "\u03BE"], ["&omicron;", "\u03BF"], ["&pi;", "\u03C0"], ["&rho;", "\u03C1"], ["&sigmaf;", "\u03C2"], ["&sigma;", "\u03C3"], ["&tau;", "\u03C4"], ["&upsilon;", "\u03C5"], ["&phi;", "\u03C6"], ["&chi;", "\u03C7"], ["&psi;", "\u03C8"], ["&omega;", "\u03C9"], ["&thetasym;", "\u03D1"], ["&upsih;", "\u03D2"], ["&piv;", "\u03D6"], ["&bull;", "\u2022"], ["&hellip;", "\u2026"], ["&prime;", "\u2032"], ["&Prime;", "\u2033"], ["&oline;", "\u203E"], ["&frasl;", "\u2044"], ["&weierp;", "\u2118"], ["&image;", "\u2111"], ["&real;", "\u211C"], ["&trade;", "\u2122"], ["&alefsym;", "\u2135"], ["&larr;", "\u2190"], ["&uarr;", "\u2191"], ["&rarr;", "\u2192"], ["&darr;", "\u2193"], ["&harr;", "\u2194"], ["&crarr;", "\u21B5"], ["&lArr;", "\u21D0"], ["&uArr;", "\u21D1"], ["&rArr;", "\u21D2"], ["&dArr;", "\u21D3"], ["&hArr;", "\u21D4"], ["&forall;", "\u2200"], 
["&part;", "\u2202"], ["&exist;", "\u2203"], ["&empty;", "\u2205"], ["&nabla;", "\u2207"], ["&isin;", "\u2208"], ["&notin;", "\u2209"], ["&ni;", "\u220B"], ["&prod;", "\u220F"], ["&sum;", "\u2211"], ["&minus;", "\u2212"], ["&lowast;", "\u2217"], ["&radic;", "\u221A"], ["&prop;", "\u221D"], ["&infin;", "\u221E"], ["&ang;", "\u2220"], ["&and;", "\u2227"], ["&or;", "\u2228"], ["&cap;", "\u2229"], ["&cup;", "\u222A"], ["&int;", "\u222B"], ["&there4;", "\u2234"], ["&sim;", "\u223C"], ["&cong;", "\u2245"], ["&asymp;", "\u2248"], ["&ne;", "\u2260"], ["&equiv;", "\u2261"], ["&le;", "\u2264"], ["&ge;", "\u2265"], ["&sub;", "\u2282"], ["&sup;", "\u2283"], ["&nsub;", "\u2284"], ["&sube;", "\u2286"], ["&supe;", "\u2287"], ["&oplus;", "\u2295"], ["&otimes;", "\u2297"], ["&perp;", "\u22A5"], ["&sdot;", "\u22C5"], ["&lceil;", "\u2308"], ["&rceil;", "\u2309"], ["&lfloor;", "\u230A"], ["&rfloor;", "\u230B"], ["&lang;", "\u2329"], ["&rang;", "\u232A"], ["&loz;", "\u25CA"], ["&spades;", "\u2660"], ["&clubs;", "\u2663"], ["&hearts;", "\u2665"], ["&diams;", "\u2666"]];
// Wraps a replacement string in a <span> whose id is "index" followed by the
// given position, so that a message can later locate it by id.
//   chr   - text to place inside the span
//   index - position used to build the span id
// Returns the span markup as a string.
function getAnchor(chr, index) {
  const id = `index${index}`;
  return `<span id="${id}">${chr}</span>`;
}
// Appends one clickable line to the "mensagens" box describing a replacement.
//   chr   - the original character (or a label for it)
//   subst - what it was replaced with
//   index - position of the replacement; the link targets the element whose
//           id is "index" + index
// Clicking the link calls jump() and selectText() on that element.
function mensagem(chr, subst, index) {
  const anchor = `index${index}`;
  const texto = `Substituido o caractere [${chr}] por [${subst}]`;
  const href = `javascript:jump('${anchor}'); selectText('${anchor}'); `;
  const elemento = document.getElementById("mensagens");
  // The link is closed with </a>; the previous </span> left the <a> element
  // unterminated and relied on the HTML parser's error recovery.
  elemento.innerHTML += `<br/><a href="${href}" >${texto}</a>`;
}
// Empties the "mensagens" box so a new run starts with no messages.
function limparMensagens() {
  document.getElementById("mensagens").innerHTML = "";
}
// Selects (highlights) the element with the given id.
//   containerid - id of the element to select
// Does nothing when no element has that id (for example after the user
// edited the text and removed the anchor).
function selectText(containerid) {
  const alvo = document.getElementById(containerid);
  if (!alvo) {
    return;
  }
  if (document.selection) {
    // Legacy branch for browsers exposing document.selection.
    const range = document.body.createTextRange();
    range.moveToElementText(alvo);
    range.select();
  } else if (window.getSelection) {
    const selection = window.getSelection();
    const range = document.createRange();
    range.selectNode(alvo);
    // Drop the current selection first: browsers that keep a single range
    // ignore addRange() while another range is still selected.
    selection.removeAllRanges();
    selection.addRange(range);
  }
}
// Navigates the page to the fragment "#<anchor>".
//   anchor - id of the element to scroll to
function jump(anchor) {
  const fragment = `#${anchor}`;
  window.location.href = fragment;
}
// Escapes text so it can be inserted into HTML and shown literally.
//   text - raw string
// Returns the string with &, < and > replaced by their entities.
// The ampersand is escaped first so the entities produced for < and > are
// not escaped a second time.
function escape(text) {
  return text
    .replaceAll('&', '&amp;')
    .replaceAll('<', '&lt;')
    .replaceAll('>', '&gt;');
}
// Removes every HTML comment from the markup, such as those of Outlook:
//   <!--[if gte mso 9]><![endif]-->
// and reports each removed comment in the "mensagens" box.
//   html - markup to clean
// Returns the markup without comments (unchanged when it has none).
function removeComments(html) {
  // Lazy match up to the first "-->", across newlines. This also catches
  // comments whose text contains "->", which the previous pattern skipped
  // even though they were reported as removed.
  const padrao = /<!--[\s\S]*?-->/g;
  const comentarios = html.match(padrao);
  if (!comentarios) {
    return html;
  }
  let relatorio = '';
  for (const comentario of comentarios) {
    // Preview: the opening "<!--" plus at most 15 characters of the same line.
    const inicio = comentario.match(/<!--.{0,15}/)[0];
    relatorio += '<br/><span>Removido comentário HTML: ' + escape(inicio) + '...</span>';
  }
  // Single DOM write instead of re-parsing the box once per comment.
  document.getElementById("mensagens").innerHTML += relatorio;
  return html.replace(padrao, '');
}
// Appends a line to the "mensagens" box reporting that an HTML entity was
// replaced.
//   entity - the entity as found in the text; its first and last characters
//            are dropped before display
//   subst  - the replacement text
function replaceEntitiesMessage(entity, subst) {
  const nome = entity.substring(1, entity.length - 1);
  const linha = `<br/><span>Substituido o HTML entity [${nome}] por [${subst}]</span>`;
  document.getElementById("mensagens").innerHTML += linha;
}
// Replaces HTML entities in the markup with the characters they stand for and
// reports each distinct entity once through replaceEntitiesMessage().
//   html - markup that may contain named (&eacute;), decimal (&#233;) or
//          hexadecimal (&#xE9;) entities
// Returns the markup with every recognised entity substituted. Unknown names
// and invalid numeric references are left untouched.
function replaceEntities(html) {
  const nomeadas = new Map(entities);
  const reportadas = new Set();

  // A single pass over the text: the output of one substitution is never
  // scanned again, so "&amp;#65;" becomes "&#65;" and is not decoded twice.
  return html.replace(/&#?\w+;/g, (m) => {
    let chr;
    if (m.charAt(1) === '#') {
      const corpo = m.substring(2, m.length - 1);
      const hexadecimal = /^[xX][0-9a-fA-F]+$/.test(corpo);
      if (!hexadecimal && !/^[0-9]+$/.test(corpo)) {
        return m;
      }
      const codigo = hexadecimal
        ? Number.parseInt(corpo.substring(1), 16)
        : Number.parseInt(corpo, 10);
      // Reject NUL, surrogates and values past the last Unicode code point;
      // String.fromCodePoint would throw or yield an unpaired surrogate.
      const invalido = codigo < 1 || codigo > 0x10FFFF || (codigo >= 0xD800 && codigo <= 0xDFFF);
      if (invalido) {
        return m;
      }
      chr = String.fromCodePoint(codigo);
    } else {
      chr = nomeadas.get(m);
      if (chr === undefined) {
        return m;
      }
    }
    if (!reportadas.has(m)) {
      reportadas.add(m);
      replaceEntitiesMessage(m, chr);
    }
    return chr;
  });
}
// Entry point of the "Corrigir" button. Reads the markup of the "texto" box,
// strips comments and entities, then rewrites every character that is outside
// Latin-1 (or is a C1 control, NBSP or soft hyphen) with an equivalent wrapped
// in an anchor span, logging each change in the "mensagens" box.
// NOTE(review): the loop walks the serialized innerHTML, so characters inside
// tags and attribute values are processed as well.
function corrigir() {
  limparMensagens();
  const elemento = document.getElementById("texto");
  let html = removeComments(elemento.innerHTML);
  html = replaceEntities(html);

  const partes = [];
  // True while the last emitted part is the previous character copied as-is;
  // only then may a combining mark take that character back.
  let anteriorLiteral = false;

  // Emits the anchored replacement and logs it. The message receives the
  // plain replacement, not the anchor markup, so the anchor id exists only
  // once in the document.
  const substituir = (original, subst, index) => {
    partes.push(getAnchor(subst, index));
    mensagem(original, subst, index);
    anteriorLiteral = false;
  };

  for (let i = 0; i < html.length; i++) {
    let c = html.charAt(i);
    // Anything above Basic Latin and Latin-1 Supplement
    if (c > '\u00FF') {
      // Combining Diacritical Marks
      if (c >= '\u0300' && c <= '\u036F') {
        const x = html.charAt(i - 1);
        if (anteriorLiteral) {
          // The base letter is merged into the composed character.
          partes.pop();
        }
        substituir(x + ' ' + c, translateDiacritic(x, c), i);
      }
      // General Punctuation
      else if (c >= '\u2000' && c <= '\u206F') {
        substituir(c, translateGeneralPunctuation(c), i);
      }
      // Other characters
      else {
        // Treat a surrogate pair as one character so it yields one
        // replacement instead of two broken halves.
        const seguinte = html.charAt(i + 1);
        const par = c >= '\uD800' && c <= '\uDBFF' && seguinte >= '\uDC00' && seguinte <= '\uDFFF';
        if (par) {
          c += seguinte;
        }
        substituir(c, '\u00BF', i); // INVERTED QUESTION MARK
        if (par) {
          i++;
        }
      }
    }
    // C1 Control
    else if (c >= '\u0080' && c <= '\u009F') {
      substituir(c, translateC1(c), i);
    }
    // NO-BREAK SPACE (NBSP)
    else if (c === '\u00A0') {
      substituir("nbsp", ' ', i);
    }
    // SOFT HYPHEN (SHY)
    else if (c === '\u00AD') {
      substituir("shy", '-', i);
    }
    else {
      partes.push(c);
      anteriorLiteral = true;
    }
  }

  elemento.innerHTML = partes.join('');
  alert("Finalizado. Confira as alterações.");
}
// Combines a base letter with a combining diacritical mark into the single
// precomposed character, using a lookup table keyed by the mark.
//   letter    - the base letter that precedes the mark
//   diacritic - the combining mark (U+0300, U+0301, U+0302, U+0303, U+0308,
//               U+0327 or U+030A)
// Returns the precomposed character, or an inverted question mark (U+00BF)
// when the pair has no entry in the table.
function translateDiacritic(letter, diacritic) {
  const compostos = new Map([
    // COMBINING GRAVE ACCENT
    ['\u0300', {
      a: '\u00E0', e: '\u00E8', i: '\u00EC', o: '\u00F2', u: '\u00F9',
      A: '\u00C0', E: '\u00C8', I: '\u00CC', O: '\u00D2', U: '\u00D9',
    }],
    // COMBINING ACUTE ACCENT (includes LATIN LETTER Y WITH ACUTE)
    ['\u0301', {
      a: '\u00E1', e: '\u00E9', i: '\u00ED', o: '\u00F3', u: '\u00FA',
      A: '\u00C1', E: '\u00C9', I: '\u00CD', O: '\u00D3', U: '\u00DA',
      y: '\u00FD', Y: '\u00DD',
    }],
    // COMBINING CIRCUMFLEX ACCENT
    ['\u0302', {
      a: '\u00E2', e: '\u00EA', i: '\u00EE', o: '\u00F4', u: '\u00FB',
      A: '\u00C2', E: '\u00CA', I: '\u00CE', O: '\u00D4', U: '\u00DB',
    }],
    // COMBINING TILDE
    ['\u0303', {
      a: '\u00E3', o: '\u00F5', n: '\u00F1',
      A: '\u00C3', O: '\u00D5', N: '\u00D1',
    }],
    // COMBINING DIAERESIS (includes LATIN SMALL LETTER Y WITH DIAERESIS)
    ['\u0308', {
      a: '\u00E4', e: '\u00EB', i: '\u00EF', o: '\u00F6', u: '\u00FC',
      A: '\u00C4', E: '\u00CB', I: '\u00CF', O: '\u00D6', U: '\u00DC',
      y: '\u00FF',
    }],
    // COMBINING CEDILLA
    ['\u0327', { c: '\u00E7', C: '\u00C7' }],
    // COMBINING RING ABOVE
    ['\u030A', { a: '\u00E5', A: '\u00C5' }],
  ]);

  const porLetra = compostos.get(diacritic);
  // Own-property check so inherited names such as "toString" never match.
  if (porLetra !== undefined && Object.prototype.hasOwnProperty.call(porLetra, letter)) {
    return porLetra[letter];
  }
  return '\u00BF'; // INVERTED QUESTION MARK
}
// Maps a character from Unicode's General Punctuation block to a plain
// substitute.
//   punctuation - the character to translate
// Returns the substitute, or an inverted question mark (U+00BF) when the
// character is not listed below.
function translateGeneralPunctuation(punctuation) {
  switch (punctuation) {
    case '\u2010': // HYPHEN
    case '\u2011': // NON-BREAKING HYPHEN
    case '\u2012': // FIGURE DASH (used to separate digits in telephone numbers)
    case '\u2013': // EN DASH (used, for instance, to indicate a range)
    case '\u2023': // TRIANGULAR BULLET
      return '-';
    case '\u2014': // EM DASH (parenthetical thought, break, or emphasis)
    case '\u2015': // HORIZONTAL BAR (introduces quoted text)
      return '--';
    case '\u2018': // LEFT SINGLE QUOTATION MARK
    case '\u2019': // RIGHT SINGLE QUOTATION MARK
    case '\u201A': // SINGLE LOW-9 QUOTATION MARK
    case '\u201B': // SINGLE HIGH-REVERSED-9 QUOTATION MARK
      return '\'';
    case '\u201C': // LEFT DOUBLE QUOTATION MARK
    case '\u201D': // RIGHT DOUBLE QUOTATION MARK
    case '\u201E': // DOUBLE LOW-9 QUOTATION MARK
    case '\u201F': // DOUBLE HIGH-REVERSED-9 QUOTATION MARK
      return '\"';
    case '\u2022': // BULLET
      return '*';
    case '\u2024': // ONE DOT LEADER
      return '.';
    case '\u2025': // TWO DOT LEADER
      return '..';
    case '\u2026': // HORIZONTAL ELLIPSIS
      return '...';
    case '\u2027': // HYPHENATION POINT
      return '\u00B7'; // MIDDLE DOT
    default:
      return '\u00BF'; // INVERTED QUESTION MARK
  }
}
// Translates a code point from the C1 control range (U+0080 to U+009F) to an
// ASCII approximation of the glyph WINDOWS-1252 assigns to that byte.
//   punctuation - the character to translate
// Returns the approximation, or an inverted question mark (U+00BF) when the
// byte has no entry below.
function translateC1(punctuation) {
  // Comments name the WINDOWS-1252 character for each byte.
  const aproximacoes = new Map([
    ['\u0080', 'EUR'], // EURO SIGN (was wrongly keyed on U+0081)
    ['\u0082', '\''],  // SINGLE LOW-9 QUOTATION MARK
    ['\u0083', 'f'],   // LATIN SMALL LETTER F WITH HOOK
    ['\u0084', '\"'],  // DOUBLE LOW-9 QUOTATION MARK
    ['\u0085', '...'], // HORIZONTAL ELLIPSIS
    ['\u0088', '^'],   // MODIFIER LETTER CIRCUMFLEX ACCENT
    ['\u008A', 'S'],   // LATIN CAPITAL LETTER S WITH CARON
    ['\u008C', 'OE'],  // LATIN CAPITAL LIGATURE OE
    ['\u008E', 'Z'],   // LATIN CAPITAL LETTER Z WITH CARON
    ['\u0091', '\''],  // LEFT SINGLE QUOTATION MARK
    ['\u0092', '\''],  // RIGHT SINGLE QUOTATION MARK
    ['\u0093', '\"'],  // LEFT DOUBLE QUOTATION MARK
    ['\u0094', '\"'],  // RIGHT DOUBLE QUOTATION MARK
    ['\u0095', '*'],   // BULLET
    ['\u0096', '-'],   // EN DASH
    ['\u0097', '--'],  // EM DASH
    ['\u0098', '~'],   // SMALL TILDE
    ['\u0099', 'TM'],  // TRADE MARK SIGN
    ['\u009A', 's'],   // LATIN SMALL LETTER S WITH CARON
    ['\u009C', 'oe'],  // LATIN SMALL LIGATURE OE
    ['\u009E', 'z'],   // LATIN SMALL LETTER Z WITH CARON
    ['\u009F', 'Y'],   // LATIN CAPITAL LETTER Y WITH DIAERESIS
  ]);
  // Bytes without an entry (0x81, 0x86, 0x87, 0x89, 0x8B, 0x8D, 0x8F, 0x90,
  // 0x9B, 0x9D) fall back to the inverted question mark.
  return aproximacoes.has(punctuation) ? aproximacoes.get(punctuation) : '\u00BF';
}
</script>
</head>
<body>
<h1>Corretor de caracteres especiais</h1>
<p>Esta ferramenta procura caracteres inválidos para a codificação ISO-8859-1 e os substitui por caracteres
compatíveis.</p>
<p>Siga os passos abaixo para corrigir o texto com caracteres especiais:</p>
<ol>
<li>Cole o texto na caixa de texto abaixo;</li>
<li>Clique no botão "Corrigir";</li>
<li>Clique em cada item da caixa "Alterações realizadas" para conferir as correções;</li>
<li>Para finalizar, copie o texto corrigido para o destino desejado.</li>
</ol>
<div style="min-height:200px; border: solid 1px;" id="texto" contenteditable="true"></div>
<button type="button" onclick="corrigir();">Corrigir</button>
<br>
<br>
<span style="color: red; font-weight: bold; font-size: 150%;">Alterações realizadas</span><br>
<span>(Clique em uma mensagem abaixo para ir para a linha correspondente)</span> <br>
<div id="mensagens" style="min-height:100px; border: dashed 1px;" contenteditable="false"></div>
</body>
</html>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment