Skip to content

Instantly share code, notes, and snippets.

@lidemin
Created May 16, 2014 03:17
Show Gist options
  • Save lidemin/2b0b48285a3002077c28 to your computer and use it in GitHub Desktop.
Save lidemin/2b0b48285a3002077c28 to your computer and use it in GitHub Desktop.
HtmlManipulator
package com.damon.utils;
/**
* Created by damon on 16/5/14.
*/
import java.util.HashMap;
import java.util.Map;
/**
* Perform HTML-related operations.
*/
public final class HtmlManipulator {
/**
* Table of HTML entities obtained from http://www.w3.org/TR/html401/sgml/entities.html
* - Formatted as a series of space-delimited triplets (entity_name,entity_value,Unicode_value),
* e.g. (nbsp,#160,00A0).
* - entity_name and entity_value have been stripped of the surrounding "&" and ";",
* e.g. "nbsp" instead of " ", "#160" instead of " ".
*/
private static final String RAW_HTML_ENTITY_TABLE =
"nbsp #160 00A0 iexcl #161 00A1 cent #162 00A2 pound #163 00A3 curren #164 00A4 yen #165 00A5 " +
"brvbar #166 00A6 sect #167 00A7 uml #168 00A8 copy #169 00A9 ordf #170 00AA laquo #171 00AB " +
"not #172 00AC shy #173 00AD reg #174 00AE macr #175 00AF deg #176 00B0 plusmn #177 00B1 " +
"sup2 #178 00B2 sup3 #179 00B3 acute #180 00B4 micro #181 00B5 para #182 00B6 middot #183 00B7 " +
"cedil #184 00B8 sup1 #185 00B9 ordm #186 00BA raquo #187 00BB frac14 #188 00BC frac12 #189 00BD " +
"frac34 #190 00BE iquest #191 00BF Agrave #192 00C0 Aacute #193 00C1 Acirc #194 00C2 Atilde #195 00C3 " +
"Auml #196 00C4 Aring #197 00C5 AElig #198 00C6 Ccedil #199 00C7 Egrave #200 00C8 Eacute #201 00C9 " +
"Ecirc #202 00CA Euml #203 00CB Igrave #204 00CC Iacute #205 00CD Icirc #206 00CE Iuml #207 00CF " +
"ETH #208 00D0 Ntilde #209 00D1 Ograve #210 00D2 Oacute #211 00D3 Ocirc #212 00D4 Otilde #213 00D5 " +
"Ouml #214 00D6 times #215 00D7 Oslash #216 00D8 Ugrave #217 00D9 Uacute #218 00DA Ucirc #219 00DB " +
"Uuml #220 00DC Yacute #221 00DD THORN #222 00DE szlig #223 00DF agrave #224 00E0 aacute #225 00E1 " +
"acirc #226 00E2 atilde #227 00E3 auml #228 00E4 aring #229 00E5 aelig #230 00E6 ccedil #231 00E7 " +
"egrave #232 00E8 eacute #233 00E9 ecirc #234 00EA euml #235 00EB igrave #236 00EC iacute #237 00ED " +
"icirc #238 00EE iuml #239 00EF eth #240 00F0 ntilde #241 00F1 ograve #242 00F2 oacute #243 00F3 " +
"ocirc #244 00F4 otilde #245 00F5 ouml #246 00F6 divide #247 00F7 oslash #248 00F8 ugrave #249 00F9 " +
"uacute #250 00FA ucirc #251 00FB uuml #252 00FC yacute #253 00FD thorn #254 00FE yuml #255 00FF " +
"fnof #402 0192 Alpha #913 0391 Beta #914 0392 Gamma #915 0393 Delta #916 0394 Epsilon #917 0395 " +
"Zeta #918 0396 Eta #919 0397 Theta #920 0398 Iota #921 0399 Kappa #922 039A Lambda #923 039B " +
"Mu #924 039C Nu #925 039D Xi #926 039E Omicron #927 039F Pi #928 03A0 Rho #929 03A1 " +
"Sigma #931 03A3 Tau #932 03A4 Upsilon #933 03A5 Phi #934 03A6 Chi #935 03A7 Psi #936 03A8 " +
"Omega #937 03A9 alpha #945 03B1 beta #946 03B2 gamma #947 03B3 delta #948 03B4 epsilon #949 03B5 " +
"zeta #950 03B6 eta #951 03B7 theta #952 03B8 iota #953 03B9 kappa #954 03BA lambda #955 03BB " +
"mu #956 03BC nu #957 03BD xi #958 03BE omicron #959 03BF pi #960 03C0 rho #961 03C1 " +
"sigmaf #962 03C2 sigma #963 03C3 tau #964 03C4 upsilon #965 03C5 phi #966 03C6 chi #967 03C7 " +
"psi #968 03C8 omega #969 03C9 thetasym #977 03D1 upsih #978 03D2 piv #982 03D6 bull #8226 2022 " +
"hellip #8230 2026 prime #8242 2032 Prime #8243 2033 oline #8254 203E frasl #8260 2044 weierp #8472 2118 " +
"image #8465 2111 real #8476 211C trade #8482 2122 alefsym #8501 2135 larr #8592 2190 uarr #8593 2191 " +
"rarr #8594 2192 darr #8595 2193 harr #8596 2194 crarr #8629 21B5 lArr #8656 21D0 uArr #8657 21D1 " +
"rArr #8658 21D2 dArr #8659 21D3 hArr #8660 21D4 forall #8704 2200 part #8706 2202 exist #8707 2203 " +
"empty #8709 2205 nabla #8711 2207 isin #8712 2208 notin #8713 2209 ni #8715 220B prod #8719 220F " +
"sum #8721 2211 minus #8722 2212 lowast #8727 2217 radic #8730 221A prop #8733 221D infin #8734 221E " +
"ang #8736 2220 and #8743 2227 or #8744 2228 cap #8745 2229 cup #8746 222A int #8747 222B " +
"there4 #8756 2234 sim #8764 223C cong #8773 2245 asymp #8776 2248 ne #8800 2260 equiv #8801 2261 " +
"le #8804 2264 ge #8805 2265 sub #8834 2282 sup #8835 2283 nsub #8836 2284 sube #8838 2286 " +
"supe #8839 2287 oplus #8853 2295 otimes #8855 2297 perp #8869 22A5 sdot #8901 22C5 lceil #8968 2308 " +
"rceil #8969 2309 lfloor #8970 230A rfloor #8971 230B lang #9001 2329 rang #9002 232A loz #9674 25CA " +
"spades #9824 2660 clubs #9827 2663 hearts #9829 2665 diams #9830 2666 " +
"quot #34 0022 amp #38 0026 lt #60 003C gt #62 003E OElig #338 0152 oelig #339 0153 " +
"Scaron #352 0160 Yuml #376 0178 circ #710 02C6 tilde #732 02DC ensp #8194 2002 emsp #8195 2003 " +
"thinsp #8201 2009 zwnj #8204 200C zwj #8205 200D lrm #8206 200E rlm #8207 200F ndash #8211 2013 " +
"mdash #8212 2014 lsquo #8216 2018 rsquo #8217 2019 sbquo #8218 201A ldquo #8220 201C rdquo #8221 201D " +
"bdquo #8222 201E dagger #8224 2020 Dagger #8225 2021 permil #8240 2030 lsaquo #8249 2039 rsaquo #8250 203A " +
"euro #8364 20AC xxxx #39 0027";
/**
* value given by RAW_HTML_ENTITY_TABLE.hashCode(), used to guard against accidental modification
*/
private static final int RAW_HTML_ENTITY_TABLE_HASHCODE = -136767775;
/**
* mapping: HTML entity ---> Unicode character
*/
private static final Map<String, Character> HTML_ENTITY_TO_UNICODE_MAP = new HashMap<String, Character>();
/**
* mapping: Unicode character ---> HTML entity
*/
private static final Map<Character, String> UNICODE_TO_HTML_ENTITY_MAP = new HashMap<Character, String>();
/**
* Static initialization block.
* Populates HTML_ENTITY_TO_UNICODE_MAP and UNICODE_TO_HTML_ENTITY_MAP.
*/
static {
/* check hash code of RAW_HTML_ENTITY_TABLE */
if (RAW_HTML_ENTITY_TABLE.hashCode() != RAW_HTML_ENTITY_TABLE_HASHCODE) {
throw new RuntimeException("(INTERNAL) Malformed HtmlManipulator.RAW_HTML_ENTITY_TABLE.");
}
/* populate HTML entity <---> Unicode character maps */
final String[] elements = RAW_HTML_ENTITY_TABLE.split("[\\s]++");
for (int i = 0; i < elements.length; i += 3) {
final char unicode = (char) Integer.parseInt(elements[i + 2], 16);
if (elements[i] != "xxxx") {
//filter out the bad html name
//some html code does not have html name
HTML_ENTITY_TO_UNICODE_MAP.put(elements[i], unicode);
}
HTML_ENTITY_TO_UNICODE_MAP.put(elements[i + 1], unicode);
UNICODE_TO_HTML_ENTITY_MAP.put(unicode, elements[i]);
}
}
/**
* Private constructor that should never be called.
*/
private HtmlManipulator() {
}
/**
* Replace HTML entities in a given string with their Unicode character representations.
*
* @param s input string
* @return string with HTML entities replaced
*/
public static String replaceHtmlEntities(
final String s) {
final StringBuilder t = new StringBuilder();
for (int i = 0, n = s.length(); i < n; i++) {
final char c = s.charAt(i);
if (c == '&') {
/* candidate HTML entity */
final int j = s.indexOf(';', i);
if (j >= 0) {
final Character unicode = HTML_ENTITY_TO_UNICODE_MAP.get(s.substring(i + 1, j));
if (unicode != null) {
/* insert Unicode representation */
t.append((char) unicode);
i = j; /* advance index */
continue;
}
}
}
/* treat as a literal character */
t.append(c);
}
return t.toString();
}
/**
* Quote a specified string as HTML, by replacing all special characters with their
* equivalent HTML entities.
*
* @param s input string
* @return string with special characters replaced
*/
public static String quoteHtml(
final String s) {
final StringBuilder t = new StringBuilder();
for (char c : s.toCharArray()) {
final String entity = UNICODE_TO_HTML_ENTITY_MAP.get(c);
if (entity == null) {
t.append(c);
} else {
t.append('&');
t.append(entity);
t.append(';');
}
}
return t.toString();
}
}
@lidemin
Copy link
Author

lidemin commented May 16, 2014

This class is used for convert all the HTML code to string

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment