Skip to content

Instantly share code, notes, and snippets.

@JLarky
Created September 18, 2013 19:18
Show Gist options
  • Save JLarky/6614151 to your computer and use it in GitHub Desktop.
Save JLarky/6614151 to your computer and use it in GitHub Desktop.
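html_entity_decode (named after the PHP function) for Erlang. For binaries: unicode:characters_to_binary(html_utils:html_entity_decode(binary_to_list(Text))). Note that binary_to_list/1 treats every byte as one Latin-1 character; for a UTF-8 encoded binary use unicode:characters_to_list(Text) in its place.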
-module(html_utils).
-export([html_entity_decode/1]).
%% -*- coding: utf-8 -*-
%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 2003-2012. All Rights Reserved.
%%
%% The contents of this file are subject to the Erlang Public License,
%% Version 1.1, (the "License"); you may not use this file except in
%% compliance with the License. You should have received a copy of the
%% Erlang Public License along with this software. If not, it can be
%% retrieved online at http://www.erlang.org/.
%%
%% Software distributed under the License is distributed on an "AS IS"
%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
%% the License for the specific language governing rights and limitations
%% under the License.
%%
%% %CopyrightEnd%
%%
%% based on xmerl_scan module
%% @doc Decode the character references found in `Text'.
%%
%% `Text' is a string (a list of character codes). The result is a string
%% in which each reference understood by the scanner has been replaced by
%% the character it denotes; every other character is copied as is.
-spec html_entity_decode(string()) -> string().
html_entity_decode(Text) ->
    %% The scan starts with nothing decoded yet.
    scan_content(Text, []).
%% Walk the input one character at a time, collecting output in reverse.
%%
%% First argument  - the part of the text not examined yet.
%% Second argument - characters produced so far, most recent first.
%%
%% Returns the produced characters in their natural order once the input
%% is exhausted.
scan_content([$& | AfterAmp], Decoded) ->
    %% An ampersand may open a reference: scan_reference/1 hands back the
    %% replacement characters together with the input that follows them.
    {Replacement, Remaining} = scan_reference(AfterAmp),
    scan_content(Remaining, Replacement ++ Decoded);
scan_content([Other | Remaining], Decoded) ->
    scan_content(Remaining, [Other | Decoded]);
scan_content([], Decoded) ->
    lists:reverse(Decoded).
%% Decode the reference that follows an ampersand.
%%
%% T is the input immediately after the `&'. Returns {Chars, Rest}: the
%% characters that replace the reference and the input left to scan.
%%
%%   "#x" or "#X", hex digits, ";"  -> hexadecimal character reference
%%   "#", decimal digits, ";"       -> decimal character reference
%%   anything else                  -> named entity (scan_entity_ref/1)
%%
%% A numeric reference that cannot be decoded (no digits, no terminating
%% `;', a stray character among the digits, or a code point rejected by
%% wfc_legal_char/1) is echoed: the result is {"&", T}, so the caller
%% copies the text through unchanged. This mirrors what scan_entity_ref/1
%% does for names it does not know, and keeps one malformed reference from
%% aborting the whole decode.
scan_reference("#x" ++ Digits = T) ->
    numeric_reference(fun() -> scan_char_ref_hex(Digits, 0) end, T);
scan_reference("#X" ++ Digits = T) ->
    %% HTML accepts an upper-case X as the hexadecimal marker as well.
    numeric_reference(fun() -> scan_char_ref_hex(Digits, 0) end, T);
scan_reference("#" ++ Digits = T) ->
    numeric_reference(fun() -> scan_char_ref_dec(Digits, []) end, T);
scan_reference(T) ->
    scan_entity_ref(T).

%% Run a numeric-reference scanner and echo the ampersand when it rejects
%% the input. Only the errors the scanners raise on malformed text are
%% caught; any other exception still propagates.
numeric_reference(Scan, T) ->
    try
        Scan()
    catch
        error:function_clause -> {"&", T};   % stray character or missing `;'
        error:badarg -> {"&", T};            % decimal reference without digits
        error:{badmatch, false} -> {"&", T}  % code point is not a legal character
    end.
%% Scan the digits of a decimal character reference.
%%
%% The first argument is the input being scanned; the second collects the
%% digits seen so far, most recent first. On reaching `;' the digits are
%% put back in order, converted to an integer, passed through
%% wfc_legal_char/1 and returned as {[Char], Rest}, where Rest is the
%% input after the `;'. No clause accepts any other character, so such
%% input raises function_clause.
scan_char_ref_dec([$; | Rest], RevDigits) ->
    CodePoint = list_to_integer(lists:reverse(RevDigits)),
    {[wfc_legal_char(CodePoint)], Rest};
scan_char_ref_dec([Digit | Rest], RevDigits) when $0 =< Digit, Digit =< $9 ->
    scan_char_ref_dec(Rest, [Digit | RevDigits]).
%% Scan the digits of a hexadecimal character reference.
%%
%% The first argument is the input being scanned; the second is the value
%% accumulated so far. Each digit (0-9, a-f or A-F) shifts the accumulator
%% left by four bits and fills the low nibble with the digit's value. On
%% reaching `;' the accumulated value is passed through wfc_legal_char/1
%% and returned as {[Char], Rest} -- a flat one-character string rather
%% than the nested [[Value]] form. No clause accepts any other character,
%% so such input raises function_clause.
scan_char_ref_hex([$; | Rest], Value) ->
    {[wfc_legal_char(Value)], Rest};
scan_char_ref_hex([C | Rest], Value) when $0 =< C, C =< $9 ->
    Nibble = C - $0,
    scan_char_ref_hex(Rest, (Value bsl 4) bor Nibble);
scan_char_ref_hex([C | Rest], Value) when $a =< C, C =< $f ->
    Nibble = 10 + (C - $a),
    scan_char_ref_hex(Rest, (Value bsl 4) bor Nibble);
scan_char_ref_hex([C | Rest], Value) when $A =< C, C =< $F ->
    Nibble = 10 + (C - $A),
    scan_char_ref_hex(Rest, (Value bsl 4) bor Nibble).
%% Legal-character check for a decoded code point.
%%
%% Hands Ch back unchanged when xmerl_lib:is_char/1 answers true. Any
%% other answer fails the match below, so the call raises badmatch
%% instead of returning.
wfc_legal_char(Ch) ->
    true = xmerl_lib:is_char(Ch),
    Ch.
%% Decode a named entity reference.
%%
%% The argument is the input right after the `&'. When it begins with one
%% of the five names handled here (amp, lt, gt, apos, quot) followed by
%% `;', the result is {Char, Rest}: the one-character replacement and the
%% input after the `;'. For anything else the ampersand itself is returned
%% together with the untouched input, so the text is echoed as written.
-spec scan_entity_ref(string()) -> {string(), string()}.
scan_entity_ref("amp;" ++ Rest) ->
    {"&", Rest};
scan_entity_ref("lt;" ++ Rest) ->
    {"<", Rest};
scan_entity_ref("gt;" ++ Rest) ->
    {">", Rest};
scan_entity_ref("apos;" ++ Rest) ->
    {"'", Rest};
scan_entity_ref("quot;" ++ Rest) ->
    {"\"", Rest};
scan_entity_ref(Unrecognised) ->
    %% Not a name we know: emit the ampersand and let the caller copy the
    %% remaining characters through unchanged.
    {"&", Unrecognised}.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment