Skip to content

Instantly share code, notes, and snippets.

@yangsu
Created March 19, 2013 17:38
Show Gist options
  • Save yangsu/5198282 to your computer and use it in GitHub Desktop.
Save yangsu/5198282 to your computer and use it in GitHub Desktop.
This Erlang module exports a couple of routines for fetching a URL, either directly or through a proxy. Taken from http://www.erlang.org/article/8
-module(urlget).
% Hacked by Roland and Erik Aug 1997
%% Joe Armstrong
%% get_http(Fun, URL, Opts, Proxy, Timeout) ->
%%     {ok, {URL', Reply, Header, Body}} | {error, What}
%% URL' is the URL that was actually fetched
-export([test/1, direct/1, proxy/1, get_http/5]).
-import(lists, [reverse/1]).
%% test(N) -> Result
%%   Manual smoke tests against fixed URLs. Cases 1 and 2 fetch a page
%%   with direct/1; cases 3 and 4 fetch the same two pages with proxy/1.
test(1) ->
    JoePage = "http://www.ericsson.se/cslab/~joe",
    direct(JoePage);
test(2) ->
    ViasatPage = "http://www.viasat.se/index9.html",
    direct(ViasatPage);
test(3) ->
    JoePage = "http://www.ericsson.se/cslab/~joe",
    proxy(JoePage);
test(4) ->
    ViasatPage = "http://www.viasat.se/index9.html",
    proxy(ViasatPage).
%% direct(URL) -> {ok, {URL', Reply, Header, BodyString}} | Other
%%   Fetches URL without a proxy, passing a Timeout of 60000 to get_http/5
%%   and reporting progress through progress/2. On success the body binary
%%   is converted to a list; any other result of get_http/5 is returned
%%   unchanged.
direct(URL) ->
    Result = get_http(fun progress/2, URL, [], noproxy, 60000),
    case Result of
        {ok, {FetchedUrl, Status, Fields, BodyBin}} ->
            {ok, {FetchedUrl, Status, Fields, binary_to_list(BodyBin)}};
        _ ->
            Result
    end.
%% proxy(URL) -> {ok, {URL', Reply, Header, BodyString}} | Other
%%   Fetches URL through the hard-coded proxy {"proxy", 82}, passing a
%%   Timeout of 60000 to get_http/5 and reporting progress through
%%   progress/2. On success the body binary is converted to a list; any
%%   other result of get_http/5 is returned unchanged.
proxy(URL) ->
    Result = get_http(fun progress/2, URL, [], {"proxy", 82}, 60000),
    case Result of
        {ok, {FetchedUrl, Status, Fields, BodyBin}} ->
            {ok, {FetchedUrl, Status, Fields, binary_to_list(BodyBin)}};
        _ ->
            Result
    end.
%% progress(Size, Y) -> ok
%%   Progress callback handed to get_http/5. Size is not_known or
%%   {length, N} (as produced by content_length/1); Y is the number of
%%   body bytes accumulated so far. Prints the byte count, plus a
%%   percentage when a positive length is known.
progress(not_known, Y) ->
    io:format("progress:#bytes = ~p~n", [Y]);
progress({length, N}, Y) when is_number(N), N > 0 ->
    K = trunc(Y*100/N),
    io:format("progress:#bytes (~w\%) = ~p~n", [K, Y]);
progress({length, _}, Y) ->
    %% A declared length of zero has no meaningful percentage and would
    %% otherwise raise badarith in the division above.
    io:format("progress:#bytes = ~p~n", [Y]).
%% get_http(Fun, URL, Opts, Proxy, Timeout) ->
%%     {ok, {URL', Reply, Header, Body}} | {error, What}
%%   Resolves where to connect with get_ip_port/2, builds an HTTP/1.0 GET
%%   request (Opts is inserted verbatim between the request line and the
%%   terminating blank line) and hands over to get_http2/8. A failure to
%%   resolve the target is returned unchanged.
get_http(Fun, URL, Opts, Proxy, Timeout) ->
    Target = get_ip_port(URL, Proxy),
    case Target of
        {ok, {Host, Port, RequestUrl}} ->
            Request = ["GET ", RequestUrl, " HTTP/1.0\r\n", Opts, "\r\n"],
            get_http2(Fun, URL, Host, Port, Request, Opts, Proxy, Timeout);
        _ ->
            Target
    end.
%% get_http2(Fun, URL, IP, Port, Cmd, Opts, Proxy, Timeout) ->
%%     {ok, {URL', Reply, Header, Body}} | {error, What}
%%   Connects to IP:Port, sends the request Cmd and reads the response with
%%   receive_header/5. When the response asks for a redirect ({redo, URL1})
%%   the fetch is restarted at URL1 through get_http/5.
%%   Connect and send failures are reported as {error, {socket_error, Why}}.
%%   NOTE: redirects are followed without any hop limit.
get_http2(Fun, URL, IP, Port, Cmd, Opts, Proxy, Timeout) ->
    io:format("Here connect:~p ~p~n",[IP, Port]),
    %% Timeout also bounds the connect, so an unreachable host cannot block
    %% the caller for longer than it asked for.
    case gen_tcp:connect(IP, Port, [binary, {packet,0}], Timeout) of
        {error, Why} ->
            {error, {socket_error, Why}};
        {ok, Socket} ->
            Return =
                try
                    case gen_tcp:send(Socket, Cmd) of
                        ok ->
                            receive_header(Fun, URL, <<>>, Socket, Timeout);
                        {error, SendWhy} ->
                            {error, {socket_error, SendWhy}}
                    end
                after
                    %% Always release the socket, even when parsing the
                    %% response raises an exception.
                    gen_tcp:close(Socket)
                end,
            %% The redirect is followed only after this socket is closed,
            %% so a chain of redirects holds one connection at a time.
            case Return of
                {redo, URL1} ->
                    get_http(Fun, URL1, Opts, Proxy, Timeout);
                Other ->
                    Other
            end
    end.
%% get_ip_port(URL, Proxy) -> {ok, {Host, Port, RequestUrl}} | {error, Why}
%%   Decides where to connect and what to put on the request line.
%%   With a proxy {Host, Port} the connection goes to the proxy and the
%%   full URL is requested. With noproxy the URL is parsed by parse/1: an
%%   http URL yields its own host, port and path, a parse error becomes
%%   {error, {badURL, {Why, URL}}} and any other parse result becomes
%%   {error, {unknown, Result}}.
get_ip_port(URL, Proxy) ->
    case Proxy of
        {ProxyHost, ProxyPort} ->
            {ok, {ProxyHost, ProxyPort, URL}};
        noproxy ->
            Parsed = parse(URL),
            case Parsed of
                {http, Host, Port, Path} ->
                    {ok, {Host, Port, Path}};
                {error, Reason} ->
                    {error, {badURL, {Reason, URL}}};
                _ ->
                    {error, {unknown, Parsed}}
            end
    end.
%% receive_header(Fun, URL, Bin, Socket, Timeout) ->
%%     {ok, {URL, Reply, Header, Body}} | {redo, URL1} | {error, What}
%%   Collects active-mode packets from Socket, appending them to Bin, until
%%   get_header/1 can split off a complete header. The body is then read
%%   with receive_body/5. A response carrying a "Location" field yields
%%   {redo, URL1} instead of the body.
%%   Any message other than the expected socket messages aborts with
%%   {error, {socket, Message}}; Timeout applies to each wait separately.
receive_header(Fun, URL, Bin, Socket, Timeout) ->
    receive
        {tcp, Socket, B} ->
            %% list_to_binary/1 replaces concat_binary/1, which no longer
            %% exists in current OTP releases.
            B1 = list_to_binary([Bin, B]),
            case get_header(B1) of
                more ->
                    receive_header(Fun, URL, B1, Socket, Timeout);
                {ok, Reply, Header, BT} ->
                    Size = content_length(Header),
                    Redirect = get_field(Header, "Location"),
                    %% The body is read even for a redirect, so that the
                    %% socket's remaining messages are flushed.
                    case {Redirect,
                          receive_body(Fun, Size, BT, Socket, Timeout)} of
                        {{true, URL1}, {ok, _Body}} ->
                            {redo, URL1};
                        {_, {ok, Body}} ->
                            {ok, {URL, Reply, Header, Body}};
                        {_, Error} ->
                            Error
                    end
            end;
        {tcp_closed, Socket} ->
            {error, socket_closed_in_header};
        {tcp_error, Socket, Reason} ->
            {error, Reason};
        Other ->
            {error, {socket, Other}}
    after
        Timeout ->
            {error, timeout}
    end.
%% receive_body(Fun, Size, Bin, Socket, Timeout) -> {ok, Body} | {error, What}
%%   Accumulates active-mode packets from Socket onto Bin until the peer
%%   closes the connection, then returns everything collected.
%%   After each packet Fun(Size, BytesSoFar) is called to report progress.
%%   Any message other than the expected socket messages aborts with
%%   {error, {socket, Message}}; Timeout applies to each wait separately.
receive_body(Fun, Size, Bin, Socket, Timeout) ->
    receive
        {tcp, Socket, B} ->
            %% list_to_binary/1 replaces concat_binary/1, which no longer
            %% exists in current OTP releases; the result is built once
            %% and reused for both the callback and the next iteration.
            B1 = list_to_binary([Bin, B]),
            Fun(Size, byte_size(B1)),
            receive_body(Fun, Size, B1, Socket, Timeout);
        {tcp_closed, Socket} ->
            {ok, Bin};
        {tcp_error, Socket, What} ->
            {error, {socket, What}};
        Other ->
            {error, {socket, Other}}
    after
        Timeout ->
            {error, timeout}
    end.
%% get_header(B) -> {ok, Reply, Header, BodyStart} | more
%%   Tries to split the bytes received so far into header and body.
%%   Returns more while split_header/2 has not yet seen the blank line
%%   that ends the header. Otherwise returns the parsed status line
%%   (parse_reply/1), the parsed header fields (parse_header/1) and the
%%   bytes that follow the header, as a binary.
get_header(B) ->
    Chars = binary_to_list(B),
    case split_header(Chars, []) of
        fail ->
            more;
        {Head, Tail} ->
            {Status, RawFields} = parse_reply(Head),
            Fields = parse_header(RawFields),
            {ok, Status, Fields, list_to_binary(Tail)}
    end.
%% split_header(Chars, Seen) -> {Header, Body} | fail
%%   Scans Chars for the first blank line ("\r\n\r\n" or "\n\n").
%%   Seen accumulates the characters before it in reverse order. Returns
%%   the text before and after the blank line, or fail when the input
%%   ends without one.
split_header([], _Seen) ->
    fail;
split_header([$\r, $\n, $\r, $\n | Body], Seen) ->
    {lists:reverse(Seen), Body};
split_header([$\n, $\n | Body], Seen) ->
    {lists:reverse(Seen), Body};
split_header([Ch | Rest], Seen) ->
    split_header(Rest, [Ch | Seen]).
%% get_field(Fields, Key) -> {true, Value} | {false, false}
%%   Looks up Key in a list of {Name, Value} pairs and returns the value of
%%   the first pair whose name matches. String names are compared without
%%   regard to case, because HTTP header field names are case-insensitive
%%   ("content-length" must match "Content-Length"). Elements that are not
%%   2-tuples are skipped.
get_field(Fields, Key) ->
    find_field(Fields, fold_case(Key)).

%% Walks the list comparing each case-folded name with the folded key.
find_field([{K, V} | T], Wanted) ->
    case fold_case(K) of
        Wanted -> {true, V};
        _ -> find_field(T, Wanted)
    end;
find_field([_ | T], Wanted) ->
    find_field(T, Wanted);
find_field([], _Wanted) ->
    {false, false}.

%% Lower-cases string names; any other term is compared as it is.
fold_case(Name) when is_list(Name) -> string:to_lower(Name);
fold_case(Name) -> Name.
%% parse_reply(R0) -> {{Version, Code, Reason}, Rest}
%%   Parses the status line at the start of R0, e.g. "HTTP/1.0 200 OK".
%%   Version and Reason are trimmed strings, Code is an integer and Rest is
%%   whatever follows the first line (the header fields).
%%   Raises badarg if the status code is not an integer.
parse_reply(R0) ->
    %% Cut out the status line first. Scanning the whole input for spaces
    %% would run on into the header fields whenever the reason phrase (and
    %% its leading space) is missing.
    {StatusLine, R3} = get_until(R0, $\n, []),
    NotSpace = fun(C) -> C =/= $\s end,
    {HTTP, AfterVersion} = lists:splitwith(NotSpace, StatusLine),
    {CODE, COMM} = lists:splitwith(NotSpace, strip(AfterVersion)),
    {{trim(HTTP), list_to_integer(trim(CODE)), trim(COMM)}, R3}.
%% get_until(Chars, Stop, Acc) -> {Before, After}
%%   Splits Chars at the first occurrence of Stop, which is dropped.
%%   Acc holds the characters seen so far in reverse order. When Stop never
%%   occurs, everything is returned as Before and After is empty; this
%%   happens for a status line that is not followed by any header line.
get_until([R|Rs], R, L) ->
    {lists:reverse(L), Rs};
get_until([R|Rs], P, L) ->
    get_until(Rs, P, [R|L]);
get_until([], _P, L) ->
    {lists:reverse(L), []}.
%% parse_header(T) -> [{Name, Value}]
%%   Parses the header text T into {Name, Value} pairs with parse_header/2
%%   and discards the unparsed remainder. The pairs come out in reverse
%%   order of appearance, because each parsed line is pushed onto the
%%   front of the accumulator.
parse_header(T) ->
    {_Unparsed, Fields} = parse_header(T, []),
    Fields.
%% parse_header(Chars, Info) -> {Rest, Info1}
%%   Parses header lines from Chars, pushing one {Name, Value} pair per
%%   line onto Info. A line that starts with a line break (a blank line)
%%   ends the header; Rest is the text after it.
%%   Note (roland): this blank-line handling overlaps with what
%%   split_header/2 already does.
parse_header([], Info) ->
    %% Nothing left to parse. Without this clause an empty header would be
    %% passed to header_line/3 and recorded as a bogus "Parse-Error" field.
    {[], Info};
parse_header([$\r,$\n | T], Info) -> header_end(T, Info);
parse_header([$\n | T], Info) -> header_end(T, Info);
parse_header(Cs, Info) -> header_line(Cs, [], Info).
%% header_line(Chars, Acc, Info) -> {Rest, Info1}
%%   Collects one header line character by character into Acc (reversed).
%%   At a line break ("\r\n" or "\n") the line is converted with
%%   split_info/1, pushed onto Info, and parsing continues with
%%   parse_header/2. At end of input the pending line is converted the
%%   same way and parsing finishes through header_end/2.
header_line([], Acc, Info) ->
    LastField = split_info(lists:reverse(Acc)),
    header_end([], [LastField | Info]);
header_line([$\r, $\n | Rest], Acc, Info) ->
    Field = split_info(lists:reverse(Acc)),
    parse_header(Rest, [Field | Info]);
header_line([$\n | Rest], Acc, Info) ->
    Field = split_info(lists:reverse(Acc)),
    parse_header(Rest, [Field | Info]);
header_line([Ch | Rest], Acc, Info) ->
    header_line(Rest, [Ch | Acc], Info).
%% header_end(Chars, Info) -> {Rest, Info}
%%   Skips any further line breaks ("\r\n" or "\n") at the front of Chars
%%   and returns what remains together with the unchanged Info.
header_end(Chars, Info) ->
    case Chars of
        [$\r, $\n | Rest] -> header_end(Rest, Info);
        [$\n | Rest] -> header_end(Rest, Info);
        _ -> {Chars, Info}
    end.
%% split_info(String) -> {Name, Value}
%%   Splits one header line at its first colon into a trimmed name and a
%%   trimmed value. A line without a colon is returned as
%%   {"Parse-Error", TrimmedLine}.
split_info(String) ->
    NotColon = fun(C) -> C =/= $: end,
    case lists:splitwith(NotColon, String) of
        {_, []} ->
            {"Parse-Error", trim(String)};
        {Name, [$: | Value]} ->
            {trim(Name), trim(Value)}
    end.
%% trim(String) -> String1
%%   Removes the characters that strip/1 removes from both ends of String:
%%   the front is stripped first, then the reversed string, which is
%%   finally put back in order.
trim(String) ->
    NoLeading = strip(String),
    NoTrailingReversed = strip(lists:reverse(NoLeading)),
    lists:reverse(NoTrailingReversed).
%% strip(Chars) -> Chars1
%%   Drops leading spaces, tabs, carriage returns and newlines.
strip([C | Cs]) when C =:= $\s; C =:= $\t; C =:= $\r; C =:= $\n ->
    strip(Cs);
strip(Cs) ->
    Cs.
%% content_length(Header) -> {length, N} | not_known
%%   Reads the "Content-Length" field from the parsed header fields.
%%   Returns not_known when the field is absent, and also when its value is
%%   not a plain non-negative integer: the value comes from the network,
%%   and a malformed one must not crash the fetch.
content_length(Header) ->
    case get_field(Header, "Content-Length") of
        {true, Str} ->
            case string:to_integer(Str) of
                {N, []} when is_integer(N), N >= 0 ->
                    {length, N};
                _ ->
                    not_known
            end;
        {false, _} ->
            not_known
    end.
%%----------------------------------------------------------------------
%% parse(URL) -> {http, Site, Port, File} |
%%               {file, File} | {error, Why}
%% (primitive) Dispatches on the URL scheme prefix, which is matched
%% exactly as written: "http://" is handed to parse_http/1, "file://"
%% returns the rest of the URL as the file, "ftp://" is rejected with
%% no_ftp and anything else with unknown_url_type.
parse("http://" ++ Rest) -> parse_http(Rest);
parse("ftp://" ++ _Rest) -> {error, no_ftp};
parse("file://" ++ File) -> {file, File};
parse(_Other) -> {error, unknown_url_type}.
%% parse_http(X) -> {http, Host, Port, File}
%%   Parses the part of an http URL that follows "http://".
%%   Host is everything up to the first "/" and File is the rest, or "/"
%%   when X contains no "/". If the host part contains a colon, the text
%%   after the first colon is taken as the port; when that text is not an
%%   integer, or there is no colon, the port is 80.
parse_http(X) ->
    NotSlash = fun(C) -> C =/= $/ end,
    {HostPort, File} =
        case lists:splitwith(NotSlash, X) of
            %% not terminated by "/" (sigh): treat it as the root
            {All, []} -> {All, "/"};
            {Before, Path} -> {Before, Path}
        end,
    NotColon = fun(C) -> C =/= $: end,
    case lists:splitwith(NotColon, HostPort) of
        {Host, []} ->
            %% no colon, so no explicit port
            {http, Host, 80, File};
        {Site, [$: | PortText]} ->
            try list_to_integer(PortText) of
                Port -> {http, Site, Port, File}
            catch
                error:_ -> {http, Site, 80, File}
            end
    end.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment