Created
March 19, 2013 17:38
-
-
Save yangsu/5198282 to your computer and use it in GitHub Desktop.
This Erlang module exports a couple of routines for fetching a URL, either directly or through a proxy. Taken from http://www.erlang.org/article/8
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-module(urlget). | |
% Hacked by Roland and Erik Aug 1997 | |
%% Joe Armstrong | |
%% get_http(Fun, URL, Opts, Proxy, Timeout) ->
%%     {ok, {URL', Reply, Header, Body}} | {error, What}
%% URL' is the URL that was actually fetched (after following any redirect)
-export([test/1, direct/1, proxy/1, get_http/5]). | |
-import(lists, [reverse/1]). | |
%% test(N) -> Result
%% Manual smoke tests: 1 and 2 fetch directly, 3 and 4 fetch the same two
%% URLs through the proxy configured in proxy/1.
test(1) -> direct(test_url(1));
test(2) -> direct(test_url(2));
test(3) -> proxy(test_url(1));
test(4) -> proxy(test_url(2)).

%% The two sample URLs shared by the direct and proxied test cases.
test_url(1) -> "http://www.ericsson.se/cslab/~joe";
test_url(2) -> "http://www.viasat.se/index9.html".
%% direct(URL) -> {ok, {Url, Reply, Head, BodyString}} | Other
%% Fetches URL without a proxy, using a 60000 ms receive timeout and
%% progress/2 as the progress callback. On success the body binary is
%% converted to a list; any other result from get_http/5 is returned as is.
direct(URL) ->
    case get_http(fun progress/2, URL, [], noproxy, 60000) of
        {ok, {FinalUrl, Status, Headers, BodyBin}} ->
            {ok, {FinalUrl, Status, Headers, binary_to_list(BodyBin)}};
        Failure ->
            Failure
    end.
%% proxy(URL) -> {ok, {Url, Reply, Head, BodyString}} | Other
%% Fetches URL through the hard-coded proxy {"proxy", 82}, using a
%% 60000 ms receive timeout and progress/2 as the progress callback.
%% On success the body binary is converted to a list; any other result
%% from get_http/5 is returned as is.
proxy(URL) ->
    ProxyAddress = {"proxy", 82},
    case get_http(fun progress/2, URL, [], ProxyAddress, 60000) of
        {ok, {FinalUrl, Status, Headers, BodyBin}} ->
            {ok, {FinalUrl, Status, Headers, binary_to_list(BodyBin)}};
        Failure ->
            Failure
    end.
%% progress(Size, Y) -> ok
%% Default progress callback: prints the number of bytes received so far.
%%   Size - not_known | {length, N}, as produced by content_length/1
%%   Y    - number of body bytes accumulated so far
%% When the total length is known and positive, the percentage is printed
%% as well. A non-positive length (e.g. "Content-Length: 0" while data
%% still arrives) is reported like an unknown length instead of crashing
%% with badarith on the division.
progress(not_known, Y) ->
    io:format("progress:#bytes = ~p~n", [Y]);
progress({length, N}, Y) when is_number(N), N > 0 ->
    K = trunc(Y * 100 / N),
    io:format("progress:#bytes (~w\%) = ~p~n", [K, Y]);
progress({length, _N}, Y) ->
    progress(not_known, Y).
%% get_http(Fun, URL, Opts, Proxy, Timeout) ->
%%     {ok, {URL', Reply, Header, Body}} | {error, What}
%% Resolves where to connect (the URL's own host, or the proxy) and issues
%% an HTTP/1.0 GET request.
%%   Fun     - progress callback, called as Fun(Size, BytesSoFar)
%%   Opts    - iodata inserted verbatim between the request line and the
%%             terminating blank line (presumably extra header lines, each
%%             ending in "\r\n" - confirm against callers)
%%   Proxy   - noproxy | {Host, Port}
%%   Timeout - receive timeout in milliseconds
%% Any non-ok result from get_ip_port/2 is returned unchanged.
get_http(Fun, URL, Opts, Proxy, Timeout) ->
    case get_ip_port(URL, Proxy) of
        {ok, {Host, PortNo, Target}} ->
            Request = ["GET ", Target, " HTTP/1.0\r\n", Opts, "\r\n"],
            get_http2(Fun, URL, Host, PortNo, Request, Opts, Proxy, Timeout);
        NotOk ->
            NotOk
    end.
%% get_http2(Fun, URL, IP, Port, Cmd, Opts, Proxy, Timeout) ->
%%     {ok, {URL', Reply, Header, Body}} | {error, What}
%% Connects to IP:Port, sends the request Cmd and reads the response.
%% If the response asks for a redirect ({redo, URL1}), the request is
%% re-issued for URL1 through get_http/5.
%%
%% The socket is always closed, also when sending or receiving fails, and
%% it is closed before a redirect is followed so that a chain of redirects
%% does not keep one socket open per hop. Connect and send failures are
%% both reported as {error, {socket_error, Why}}.
get_http2(Fun, URL, IP, Port, Cmd, Opts, Proxy, Timeout) ->
    io:format("Here connect:~p ~p~n",[IP, Port]),
    case gen_tcp:connect(IP, Port, [binary, {packet,0}]) of
        {error, Why} ->
            {error, {socket_error, Why}};
        {ok, Socket} ->
            Result =
                try
                    case gen_tcp:send(Socket, Cmd) of
                        ok ->
                            receive_header(Fun, URL, <<>>, Socket, Timeout);
                        {error, SendWhy} ->
                            {error, {socket_error, SendWhy}}
                    end
                after
                    gen_tcp:close(Socket)
                end,
            case Result of
                {redo, URL1} ->
                    get_http(Fun, URL1, Opts, Proxy, Timeout);
                Other ->
                    Other
            end
    end.
%% get_ip_port(URL, Proxy) -> {ok, {Host, Port, RequestTarget}} | {error, Why}
%% Decides which host and port to connect to and what to put in the
%% request line. With a proxy, the connection goes to the proxy and the
%% full URL is requested; without one, the URL is parsed and only its
%% file part is requested from the URL's own host.
get_ip_port(URL, Proxy) ->
    case Proxy of
        {IP, Port} ->
            {ok, {IP, Port, URL}};
        noproxy ->
            direct_target(URL, parse(URL))
    end.

%% Maps the result of parse/1 onto the return value of get_ip_port/2.
%% Anything that is not an http URL (including {file, _}) is an error.
direct_target(URL, {error, Why}) ->
    {error, {badURL, {Why, URL}}};
direct_target(_URL, {http, IP, Port, Url0}) ->
    {ok, {IP, Port, Url0}};
direct_target(_URL, Other) ->
    {error, {unknown, Other}}.
%% receive_header(Fun, URL, Bin, Socket, Timeout) ->
%%     {ok, {URL, Reply, Header, Body}} | {redo, URL1} | {error, Why}
%% Accumulates data from Socket (active mode) in Bin until get_header/1
%% can split off a complete header, then reads the body.
%%   Bin     - bytes received so far (start with <<>>)
%%   Timeout - milliseconds to wait for each individual message
%% A response carrying a "Location" field yields {redo, URL1}; the body is
%% still read first so that the socket is drained.
%% Note: any message that is not a tcp message for Socket ends the
%% transfer with {error, {socket, Msg}}.
%%
%% Binaries are joined with binary syntax; the concat_binary/1 BIF used
%% previously no longer exists in current OTP releases.
receive_header(Fun, URL, Bin, Socket, Timeout) ->
    receive
        {tcp, Socket, B} ->
            B1 = <<Bin/binary, B/binary>>,
            case get_header(B1) of
                {ok, Reply, Header, BT} ->
                    finish_response(Fun, URL, Reply, Header, BT,
                                    Socket, Timeout);
                more ->
                    receive_header(Fun, URL, B1, Socket, Timeout)
            end;
        {tcp_closed, Socket} ->
            {error, socket_closed_in_header};
        {tcp_error, Socket, Reason} ->
            {error, Reason};
        Other ->
            {error, {socket, Other}}
    after
        Timeout ->
            {error, timeout}
    end.

%% Reads the body that follows a parsed header and builds the final result:
%% {redo, URL1} when a "Location" field is present, the full response
%% otherwise. Errors from receive_body/5 are passed through.
finish_response(Fun, URL, Reply, Header, BT, Socket, Timeout) ->
    Size = content_length(Header),
    case receive_body(Fun, Size, BT, Socket, Timeout) of
        {ok, Body} ->
            case get_field(Header, "Location") of
                {true, URL1} ->
                    {redo, URL1};
                _ ->
                    {ok, {URL, Reply, Header, Body}}
            end;
        Error ->
            Error
    end.
%% receive_body(Fun, Size, Bin, Socket, Timeout) -> {ok, Body} | {error, Why}
%% Collects body data from Socket (active mode) until the peer closes the
%% connection, then returns everything accumulated.
%%   Fun     - progress callback, called as Fun(Size, BytesSoFar) after
%%             every chunk
%%   Size    - passed through to Fun unchanged; it is not used to decide
%%             when the body is complete
%%   Bin     - body bytes already received together with the header
%%   Timeout - milliseconds to wait for each individual message
%% Note: any message that is not a tcp message for Socket ends the
%% transfer with {error, {socket, Msg}}.
%%
%% The chunk is appended once with binary syntax; the concat_binary/1 BIF
%% used previously no longer exists in current OTP releases.
receive_body(Fun, Size, Bin, Socket, Timeout) ->
    receive
        {tcp, Socket, B} ->
            B1 = <<Bin/binary, B/binary>>,
            Fun(Size, byte_size(B1)),
            receive_body(Fun, Size, B1, Socket, Timeout);
        {tcp_closed, Socket} ->
            {ok, Bin};
        {tcp_error, Socket, What} ->
            {error, {socket, What}};
        Other ->
            {error, {socket, Other}}
    after
        Timeout ->
            {error, timeout}
    end.
%% get_header(B) -> {ok, Reply, Header, Rest} | more
%% Tries to split a complete HTTP header off the front of binary B.
%%   more - the blank line ending the header has not arrived yet
%%   Reply  - parsed status line, see parse_reply/1
%%   Header - parsed header fields, see parse_header/1
%%   Rest   - the bytes following the header, as a binary
get_header(B) ->
    case split_header(binary_to_list(B), []) of
        fail ->
            more;
        {Head, Remainder} ->
            {Status, FieldText} = parse_reply(Head),
            Fields = parse_header(FieldText),
            {ok, Status, Fields, list_to_binary(Remainder)}
    end.
%% split_header(Chars, Acc) -> {Head, Body} | fail
%% Scans Chars for the blank line that ends an HTTP header (CRLF CRLF or
%% LF LF). Head is everything before that separator, Body everything after
%% it. Returns fail when no separator is found. Acc collects the head in
%% reverse and should be [] on the initial call.
split_header("\r\n\r\n" ++ Body, Acc) ->
    {lists:reverse(Acc), Body};
split_header("\n\n" ++ Body, Acc) ->
    {lists:reverse(Acc), Body};
split_header([Char | Rest], Acc) ->
    split_header(Rest, [Char | Acc]);
split_header([], _Acc) ->
    fail.
%% get_field(Header, Name) -> {true, Value} | {false, false}
%% Looks up the first {FieldName, Value} pair in Header whose name equals
%% Name. Elements that are not 2-tuples are skipped.
%% HTTP field names are case-insensitive, so string names are compared
%% without regard to case ("content-length" matches "Content-Length").
get_field([{Key, Value} | Rest], Name) ->
    case same_field_name(Key, Name) of
        true -> {true, Value};
        false -> get_field(Rest, Name)
    end;
get_field([_ | Rest], Name) ->
    get_field(Rest, Name);
get_field([], _Name) ->
    {false, false}.

%% True when two field names are identical, or are both lists that are
%% equal after lower-casing.
same_field_name(Name, Name) ->
    true;
same_field_name(A, B) when is_list(A), is_list(B) ->
    string:to_lower(A) =:= string:to_lower(B);
same_field_name(_, _) ->
    false.
%% parse_reply(R0) -> {{Version, Code, Comment}, Rest}
%% Parses the status line at the start of the header text R0.
%%   Version - text up to the first space, trimmed
%%   Code    - text up to the next space, trimmed and converted to integer
%%   Comment - remainder of the line up to the newline, trimmed
%%   Rest    - everything after the status line
parse_reply(R0) ->
    {Version, AfterVersion} = get_until(R0, $\s, []),
    {StatusCode, AfterCode} = get_until(AfterVersion, $\s, []),
    {Comment, FieldText} = get_until(AfterCode, $\n, []),
    Status = {trim(Version), list_to_integer(trim(StatusCode)), trim(Comment)},
    {Status, FieldText}.
%% get_until(Chars, Stop, Acc) -> {Before, After}
%% Splits Chars at the first occurrence of Stop. Before is the text ahead
%% of Stop, After the text following it (Stop itself is dropped).
%% Acc collects Before in reverse and should be [] on the initial call.
%% If Stop never occurs, all of Chars is returned as Before and After is
%% []. This happens for a response whose status line is not followed by
%% any header line, which previously crashed with function_clause.
get_until([Stop | Rest], Stop, Acc) ->
    {lists:reverse(Acc), Rest};
get_until([Char | Rest], Stop, Acc) ->
    get_until(Rest, Stop, [Char | Acc]);
get_until([], _Stop, Acc) ->
    {lists:reverse(Acc), []}.
%% parse_header(T) -> [{Name, Value}]
%% Parses the header lines in T into name/value pairs. The pairs come back
%% in reverse order of appearance, because parse_header/2 accumulates them
%% by prepending. Any text left after the header is discarded.
parse_header(T) ->
    {_Rest, Fields} = parse_header(T, []),
    Fields.
%% parse_header(Chars, Info) -> {Rest, Info'}
%% Parses header lines one at a time, prepending a {Name, Value} pair
%% (see split_info/1) to Info for each line. Parsing stops at a blank line
%% or at the end of the input; Rest is whatever follows the blank line(s).
%% Lines may end in CRLF or a bare LF.
%% Empty input leaves Info unchanged instead of adding a spurious
%% {"Parse-Error", []} entry for a line that does not exist.
%% NOTE (roland): detecting the blank line here overlaps with what
%% split_header/2 already does.
parse_header([], Info) -> {[], Info};
parse_header([$\r,$\n | T], Info) -> header_end(T, Info);
parse_header([$\n | T], Info) -> header_end(T, Info);
parse_header(Cs, Info) -> header_line(Cs, [], Info).

%% Collects the characters of one header line in Acc (reversed) and hands
%% the finished line to split_info/1 at the line end or end of input.
header_line([$\r,$\n | T], Acc, Info) ->
    parse_header(T, [split_info(lists:reverse(Acc)) | Info]);
header_line([$\n | T], Acc, Info) ->
    parse_header(T, [split_info(lists:reverse(Acc)) | Info]);
header_line([C | Cs], Acc, Info) ->
    header_line(Cs, [C | Acc], Info);
header_line([], Acc, Info) ->
    header_end([], [split_info(lists:reverse(Acc)) | Info]).

%% Skips any further line breaks after the header and returns the rest.
header_end([$\r,$\n | T], Info) -> header_end(T, Info);
header_end([$\n | T], Info) -> header_end(T, Info);
header_end(T, Info) -> {T, Info}.
%% split_info(String) -> {Name, Value}
%% Splits one header line at its first colon into a trimmed name and a
%% trimmed value. A line without a colon is returned as
%% {"Parse-Error", TrimmedLine}.
split_info(String) ->
    case lists:splitwith(fun(C) -> C =/= $: end, String) of
        {_NoColon, []} ->
            {"Parse-Error", trim(String)};
        {Name, [$: | Value]} ->
            {trim(Name), trim(Value)}
    end.
%% trim(String) -> String'
%% Removes leading and trailing blanks, tabs, CRs and LFs from String.
%% The leading side is stripped first; the result is then reversed so that
%% the same strip/1 can remove the trailing side.
trim(String) ->
    NoLeading = strip(String),
    NoTrailingReversed = strip(lists:reverse(NoLeading)),
    lists:reverse(NoTrailingReversed).
%% strip(Chars) -> Chars'
%% Drops leading blanks, tabs, CRs and LFs. Anything else, including a
%% non-list argument, is returned unchanged.
strip([C | Cs]) when C =:= $\s; C =:= $\t; C =:= $\r; C =:= $\n ->
    strip(Cs);
strip(Cs) ->
    Cs.
%% content_length(Header) -> {length, N} | not_known
%% Extracts the body length announced in the "Content-Length" field of the
%% parsed header. The value comes from the remote server, so a field that
%% is missing, not an integer, or negative yields not_known rather than a
%% badarg crash.
content_length(Header) ->
    case get_field(Header, "Content-Length") of
        {true, Str} ->
            try list_to_integer(Str) of
                N when N >= 0 -> {length, N};
                _Negative -> not_known
            catch
                error:badarg -> not_known
            end;
        {false, _} ->
            not_known
    end.
%%----------------------------------------------------------------------
%% parse(URL) -> {http, Site, Port, File} |
%%               {file, File} | {error, Why}
%% Primitive URL parser that dispatches on the scheme prefix:
%%   "http://" - handed to parse_http/1
%%   "ftp://"  - rejected with {error, no_ftp}
%%   "file://" - the remainder is returned as the file name
%%   otherwise - {error, unknown_url_type}
parse("http://" ++ Rest) -> parse_http(Rest);
parse("ftp://" ++ _Rest) -> {error, no_ftp};
parse("file://" ++ Path) -> {file, Path};
parse(_Other) -> {error, unknown_url_type}.
%% parse_http(X) -> {http, Site, Port, File}
%% Parses the part of an http URL that follows "http://".
%%   Site - host name, up to the first ":" or "/"
%%   Port - explicit port after the first ":" in the host part; 80 when
%%          there is none or when it is not a valid integer
%%   File - everything from the first "/" on; "/" when X contains no "/"
parse_http(X) ->
    %% The host is everything up to the first "/", the file is the rest.
    {Host, Path} = lists:splitwith(fun(C) -> C =/= $/ end, X),
    File = case Path of
               [] -> "/";
               _ -> Path
           end,
    %% An explicit port follows the first colon in the host part.
    case lists:splitwith(fun(C) -> C =/= $: end, Host) of
        {_NoColon, []} ->
            {http, Host, 80, File};
        {Site, [$: | PortText]} ->
            try list_to_integer(PortText) of
                Port ->
                    {http, Site, Port, File}
            catch
                error:badarg ->
                    {http, Site, 80, File}
            end
    end.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment