@benjamintanweihao
Created September 13, 2013 02:10
Some crawler stolen from some place.
-module(download).
-compile(export_all).
%%
%% Include files
%%
%%
%% Exported Functions
%%
-export([]).
%%
%% API Functions
%%
%%
%% Local Functions
%%
get_start_base() -> "http://airwar.ru".
get_start_url() -> get_start_base() ++ "image/".
out_file_name() -> "pictures.txt".
start() ->
    inets:start(),
    Writer = start_write(),
    process_page(Writer, get_start_url()),
    stop_write(Writer).
start_write() -> spawn(fun write_proc/0).
stop_write(W) ->
    W ! stop.
write(W, String) ->
    W ! {write, String}.
write_proc() -> write_loop([], 0).
write_loop(Data, DataLen) ->
    receive
        stop ->
            %% DataLen is an integer, so it must be printed with ~p, not ~s
            io:format("Saving ~p entries~n", [DataLen]),
            {ok, F} = file:open(out_file_name(), [write]),
            [io:format(F, "~s~n", [S]) || S <- Data],
            file:close(F),
            io:format("Done~n");
        {write, String} ->
            %io:format("Adding ~s~n", [String]),
            case DataLen rem 1000 of
                0 -> io:format("Downloaded ~p~n", [DataLen]);
                _ -> ok
            end,
            write_loop([String|Data], 1 + DataLen)
    after 10000 ->
        io:format("Stop on timeout~n"),
        stop_write(self()),
        write_loop(Data, DataLen)
    end.
process_page(W, Url) ->
    MyPid = self(),
    case get_url_contents(Url) of
        {ok, Data} ->
            Strings = string:tokens(Data, "\n"),
            %% one worker per line of the page; the callee is process_string/4
            Pids = [spawn(fun() ->
                              process_string(W, MyPid, Url, Str)
                          end) || Str <- Strings],
            collect(length(Pids));
        _ -> ok
    end.
collect(0) -> ok;
collect(N) ->
    %io:format("To collect ~p~n", [N]),
    receive
        done -> collect(N - 1)
    end.
get_url_contents(Url) -> get_url_contents(Url, 5).
get_url_contents(_Url, 0) -> failed;
get_url_contents(Url, MaxFailures) ->
    %% the old `http' client module has been replaced by `httpc' (part of inets)
    case httpc:request(Url) of
        {ok, {{_, RetCode, _}, _, Result}} ->
            if
                RetCode == 200; RetCode == 201 ->
                    {ok, Result};
                RetCode >= 500 ->
                    % server error, retry
                    timer:sleep(1000),
                    get_url_contents(Url, MaxFailures - 1);
                true ->
                    % all other errors
                    failed
            end;
        {error, _Why} ->
            timer:sleep(1000),
            get_url_contents(Url, MaxFailures - 1)
    end.
process_string(W, Parent, Dir, Str) ->
    case extract_link(Str) of
        {ok, Url} -> process_link(W, Dir, Url);
        failed -> ok
    end,
    done(Parent).
done(Parent) ->
    Parent ! done.
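%% NOTE: process_link/3 is called from process_string/4 above but was not
%% included in this gist, so the module does not compile as posted. The clause
%% below is a minimal, hypothetical sketch (an assumption, not the original
%% author's code): it treats links ending in "/" as sub-directories to crawl
%% and records everything else for pictures.txt, assuming links are relative
%% to Dir.
process_link(W, Dir, Url) ->
    FullUrl = Dir ++ Url,
    case lists:suffix("/", Url) of
        true  -> process_page(W, FullUrl);   % descend into a directory listing
        false -> write(W, FullUrl)           % record the link for the output file
    end.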
extract_link(S) ->
    case re:run(S, "href *= *([^>]*)>", [{capture, all_but_first, list}]) of
        {match, [Link]} -> {ok, string:strip(Link, both, $")};
        _ -> failed
    end.
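%% Usage sketch (an assumption, not part of the original gist): compile and
%% run from the Erlang shell; collected links are written to pictures.txt in
%% the current directory. Whether anything is found depends on airwar.ru
%% still serving a plain directory listing at http://airwar.ru/image/.
%%
%%   1> c(download).
%%   2> download:start().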