Created
September 13, 2013 02:10
-
-
Save benjamintanweihao/6546118 to your computer and use it in GitHub Desktop.
Some crawler stolen from some place.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| -module(download). | |
| -compile(export_all). | |
| %% | |
| %% Include files | |
| %% | |
| %% | |
| %% Exported Functions | |
| %% | |
| -export([]). | |
| %% | |
| %% API Functions | |
| %% | |
| %% | |
| %% Local Functions | |
| %% | |
%% @doc Scheme and host of the site to crawl, without a trailing slash.
-spec get_start_base() -> string().
get_start_base() ->
    "http://airwar.ru".

%% @doc URL of the page where crawling starts.
%%
%% The base URL carries no trailing slash, so the separator is supplied here.
%% Without it the result was "http://airwar.ruimage/", which is not a valid
%% address on the intended host.
-spec get_start_url() -> string().
get_start_url() ->
    get_start_base() ++ "/image/".
%% @doc Name of the text file that receives the collected entries.
%% The name has no directory part, so it is resolved against the current
%% working directory when the file is opened.
-spec out_file_name() -> string().
out_file_name() ->
    "pictures.txt".
%% @doc Entry point: crawl the start page and save the collected entries.
%%
%% Starts inets (which provides the HTTP client), spawns the writer process,
%% processes the start URL and finally asks the writer to stop. The writer
%% only creates the output file after it receives the stop request, so this
%% function monitors it and waits until it has terminated; returning earlier
%% would let a caller shut the VM down before the file has been written.
%%
%% Returns the atom `stop' (the value of stop_write/1), as it did before.
start() ->
    inets:start(),
    Writer = start_write(),
    MonitorRef = erlang:monitor(process, Writer),
    process_page(Writer, get_start_url()),
    Result = stop_write(Writer),
    %% A 'DOWN' message is delivered even when the writer has already exited
    %% (for example after its own idle timeout) or crashes while saving, so
    %% this receive cannot block forever.
    receive
        {'DOWN', MonitorRef, process, Writer, _Reason} -> ok
    end,
    Result.
%% @doc Spawn the writer process and return its pid.
%% The new process is neither linked to nor monitored by the caller.
start_write() ->
    erlang:spawn(fun() -> write_proc() end).
%% @doc Send the `stop' request to the writer process W.
%% Only the message is sent; there is no wait for the writer to act on it.
%% Returns the atom `stop' (the message that was sent).
stop_write(W) ->
    erlang:send(W, stop).
%% @doc Hand String to the writer process W as a `{write, String}' message.
%% Asynchronous: returns the message that was sent without waiting for W.
write(W, String) ->
    Request = {write, String},
    erlang:send(W, Request).
%% @doc Body of the writer process: enter the receive loop with no entries
%% collected and a count of zero.
write_proc() ->
    NoEntries = [],
    InitialCount = 0,
    write_loop(NoEntries, InitialCount).
%% @doc Receive loop of the writer process.
%%
%% Data is the list of entries received so far, newest first; DataLen is the
%% number of entries in that list.
%%
%% Messages handled:
%%   {write, String} - remember String and keep looping. A progress line is
%%                     printed whenever the number of entries held before
%%                     adding this one is a multiple of 1000 (including 0).
%%   stop            - write every entry to out_file_name(), one per line in
%%                     the order received, then return `ok'.
%%
%% If no message arrives for 10 seconds the loop sends itself a stop request,
%% so an idle writer saves what it has and terminates on its own.
write_loop(Data, DataLen) ->
    receive
        stop ->
            %% DataLen is an integer and must be printed with ~p; ~s raised
            %% badarg here and killed the writer before the file was created.
            io:format("Save ~p entries ~n", [DataLen]),
            {ok, F} = file:open(out_file_name(), [write]),
            try
                lists:foreach(
                    fun(S) -> io:format(F, "~s~n", [S]) end,
                    lists:reverse(Data)
                )
            after
                %% Release the handle even if writing one entry fails.
                file:close(F)
            end,
            io:format("Done~n");
        {write, String} ->
            case DataLen rem 1000 of
                0 -> io:format("Downloaded ~p~n", [DataLen]);
                _ -> ok
            end,
            write_loop([String | Data], DataLen + 1)
    after 10000 ->
        io:format("Stop on timeout~n"),
        %% Queue a stop request for ourselves; the next iteration saves.
        stop_write(self()),
        write_loop(Data, DataLen)
    end.
%% @doc Fetch Url and process every line of the response concurrently.
%%
%% W is the writer process handed on to the workers; Url is both the page to
%% fetch and the directory argument given to each worker.
%%
%% The body is split on newlines and one process is spawned per line, each
%% running process_string/4. The call then blocks in collect/1 until every
%% worker has reported `done'. Returns `ok', also when the page could not be
%% fetched (in which case nothing is processed).
process_page(W, Url) ->
    Parent = self(),
    case get_url_contents(Url) of
        {ok, Data} ->
            Lines = string:tokens(Data, "\n"),
            lists:foreach(
                fun(Line) ->
                    %% The worker is process_string/4; the previous call to
                    %% process_strings/4 named a function this module does
                    %% not define.
                    spawn(fun() -> process_string(W, Parent, Url, Line) end)
                end,
                Lines
            ),
            collect(length(Lines));
        failed ->
            ok
    end.
%% @doc Block until N `done' messages have been taken from the caller's
%% mailbox, then return `ok'. There is no timeout: the call waits for as long
%% as it takes for the messages to arrive.
collect(Remaining) when Remaining =:= 0 ->
    ok;
collect(Remaining) ->
    receive
        done -> ok
    end,
    collect(Remaining - 1).
%% @doc Fetch Url with an HTTP GET, allowing up to 5 attempts.
%% Returns `{ok, Body}' on status 200 or 201, otherwise the atom `failed'.
get_url_contents(Url) ->
    get_url_contents(Url, 5).

%% @doc Fetch Url, giving up once MaxFailures attempts have failed.
%%
%% Transport errors and 5xx responses are retried after a one second pause.
%% Any other status outside 200/201 is treated as permanent and yields
%% `failed' immediately. Returns `{ok, Body}' or `failed'.
get_url_contents(_Url, MaxFailures) when MaxFailures =< 0 ->
    %% No attempts left (also guards against a non-positive initial count,
    %% which would otherwise never reach the terminating clause).
    failed;
get_url_contents(Url, MaxFailures) ->
    %% httpc is the inets HTTP client; the `http' module that was called here
    %% is no longer part of OTP.
    case httpc:request(Url) of
        {ok, {{_, RetCode, _}, _, Result}} when RetCode =:= 200; RetCode =:= 201 ->
            {ok, Result};
        {ok, {{_, RetCode, _}, _, _}} when RetCode >= 500 ->
            %% Server error: worth another attempt.
            timer:sleep(1000),
            get_url_contents(Url, MaxFailures - 1);
        {ok, {{_, _, _}, _, _}} ->
            %% All other statuses: retrying is not attempted.
            failed;
        {error, _Why} ->
            timer:sleep(1000),
            get_url_contents(Url, MaxFailures - 1)
    end.
%% @doc Worker body: handle one line of a fetched page.
%%
%% If Str contains a link (see extract_link/1) it is passed to process_link/3
%% together with the writer W and the directory URL Dir; lines without a link
%% are ignored.
%%
%% Parent is always sent `done', even when link handling raises, because the
%% parent counts these messages in collect/1 and would otherwise wait
%% forever. An exception still propagates after the notification.
%%
%% Returns the atom `done', as before.
process_string(W, Parent, Dir, Str) ->
    try
        case extract_link(Str) of
            {ok, Url} -> process_link(W, Dir, Url);
            failed -> ok
        end
    after
        done(Parent)
    end,
    done.
%% @doc Report completion of one unit of work by sending `done' to Parent.
%% Returns the atom `done' (the message that was sent).
done(Parent) ->
    erlang:send(Parent, done).
%% @doc Extract the target of the first `href' attribute found in S.
%%
%% Everything between `href=' (spaces around `=' allowed) and the next `>' is
%% captured, and double quotes are stripped from both ends of the capture.
%% Returns `{ok, Link}' when the pattern matches, otherwise `failed'.
extract_link(S) ->
    Pattern = "href *= *([^>]*)>",
    Options = [{capture, all_but_first, list}],
    case re:run(S, Pattern, Options) of
        {match, [Link]} ->
            Unquoted = string:strip(Link, both, $"),
            {ok, Unquoted};
        _ ->
            failed
    end.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment