Web crawler which uses Floki and HTTPoison – crawls 5 URLs at a time (the pool size is configurable)
# Dependencies:
# {:httpoison, "~> 1.5"},
# {:floki, "~> 0.21.0"}
# {:benchee, "~> 1.0"} (only for benchmarking – not used in the script)

# Agent-backed FIFO queue of URLs waiting to be crawled.
defmodule CrawlQueue do
  use Agent

  def start_link(urls) do
    queue = :queue.from_list(urls)
    Agent.start_link(fn -> queue end, name: __MODULE__)
  end

  # Pops the next URL, or :empty once the queue is exhausted. Using
  # get_and_update keeps the read and the write in one atomic step.
  def pop do
    Agent.get_and_update(__MODULE__, &pop_value/1)
  end

  def push(url) do
    Agent.update(__MODULE__, &:queue.in(url, &1))
  end

  defp pop_value(queue) do
    case :queue.out(queue) do
      {{:value, value}, queue} -> {value, queue}
      {:empty, queue} -> {:empty, queue}
    end
  end
end
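
For reference, a quick sketch of how CrawlQueue behaves on its own (the URLs are placeholders):

# Start an empty queue, push two URLs, then drain it in FIFO order.
CrawlQueue.start_link([])
CrawlQueue.push("https://example.com")
CrawlQueue.push("https://example.org")
"https://example.com" = CrawlQueue.pop()
"https://example.org" = CrawlQueue.pop()
:empty = CrawlQueue.pop()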
defmodule Crawler do
  alias CrawlQueue, as: Queue

  defstruct [:pool_size, :results, :current, :pending_sieve, :sieved]

  def init(seed, pool_size) do
    Queue.start_link([])
    crawl(%__MODULE__{pool_size: pool_size, current: [seed], results: []})
  end

  # Stop when there is nothing left to scan, or once ~500 URLs are collected.
  defp crawl(%{current: [], results: results}), do: results

  defp crawl(struct) do
    if length(struct.results) >= 500 do
      struct.results
    else
      struct
      |> scan_async()     # scan the current URLs, collecting links into pending_sieve
      |> sieve()          # drop links that have already been scanned
      |> push_to_queue()  # enqueue the sieved links and record them as results
      |> take()           # take up to pool_size URLs from the queue into current
      |> crawl()          # restart the crawl cycle
    end
  end

  defp sieve(%{pending_sieve: pending, results: results} = struct) do
    sieved = filter_already_scanned_urls(pending, results)
    %{struct | sieved: sieved, pending_sieve: []}
  end

  defp filter_already_scanned_urls(urls, scanned) do
    Enum.filter(urls, fn x -> !Enum.member?(scanned, x) end)
  end

  defp push_to_queue(%{sieved: pending, results: results} = struct) do
    Enum.each(pending, &Queue.push/1)
    %{struct | sieved: [], results: results ++ pending}
  end

  defp take(%{pool_size: pool_size, current: urls} = struct) do
    if length(urls) < pool_size do
      case Queue.pop() do
        url when is_binary(url) -> take(%{struct | current: [url | urls]})
        _empty -> struct
      end
    else
      struct
    end
  end

  # Scans all current URLs concurrently: one Task per URL, each fetching
  # the page and extracting its links.
  defp scan_async(%{current: urls} = struct) when is_list(urls) do
    urls
    |> Enum.map(fn url -> Task.async(fn -> scan(url) end) end)
    |> Enum.map(fn t -> Task.await(t, 15_000) end)
    |> List.flatten()
    |> mark_as_pending_sieve(struct)
    |> clear_current()
  end

  defp scan(url) do
    try do
      url
      |> HTTPoison.get!([], timeout: 15_000, recv_timeout: 15_000)
      |> Map.get(:body)
      |> Floki.find("* a")
      |> Floki.attribute("href")
    rescue
      # Any error while fetching or parsing means no results from that
      # website – there are a lot of awkward websites out there.
      CaseClauseError -> []
      HTTPoison.Error -> []
      ArgumentError -> []
    end
  end

  defp mark_as_pending_sieve(urls, struct), do: %__MODULE__{struct | pending_sieve: urls}
  defp clear_current(struct), do: %__MODULE__{struct | current: []}
end
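
A usage sketch – the seed URL here is a placeholder, and 5 matches the pool size from the description:

# Crawl from a seed, scanning 5 URLs concurrently per cycle, until
# roughly 500 URLs have been collected or the queue runs dry.
results = Crawler.init("https://elixir-lang.org", 5)
Enum.each(results, &IO.puts/1)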
Benchmark results:

Name                 ips        average  deviation         median         99th %
20 workers          0.36         2.78 s    ±11.03%         2.80 s         3.14 s
5 workers           0.31         3.24 s    ±35.04%         3.13 s         4.73 s
10 workers          0.23         4.27 s    ±94.25%         2.98 s         9.88 s
50 workers          0.22         4.48 s    ±74.24%         5.26 s         7.34 s
100 workers       0.0658        15.21 s     ±0.00%        15.21 s        15.21 s

Comparison:
20 workers          0.36
5 workers           0.31 - 1.17x slower +0.46 s
10 workers          0.23 - 1.53x slower +1.49 s
50 workers          0.22 - 1.61x slower +1.70 s
100 workers       0.0658 - 5.47x slower +12.42 s

Memory usage statistics:

Name               average  deviation         median         99th %
20 workers        43.83 MB    ±37.92%       43.83 MB       55.58 MB
5 workers        104.92 MB     ±0.00%      104.92 MB      104.92 MB
10 workers        18.20 MB     ±0.00%       18.20 MB       18.20 MB
50 workers       205.63 MB     ±8.21%      205.63 MB      217.57 MB
100 workers      112.55 MB     ±0.00%      112.55 MB      112.55 MB

Comparison:
20 workers        43.83 MB
5 workers        104.92 MB - 2.39x memory usage +61.09 MB
10 workers        18.20 MB - 0.42x memory usage -25.62134 MB
50 workers       205.63 MB - 4.69x memory usage +161.80 MB
100 workers      112.55 MB - 2.57x memory usage +68.72 MB
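
The numbers above are Benchee output; a minimal sketch of a script that could produce them (the seed URL is an assumption, and since CrawlQueue registers under a fixed name, a real run would need to reset it between scenarios):

Benchee.run(
  %{
    # Scenario names assumed to map "N workers" to Crawler.init/2 pool sizes.
    "5 workers"   => fn -> Crawler.init("https://elixir-lang.org", 5) end,
    "10 workers"  => fn -> Crawler.init("https://elixir-lang.org", 10) end,
    "20 workers"  => fn -> Crawler.init("https://elixir-lang.org", 20) end,
    "50 workers"  => fn -> Crawler.init("https://elixir-lang.org", 50) end,
    "100 workers" => fn -> Crawler.init("https://elixir-lang.org", 100) end
  },
  # memory_time enables the memory usage statistics shown above.
  memory_time: 2
)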