|
defmodule PoolCrawler.CrawlerPool.Worker do
  use Conqueuer.Worker

  @default_user_agent "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36"
  @default_width 1720
  @default_height 1340
|
  def perform({url, width, height, user_agent}, _state) do
    IO.puts "CrawlerWorker.perform"
    IO.puts "url: #{url}"
    IO.puts "width: #{width}"
    IO.puts "height: #{height}"
    IO.puts "user_agent: #{user_agent}"

    crawl({url, width, height, user_agent}) |> handle_html(url)
  end
|
  def perform(url, _state) do
    IO.puts "CrawlerWorker.perform"
    IO.puts "url: #{url}"

    user_agent = @default_user_agent
    width = @default_width
    height = @default_height

    crawl({url, width, height, user_agent}) |> handle_html(url)
  end
|
  defp crawl({url, width, height, user_agent}) do
    # probably want to keep track of counts or some other metrics here

    # Call your module that actually does the crawl. I've used PhantomJS via
    # Elixir/Erlang Ports with success. I found some good code to get that
    # working by looking at the Elixir WebDriver implementation. I didn't use
    # it directly, but was able to put together a decent PhantomJS
    # GenServer/Port that does what I need. PhantomServer here is a
    # placeholder name -- a minimal sketch of it follows this module.
    {:ok, pid} = PhantomServer.start_link(width, height, user_agent)
    PhantomServer.crawl_page(pid, url)
  end
|
  # if we get no html back, probably want to keep track of that
  defp handle_html(nil, url) do
    IO.puts "no html returned for: #{url}"
  end

  defp handle_html(html, _url) do
    # send HTML results to the parsing queue
    Conqueuer.work(:parsers, html)
  end
|
  defp already_crawled?(url) do
    # if you want to avoid hitting the same url - store previously crawled
    # links someplace. Maybe a GenServer key/value store? CrawledStore is a
    # made-up name; a minimal Agent-backed sketch follows this module. Call
    # this from perform/2 before crawling if you want the dedupe.
    CrawledStore.crawled?(url)
  end
end
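
# A minimal sketch of the PhantomServer mentioned in crawl/1 above: a
# GenServer that shells out to PhantomJS through an Erlang Port. Everything
# here is an assumption -- the module name, the `crawl.js` script (which
# you'd write yourself to load the page and print its HTML to stdout), and
# the argument order. It also assumes `phantomjs` is on your PATH.
defmodule PhantomServer do
  use GenServer

  def start_link(width, height, user_agent) do
    GenServer.start_link(__MODULE__, {width, height, user_agent})
  end

  def crawl_page(pid, url) do
    GenServer.call(pid, {:crawl, url}, 60_000)
  end

  def init(config), do: {:ok, config}

  def handle_call({:crawl, url}, _from, {width, height, user_agent} = config) do
    # One port per crawl keeps this simple; a long-lived PhantomJS process
    # speaking a line-based protocol would avoid the startup cost per page.
    port =
      Port.open(
        {:spawn_executable, System.find_executable("phantomjs")},
        [:binary, :exit_status,
         args: ["crawl.js", url, to_string(width), to_string(height), user_agent]]
      )

    {:reply, collect_output(port, ""), config}
  end

  # accumulate stdout until PhantomJS exits; nil signals a failed crawl,
  # which handle_html/2 above treats as "no html"
  defp collect_output(port, acc) do
    receive do
      {^port, {:data, data}} -> collect_output(port, acc <> data)
      {^port, {:exit_status, 0}} -> acc
      {^port, {:exit_status, _}} -> nil
    after
      55_000 ->
        Port.close(port)
        nil
    end
  end
end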
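
# A minimal sketch of the key/value store mentioned in already_crawled?/1.
# CrawledStore is a hypothetical name; an Agent holding a MapSet is about the
# simplest thing that works. Start it under your supervision tree, and call
# mark_crawled/1 after each successful crawl.
defmodule CrawledStore do
  def start_link do
    Agent.start_link(fn -> MapSet.new() end, name: __MODULE__)
  end

  def crawled?(url) do
    Agent.get(__MODULE__, &MapSet.member?(&1, url))
  end

  def mark_crawled(url) do
    Agent.update(__MODULE__, &MapSet.put(&1, url))
  end
end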