Skip to content

Instantly share code, notes, and snippets.

@tomodutch
Last active May 22, 2016 18:52
Show Gist options
  • Save tomodutch/203ca1abe98dae5630bc640c92e8565e to your computer and use it in GitHub Desktop.
Save tomodutch/203ca1abe98dae5630bc640c92e8565e to your computer and use it in GitHub Desktop.
Fetch content from the most recent posts on reddit
defmodule Scraper do
  @moduledoc """
  Fetches the self-text content of the most recent posts in a subreddit.

  Downloads the subreddit's RSS feed, extracts each entry's link, turns the
  links into reddit "fullname" ids (`t3_<id>`), fetches all posts in a single
  `/by_id/` API call, and returns the plain-text body of each post.
  """

  # Reddit throttles/blocks requests with default library User-Agents, so
  # identify ourselves explicitly on every request (the original only did
  # this for the /by_id/ call).
  @user_agent [{"User-Agent", "haikubot 0.1"}]

  @doc """
  Returns a list of plain-text post bodies for `subreddit`.

  Raises if either HTTP request fails (`HTTPoison.get!/2`) or if the feed /
  JSON payload cannot be parsed.
  """
  def scrape(subreddit) do
    %{body: body} =
      HTTPoison.get!("https://www.reddit.com/r/#{subreddit}/.rss", @user_agent)

    # Assert a fully-consumed feed; a non-empty trailing binary is a bug.
    {:ok, feed, ""} = FeederEx.parse(body)

    feed
    |> get_links()
    |> fetch_content()
  end

  # Extracts the link of every entry in the parsed feed.
  defp get_links(%FeederEx.Feed{entries: entries}) do
    Enum.map(entries, fn %FeederEx.Entry{link: link} -> link end)
  end

  # Joins the post ids into a comma-separated string and fetches all posts
  # in one request. (Replaces a hand-rolled reduce-join with map_join/3.)
  defp fetch_content(links) when is_list(links) do
    links
    |> Enum.map_join(",", &get_id_from_link/1)
    |> fetch_by_ids()
  end

  # Fetches the posts identified by the comma-separated `ids` string.
  # Was a second, unguarded `fetch_content/1` clause with different
  # semantics; renamed for clarity (private, no external callers).
  defp fetch_by_ids(ids) do
    # Use HTTPS consistently — the feed request above already does.
    "https://www.reddit.com/by_id/#{ids}.json"
    |> HTTPoison.get!(@user_agent)
    |> Map.get(:body)
    |> parse_content()
  end

  # Decodes the listing JSON and extracts the plain text of every child post.
  defp parse_content(content) do
    %{"data" => %{"children" => children}} = Poison.decode!(content)

    Enum.map(children, fn %{"data" => %{"selftext_html" => text}} ->
      # NOTE(review): `selftext_html` can be nil for link-only posts, which
      # would make Floki raise — same behavior as the original; confirm the
      # target subreddits are self-post only.
      text |> Floki.parse() |> Floki.text()
    end)
  end

  # Builds a reddit fullname (t3_<id>) from a comments URL, e.g.
  # https://www.reddit.com/r/elixir/comments/<id>/<slug>/ -> "t3_<id>".
  # Index 5 of the trim-split path segments is the post id.
  defp get_id_from_link(link) do
    "t3_" <> Enum.at(String.split(link, "/", trim: true), 5)
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment