Skip to content

Instantly share code, notes, and snippets.

@monotykamary
Last active August 3, 2024 15:51
Show Gist options
  • Save monotykamary/29664cf8d7829c4827156e7478594d77 to your computer and use it in GitHub Desktop.
Save monotykamary/29664cf8d7829c4827156e7478594d77 to your computer and use it in GitHub Desktop.
Crawl Website to JSON with Elixir Langchain
#!/usr/bin/env elixir
Mix.install([
{:langchain, "~> 0.3.0-rc.0"},
{:httpoison, "~> 2.2"},
{:floki, "~> 0.36"},
{:jason, "~> 1.4"}
])
defmodule WebScraper do
  @moduledoc """
  Downloads a web page and reduces it to plain text that can be embedded in an
  LLM prompt.
  """

  # Selectors are tried in this order and the first one that matches wins.
  # Querying them one at a time (instead of a single "main, article, ..."
  # selector) avoids collecting nested matches such as <main><article>, whose
  # text would otherwise be emitted twice. `body` is the last-resort fallback.
  @content_selectors ["main", "article", ".content", "#content", "body"]

  @doc """
  Fetches `url` with an HTTP GET (30 second receive timeout, redirects followed).

  Returns `{:ok, body}` when the final response has status 200, otherwise
  `{:error, message}` where `message` is a human-readable string.
  """
  @spec scrape(String.t()) :: {:ok, String.t()} | {:error, String.t()}
  def scrape(url) do
    case HTTPoison.get(url, [], recv_timeout: 30_000, follow_redirect: true) do
      {:ok, %HTTPoison.Response{status_code: 200, body: body}} ->
        {:ok, body}

      # Surface the status code instead of reporting an "unknown" error.
      {:ok, %HTTPoison.Response{status_code: status_code}} ->
        {:error, "Failed to fetch the website: unexpected HTTP status #{status_code}"}

      {:error, %HTTPoison.Error{reason: reason}} ->
        {:error, "Failed to fetch the website: #{inspect(reason)}"}

      _ ->
        {:error, "Unknown error while fetching the website"}
    end
  end

  @doc """
  Extracts the visible text of the page's main content from an HTML string.

  The first selector in `main`, `article`, `.content`, `#content`, `body` that
  matches anything is used. Text nodes are joined with a single space so words
  from adjacent elements do not run together, and the result is trimmed.

  Raises `MatchError` if Floki cannot parse `html`.
  """
  @spec extract_main_content(String.t()) :: String.t()
  def extract_main_content(html) do
    {:ok, document} = Floki.parse_document(html)

    @content_selectors
    |> Enum.find_value([], fn selector ->
      case Floki.find(document, selector) do
        [] -> nil
        nodes -> nodes
      end
    end)
    |> Floki.text(sep: " ")
    |> String.trim()
  end

  @doc """
  Limits `content` to at most `max_tokens` whitespace-separated words.

  Words are only an approximation of model tokens. When the content is within
  the limit it is returned unchanged; otherwise the first `max_tokens` words are
  joined with single spaces and `"..."` is appended.
  """
  @spec truncate_content(String.t(), integer()) :: String.t()
  def truncate_content(content, max_tokens \\ 100_000) do
    words = String.split(content)

    if length(words) > max_tokens do
      kept = words |> Enum.take(max_tokens) |> Enum.join(" ")
      kept <> "..."
    else
      content
    end
  end
end
defmodule SmartCrawl do
  @moduledoc """
  Fetches a web page, extracts its main text and asks an OpenAI chat model to
  return the requested information as JSON, streaming the answer to stdout.
  """

  alias LangChain.ChatModels.ChatOpenAI
  alias LangChain.Chains.LLMChain
  alias LangChain.Message
  alias LangChain.MessageDelta

  @doc """
  Runs the whole pipeline for `url`, printing progress to stdout.

  `data_request` is free-form text describing what to extract.

  Returns `{:ok, chain, response}` on success and `{:error, reason}` when the
  page cannot be fetched, the API key is missing, or the model call fails.
  """
  def run(url, data_request) do
    IO.puts("Fetching content from #{url}...")

    case WebScraper.scrape(url) do
      {:ok, html_content} ->
        IO.puts("Content fetched successfully. Content length: #{String.length(html_content)} characters")
        main_content = WebScraper.extract_main_content(html_content)
        IO.puts("Main content extracted. Length: #{String.length(main_content)} characters")
        truncated_content = WebScraper.truncate_content(main_content)
        IO.puts("Content truncated. Final length: #{String.length(truncated_content)} characters")
        IO.puts("Extracting data...")
        extract_data(truncated_content, data_request)

      {:error, reason} = error ->
        IO.puts("Error fetching content: #{reason}")
        # Hand the failure back to the caller instead of the `:ok` from IO.puts.
        error
    end
  end

  # Builds the prompt and sends it to the model, after checking that an API key
  # is available so a missing key fails with a clear message.
  defp extract_data(content, data_request) do
    prompt = """
    Content:
    #{content}
    Based on the content above, please extract the following information:
    #{data_request}
    Return the result as a JSON object.
    """

    case System.get_env("OPENAI_API_KEY") do
      missing when missing in [nil, ""] ->
        IO.puts("Error: the OPENAI_API_KEY environment variable is not set.")
        {:error, "OPENAI_API_KEY is not set"}

      api_key ->
        IO.puts("Sending request to OpenAI API...")
        request_extraction(prompt, api_key)
    end
  end

  # Creates the streaming chain, runs it and normalizes the result.
  defp request_extraction(prompt, api_key) do
    handler = %{
      # Print each streamed chunk as soon as it arrives.
      on_llm_new_delta: fn _model, %MessageDelta{} = data ->
        IO.write(data.content)
      end,
      on_llm_token_usage: fn _model, usage ->
        IO.puts("\nToken usage: #{inspect(usage)}")
      end
    }

    {:ok, llm_chain} =
      LLMChain.new(%{
        llm:
          ChatOpenAI.new!(%{
            model: "gpt-4o-mini-2024-07-18",
            temperature: 0.7,
            stream: true,
            json_response: true,
            callbacks: [handler],
            api_key: api_key
          }),
        callbacks: [handler]
      })

    llm_chain
    |> LLMChain.add_messages([
      Message.new_system!("You are a helpful assistant designed to output JSON."),
      Message.new_user!(prompt)
    ])
    |> LLMChain.run()
    |> normalize_run_result()
  end

  # The "~> 0.3.0-rc.0" requirement also resolves to later 0.3.x releases.
  # NOTE(review): those releases appear to return `{:ok, chain}` and
  # `{:error, chain, reason}` from `LLMChain.run/1` rather than the 3-tuple
  # matched originally - confirm against the LangChain changelog.
  # All four shapes are accepted here so no version raises a MatchError.
  defp normalize_run_result({:ok, chain, response}), do: {:ok, chain, response}
  defp normalize_run_result({:ok, chain}), do: {:ok, chain, Map.get(chain, :last_message)}
  defp normalize_run_result({:error, _chain, reason}), do: report_llm_error(reason)
  defp normalize_run_result({:error, reason}), do: report_llm_error(reason)

  # Prints the failure on its own line (streamed output may precede it).
  defp report_llm_error(reason) do
    IO.puts("\nError extracting data: #{inspect(reason)}")
    {:error, reason}
  end
end
# Read the URL and the optional data request from the command-line arguments.
# Matching on the whole argument list (rather than `[url | rest] = ...`) keeps
# an empty list from raising a MatchError, so the usage hint is reachable.
default_request = "Extract main content and key information from this webpage"

case System.argv() do
  [] ->
    IO.puts("Please provide a URL as an argument.")

  [url] ->
    SmartCrawl.run(url, default_request)

  # Any arguments after the data request are ignored.
  [url, data_request | _] ->
    SmartCrawl.run(url, data_request)
end
Fetching content from https://www.headfonia.com/meze-audio-109-pro-review/2/...
Content fetched successfully. Content length: 124707 characters
Main content extracted. Length: 17055 characters
Content truncated. Final length: 17055 characters
Extracting data...
Sending request to OpenAI API...
{
"title": "Meze Audio 109 Pro Review",
"author": "Berkhan",
"date": "22/09/2022",
"price_range": "$500-$1000",
"type": "Full Size Headphones",
"rating": 4.3,
"votes": 129,
"sound_quality": {
"overall": "Excellent performer across many music genres and setups",
"bass": {
"description": "Speedy and reasonably impactful bass response",
"characteristics": [
"Natural and breathy presentation",
"Well-balanced definition and control",
"Enjoyable for Pop and RnB tracks",
"Quality of bass is impressive with great decay and recovery"
],
"performance": "Mid-bass and sub-bass balance are excellent with smooth transitions to mid-range"
},
"mids": {
"description": "Lively, vibrant, and enjoyable",
"characteristics": [
"Good definition, positioning, detail, and transparency",
"Natural and flowing with great dynamism",
"Exceptional timbre quality"
],
"performance": "Very musical experience with excellent resolution and micro details"
},
"treble": {
"description": "Well extended with great detail and articulation",
"characteristics": [
"Crisp and somewhat bright with good sparkle",
"Good air and space",
"Lower treble may be perceived as aggressive depending on the setup"
],
"performance": "Overall realistic and well-defined"
}
},
"technical_performance": {
"sound_stage": "Quite good, especially in width",
"imaging": "Excellent layering and separation",
"details": "Good overall resolution and transparency"
},
"author_bio": {
"description": "A keen audiophile and hobby photographer",
"interests": [
"Full-frame cameras",
"Custom in-ear monitors",
"Photography and audio enthusiasm"
],
"personal_note": "Enjoys fine single malt along with favorite Jazz recordings"
},
"comments": [
{
"commenter": "Shane D",
"comment": "Nice write-up. Seems like they were much more careful with the bass this time."
},
{
"commenter": "Evan",
"comment": "I wonder how these compare to the Focal Elex since they’re right at the same price and both dynamic drivers."
},
{
"commenter": "Frank",
"comment": "How does 109 Pro compare to beyerdynamic T1 3rd?"
},
{
"commenter": "Aldo Cavacece",
"comment": "Why don’t you write that this headphone can compete with high-priced headphones?"
},
{
"commenter": "Rita Mohanan",
"comment": "Amazing review. I have been enjoying the Meze 99 pro since a long time."
}
]
}%
Fetching content from https://shopify-demo.gatsbyjs.com/products/automotive/...
Content fetched successfully. Content length: 152260 characters
Main content extracted. Length: 1093 characters
Content truncated. Final length: 1093 characters
Extracting data...
Sending request to OpenAI API...
{
"products": [
{
"company": "AutomotiveBartell Inc",
"product": "Fantastic Iron Knife",
"price": 30.63
},
{
"company": "Lynch Inc",
"product": "Mediocre Leather Pants",
"price": 62.15
},
{
"company": "Schmitt, Doyle and Stehr",
"product": "Gorgeous Rubber Watch",
"price": 2.40
},
{
"company": "Kessler-Cruickshank",
"product": "Intelligent Rubber Gloves",
"price": 19.77
},
{
"company": "Haag-Lebsack",
"product": "Incredible Wooden Lamp",
"price": 41.64
},
{
"company": "Douglas Group",
"product": "Practical Cotton Bench",
"price": 68.48
},
{
"company": "Tromp, Sipes and Buckridge",
"product": "Practical Steel Bag",
"price": 6.02
},
{
"company": "Glover-Hirthe",
"product": "Synergistic Linen Keyboard",
"price": 90.29
},
{
"company": "Conn-Daniel",
"product": "Enormous Wool Keyboard",
"price": 79.11
},
{
"company": "Barrows, Stanton and Goyette",
"product": "Mediocre Marble Car",
"price": 49.57
},
{
"company": "Stoltenberg Inc",
"product": "Heavy Duty Wooden Keyboard",
"price": 40.97
},
{
"company": "Bechtelar, Pagac and Kessler",
"product": "Mediocre Wool Table",
"price": 48.96
},
{
"company": "McClure-Thompson",
"product": "Practical Concrete Shirt",
"price": 85.43
},
{
"company": "Jast Group",
"product": "Aerodynamic Concrete Shoes",
"price": 5.06
},
{
"company": "Shanahan-Lakin",
"product": "Aerodynamic Bronze Knife",
"price": 4.37
},
{
"company": "Nikolaus-Pouros",
"product": "Heavy Duty Steel Lamp",
"price": 13.09
},
{
"company": "Schiller LLC",
"product": "Awesome Iron Shoes",
"price": 60.49
},
{
"company": "Pacocha, Homenick and O'Keefe",
"product": "Heavy Duty Paper Watch",
"price": 38.89
},
{
"company": "Emmerich-Kuphal",
"product": "Gorgeous Aluminum Plate",
"price": 19.17
},
{
"company": "Bartell, Carroll and DuBuque",
"product": "Enormous Silk Car",
"price": 72.16
},
{
"company": "Price Inc",
"product": "Lightweight Bronze Bench",
"price": 65.86
},
{
"company": "Stanton Group",
"product": "Intelligent Granite Table",
"price": 92.20
},
{
"company": "Leffler Group",
"product": "Ergonomic Aluminum Keyboard",
"price": 81.73
},
{
"company": "Hudson-Hoeger",
"product": "Gorgeous Paper Table",
"price": 65.24
}
]
}%
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment