Last active
August 3, 2024 15:51
-
-
Save monotykamary/29664cf8d7829c4827156e7478594d77 to your computer and use it in GitHub Desktop.
Crawl Website to JSON with Elixir Langchain
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env elixir | |
Mix.install([ | |
{:langchain, "~> 0.3.0-rc.0"}, | |
{:httpoison, "~> 2.2"}, | |
{:floki, "~> 0.36"}, | |
{:jason, "~> 1.4"} | |
]) | |
defmodule WebScraper do
  @moduledoc """
  Fetches a web page and reduces it to plain text suitable for an LLM prompt.
  """

  # Selectors tried one at a time, in priority order. Querying them as a single
  # combined selector returns nested matches together (for example an
  # `article` inside `main`), and the same text would then be emitted once per
  # matching ancestor.
  @content_selectors ["main", "article", ".content", "#content"]

  @doc """
  Downloads `url` and returns `{:ok, body}` for an HTTP 200 response.

  Redirects are followed. Any other outcome yields `{:error, message}` where
  `message` is a human-readable string.
  """
  def scrape(url) do
    case HTTPoison.get(url, [], recv_timeout: 30_000, follow_redirect: true) do
      {:ok, %HTTPoison.Response{body: body, status_code: 200}} ->
        {:ok, body}

      {:ok, %HTTPoison.Response{status_code: status}} ->
        {:error, "Failed to fetch the website: unexpected HTTP status #{status}"}

      {:error, %HTTPoison.Error{reason: reason}} ->
        {:error, "Failed to fetch the website: #{inspect(reason)}"}

      _ ->
        {:error, "Unknown error while fetching the website"}
    end
  end

  @doc """
  Extracts the visible text of the page's main content area from `html`.

  The first selector in `main`, `article`, `.content`, `#content` that matches
  anything is used; when none match, the whole `body` is used instead. Returns
  the trimmed text.
  """
  def extract_main_content(html) do
    {:ok, document} = Floki.parse_document(html)

    nodes =
      Enum.find_value(@content_selectors, fn selector ->
        case Floki.find(document, selector) do
          [] -> nil
          matches -> matches
        end
      end) || Floki.find(document, "body")

    # A separator keeps words from adjacent elements apart; without it
    # "<p>a</p><p>b</p>" collapses to "ab", which also skews the word count
    # used by truncate_content/2.
    nodes
    |> Floki.text(sep: " ")
    |> String.trim()
  end

  @doc """
  Limits `content` to at most `max_tokens` whitespace-separated words.

  Note that the limit counts words, not model tokens. Content within the limit
  is returned unchanged. Longer content is cut to the first `max_tokens` words,
  re-joined with single spaces, and suffixed with `"..."`.
  """
  def truncate_content(content, max_tokens \\ 100_000) do
    words = String.split(content)

    if length(words) > max_tokens do
      words
      |> Enum.take(max_tokens)
      |> Enum.join(" ")
      |> Kernel.<>("...")
    else
      content
    end
  end
end
defmodule SmartCrawl do
  @moduledoc """
  Fetches a web page, extracts its main text, and asks an OpenAI chat model to
  return the requested information as JSON, streaming the answer to stdout.
  """

  alias LangChain.ChatModels.ChatOpenAI
  alias LangChain.Chains.LLMChain
  alias LangChain.Message
  alias LangChain.MessageDelta

  @doc """
  Runs the full pipeline for `url`, printing progress to stdout.

  `data_request` is free-form text describing what to extract. On success the
  result of `LLMChain.run/1` is returned. When the page cannot be fetched, the
  API key is missing, or the model call fails, the problem is printed and an
  `{:error, reason}` tuple is returned instead of raising.
  """
  def run(url, data_request) do
    IO.puts("Fetching content from #{url}...")

    case WebScraper.scrape(url) do
      {:ok, html_content} ->
        IO.puts("Content fetched successfully. Content length: #{String.length(html_content)} characters")

        main_content = WebScraper.extract_main_content(html_content)
        IO.puts("Main content extracted. Length: #{String.length(main_content)} characters")

        truncated_content = WebScraper.truncate_content(main_content)
        IO.puts("Content truncated. Final length: #{String.length(truncated_content)} characters")

        IO.puts("Extracting data...")
        extract_data(truncated_content, data_request)

      {:error, reason} ->
        IO.puts("Error fetching content: #{reason}")
        {:error, reason}
    end
  end

  # Checks for the API key before doing any work, so a missing key produces a
  # clear message rather than a failed request.
  defp extract_data(content, data_request) do
    case System.get_env("OPENAI_API_KEY") do
      missing when missing in [nil, ""] ->
        IO.puts("Error: the OPENAI_API_KEY environment variable is not set.")
        {:error, :missing_api_key}

      api_key ->
        IO.puts("Sending request to OpenAI API...")
        request_extraction(api_key, build_prompt(content, data_request))
    end
  end

  # Builds the user message: the page text followed by the extraction request.
  defp build_prompt(content, data_request) do
    """
    Content:
    #{content}
    Based on the content above, please extract the following information:
    #{data_request}
    Return the result as a JSON object.
    """
  end

  # Sends the prompt through an LLMChain with streaming enabled. The callbacks
  # echo each streamed delta as it arrives and print token usage at the end.
  defp request_extraction(api_key, prompt) do
    handler = %{
      on_llm_new_delta: fn _model, %MessageDelta{} = data ->
        IO.write(data.content)
      end,
      on_llm_token_usage: fn _model, usage ->
        IO.puts("\nToken usage: #{inspect(usage)}")
      end
    }

    {:ok, llm_chain} =
      %{
        llm:
          ChatOpenAI.new!(%{
            model: "gpt-4o-mini-2024-07-18",
            temperature: 0.7,
            stream: true,
            json_response: true,
            callbacks: [handler],
            api_key: api_key
          }),
        callbacks: [handler]
      }
      |> LLMChain.new()

    result =
      llm_chain
      |> LLMChain.add_messages([
        Message.new_system!("You are a helpful assistant designed to output JSON."),
        Message.new_user!(prompt)
      ])
      |> LLMChain.run()

    # Report failures instead of crashing on an assertive `{:ok, _, _}` match.
    # Both two- and three-element error tuples are accepted; any other result
    # is passed back to the caller untouched.
    case result do
      {:error, reason} ->
        report_llm_error(reason)

      {:error, _chain, reason} ->
        report_llm_error(reason)

      success ->
        success
    end
  end

  # Prints the failure on its own line (streamed output may not have ended
  # with a newline) and normalizes it to an error tuple.
  defp report_llm_error(reason) do
    IO.puts("\nError from OpenAI API: #{inspect(reason)}")
    {:error, reason}
  end
end
# Get the URL and the optional data request from the command line arguments.
# The whole argv list is matched so that running the script with no arguments
# prints the usage hint; destructuring `[url | rest]` up front would raise a
# MatchError on an empty list before the hint could ever be shown.
default_request = "Extract main content and key information from this webpage"

case System.argv() do
  [] ->
    IO.puts("Please provide a URL as an argument.")

  [url] ->
    SmartCrawl.run(url, default_request)

  # Any arguments after the data request are ignored.
  [url, data_request | _] ->
    SmartCrawl.run(url, data_request)
end
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Fetching content from https://www.headfonia.com/meze-audio-109-pro-review/2/... | |
Content fetched successfully. Content length: 124707 characters | |
Main content extracted. Length: 17055 characters | |
Content truncated. Final length: 17055 characters | |
Extracting data... | |
Sending request to OpenAI API... | |
{ | |
"title": "Meze Audio 109 Pro Review", | |
"author": "Berkhan", | |
"date": "22/09/2022", | |
"price_range": "$500-$1000", | |
"type": "Full Size Headphones", | |
"rating": 4.3, | |
"votes": 129, | |
"sound_quality": { | |
"overall": "Excellent performer across many music genres and setups", | |
"bass": { | |
"description": "Speedy and reasonably impactful bass response", | |
"characteristics": [ | |
"Natural and breathy presentation", | |
"Well-balanced definition and control", | |
"Enjoyable for Pop and RnB tracks", | |
"Quality of bass is impressive with great decay and recovery" | |
], | |
"performance": "Mid-bass and sub-bass balance are excellent with smooth transitions to mid-range" | |
}, | |
"mids": { | |
"description": "Lively, vibrant, and enjoyable", | |
"characteristics": [ | |
"Good definition, positioning, detail, and transparency", | |
"Natural and flowing with great dynamism", | |
"Exceptional timbre quality" | |
], | |
"performance": "Very musical experience with excellent resolution and micro details" | |
}, | |
"treble": { | |
"description": "Well extended with great detail and articulation", | |
"characteristics": [ | |
"Crisp and somewhat bright with good sparkle", | |
"Good air and space", | |
"Lower treble may be perceived as aggressive depending on the setup" | |
], | |
"performance": "Overall realistic and well-defined" | |
} | |
}, | |
"technical_performance": { | |
"sound_stage": "Quite good, especially in width", | |
"imaging": "Excellent layering and separation", | |
"details": "Good overall resolution and transparency" | |
}, | |
"author_bio": { | |
"description": "A keen audiophile and hobby photographer", | |
"interests": [ | |
"Full-frame cameras", | |
"Custom in-ear monitors", | |
"Photography and audio enthusiasm" | |
], | |
"personal_note": "Enjoys fine single malt along with favorite Jazz recordings" | |
}, | |
"comments": [ | |
{ | |
"commenter": "Shane D", | |
"comment": "Nice write-up. Seems like they were much more careful with the bass this time." | |
}, | |
{ | |
"commenter": "Evan", | |
"comment": "I wonder how these compare to the Focal Elex since they’re right at the same price and both dynamic drivers." | |
}, | |
{ | |
"commenter": "Frank", | |
"comment": "How does 109 Pro compare to beyerdynamic T1 3rd?" | |
}, | |
{ | |
"commenter": "Aldo Cavacece", | |
"comment": "Why don’t you write that this headphone can compete with high-priced headphones?" | |
}, | |
{ | |
"commenter": "Rita Mohanan", | |
"comment": "Amazing review. I have been enjoying the Meze 99 pro since a long time." | |
} | |
] | |
}% |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Fetching content from https://shopify-demo.gatsbyjs.com/products/automotive/... | |
Content fetched successfully. Content length: 152260 characters | |
Main content extracted. Length: 1093 characters | |
Content truncated. Final length: 1093 characters | |
Extracting data... | |
Sending request to OpenAI API... | |
{ | |
"products": [ | |
{ | |
"company": "AutomotiveBartell Inc", | |
"product": "Fantastic Iron Knife", | |
"price": 30.63 | |
}, | |
{ | |
"company": "Lynch Inc", | |
"product": "Mediocre Leather Pants", | |
"price": 62.15 | |
}, | |
{ | |
"company": "Schmitt, Doyle and Stehr", | |
"product": "Gorgeous Rubber Watch", | |
"price": 2.40 | |
}, | |
{ | |
"company": "Kessler-Cruickshank", | |
"product": "Intelligent Rubber Gloves", | |
"price": 19.77 | |
}, | |
{ | |
"company": "Haag-Lebsack", | |
"product": "Incredible Wooden Lamp", | |
"price": 41.64 | |
}, | |
{ | |
"company": "Douglas Group", | |
"product": "Practical Cotton Bench", | |
"price": 68.48 | |
}, | |
{ | |
"company": "Tromp, Sipes and Buckridge", | |
"product": "Practical Steel Bag", | |
"price": 6.02 | |
}, | |
{ | |
"company": "Glover-Hirthe", | |
"product": "Synergistic Linen Keyboard", | |
"price": 90.29 | |
}, | |
{ | |
"company": "Conn-Daniel", | |
"product": "Enormous Wool Keyboard", | |
"price": 79.11 | |
}, | |
{ | |
"company": "Barrows, Stanton and Goyette", | |
"product": "Mediocre Marble Car", | |
"price": 49.57 | |
}, | |
{ | |
"company": "Stoltenberg Inc", | |
"product": "Heavy Duty Wooden Keyboard", | |
"price": 40.97 | |
}, | |
{ | |
"company": "Bechtelar, Pagac and Kessler", | |
"product": "Mediocre Wool Table", | |
"price": 48.96 | |
}, | |
{ | |
"company": "McClure-Thompson", | |
"product": "Practical Concrete Shirt", | |
"price": 85.43 | |
}, | |
{ | |
"company": "Jast Group", | |
"product": "Aerodynamic Concrete Shoes", | |
"price": 5.06 | |
}, | |
{ | |
"company": "Shanahan-Lakin", | |
"product": "Aerodynamic Bronze Knife", | |
"price": 4.37 | |
}, | |
{ | |
"company": "Nikolaus-Pouros", | |
"product": "Heavy Duty Steel Lamp", | |
"price": 13.09 | |
}, | |
{ | |
"company": "Schiller LLC", | |
"product": "Awesome Iron Shoes", | |
"price": 60.49 | |
}, | |
{ | |
"company": "Pacocha, Homenick and O'Keefe", | |
"product": "Heavy Duty Paper Watch", | |
"price": 38.89 | |
}, | |
{ | |
"company": "Emmerich-Kuphal", | |
"product": "Gorgeous Aluminum Plate", | |
"price": 19.17 | |
}, | |
{ | |
"company": "Bartell, Carroll and DuBuque", | |
"product": "Enormous Silk Car", | |
"price": 72.16 | |
}, | |
{ | |
"company": "Price Inc", | |
"product": "Lightweight Bronze Bench", | |
"price": 65.86 | |
}, | |
{ | |
"company": "Stanton Group", | |
"product": "Intelligent Granite Table", | |
"price": 92.20 | |
}, | |
{ | |
"company": "Leffler Group", | |
"product": "Ergonomic Aluminum Keyboard", | |
"price": 81.73 | |
}, | |
{ | |
"company": "Hudson-Hoeger", | |
"product": "Gorgeous Paper Table", | |
"price": 65.24 | |
} | |
] | |
}% |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment