-
-
Save renews/0f1958a764f14d4e1a4f6fb504898a94 to your computer and use it in GitHub Desktop.
AI Web Search
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# You will need to install https://github.com/cpursley/html2markdown
defmodule Webpage do
  @moduledoc false

  # One summarized web search result.
  #   :url         - page URL (binary; guarded by the caller)
  #   :title       - result title as returned by the search API
  #   :description - result description converted to markdown via Html2Markdown
  #   :summary     - LLM-generated summary of the page content (binary)
  #   :page_age    - NaiveDateTime the page was published, or nil when unknown
  defstruct [:url, :title, :description, :summary, :page_age]
end
defmodule WebSearch do | |
@moduledoc """ | |
Web search summarization chain | |
""" | |
alias LangChain.Chains.LLMChain | |
alias LangChain.Message | |
defstruct [:query, :summary, :sources] | |
@doc """ | |
Summarizes the search results for a given query and returns a structured summary of the web pages and their sources. | |
TODO: | |
- Shorten-up prompt instructions | |
- More robust relevance and ranking | |
- Make max_token_length configurable | |
- Don't trim long context, recursively split it into sections | |
- Consider allowing PDF web result parsing | |
- Extract out to Langchain Chain with config that allows different llm(s) to be passed in | |
""" | |
def summarize(search_query, limit \\ 8, timeout \\ 10_000) do | |
with %{"web" => %{"results" => results}} <- Brave.search(search_query, limit), | |
summarized_search_results when is_list(summarized_search_results) <- | |
summarize_search_results(search_query, results, timeout), | |
summary when is_binary(summary) <- webpages_summarizer(search_query, summarized_search_results), | |
sources when is_list(sources) <- map_sources(summarized_search_results) do | |
{ | |
:ok, | |
%WebSearch{ | |
query: search_query, | |
summary: summary, | |
sources: sources | |
} | |
} | |
else | |
{:error, error} -> | |
{:error, error} | |
_ -> | |
{:error, "Failed to summarize search results"} | |
end | |
end | |
def summarize_webpage(search_query, %{"url" => url} = result) when is_binary(url) do | |
with content when is_binary(content) <- request_content(url), | |
parsed_content when is_binary(parsed_content) <- preprocess_webpage_content(content), | |
summary when is_binary(summary) <- webpage_summarizer(search_query, parsed_content) do | |
%Webpage{ | |
url: url, | |
title: result["title"], | |
description: Html2Markdown.convert(result["description"]), | |
summary: summary, | |
page_age: cast_page_age(result["page_age"]) | |
} | |
else | |
{:error, error} -> | |
{:error, error} | |
_ -> | |
{:error, "Failed to summarize webpage"} | |
end | |
end | |
defp summarize_search_results(search_query, results, timeout) do | |
results | |
|> Enum.map(fn result -> Map.take(result, ["title", "description", "url", "page_age"]) end) | |
|> Task.async_stream(&summarize_webpage(search_query, &1), timeout: timeout, on_timeout: :kill_task) | |
|> Enum.filter(fn | |
{:ok, %Webpage{} = result} -> result | |
_ -> false | |
end) | |
|> Enum.map(fn {:ok, result} -> result end) | |
end | |
  # Combines per-page summaries into one answer for the user's query.
  # Returns the LLM's response content (binary) on success, or an error
  # tuple passed through from run_chain/2.
  defp webpages_summarizer(search_query, results) do
    # Larger model for the final combination step.
    llm = Models.llama_v3_8b_instruct()

    system_message = """
    You are a helpful web search results summarizer. Your task is to deliver a concise and accurate response to a user's query, drawing from the provided search result summaries.
    Please combine the following web page summaries into a single, comprehensive summary. The individual summaries have been generated by an LLM and cover different aspects or sections of a larger topic.
    Before combining the summaries, consider the following:
    - Assess the relevance of each individual summary to the original user query.
    - Give higher preference to summaries with a newer page age when the topic of the user's query is time-sensitive.
    - Filter out summaries that are not strongly relevant to the user's query or do not contribute significantly to answering the question.
    - Ignore any summaries that are empty, have no content, or contain only whitespace characters.
    When creating the combined summary, consider the following:
    - Identify common themes, topics, or ideas across the relevant individual summaries.
    - Organize the information in a logical and coherent manner, ensuring a smooth flow between the different sections.
    - Synthesize the key points and main takeaways from each relevant summary, while avoiding repetition or redundancy.
    - Maintain the accuracy and integrity of the information presented in the original summaries.
    - Use clear and concise language to convey the combined summary effectively using an unbiased and journalistic tone.
    - If there are any contradictory or conflicting points across the summaries, try to reconcile them or present them objectively.
    - Don't use phrases like "here is", "this article", "this webpage", "the page", "the content" or other hedging language.
    - Focus on providing information that directly addresses the user's query and helps answer their question comprehensively.
    Combining data:
    - If you encounter similar or overlapping data lists or tables across multiple summaries, merge them into a single, comprehensive list or table.
    - Identify the common fields or properties present in the overlapping data lists.
    - Merge the data from the overlapping lists and table, ensuring that each unique entry is represented only once in the combined list or table.
    - If there are conflicting values for the same entry across different lists, use your best judgment to determine the most accurate or relevant value to include in the combined list.
    Formatting:
    - Use appropriate headings, subheadings, or bullet points to organize the information.
    - If the data lends itself well to a tabular format (e.g., comparisons, lists with multiple properties), consider presenting it in a markdown table.
    - If a table is not suitable, use other appropriate markdown formatting such as lists, code blocks, or blockquotes to present the information effectively.
    - Do not trim or remove any relevant data from the tables or lists and don't use placeholders
    - Do not list your sources and never write URLs or links!
    Please provide your response in well structured markdown format. But don't mention "markdown" in your response.
    """

    # map_summaries/1 renders the per-page summaries and trims them to the
    # model's context budget before interpolation.
    user_message = """
    User query: #{search_query}
    ## Individual web page summaries:
    #{map_summaries(results)}
    """

    messages = [
      Message.new_system!(system_message),
      Message.new_user!(user_message)
    ]

    run_chain(llm, messages)
  end
  # Summarizes a single page's (already markdown-converted) content for the
  # user's query. Returns the LLM's response content (binary) on success, or
  # an error tuple passed through from run_chain/2.
  defp webpage_summarizer(search_query, content) do
    # Long-context model for per-page extraction (content can be large).
    llm = Models.phi_3_mini_128k_instruct()

    system_message = """
    You are a helpful web page data extractor and summarizer.
    Please analyze the following web page content and extract the key meanings into a summary without losing any important information and extract the structured data without modifying its format.
    Before summarizing the content, consider the following:
    - Assess the relevance of the content to the original user query.
    - Filter out content that are not strongly relevant to the user's query or do not contribute significantly to answering the question.
    - Ignore any content that is empty, or contain only whitespace characters.
    Summary:
    - Identify the main themes, topics, or ideas discussed in the content.
    - Recognize important facts, figures, or examples that support the main points.
    - Capture any essential context or background information necessary for understanding the content.
    - Avoid repetition and eliminate any redundant or less critical information.
    - Organize the summary by grouping related meanings together under relevant headings or sections.
    - Don't return any promotional or irrelevant information.
    - Use clear and concise language to convey the content effectively using an unbiased and journalistic tone.
    - Don't use phrases like "this article", "this webpage", "the page", "the content" or other hedging language.
    - Focus on providing information that directly addresses the user's query and helps answer their question comprehensively.
    Data:
    - Identify and extract tables, lists, code snippets, or any other formatted data present in the content.
    - Maintain the original structure and formatting of the extracted data.
    - Ensure that no information is lost or altered during the extraction process.
    - If there are multiple instances of structured data, extract each instance separately.
    Please provide your response in well structured markdown format. But don't mention "markdown" in your response.
    """

    # NOTE(review): the fence says ```html but content has already been run
    # through Html2Markdown.convert/1 upstream — presumably markdown; confirm.
    user_message = """
    User query: #{search_query}
    ## Web page content to summarize:
    ```html
    #{content}
    ```
    """

    messages = [
      Message.new_system!(system_message),
      Message.new_user!(user_message)
    ]

    run_chain(llm, messages)
  end
defp run_chain(llm, messages) do | |
%{llm: llm, verbose: false} | |
|> LLMChain.new!() | |
|> LLMChain.add_messages(messages) | |
|> LLMChain.run(mode: :while_needs_response) | |
|> case do | |
{:ok, _chain, %{content: content}} -> | |
content | |
error -> | |
error | |
end | |
end | |
defp map_sources(summarized_webpages) do | |
Enum.map(summarized_webpages, fn summarized_webpage -> | |
%{ | |
url: summarized_webpage.url, | |
title: summarized_webpage.title | |
} | |
end) | |
end | |
defp map_summaries(results) do | |
# Llama 3 estimated token length (with some wiggle-room): (string length) / 4 | |
max_token_length = 7_200 * 4 | |
results | |
|> Enum.with_index() | |
|> Enum.map_join("\n", fn {result, index} -> | |
""" | |
### Web Page #{index + 1}: | |
Title: #{result.title} | |
Description: #{result.description} | |
Summary: #{result.summary} | |
Page Age: #{calculate_page_age(result.page_age)} | |
""" | |
end) | |
|> maybe_trim_to_context_limit(max_token_length) | |
end | |
defp request_content(url) do | |
case URI.new(url) do | |
{:ok, %URI{scheme: "https", path: path}} -> | |
unless is_pdf_uri?(path) do | |
url | |
|> fetch_content() | |
|> Html2Markdown.convert() | |
end | |
_ -> | |
nil | |
end | |
end | |
defp is_pdf_uri?(path), do: Path.extname(path) == ".pdf" | |
defp fetch_content(url) do | |
case Req.get(url) do | |
{:ok, %Req.Response{status: 200, body: content}} -> content | |
{:ok, request} -> {:error, request} | |
{:error, error} -> {:error, error} | |
end | |
end | |
defp preprocess_webpage_content(content) do | |
# Phi 3 estimated token length (with some wiggle-room): (string length) / 4 | |
max_token_length = 85_000 * 4 | |
maybe_trim_to_context_limit(content, max_token_length) | |
end | |
defp maybe_trim_to_context_limit(content, max_token_length) | |
when is_binary(content) and byte_size(content) <= max_token_length do | |
content | |
end | |
defp maybe_trim_to_context_limit(content, max_token_length) | |
when is_binary(content) and byte_size(content) >= max_token_length do | |
content | |
|> String.slice(0, max_token_length) | |
|> String.trim() | |
end | |
defp cast_page_age(date_string) when is_binary(date_string) do | |
case NaiveDateTime.from_iso8601(date_string) do | |
{:ok, parsed_date} -> | |
parsed_date | |
{:error, _error} -> | |
nil | |
end | |
end | |
defp cast_page_age(_date_string), do: nil | |
defp calculate_page_age(nil), do: "Unknown age" | |
defp calculate_page_age(%NaiveDateTime{} = page_age) do | |
total_days = | |
NaiveDateTime.utc_now() | |
|> NaiveDateTime.diff(page_age, :second) | |
|> div(86_400) | |
cond do | |
total_days < 60 -> | |
"#{total_days} " <> Inflex.inflect("day", total_days) | |
total_days < 365 -> | |
months = div(total_days, 30) | |
"#{months} " <> Inflex.inflect("month", months) | |
true -> | |
years = div(total_days, 365) | |
"#{years} " <> Inflex.inflect("year", years) | |
end | |
end | |
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
defmodule Requests.Brave do
  @moduledoc """
  Web search using Brave
  Docs: https://api.search.brave.com/app/documentation/web-search/get-started
  """

  @brave_search_url "https://api.search.brave.com/res/v1/web/search"

  @doc """
  Runs a Brave web search for `query`.

  Returns the decoded response body on HTTP 200, `{:error, %Req.Response{}}`
  for any other status, and `{:error, reason}` on transport failure.
  """
  def search(query, count \\ 20, result_filter \\ "query, web") do
    params = %{q: query, result_filter: result_filter, count: count}

    case get(@brave_search_url, params) do
      {:ok, %Req.Response{status: 200, body: body}} ->
        body

      {:ok, %Req.Response{} = response} ->
        {:error, response}

      {:error, reason} ->
        {:error, reason}
    end
  end

  # Fix: the API key was read with Application.compile_env/2 into a module
  # attribute, freezing the secret at compile time (and baking it into the
  # build). Read it at runtime instead. Also routes search/3 through the
  # previously unused private get/2, removing an unused-function warning.
  defp get(url, params) do
    Req.get(url, headers: headers(), params: params)
  end

  defp headers do
    [
      {"Accept", "application/json"},
      {"Accept-Encoding", "gzip"},
      {"X-Subscription-Token", Application.get_env(:your_app, :brave_api_key)}
    ]
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment