eapache · November 29, 2024 15:57
diff --git a/baby_rag.py b/baby_rag.py
 #!python3

 from libzim.reader import Archive
 from libzim.search import Query, Searcher
 from openai import OpenAI
 from markdownify import markdownify
 import re
 import sys

 zim = Archive("/tmp/wikipedia_en_top1m_maxi_2024-05.zim")
 searcher = Searcher(zim)
 client = OpenAI(api_key="-", base_url="http://localhost:8080/")
 context_size = 8192
 approx_context_chars = context_size * 2

 def simplify(string):
    return re.sub('\n\n\n+', '\n\n', string).strip()

 def chat(messages):
    response = client.chat.completions.create(messages=messages, model="-")
    return response.choices[0].message.content

 def articles_matching(query_string, results=10):
    print(f"searching for '{query_string}'")
    query = Query().set_query(query_string)
    search = searcher.search(query)
    return list(search.getResults(0, results))

 def summarize_article(path, query):
    try:
        entry = zim.get_entry_by_path(path)
    except KeyError:
        return f"No article at path {path}"
    content = bytes(entry.get_item().content).decode("UTF-8")
    content = simplify(markdownify(content, escape_misc=False))
    print(f"summarizing '{path}' ({len(content)} characters)")

    responses = []
    for i in range(0, len(content), approx_context_chars):
        if i > 0:
            print(f"...{i}")
        chunk = content[i:i+approx_context_chars] 
        response = chat([
            {
                "user": "system",
                "content": """
 Be precise and concise. In 200 words or less, summarize the key facts in the
 provided text that pertain to the user's query. If the text does not contain any
 information that answers the user's query, respond instead with "NO_DATA".
                """
            },
            { "user": "user", "content": query },
            { "user": "system", "content": chunk },
        ])
        if response.strip() != "NO_DATA":
            responses.append(response) 
    return simplify('\n'.join(responses))

 def query(query):
    prompt = [
        {
            "user": "system",
            "content": """
 Be precise and concise. Do not ask the user for more information.
 Instead, load additional information by using the following commands:
  /search QUERY - returns a list of wikipedia article paths
  /summarize ARTICLE - returns a summary of the given wikipedia article path

 Give one command at a time, and wait for the response before giving another.
 The first line not starting with a '/' is your final answer.
            """
        },
        { "user": "user", "content": query }
    ]
    while True:
        print("prompting...")
        response = simplify(chat(prompt))
        prompt.append({ "user": "agent", "content": response })
        if response.startswith("/"):
            if response.startswith("/search"):
                search = simplify(response.removeprefix("/search "))
                matches = articles_matching(search)
                prompt.append({ "user": "system", "content": '\n'.join(matches) })
            elif response.startswith("/summarize"):
                article = simplify(response.removeprefix("/summarize "))
                summary = summarize_article(article, query)
                prompt.append({ "user": "system", "content": summary })
        else:
            return response

 print(query(sys.argv[1]))
	#!python3

	from libzim.reader import Archive
	from libzim.search import Query, Searcher
	from openai import OpenAI
	from markdownify import markdownify
	import re
	import sys

	zim = Archive("/tmp/wikipedia_en_top1m_maxi_2024-05.zim")
	searcher = Searcher(zim)
	client = OpenAI(api_key="-", base_url="http://localhost:8080/")
	context_size = 8192
	approx_context_chars = context_size * 2

	def simplify(string):
	return re.sub('\n\n\n+', '\n\n', string).strip()

	def chat(messages):
	response = client.chat.completions.create(messages=messages, model="-")
	return response.choices[0].message.content

	def articles_matching(query_string, results=10):
	print(f"searching for '{query_string}'")
	query = Query().set_query(query_string)
	search = searcher.search(query)
	return list(search.getResults(0, results))

	def summarize_article(path, query):
	try:
	entry = zim.get_entry_by_path(path)
	except KeyError:
	return f"No article at path {path}"
	content = bytes(entry.get_item().content).decode("UTF-8")
	content = simplify(markdownify(content, escape_misc=False))
	print(f"summarizing '{path}' ({len(content)} characters)")

	responses = []
	for i in range(0, len(content), approx_context_chars):
	if i > 0:
	print(f"...{i}")
	chunk = content[i:i+approx_context_chars]
	response = chat([
	{
	"user": "system",
	"content": """
	Be precise and concise. In 200 words or less, summarize the key facts in the
	provided text that pertain to the user's query. If the text does not contain any
	information that answers the user's query, respond instead with "NO_DATA".
	"""
	},
	{ "user": "user", "content": query },
	{ "user": "system", "content": chunk },
	])
	if response.strip() != "NO_DATA":
	responses.append(response)
	return simplify('\n'.join(responses))

	def query(query):
	prompt = [
	{
	"user": "system",
	"content": """
	Be precise and concise. Do not ask the user for more information.
	Instead, load additional information by using the following commands:
	/search QUERY - returns a list of wikipedia article paths
	/summarize ARTICLE - returns a summary of the given wikipedia article path

	Give one command at a time, and wait for the response before giving another.
	The first line not starting with a '/' is your final answer.
	"""
	},
	{ "user": "user", "content": query }
	]
	while True:
	print("prompting...")
	response = simplify(chat(prompt))
	prompt.append({ "user": "agent", "content": response })
	if response.startswith("/"):
	if response.startswith("/search"):
	search = simplify(response.removeprefix("/search "))
	matches = articles_matching(search)
	prompt.append({ "user": "system", "content": '\n'.join(matches) })
	elif response.startswith("/summarize"):
	article = simplify(response.removeprefix("/summarize "))
	summary = summarize_article(article, query)
	prompt.append({ "user": "system", "content": summary })
	else:
	return response

	print(query(sys.argv[1]))