Auto-choosing chunked AI summary bot
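The script below renders a page in headless Chromium via Playwright, strips the markup with BeautifulSoup, splits the text into sentence-aligned chunks of at most MAX_CONTEXT_SIZE // 2 tokens with NLTK, and carries the last OVERLAP_TOKENS tokens of each chunk into the next prompt for continuity. For every chunk it asks a local llama3 model (through the ollama CLI) for two candidate summaries at different target lengths, has the model relay the better candidate, and finally merges the chosen chunk summaries into a single markdown summary.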
import os
import subprocess

from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize, word_tokenize
from playwright.sync_api import sync_playwright

MAX_CONTEXT_SIZE = 8000   # rough token budget for a single llama3 prompt
OVERLAP_TOKENS = 100      # trailing tokens of the previous chunk carried into the next prompt
def fetch_html(url):
    # Render the page in headless Chromium so JavaScript-generated content is included.
    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page()
        page.goto(url)
        html = page.content()
        title = page.title()
        browser.close()
        return html, title
def summarize(text, title, max_size=MAX_CONTEXT_SIZE):
    # Ask llama3 (via the ollama CLI) for a summary capped at roughly a quarter of the budget, in words.
    # The text to summarize is passed on stdin; the instruction is passed as the prompt argument.
    prompt = f"summarize the actual content of {title}, max {int(max_size / 4)} words"
    process = subprocess.run(['ollama', 'run', 'llama3', prompt],
                             input=text.encode('utf-8'),
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             check=True)
    return process.stdout.decode().strip()
def get_chunks(html):
    # Strip markup, then pack whole sentences into chunks of at most half the context budget.
    soup = BeautifulSoup(html, 'lxml')
    text = soup.get_text()
    sentences = sent_tokenize(text)
    chunks = []
    chunk = []
    for sentence in sentences:
        tokens = word_tokenize(sentence)
        if len(chunk) + len(tokens) <= MAX_CONTEXT_SIZE // 2:
            chunk.extend(tokens)
        else:
            chunks.append(' '.join(chunk))
            chunk = tokens
    chunks.append(' '.join(chunk))
    return chunks
def load_html_from_string(html, title):
    chunks = get_chunks(html)
    summaries = []
    for i, chunk in enumerate(chunks):
        # Prepend the tail of the previous chunk so context split across chunk boundaries is preserved.
        overlap = ''
        if i > 0:
            overlap = ' '.join(chunks[i - 1].split()[-OVERLAP_TOKENS:])
        # Generate two candidate summaries at different target lengths and let the model pick.
        summary1 = summarize(f"{overlap} {chunk}", title)
        summary2 = summarize(f"{overlap} {chunk}", title, max_size=MAX_CONTEXT_SIZE // 2)
        better_summary = evaluate_summaries(summary1, summary2, title)
        print(f"Choices for chunk {i + 1}:")
        print("Choice 1")
        print("--------")
        print(summary1)
        print("--------")
        print("Choice 2")
        print("--------")
        print(summary2)
        print(f"Chosen summary: {better_summary}\n")
        summaries.append(better_summary)
    return summaries
def evaluate_summaries(summary1, summary2, title):
    # Ask the model to relay the better of the two candidate summaries, ideally verbatim.
    prompt = (f"You are a bot that relays the best summary of two choices. Only output the exact "
              f"content of {title} of the chosen best with no extra text as you are only a part of "
              f"the larger output: \r\nChoice 1\r\n {summary1} \r\n---\r\nChoice 2\r\n {summary2}")
    process = subprocess.run(['ollama', 'run', 'llama3', prompt],
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             check=True)
    return process.stdout.decode().strip()
def combine_summaries(summaries):
    combined_text = ' '.join(summaries)
    # Character-length guard: condense once more if the joined chunk summaries are still too long.
    if len(combined_text) > MAX_CONTEXT_SIZE:
        combined_text = summarize(combined_text, '', max_size=MAX_CONTEXT_SIZE // 2)
    process = subprocess.run(['ollama', 'run', 'llama3',
                              'These summaries are the response of a bot choosing the best of 2 '
                              'sub summaries. Clean up any parts they relayed other than just the '
                              'content and combine these sub-summaries into a comprehensive summary '
                              'of actual content in markdown format:'],
                             input=combined_text.encode('utf-8'),
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             check=True)
    return process.stdout.decode().strip()
if __name__ == "__main__":
    import sys

    if len(sys.argv) != 2:
        print("Usage: script.py <URL>")
        sys.exit(1)

    url = sys.argv[1]
    html, title = fetch_html(url)
    summaries = load_html_from_string(html, title)
    comprehensive_summary = combine_summaries(summaries)
    print(comprehensive_summary)
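A typical setup for running the script (saved as, say, script.py, per the usage string) looks like the following; exact steps depend on your environment, and recent NLTK releases may also need the punkt_tab resource:

pip install playwright beautifulsoup4 nltk lxml
playwright install chromium
python -c "import nltk; nltk.download('punkt')"
ollama pull llama3
python script.py https://example.com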