Auto-choosing chunked AI summary bot
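The script below renders a page in headless Chromium via Playwright, strips the markup with BeautifulSoup, splits the text into sentence-aligned chunks of at most MAX_CONTEXT_SIZE // 2 tokens with NLTK, and carries the last OVERLAP_TOKENS tokens of each chunk into the next prompt for continuity. For every chunk it asks a local llama3 model (through the ollama CLI) for two candidate summaries at different target lengths, has the model relay the better candidate, and finally merges the chosen chunk summaries into a single markdown summary.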
import os
import subprocess

from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize, word_tokenize
from playwright.sync_api import sync_playwright

MAX_CONTEXT_SIZE = 8000   # rough token budget for a single llama3 prompt
OVERLAP_TOKENS = 100      # trailing tokens of the previous chunk carried into the next prompt
def fetch_html(url):
    # Render the page in headless Chromium so JavaScript-generated content is included.
    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page()
        page.goto(url)
        html = page.content()
        title = page.title()
        browser.close()
        return html, title
def summarize(text, title, max_size=MAX_CONTEXT_SIZE):
    # Ask llama3 (via the ollama CLI) for a summary capped at roughly a quarter of the budget, in words.
    # The text to summarize is passed on stdin; the instruction is passed as the prompt argument.
    prompt = f"summarize the actual content of {title}, max {int(max_size / 4)} words"
    process = subprocess.run(['ollama', 'run', 'llama3', prompt],
                             input=text.encode('utf-8'),
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             check=True)
    return process.stdout.decode().strip()
def get_chunks(html):
    # Strip markup, then pack whole sentences into chunks of at most half the context budget.
    soup = BeautifulSoup(html, 'lxml')
    text = soup.get_text()
    sentences = sent_tokenize(text)
    chunks = []
    chunk = []
    for sentence in sentences:
        tokens = word_tokenize(sentence)
        if len(chunk) + len(tokens) <= MAX_CONTEXT_SIZE // 2:
            chunk.extend(tokens)
        else:
            chunks.append(' '.join(chunk))
            chunk = tokens
    chunks.append(' '.join(chunk))
    return chunks
def load_html_from_string(html, title):
    chunks = get_chunks(html)
    summaries = []
    for i, chunk in enumerate(chunks):
        # Prepend the tail of the previous chunk so context split across chunk boundaries is preserved.
        overlap = ''
        if i > 0:
            overlap = ' '.join(chunks[i - 1].split()[-OVERLAP_TOKENS:])
        # Generate two candidate summaries at different target lengths and let the model pick.
        summary1 = summarize(f"{overlap} {chunk}", title)
        summary2 = summarize(f"{overlap} {chunk}", title, max_size=MAX_CONTEXT_SIZE // 2)
        better_summary = evaluate_summaries(summary1, summary2, title)
        print(f"Choices for chunk {i + 1}:")
        print("Choice 1")
        print("--------")
        print(summary1)
        print("--------")
        print("Choice 2")
        print("--------")
        print(summary2)
        print(f"Chosen summary: {better_summary}\n")
        summaries.append(better_summary)
    return summaries
def evaluate_summaries(summary1, summary2, title):
    # Ask the model to relay the better of the two candidate summaries, ideally verbatim.
    prompt = (f"You are a bot that relays the best summary of two choices. Only output the exact "
              f"content of {title} of the chosen best with no extra text as you are only a part of "
              f"the larger output: \r\nChoice 1\r\n {summary1} \r\n---\r\nChoice 2\r\n {summary2}")
    process = subprocess.run(['ollama', 'run', 'llama3', prompt],
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             check=True)
    return process.stdout.decode().strip()
def combine_summaries(summaries):
    combined_text = ' '.join(summaries)
    # Character-length guard: condense once more if the joined chunk summaries are still too long.
    if len(combined_text) > MAX_CONTEXT_SIZE:
        combined_text = summarize(combined_text, '', max_size=MAX_CONTEXT_SIZE // 2)
    process = subprocess.run(['ollama', 'run', 'llama3',
                              'These summaries are the response of a bot choosing the best of 2 '
                              'sub summaries. Clean up any parts they relayed other than just the '
                              'content and combine these sub-summaries into a comprehensive summary '
                              'of actual content in markdown format:'],
                             input=combined_text.encode('utf-8'),
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             check=True)
    return process.stdout.decode().strip()
if __name__ == "__main__":
    import sys

    if len(sys.argv) != 2:
        print("Usage: script.py <URL>")
        sys.exit(1)

    url = sys.argv[1]
    html, title = fetch_html(url)
    summaries = load_html_from_string(html, title)
    comprehensive_summary = combine_summaries(summaries)
    print(comprehensive_summary)
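A typical setup for running the script (saved as, say, script.py, per the usage string) looks like the following; exact steps depend on your environment, and recent NLTK releases may also need the punkt_tab resource:

pip install playwright beautifulsoup4 nltk lxml
playwright install chromium
python -c "import nltk; nltk.download('punkt')"
ollama pull llama3
python script.py https://example.com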