Use Ollama to run an AI summariser on your machine for free
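To try it locally, the rough workflow is: install Ollama, pull a chat-capable model, save the script below, and point it at a file or string to summarize. For example (a minimal sketch; summarize.py and article.txt are placeholder names, and it assumes the Ollama server is running on its default port):

    ollama pull gemma2:9b
    python summarize.py --input-file article.txt --model gemma2:9b

The remaining flags (--input-text, --output-file, --max-tokens, --system-prompt, --chunk-size, --chunk-overlap, --no-chunking) are described in the argparse section of the script.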
import argparse
import os
import time
import sys
from openai import OpenAI


def chunk_text(text, chunk_size=400, overlap=100):
    """Split text into overlapping chunks of roughly chunk_size characters."""
    if len(text) <= chunk_size:
        return [text]

    chunks = []
    start = 0
    while start < len(text):
        # Determine end of current chunk
        end = min(start + chunk_size, len(text))

        # If we're not at the end of the text, try to find a good break point:
        # scan backwards (up to 200 characters) for a sentence-ending character
        # or newline that is followed by whitespace
        if end < len(text):
            for i in range(end, max(start, end - 200), -1):
                if text[i - 1] in '.!?\n' and text[i].isspace():
                    end = i
                    break

        # Add the chunk
        chunks.append(text[start:end])

        # If this chunk reached the end of the text, stop; otherwise the overlap
        # would pull start backwards and the loop would never terminate
        if end == len(text):
            break

        # Move start position for next chunk, accounting for overlap, while
        # making sure we always make progress
        start = max(end - overlap, start + 1)

    return chunks


def summarize_chunk(client, chunk, system_prompt, model, max_tokens):
    """Summarize a single chunk of text."""
    try:
        response = client.chat.completions.create(
            model=model,
            max_tokens=max_tokens,
            stream=False,  # No streaming for individual chunks
            messages=[
                {
                    "role": "system",
                    "content": system_prompt
                },
                {
                    "role": "user",
                    "content": chunk
                }
            ]
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"Error summarizing chunk: {e}")
        return f"Error processing this part of the text: {e}"


def main():
    # This is a fast, relatively small model from Google that summarizes decently
    DEFAULT_OPEN_WEIGHTS_MODEL = "gemma2:9b"
    # Note: nomic-embed-text is an embedding-only model, so it can't serve the
    # chat completions used below
    # DEFAULT_OPEN_WEIGHTS_MODEL = "nomic-embed-text:latest"
    IGNORED_OLLAMA_API_KEY = "ollama"

    # This is the Ollama server installed from ollama.com
    DEFAULT_OLLAMA_SERVER_URL = "http://localhost:11434/v1"

    # Default system prompt
    DEFAULT_SYSTEM_PROMPT = """You are a text summarization assistant.
Generate a concise summary of the given input text while preserving the key information and main points.
Provide the summary in three bullet points, totalling 100 words or less."""

    # System prompt for combining chunks
    COMBINE_CHUNKS_PROMPT = """You are a text summarization assistant.
Combine the following summaries into a coherent overall summary.
Eliminate redundancies and ensure the final summary captures all key points.
Provide the summary in three to five bullet points, totalling 150 words or less."""

    parser = argparse.ArgumentParser(description='Summarize text using Ollama models')
    parser.add_argument('--input-file', type=str, default='', help='Path to the input text file')
    parser.add_argument('--input-text', type=str, default='', help='Input text to summarize')
    parser.add_argument('--output-file', type=str, default='', help='Path to save the output summary')
    parser.add_argument('--model', type=str, default=DEFAULT_OPEN_WEIGHTS_MODEL, help='Model to use for the API')
    parser.add_argument('--base-url', type=str, default=DEFAULT_OLLAMA_SERVER_URL, help='Base URL for the Ollama server (which is OpenAI-compatible)')
    parser.add_argument('--max-tokens', type=int, default=100, help='Maximum number of tokens in the summary')
    parser.add_argument('--system-prompt', type=str, default=DEFAULT_SYSTEM_PROMPT, help='Custom system prompt to use')
    parser.add_argument('--chunk-size', type=int, default=1000, help='Character count per chunk for long texts')
    parser.add_argument('--chunk-overlap', type=int, default=100, help='Character overlap between chunks')
    parser.add_argument('--no-chunking', action='store_true', help='Disable chunking regardless of text length')

    args = parser.parse_args()

    user_message = ""
    if args.input_file:
        # Read input from file
        try:
            with open(args.input_file, 'r') as file:
                user_message = file.read()
        except Exception as e:
            print(f"Error reading input file: {e}")
            sys.exit(1)
    elif args.input_text:
        # Use input text from command-line argument
        user_message = args.input_text
    else:
        print("Either input-file or input-text must be provided")
        sys.exit(1)

    client = OpenAI(
        api_key=IGNORED_OLLAMA_API_KEY,
        base_url=args.base_url
    )

    start = time.time()

    # Determine if we need to chunk the text
    should_chunk = len(user_message) > args.chunk_size * 3 and not args.no_chunking
    print(f"Prompt is {len(user_message)} characters. Chunking: {should_chunk}")

    if should_chunk:
        print(f"Text is {len(user_message)} characters long. Processing in chunks...")
        chunks = chunk_text(user_message, args.chunk_size, args.chunk_overlap)
        print(f"Split into {len(chunks)} chunks")

        # Process each chunk
        chunk_summaries = []
        for i, chunk in enumerate(chunks):
            print(f"\nProcessing chunk {i+1}/{len(chunks)} ({len(chunk)} characters)...")
            chunk_summary = summarize_chunk(client, chunk, args.system_prompt, args.model, args.max_tokens)
            chunk_summaries.append(chunk_summary)
            print(f"Chunk {i+1} summary: {chunk_summary[:100]}...")

        # Combine the summaries
        combined_text = "\n\n".join([f"Summary {i+1}:\n{summary}" for i, summary in enumerate(chunk_summaries)])
        print("\nCombining all summaries into final result...")
        # Stream the final combined summary
        try:
            stream = client.chat.completions.create(
                model=args.model,
                max_tokens=args.max_tokens * 2,  # Allow more tokens for the combined summary
                stream=True,
                messages=[
                    {
                        "role": "system",
                        # Use the dedicated combining prompt for the final pass
                        "content": COMBINE_CHUNKS_PROMPT
                    },
                    {
                        "role": "user",
                        "content": combined_text
                    }
                ]
            )
        except Exception as e:
            print(f"Error creating final summary: {e}")
            sys.exit(1)
    else:
        # Process normally without chunking
        try:
            stream = client.chat.completions.create(
                model=args.model,
                max_tokens=args.max_tokens,
                stream=True,
                messages=[
                    {
                        "role": "system",
                        "content": args.system_prompt
                    },
                    {
                        "role": "user",
                        "content": user_message
                    }
                ]
            )
        except Exception as e:
            print(f"ChatCompletionStream error: {e}")
            sys.exit(1)
print("\nFinal Summary: ")
content = ""
completion_tokens = 0
for chunk in stream:
if chunk.choices[0].delta.content is not None:
content += chunk.choices[0].delta.content
print(chunk.choices[0].delta.content, end='', flush=True)
completion_tokens += len(chunk.choices[0].delta.content)
print(f"\n\nFinal Output: \n{content}")
elapsed = time.time() - start
print(f"\n\nTokens generated in final Output: {completion_tokens}")
print(f"Output tokens per Second: {completion_tokens/elapsed:.2f}")
print(f"Total Execution Time: {elapsed:.2f} seconds")
# Save to output file if specified
if args.output_file:
try:
with open(args.output_file, 'w') as file:
file.write(content)
print(f"Output saved to {args.output_file}")
except Exception as e:
print(f"Error saving to output file: {e}")
# TIP: the FIRST time you run this code, the model is loaded into memory, and this will
# be slow. On my Mac M2 I got 4 tokens/s.
# But run it a second time within 5 minutes and it'll run 10x faster!
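# Note: Ollama unloads an idle model after a keep-alive window (about five minutes
# by default), which is why a second run inside that window skips the slow reload.
# If you have memory to spare, you can usually keep the model resident longer by
# setting the OLLAMA_KEEP_ALIVE environment variable (e.g. "30m") before starting
# the Ollama server.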
if __name__ == "__main__":
    main()