Last active
March 9, 2025 00:28
-
-
Save ristomatti/d3322510f2985e3e69f102baa2444ea2 to your computer and use it in GitHub Desktop.
Markdown LLM summarizer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# web2md - Convert one or more web pages to a Markdown document, optionally
# summarizing the content.
#
# Requirements
# - percollate: https://github.com/danburzo/percollate
# - aichat: https://github.com/sigoden/aichat
#
# Copyright (c) 2025 Ristomatti Airo <[email protected]>
#
# This work is licensed under the terms of the MIT license.
# For a copy, see <https://opensource.org/licenses/MIT>.
# shellcheck disable=SC2068
# Strict mode: exit on errors (-e), let functions inherit the ERR trap (-E),
# treat unset variables as errors (-u) and fail a pipeline if any stage fails.
set -Eeuo pipefail

# Script version, shown by --help and --version.
readonly VERSION="0.4.0"
# Model passed to aichat when -m/--model is not given.
readonly DEFAULT_MODEL="openai:o1-mini"
# Instruction prompt handed to aichat together with the converted Markdown.
# The heredoc delimiter is quoted, so the text is taken literally: nothing in
# it is expanded and the fenced snippet needs no backslash escapes.
# `read -d ''` consumes the whole heredoc and reports EOF as a failure, hence
# the `|| true` so that `set -e` does not abort here.
IFS= read -r -d '' PROMPT <<'EOF' || true
Summarize text paragraphs, bullet points and tabular content of this Markdown document.
Rules:
1. RETAIN document structure: heading, bullet points, tables and source links.
2. RETAIN key information.
3. REMOVE common postfixes from headings.
4. REMOVE author information.
5. ENSURE there's only one top level heading by incrementing the rest accordingly.
6. ADD Obsidian TOC plugin snippet before the second heading:
```table-of-contents
minLevel: 2
maxLevel: 2
```
7. DO NOT wrap output in a codeblock.
EOF
# Drop the single trailing newline the heredoc adds.
PROMPT=${PROMPT%$'\n'}
readonly PROMPT
# Route interruption, termination, command failure and normal exit through
# the cleanup function.
trap cleanup SIGINT SIGTERM ERR EXIT

# File name of this script without its directory part; used in help text and
# error messages.
script_name=${BASH_SOURCE[0]##*/}
#######################################
# Print usage information to stdout and exit successfully.
# Globals:   script_name, VERSION, DEFAULT_MODEL (read);
#            BOLD, YELLOW, RESET (read; treated as empty when unset)
# Outputs:   help text on stdout
# Returns:   never returns; exits with status 0
#######################################
print_help() {
  # The default model is taken from DEFAULT_MODEL so the help text cannot
  # drift from the value actually used.
  cat <<EOF
${script_name} ${VERSION}
${BOLD-}USAGE${RESET-}
${YELLOW-}${script_name}${RESET-} [OPTIONS] URL [URL]...
${BOLD-}OPTIONS${RESET-}
-t, --title <title> Document title
-o, --output <path> Output file path
-s, --summary Summarize output
-m, --model AI model for summarization (default: ${DEFAULT_MODEL})
-h, --help Print help
-v, --version Print version
EOF
  exit 0
}
#######################################
# Print the script name and version to stdout and exit successfully.
# The line is plain (no colour codes) and goes to stdout, matching the banner
# line of the help text, so `$(web2md --version)` captures it.
# Globals:   script_name, VERSION (read)
# Outputs:   "<script_name> <VERSION>" on stdout
# Returns:   never returns; exits with status 0
#######################################
print_version() {
  printf '%s %s\n' "$script_name" "$VERSION"
  exit 0
}
cleanup() { | |
trap - SIGINT SIGTERM ERR EXIT | |
# script cleanup here | |
} | |
#######################################
# Define the colour/style variables used in messages.
# Colours are enabled only when stderr is a terminal, NO_COLOR is unset or
# empty, TERM is not "dumb" and tput is available. Otherwise every variable
# is set to the empty string.
# Globals:   RESET, BOLD, DIM, GREEN, YELLOW (written); NO_COLOR, TERM (read)
#######################################
setup_colors() {
  # Always define every variable first: the script runs under `set -u`, so a
  # variable left unset (YELLOW previously was) aborts the first message
  # that uses it.
  RESET="" BOLD="" DIM="" GREEN="" YELLOW=""

  if [[ -t 2 && -z "${NO_COLOR-}" && "${TERM-}" != "dumb" ]] \
    && command -v tput &>/dev/null; then
    # A failing tput (e.g. unknown terminal type) leaves that style empty
    # instead of terminating the script.
    RESET=$(tput sgr0 2>/dev/null) || RESET=""
    BOLD=$(tput bold 2>/dev/null) || BOLD=""
    DIM=$(tput setaf 11 2>/dev/null) || DIM=""
    GREEN=$(tput setaf 2 2>/dev/null) || GREEN=""
    YELLOW=$(tput setaf 3 2>/dev/null) || YELLOW=""
  fi
  return 0
}
#######################################
# Write a message to stderr, followed by a newline.
# Backslash escapes in the message (such as \n) are interpreted.
# Arguments: $1 - message text (optional; defaults to an empty line)
# Outputs:   the message on stderr
#######################################
msg() {
  # printf '%b' interprets escapes like `echo -e`, but never mistakes a
  # message such as "-n" or "-e" for an option.
  printf '%b\n' "${1-}" >&2
}
#######################################
# Print a message through msg and terminate the script.
# Arguments: $1 - message to print (optional; empty when omitted)
#            $2 - exit status (optional; defaults to 1)
# Returns:   never returns; exits with the given status
#######################################
die() {
  # ${1-} keeps a bare `die` from tripping `set -u` with an "unbound
  # variable" error instead of exiting cleanly.
  local -r message=${1-}
  local -r code=${2-1} # default exit status 1
  msg "$message"
  exit "$code"
}
#######################################
# Abort with an error message if a required external tool is missing.
# percollate is always required. aichat is only used for summarization, so
# it is required unless the global `summary` is already set to 0. When
# `summary` is not set yet (the function ran before option parsing), aichat
# is required too.
# Globals:   script_name (read), summary (read, optional)
# Returns:   0 when all required tools are found; otherwise exits via die
#######################################
validate_deps() {
  local -a required=(percollate)
  local tool

  if [[ "${summary-1}" != 0 ]]; then
    required+=(aichat)
  fi

  for tool in "${required[@]}"; do
    if ! command -v "$tool" &>/dev/null; then
      die "${script_name} requires ${tool}. Please install it first."
    fi
  done
  return 0
}
#######################################
# Parse command line options.
# Globals:   DEFAULT_MODEL, script_name (read);
#            title, output_path, summary, model, urls (written)
# Arguments: the script's command line: [OPTIONS] [--] URL [URL]...
# Returns:   0 on success; exits via die on an unknown option, an option
#            missing its value, or when no URL is given. -h/-v do not return.
#######################################
parse_options() {
  title=
  output_path=
  summary=0
  model="$DEFAULT_MODEL"

  while (($# > 0)); do
    case "$1" in
      -h | --help) print_help ;;
      -v | --version) print_version ;;
      -s | --summary) summary=1 ;;
      -t | --title | -o | --output | -m | --model)
        # Without this check a trailing option such as `-t` would make the
        # final `shift` fail and the script die silently under `set -e`.
        if (($# < 2)); then
          die "${script_name}: option $1 requires an argument"
        fi
        case "$1" in
          -t | --title) title=$2 ;;
          -o | --output) output_path=$2 ;;
          -m | --model) model=$2 ;;
        esac
        shift
        ;;
      --)
        # Explicit end of options: everything after it is a URL.
        shift
        break
        ;;
      -?*) die "Unknown option: $1" ;;
      *) break ;;
    esac
    shift
  done

  urls=("$@")

  # Validate arguments
  if ((${#urls[@]} == 0)); then
    die "${script_name}: no arguments provided"
  fi
  return 0
}
#######################################
# Main routine: convert the given pages to Markdown with percollate,
# optionally summarize the result, then write it to the output file or show
# it on the terminal.
# Globals:   title, output_path, summary, urls (set by parse_options);
#            RESET, DIM, GREEN, YELLOW (set by setup_colors)
# Arguments: the script's command line
# Outputs:   progress messages on stderr; the document on stdout (or through
#            glow/bat when installed) unless an output path was given
#######################################
web2md() {
  local output url
  local -a percollate_args=(md --no-toc)

  setup_colors
  # Options are parsed before the dependency check so that --help and
  # --version work on a machine where the tools are not installed yet.
  parse_options "$@"
  validate_deps

  if [[ -n "$title" ]]; then
    msg "${DIM}Title:${RESET} ${title}"
    percollate_args+=(--title "$title")
  fi
  if [[ -n "$output_path" ]]; then
    msg "${DIM}Output path:${RESET} ${output_path}"
  fi

  msg "${DIM}Pages:${RESET}"
  for url in "${urls[@]}"; do
    msg "- ${url}"
  done

  msg "\n${YELLOW-}Converting to Markdown...${RESET}\n"

  # URLs are passed quoted: unquoted, a `?` or `*` in a URL is a glob pattern
  # and whitespace splits the argument.
  percollate_args+=(--output - "${urls[@]}")
  output="$(percollate "${percollate_args[@]}")"

  if [[ $summary -eq 1 ]]; then
    output="$(summarize "$output")"
  fi

  if [[ -n "$output_path" ]]; then
    printf '%s\n' "$output" > "$output_path"
    msg "\n${GREEN}Done!${RESET}"
    return 0
  fi

  # Use glow or bat if available
  if command -v glow &>/dev/null; then
    glow --pager <<< "$output"
  elif command -v bat &>/dev/null; then
    bat --style=plain --language markdown <<< "$output"
  else
    printf '%s\n' "$output"
  fi
}
#######################################
# Summarize a Markdown document with aichat.
# Globals:   model, PROMPT (read); YELLOW, RESET (read; empty when unset)
# Arguments: $1 - Markdown text to summarize
# Outputs:   aichat's response on stdout; a progress message on stderr
# Returns:   exit status of aichat
#######################################
summarize() {
  # "${1-}" (not "$1-"): the misplaced dash used to append a literal "-" to
  # the document that was sent to the model.
  local -r input="${1-}"
  msg "\n${YELLOW-}Summarizing...${RESET-}"
  aichat --no-stream --model "$model" "$PROMPT" <<< "$input"
}
# Run the main routine with the script's command line arguments.
web2md "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment