Scrapes all the ancient Sumerian texts
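A single-file Go program that crawls the ETCSL catalogue at https://etcsl.orinst.ox.ac.uk/ and writes each translation out as a Markdown file under the_archive/. A rough way to run it, assuming Go 1.21+ (for log/slog and slices) and goquery as the only external dependency: go mod init in a fresh directory (any module name), go get github.com/PuerkitoBio/goquery, then go run . — interrupt with Ctrl+C for a graceful shutdown.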
package main

import (
	"context"
	"fmt"
	"io"
	"log/slog"
	"net/http"
	"os"
	"os/signal"
	"path"
	"regexp"
	"slices"
	"strings"
	"sync"

	"github.com/PuerkitoBio/goquery"
)
// main initializes the context for graceful shutdown,
// sets up channels for targets and texts, starts the worker pool,
// and processes the fetched texts by writing them to disk.
func main() {
	// Create a context that listens for interrupt signals (e.g., Ctrl+C)
	ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt)
	defer cancel() // Ensure cancellation is called when main exits

	// Channel to send discovered targets (Unicode text pages)
	targets := make(chan Msg[Target])
	// Start the process to discover all targets from the index
	go listUnicodeTexts(ctx, targets)

	// Channel to receive processed text contents
	texts := make(chan Msg[Text])
	// Start 8 concurrent workers to fetch and extract text from each target
	go fetchTextWorkers(ctx, 8, targets, texts)

	// Write all extracted texts to the filesystem under "the_archive"
	if err := writeTexts(ctx, "the_archive", texts); err != nil {
		panic(err) // Panic on any error during file writing
	}
}
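// The overall pipeline, as wired up above:
//
//	listUnicodeTexts --> targets --> 8 x fetchTextWorker --> texts --> writeTexts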
// Target represents a single text page to be fetched.
// It contains the hierarchical path (for file organization)
// and the URL (href) to fetch the content.
type Target struct {
	Path []string // Hierarchical path derived from breadcrumbs (used for file directory)
	Href string   // URL of the specific text page
}

// Msg is a generic container for either a successful result or an error.
// This allows us to send both data and errors over the same channel.
type Msg[R any] struct {
	Result R     // The actual data (e.g., Target or Text)
	Err    error // Any error encountered during processing
}
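// As a hypothetical illustration, a consumer drains a channel of Msg
// values and splits the two cases (ch and process are placeholders):
//
//	for msg := range ch {
//		if msg.Err != nil {
//			// handle the error
//			continue
//		}
//		process(msg.Result)
//	}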
// listUnicodeTexts discovers all text pages by crawling the website's index.
// It starts from the top-level index and recursively explores deeper indices.
func listUnicodeTexts(ctx context.Context, targets chan<- Msg[Target]) {
	defer close(targets) // Close the targets channel when done
	var err error
	// Ensure that any error is sent to the channel before closing
	defer func() {
		if err != nil {
			targets <- Msg[Target]{Err: err}
		}
	}()
	// Start by reading the top-level index page
	err = readTopIndex(ctx, targets)
}
// IndexURL is the starting point for crawling the entire archive.
const IndexURL = "https://etcsl.orinst.ox.ac.uk/edition2/etcslbycat.php"

// readTopIndex fetches the top-level index page and extracts links to deeper indices.
func readTopIndex(ctx context.Context, targets chan<- Msg[Target]) (err error) {
	slog.InfoContext(ctx, "fetching index",
		slog.String("url", IndexURL))
	// Fetch the HTML content of the top index
	response, err := Fetch(ctx, IndexURL)
	if err != nil {
		return
	}
	defer response.Close() // Ensure the response body is closed

	// Parse the HTML using goquery
	document, err := goquery.NewDocumentFromReader(response)
	if err != nil {
		return
	}

	// Find all <a> tags within <li> elements (potential deeper indices)
	document.Find("li a").Each(func(i int, s *goquery.Selection) {
		if err != nil {
			return // Stop processing if an error occurred
		}
		// Extract the href attribute and link text
		href, exists := s.Attr("href")
		linkText := strings.TrimSpace(s.Text())
		// Only process valid Unicode links (ignore ASCII or broken ones)
		if !exists || !strings.Contains(linkText, "Unicode") || strings.Contains(href, "charenc=j") {
			return
		}
		// Build the breadcrumb path from the current <a> tag
		path := buildPath(s)
		// Normalize the href to make it a full URL
		url := href
		if strings.HasPrefix(url, "../") {
			url = "https://etcsl.orinst.ox.ac.uk/" + strings.TrimPrefix(url, "../")
		} else if strings.HasPrefix(url, "/") {
			url = "https://etcsl.orinst.ox.ac.uk" + url
		}
		// Recursively explore the deeper index page
		if err = readDeeperIndex(ctx, Target{Path: path, Href: url}, targets); err != nil {
			return
		}
		// Check if the context is cancelled (e.g., Ctrl+C)
		err = ctx.Err()
	})
	return
}
// readDeeperIndex fetches a deeper index page and extracts links to individual text pages.
func readDeeperIndex(ctx context.Context, parent Target, targets chan<- Msg[Target]) (err error) {
	slog.InfoContext(ctx, "reading deeper index",
		slog.Any("parent", parent))
	// Fetch the HTML content of the deeper index
	response, err := Fetch(ctx, parent.Href)
	if err != nil {
		return
	}
	defer response.Close() // Ensure the response body is closed

	// Parse the HTML using goquery
	document, err := goquery.NewDocumentFromReader(response)
	if err != nil {
		return
	}

	// Find all <a> tags within <li> elements (potential text pages)
	document.Find("li a").Each(func(i int, s *goquery.Selection) {
		if err != nil {
			return // Stop processing if an error occurred
		}
		// Extract the href attribute and link text
		href, exists := s.Attr("href")
		linkText := strings.TrimSpace(s.Text())
		// Only process valid translation links (ignore non-text pages)
		if !exists || !strings.Contains(linkText, "translation") {
			return
		}
		// Build the breadcrumb path from the current <a> tag
		subpath := buildPath(s)
		// Combine the parent path with the current subpath, then replace
		// the final segment (the raw title line) with a cleaned-up title
		path := make([]string, len(parent.Path)+len(subpath))
		copy(path, parent.Path)
		copy(path[len(parent.Path):], subpath)
		path[len(path)-1] = cleanupTitleLine(s.Parent().Text())
		// Normalize the href to make it a full URL
		href = "https://etcsl.orinst.ox.ac.uk/cgi-bin/" + href
		// Send the discovered text page to the targets channel
		select {
		case targets <- Msg[Target]{Result: Target{Path: path, Href: href}}:
		case <-ctx.Done():
			err = ctx.Err()
		}
	})
	return
}
// Regular expressions for cleaning up titles
var (
	patLeadingNumber = regexp.MustCompile(`^([0-9.]+\s)`) // Matches leading numbers (e.g., "1. ")
	patAfterColon    = regexp.MustCompile(":.+$")         // Matches everything after a colon
)

// cleanupTitleLine removes unwanted parts from the title text
func cleanupTitleLine(title string) string {
	title = patLeadingNumber.ReplaceAllString(title, "") // Remove leading numbers
	title = patAfterColon.ReplaceAllString(title, "")    // Remove everything after a colon
	return title
}
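// For example, given an index line of the typical ETCSL shape:
//
//	cleanupTitleLine("1.8.2.1 Dumuzid's dream: translation")
//	// => "Dumuzid's dream"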
// buildPath constructs a breadcrumb path by walking up the DOM tree
// from the <a> tag through its parent <li> elements.
func buildPath(s *goquery.Selection) []string {
	var path []string
	// Traverse up through parent <li> elements until <body>
	s.ParentsUntil("body").Each(func(i int, sel *goquery.Selection) {
		if goquery.NodeName(sel) == "li" {
			// Work on a clone so that removing nested <ul> elements
			// (to get only this item's visible text) does not mutate
			// the live document and break later iterations
			text := sel.Clone().Find("ul").Remove().End().Text()
			text = strings.TrimSpace(text)
			if text != "" {
				// Avoid duplicates (e.g., same heading repeated)
				if len(path) == 0 || path[len(path)-1] != text {
					path = append(path, text)
				}
			}
		}
	})
	// Clean up each path segment
	for i := range path {
		str := &path[i]
		*str = strings.TrimSuffix(*str, ":")
		*str = trimSuffixLower(*str, " (unicode | ascii)")
		*str = strings.TrimSpace(*str)
	}
	// Reverse the path to get the correct order
	slices.Reverse(path)
	return path
}
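// As an illustration (hypothetical markup): for an <a> nested as
// body > ul > li "Ancient literary catalogues" > ul > li "Catalogues:" > a,
// the segments are collected innermost-first as
// ["Catalogues:", "Ancient literary catalogues"], cleaned of the trailing
// colon, and reversed to ["Ancient literary catalogues", "Catalogues"],
// so the outermost heading comes first.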
// Fetch creates an HTTP request with the given context and URL,
// then returns the response body for reading.
func Fetch(ctx context.Context, url string) (out io.ReadCloser, err error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return
	}
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return
	}
	// Treat non-200 responses as errors rather than parsing an error page
	if resp.StatusCode != http.StatusOK {
		resp.Body.Close()
		return nil, fmt.Errorf("GET %s: unexpected status %s", url, resp.Status)
	}
	return resp.Body, nil
}
// trimSuffixLower trims a suffix case-insensitively.
// The suffix itself must already be lowercase (and ASCII, so that byte
// offsets in the lowered string line up with the original).
func trimSuffixLower(s, suffix string) string {
	if strings.HasSuffix(strings.ToLower(s), suffix) {
		s = s[:len(s)-len(suffix)]
	}
	return s
}
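// For example:
//
//	trimSuffixLower("Literature (Unicode | ASCII)", " (unicode | ascii)")
//	// => "Literature"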
// Text represents a fetched text page with its content.
type Text struct {
	Target
	Contents string // The extracted text content in Markdown format
}

// fetchTextWorkers starts a pool of workers to fetch text pages concurrently.
func fetchTextWorkers(ctx context.Context, n int, targets <-chan Msg[Target], dst chan<- Msg[Text]) {
	defer close(dst) // Close the destination channel when done
	var wg sync.WaitGroup
	defer wg.Wait() // Wait for all workers to finish
	// Start n workers
	for i := 0; i < n; i++ {
		wg.Add(1)
		go fetchTextWorker(ctx, &wg, targets, dst)
	}
}
// fetchTextWorker processes targets from the channel and fetches their content.
func fetchTextWorker(ctx context.Context, wg *sync.WaitGroup, targets <-chan Msg[Target], dst chan<- Msg[Text]) {
	var err error
	// Mark this worker as done when exiting. Declared first so that the
	// error-sending defer below runs before wg.Done; otherwise the pool
	// could close dst while the send is still pending.
	defer wg.Done()
	// Ensure that any error is sent to the channel before exiting
	defer func() {
		if err != nil {
			dst <- Msg[Text]{Err: err}
		}
	}()
	for {
		select {
		case msg, ok := <-targets:
			if !ok {
				return // No more targets to process
			}
			// Forward discovery errors instead of fetching a zero-value target
			if msg.Err != nil {
				err = msg.Err
				return
			}
			var downloaded Text
			if downloaded, err = fetchText(ctx, msg.Result); err != nil {
				return
			}
			select {
			case dst <- Msg[Text]{Result: downloaded}:
			case <-ctx.Done():
				err = ctx.Err()
				return
			}
		case <-ctx.Done():
			err = ctx.Err()
			return
		}
	}
}
// fetchText fetches the content of a single text page.
func fetchText(ctx context.Context, target Target) (out Text, err error) {
	response, err := Fetch(ctx, target.Href)
	if err != nil {
		return
	}
	defer response.Close()
	document, err := goquery.NewDocumentFromReader(response)
	if err == nil {
		out.Target = target
		out.Contents = ExtractMarkdown(document)
	}
	return
}
// ExtractMarkdown extracts the main content from an ETCSL translation page
// and returns it as clean Markdown.
func ExtractMarkdown(doc *goquery.Document) string {
	var md strings.Builder

	// Extract the main heading
	doc.Find("h2").Each(func(i int, s *goquery.Selection) {
		text := cleanText(s)
		if text != "" {
			md.WriteString("# " + text + "\n\n")
		}
	})

	// Find the main content table.
	// Use a more specific selector to avoid the top nav table.
	found := false
	doc.Find("table").EachWithBreak(func(i int, s *goquery.Selection) bool {
		// Check if this table has <p> tags (content table)
		if s.Find("p").Length() > 0 {
			s.Find("p").Each(func(j int, p *goquery.Selection) {
				text := cleanText(p)
				if text != "" {
					// Optional: skip if it's just a rubric like "1st kirugu"
					if !isRubric(text) {
						md.WriteString(text + "\n\n")
					}
				}
			})
			found = true
			return false // Break out of .Each
		}
		return true
	})

	// Fallback: if no content table found, try body > p
	if !found {
		doc.Find("body > p").Each(func(i int, p *goquery.Selection) {
			text := cleanText(p)
			if text != "" {
				md.WriteString(text + "\n\n")
			}
		})
	}
	return md.String()
}
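// The resulting Markdown has roughly this shape (illustrative, not taken
// from an actual page):
//
//	# Dumuzid's dream
//
//	First paragraph of the translation ...
//
//	Second paragraph of the translation ...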
var reSpace = regexp.MustCompile(`\s+`)

// cleanText removes unwanted elements and returns clean text
func cleanText(s *goquery.Selection) string {
	// Clone to avoid modifying the original
	clone := s.Clone()

	// Remove <a> tags that are just line numbers or anchors
	clone.Find("a[href]").Each(func(i int, a *goquery.Selection) {
		href, has := a.Attr("href")
		if !has {
			return
		}
		// Remove if it's just a line ID or link to line
		if strings.Contains(href, "lineid") || strings.HasPrefix(href, "etcsl.cgi?text=") {
			a.SetHtml("")
		}
	})

	// Remove <a> tags with only name (anchors)
	clone.Find("a[name]").Each(func(i int, a *goquery.Selection) {
		a.SetHtml("")
	})

	// Remove <span class="gap"> (not real content)
	clone.Find("span.gap").Each(func(i int, gap *goquery.Selection) {
		gap.SetHtml("")
	})

	// Now get the text
	text := clone.Text()
	text = strings.TrimSpace(text)
	text = reSpace.ReplaceAllString(text, " ")
	return text
}
// isRubric checks if text is just a rubric like "1st kirugu"
func isRubric(text string) bool {
	return strings.Contains(text, "kirugu") &&
		(strings.HasPrefix(text, "1st") ||
			strings.HasPrefix(text, "2nd") ||
			strings.HasPrefix(text, "3rd") ||
			strings.Contains(text, "th kirugu"))
}
// writeTexts processes the fetched texts and writes them to disk.
func writeTexts(ctx context.Context, root string, texts <-chan Msg[Text]) (err error) {
	for {
		select {
		case text, ok := <-texts:
			if !ok {
				return
			}
			if text.Err != nil {
				if err == nil {
					err = text.Err
				}
			} else if err = writeText(ctx, root, text.Result); err != nil {
				return
			}
		case <-ctx.Done():
			err = ctx.Err()
			return
		}
	}
}

// writeText writes a single text to disk.
func writeText(ctx context.Context, root string, text Text) (err error) {
	filename := path.Join(text.Path...) + ".md"
	filePath := path.Join(root, filename)
	parentDir := path.Dir(filePath)
	if err = os.MkdirAll(parentDir, 0755); err != nil {
		return
	}
	slog.InfoContext(ctx, "writing file",
		slog.String("filename", filename),
		slog.Int("size", len(text.Contents)),
		slog.Any("path", text.Path))
	err = os.WriteFile(filePath, []byte(text.Contents), 0644)
	return
}