Scrapes all the ancient Sumerian texts from the ETCSL (Electronic Text Corpus of Sumerian Literature)
package main

import (
    "context"
    "fmt"
    "io"
    "log/slog"
    "net/http"
    "os"
    "os/signal"
    "path"
    "regexp"
    "slices"
    "strings"
    "sync"

    "github.com/PuerkitoBio/goquery"
)
// main initializes the context for graceful shutdown,
// sets up channels for targets and texts, starts the worker pool,
// and writes the fetched texts to disk.
func main() {
    // Create a context that listens for interrupt signals (e.g., Ctrl+C)
    ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt)
    defer cancel() // Ensure cancellation is called when main exits

    // Channel carrying discovered targets (Unicode text pages)
    targets := make(chan Msg[Target])
    // Start the process that discovers all targets from the index
    go listUnicodeTexts(ctx, targets)

    // Channel carrying processed text contents
    texts := make(chan Msg[Text])
    // Start 8 concurrent workers to fetch and extract text from each target
    go fetchTextWorkers(ctx, 8, targets, texts)

    // Write all extracted texts to the filesystem under "the_archive"
    if err := writeTexts(ctx, "the_archive", texts); err != nil {
        panic(err) // Panic on any error during file writing
    }
}
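
// The pipeline, end to end (one lister, eight fetchers, one writer):
//
//    listUnicodeTexts --targets--> fetchTextWorkers(x8) --texts--> writeTexts
//
// Each stage owns the channel it sends on and closes it when finished, so
// the downstream stage can keep receiving until the channel is drained.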
// Target represents a single text page to be fetched.
// It contains the hierarchical path (for file organization)
// and the URL (href) to fetch the content.
type Target struct {
    Path []string // Hierarchical path derived from breadcrumbs (used for file directory)
    Href string   // URL of the specific text page
}

// Msg is a generic container for either a successful result or an error.
// This allows us to send both data and errors over the same channel.
type Msg[R any] struct {
    Result R     // The actual data (e.g., Target or Text)
    Err    error // Any error encountered during processing
}
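
// A minimal sketch of how the Msg pattern is consumed (illustrative only;
// exampleMsgUsage is a hypothetical helper and is never called): the producer
// sends either a Result or an Err on the same channel, and the consumer
// checks Err before touching Result.
func exampleMsgUsage(msgs <-chan Msg[Target]) {
    for msg := range msgs {
        if msg.Err != nil {
            slog.Error("pipeline error", slog.Any("err", msg.Err))
            continue
        }
        slog.Info("got target", slog.String("href", msg.Result.Href))
    }
}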
// listUnicodeTexts discovers all text pages by crawling the website's index.
// It starts from the top-level index and recursively explores deeper indices.
func listUnicodeTexts(ctx context.Context, targets chan<- Msg[Target]) {
    defer close(targets) // Close the targets channel when done (runs last)
    var err error
    // Ensure that any error is sent to the channel before it is closed.
    // The ctx.Done guard prevents blocking forever if the workers have
    // already exited due to cancellation.
    defer func() {
        if err != nil {
            select {
            case targets <- Msg[Target]{Err: err}:
            case <-ctx.Done():
            }
        }
    }()
    // Start by reading the top-level index page
    err = readTopIndex(ctx, targets)
}
// IndexURL is the starting point for crawling the entire archive.
const IndexURL = "https://etcsl.orinst.ox.ac.uk/edition2/etcslbycat.php"
// readTopIndex fetches the top-level index page and extracts links to deeper indices.
func readTopIndex(ctx context.Context, targets chan<- Msg[Target]) (err error) {
    slog.InfoContext(ctx, "fetching index",
        slog.String("url", IndexURL))
    // Fetch the HTML content of the top index
    response, err := Fetch(ctx, IndexURL)
    if err != nil {
        return
    }
    defer response.Close() // Ensure the response body is closed

    // Parse the HTML using goquery
    document, err := goquery.NewDocumentFromReader(response)
    if err != nil {
        return
    }

    // Find all <a> tags within <li> elements (potential deeper indices)
    document.Find("li a").Each(func(i int, s *goquery.Selection) {
        if err != nil {
            return // Stop processing if an error occurred
        }
        // Extract the href attribute and link text
        href, exists := s.Attr("href")
        linkText := strings.TrimSpace(s.Text())
        // Only process valid Unicode links (ignore ASCII or broken ones)
        if !exists || !strings.Contains(linkText, "Unicode") || strings.Contains(href, "charenc=j") {
            return
        }
        // Build the breadcrumb path from the current <a> tag
        path := buildPath(s)
        // Normalize the href to make it a full URL
        url := href
        if strings.HasPrefix(url, "../") {
            url = "https://etcsl.orinst.ox.ac.uk/" + strings.TrimPrefix(url, "../")
        } else if strings.HasPrefix(url, "/") {
            url = "https://etcsl.orinst.ox.ac.uk" + url
        }
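        // For example (hypothetical hrefs), this normalization maps:
        //   "../edition2/foo.php" -> "https://etcsl.orinst.ox.ac.uk/edition2/foo.php"
        //   "/cgi-bin/etcsl.cgi"  -> "https://etcsl.orinst.ox.ac.uk/cgi-bin/etcsl.cgi"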
        // Recursively explore the deeper index page
        if err = readDeeperIndex(ctx, Target{Path: path, Href: url}, targets); err != nil {
            return
        }
        // Check whether the context has been cancelled (e.g., Ctrl+C)
        err = ctx.Err()
    })
    return
}
// readDeeperIndex fetches a deeper index page and extracts links to individual text pages.
func readDeeperIndex(ctx context.Context, parent Target, targets chan<- Msg[Target]) (err error) {
    slog.InfoContext(ctx, "reading deeper index",
        slog.Any("parent", parent))
    // Fetch the HTML content of the deeper index
    response, err := Fetch(ctx, parent.Href)
    if err != nil {
        return
    }
    defer response.Close() // Ensure the response body is closed

    // Parse the HTML using goquery
    document, err := goquery.NewDocumentFromReader(response)
    if err != nil {
        return
    }

    // Find all <a> tags within <li> elements (potential text pages)
    document.Find("li a").Each(func(i int, s *goquery.Selection) {
        if err != nil {
            return // Stop processing if an error occurred
        }
        // Extract the href attribute and link text
        href, exists := s.Attr("href")
        linkText := strings.TrimSpace(s.Text())
        // Only process valid translation links (ignore non-text pages)
        if !exists || !strings.Contains(linkText, "translation") {
            return
        }
        // Build the breadcrumb path from the current <a> tag
        subpath := buildPath(s)
        // Extend the parent path by one element, which is set below to the
        // cleaned-up title line of this entry
        path := make([]string, len(parent.Path)+1, len(parent.Path)+len(subpath)+1)
        copy(path, parent.Path)
        copy(path[len(parent.Path):], subpath)
        path[len(path)-1] = cleanupTitleLine(s.Parent().Text())
        // Normalize the href to make it a full URL
        href = "https://etcsl.orinst.ox.ac.uk/cgi-bin/" + href
        // Send the discovered text page to the targets channel
        select {
        case targets <- Msg[Target]{Result: Target{Path: path, Href: href}}:
        case <-ctx.Done():
            err = ctx.Err()
        }
    })
    return
}
// Regular expressions for cleaning up titles
var (
    patLeadingNumber = regexp.MustCompile(`^([0-9.]+\s)`) // Matches leading numbers (e.g., "1. ")
    patAfterColon    = regexp.MustCompile(":.+$")         // Matches everything after a colon
)

// cleanupTitleLine removes unwanted parts from the title text
func cleanupTitleLine(title string) string {
    title = patLeadingNumber.ReplaceAllString(title, "") // Remove leading numbers
    title = patAfterColon.ReplaceAllString(title, "")    // Remove everything after a colon
    return title
}
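
// For example, a hypothetical catalogue line such as
// "1.8.1.5 Gilgamec and the bull of heaven: translation" becomes
// "Gilgamec and the bull of heaven": the leading number is stripped
// and everything from the colon onward is dropped.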
// buildPath constructs a breadcrumb path by walking up the DOM tree
// from the <a> tag through its parent <li> elements.
func buildPath(s *goquery.Selection) []string {
    var path []string
    // Traverse up through parent <li> elements until <body>
    s.ParentsUntil("body").Each(func(i int, sel *goquery.Selection) {
        if goquery.NodeName(sel) == "li" {
            // Remove nested <ul> to get only this item's own text
            text := sel.Find("ul").Remove().End().Text()
            text = strings.TrimSpace(text)
            if text != "" {
                // Avoid duplicates (e.g., same heading repeated)
                if len(path) == 0 || path[len(path)-1] != text {
                    path = append(path, text)
                }
            }
        }
    })
    // Clean up each path segment
    for i := range path {
        str := &path[i]
        *str = strings.TrimSuffix(*str, ":")
        *str = trimSuffixLower(*str, " (unicode | ascii)")
        *str = strings.TrimSpace(*str)
    }
    // Reverse the path so it reads outermost heading first
    slices.Reverse(path)
    return path
}
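
// On an ETCSL index page this yields the enclosing headings outermost first,
// e.g. (hypothetical values) []string{"Ancient literary catalogues",
// "The catalogue from Nibru"}, since the upward DOM walk is reversed at the end.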
// Fetch creates an HTTP request with the given context and URL,
// then returns the response body for reading.
func Fetch(ctx context.Context, url string) (out io.ReadCloser, err error) {
    req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
    if err != nil {
        return
    }
    resp, err := http.DefaultClient.Do(req)
    if err != nil {
        return
    }
    // Reject non-200 responses so an error page is never scraped as content
    if resp.StatusCode != http.StatusOK {
        resp.Body.Close()
        return nil, fmt.Errorf("fetch %s: unexpected status %s", url, resp.Status)
    }
    return resp.Body, nil
}
// trimSuffixLower cuts s at the last case-insensitive occurrence of suffix;
// the suffix argument itself must already be lowercase.
func trimSuffixLower(s, suffix string) string {
    if idx := strings.LastIndex(strings.ToLower(s), suffix); idx != -1 {
        s = s[:idx]
    }
    return s
}
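
// e.g. trimSuffixLower("The debate (Unicode | ASCII)", " (unicode | ascii)")
// returns "The debate"; an upper- or mixed-case suffix argument would never match.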
// Text represents a fetched text page with its content.
type Text struct {
    Target
    Contents string // The extracted text content in Markdown format
}

// fetchTextWorkers starts a pool of workers to fetch text pages concurrently.
func fetchTextWorkers(ctx context.Context, n int, targets <-chan Msg[Target], dst chan<- Msg[Text]) {
    defer close(dst) // Close the destination channel when done
    var wg sync.WaitGroup
    defer wg.Wait() // Wait for all workers to finish
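    // Defers run LIFO: wg.Wait executes before close(dst), so dst is
    // closed only once every worker has returned.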
    // Start n workers
    for i := 0; i < n; i++ {
        wg.Add(1)
        go fetchTextWorker(ctx, &wg, targets, dst)
    }
}
// fetchTextWorker processes targets from the channel and fetches their content.
func fetchTextWorker(ctx context.Context, wg *sync.WaitGroup, targets <-chan Msg[Target], dst chan<- Msg[Text]) {
    defer wg.Done() // Mark this worker as done when exiting (runs after the error send below)
    var err error
    // Ensure that any error is sent to the channel before this worker is
    // counted as done; otherwise close(dst) could race with this send.
    // The ctx.Done guard prevents blocking if the writer has already quit.
    defer func() {
        if err != nil {
            select {
            case dst <- Msg[Text]{Err: err}:
            case <-ctx.Done():
            }
        }
    }()
    for {
        select {
        case msg, ok := <-targets:
            if !ok {
                return // No more targets to process
            }
            // Propagate discovery errors from the lister
            if msg.Err != nil {
                err = msg.Err
                return
            }
            var downloaded Text
            if downloaded, err = fetchText(ctx, msg.Result); err != nil {
                return
            }
            select {
            case dst <- Msg[Text]{Result: downloaded}:
            case <-ctx.Done():
                err = ctx.Err()
                return
            }
        case <-ctx.Done():
            err = ctx.Err()
            return
        }
    }
}
// fetchText fetches the content of a single text page.
func fetchText(ctx context.Context, target Target) (out Text, err error) {
    response, err := Fetch(ctx, target.Href)
    if err != nil {
        return
    }
    defer response.Close()
    document, err := goquery.NewDocumentFromReader(response)
    if err == nil {
        out.Target = target
        out.Contents = ExtractMarkdown(document)
    }
    return
}
// ExtractMarkdown extracts the main content from an ETCSL translation page
// and returns it as clean Markdown.
func ExtractMarkdown(doc *goquery.Document) string {
    var md strings.Builder
    // Extract the main heading
    doc.Find("h2").Each(func(i int, s *goquery.Selection) {
        text := cleanText(s)
        if text != "" {
            md.WriteString("# " + text + "\n\n")
        }
    })
    // Find the main content table, using a selector specific enough to
    // skip the top navigation table
    found := false
    doc.Find("table").EachWithBreak(func(i int, s *goquery.Selection) bool {
        // A table containing <p> tags is the content table
        if s.Find("p").Length() > 0 {
            s.Find("p").Each(func(j int, p *goquery.Selection) {
                text := cleanText(p)
                if text != "" {
                    // Skip pure rubrics like "1st kirugu"
                    if !isRubric(text) {
                        md.WriteString(text + "\n\n")
                    }
                }
            })
            found = true
            return false // Break out of EachWithBreak
        }
        return true
    })
    // Fallback: if no content table was found, try body > p
    if !found {
        doc.Find("body > p").Each(func(i int, p *goquery.Selection) {
            text := cleanText(p)
            if text != "" {
                md.WriteString(text + "\n\n")
            }
        })
    }
    return md.String()
}
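
// The resulting Markdown for a typical translation page looks roughly like
// this (hypothetical content):
//
//    # The death of Gilgamec: translation
//
//    His beloved wife, his beloved son, ...
//
// i.e. one "# " heading per <h2> and one paragraph per content <p>.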
var reSpace = regexp.MustCompile(`\s+`)

// cleanText removes unwanted elements and returns clean text
func cleanText(s *goquery.Selection) string {
    // Clone to avoid modifying the original document
    clone := s.Clone()
    // Empty out <a> tags that are just line numbers or line links
    clone.Find("a[href]").Each(func(i int, a *goquery.Selection) {
        href, has := a.Attr("href")
        if !has {
            return
        }
        if strings.Contains(href, "lineid") || strings.HasPrefix(href, "etcsl.cgi?text=") {
            a.SetHtml("")
        }
    })
    // Empty out <a> tags that are pure named anchors
    clone.Find("a[name]").Each(func(i int, a *goquery.Selection) {
        a.SetHtml("")
    })
    // Empty out <span class="gap"> elements (not real content)
    clone.Find("span.gap").Each(func(i int, gap *goquery.Selection) {
        gap.SetHtml("")
    })
    // Extract and normalize the remaining text
    text := clone.Text()
    text = strings.TrimSpace(text)
    text = reSpace.ReplaceAllString(text, " ")
    return text
}
// isRubric checks whether text is just a rubric like "1st kirugu"
func isRubric(text string) bool {
    return strings.Contains(text, "kirugu") &&
        (strings.HasPrefix(text, "1st") ||
            strings.HasPrefix(text, "2nd") ||
            strings.HasPrefix(text, "3rd") ||
            strings.Contains(text, "th kirugu"))
}
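
// e.g. isRubric("1st kirugu") and isRubric("5th kirugu") are true, while
// isRubric("A hymn to Inana") is false.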
// writeTexts drains the texts channel, writing each text to disk and
// remembering the first error reported from upstream.
func writeTexts(ctx context.Context, root string, texts <-chan Msg[Text]) (err error) {
    for {
        select {
        case text, ok := <-texts:
            if !ok {
                return
            }
            if text.Err != nil {
                // Keep draining, but remember the first upstream error
                if err == nil {
                    err = text.Err
                }
            } else if err = writeText(ctx, root, text.Result); err != nil {
                return
            }
        case <-ctx.Done():
            err = ctx.Err()
            return
        }
    }
}
// writeText writes a single text to disk.
func writeText(ctx context.Context, root string, text Text) (err error) {
    filename := path.Join(text.Path...) + ".md"
    filePath := path.Join(root, filename)
    parentDir := path.Dir(filePath)
    if err = os.MkdirAll(parentDir, 0755); err != nil {
        return
    }
    slog.InfoContext(ctx, "writing file",
        slog.String("filename", filename),
        slog.Int("size", len(text.Contents)),
        slog.Any("path", text.Path))
    err = os.WriteFile(filePath, []byte(text.Contents), 0644)
    return
}
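
// A Text whose Path is ["Hymns", "A hymn to Nanna"] (hypothetical values)
// ends up at the_archive/Hymns/A hymn to Nanna.md, with missing parent
// directories created along the way.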