Scrapes all the ancient Sumerian texts
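A single-file Go program that crawls the ETCSL catalogue at https://etcsl.orinst.ox.ac.uk/ and writes each translation out as a Markdown file under the_archive/. A rough way to run it, assuming Go 1.21+ (for log/slog and slices) and goquery as the only external dependency: go mod init in a fresh directory (any module name), go get github.com/PuerkitoBio/goquery, then go run . — interrupt with Ctrl+C for a graceful shutdown.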
package main

import (
	"context"
	"fmt"
	"io"
	"log/slog"
	"net/http"
	"os"
	"os/signal"
	"path"
	"regexp"
	"slices"
	"strings"
	"sync"

	"github.com/PuerkitoBio/goquery"
)
// main initializes the context for graceful shutdown,
// sets up channels for targets and texts, starts the worker pool,
// and processes the fetched texts by writing them to disk.
func main() {
	// Create a context that listens for interrupt signals (e.g., Ctrl+C)
	ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt)
	defer cancel() // Ensure cancellation is called when main exits

	// Channel to send discovered targets (Unicode text pages)
	targets := make(chan Msg[Target])
	// Start the process to discover all targets from the index
	go listUnicodeTexts(ctx, targets)

	// Channel to receive processed text contents
	texts := make(chan Msg[Text])
	// Start 8 concurrent workers to fetch and extract text from each target
	go fetchTextWorkers(ctx, 8, targets, texts)

	// Write all extracted texts to the filesystem under "the_archive"
	if err := writeTexts(ctx, "the_archive", texts); err != nil {
		panic(err) // Panic on any error during file writing
	}
}
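// The overall pipeline, as wired up above:
//
//	listUnicodeTexts --> targets --> 8 x fetchTextWorker --> texts --> writeTexts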
// Target represents a single text page to be fetched.
// It contains the hierarchical path (for file organization)
// and the URL (href) to fetch the content.
type Target struct {
	Path []string // Hierarchical path derived from breadcrumbs (used for file directory)
	Href string   // URL of the specific text page
}

// Msg is a generic container for either a successful result or an error.
// This allows us to send both data and errors over the same channel.
type Msg[R any] struct {
	Result R     // The actual data (e.g., Target or Text)
	Err    error // Any error encountered during processing
}
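// As a hypothetical illustration, a consumer drains a channel of Msg
// values and splits the two cases (ch and process are placeholders):
//
//	for msg := range ch {
//		if msg.Err != nil {
//			// handle the error
//			continue
//		}
//		process(msg.Result)
//	}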
// listUnicodeTexts discovers all text pages by crawling the website's index.
// It starts from the top-level index and recursively explores deeper indices.
func listUnicodeTexts(ctx context.Context, targets chan<- Msg[Target]) {
	defer close(targets) // Close the targets channel when done
	var err error
	// Ensure that any error is sent to the channel before closing
	defer func() {
		if err != nil {
			targets <- Msg[Target]{Err: err}
		}
	}()
	// Start by reading the top-level index page
	err = readTopIndex(ctx, targets)
}
// IndexURL is the starting point for crawling the entire archive.
const IndexURL = "https://etcsl.orinst.ox.ac.uk/edition2/etcslbycat.php"

// readTopIndex fetches the top-level index page and extracts links to deeper indices.
func readTopIndex(ctx context.Context, targets chan<- Msg[Target]) (err error) {
	slog.InfoContext(ctx, "fetching index",
		slog.String("url", IndexURL))
	// Fetch the HTML content of the top index
	response, err := Fetch(ctx, IndexURL)
	if err != nil {
		return
	}
	defer response.Close() // Ensure the response body is closed

	// Parse the HTML using goquery
	document, err := goquery.NewDocumentFromReader(response)
	if err != nil {
		return
	}

	// Find all <a> tags within <li> elements (potential deeper indices)
	document.Find("li a").Each(func(i int, s *goquery.Selection) {
		if err != nil {
			return // Stop processing if an error occurred
		}
		// Extract the href attribute and link text
		href, exists := s.Attr("href")
		linkText := strings.TrimSpace(s.Text())
		// Only process valid Unicode links (ignore ASCII or broken ones)
		if !exists || !strings.Contains(linkText, "Unicode") || strings.Contains(href, "charenc=j") {
			return
		}
		// Build the breadcrumb path from the current <a> tag
		path := buildPath(s)
		// Normalize the href to make it a full URL
		url := href
		if strings.HasPrefix(url, "../") {
			url = "https://etcsl.orinst.ox.ac.uk/" + strings.TrimPrefix(url, "../")
		} else if strings.HasPrefix(url, "/") {
			url = "https://etcsl.orinst.ox.ac.uk" + url
		}
		// Recursively explore the deeper index page
		if err = readDeeperIndex(ctx, Target{Path: path, Href: url}, targets); err != nil {
			return
		}
		// Check if the context is cancelled (e.g., Ctrl+C)
		err = ctx.Err()
	})
	return
}
// readDeeperIndex fetches a deeper index page and extracts links to individual text pages.
func readDeeperIndex(ctx context.Context, parent Target, targets chan<- Msg[Target]) (err error) {
	slog.InfoContext(ctx, "reading deeper index",
		slog.Any("parent", parent))
	// Fetch the HTML content of the deeper index
	response, err := Fetch(ctx, parent.Href)
	if err != nil {
		return
	}
	defer response.Close() // Ensure the response body is closed

	// Parse the HTML using goquery
	document, err := goquery.NewDocumentFromReader(response)
	if err != nil {
		return
	}

	// Find all <a> tags within <li> elements (potential text pages)
	document.Find("li a").Each(func(i int, s *goquery.Selection) {
		if err != nil {
			return // Stop processing if an error occurred
		}
		// Extract the href attribute and link text
		href, exists := s.Attr("href")
		linkText := strings.TrimSpace(s.Text())
		// Only process valid translation links (ignore non-text pages)
		if !exists || !strings.Contains(linkText, "translation") {
			return
		}
		// Build the breadcrumb path from the current <a> tag
		subpath := buildPath(s)
		// Combine the parent path with the current subpath, then replace
		// the final segment (the raw title line) with a cleaned-up title
		path := make([]string, len(parent.Path)+len(subpath))
		copy(path, parent.Path)
		copy(path[len(parent.Path):], subpath)
		path[len(path)-1] = cleanupTitleLine(s.Parent().Text())
		// Normalize the href to make it a full URL
		href = "https://etcsl.orinst.ox.ac.uk/cgi-bin/" + href
		// Send the discovered text page to the targets channel
		select {
		case targets <- Msg[Target]{Result: Target{Path: path, Href: href}}:
		case <-ctx.Done():
			err = ctx.Err()
		}
	})
	return
}
// Regular expressions for cleaning up titles
var (
	patLeadingNumber = regexp.MustCompile(`^([0-9.]+\s)`) // Matches leading numbers (e.g., "1. ")
	patAfterColon    = regexp.MustCompile(":.+$")         // Matches everything after a colon
)

// cleanupTitleLine removes unwanted parts from the title text
func cleanupTitleLine(title string) string {
	title = patLeadingNumber.ReplaceAllString(title, "") // Remove leading numbers
	title = patAfterColon.ReplaceAllString(title, "")    // Remove everything after a colon
	return title
}
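// For example, given an index line of the typical ETCSL shape:
//
//	cleanupTitleLine("1.8.2.1 Dumuzid's dream: translation")
//	// => "Dumuzid's dream"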
// buildPath constructs a breadcrumb path by walking up the DOM tree
// from the <a> tag through its parent <li> elements.
func buildPath(s *goquery.Selection) []string {
	var path []string
	// Traverse up through parent <li> elements until <body>
	s.ParentsUntil("body").Each(func(i int, sel *goquery.Selection) {
		if goquery.NodeName(sel) == "li" {
			// Work on a clone so that removing nested <ul> elements
			// (to get only this item's visible text) does not mutate
			// the live document and break later iterations
			text := sel.Clone().Find("ul").Remove().End().Text()
			text = strings.TrimSpace(text)
			if text != "" {
				// Avoid duplicates (e.g., same heading repeated)
				if len(path) == 0 || path[len(path)-1] != text {
					path = append(path, text)
				}
			}
		}
	})
	// Clean up each path segment
	for i := range path {
		str := &path[i]
		*str = strings.TrimSuffix(*str, ":")
		*str = trimSuffixLower(*str, " (unicode | ascii)")
		*str = strings.TrimSpace(*str)
	}
	// Reverse the path to get the correct order
	slices.Reverse(path)
	return path
}
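// As an illustration (hypothetical markup): for an <a> nested as
// body > ul > li "Ancient literary catalogues" > ul > li "Catalogues:" > a,
// the segments are collected innermost-first as
// ["Catalogues:", "Ancient literary catalogues"], cleaned of the trailing
// colon, and reversed to ["Ancient literary catalogues", "Catalogues"],
// so the outermost heading comes first.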
// Fetch creates an HTTP request with the given context and URL,
// then returns the response body for reading.
func Fetch(ctx context.Context, url string) (out io.ReadCloser, err error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return
	}
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return
	}
	// Treat non-200 responses as errors rather than parsing an error page
	if resp.StatusCode != http.StatusOK {
		resp.Body.Close()
		return nil, fmt.Errorf("GET %s: unexpected status %s", url, resp.Status)
	}
	return resp.Body, nil
}
// trimSuffixLower trims a suffix case-insensitively.
// The suffix itself must already be lowercase (and ASCII, so that byte
// offsets in the lowered string line up with the original).
func trimSuffixLower(s, suffix string) string {
	if strings.HasSuffix(strings.ToLower(s), suffix) {
		s = s[:len(s)-len(suffix)]
	}
	return s
}
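// For example:
//
//	trimSuffixLower("Literature (Unicode | ASCII)", " (unicode | ascii)")
//	// => "Literature"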
// Text represents a fetched text page with its content.
type Text struct {
	Target
	Contents string // The extracted text content in Markdown format
}

// fetchTextWorkers starts a pool of workers to fetch text pages concurrently.
func fetchTextWorkers(ctx context.Context, n int, targets <-chan Msg[Target], dst chan<- Msg[Text]) {
	defer close(dst) // Close the destination channel when done
	var wg sync.WaitGroup
	defer wg.Wait() // Wait for all workers to finish
	// Start n workers
	for i := 0; i < n; i++ {
		wg.Add(1)
		go fetchTextWorker(ctx, &wg, targets, dst)
	}
}
// fetchTextWorker processes targets from the channel and fetches their content.
func fetchTextWorker(ctx context.Context, wg *sync.WaitGroup, targets <-chan Msg[Target], dst chan<- Msg[Text]) {
	var err error
	// Mark this worker as done when exiting. Declared first so that the
	// error-sending defer below runs before wg.Done; otherwise the pool
	// could close dst while the send is still pending.
	defer wg.Done()
	// Ensure that any error is sent to the channel before exiting
	defer func() {
		if err != nil {
			dst <- Msg[Text]{Err: err}
		}
	}()
	for {
		select {
		case msg, ok := <-targets:
			if !ok {
				return // No more targets to process
			}
			// Forward discovery errors instead of fetching a zero-value target
			if msg.Err != nil {
				err = msg.Err
				return
			}
			var downloaded Text
			if downloaded, err = fetchText(ctx, msg.Result); err != nil {
				return
			}
			select {
			case dst <- Msg[Text]{Result: downloaded}:
			case <-ctx.Done():
				err = ctx.Err()
				return
			}
		case <-ctx.Done():
			err = ctx.Err()
			return
		}
	}
}
// fetchText fetches the content of a single text page.
func fetchText(ctx context.Context, target Target) (out Text, err error) {
	response, err := Fetch(ctx, target.Href)
	if err != nil {
		return
	}
	defer response.Close()
	document, err := goquery.NewDocumentFromReader(response)
	if err == nil {
		out.Target = target
		out.Contents = ExtractMarkdown(document)
	}
	return
}
// ExtractMarkdown extracts the main content from an ETCSL translation page
// and returns it as clean Markdown.
func ExtractMarkdown(doc *goquery.Document) string {
	var md strings.Builder

	// Extract the main heading
	doc.Find("h2").Each(func(i int, s *goquery.Selection) {
		text := cleanText(s)
		if text != "" {
			md.WriteString("# " + text + "\n\n")
		}
	})

	// Find the main content table.
	// Use a more specific selector to avoid the top nav table.
	found := false
	doc.Find("table").EachWithBreak(func(i int, s *goquery.Selection) bool {
		// Check if this table has <p> tags (content table)
		if s.Find("p").Length() > 0 {
			s.Find("p").Each(func(j int, p *goquery.Selection) {
				text := cleanText(p)
				if text != "" {
					// Optional: skip if it's just a rubric like "1st kirugu"
					if !isRubric(text) {
						md.WriteString(text + "\n\n")
					}
				}
			})
			found = true
			return false // Break out of .Each
		}
		return true
	})

	// Fallback: if no content table found, try body > p
	if !found {
		doc.Find("body > p").Each(func(i int, p *goquery.Selection) {
			text := cleanText(p)
			if text != "" {
				md.WriteString(text + "\n\n")
			}
		})
	}
	return md.String()
}
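// The resulting Markdown has roughly this shape (illustrative, not taken
// from an actual page):
//
//	# Dumuzid's dream
//
//	First paragraph of the translation ...
//
//	Second paragraph of the translation ...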
var reSpace = regexp.MustCompile(`\s+`)

// cleanText removes unwanted elements and returns clean text
func cleanText(s *goquery.Selection) string {
	// Clone to avoid modifying the original
	clone := s.Clone()

	// Remove <a> tags that are just line numbers or anchors
	clone.Find("a[href]").Each(func(i int, a *goquery.Selection) {
		href, has := a.Attr("href")
		if !has {
			return
		}
		// Remove if it's just a line ID or link to line
		if strings.Contains(href, "lineid") || strings.HasPrefix(href, "etcsl.cgi?text=") {
			a.SetHtml("")
		}
	})

	// Remove <a> tags with only name (anchors)
	clone.Find("a[name]").Each(func(i int, a *goquery.Selection) {
		a.SetHtml("")
	})

	// Remove <span class="gap"> (not real content)
	clone.Find("span.gap").Each(func(i int, gap *goquery.Selection) {
		gap.SetHtml("")
	})

	// Now get the text
	text := clone.Text()
	text = strings.TrimSpace(text)
	text = reSpace.ReplaceAllString(text, " ")
	return text
}
// isRubric checks if text is just a rubric like "1st kirugu"
func isRubric(text string) bool {
	return strings.Contains(text, "kirugu") &&
		(strings.HasPrefix(text, "1st") ||
			strings.HasPrefix(text, "2nd") ||
			strings.HasPrefix(text, "3rd") ||
			strings.Contains(text, "th kirugu"))
}
// writeTexts processes the fetched texts and writes them to disk.
func writeTexts(ctx context.Context, root string, texts <-chan Msg[Text]) (err error) {
	for {
		select {
		case text, ok := <-texts:
			if !ok {
				return
			}
			if text.Err != nil {
				if err == nil {
					err = text.Err
				}
			} else if err = writeText(ctx, root, text.Result); err != nil {
				return
			}
		case <-ctx.Done():
			err = ctx.Err()
			return
		}
	}
}

// writeText writes a single text to disk.
func writeText(ctx context.Context, root string, text Text) (err error) {
	filename := path.Join(text.Path...) + ".md"
	filePath := path.Join(root, filename)
	parentDir := path.Dir(filePath)
	if err = os.MkdirAll(parentDir, 0755); err != nil {
		return
	}
	slog.InfoContext(ctx, "writing file",
		slog.String("filename", filename),
		slog.Int("size", len(text.Contents)),
		slog.Any("path", text.Path))
	err = os.WriteFile(filePath, []byte(text.Contents), 0644)
	return
}