@bouroo
Last active June 15, 2025 13:46
Find files that contain all keywords
package main

import (
	"bufio"
	"context"
	"encoding/csv"
	"fmt"
	"os"
	"path/filepath"
	"runtime"
	"strings"
	"sync"
	"sync/atomic"
)

const (
	// Default buffer size for file scanning.
	scannerBufferSize = 64 * 1024 // 64 KB
	// Maximum token size for file scanning (e.g., for very long lines).
	scannerMaxTokenSize = 10 * 1024 * 1024 // 10 MB
	// Output CSV file name.
	outputCSVFileName = "results.csv"
	// Prefix for workflow paths.
	workflowPathPrefix = "workflows/"
	// Suffix appended to operation keywords.
	operationKeywordSuffix = "("
	// Operation prefixes.
	insertOperationPrefix = "insert_ngl_"
	updateOperationPrefix = "update_ngl_"
)

// Job represents a task to check a file against a given keyword set.
type Job struct {
	filePath  string
	tableName string
	operation string
	keywords  map[string]bool // keywords to search for in the file
}

// Result represents the outcome of processing a file.
type Result struct {
	success      bool // true if all keywords were found
	tableName    string
	operation    string
	keywordsList string // comma-separated string of keywords
	workflowName string
	errMsg       string // error message if processing failed
}

// checkFileContainsKeywords reports whether the file contains all of the
// specified keywords. It returns true if every keyword is found, false
// otherwise, and an error if a file operation fails.
func checkFileContainsKeywords(filePath string, keywords map[string]bool) (bool, error) {
	file, err := os.Open(filePath)
	if err != nil {
		return false, fmt.Errorf("failed to open file %s: %w", filePath, err)
	}
	defer file.Close()

	// Clone the keywords map to track the remaining ones, so each call operates
	// on its own set without modifying the original map, which may be shared
	// across jobs.
	remainingKeywords := make(map[string]bool, len(keywords))
	for k, v := range keywords {
		remainingKeywords[k] = v
	}

	scanner := bufio.NewScanner(file)
	buf := make([]byte, scannerBufferSize)
	scanner.Buffer(buf, scannerMaxTokenSize)

	for scanner.Scan() {
		line := scanner.Text()
		// Collect matched keys first and delete them afterwards, so the map is
		// not modified while it is being iterated.
		keysToDelete := make([]string, 0, len(remainingKeywords))
		for k := range remainingKeywords {
			if strings.Contains(line, k) {
				keysToDelete = append(keysToDelete, k)
			}
		}
		for _, k := range keysToDelete {
			delete(remainingKeywords, k)
		}
		// Early exit once every keyword has been found.
		if len(remainingKeywords) == 0 {
			return true, nil
		}
	}
	if err := scanner.Err(); err != nil {
		return false, fmt.Errorf("error scanning file %s: %w", filePath, err)
	}
	return len(remainingKeywords) == 0, nil
}

// worker processes jobs from jobCh and sends results to resultCh.
// It is designed to be run as a goroutine.
func worker(jobCh <-chan Job, resultCh chan<- Result, wg *sync.WaitGroup) {
	defer wg.Done()
	for job := range jobCh {
		contains, err := checkFileContainsKeywords(job.filePath, job.keywords)

		// Derive the workflow name if the path starts with the defined prefix.
		// Normalize to forward slashes first so this also works on Windows.
		var workflowName string
		normalizedPath := filepath.ToSlash(job.filePath)
		if strings.HasPrefix(normalizedPath, workflowPathPrefix) {
			parts := strings.Split(normalizedPath, "/")
			if len(parts) > 1 {
				workflowName = parts[1]
			}
		}

		// Build a comma-separated keywords string for the result.
		var sb strings.Builder
		for k := range job.keywords {
			if sb.Len() > 0 {
				sb.WriteString(", ")
			}
			// Strip the operation keyword suffix for display.
			sb.WriteString(strings.TrimSuffix(k, operationKeywordSuffix))
		}

		res := Result{
			tableName:    job.tableName,
			operation:    job.operation,
			keywordsList: sb.String(),
			workflowName: workflowName,
		}
		if err != nil {
			res.errMsg = err.Error()
		} else {
			res.success = contains
		}
		resultCh <- res
	}
}

// addJob creates a Job for the given operation and enqueues it on jobCh.
// It copies the base keywords and appends an operation-specific keyword,
// as illustrated in the example below.
func addJob(jobCh chan<- Job, filePath, tableName, operation string, baseKeywords map[string]bool) {
	// Copy the base keyword map so each job has its own independent set.
	keywords := make(map[string]bool, len(baseKeywords))
	for k, v := range baseKeywords {
		keywords[k] = v
	}
	// Append the operation-specific keyword (with an opening-parenthesis suffix).
	keywords[operation+operationKeywordSuffix] = true
	jobCh <- Job{
		filePath:  filePath,
		tableName: tableName,
		operation: operation,
		keywords:  keywords,
	}
}
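
// Example (illustrative values, not from the original gist): with table name
// "orders" and base keywords {"customer_id", "status"}, the call
//
//	addJob(jobCh, "workflows/foo/query.sql", "orders", "insert_ngl_orders", base)
//
// enqueues a job whose keyword set is
//
//	{"customer_id", "status", "insert_ngl_orders("}
//
// so a file matches only if it contains every base keyword plus the call-site
// pattern "insert_ngl_orders(".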

func main() {
	reader := bufio.NewReader(os.Stdin)

	fmt.Print("Enter the directory path: ")
	dirPath, err := reader.ReadString('\n')
	if err != nil {
		fmt.Printf("Error reading directory path: %v\n", err)
		return
	}
	dirPath = strings.TrimSpace(dirPath)

	fmt.Print("Enter table name: ")
	tableName, err := reader.ReadString('\n')
	if err != nil {
		fmt.Printf("Error reading table name: %v\n", err)
		return
	}
	tableName = strings.TrimSpace(tableName)

	// Define operations based on the table name.
	insertOperation := insertOperationPrefix + tableName
	updateOperation := updateOperationPrefix + tableName

	fmt.Print("Enter keywords separated by commas: ")
	keywordsInput, err := reader.ReadString('\n')
	if err != nil {
		fmt.Printf("Error reading keywords: %v\n", err)
		return
	}
	keywordsInput = strings.TrimSpace(keywordsInput)
	keywordsSlice := strings.Split(keywordsInput, ",")

	// Build the base keyword map from user input.
	baseKeywordMap := make(map[string]bool, len(keywordsSlice))
	for _, kw := range keywordsSlice {
		trimmed := strings.TrimSpace(kw)
		if trimmed != "" {
			baseKeywordMap[trimmed] = true
		}
	}

	// Open (or create) the CSV output file in append mode.
	outputFile, err := os.OpenFile(outputCSVFileName, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644)
	if err != nil {
		fmt.Printf("Error opening/creating CSV file %s: %v\n", outputCSVFileName, err)
		return
	}
	defer outputFile.Close()

	csvWriter := csv.NewWriter(outputFile)
	defer csvWriter.Flush() // Flush buffered CSV writes before the file is closed.

	// Write the header only if the file is empty (i.e., newly created).
	// Stat cannot fail with "not exist" here, since the file is already open.
	if stat, err := outputFile.Stat(); err != nil {
		fmt.Printf("Error stating CSV file %s: %v\n", outputCSVFileName, err)
		return
	} else if stat.Size() == 0 {
		if err := csvWriter.Write([]string{"Table name", "Operation", "Keywords", "Workflow name"}); err != nil {
			fmt.Printf("Error writing header to CSV: %v\n", err)
			return
		}
	}

	// Set up buffered channels for jobs and results, so the walker and the
	// workers can run ahead of each other without blocking immediately.
	jobCh := make(chan Job, 100)
	resultCh := make(chan Result, 100)

	// Start worker goroutines; the worker count defaults to the CPU count.
	var workersWg sync.WaitGroup
	maxWorkers := runtime.NumCPU()
	for i := 0; i < maxWorkers; i++ {
		workersWg.Add(1)
		go worker(jobCh, resultCh, &workersWg)
	}

	// Atomic counters for summary statistics. Note that matchingResults counts
	// successful (file, operation) checks, so a single file can contribute twice.
	var totalFiles int64
	var matchingResults int64

	// Context for the directory walk (reserved for cancellation).
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel() // Ensure the context is cancelled on exit.

	// Walk the directory in a separate goroutine and enqueue jobs.
	go func() {
		defer close(jobCh) // Close the job channel once all files are enqueued.
		err := filepath.WalkDir(dirPath, func(path string, d os.DirEntry, err error) error {
			if err != nil {
				// Report the error for this path but continue walking.
				fmt.Printf("Error accessing path %s: %v\n", path, err)
				return nil
			}
			// Skip directories.
			if d.IsDir() {
				return nil
			}
			atomic.AddInt64(&totalFiles, 1)
			// Enqueue one job per operation (insert and update) for each file.
			addJob(jobCh, path, tableName, insertOperation, baseKeywordMap)
			addJob(jobCh, path, tableName, updateOperation, baseKeywordMap)
			// Stop walking if the context has been cancelled.
			select {
			case <-ctx.Done():
				return ctx.Err()
			default:
				return nil
			}
		})
		if err != nil {
			fmt.Printf("Error walking the directory: %v\n", err)
		}
	}()

	// Process results and write matches to the CSV file.
	var printWg sync.WaitGroup
	printWg.Add(1)
	go func() {
		defer printWg.Done()
		fmt.Println("\n=== Keyword Search Results ===")
		for r := range resultCh {
			if r.errMsg != "" {
				fmt.Printf("❌ Error processing file for '%s' (Operation: %s): %s\n", r.tableName, r.operation, r.errMsg)
				continue
			}
			if r.success {
				atomic.AddInt64(&matchingResults, 1)
				// The operation never carries the "(" suffix (addJob appends it
				// only to the keyword), so it can be written as-is.
				if err := csvWriter.Write([]string{r.tableName, r.operation, r.keywordsList, r.workflowName}); err != nil {
					fmt.Printf("Error writing result to CSV: %v\n", err)
				}
			}
		}
	}()

	// Wait for the workers to drain the job channel, then close the result
	// channel so the result-processing goroutine can exit.
	workersWg.Wait()
	close(resultCh)
	printWg.Wait()

	// Final summary output.
	fmt.Printf("\n=== Summary ===\n")
	fmt.Printf("Total files checked: %d\n", totalFiles)
	fmt.Printf("Matching file/operation pairs: %d\n", matchingResults)
	fmt.Println("==============================")
	fmt.Printf("Results have been saved to %s\n", outputCSVFileName)
}
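
For anyone adapting the gist, here is a minimal test sketch for checkFileContainsKeywords, assuming the program above is saved as main.go and the test as main_test.go in the same package. The file contents and keywords (including the table name "orders") are made-up illustrations, not part of the original gist; run it with "go test" in the same directory.

package main

import (
	"os"
	"path/filepath"
	"testing"
)

// TestCheckFileContainsKeywords exercises the all-keywords-present and
// keyword-missing cases against a temporary file.
func TestCheckFileContainsKeywords(t *testing.T) {
	// Write a temporary file containing both keywords on separate lines.
	path := filepath.Join(t.TempDir(), "sample.txt")
	content := "insert_ngl_orders(\nsome other line\ncustomer_id\n"
	if err := os.WriteFile(path, []byte(content), 0644); err != nil {
		t.Fatal(err)
	}

	// Every keyword is present, so the function should report true.
	found, err := checkFileContainsKeywords(path, map[string]bool{
		"insert_ngl_orders(": true,
		"customer_id":        true,
	})
	if err != nil || !found {
		t.Fatalf("expected all keywords found, got found=%v err=%v", found, err)
	}

	// A keyword that never appears should make the function report false.
	found, err = checkFileContainsKeywords(path, map[string]bool{"missing_kw": true})
	if err != nil || found {
		t.Fatalf("expected keyword not found, got found=%v err=%v", found, err)
	}
}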