Last active
June 15, 2025 13:46
-
-
Save bouroo/ae9ad5ca25a1eb8f998e57543b163155 to your computer and use it in GitHub Desktop.
Find files that contain all keywords
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import (
	"bufio"
	"context"
	"encoding/csv"
	"fmt"
	"os"
	"path/filepath"
	"runtime"
	"sort"
	"strings"
	"sync"
	"sync/atomic"
)
const (
	// scannerBufferSize is the initial buffer handed to bufio.Scanner. 64 KB.
	scannerBufferSize = 64 * 1024 // 64 KB
	// scannerMaxTokenSize caps how large a single line may grow before the
	// scanner errors out (protects against pathological inputs). 10 MB.
	scannerMaxTokenSize = 10 * 1024 * 1024 // 10 MB
	// outputCSVFileName is the file results are appended to.
	outputCSVFileName = "results.csv"
	// workflowPathPrefix marks paths from which a workflow name is derived.
	workflowPathPrefix = "workflows/"
	// operationKeywordSuffix is appended to operation keywords so the search
	// matches call sites like "insert_ngl_foo(" rather than bare names.
	operationKeywordSuffix = "("
	// insertOperationPrefix / updateOperationPrefix are combined with the
	// user-supplied table name to form the two operation keywords.
	insertOperationPrefix = "insert_ngl_"
	updateOperationPrefix = "update_ngl_"
)
// Job represents a task to check one file against one keyword set.
type Job struct {
	filePath  string          // Path of the file to scan.
	tableName string          // Table name the operation keyword was derived from.
	operation string          // Operation name (e.g. "insert_ngl_<table>").
	keywords  map[string]bool // Keywords that must ALL appear in the file.
}
// Result represents the outcome of processing a single Job.
type Result struct {
	success      bool   // true if all keywords were found in the file.
	tableName    string // Copied from the Job for reporting.
	operation    string // Copied from the Job for reporting.
	keywordsList string // Comma-separated, display-ready keyword list.
	workflowName string // Derived from the file path when it starts with workflowPathPrefix.
	errMsg       string // Non-empty if opening/scanning the file failed.
}
// checkFileContainsKeywords checks if the file contains all the specified keywords. | |
// It returns true if all keywords are found, false otherwise, and an error if file | |
// operations fail. | |
func checkFileContainsKeywords(filePath string, keywords map[string]bool) (bool, error) { | |
file, err := os.Open(filePath) | |
if err != nil { | |
return false, fmt.Errorf("failed to open file %s: %w", filePath, err) | |
} | |
defer file.Close() | |
// Clone the keywords map to track the remaining ones. This ensures that | |
// each call to checkFileContainsKeywords operates on its own set of keywords | |
// without modifying the original map, which might be shared across jobs. | |
remainingKeywords := make(map[string]bool, len(keywords)) | |
for k, v := range keywords { | |
remainingKeywords[k] = v | |
} | |
scanner := bufio.NewScanner(file) | |
buf := make([]byte, scannerBufferSize) | |
scanner.Buffer(buf, scannerMaxTokenSize) | |
for scanner.Scan() { | |
line := scanner.Text() | |
// Iterate over a copy of keys to safely delete from the map during iteration. | |
// Or, more efficiently, iterate and collect keys to delete. | |
keysToDelete := make([]string, 0, len(remainingKeywords)) | |
for k := range remainingKeywords { | |
if strings.Contains(line, k) { | |
keysToDelete = append(keysToDelete, k) | |
} | |
} | |
for _, k := range keysToDelete { | |
delete(remainingKeywords, k) | |
} | |
// Early exit if no keywords remain to be found. | |
if len(remainingKeywords) == 0 { | |
return true, nil | |
} | |
} | |
if err := scanner.Err(); err != nil { | |
return false, fmt.Errorf("error scanning file %s: %w", filePath, err) | |
} | |
return len(remainingKeywords) == 0, nil | |
} | |
// worker processes jobs from jobCh and sends results to resultCh. | |
// It's designed to be run as a goroutine. | |
func worker(jobCh <-chan Job, resultCh chan<- Result, wg *sync.WaitGroup) { | |
defer wg.Done() | |
for job := range jobCh { | |
contains, err := checkFileContainsKeywords(job.filePath, job.keywords) | |
var workflowName string | |
// Derive workflow name if filePath starts with the defined prefix. | |
if strings.HasPrefix(job.filePath, workflowPathPrefix) { | |
parts := strings.Split(job.filePath, "/") | |
if len(parts) > 1 { | |
workflowName = parts[1] | |
} | |
} | |
// Build comma-separated keywords string for the result. | |
var sb strings.Builder | |
for k := range job.keywords { | |
if sb.Len() > 0 { | |
sb.WriteString(", ") | |
} | |
// Remove the operation keyword suffix for display. | |
sb.WriteString(strings.TrimSuffix(k, operationKeywordSuffix)) | |
} | |
res := Result{ | |
tableName: job.tableName, | |
operation: job.operation, | |
keywordsList: sb.String(), | |
workflowName: workflowName, | |
} | |
if err != nil { | |
res.errMsg = err.Error() | |
} else { | |
res.success = contains | |
} | |
resultCh <- res | |
} | |
} | |
// addJob creates a Job with the given operation and enqueues it into jobCh. | |
// It copies the base keywords and appends an operation-specific keyword. | |
func addJob(jobCh chan<- Job, filePath, tableName, operation string, baseKeywords map[string]bool) { | |
// Copy the base keyword map to ensure each job has its own independent set. | |
keywords := make(map[string]bool, len(baseKeywords)) | |
for k, v := range baseKeywords { | |
keywords[k] = v | |
} | |
// Append the operation-specific keyword (with an opening parenthesis suffix). | |
keywords[operation+operationKeywordSuffix] = true | |
jobCh <- Job{ | |
filePath: filePath, | |
tableName: tableName, | |
operation: operation, | |
keywords: keywords, | |
} | |
} | |
func main() { | |
reader := bufio.NewReader(os.Stdin) | |
fmt.Print("Enter the directory path: ") | |
dirPath, err := reader.ReadString('\n') | |
if err != nil { | |
fmt.Printf("Error reading directory path: %v\n", err) | |
return | |
} | |
dirPath = strings.TrimSpace(dirPath) | |
fmt.Print("Enter table name: ") | |
tableName, err := reader.ReadString('\n') | |
if err != nil { | |
fmt.Printf("Error reading table name: %v\n", err) | |
return | |
} | |
tableName = strings.TrimSpace(tableName) | |
// Define operations based on the table name. | |
insertOperation := insertOperationPrefix + tableName | |
updateOperation := updateOperationPrefix + tableName | |
fmt.Print("Enter keywords separated by commas: ") | |
keywordsInput, err := reader.ReadString('\n') | |
if err != nil { | |
fmt.Printf("Error reading keywords: %v\n", err) | |
return | |
} | |
keywordsInput = strings.TrimSpace(keywordsInput) | |
keywordsSlice := strings.Split(keywordsInput, ",") | |
// Build the base keyword map from user input. | |
baseKeywordMap := make(map[string]bool, len(keywordsSlice)) | |
for _, kw := range keywordsSlice { | |
trimmed := strings.TrimSpace(kw) | |
if trimmed != "" { | |
baseKeywordMap[trimmed] = true | |
} | |
} | |
// Open (or create) the CSV output file. | |
outputFile, err := os.OpenFile(outputCSVFileName, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644) | |
if err != nil { | |
fmt.Printf("Error opening/creating CSV file %s: %v\n", outputCSVFileName, err) | |
return | |
} | |
defer outputFile.Close() | |
csvWriter := csv.NewWriter(outputFile) | |
defer csvWriter.Flush() // Ensure all buffered CSV writes are flushed before exiting. | |
// Write header if the file is newly created or empty. | |
if stat, err := outputFile.Stat(); err == nil && stat.Size() == 0 { | |
if err := csvWriter.Write([]string{"Table name", "Operation", "Keywords", "Workflow name"}); err != nil { | |
fmt.Printf("Error writing header to CSV: %v\n", err) | |
return | |
} | |
} else if err != nil && !os.IsNotExist(err) { // Handle error if stat fails, but ignore "file not exist" as OpenFile handles creation. | |
fmt.Printf("Error stating CSV file %s: %v\n", outputCSVFileName, err) | |
return | |
} | |
// Setup channels for jobs and results. Buffered channels improve concurrency. | |
jobCh := make(chan Job, 100) | |
resultCh := make(chan Result, 100) | |
// Start worker goroutines. Number of workers defaults to CPU count. | |
var workersWg sync.WaitGroup | |
maxWorkers := runtime.NumCPU() | |
for i := 0; i < maxWorkers; i++ { | |
workersWg.Add(1) | |
go worker(jobCh, resultCh, &workersWg) | |
} | |
// Atomic counters for summary statistics. | |
var totalFiles int64 | |
var filesContainingKeywords int64 | |
// Context for directory walk (can be used for cancellation). | |
ctx, cancel := context.WithCancel(context.Background()) | |
defer cancel() // Ensure context is cancelled on exit. | |
// Goroutine to walk the directory and enqueue jobs. | |
go func() { | |
defer close(jobCh) // Close job channel once all files are enqueued or walk is done. | |
err := filepath.WalkDir(dirPath, func(path string, d os.DirEntry, err error) error { | |
if err != nil { | |
// Report error for this path but continue walking the directory. | |
fmt.Printf("Error accessing path %s: %v\n", path, err) | |
return nil | |
} | |
// Skip directories. | |
if d.IsDir() { | |
return nil | |
} | |
atomic.AddInt64(&totalFiles, 1) | |
// Enqueue jobs for both insert and update operations for each file. | |
addJob(jobCh, path, tableName, insertOperation, baseKeywordMap) | |
addJob(jobCh, path, tableName, updateOperation, baseKeywordMap) | |
// Check for cancellation through context. | |
select { | |
case <-ctx.Done(): | |
return ctx.Err() // Stop walking if context is cancelled. | |
default: | |
return nil | |
} | |
}) | |
if err != nil { | |
fmt.Printf("Error walking the directory: %v\n", err) | |
} | |
}() | |
// Goroutine to process results and write to CSV. | |
var printWg sync.WaitGroup | |
printWg.Add(1) | |
go func() { | |
defer printWg.Done() | |
fmt.Println("\n=== Keyword Search Results ===") | |
for r := range resultCh { | |
if r.errMsg != "" { | |
fmt.Printf("❌ Error processing file for '%s' (Operation: %s): %s\n", r.tableName, r.operation, r.errMsg) | |
continue | |
} | |
if r.success { | |
atomic.AddInt64(&filesContainingKeywords, 1) | |
// Trim the operation keyword suffix for display in CSV. | |
if err := csvWriter.Write([]string{r.tableName, strings.TrimSuffix(r.operation, operationKeywordSuffix), r.keywordsList, r.workflowName}); err != nil { | |
fmt.Printf("Error writing result to CSV: %v\n", err) | |
} | |
} | |
} | |
}() | |
// Wait for all worker goroutines to finish their jobs. | |
workersWg.Wait() | |
// Close the result channel once all workers are done, signaling the result processing goroutine to exit. | |
close(resultCh) | |
// Wait for the result processing goroutine to finish. | |
printWg.Wait() | |
// Final summary output. | |
fmt.Printf("\n=== Summary ===\n") | |
fmt.Printf("Total files checked: %d\n", totalFiles) | |
fmt.Printf("Files containing all keywords: %d\n", filesContainingKeywords) | |
fmt.Println("==============================") | |
fmt.Printf("Results have been saved to %s\n", outputCSVFileName) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment