Skip to content

Instantly share code, notes, and snippets.

@Thraetaona
Created January 6, 2025 05:59
Show Gist options
  • Save Thraetaona/85da4b8c2e609c2871bb6a75458cd7a7 to your computer and use it in GitHub Desktop.
Save Thraetaona/85da4b8c2e609c2871bb6a75458cd7a7 to your computer and use it in GitHub Desktop.
A Python command-line search engine that recursively searches through directories for files containing lines that match user-defined search queries.
#!/usr/bin/env python3
# ----------------------------------------------------------------------
# SPDX-License-Identifier: CC0-1.0
# Authored by Fereydoun Memarzanjany
#
# To the maximum extent possible under law, the Author waives all
# copyright and related or neighboring rights to this code.
#
# You should have received a copy of the CC0 legalcode along with this
# work; if not, see <http://creativecommons.org/publicdomain/zero/1.0/>
# ----------------------------------------------------------------------
import os # O/S filesystem abstraction
import sys # Get command line arguments
import re # Regular expressions (technically not a searching library)
# Define ANSI X3.64-1974 3-bit color constants
# Ideally, you'd check if the VT terminal is a TTY and supports ANSI
# escapes prior to using these, but this is just an academic assignment.
COLOR_RESET = "\033[0m"
COLOR_BLUE = "\033[94m"
COLOR_GREEN = "\033[92m"
COLOR_YELLOW = "\033[93m"
# This is used to recursively search a directory (and its subfolders)
# for files containing matches to a user-given search query.
#
# Search queries can be exemplified as follows: foo +bar +(bar baz) bip
# Preceding a query or a "group" (i.e., perentheses) indicates that it
# it "required" to appear in a line of text, in any one of the files.
# For instance, foo and bip are "optional" queries, which may or may
# not appear in the line, whereas bar is a "required" term AND _EITHER_
# of bar or baz are also "required" to appear in the line.
#
# NOTE: This does not differentiate between plus (+) signs, meaning
# "required" terms, and plus signs literally appearing in tokens.
class SearchEngine:
def __init__(self, directory):
self.directory = directory
# The Rules had asked for the search engine to index and pre-
# tokenize the files into memory. In reality, it does not
# automatically make the program faster, as a lot of factors
# would depend on I/O-bounded conditions as well as CPU cores'
# cache sizes and locality; so, it is pretty counter-intuitive
# to micro-optimize like this. Also, we'd have to constantly
# look after the directory for updates (in case the user adds
# more files/folders) and update the index, as necessary.
# I am not doing any of that, as they weren't explicitly stated.
self.index = {}
self.build_index() # Pre-index the directory prior to user input
# Builds the in-memory index of files with line numbers and tokens
def build_index(self):
# os.walk() generates the file names in a directory tree;
# it is NOT a searching library outside Python's core utils.
for dir_path, _dir_names, file_names in os.walk(self.directory):
for file_name in file_names:
# NOTE: The file might be UTF-16 on old Windows.
# TODO: Handle exceptions?
path = os.path.join(dir_path, file_name)
with open(path, 'r', encoding='utf-8') as f:
lines = []
# Iterate over each line in the file
for line_num, line in enumerate(f, 1):
tokens = self.tokenize(line)
lines.append(
(line_num, tokens, line.strip())
)
# This is effectively a hashtable.
self.index[os.path.abspath(path)] = lines
# Naively tokenizes text by splitting on word boundaries (lowercase)
# https://stackoverflow.com/questions/43647186
def tokenize(self, text):
return re.findall(r"\b\w+\b", text.lower())
# Parses the user query into required and optional terms/groups
# "grouped" terms, e.g., +(foo bar), are treated as OR conditions
# Whitespaces separate queries (OR); plus signs indicate "required."
def parse_query(self, query):
# The "required" list is an array of set()'s, and the reasoning
# is that each required "group" only needs to have one of its
# terms satisfies; in a sense, "optional" is a required element.
required = []
optional = set()
tokens = query.split()
tokens_iter = iter(tokens)
# TODO: Maybe use regex instead of if-else branching?
for token in tokens_iter:
if token.startswith('+('):
# Handle required group with OR condition
if token.endswith(')'):
group_content = token[2:-1]
else:
group_parts = [token[2:]]
while True:
try:
next_token = next(tokens_iter)
if next_token.endswith(')'):
group_parts.append(next_token[:-1])
break
group_parts.append(next_token)
except StopIteration:
break
group_content = ' '.join(group_parts)
group_terms = set(group_content.split())
required.append(group_terms)
# ----------------------------------------------------------
elif token.startswith('+'):
required.append({token[1:]})
# ----------------------------------------------------------
elif token.startswith('('):
# Handle optional group
if token.endswith(')'):
group_content = token[1:-1]
else:
group_parts = [token[1:]]
while True:
try:
next_token = next(tokens_iter)
if next_token.endswith(')'):
group_parts.append(next_token[:-1])
break
group_parts.append(next_token)
except StopIteration:
break
group_content = ' '.join(group_parts)
optional.update(group_content.split())
# ----------------------------------------------------------
else:
optional.add(token.lower())
return required, optional
# Determines if a line matches the query criteria
# It returns a boolean, indicating whether the given token set
# satisfies all of the "required" criteria, and it also returns
# the number of hits (i.e., optional terms) in said line.
def matches(self, token_set, required, optional):
# For each required "group," at least one term must match
for group in required:
if not any(term in token_set for term in group):
return False, 0
# Calculate the rank (hits) of optional terms, for sorting
# https://www.w3schools.com/python/ref_set_intersection.asp
hits = len(optional.intersection(token_set))
return True, hits
# Processes the in-memory index and yields matching lines & details
def search_files(self, required, optional):
for path, lines in self.index.items():
for line_number, tokens, content in lines:
token_set = set(tokens)
is_match, hits = \
self.matches(token_set, required, optional)
# NOTE: yield (as opposed to return) could be used to
# display results as they are found, real-time.
# https://www.geeksforgeeks.org/generators-in-python/
if is_match:
yield (path, line_number, content)
# REPL (Read-Eval-Print Loop) for interactive search
def interactive_search(self):
while True:
try:
query = input("> ")
if not query:
break # Exit when the user enters an empty query
# Parse the user's query for "required" and "optional"
# items; required items have a "+" prefix.
# Grouped quries, e.g., +(foo bar), get OR'd together.
required, optional = self.parse_query(query)
# Iterate over the generator directly, yielding results
# Results follow "path, line number, and line content"
for path, line_number, content in \
self.search_files(required, optional):
print(
f"{COLOR_BLUE}{path}{COLOR_RESET} "
f"{COLOR_GREEN}{line_number}{COLOR_RESET} "
f"\"{COLOR_YELLOW}{content}{COLOR_RESET}\""
)
except KeyboardInterrupt:
# This occurs if the user presses Ctrl+C (SIGINT)
break
def main():
# NOTE: This is really a "hack." We are checking whether or not
# the user has supplied 3 ARGUMENTS, whereas our program is actually
# supposed to operate on OPTIONS (i.e., --dir) instead; we are
# "pretending" to have a --dir option by hardcoding the argv at
# index 1 (i.e., the argument after the program name) to be --dir.
# I have built full-fledged command-line applications in C before,
# but the parser itself took hundreds of lines; I won't do it here.
if len(sys.argv) != 3 or sys.argv[1] != '--dir':
print(f"Usage: {sys.argv[0]} --dir <directory>")
sys.exit(1)
directory = sys.argv[2] # Index 2 because index 1 is "--dir"
engine = SearchEngine(directory)
engine.interactive_search()
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment