Created
January 6, 2025 05:59
-
-
Save Thraetaona/85da4b8c2e609c2871bb6a75458cd7a7 to your computer and use it in GitHub Desktop.
A Python command-line search engine that recursively searches through directories for files containing lines that match user-defined search queries.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# ---------------------------------------------------------------------- | |
# SPDX-License-Identifier: CC0-1.0 | |
# Authored by Fereydoun Memarzanjany | |
# | |
# To the maximum extent possible under law, the Author waives all | |
# copyright and related or neighboring rights to this code. | |
# | |
# You should have received a copy of the CC0 legalcode along with this | |
# work; if not, see <http://creativecommons.org/publicdomain/zero/1.0/> | |
# ---------------------------------------------------------------------- | |
import os # O/S filesystem abstraction | |
import sys # Get command line arguments | |
import re # Regular expressions (technically not a searching library) | |
# Define ANSI X3.64-1974 3-bit color constants | |
# Ideally, you'd check if the VT terminal is a TTY and supports ANSI | |
# escapes prior to using these, but this is just an academic assignment. | |
COLOR_RESET = "\033[0m" | |
COLOR_BLUE = "\033[94m" | |
COLOR_GREEN = "\033[92m" | |
COLOR_YELLOW = "\033[93m" | |
# This is used to recursively search a directory (and its subfolders) | |
# for files containing matches to a user-given search query. | |
# | |
# Search queries can be exemplified as follows: foo +bar +(bar baz) bip | |
# Preceding a query or a "group" (i.e., perentheses) indicates that it | |
# it "required" to appear in a line of text, in any one of the files. | |
# For instance, foo and bip are "optional" queries, which may or may | |
# not appear in the line, whereas bar is a "required" term AND _EITHER_ | |
# of bar or baz are also "required" to appear in the line. | |
# | |
# NOTE: This does not differentiate between plus (+) signs, meaning | |
# "required" terms, and plus signs literally appearing in tokens. | |
class SearchEngine: | |
def __init__(self, directory): | |
self.directory = directory | |
# The Rules had asked for the search engine to index and pre- | |
# tokenize the files into memory. In reality, it does not | |
# automatically make the program faster, as a lot of factors | |
# would depend on I/O-bounded conditions as well as CPU cores' | |
# cache sizes and locality; so, it is pretty counter-intuitive | |
# to micro-optimize like this. Also, we'd have to constantly | |
# look after the directory for updates (in case the user adds | |
# more files/folders) and update the index, as necessary. | |
# I am not doing any of that, as they weren't explicitly stated. | |
self.index = {} | |
self.build_index() # Pre-index the directory prior to user input | |
# Builds the in-memory index of files with line numbers and tokens | |
def build_index(self): | |
# os.walk() generates the file names in a directory tree; | |
# it is NOT a searching library outside Python's core utils. | |
for dir_path, _dir_names, file_names in os.walk(self.directory): | |
for file_name in file_names: | |
# NOTE: The file might be UTF-16 on old Windows. | |
# TODO: Handle exceptions? | |
path = os.path.join(dir_path, file_name) | |
with open(path, 'r', encoding='utf-8') as f: | |
lines = [] | |
# Iterate over each line in the file | |
for line_num, line in enumerate(f, 1): | |
tokens = self.tokenize(line) | |
lines.append( | |
(line_num, tokens, line.strip()) | |
) | |
# This is effectively a hashtable. | |
self.index[os.path.abspath(path)] = lines | |
# Naively tokenizes text by splitting on word boundaries (lowercase) | |
# https://stackoverflow.com/questions/43647186 | |
def tokenize(self, text): | |
return re.findall(r"\b\w+\b", text.lower()) | |
# Parses the user query into required and optional terms/groups | |
# "grouped" terms, e.g., +(foo bar), are treated as OR conditions | |
# Whitespaces separate queries (OR); plus signs indicate "required." | |
def parse_query(self, query): | |
# The "required" list is an array of set()'s, and the reasoning | |
# is that each required "group" only needs to have one of its | |
# terms satisfies; in a sense, "optional" is a required element. | |
required = [] | |
optional = set() | |
tokens = query.split() | |
tokens_iter = iter(tokens) | |
# TODO: Maybe use regex instead of if-else branching? | |
for token in tokens_iter: | |
if token.startswith('+('): | |
# Handle required group with OR condition | |
if token.endswith(')'): | |
group_content = token[2:-1] | |
else: | |
group_parts = [token[2:]] | |
while True: | |
try: | |
next_token = next(tokens_iter) | |
if next_token.endswith(')'): | |
group_parts.append(next_token[:-1]) | |
break | |
group_parts.append(next_token) | |
except StopIteration: | |
break | |
group_content = ' '.join(group_parts) | |
group_terms = set(group_content.split()) | |
required.append(group_terms) | |
# ---------------------------------------------------------- | |
elif token.startswith('+'): | |
required.append({token[1:]}) | |
# ---------------------------------------------------------- | |
elif token.startswith('('): | |
# Handle optional group | |
if token.endswith(')'): | |
group_content = token[1:-1] | |
else: | |
group_parts = [token[1:]] | |
while True: | |
try: | |
next_token = next(tokens_iter) | |
if next_token.endswith(')'): | |
group_parts.append(next_token[:-1]) | |
break | |
group_parts.append(next_token) | |
except StopIteration: | |
break | |
group_content = ' '.join(group_parts) | |
optional.update(group_content.split()) | |
# ---------------------------------------------------------- | |
else: | |
optional.add(token.lower()) | |
return required, optional | |
# Determines if a line matches the query criteria | |
# It returns a boolean, indicating whether the given token set | |
# satisfies all of the "required" criteria, and it also returns | |
# the number of hits (i.e., optional terms) in said line. | |
def matches(self, token_set, required, optional): | |
# For each required "group," at least one term must match | |
for group in required: | |
if not any(term in token_set for term in group): | |
return False, 0 | |
# Calculate the rank (hits) of optional terms, for sorting | |
# https://www.w3schools.com/python/ref_set_intersection.asp | |
hits = len(optional.intersection(token_set)) | |
return True, hits | |
# Processes the in-memory index and yields matching lines & details | |
def search_files(self, required, optional): | |
for path, lines in self.index.items(): | |
for line_number, tokens, content in lines: | |
token_set = set(tokens) | |
is_match, hits = \ | |
self.matches(token_set, required, optional) | |
# NOTE: yield (as opposed to return) could be used to | |
# display results as they are found, real-time. | |
# https://www.geeksforgeeks.org/generators-in-python/ | |
if is_match: | |
yield (path, line_number, content) | |
# REPL (Read-Eval-Print Loop) for interactive search | |
def interactive_search(self): | |
while True: | |
try: | |
query = input("> ") | |
if not query: | |
break # Exit when the user enters an empty query | |
# Parse the user's query for "required" and "optional" | |
# items; required items have a "+" prefix. | |
# Grouped quries, e.g., +(foo bar), get OR'd together. | |
required, optional = self.parse_query(query) | |
# Iterate over the generator directly, yielding results | |
# Results follow "path, line number, and line content" | |
for path, line_number, content in \ | |
self.search_files(required, optional): | |
print( | |
f"{COLOR_BLUE}{path}{COLOR_RESET} " | |
f"{COLOR_GREEN}{line_number}{COLOR_RESET} " | |
f"\"{COLOR_YELLOW}{content}{COLOR_RESET}\"" | |
) | |
except KeyboardInterrupt: | |
# This occurs if the user presses Ctrl+C (SIGINT) | |
break | |
def main(): | |
# NOTE: This is really a "hack." We are checking whether or not | |
# the user has supplied 3 ARGUMENTS, whereas our program is actually | |
# supposed to operate on OPTIONS (i.e., --dir) instead; we are | |
# "pretending" to have a --dir option by hardcoding the argv at | |
# index 1 (i.e., the argument after the program name) to be --dir. | |
# I have built full-fledged command-line applications in C before, | |
# but the parser itself took hundreds of lines; I won't do it here. | |
if len(sys.argv) != 3 or sys.argv[1] != '--dir': | |
print(f"Usage: {sys.argv[0]} --dir <directory>") | |
sys.exit(1) | |
directory = sys.argv[2] # Index 2 because index 1 is "--dir" | |
engine = SearchEngine(directory) | |
engine.interactive_search() | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment