Thraetaona · January 6, 2025 05:59
diff --git a/search.py b/search.py
 #!/usr/bin/env python3

 # ----------------------------------------------------------------------
 # SPDX-License-Identifier: CC0-1.0
 # Authored by Fereydoun Memarzanjany
 #
 # To the maximum extent possible under law, the Author waives all
 # copyright and related or neighboring rights to this code.
 #
 # You should have received a copy of the CC0 legalcode along with this
 # work; if not, see <http://creativecommons.org/publicdomain/zero/1.0/>
 # ----------------------------------------------------------------------


 import os  # O/S filesystem abstraction
 import sys # Get command line arguments
 import re  # Regular expressions (technically not a searching library)

 # Define ANSI X3.64-1974 3-bit color constants
 # Ideally, you'd check if the VT terminal is a TTY and supports ANSI
 # escapes prior to using these, but this is just an academic assignment.
 COLOR_RESET  = "\033[0m"
 COLOR_BLUE   = "\033[94m"
 COLOR_GREEN  = "\033[92m"
 COLOR_YELLOW = "\033[93m"


 # This is used to recursively search a directory (and its subfolders)
 # for files containing matches to a user-given search query.
 #
 # Search queries can be exemplified as follows: foo +bar +(bar baz) bip
 # Preceding a query or a "group" (i.e., perentheses) indicates that it
 # it "required" to appear in a line of text, in any one of the files.
 #     For instance, foo and bip are "optional" queries, which may or may
 # not appear in the line, whereas bar is a "required" term AND _EITHER_
 # of bar or baz are also "required" to appear in the line.
 #
 # NOTE: This does not differentiate between plus (+) signs, meaning
 # "required" terms, and plus signs literally appearing in tokens.
 class SearchEngine:
    def __init__(self, directory):
        self.directory = directory
        # The Rules had asked for the search engine to index and pre-
        # tokenize the files into memory.  In reality, it does not
        # automatically make the program faster, as a lot of factors
        # would depend on I/O-bounded conditions as well as CPU cores'
        # cache sizes and locality; so, it is pretty counter-intuitive
        # to micro-optimize like this.  Also, we'd have to constantly
        # look after the directory for updates (in case the user adds
        # more files/folders) and update the index, as necessary.
        # I am not doing any of that, as they weren't explicitly stated.
        self.index = {}
        self.build_index() # Pre-index the directory prior to user input

    # Builds the in-memory index of files with line numbers and tokens
    def build_index(self):
        # os.walk() generates the file names in a directory tree;
        # it is NOT a searching library outside Python's core utils.
        for dir_path, _dir_names, file_names in os.walk(self.directory):
            for file_name in file_names:
                # NOTE: The file might be UTF-16 on old Windows.
                # TODO: Handle exceptions?
                path = os.path.join(dir_path, file_name)
                with open(path, 'r', encoding='utf-8') as f:
                    lines = []
                    # Iterate over each line in the file
                    for line_num, line in enumerate(f, 1):
                        tokens = self.tokenize(line)
                        lines.append(
                            (line_num, tokens, line.strip())
                        )

                    # This is effectively a hashtable.
                    self.index[os.path.abspath(path)] = lines

    # Naively tokenizes text by splitting on word boundaries (lowercase)
    # https://stackoverflow.com/questions/43647186
    def tokenize(self, text):
        return re.findall(r"\b\w+\b", text.lower())

    # Parses the user query into required and optional terms/groups
    # "grouped" terms, e.g., +(foo bar), are treated as OR conditions
    # Whitespaces separate queries (OR); plus signs indicate "required."
    def parse_query(self, query):
        # The "required" list is an array of set()'s, and the reasoning
        # is that each required "group" only needs to have one of its
        # terms satisfies; in a sense, "optional" is a required element.
        required = []
        optional = set()

        tokens = query.split()
        tokens_iter = iter(tokens)

        # TODO: Maybe use regex instead of if-else branching?
        for token in tokens_iter:
            if token.startswith('+('):
                # Handle required group with OR condition
                if token.endswith(')'):
                    group_content = token[2:-1]
                else:
                    group_parts = [token[2:]]
                    while True:
                        try:
                            next_token = next(tokens_iter)
                            if next_token.endswith(')'):
                                group_parts.append(next_token[:-1])
                                break
                            group_parts.append(next_token)
                        except StopIteration:
                            break
                    group_content = ' '.join(group_parts)

                group_terms = set(group_content.split())
                required.append(group_terms)
            # ----------------------------------------------------------
            elif token.startswith('+'):
                required.append({token[1:]})
            # ----------------------------------------------------------
            elif token.startswith('('):
                # Handle optional group
                if token.endswith(')'):
                    group_content = token[1:-1]
                else:
                    group_parts = [token[1:]]
                    while True:
                        try:
                            next_token = next(tokens_iter)
                            if next_token.endswith(')'):
                                group_parts.append(next_token[:-1])
                                break
                            group_parts.append(next_token)
                        except StopIteration:
                            break
                    group_content = ' '.join(group_parts)
                optional.update(group_content.split())
            # ----------------------------------------------------------
            else:
                optional.add(token.lower())

        return required, optional

    # Determines if a line matches the query criteria
    # It returns a boolean, indicating whether the given token set
    # satisfies all of the "required" criteria, and it also returns
    # the number of hits (i.e., optional terms) in said line.
    def matches(self, token_set, required, optional):
        # For each required "group," at least one term must match
        for group in required:
            if not any(term in token_set for term in group):
                return False, 0

        # Calculate the rank (hits) of optional terms, for sorting
        # https://www.w3schools.com/python/ref_set_intersection.asp
        hits = len(optional.intersection(token_set))
        return True, hits

    # Processes the in-memory index and yields matching lines & details
    def search_files(self, required, optional):
        for path, lines in self.index.items():
            for line_number, tokens, content in lines:
                token_set = set(tokens)
                is_match, hits = \
                    self.matches(token_set, required, optional)

                # NOTE: yield (as opposed to return) could be used to
                # display results as they are found, real-time.
                # https://www.geeksforgeeks.org/generators-in-python/
                if is_match:
                    yield (path, line_number, content)

    # REPL (Read-Eval-Print Loop) for interactive search
    def interactive_search(self):
        while True:
            try:
                query = input("> ")
                if not query:
                    break  # Exit when the user enters an empty query

                # Parse the user's query for "required" and "optional"
                # items; required items have a "+" prefix.
                # Grouped quries, e.g., +(foo bar), get OR'd together.
                required, optional = self.parse_query(query)

                # Iterate over the generator directly, yielding results
                # Results follow "path, line number, and line content"
                for path, line_number, content in \
                    self.search_files(required, optional):

                    print(
                        f"{COLOR_BLUE}{path}{COLOR_RESET} "
                        f"{COLOR_GREEN}{line_number}{COLOR_RESET} "
                        f"\"{COLOR_YELLOW}{content}{COLOR_RESET}\""
                    )
            except KeyboardInterrupt:
                # This occurs if the user presses Ctrl+C (SIGINT)
                break

 def main():
    # NOTE: This is really a "hack."  We are checking whether or not
    # the user has supplied 3 ARGUMENTS, whereas our program is actually
    # supposed to operate on OPTIONS (i.e., --dir) instead; we are
    # "pretending" to have a --dir option by hardcoding the argv at
    # index 1 (i.e., the argument after the program name) to be --dir.
    # I have built full-fledged command-line applications in C before,
    # but the parser itself took hundreds of lines; I won't do it here.
    if len(sys.argv) != 3 or sys.argv[1] != '--dir':
        print(f"Usage: {sys.argv[0]} --dir <directory>")
        sys.exit(1)

    directory = sys.argv[2]  # Index 2 because index 1 is "--dir"

    engine = SearchEngine(directory)
    engine.interactive_search()

 if __name__ == "__main__":
    main()
	#!/usr/bin/env python3

	# ----------------------------------------------------------------------
	# SPDX-License-Identifier: CC0-1.0
	# Authored by Fereydoun Memarzanjany
	#
	# To the maximum extent possible under law, the Author waives all
	# copyright and related or neighboring rights to this code.
	#
	# You should have received a copy of the CC0 legalcode along with this
	# work; if not, see <http://creativecommons.org/publicdomain/zero/1.0/>
	# ----------------------------------------------------------------------


	import os # O/S filesystem abstraction
	import sys # Get command line arguments
	import re # Regular expressions (technically not a searching library)

	# Define ANSI X3.64-1974 3-bit color constants
	# Ideally, you'd check if the VT terminal is a TTY and supports ANSI
	# escapes prior to using these, but this is just an academic assignment.
	COLOR_RESET = "\033[0m"
	COLOR_BLUE = "\033[94m"
	COLOR_GREEN = "\033[92m"
	COLOR_YELLOW = "\033[93m"


	# This is used to recursively search a directory (and its subfolders)
	# for files containing matches to a user-given search query.
	#
	# Search queries can be exemplified as follows: foo +bar +(bar baz) bip
	# Preceding a query or a "group" (i.e., perentheses) indicates that it
	# it "required" to appear in a line of text, in any one of the files.
	# For instance, foo and bip are "optional" queries, which may or may
	# not appear in the line, whereas bar is a "required" term AND _EITHER_
	# of bar or baz are also "required" to appear in the line.
	#
	# NOTE: This does not differentiate between plus (+) signs, meaning
	# "required" terms, and plus signs literally appearing in tokens.
	class SearchEngine:
	def __init__(self, directory):
	self.directory = directory
	# The Rules had asked for the search engine to index and pre-
	# tokenize the files into memory. In reality, it does not
	# automatically make the program faster, as a lot of factors
	# would depend on I/O-bounded conditions as well as CPU cores'
	# cache sizes and locality; so, it is pretty counter-intuitive
	# to micro-optimize like this. Also, we'd have to constantly
	# look after the directory for updates (in case the user adds
	# more files/folders) and update the index, as necessary.
	# I am not doing any of that, as they weren't explicitly stated.
	self.index = {}
	self.build_index() # Pre-index the directory prior to user input

	# Builds the in-memory index of files with line numbers and tokens
	def build_index(self):
	# os.walk() generates the file names in a directory tree;
	# it is NOT a searching library outside Python's core utils.
	for dir_path, _dir_names, file_names in os.walk(self.directory):
	for file_name in file_names:
	# NOTE: The file might be UTF-16 on old Windows.
	# TODO: Handle exceptions?
	path = os.path.join(dir_path, file_name)
	with open(path, 'r', encoding='utf-8') as f:
	lines = []
	# Iterate over each line in the file
	for line_num, line in enumerate(f, 1):
	tokens = self.tokenize(line)
	lines.append(
	(line_num, tokens, line.strip())
	)

	# This is effectively a hashtable.
	self.index[os.path.abspath(path)] = lines

	# Naively tokenizes text by splitting on word boundaries (lowercase)
	# https://stackoverflow.com/questions/43647186
	def tokenize(self, text):
	return re.findall(r"\b\w+\b", text.lower())

	# Parses the user query into required and optional terms/groups
	# "grouped" terms, e.g., +(foo bar), are treated as OR conditions
	# Whitespaces separate queries (OR); plus signs indicate "required."
	def parse_query(self, query):
	# The "required" list is an array of set()'s, and the reasoning
	# is that each required "group" only needs to have one of its
	# terms satisfies; in a sense, "optional" is a required element.
	required = []
	optional = set()

	tokens = query.split()
	tokens_iter = iter(tokens)

	# TODO: Maybe use regex instead of if-else branching?
	for token in tokens_iter:
	if token.startswith('+('):
	# Handle required group with OR condition
	if token.endswith(')'):
	group_content = token[2:-1]
	else:
	group_parts = [token[2:]]
	while True:
	try:
	next_token = next(tokens_iter)
	if next_token.endswith(')'):
	group_parts.append(next_token[:-1])
	break
	group_parts.append(next_token)
	except StopIteration:
	break
	group_content = ' '.join(group_parts)

	group_terms = set(group_content.split())
	required.append(group_terms)
	# ----------------------------------------------------------
	elif token.startswith('+'):
	required.append({token[1:]})
	# ----------------------------------------------------------
	elif token.startswith('('):
	# Handle optional group
	if token.endswith(')'):
	group_content = token[1:-1]
	else:
	group_parts = [token[1:]]
	while True:
	try:
	next_token = next(tokens_iter)
	if next_token.endswith(')'):
	group_parts.append(next_token[:-1])
	break
	group_parts.append(next_token)
	except StopIteration:
	break
	group_content = ' '.join(group_parts)
	optional.update(group_content.split())
	# ----------------------------------------------------------
	else:
	optional.add(token.lower())

	return required, optional

	# Determines if a line matches the query criteria
	# It returns a boolean, indicating whether the given token set
	# satisfies all of the "required" criteria, and it also returns
	# the number of hits (i.e., optional terms) in said line.
	def matches(self, token_set, required, optional):
	# For each required "group," at least one term must match
	for group in required:
	if not any(term in token_set for term in group):
	return False, 0

	# Calculate the rank (hits) of optional terms, for sorting
	# https://www.w3schools.com/python/ref_set_intersection.asp
	hits = len(optional.intersection(token_set))
	return True, hits

	# Processes the in-memory index and yields matching lines & details
	def search_files(self, required, optional):
	for path, lines in self.index.items():
	for line_number, tokens, content in lines:
	token_set = set(tokens)
	is_match, hits = \
	self.matches(token_set, required, optional)

	# NOTE: yield (as opposed to return) could be used to
	# display results as they are found, real-time.
	# https://www.geeksforgeeks.org/generators-in-python/
	if is_match:
	yield (path, line_number, content)

	# REPL (Read-Eval-Print Loop) for interactive search
	def interactive_search(self):
	while True:
	try:
	query = input("> ")
	if not query:
	break # Exit when the user enters an empty query

	# Parse the user's query for "required" and "optional"
	# items; required items have a "+" prefix.
	# Grouped quries, e.g., +(foo bar), get OR'd together.
	required, optional = self.parse_query(query)

	# Iterate over the generator directly, yielding results
	# Results follow "path, line number, and line content"
	for path, line_number, content in \
	self.search_files(required, optional):

	print(
	f"{COLOR_BLUE}{path}{COLOR_RESET} "
	f"{COLOR_GREEN}{line_number}{COLOR_RESET} "
	f"\"{COLOR_YELLOW}{content}{COLOR_RESET}\""
	)
	except KeyboardInterrupt:
	# This occurs if the user presses Ctrl+C (SIGINT)
	break

	def main():
	# NOTE: This is really a "hack." We are checking whether or not
	# the user has supplied 3 ARGUMENTS, whereas our program is actually
	# supposed to operate on OPTIONS (i.e., --dir) instead; we are
	# "pretending" to have a --dir option by hardcoding the argv at
	# index 1 (i.e., the argument after the program name) to be --dir.
	# I have built full-fledged command-line applications in C before,
	# but the parser itself took hundreds of lines; I won't do it here.
	if len(sys.argv) != 3 or sys.argv[1] != '--dir':
	print(f"Usage: {sys.argv[0]} --dir <directory>")
	sys.exit(1)

	directory = sys.argv[2] # Index 2 because index 1 is "--dir"

	engine = SearchEngine(directory)
	engine.interactive_search()

	if __name__ == "__main__":
	main()