Last active
May 4, 2022 14:27
-
-
Save fijiaaron/9cc5bf4c822a169144473f2076becc17 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## check_for_distinct_paths.py | |
""" | |
Given a list of paths, return a list of distinct paths: | |
If path1 is the same as path2, they are considered duplicates. | |
For example: | |
>>> get_distinct_paths(['abc/def', 'abc/def']) | |
['abc/def'] | |
If path1 is a prefix of path2, then path1 is considered a duplicate of path2. | |
For example: | |
>>> get_distinct_paths(['abc', 'abc/def']) | |
['abc/def'] | |
>>> get_distinct_paths(['abc', 'abc/def', 'abc/def/ghi', 'abc/def/xyz']) | |
['abc/def/ghi', 'abc/def/xyz'] | |
Search all paths and eliminate all that are prefixes of others and return the list of distinct paths. | |
For example: | |
>>> get_distinct_paths(['abc', 'abc/def/ghi', 'abc/def/ghi/jkl', 'abc/def/ghijkl/jkl', 'def']) | |
['abc/def/ghi/jkl', 'abc/def/ghijkl/jkl', 'def'] | |
We are essentially finding potential files, but eliminating potential directories that contain those files. | |
""" | |
def get_distinct_paths(list_of_paths): | |
## split paths into parts so we can compare lists | |
split_paths = [list(path.split("/")) for path in list_of_paths] | |
# print('split_paths', split_paths) | |
## sort paths so that we can more easily check shortest to longest | |
## (this is not needed) | |
# sorted_paths = sorted(split_paths, key=len) | |
# print('sorted_paths', sorted_paths) | |
## just assign back to "paths" for simpler reading (even though what we have now is a sorted list of lists) | |
paths = split_paths | |
# paths = sorted_paths | |
## keep track of distict paths and those that are just partials (i.e. directories, not files) | |
distincts = [] | |
partials = [] | |
## check each path to see if it is part of any of the other paths | |
for i, path in enumerate(paths): | |
distinct = True | |
# print('path', i, path) | |
## compare it against each remaining path here | |
## (one thing that tripped me up was that `split_paths.pop(0)` will alter the original list, because it's a reference) | |
for j, rpath in enumerate(paths[i+1:]): | |
# print(f'\t checking if path {path} is in {rpath}') | |
## only need to see if the list of elements in the path we are checking matches the first len() elements in the other paths | |
if path == rpath[:len(path)]: | |
## if it does, we know it's not distinct and can move on | |
distinct = False | |
# print('\t\t partial found', path) | |
## convert path back into a string | |
path = '/'.join(path) | |
## add to list of partial paths | |
partials.append(path) | |
break | |
if distinct: | |
# print('\t\t distinct found', path) | |
## convert path back into a string | |
path = '/'.join(path) | |
## add to list of distinct paths | |
distincts.append(path) | |
return distincts | |
## run doctest by default when invoked directly | |
## (typically you would import like so: | |
## `from check_for_distinct_paths import get_distinct_paths` | |
if __name__ == '__main__': | |
import doctest | |
doctest.testmod() | |
## run a test and verify expected result | |
paths = ['abc', 'abc/def/ghi', 'abc/def/ghi/jkl', 'abc/def/ghijkl/jkl', 'def'] | |
expected = [ 'abc/def/ghi/jkl', 'abc/def/ghijkl/jkl', 'def' ] | |
distinct_paths = get_distinct_paths(paths) | |
print('distinct_paths', distinct_paths) | |
## convert list to a set for easier comparison | |
assert set(distinct_paths) == set(expected) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment