fijiaaron · May 4, 2022 14:27
diff --git a/check_for_distinct_paths.py b/check_for_distinct_paths.py
 ## check_for_distinct_paths.py

 """ 
 Given a list of paths, return a list of distinct paths:

 If path1 is the same as path2, they are considered duplicates.
 For example:

 >>> get_distinct_paths(['abc/def', 'abc/def'])
 ['abc/def']

 If path1 is a prefix of path2, then path1 is considered a duplicate of path2.
 For example:

 >>> get_distinct_paths(['abc', 'abc/def'])
 ['abc/def']

 >>> get_distinct_paths(['abc', 'abc/def', 'abc/def/ghi', 'abc/def/xyz'])
 ['abc/def/ghi', 'abc/def/xyz']

 Search all paths and eliminate all that are prefixes of others and return the list of distinct paths.
 For example:

 >>> get_distinct_paths(['abc', 'abc/def/ghi', 'abc/def/ghi/jkl', 'abc/def/ghijkl/jkl', 'def'])
 ['abc/def/ghi/jkl', 'abc/def/ghijkl/jkl', 'def']

 We are essentially finding potential files, but eliminating potential directories that contain those files.
 """
 def get_distinct_paths(list_of_paths):
 	
 	## split paths into parts so we can compare lists
 	split_paths = [list(path.split("/")) for path in list_of_paths]
 	# print('split_paths', split_paths)


 	## sort paths so that we can more easily check shortest to longest
 	## (this is not needed)
 	# sorted_paths = sorted(split_paths, key=len)
 	# print('sorted_paths', sorted_paths)


 	## just assign back to "paths" for simpler reading (even though what we have now is a sorted list of lists)
 	paths = split_paths
 	# paths = sorted_paths
 	
 	## keep track of distict paths and those that are just partials (i.e. directories, not files)
 	distincts = []
 	partials = []

 	## check each path to see if it is part of any of the other paths
 	for i, path in enumerate(paths):
 		distinct = True
 		# print('path', i, path)

 		## compare it against each remaining path here
 		## (one thing that tripped me up was that `split_paths.pop(0)` will alter the original list, because it's a reference)
 		for j, rpath in enumerate(paths[i+1:]):        
 			# print(f'\t checking if path {path} is in {rpath}')

 			## only need to see if the list of elements in the path we are checking matches the first len() elements in the other paths
 			if path == rpath[:len(path)]:
 				## if it does, we know it's not distinct and can move on
 				distinct = False
 				# print('\t\t partial found', path)
 				## convert path back into a string
 				path = '/'.join(path)
 				## add to list of partial paths
 				partials.append(path)
 				break
 		if distinct: 
 			# print('\t\t distinct found', path)
 			## convert path back into a string
 			path = '/'.join(path)
 			## add to list of distinct paths
 			distincts.append(path)			
 	return distincts


 ## run doctest by default when invoked directly
 ## (typically you would import like so:
 ## `from check_for_distinct_paths import get_distinct_paths`
 if __name__ == '__main__':
 	import doctest
 	doctest.testmod()

 ## run a test and verify expected result
 paths = ['abc', 'abc/def/ghi', 'abc/def/ghi/jkl', 'abc/def/ghijkl/jkl', 'def']
 expected = [ 'abc/def/ghi/jkl', 'abc/def/ghijkl/jkl', 'def' ]

 distinct_paths = get_distinct_paths(paths)
 print('distinct_paths', distinct_paths)

 ## convert list to a set for easier comparison
 assert set(distinct_paths) == set(expected)
	## check_for_distinct_paths.py

	"""
	Given a list of paths, return a list of distinct paths:

	If path1 is the same as path2, they are considered duplicates.
	For example:

	>>> get_distinct_paths(['abc/def', 'abc/def'])
	['abc/def']

	If path1 is a prefix of path2, then path1 is considered a duplicate of path2.
	For example:

	>>> get_distinct_paths(['abc', 'abc/def'])
	['abc/def']

	>>> get_distinct_paths(['abc', 'abc/def', 'abc/def/ghi', 'abc/def/xyz'])
	['abc/def/ghi', 'abc/def/xyz']

	Search all paths and eliminate all that are prefixes of others and return the list of distinct paths.
	For example:

	>>> get_distinct_paths(['abc', 'abc/def/ghi', 'abc/def/ghi/jkl', 'abc/def/ghijkl/jkl', 'def'])
	['abc/def/ghi/jkl', 'abc/def/ghijkl/jkl', 'def']

	We are essentially finding potential files, but eliminating potential directories that contain those files.
	"""
	def get_distinct_paths(list_of_paths):

	## split paths into parts so we can compare lists
	split_paths = [list(path.split("/")) for path in list_of_paths]
	# print('split_paths', split_paths)


	## sort paths so that we can more easily check shortest to longest
	## (this is not needed)
	# sorted_paths = sorted(split_paths, key=len)
	# print('sorted_paths', sorted_paths)


	## just assign back to "paths" for simpler reading (even though what we have now is a sorted list of lists)
	paths = split_paths
	# paths = sorted_paths

	## keep track of distict paths and those that are just partials (i.e. directories, not files)
	distincts = []
	partials = []

	## check each path to see if it is part of any of the other paths
	for i, path in enumerate(paths):
	distinct = True
	# print('path', i, path)

	## compare it against each remaining path here
	## (one thing that tripped me up was that `split_paths.pop(0)` will alter the original list, because it's a reference)
	for j, rpath in enumerate(paths[i+1:]):
	# print(f'\t checking if path {path} is in {rpath}')

	## only need to see if the list of elements in the path we are checking matches the first len() elements in the other paths
	if path == rpath[:len(path)]:
	## if it does, we know it's not distinct and can move on
	distinct = False
	# print('\t\t partial found', path)
	## convert path back into a string
	path = '/'.join(path)
	## add to list of partial paths
	partials.append(path)
	break
	if distinct:
	# print('\t\t distinct found', path)
	## convert path back into a string
	path = '/'.join(path)
	## add to list of distinct paths
	distincts.append(path)
	return distincts


	## run doctest by default when invoked directly
	## (typically you would import like so:
	## `from check_for_distinct_paths import get_distinct_paths`
	if __name__ == '__main__':
	import doctest
	doctest.testmod()

	## run a test and verify expected result
	paths = ['abc', 'abc/def/ghi', 'abc/def/ghi/jkl', 'abc/def/ghijkl/jkl', 'def']
	expected = [ 'abc/def/ghi/jkl', 'abc/def/ghijkl/jkl', 'def' ]

	distinct_paths = get_distinct_paths(paths)
	print('distinct_paths', distinct_paths)

	## convert list to a set for easier comparison
	assert set(distinct_paths) == set(expected)