amakukha · December 29, 2022 19:51
diff --git a/code_duplication.py b/code_duplication.py
 #!/usr/bin/env python3

 '''
 Code duplication assessment tool. Runs in linear time.
 Usage:
    Put your packages into a single directory and run this script:
        python3 code_duplication.py > report.txt
    Then sort files by similarity:
        cat report.txt | awk 'NF > 1' | sort -rn | less
 '''

 import os
 from collections import Counter, defaultdict

 SNIPPET_SIZE = 5    # number of consecutive non-empty lines that form one "snippet"
 VERBOSE = False     # when True, also print each matching snippet and the file it was seen in
 FILE_EXTENSIONS = (".java", ".ts", ".js", ".tsx", ".jsx", ".py", ".cpp", ".c")

 # Walk the current directory once. For every source file, slide a window of
 # SNIPPET_SIZE stripped, non-empty lines over it and look each window up in
 # `snippets`. A file is therefore only compared against files scanned before it.
 snippets = defaultdict(list)    # snippet text -> files it has been seen in (one entry per occurrence)
 for root, dirs, files in os.walk("."):
     for file in files:
         if not file.endswith(FILE_EXTENSIONS):
             continue
         filename = os.path.join(root, file)
         print()
         print(filename)
         lines = []                      # non-empty lines of this file, stripped and prefixed with '\n\t'
         snippets_count = 0              # number of snippets (windows) produced by this file
         similar_files = Counter()       # filename -> number of snippet matches
         # The context manager closes the handle as soon as the file is read.
         # errors="replace" keeps a single undecodable byte from aborting the
         # whole report with UnicodeDecodeError.
         with open(filename, errors="replace") as source:
             for line in source:
                 line = line.strip()
                 if not line:
                     continue
                 lines.append('\n\t' + line) # separator \n\t is added to every line
                 if len(lines) < SNIPPET_SIZE:
                     continue
                 snippets_count += 1
                 snippet = ''.join(lines[-SNIPPET_SIZE:])
                 # The defaultdict yields an empty list for a snippet that has
                 # not been seen yet, so no membership test is needed.
                 seen_in = snippets[snippet]
                 for similar_filename in seen_in:
                     if VERBOSE:
                         print('-', similar_filename)
                         print(snippet)
                     similar_files[similar_filename] += 1
                 seen_in.append(filename)
         for similar_filename, identical_count in similar_files.items():
             # Snippets repeated inside the same file match the file itself; skip those.
             if similar_filename == filename:
                 continue
             # Matches as a percentage of this file's own snippet count.
             score = identical_count*100/snippets_count     # TODO: make the formula symmetrical
             print('\t{}\t{:.1f}\t{}\t{}'.format(identical_count, score, similar_filename, filename))
	#!/usr/bin/env python3

	'''
	Code duplication assessment tool. Runs in linear time.
	Usage:
	Put your packages into a single directory and run this script:
	python3 code_duplication.py > report.txt
	Then sort files by similarity:
	cat report.txt | awk 'NF > 1' | sort -rn | less
	'''

	import os
	from collections import Counter, defaultdict

	SNIPPET_SIZE = 5 # number of consecutive non-empty lines that form one "snippet"
	VERBOSE = False # when True, also print each matching snippet and the file it was seen in
	FILE_EXTENSIONS = (".java", ".ts", ".js", ".tsx", ".jsx", ".py", ".cpp", ".c")

	# Walk the current directory once. For every source file, slide a window of
	# SNIPPET_SIZE stripped, non-empty lines over it and look each window up in
	# `snippets`. A file is therefore only compared against files scanned before it.
	snippets = defaultdict(list) # snippet text -> files it has been seen in (one entry per occurrence)
	for root, dirs, files in os.walk("."):
	    for file in files:
	        if not file.endswith(FILE_EXTENSIONS):
	            continue
	        filename = os.path.join(root, file)
	        print()
	        print(filename)
	        lines = [] # non-empty lines of this file, stripped and prefixed with '\n\t'
	        snippets_count = 0 # number of snippets (windows) produced by this file
	        similar_files = Counter() # filename -> number of snippet matches
	        # The context manager closes the handle as soon as the file is read.
	        # errors="replace" keeps a single undecodable byte from aborting the
	        # whole report with UnicodeDecodeError.
	        with open(filename, errors="replace") as source:
	            for line in source:
	                line = line.strip()
	                if not line:
	                    continue
	                lines.append('\n\t' + line) # separator \n\t is added to every line
	                if len(lines) < SNIPPET_SIZE:
	                    continue
	                snippets_count += 1
	                snippet = ''.join(lines[-SNIPPET_SIZE:])
	                # The defaultdict yields an empty list for a snippet that has
	                # not been seen yet, so no membership test is needed.
	                seen_in = snippets[snippet]
	                for similar_filename in seen_in:
	                    if VERBOSE:
	                        print('-', similar_filename)
	                        print(snippet)
	                    similar_files[similar_filename] += 1
	                seen_in.append(filename)
	        for similar_filename, identical_count in similar_files.items():
	            # Snippets repeated inside the same file match the file itself; skip those.
	            if similar_filename == filename:
	                continue
	            # Matches as a percentage of this file's own snippet count.
	            score = identical_count*100/snippets_count # TODO: make the formula symmetrical
	            print('\t{}\t{:.1f}\t{}\t{}'.format(identical_count, score, similar_filename, filename))