Last active
April 14, 2023 08:42
-
-
Save manoelcampos/862bbd1984c97de295b419e3a4a5d432 to your computer and use it in GitHub Desktop.
Parses the HTML files generated by the 'Analyse > Locate Duplicates' tool of IntelliJ IDEA and then creates a CSV file with summarised statistics. Pass the script the path of the directory where the IntelliJ reports were generated and it will parse everything.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Summarises the HTML reports exported by IntelliJ IDEA's
# 'Analyse > Locate Duplicates' tool into a semicolon-separated CSV file.
#
# Usage: <this-script> [reports_dir]
#   reports_dir  Directory that holds the numbered 'group<N>' sub-directories.
#                Defaults to the current directory.
#
# Output:      ./output.csv (overwritten on every run), one row per group.
# Exit status: 0 on success, 255 when no 'group<N>' sub-directory is found.
#
# Only constructs available in bash 3.2 are used (no mapfile, no associative
# arrays), and only POSIX ERE syntax is passed to grep, so the script behaves
# the same with GNU and BSD userlands.

readonly OUTPUT_FILE="./output.csv"

#######################################
# Prints the explanatory banner shown at start-up.
# Outputs: the banner on stdout.
#######################################
print_banner() {
  cat <<'EOF'
Parses a directory with a set of HTML files with information about code duplication generated using IntelliJ IDEA 'Analyse > Locate Duplicates' tool.

The parser just analyses the directories starting with the word 'group'.
Each dir represents the information about the duplication of a given code block, and just about that single block.
A given dir contains HTML files that show where a specific block of code was duplicated.
Each HTML in a dir shows where that block of code was duplicated, can be it across different source code files or in the same one.
Therefore, to know how many lines of code the duplicated block has, this script just gets the number of lines of the first file in the dir,
removing the first 2 lines. The other files into the dir just show where the duplication occur.
To just compute the statistics about number of duplicated lines of code, just the first file in each dir is enough.

This script was developed by Manoel Campos da Silva Filho. http://about.me/manoelcampos
https://gist.github.com/manoelcampos/862bbd1984c97de295b419e3a4a5d432

EOF
}

#######################################
# Lists the numeric suffix of every 'group<N>' sub-directory, sorted
# numerically.
# Arguments: $1 - directory to scan
# Outputs:   one number per line on stdout (nothing when none is found)
#######################################
list_group_numbers() {
  local source_dir=$1
  local group_name_re='^group([0-9]+)$'
  local path name

  # When nothing matches, the glob stays literal and is rejected by the -d test.
  for path in "$source_dir"/group*; do
    [[ -d "$path" ]] || continue
    name=${path##*/}
    if [[ "$name" =~ $group_name_re ]]; then
      printf '%s\n' "${BASH_REMATCH[1]}"
    fi
  done | sort -n
}

#######################################
# Extracts the numbers that appear surrounded by spaces inside the <h4>
# heading(s) of a report; the script uses the first one as the start line and
# the last one as the end line of the duplicated block.
# Arguments: $1 - path of the HTML report
# Outputs:   one number per line on stdout, in document order
#######################################
extract_line_numbers() {
  local report=$1

  # [0-9] rather than \d: \d is not part of POSIX ERE and GNU grep does not
  # treat it as a digit class, so patterns using it match nothing there.
  grep -oE '<h4>.*</h4>' -- "$report" \
    | grep -oE '( [0-9]+ )+' \
    | grep -oE '[0-9]+'
}

#######################################
# Entry point: validates the reports directory and writes the CSV.
# Arguments: $1 - (optional) reports directory, defaults to '.'
#######################################
main() {
  local source_dir=${1:-.}
  # Strip a trailing slash before the path is used to build any other path.
  source_dir=${source_dir%/}

  clear
  print_banner
  echo "Source diretory: $source_dir"
  echo ""

  local -a group_numbers=()
  local number
  while IFS= read -r number; do
    group_numbers+=("$number")
  done < <(list_group_numbers "$source_dir")

  if (( ${#group_numbers[@]} == 0 )); then
    echo "The directory '$source_dir' doesn't contain IntelliJ IDEA code duplication reports." >&2
    echo "These reports are genereted inside numbered sub-directories starting with the name 'group'." >&2
    echo -e "If the reports are in a different directory, provide its path in the command line\n" >&2
    # Explicit form of the status bash reports for the out-of-range 'exit -1'.
    exit 255
  fi

  echo "Parsing InteliJ IDEA HTML files with the duplicated code metrics... Please wait... It may take a wile..."
  echo "Directory with information of block duplication;file used to parse code duplication information;start line of the duplicated code block inside the analysed source code file;end line of the duplicated code block;number of lines of the duplicated code block;number of times the block was duplicated;total lines of duplicated code" > "$OUTPUT_FILE"

  local group_dir first_report value last_index
  local -a html_files numbers
  local start_line end_line block_lines copies total_lines

  for number in "${group_numbers[@]}"; do
    group_dir="$source_dir/group$number"

    # Each HTML file marks one place where the block occurs, so the number of
    # files is the number of copies. An unmatched glob stays literal and does
    # not exist, which is how an empty group is detected.
    html_files=("$group_dir"/*.html)
    numbers=()
    if [[ -e "${html_files[0]}" ]]; then
      # The first report is enough to learn the size of the duplicated block.
      first_report=${html_files[0]}
      while IFS= read -r value; do
        numbers+=("$value")
      done < <(extract_line_numbers "$first_report")
    fi

    start_line=0
    end_line=0
    if (( ${#numbers[@]} > 0 )); then
      last_index=$(( ${#numbers[@]} - 1 ))
      # 10# forces base 10 so a zero-padded number is not read as octal.
      start_line=$(( 10#${numbers[0]} ))
      end_line=$(( 10#${numbers[last_index]} ))
    fi

    # A one-line block carries no end line: reuse the start line.
    if (( end_line == 0 )); then
      end_line=$start_line
    fi

    # Both values being 0 means no line information was found; skip the group.
    if (( start_line + end_line <= 0 )); then
      echo "It was not possible to identify the start and end line of the duplicated code block in sub-directory group$number" >&2
      continue
    fi

    block_lines=$(( end_line - start_line + 1 ))
    copies=${#html_files[@]}
    total_lines=$(( copies * block_lines ))

    # The space after the first ';' is kept so rows match earlier outputs.
    printf '%s\n' "group$number; ${first_report##*/};$start_line;$end_line;$block_lines;$copies;$total_lines"
  done >> "$OUTPUT_FILE"

  echo "Finished. See the $OUTPUT_FILE file."
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment