manoelcampos · April 14, 2023 08:42
diff --git a/parse-intellij-code-duplication-report.sh b/parse-intellij-code-duplication-report.sh
 #!/bin/bash

 # Wipe the terminal before the banner is printed.
 clear

 # Usage banner. Every printf argument becomes one output line;
 # an empty argument prints a blank line.
 printf '%s\n' \
 	"Parses a directory with a set of HTML files with information about code duplication generated using IntelliJ IDEA 'Analyse > Locate Duplicates' tool." \
 	"" \
 	"The parser just analyses the directories starting with the word 'group'." \
 	"Each dir represents the information about the duplication of a given code block, and just about that single block." \
 	"A given dir contains HTML files that show where a specific block of code was duplicated." \
 	"Each HTML in a dir shows where that block of code was duplicated, can be it across different source code files or in the same one." \
 	"Therefore, to know how many lines of code the duplicated block has, this script just gets the number of lines of the first file in the dir," \
 	"removing the first 2 lines. The other files into the dir just show where the duplication occur." \
 	"To just compute the statistics about number of duplicated lines of code, just the first file in each dir is enough." \
 	"" \
 	"This script was developed by Manoel Campos da Silva Filho. http://about.me/manoelcampos" \
 	"https://gist.github.com/manoelcampos/862bbd1984c97de295b419e3a4a5d432" \
 	""

 # Directory holding the IntelliJ report: the first command-line argument,
 # falling back to the current directory when it is missing or empty
 # (an empty value would otherwise make the pattern below scan "/group*").
 SOURCE_DIR=${1:-.}

 # Removes one trailing slash from the dir name, so paths built from it
 # do not contain a doubled "/".
 SOURCE_DIR=${SOURCE_DIR%/}

 # Glob for the "group*" entries directly inside the source directory.
 # Built after the slash was stripped so it agrees with SOURCE_DIR.
 groups_dir_pattern="$SOURCE_DIR/group*"

 echo "Source diretory: $SOURCE_DIR"
 echo ""

 # Stops early when nothing matches the "group*" pattern.
 # compgen -G expands the glob itself, so the pattern stays quoted and a
 # path containing spaces is not split into several words.
 if ! compgen -G "$groups_dir_pattern" > /dev/null; then
 	echo "The directory '$SOURCE_DIR' doesn't contain IntelliJ IDEA code duplication reports." >&2
 	echo "These reports are genereted inside numbered sub-directories starting with the name 'group'." >&2
 	echo -e "If the reports are in a different directory, provide its path in the command line\n" >&2
 	# Exit statuses are limited to 0-255. bash reported the former "exit -1"
 	# as 255, so that value is spelled out to keep the status callers see.
 	exit 255
 fi

 # Tells the user that parsing is about to start.
 printf '%s\n' "Parsing InteliJ IDEA HTML files with the duplicated code metrics... Please wait... It may take a wile..."

 # Report destination, relative to the current working directory.
 filename="./output.csv"

 # Column titles of the CSV. They are joined with ";" into a single header
 # line that replaces any previous content of the output file.
 csv_columns=(
 	"Directory with information of block duplication"
 	"file used to parse code duplication information"
 	"start line of the duplicated code block inside the analysed source code file"
 	"end line of the duplicated code block"
 	"number of lines of the duplicated code block"
 	"number of times the block was duplicated"
 	"total lines of duplicated code"
 )
 # IFS is changed only inside the command substitution's subshell.
 csv_header=$(IFS=';'; printf '%s' "${csv_columns[*]}")
 printf '%s\n' "$csv_header" > "$filename"

 # Prints "<start> <end>": the first and the last number found between
 # spaces inside the <h4>...</h4> text of the HTML report given as $1.
 # Prints "0 0" when no such number is found. When the last number is 0,
 # the start value is used as the end as well.
 extract_block_line_range() {
 	local report_file=$1
 	local numbers first last

 	# [0-9] replaces \d: POSIX extended regular expressions have no \d
 	# class, so "egrep '\d+'" only matched digits on some grep builds.
 	numbers=$(grep -Eo '<h4>.*</h4>' -- "$report_file" | grep -Eo ' [0-9]+ ' | tr -d ' ')
 	first=$(head -n 1 <<< "$numbers")
 	last=$(tail -n 1 <<< "$numbers")

 	# 10# forces base 10, so a zero-padded number is not parsed as octal;
 	# an empty value counts as 0.
 	first=$(( 10#${first:-0} ))
 	last=$(( 10#${last:-0} ))

 	if (( last == 0 )); then
 		last=$first
 	fi

 	printf '%s %s\n' "$first" "$last"
 }

 # The directories holding the duplication information are named "group"
 # followed by a number. Collects just those numbers, sorted numerically,
 # by globbing instead of parsing the output of ls.
 group_dir_numbers=$(
 	for candidate in "$SOURCE_DIR"/group*; do
 		candidate_name=${candidate##*/}
 		suffix=${candidate_name#group}
 		# keeps only directories whose name is "group" plus digits only
 		if [[ -d $candidate && -n $suffix && -z ${suffix//[0-9]/} ]]; then
 			printf '%s\n' "$suffix"
 		fi
 	done | sort -n
 )

 # Unquoted on purpose: the value holds digits only, one number per line.
 for group_dir_number in $group_dir_numbers; do
 	# complete name of the sub-directory
 	group_dir="$SOURCE_DIR/group$group_dir_number"

 	# every HTML file in the sub-directory is one occurrence of the block
 	html_files=( "$group_dir"/*.html )
 	if [[ ! -e ${html_files[0]} ]]; then
 		# the glob matched nothing and was left as a literal pattern
 		html_files=()
 	fi
 	number_of_times_block_duplicated=${#html_files[@]}

 	# the first HTML file is enough to get the line range of the block
 	first_group_file=${html_files[0]##*/}

 	start_line=0
 	end_line=0
 	if (( number_of_times_block_duplicated > 0 )); then
 		read -r start_line end_line <<< "$(extract_block_line_range "$group_dir/$first_group_file")"
 	fi

 	lines_count_of_duplicated_code_block=$(( end_line - start_line + 1 ))
 	total_lines_of_duplicated_code=$(( number_of_times_block_duplicated * lines_count_of_duplicated_code_block ))

 	# start and end both equal to 0 means no line information was found:
 	# the group is reported on stderr and left out of the statistics
 	if (( start_line + end_line > 0 )); then
 		echo "group$group_dir_number; $first_group_file;$start_line;$end_line;$lines_count_of_duplicated_code_block;$number_of_times_block_duplicated;$total_lines_of_duplicated_code" >> "$filename"
 	else
 		echo "It was not possible to identify the start and end line of the duplicated code block in sub-directory group$group_dir_number" >&2
 	fi
 done

 echo "Finished. See the $filename file."
	#!/bin/bash

	# Wipe the terminal before the banner is printed.
	clear

	# Usage banner. Every printf argument becomes one output line;
	# an empty argument prints a blank line.
	printf '%s\n' \
		"Parses a directory with a set of HTML files with information about code duplication generated using IntelliJ IDEA 'Analyse > Locate Duplicates' tool." \
		"" \
		"The parser just analyses the directories starting with the word 'group'." \
		"Each dir represents the information about the duplication of a given code block, and just about that single block." \
		"A given dir contains HTML files that show where a specific block of code was duplicated." \
		"Each HTML in a dir shows where that block of code was duplicated, can be it across different source code files or in the same one." \
		"Therefore, to know how many lines of code the duplicated block has, this script just gets the number of lines of the first file in the dir," \
		"removing the first 2 lines. The other files into the dir just show where the duplication occur." \
		"To just compute the statistics about number of duplicated lines of code, just the first file in each dir is enough." \
		"" \
		"This script was developed by Manoel Campos da Silva Filho. http://about.me/manoelcampos" \
		"https://gist.github.com/manoelcampos/862bbd1984c97de295b419e3a4a5d432" \
		""

	# Directory holding the IntelliJ report: the first command-line argument,
	# falling back to the current directory when it is missing or empty
	# (an empty value would otherwise make the pattern below scan "/group*").
	SOURCE_DIR=${1:-.}

	# Removes one trailing slash from the dir name, so paths built from it
	# do not contain a doubled "/".
	SOURCE_DIR=${SOURCE_DIR%/}

	# Glob for the "group*" entries directly inside the source directory.
	# Built after the slash was stripped so it agrees with SOURCE_DIR.
	groups_dir_pattern="$SOURCE_DIR/group*"

	echo "Source diretory: $SOURCE_DIR"
	echo ""

	# Stops early when nothing matches the "group*" pattern.
	# compgen -G expands the glob itself, so the pattern stays quoted and a
	# path containing spaces is not split into several words.
	if ! compgen -G "$groups_dir_pattern" > /dev/null; then
		echo "The directory '$SOURCE_DIR' doesn't contain IntelliJ IDEA code duplication reports." >&2
		echo "These reports are genereted inside numbered sub-directories starting with the name 'group'." >&2
		echo -e "If the reports are in a different directory, provide its path in the command line\n" >&2
		# Exit statuses are limited to 0-255. bash reported the former "exit -1"
		# as 255, so that value is spelled out to keep the status callers see.
		exit 255
	fi

	# Tells the user that parsing is about to start.
	printf '%s\n' "Parsing InteliJ IDEA HTML files with the duplicated code metrics... Please wait... It may take a wile..."

	# Report destination, relative to the current working directory.
	filename="./output.csv"

	# Column titles of the CSV. They are joined with ";" into a single header
	# line that replaces any previous content of the output file.
	csv_columns=(
		"Directory with information of block duplication"
		"file used to parse code duplication information"
		"start line of the duplicated code block inside the analysed source code file"
		"end line of the duplicated code block"
		"number of lines of the duplicated code block"
		"number of times the block was duplicated"
		"total lines of duplicated code"
	)
	# IFS is changed only inside the command substitution's subshell.
	csv_header=$(IFS=';'; printf '%s' "${csv_columns[*]}")
	printf '%s\n' "$csv_header" > "$filename"

	# Prints "<start> <end>": the first and the last number found between
	# spaces inside the <h4>...</h4> text of the HTML report given as $1.
	# Prints "0 0" when no such number is found. When the last number is 0,
	# the start value is used as the end as well.
	extract_block_line_range() {
		local report_file=$1
		local numbers first last

		# Real pipes are used here: an escaped "\|" inside backticks is handed
		# to the command as a literal "|" argument and no pipeline is built.
		# [0-9] replaces \d: POSIX extended regular expressions have no \d
		# class, so "egrep '\d+'" only matched digits on some grep builds.
		numbers=$(grep -Eo '<h4>.*</h4>' -- "$report_file" | grep -Eo ' [0-9]+ ' | tr -d ' ')
		first=$(head -n 1 <<< "$numbers")
		last=$(tail -n 1 <<< "$numbers")

		# 10# forces base 10, so a zero-padded number is not parsed as octal;
		# an empty value counts as 0.
		first=$(( 10#${first:-0} ))
		last=$(( 10#${last:-0} ))

		if (( last == 0 )); then
			last=$first
		fi

		printf '%s %s\n' "$first" "$last"
	}

	# The directories holding the duplication information are named "group"
	# followed by a number. Collects just those numbers, sorted numerically,
	# by globbing instead of parsing the output of ls.
	group_dir_numbers=$(
		for candidate in "$SOURCE_DIR"/group*; do
			candidate_name=${candidate##*/}
			suffix=${candidate_name#group}
			# keeps only directories whose name is "group" plus digits only
			if [[ -d $candidate && -n $suffix && -z ${suffix//[0-9]/} ]]; then
				printf '%s\n' "$suffix"
			fi
		done | sort -n
	)

	# Unquoted on purpose: the value holds digits only, one number per line.
	for group_dir_number in $group_dir_numbers; do
		# complete name of the sub-directory
		group_dir="$SOURCE_DIR/group$group_dir_number"

		# every HTML file in the sub-directory is one occurrence of the block
		html_files=( "$group_dir"/*.html )
		if [[ ! -e ${html_files[0]} ]]; then
			# the glob matched nothing and was left as a literal pattern
			html_files=()
		fi
		number_of_times_block_duplicated=${#html_files[@]}

		# the first HTML file is enough to get the line range of the block
		first_group_file=${html_files[0]##*/}

		start_line=0
		end_line=0
		if (( number_of_times_block_duplicated > 0 )); then
			read -r start_line end_line <<< "$(extract_block_line_range "$group_dir/$first_group_file")"
		fi

		lines_count_of_duplicated_code_block=$(( end_line - start_line + 1 ))
		total_lines_of_duplicated_code=$(( number_of_times_block_duplicated * lines_count_of_duplicated_code_block ))

		# start and end both equal to 0 means no line information was found:
		# the group is reported on stderr and left out of the statistics
		if (( start_line + end_line > 0 )); then
			echo "group$group_dir_number; $first_group_file;$start_line;$end_line;$lines_count_of_duplicated_code_block;$number_of_times_block_duplicated;$total_lines_of_duplicated_code" >> "$filename"
		else
			echo "It was not possible to identify the start and end line of the duplicated code block in sub-directory group$group_dir_number" >&2
		fi
	done

	echo "Finished. See the $filename file."