cgpu · June 16, 2021 02:05
diff --git a/finddups.nf b/finddups.nf
 // Default values of the workflow parameters.

 // -- what to scan
 params.directories = "."
 params.extensions = "bam bai"
 params.extrafind = ""

 // -- how to scan
 params.headsize = 100000
 params.lines = 1000

 // -- output and misc.
 params.publishDir = "."
 params.help = false

 /**
  * Write the usage of this workflow to the Nextflow log.
  * The current value of each parameter is shown between square brackets.
  */
 def helpMessage() {
  log.info"""

 =========================================
 Usage:
 	find duplicate files, generate a bash script to generate symbolic links replacing the duplicates.


 Main options:
 	--directories (dir) find files under dir [${params.directories}]
 	--extensions 'string' space/comma/pipe separated list of extensions [${params.extensions}]

 Other options:
 	--extrafind (string) extra arguments for find. eg. " -size '+10000' "
 	--lines (int) max number of files handled by each parallel MD5 job [${params.lines}]
 	--headsize (int) max number of bytes for calculating the MD5 [${params.headsize}]
 	--publishDir (dir) directory where the generated script 'fdups.bash' is copied [${params.publishDir}]
 	--help print this message and exit

 Nextflow options:
 	-w  Work directory used by Nextflow.

 workflow Author: Pierre Lindenbaum @yokofakun 20200304
 =========================================

  """
 }


 // --help : print the usage and stop with a success status.
 if (params.help)
 	{
 	helpMessage()
 	exit 0
 	}


 /*
  * List every regular file found under params.directories whose name ends
  * with one of params.extensions. Output: 'split.list', one path per line.
  *
  * The task runs in its own work directory, so relative search roots
  * (including the default ".") are first resolved against the directory
  * where the workflow was launched. The listed paths are then absolute and
  * remain valid for tasks running in other work directories.
  */
 process findFiles {
 	tag "${params.extensions}"
 	cache 'lenient'
 	executor 'local'
 	cpus 5
 	output:
 		file("split.list") into split_list
 	script:
 	// extensions -> alternatives of an emacs-style regex (default syntax of 'find -regex'), dots escaped
 	def alternatives = params.extensions.toString().split("[ ,\\|]+").findAll{T->!T.isEmpty()}.collect{T->T.replaceAll("\\.","\\\\.")}.join("\\|")
 	def suffixes = ".*\\.\\(" + alternatives + "\\)\$"
 	// whitespace separated roots -> absolute paths ; spaces coming from the launch directory
 	// are backslash-escaped, the roots stay unquoted so shell globs keep working
 	def roots = params.directories.toString().tokenize().collect{D->workflow.launchDir.resolve(D).normalize().toString().replace(" ","\\ ")}.join(" ")
 	"""
 	find ${roots}  -type f -regex '${suffixes}' ${params.extrafind} > split.list
 	"""
 	}

 /*
  * Sort and de-duplicate the list of paths, then cut it into chunk files
  * of at most params.lines lines each. Output: 'chunks.txt', the absolute
  * path of each chunk file, one per line.
  */
 process split {
 	tag "N=${params.lines}"
 	executor 'local'
 	input:
 		file splitin from split_list
 	output:
 		file("chunks.txt") into chunk_list
 	script:
 	// the input holds one path per line: it is sorted as a whole line, no column extraction
 	"""
 	LC_ALL=C sort -T . "${splitin}" | uniq |\
 		split -a 9 --additional-suffix=.list --lines=${params.lines} - chunck.
 	find "\${PWD}" -type f -name "chunck.*.list" > chunks.txt
 	"""
 	}


 // Emit one item per non-empty line of chunks.txt. The whole line is the path of a chunk file
 // (it is not parsed as CSV, so a comma in the path is kept).
 chunk_list.splitText().map{T->T.trim()}.filter{T->!T.isEmpty()}.set{chunk_items}

 /* scan a set of files, extract the first bytes of each file and calculate their MD5.
   Output is
 
     MD5(comma)path(comma)date(comma)size


 sorted on MD5 and date
 */
 process md5sum {
 	tag "${chunk}"
 	cache 'lenient'
 	input:
 		val chunk from chunk_items
 	output:
 		file("md5.list") into md5_list
 	script:
 	// 'chunk' is the path of a text file holding one file path per line.
 	// For each path that is still a regular file, one line is written:
 	//   MD5 of the first params.headsize bytes , path , modification time (epoch) , size (bytes)
 	// 'IFS= read -r' keeps backslashes and leading/trailing blanks of the path untouched.
 	"""
 	set -o pipefail

 	while IFS= read -r F
 	do
 		if [ -f "\${F}" ]; then
 			head -c '${params.headsize}'  "\${F}" | md5sum | cut -d ' ' -f1 | tr "\\n" ","
 			printf '%s,' "\${F}"
 			stat -c "%Y,%s" "\${F}"
 		fi
 	done < "${chunk}" | sort -T. -t "," -k1,1 -k3,3n > md5.list
 	"""
 	}

 /**
  * Merge the per-chunk lists into 'merged.list'.
  * Each input is already sorted on MD5 (column 1) then date (column 3, numeric),
  * so 'sort --merge' with the same keys only has to interleave them.
  */
 process merge {
 	executor "local"
 	tag "N=${L.size()}"
 	input:
 		val L from md5_list.collect()
 	output:
 		file("merged.list") into merged_list
 	script:
 	// each path is single-quoted so that blanks in a work directory do not split the argument
 	def sortedLists = L.collect{F->"'" + F.toString() + "'"}.join(" ")
 	"""
 	sort -T . -t "," -k1,1 -k3,3n --merge  ${sortedLists} > merged.list
 	"""
 	}

 /**
  * Create the cleanup bash script 'fdups.bash' and copy it into params.publishDir.
  *
  * The process writes an awk program (jeter.awk) and runs it on the merged list,
  * whose comma separated columns are read as: $1 = hash, $2 = path, $4 = size.
  *
  * For each run of consecutive lines sharing the same hash:
  *   - the first path is kept as the reference (PREV_FILE);
  *   - a following line with the same path as the reference produces a '##ERROR' comment;
  *   - any other following path produces one '&&' chain:
  *       test -f reference && test -f path && cmp reference path
  *       && mv path path.back && ln -s reference path && rm path.back
  *
  * N sums the size column of every path for which a chain is printed. The final
  * "Saved N bytes" line is computed when the script is generated, not when it runs,
  * so it also counts chains whose 'cmp' later fails.
  *
  * The generated script is only written here; this process does not execute it.
  */
 process createScript {
 	publishDir "${params.publishDir}" ,  mode: 'copy', overwrite: true
 	executor "local"
 	input:
 		file merged from merged_list
 	output:
 		file("fdups.bash") into bash_script
 	script:
 	// The awk source goes through this Groovy string and then an unquoted shell
 	// here-document before awk reads it, hence the stacked backslashes.
 	"""
 cat << __EOF__ > jeter.awk
 BEGIN	{
 	printf("#!/bin/bash\\nset -e\\n");
 	PREV_HASH="";
 	PREV_FILE="";
 	FS=","
 	}
 	{
 	FNAME=\\\$2;
 	if(\\\$1!=PREV_HASH) {
 		PREV_HASH = \\\$1;
 		PREV_FILE = FNAME;
 		}
 	else if(FNAME==PREV_FILE) {
 		printf("##ERROR %s %s\\n",FNAME,PREV_FILE);
 		}
 	else
 		{
 		printf("# %s\\n",\\\$1);
 		printf("test -f \\"%s\\" ", PREV_FILE);
 		printf(" && test -f \\"%s\\" ", FNAME);
 		printf(" && cmp  \\"%s\\" \\"%s\\" ", PREV_FILE,FNAME);
 		printf(" && mv -v  \\"%s\\" \\"%s.back\\" ", FNAME,FNAME);
 		printf(" && ln -s  \\"%s\\" \\"%s\\" ",PREV_FILE, FNAME);
 		printf(" && rm -v  \\"%s.back\\"\\n", FNAME);
 		N+=int(\\\$4);
 		printf("\\n");
 		}
 	}

 END	{
 	printf("echo 'Saved %d bytes.'\\n\",N);
 	}
 __EOF__


 awk -f jeter.awk "${merged}" > fdups.bash

 	"""
 	}
	// Default values of the workflow parameters.

	// -- what to scan
	params.directories = "."
	params.extensions = "bam bai"
	params.extrafind = ""

	// -- how to scan
	params.headsize = 100000
	params.lines = 1000

	// -- output and misc.
	params.publishDir = "."
	params.help = false

	/**
	 * Write the usage of this workflow to the Nextflow log.
	 * The current value of each parameter is shown between square brackets.
	 */
	def helpMessage() {
	log.info"""

	=========================================
	Usage:
	find duplicate files, generate a bash script to generate symbolic links replacing the duplicates.


	Main options:
	--directories (dir) find files under dir [${params.directories}]
	--extensions 'string' space/comma/pipe separated list of extensions [${params.extensions}]

	Other options:
	--extrafind (string) extra arguments for find. eg. " -size '+10000' "
	--lines (int) max number of files handled by each parallel MD5 job [${params.lines}]
	--headsize (int) max number of bytes for calculating the MD5 [${params.headsize}]
	--publishDir (dir) directory where the generated script 'fdups.bash' is copied [${params.publishDir}]
	--help print this message and exit

	Nextflow options:
	-w Work directory used by Nextflow.

	workflow Author: Pierre Lindenbaum @yokofakun 20200304
	=========================================

	"""
	}


	// --help : print the usage and stop with a success status.
	if (params.help)
		{
		helpMessage()
		exit 0
		}


	/*
	 * List every regular file found under params.directories whose name ends
	 * with one of params.extensions. Output: 'split.list', one path per line.
	 *
	 * The task runs in its own work directory, so relative search roots
	 * (including the default ".") are first resolved against the directory
	 * where the workflow was launched. The listed paths are then absolute and
	 * remain valid for tasks running in other work directories.
	 */
	process findFiles {
		tag "${params.extensions}"
		cache 'lenient'
		executor 'local'
		cpus 5
		output:
			file("split.list") into split_list
		script:
		// extensions -> alternatives of an emacs-style regex (default syntax of 'find -regex'), dots escaped.
		// The pipe is written without a backslash of its own: only the regex-level backslash is kept,
		// a backslash followed by a pipe is not a valid Groovy string escape.
		def alternatives = params.extensions.toString().split("[ ,\\|]+").findAll{T->!T.isEmpty()}.collect{T->T.replaceAll("\\.","\\\\.")}.join("\\|")
		def suffixes = ".*\\.\\(" + alternatives + "\\)\$"
		// whitespace separated roots -> absolute paths ; spaces coming from the launch directory
		// are backslash-escaped, the roots stay unquoted so shell globs keep working
		def roots = params.directories.toString().tokenize().collect{D->workflow.launchDir.resolve(D).normalize().toString().replace(" ","\\ ")}.join(" ")
		"""
		find ${roots} -type f -regex '${suffixes}' ${params.extrafind} > split.list
		"""
		}

	/*
	 * Sort and de-duplicate the list of paths, then cut it into chunk files
	 * of at most params.lines lines each. Output: 'chunks.txt', the absolute
	 * path of each chunk file, one per line.
	 */
	process split {
		tag "N=${params.lines}"
		executor 'local'
		input:
			file splitin from split_list
		output:
			file("chunks.txt") into chunk_list
		script:
		// the input holds one path per line: it is sorted as a whole line, no column extraction.
		// Pipes are plain shell pipes (no backslash in front of them).
		"""
		LC_ALL=C sort -T . "${splitin}" | uniq |\
			split -a 9 --additional-suffix=.list --lines=${params.lines} - chunck.
		find "\${PWD}" -type f -name "chunck.*.list" > chunks.txt
		"""
		}


	// Emit one item per non-empty line of chunks.txt. The whole line is the path of a chunk file
	// (it is not parsed as CSV, so a comma in the path is kept).
	chunk_list.splitText().map{T->T.trim()}.filter{T->!T.isEmpty()}.set{chunk_items}

	/* scan a set of files, extract the first bytes of each file and calculate their MD5.
	Output is

	MD5(comma)path(comma)date(comma)size


	sorted on MD5 and date
	*/
	process md5sum {
		tag "${chunk}"
		cache 'lenient'
		input:
			val chunk from chunk_items
		output:
			file("md5.list") into md5_list
		script:
		// 'chunk' is the path of a text file holding one file path per line.
		// For each path that is still a regular file, one line is written:
		//   MD5 of the first params.headsize bytes , path , modification time (epoch) , size (bytes)
		// 'IFS= read -r' keeps backslashes and leading/trailing blanks of the path untouched.
		// Pipes are plain shell pipes (no backslash in front of them).
		"""
		set -o pipefail

		while IFS= read -r F
		do
			if [ -f "\${F}" ]; then
				head -c '${params.headsize}' "\${F}" | md5sum | cut -d ' ' -f1 | tr "\\n" ","
				printf '%s,' "\${F}"
				stat -c "%Y,%s" "\${F}"
			fi
		done < "${chunk}" | sort -T. -t "," -k1,1 -k3,3n > md5.list
		"""
		}

	/**
	 * Merge the per-chunk lists into 'merged.list'.
	 * Each input is already sorted on MD5 (column 1) then date (column 3, numeric),
	 * so 'sort --merge' with the same keys only has to interleave them.
	 */
	process merge {
		executor "local"
		tag "N=${L.size()}"
		input:
			val L from md5_list.collect()
		output:
			file("merged.list") into merged_list
		script:
		// each path is single-quoted so that blanks in a work directory do not split the argument
		def sortedLists = L.collect{F->"'" + F.toString() + "'"}.join(" ")
		"""
		sort -T . -t "," -k1,1 -k3,3n --merge ${sortedLists} > merged.list
		"""
		}

	/**
	 * Create the cleanup bash script 'fdups.bash' and copy it into params.publishDir.
	 *
	 * The process writes an awk program (jeter.awk) and runs it on the merged list,
	 * whose comma separated columns are read as: $1 = hash, $2 = path, $4 = size.
	 *
	 * For each run of consecutive lines sharing the same hash:
	 *   - the first path is kept as the reference (PREV_FILE);
	 *   - a following line with the same path as the reference produces a '##ERROR' comment;
	 *   - any other following path produces one '&&' chain:
	 *       test -f reference && test -f path && cmp reference path
	 *       && mv path path.back && ln -s reference path && rm path.back
	 *
	 * N sums the size column of every path for which a chain is printed. The final
	 * "Saved N bytes" line is computed when the script is generated, not when it runs,
	 * so it also counts chains whose 'cmp' later fails.
	 *
	 * The generated script is only written here; this process does not execute it.
	 */
	process createScript {
	publishDir "${params.publishDir}" , mode: 'copy', overwrite: true
	executor "local"
	input:
	file merged from merged_list
	output:
	file("fdups.bash") into bash_script
	script:
	// The awk source goes through this Groovy string and then an unquoted shell
	// here-document before awk reads it, hence the stacked backslashes.
	"""
	cat << __EOF__ > jeter.awk
	BEGIN {
	printf("#!/bin/bash\\nset -e\\n");
	PREV_HASH="";
	PREV_FILE="";
	FS=","
	}
	{
	FNAME=\\\$2;
	if(\\\$1!=PREV_HASH) {
	PREV_HASH = \\\$1;
	PREV_FILE = FNAME;
	}
	else if(FNAME==PREV_FILE) {
	printf("##ERROR %s %s\\n",FNAME,PREV_FILE);
	}
	else
	{
	printf("# %s\\n",\\\$1);
	printf("test -f \\"%s\\" ", PREV_FILE);
	printf(" && test -f \\"%s\\" ", FNAME);
	printf(" && cmp \\"%s\\" \\"%s\\" ", PREV_FILE,FNAME);
	printf(" && mv -v \\"%s\\" \\"%s.back\\" ", FNAME,FNAME);
	printf(" && ln -s \\"%s\\" \\"%s\\" ",PREV_FILE, FNAME);
	printf(" && rm -v \\"%s.back\\"\\n", FNAME);
	N+=int(\\\$4);
	printf("\\n");
	}
	}

	END {
	printf("echo 'Saved %d bytes.'\\n\",N);
	}
	__EOF__


	awk -f jeter.awk "${merged}" > fdups.bash

	"""
	}