-
-
Save cgpu/1b60bd5a6b6f7dc126144a4dc700c872 to your computer and use it in GitHub Desktop.
Nextflow workflow that finds duplicated files and creates a bash script replacing each duplicate with a symbolic link
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// ---- Default parameters (each can be overridden on the command line as --name value) ----

// What to scan: root directory handed to find(1) and the file extensions to keep.
params.directories = "."
params.extensions  = "bam bai"
params.extrafind   = ""

// How to scan: bytes hashed per file and number of paths per parallel chunk.
params.headsize = 100000
params.lines    = 1000

// Where the generated script is published, and the --help switch.
params.publishDir = "."
params.help       = false
/** Log the usage text, including the current defaults of --lines and --headsize. */
def helpMessage() {
    def usage = """
=========================================
Usage:
find duplicate files, generate a bash script to generate symbolic links replacing the duplicates.
Mandatory arguments:
--directories (dir) find files under dir
--extensions 'string' space/comma/pipe separated list of extensions
Other options:
--extrafind (string) extra arguments for find. eg. " -size '+10000' "
--lines (int) split the input into 'n' parallel jobs [${params.lines}]
--headsize (int) max number of bytes for calculating the MD5 [${params.headsize}]
--publishDir (dir)
Nextflow options:
-w Work directory used by Nextflow.
workflow Author: Pierre Lindenbaum @yokofakun 20200304
=========================================
"""
    log.info usage
}

// --help: show the usage text and stop before any process is scheduled.
if (params.help) {
    helpMessage()
    exit 0
}
/**
 * List every regular file under --directories whose name ends with one of the
 * requested --extensions; one absolute path per line in split.list.
 *
 * The task itself runs inside its Nextflow work directory, so a relative
 * --directories (including the default ".") would otherwise be resolved against
 * that work directory, and any relative path emitted could not be opened by the
 * downstream tasks, which run in their own work directories. find(1) is therefore
 * executed from the directory the workflow was launched in, and every path that
 * does not start with "/" is prefixed with that launch directory.
 */
process findFiles {
    tag "${params.extensions}"
    cache 'lenient'
    executor 'local'
    cpus 5

    output:
    file("split.list") into split_list

    script:
    // Build a find(1) regex of the form  .*\.\(ext1\|ext2\)$  from the space/comma/pipe separated list.
    def suffixes = ".*\\.\\("+ params.extensions.split("[ ,\\|]+").findAll{T->!T.isEmpty()}.collect{T->T.replaceAll("\\.","\\\\.")}.join("\\|") +"\\)\$"
    """
    # the pipe below must not hide a failing find
    set -o pipefail
    export FDUPS_LAUNCH_DIR="${workflow.launchDir}"
    ( cd "\${FDUPS_LAUNCH_DIR}" && find ${params.directories} -type f -regex '${suffixes}' ${params.extrafind} ) |
        awk '{ prefix = (substr(\$0, 1, 1) == "/") ? "" : ENVIRON["FDUPS_LAUNCH_DIR"] "/"; print prefix \$0 }' > split.list
    """
}
/**
 * De-duplicate the list of paths and cut it into chunks of at most --lines paths.
 *
 * Input : the path list produced by findFiles (one path per line).
 * Output: chunks.txt, listing the absolute path of every chunk file created in
 *         this task's work directory.
 *
 * The input holds plain paths, not tab-separated records, so the lines are fed
 * to sort unchanged; picking tab-delimited columns out of them would only damage
 * a path that happens to contain a tab.
 */
process split {
    tag "N=${params.lines}"
    executor 'local'

    input:
    file splitin from split_list

    output:
    file("chunks.txt") into chunk_list

    script:
    """
    LC_ALL=C sort -T . -u "${splitin}" |
        split -a 9 --additional-suffix=.list --lines=${params.lines} - chunck.
    find "\${PWD}" -type f -name "chunck.*.list" > chunks.txt
    """
}
// Emit the first column of every line of chunks.txt (the chunk file path) as one item each.
chunk_list
    .splitCsv(header: false, sep: ',', strip: true)
    .map { row -> row[0] }
    .set { chunk_items }
/* Scan the files listed in one chunk, read the first --headsize bytes of each
   and calculate the MD5 of those bytes.
   Paths that are not regular files at this point are skipped.
   Output (md5.list), one line per file:
       MD5(comma)path(comma)modification-time(comma)size
   sorted on MD5, then numerically on modification time.
   NOTE: a path containing a comma yields more than four columns.
*/
process md5sum {
    tag "${chunk}"
    cache 'lenient'

    input:
    val chunk from chunk_items

    output:
    file("md5.list") into md5_list

    script:
    """
    set -o pipefail
    # IFS= and -r keep leading/trailing blanks and backslashes of the path intact
    while IFS= read -r F
    do
        if [ -f "\${F}" ]; then
            head -c '${params.headsize}' "\${F}" | md5sum | cut -d ' ' -f1 | tr "\\n" ","
            echo -n "\${F},"
            stat -c "%Y,%s" "\${F}"
        fi
    done < "${chunk}" | sort -T. -t "," -k1,1 -k3,3n > md5.list
    """
}
/**
 * Combine all per-chunk md5.list files into a single merged.list.
 * Each input is already ordered on MD5 then date, so sort only has to merge them.
 */
process merge {
    executor "local"
    tag "N=${L.size()}"

    input:
    val L from md5_list.collect()

    output:
    file("merged.list") into merged_list

    script:
    // Space separated list of every md5.list collected from the md5sum tasks.
    def sortedInputs = L.join(" ")
    """
    sort -T . -t "," -k1,1 -k3,3n --merge ${sortedInputs} > merged.list
    """
}
/**
 * Create the cleanup bash script (fdups.bash) from the merged MD5 list.
 *
 * The merged list is read in order. The first file seen for a given MD5 is kept
 * as the reference; every following file with the same MD5 gets one command line
 * in the script which, only if both files still exist and cmp(1) finds them
 * identical, moves the duplicate aside, links it to the reference and removes
 * the moved copy. The script ends by echoing the summed size of the duplicates.
 *
 * Every path written into the script is single-quoted (embedded single quotes
 * are closed, escaped and reopened), so a file name containing a double quote,
 * a dollar sign, a backtick or a backslash cannot alter the generated commands.
 *
 * The script is published to --publishDir; it is not executed by the workflow.
 */
process createScript {
    publishDir "${params.publishDir}" , mode: 'copy', overwrite: true
    executor "local"

    input:
    file merged from merged_list

    output:
    file("fdups.bash") into bash_script

    script:
"""
cat << __EOF__ > jeter.awk
# quote a string for bash: wrap in single quotes, escape embedded single quotes
function shq(s,    sq, bs, n, i, parts, out) {
    sq = sprintf("%c", 39);
    bs = sprintf("%c", 92);
    n = split(s, parts, sq);
    out = sq parts[1];
    for (i = 2; i <= n; i++) {
        out = out sq bs sq sq parts[i];
    }
    return out sq;
}
BEGIN {
    printf("#!/bin/bash\\nset -e\\n");
    PREV_HASH="";
    PREV_FILE="";
    FS=",";
    N=0;
}
{
    FNAME=\\\$2;
    if(\\\$1!=PREV_HASH) {
        PREV_HASH = \\\$1;
        PREV_FILE = FNAME;
    }
    else if(FNAME==PREV_FILE) {
        printf("##ERROR %s %s\\n",FNAME,PREV_FILE);
    }
    else
    {
        REF = shq(PREV_FILE);
        DUP = shq(FNAME);
        BAK = shq(FNAME ".back");
        printf("# %s\\n",\\\$1);
        printf("test -f %s ", REF);
        printf(" && test -f %s ", DUP);
        printf(" && cmp %s %s ", REF, DUP);
        printf(" && mv -v %s %s ", DUP, BAK);
        printf(" && ln -s %s %s ", REF, DUP);
        printf(" && rm -v %s\\n", BAK);
        N+=int(\\\$4);
        printf("\\n");
    }
}
END {
    printf("echo 'Saved %d bytes.'\\n",N);
}
__EOF__
awk -f jeter.awk "${merged}" > fdups.bash
"""
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment