Created
June 9, 2021 08:37
-
-
Save lindenb/00267989d5c14289ab5b20d2bae68b9f to your computer and use it in GitHub Desktop.
Fast GATK: calling a small region for a large number of Bams with GATK HaplotypeCaller
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
The MIT License (MIT) | |
Copyright (c) 2021 Pierre Lindenbaum | |
Permission is hereby granted, free of charge, to any person obtaining a copy | |
of this software and associated documentation files (the "Software"), to deal | |
in the Software without restriction, including without limitation the rights | |
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
copies of the Software, and to permit persons to whom the Software is | |
furnished to do so, subject to the following conditions: | |
The above copyright notice and this permission notice shall be included in all | |
copies or substantial portions of the Software. | |
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
SOFTWARE. | |
*/ | |
//package com.github.lindenb.jvarkit.tools.gatk; | |
import java.security.MessageDigest; | |
import java.security.NoSuchAlgorithmException; | |
import org.broadinstitute.hellbender.*; | |
import java.util.*; | |
import htsjdk.samtools.util.*; | |
import java.io.*; | |
import org.broadinstitute.barclay.argparser.Argument; | |
import org.broadinstitute.barclay.argparser.CommandLineArgumentParser; | |
import org.broadinstitute.hellbender.cmdline.CommandLineProgram; | |
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; | |
import org.apache.logging.log4j.LogManager; | |
import org.apache.logging.log4j.Logger; | |
public class FastGatk extends Main { | |
private static final Logger LOG = LogManager.getLogger(Main.class); | |
@Argument(fullName = "base-dir", shortName = "D", doc = "base workfing directory",optional=false) | |
private File baseDir=null; | |
@Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, doc = "File to which variants should be written") | |
private File finalVcf = null; | |
@Argument(fullName = StandardArgumentDefinitions.VERBOSITY_NAME, shortName = StandardArgumentDefinitions.VERBOSITY_NAME, doc = "Control verbosity of logging.", common = true, optional = true) | |
public Log.LogLevel VERBOSITY = Log.LogLevel.INFO; | |
@Argument(fullName = StandardArgumentDefinitions.REFERENCE_LONG_NAME, shortName = StandardArgumentDefinitions.REFERENCE_SHORT_NAME, doc = "Indexed fasta reference", common = true, optional = false) | |
private File reference=null; | |
@Argument(fullName = StandardArgumentDefinitions.INPUT_LONG_NAME, shortName = StandardArgumentDefinitions.INPUT_SHORT_NAME, doc = "A file containing the path to the bams", common = true, optional = false) | |
private File bamsIn= null; | |
@Argument(fullName = "minimum-mapq", shortName = "Q", doc = "minimum mapping quality", common = true, optional = true) | |
private int mapq= 10; | |
@Argument(fullName = "dbsnp", shortName = "dbsnp", doc = "dbsnp file", common = true, optional = true) | |
private File dbsnpFile= null; | |
@Argument(fullName = "force", shortName = "force", doc = "do not redo if file(s) exist", common = true, optional = true) | |
private boolean forceRedo =false; | |
@Argument(fullName = "batch-size", shortName = "batch-size", doc = "batch size", common = true, optional = true) | |
private int batchSize=50; | |
@Argument(fullName = StandardArgumentDefinitions.INTERVALS_LONG_NAME, shortName = StandardArgumentDefinitions.INTERVALS_SHORT_NAME, doc = "interval chr:start-end", common = true, optional = false) | |
private String interval= null; | |
//private File tmpDirectory = null; | |
/** return md5 of a string */ | |
public static final String md5(final String in) { | |
final MessageDigest _md5; | |
try { | |
_md5 = java.security.MessageDigest.getInstance("MD5"); | |
} catch (final NoSuchAlgorithmException e) { | |
throw new RuntimeException("MD5 algorithm not found", e); | |
} | |
_md5.reset(); | |
_md5.update(in.getBytes()); | |
String s = new java.math.BigInteger(1, _md5.digest()).toString(16); | |
if (s.length() != 32) { | |
final String zeros = "00000000000000000000000000000000"; | |
s = zeros.substring(0, 32 - s.length()) + s; | |
} | |
return s; | |
} | |
private void execute(final List<String> argv) throws Exception { | |
final String[] args = argv.toArray(new String[argv.size()]); | |
LOG.info(getCommandLineName()+":Executing: gatk "+ String.join(" ",argv)); | |
final CommandLineProgram program = | |
this.setupConfigAndExtractProgram(args, | |
this.getPackageList(), | |
this.getClassList(), | |
this.getCommandLineName() | |
); | |
final Object result = Main.runCommandLineProgram(program, args); | |
if(result==null) return; | |
if(Boolean.TRUE.equals(result)) return; | |
LOG.warn("Returned "+ result.getClass()); | |
LOG.error("Result is "+ result); | |
final Throwable err= (result instanceof Throwable?Throwable.class.cast(result):null); | |
if(err!=null) { | |
throw new RuntimeException(err); | |
} | |
else | |
{ | |
throw new RuntimeException("Failure"); | |
} | |
} | |
private File haplotypeCaller(final String bam) throws Exception { | |
LOG.info(bam); | |
final String outfname = md5(bam+":"+this.interval)+".g.vcf.gz"; | |
final File out = new File(this.baseDir,outfname); | |
final File outtbi = new File(this.baseDir,outfname+".tbi"); | |
if(!this.forceRedo && out.exists() && outtbi.exists()) { | |
LOG.info("File already exists:"+out+" for "+bam); | |
return out; | |
} | |
final List<String> cmd= new ArrayList<>(); | |
cmd.add("HaplotypeCaller"); | |
cmd.add("-R"); | |
cmd.add(this.reference.toString()); | |
cmd.add("-I"); | |
cmd.add(bam); | |
cmd.add("-L"); | |
cmd.add(this.interval); | |
cmd.add("-ERC"); | |
cmd.add("GVCF"); | |
cmd.add("-O"); | |
cmd.add(out.toString()); | |
cmd.add("--tmp-dir"); | |
cmd.add(this.baseDir.toString()); | |
if(this.dbsnpFile!=null) { | |
cmd.add("--dbsnp"); | |
cmd.add(this.dbsnpFile.toString()); | |
} | |
if(this.mapq>0) { | |
cmd.add("--minimum-mapping-quality"); | |
cmd.add(String.valueOf(this.mapq)); | |
} | |
execute(cmd); | |
IOUtil.assertFileIsReadable(out); | |
IOUtil.assertFileIsReadable(outtbi); | |
return out; | |
} | |
private File dbImport(final List<File> gvcfs) throws Exception { | |
final String outfname = md5("database:"+this.interval); | |
final File database = new File(this.baseDir,outfname+".db"); | |
final File dbFlag = new File(this.baseDir,outfname+".db.flag"); | |
if(!this.forceRedo && database.exists() && dbFlag.exists() && | |
IOUtil.slurp(dbFlag).equals("0") | |
) { | |
LOG.info("Database was already processed:" + database); | |
return database; | |
} | |
if(database.exists() && database.isDirectory()) { | |
LOG.warn("Deleting old "+database); | |
IOUtil.deleteDirectoryTree(database); | |
} | |
final List<String> cmd = new ArrayList<>(); | |
cmd.add("GenomicsDBImport"); | |
cmd.add("--genomicsdb-workspace-path"); | |
cmd.add(database.toString()); | |
cmd.add("-L"); | |
cmd.add(this.interval); | |
cmd.add("--tmp-dir"); | |
cmd.add(this.baseDir.toString()); | |
if(this.batchSize>0) { | |
cmd.add("--batch-size"); | |
cmd.add(String.valueOf(this.batchSize)); | |
if(gvcfs.size()/this.batchSize>100) { | |
// Use the consolidate flag if more than a hundred batches were used. | |
cmd.add("--consolidate"); | |
} | |
} | |
for(final File gvcf : gvcfs) { | |
cmd.add("-V"); | |
cmd.add(gvcf.toString()); | |
} | |
execute(cmd); | |
IOUtil.assertDirectoryIsReadable(database); | |
try(PrintWriter pw = new PrintWriter(dbFlag)) { | |
pw.print("0"); | |
pw.flush(); | |
} | |
return database; | |
} | |
private File genotype(final File database) throws Exception { | |
final List<String> cmd = new ArrayList<>(); | |
cmd.add("GenotypeGVCFs"); | |
cmd.add("-V"); | |
cmd.add("gendb://"+database.toString()); | |
cmd.add("-O"); | |
cmd.add(this.finalVcf.toString()); | |
cmd.add("-R"); | |
cmd.add(this.reference.toString()); | |
cmd.add("--tmp-dir"); | |
cmd.add(this.baseDir.toString()); | |
if(this.dbsnpFile!=null) { | |
cmd.add("--dbsnp"); | |
cmd.add(this.dbsnpFile.toString()); | |
} | |
execute(cmd); | |
IOUtil.assertFileIsReadable(this.finalVcf); | |
return this.finalVcf; | |
} | |
private int doWork(final String[] args) { | |
try { | |
final CommandLineArgumentParser cmdLineParser = new CommandLineArgumentParser(this); | |
final boolean ret = cmdLineParser.parseArguments(System.err, args); | |
if (!ret) { | |
System.err.println(cmdLineParser.usage(false, false)); | |
return -1; | |
} | |
//create workdir | |
if(!this.baseDir.exists() && !this.baseDir.mkdir()) { | |
LOG.error("Cannot create "+this.baseDir); | |
return -1; | |
} | |
final List<String> bams = IOUtil.slurpLines(this.bamsIn); | |
if (bams.isEmpty()) { | |
LOG.error("No Bam was provided"); | |
return -1; | |
} | |
final List<File> gvcfs = new ArrayList<>(bams.size()); | |
for(String bam:bams) { | |
gvcfs.add(haplotypeCaller(bam)); | |
} | |
final File database = dbImport(gvcfs); | |
genotype(database); | |
LOG.info("OK "+ this.finalVcf); | |
return 0; | |
} | |
catch(final Throwable err) { | |
err.printStackTrace(); | |
return -1; | |
} | |
} | |
@Override | |
protected String getCommandLineName() { | |
return this.getClass().getSimpleName(); | |
} | |
public static void main(final String[] args) { | |
int ret= new FastGatk().doWork(args); | |
System.exit(ret); | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# | |
# Script | |
# | |
# real 10m33.301s | |
# user 2m52.326s | |
# sys 0m15.430s | |
# Wrapper | |
# | |
# real 4m44.807s | |
# user 1m38.720s | |
# sys 0m6.059s | |
# | |
all: result1.vcf.gz result2.vcf.gz | |
result2.vcf.gz: standalone.sh jeter.list | |
time bash ./standalone.sh | |
result1.vcf.gz : fastgatk.jar jeter.list | |
rm -rf TMP1 | |
time java -Djava.io.tmpdir=TMP1 -cp ${GATKJAR}:$< FastGatk \ | |
-I jeter.list --base-dir TMP1 --output $@ \ | |
--reference /home/lindenbaum-p/src/jvarkit/src/test/resources/rotavirus_rf.fa \ | |
-L RF01 2> log.err | |
jeter.list: | |
find /home/lindenbaum-p/src/jvarkit/src/test/resources/ -type f -name "S*.bam" > jeter.list | |
fastgatk.jar: FastGatk.java | |
rm -rf TMP | |
mkdir TMP | |
javac -d TMP -cp ${GATKJAR} -sourcepath . $< | |
jar cvf $@ -C TMP . | |
rm -rf TMP | |
clean: | |
rm -rf fastgatk.jar TMP1 TMP2 result1.vcf.gz result1.vcf.gz.tbi result2.vcf.gz result2.vcf.gz.tbi |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
ref: https://twitter.com/yokofakun/status/1402548621715951616