Skip to content

Instantly share code, notes, and snippets.

View pgonzale60's full-sized avatar

Pablo Gonzalez pgonzale60

View GitHub Profile
@pgonzale60
pgonzale60 / btk_multiassem.nf
Created February 20, 2020 16:10
Blobtoolkit nextflow pipeline
#!/usr/bin/env nextflow
/*
========================================================================================
Explore assembly contamination. Started February 2020.
#### Author
Pablo González <[email protected]>
----------------------------------------------------------------------------------------
#! /bin/bash
# This script calculates number of protein coding genes and assembly, intron and coding exon span
# Usage
# calc_coding_intron_total_spans.sh assembly annotation species
echo species: $3
# For gffread, load conda env
source /ceph/software/conda/bin/activate /ceph/users/pgonzales/.conda/envs/funannotate
# Nigonize orthologs
library(tidyverse)
library(reshape2)
library(cluster)
# Inputs:
# Dictionary of chromosomes that remain untouched
priorNigon <- tibble(chromosome = c("ce_I", "ce_II", "ce_III", "ce_IV", "ce_V", "ot_V", "pp_ChrX"),
nigon = c(LETTERS[1:5], "N", "X"))
params.query = "queTest.fa"
params.ref = "refTest.fa"
params.out = "~/test.txt"
params.minqueryCov = 75
params.chunkSize = 100
Channel
.fromPath( params.query )
.ifEmpty { error "Cannot find any reads matching: ${params.query}" }
threads=60
RAMmem=60
condaEnv=/ceph/users/pgonzales/miniconda3/envs/callvars
outdir=/data/blaxter/Otipulae_lr/analyses/assemblies/freebayes/
pe_reads1=/data/blaxter/Otipulae_lr/analyses/assemblies/trimmd_Illumina/ERR1656469_1.fastq.gz
pe_reads2=/data/blaxter/Otipulae_lr/analyses/assemblies/trimmd_Illumina/ERR1656469_2.fastq.gz
assembly=/data/blaxter/Otipulae_lr/analyses/assemblies/gap5/O.tipAlanCuratedPreCorrection.fa
assemName=freebayes_otipulaeA
wkdir=/scratch/pgonzales/snippy/$assemName
library(tidyverse)
# Inputs: window size
windwSize <- 1e3
# Samtools depth result
illuCov <- read_tsv("oscheius_tipulae.local.v3_3_ERR1656469.cov",
col_names = c("chr", "pos", "cov"))
# Summarize
assembly=/data/blaxter/Otipulae_lr/raw/alternativeGenomeSources/assembly/oscheius_tipulae.local.v3_3.genomic.fa
windSize=1000
coverageFile=/data/blaxter/Otipulae_lr/analyses/qualityMetrics/mappingStats/bwa/freebayes_oscheius_tipulae.local.v3_3_ERR1656469.cov
# Load samtools and bedtools
source /ceph/software/conda/bin/activate /ceph/users/pgonzales/miniconda3/envs/callvars
# Index fasta file
samtools faidx ${assembly}
# Extract gene coordinates of protein coding genes.
# In this case GenBank source allows to distinguish
# protein coding from non-coding genes.
gunzip -c mxv1.200520.ragoo.rnm.gtf.gz | \
awk -F '\t' \
'BEGIN{OFS=FS}{if($2 == "GenBank" && $3 == "gene"){print}}' > genes.tsv
# Reorder busco output to match bedtools input format
awk -F '\t' 'BEGIN{OFS=FS}{if(NR>3 && $3 != ""){print $3, $4, $5, $1}}' \
busco_full_mxv1.tsv > busco_range.tsv
# Install snippy
conda install -c conda-forge -c bioconda -c defaults snippy
# Run
snippy --cpu 16 --ram 30 --outdir /storage/monarchs/sexBias/analyses/qc/callvars \
--ref mxv1.200520.ragoo.rnm.fa \
--R1 /storage/monarchs/sexBias/raw/sra/SRR10100549_1.fastq.gz \
--R2 /storage/monarchs/sexBias/raw/sra/SRR10100549_2.fastq.gz
# Summarize number of heterozigous sites
bcftools view -g "het" /storage/monarchs/sexBias/analyses/qc/callvars/SRR1010054/snps.raw.vcf | \
@pgonzale60
pgonzale60 / diptera_chrom_painting.Rmd
Last active August 6, 2020 20:19
Colour chromosomes of dipterans according to cluster of USCOs
---
title: "Chromosome evolution in Diptera"
output: html_notebook
---
Get chromosome numbers and sequence IDs correspondece of downloaded assemblies.
```{bash}
cat namedAssems/*.fa | grep ">" | sed -E 's/>([[:alnum:]_\.]*) .*chromosome ([[:alnum:]_\.]*).*/\1\t\2/' | grep -v ">" | gzip -c > seqIDs2chrom.tsv.gz