Michael Paulini epaule

epaule / hap_bedgraph.py

Last active November 14, 2024 11:32

	#!/usr/bin/env python3
	# usage: python3 hap_bedgraph.py FASTA > BEDGRAPH

	import sys
	import re
	from Bio import SeqIO
	ns=re.compile("[Nn]+")
	for seq_record in SeqIO.parse(sys.argv[1], "fasta"):
	for m in ns.finditer(str(seq_record.seq)):
	print(f'{seq_record.id}\t{m.start()}\t{m.end()+1}\t200')

epaule / hap2_hap1_ID_mapping.sh

Created November 13, 2024 10:00

rename sequences in fasta based on mashmap mapping

	#!/bin/bash

	# Tom Mathers.

	# Script to map hap2 chromosome (SUPER) IDs to hap1 IDs for genomeark assemblies.

	# Script aligns hap2 to hap1 with mashmap and selects the longest alignment per chrom. Only scaffolds with SUPER IDs are included in the output mapping.

	## NOTE - the script filters out sex known sex chromsomes (X, Y, Z and W). These are not reported in the final mapping file.

epaule / remove_ids_from_fasta.rb

Created November 11, 2024 11:38

remove ids from a fasta file

	#!/usr/bin/env ruby
	# usage: ruby remove_id_from_fasta.rb FILE_WITH_IDS FASTA_FILE

	require 'bio'

	ids = []
	File.foreach(ARGV.shift, chomp: true) do \|line\|
	ids << line
	end

epaule / get_rnaseq_number_for_species.rb

Created December 4, 2023 11:32

	#!/nfs/users/nfs_m/mh6/.linuxbrew/homebrew/bin/ruby
	# usage: ruby get_rnaseq_number_for_species.rb "Xestospongia muta"

	require "net/http"
	require "uri"
	require "cgi"

	species = ARGV[0]

	species = CGI.escapeURIComponent(species)

epaule / gist:6a6b78f07a9ddc64e0a58fbd8c12094e

Last active November 23, 2023 13:19

find_omark_contamination.pl

epaule / get_lineage_from_species.cr

Last active November 23, 2023 13:21

get taxonomic lineage from ENA for a species name

	#!/usr/bin/env crystal
	require "http/client"
	require "json"

	species = ARGV[0]

	species = species.gsub(/\s/, "%20")

	r = HTTP::Client.get("https://www.ebi.ac.uk/ena/taxonomy/rest/scientific-name/#{species}", headers: HTTP::Headers{"Accept" => "application/json"})
	raise "cannot get the taxonomy" unless r.success?

epaule / mark_scaffolds.rb

Last active November 16, 2023 11:38

	#!/usr/bin/env ruby
	# frozen_string_literal: true

	# add > signs to the nearest gaps in a TPF based on AGP breaks
	# usage: ./mark_scaffolds.rb -a AGP -t TPF -o TPF2

	require 'optparse'

	# get the gaps from the AGP as Hash of Arrays [seq_id] => [1,2,3]
	def parse_agp(file)

epaule / cut.txt

Last active September 7, 2023 09:18

chromosome renaming

	Step 9 - Rename and re-run maps
	It is likely that between the pairs of chromosomes there will be discrepancies in size. This will result in the files being in different orders when they get size sorted. This needs to be corrected before submission. The can be done manually or using the following method:

	Run:
	bsub -G grit-grp -n 16 -e e_hap2hap_mapping -o o_hap2hap_mapping -M 16000 -R 'select[mem>16000] rusage[mem=16000] span[hosts=1]' sh /software/grit/projects/vgp_curation_scripts/hap2_hap1_ID_mapping.sh <h1_fasta> <h2_fasta>


	Then:
	/software/grit/projects/vgp_curation_scripts/update_mapping.rb -c <renamed_hap2_chroms.csv> -f <hap2.fa> -t <hap2_hap1.tsv> > hap2.renamed.fa

epaule / fix_capitalisation.bash

Last active August 21, 2023 11:58

	#!/bin/bash

	# simple files
	for f in .bed .contamination recommendation report.txt taxonomy.rpt //combined_summary.csv .fa .gb
	do
	perl -i_original -pne 's/scaffold_/SCAFFOLD_/' $f
	done

	# compressed fastas
	for f in *.gz

epaule / BUSCO_synteny_v1.1.sh

Last active July 17, 2023 14:59

curation scripts

	#!/bin/bash

	# Script by Tom Mathers.

	# Script requires 32 cores and ~50Gb ram (depending on genome size).
	# Seqkit and BUSCO need to be in path.
	# Scaffold IDs need to be in final tol format.
	# Query and ref short IDs need to be two letter codes eg. "Ac".
	# Query and ref fasta files need to be in working dir.