Skip to content

Instantly share code, notes, and snippets.

@epaule
Created November 13, 2024 10:00
Show Gist options
  • Save epaule/8c5cb7c07d53295b39cc0c14a27170d5 to your computer and use it in GitHub Desktop.
Save epaule/8c5cb7c07d53295b39cc0c14a27170d5 to your computer and use it in GitHub Desktop.
rename sequences in fasta based on mashmap mapping
#!/bin/bash
# Tom Mathers.
# Script to map hap2 chromosome (SUPER) IDs to hap1 IDs for genomeark assemblies.
# Script aligns hap2 to hap1 with mashmap and selects the longest alignment per chrom. Only scaffolds with SUPER IDs are included in the output mapping.
## NOTE - the script filters out sex known sex chromsomes (X, Y, Z and W). These are not reported in the final mapping file.
## NOTE 2 - the script filters align ments on min 96% mash identity. If extremee haplotype divergence is epected check results carefully and adjust as needed.
# Output file is a tsv with HAP2 id in column 1 and the corresponding HAP1 ID in column 2.
# Input files are hap1 and hap2 fasta files generated by rapid_join.pl (ie. with SUPER IDs).
# Always give <hap1_fasta> as first argument and <hap2_fasta> as second argument.
# Run on farm with at least 16 cores and 16 Gb ram.
if [ $# -ne 2 ]; then
echo $0: usage: sh hap2_hap1_ID_mapping.sh hap1_fasta hap2_fasta
exit 1
fi
hap1=$1
hap2=$2
# Map hap 2 to hap 1
mashmap -r ${hap1} -q ${hap2} -f one-to-one -t 16;
# Make results table
awk '{print $1}' mashmap.out | grep -v SCAFFOLD | grep -v unloc | grep -v X | grep -v Y | grep -v Z | grep -v W | uniq > tmp; while read id; do awk -v val=${id} '$1==val && $10 >= 96' mashmap.out | awk '{print $0 "\t" $9 - $8}' | sort -nrk11,11 | head -n1 ; done < tmp | awk '{print $1 "\t" $6}' > hap2_hap1.tsv
# Sanity checking of results
var1=$(cut -f1 hap2_hap1.tsv | sort -u | wc -l)
var2=$(cut -f2 hap2_hap1.tsv | sort -u | wc -l)
echo "$var1 $var2" | awk '{if ($1 != $2) print "\n\nWARNING duplicate chromosome names detected!\n\nPlease check results.\n\n"}'
var3=$(cat tmp | wc -l)
var4=$(cat hap2_hap1.tsv | wc -l)
echo "$var3 $var4" | awk '{if ($1 > $2) print "\n\nWARNING not all autosomes are included in hap2_hap1.tsv\n\n"}'
# Clean up
rm tmp
#!/usr/bin/env ruby
require 'optparse'
fasta = ''
table = ''
debug = false
new_mapping = false
chromosome_list = false
def get_mapping(file)
mapping = Hash.new
File.new(file).each_line{|line|
line.chomp!
c=line.split
next unless c[1] =~ /SUPER_\d+/ # do not map sex chromosomes across
next if mapping.values.include?(c[1]) # create an artificial 1:1 mapping
next if mapping.keys.include?(c[0]) # create an artificial 1:1 mapping
mapping[c[0]]=c[1]
}
mapping
end
def create_chromosme_list_line(id)
/(SUPER_[A-Z0-9]+)(_unloc_\d+)*/.match(id)
name = "#{$1}"
suffix = "#{$2}"
stripped_name = name.sub("SUPER_",'')
line = []
if suffix.length >=1
line = [name+suffix,stripped_name,'no']
else
line = [name,stripped_name,'yes']
end
line.join(',')
end
OptionParser.new do |parser|
parser.banner = "Usage: update_mapping --fasta FASTA_FILE --mashmap_table TABLE_TSV [ --debug | --new_mapping NEW_MAPPING_TSV | --help | --chromosome_list NEW_CHROMOSOME_LIST]"
parser.on("-f", "--fasta FASTA_FILE") { |f| fasta = f }
parser.on("-t", "--mashmap_table TABLE_TSV") { |f| table = f }
parser.on("-d", "--debug") { debug = true }
parser.on("-n", "--new_mapping NEW_MAPPING_TSV") { |f| new_mapping = f }
parser.on("-c", "--chromosome_name NEW_CHROMOSOME_LIST") {|f| chromosome_list = f}
parser.on("-h", "--help") do
puts parser
exit
end
end.parse!
hap2_hap1_mapping = get_mapping(table)
count=1
new_table = File.new(new_mapping,'w') if new_mapping
new_chrom_file = File.new(chromosome_list,'w') if chromosome_list
File.new(fasta).each_line{|line|
if />(SUPER_\d+)(_unloc_\d+)*/.match(line)
name = "#{$1}"
suffix = "#{$2}"
if hap2_hap1_mapping.has_key?(name)
line = line.sub(name, hap2_hap1_mapping[name])
$stderr.puts("renaming #{name}#{suffix} => #{hap2_hap1_mapping[name]}#{suffix}") if debug
new_table.puts([name+suffix, hap2_hap1_mapping[name]+suffix].join("\t")) if new_mapping
new_chrom_file.puts(create_chromosme_list_line(hap2_hap1_mapping[name]+suffix)) if chromosome_list
else
while hap2_hap1_mapping.reverse_each.to_h.has_key?("SUPER_#{count}")
count+=1
end
id = "SUPER_#{count}"
line = line.sub(name,id)
hap2_hap1_mapping[id]=name
$stderr.puts("renaming #{name}#{suffix} => #{id}#{suffix}") if debug
new_table.puts( [name+suffix,hap2_hap1_mapping[name]+suffix].join("\t")) if new_mapping
new_chrom_file.puts(create_chromosme_list_line(hap2_hap1_mapping[name]+suffix)) if chromosome_list
end
elsif />(SUPER_\w+)(_unloc_\d+)*/.match(line)
name = "#{$1}"
suffix = "#{$2}"
new_chrom_file.puts(create_chromosme_list_line(name+suffix)) if chromosome_list
end
print line
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment