Created
November 13, 2024 10:00
-
-
Save epaule/8c5cb7c07d53295b39cc0c14a27170d5 to your computer and use it in GitHub Desktop.
rename sequences in fasta based on mashmap mapping
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# Tom Mathers. | |
# Script to map hap2 chromosome (SUPER) IDs to hap1 IDs for genomeark assemblies. | |
# Script aligns hap2 to hap1 with mashmap and selects the longest alignment per chrom. Only scaffolds with SUPER IDs are included in the output mapping. | |
## NOTE - the script filters out sex known sex chromsomes (X, Y, Z and W). These are not reported in the final mapping file. | |
## NOTE 2 - the script filters align ments on min 96% mash identity. If extremee haplotype divergence is epected check results carefully and adjust as needed. | |
# Output file is a tsv with HAP2 id in column 1 and the corresponding HAP1 ID in column 2. | |
# Input files are hap1 and hap2 fasta files generated by rapid_join.pl (ie. with SUPER IDs). | |
# Always give <hap1_fasta> as first argument and <hap2_fasta> as second argument. | |
# Run on farm with at least 16 cores and 16 Gb ram. | |
if [ $# -ne 2 ]; then | |
echo $0: usage: sh hap2_hap1_ID_mapping.sh hap1_fasta hap2_fasta | |
exit 1 | |
fi | |
hap1=$1 | |
hap2=$2 | |
# Map hap 2 to hap 1 | |
mashmap -r ${hap1} -q ${hap2} -f one-to-one -t 16; | |
# Make results table | |
awk '{print $1}' mashmap.out | grep -v SCAFFOLD | grep -v unloc | grep -v X | grep -v Y | grep -v Z | grep -v W | uniq > tmp; while read id; do awk -v val=${id} '$1==val && $10 >= 96' mashmap.out | awk '{print $0 "\t" $9 - $8}' | sort -nrk11,11 | head -n1 ; done < tmp | awk '{print $1 "\t" $6}' > hap2_hap1.tsv | |
# Sanity checking of results | |
var1=$(cut -f1 hap2_hap1.tsv | sort -u | wc -l) | |
var2=$(cut -f2 hap2_hap1.tsv | sort -u | wc -l) | |
echo "$var1 $var2" | awk '{if ($1 != $2) print "\n\nWARNING duplicate chromosome names detected!\n\nPlease check results.\n\n"}' | |
var3=$(cat tmp | wc -l) | |
var4=$(cat hap2_hap1.tsv | wc -l) | |
echo "$var3 $var4" | awk '{if ($1 > $2) print "\n\nWARNING not all autosomes are included in hap2_hap1.tsv\n\n"}' | |
# Clean up | |
rm tmp |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
require 'optparse' | |
fasta = '' | |
table = '' | |
debug = false | |
new_mapping = false | |
chromosome_list = false | |
def get_mapping(file) | |
mapping = Hash.new | |
File.new(file).each_line{|line| | |
line.chomp! | |
c=line.split | |
next unless c[1] =~ /SUPER_\d+/ # do not map sex chromosomes across | |
next if mapping.values.include?(c[1]) # create an artificial 1:1 mapping | |
next if mapping.keys.include?(c[0]) # create an artificial 1:1 mapping | |
mapping[c[0]]=c[1] | |
} | |
mapping | |
end | |
def create_chromosme_list_line(id) | |
/(SUPER_[A-Z0-9]+)(_unloc_\d+)*/.match(id) | |
name = "#{$1}" | |
suffix = "#{$2}" | |
stripped_name = name.sub("SUPER_",'') | |
line = [] | |
if suffix.length >=1 | |
line = [name+suffix,stripped_name,'no'] | |
else | |
line = [name,stripped_name,'yes'] | |
end | |
line.join(',') | |
end | |
OptionParser.new do |parser| | |
parser.banner = "Usage: update_mapping --fasta FASTA_FILE --mashmap_table TABLE_TSV [ --debug | --new_mapping NEW_MAPPING_TSV | --help | --chromosome_list NEW_CHROMOSOME_LIST]" | |
parser.on("-f", "--fasta FASTA_FILE") { |f| fasta = f } | |
parser.on("-t", "--mashmap_table TABLE_TSV") { |f| table = f } | |
parser.on("-d", "--debug") { debug = true } | |
parser.on("-n", "--new_mapping NEW_MAPPING_TSV") { |f| new_mapping = f } | |
parser.on("-c", "--chromosome_name NEW_CHROMOSOME_LIST") {|f| chromosome_list = f} | |
parser.on("-h", "--help") do | |
puts parser | |
exit | |
end | |
end.parse! | |
hap2_hap1_mapping = get_mapping(table) | |
count=1 | |
new_table = File.new(new_mapping,'w') if new_mapping | |
new_chrom_file = File.new(chromosome_list,'w') if chromosome_list | |
File.new(fasta).each_line{|line| | |
if />(SUPER_\d+)(_unloc_\d+)*/.match(line) | |
name = "#{$1}" | |
suffix = "#{$2}" | |
if hap2_hap1_mapping.has_key?(name) | |
line = line.sub(name, hap2_hap1_mapping[name]) | |
$stderr.puts("renaming #{name}#{suffix} => #{hap2_hap1_mapping[name]}#{suffix}") if debug | |
new_table.puts([name+suffix, hap2_hap1_mapping[name]+suffix].join("\t")) if new_mapping | |
new_chrom_file.puts(create_chromosme_list_line(hap2_hap1_mapping[name]+suffix)) if chromosome_list | |
else | |
while hap2_hap1_mapping.reverse_each.to_h.has_key?("SUPER_#{count}") | |
count+=1 | |
end | |
id = "SUPER_#{count}" | |
line = line.sub(name,id) | |
hap2_hap1_mapping[id]=name | |
$stderr.puts("renaming #{name}#{suffix} => #{id}#{suffix}") if debug | |
new_table.puts( [name+suffix,hap2_hap1_mapping[name]+suffix].join("\t")) if new_mapping | |
new_chrom_file.puts(create_chromosme_list_line(hap2_hap1_mapping[name]+suffix)) if chromosome_list | |
end | |
elsif />(SUPER_\w+)(_unloc_\d+)*/.match(line) | |
name = "#{$1}" | |
suffix = "#{$2}" | |
new_chrom_file.puts(create_chromosme_list_line(name+suffix)) if chromosome_list | |
end | |
print line | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment