epaule · November 13, 2024 10:00
diff --git a/hap2_hap1_ID_mapping.sh b/hap2_hap1_ID_mapping.sh
 #!/bin/bash

 # Tom Mathers.

 # Script to map hap2 chromosome (SUPER) IDs to hap1 IDs for genomeark assemblies.

 # Script aligns hap2 to hap1 with mashmap and selects the longest alignment per chrom. Only scaffolds with SUPER IDs are included in the output mapping.

 ## NOTE - the script filters out sex known sex chromsomes (X, Y, Z and W). These are not reported in the final mapping file.

 ## NOTE 2 - the script filters align ments on min 96% mash identity. If extremee haplotype divergence is epected check results carefully and adjust as needed. 

 # Output file is a tsv with HAP2 id in column 1 and the corresponding HAP1 ID in column 2. 

 # Input files are hap1 and hap2 fasta files generated by rapid_join.pl (ie. with SUPER IDs).

 # Always give <hap1_fasta> as first argument and <hap2_fasta> as second argument.

 # Run on farm with at least 16 cores and 16 Gb ram.

 if [ $# -ne 2 ]; then
    echo $0: usage: sh hap2_hap1_ID_mapping.sh hap1_fasta hap2_fasta
    exit 1
 fi

 hap1=$1
 hap2=$2

 # Map hap 2 to hap 1


 mashmap -r ${hap1} -q ${hap2} -f one-to-one -t 16;

 # Make results table

 awk '{print $1}' mashmap.out | grep -v SCAFFOLD | grep -v unloc | grep -v X | grep -v Y | grep -v Z | grep -v W | uniq > tmp; while read id; do awk -v val=${id} '$1==val && $10 >= 96' mashmap.out | awk '{print $0 "\t" $9 - $8}' | sort -nrk11,11 | head -n1 ; done < tmp | awk '{print $1 "\t" $6}' > hap2_hap1.tsv

 # Sanity checking of results

 var1=$(cut -f1 hap2_hap1.tsv | sort -u | wc -l)

 var2=$(cut -f2 hap2_hap1.tsv | sort -u | wc -l)

 echo "$var1 $var2" | awk '{if ($1 != $2) print "\n\nWARNING duplicate chromosome names detected!\n\nPlease check results.\n\n"}'

 var3=$(cat tmp | wc -l)

 var4=$(cat hap2_hap1.tsv | wc -l)

 echo "$var3 $var4" | awk '{if ($1 > $2) print "\n\nWARNING not all autosomes are included in hap2_hap1.tsv\n\n"}'

 # Clean up

 rm tmp
diff --git a/update_mapping.rb b/update_mapping.rb
 #!/usr/bin/env ruby

 require 'optparse'

 fasta = ''
 table = ''
 debug = false
 new_mapping = false
 chromosome_list = false

 def get_mapping(file)
  mapping = Hash.new
  File.new(file).each_line{|line|
    line.chomp!
    c=line.split
    next unless c[1] =~ /SUPER_\d+/ # do not map sex chromosomes across 
    next if mapping.values.include?(c[1]) # create an artificial 1:1 mapping
    next if mapping.keys.include?(c[0]) # create an artificial 1:1 mapping
    mapping[c[0]]=c[1]
  }
  mapping
 end

 def create_chromosme_list_line(id)
    /(SUPER_[A-Z0-9]+)(_unloc_\d+)*/.match(id)
    name   = "#{$1}"
    suffix = "#{$2}"
    stripped_name = name.sub("SUPER_",'')
    line = []
    if suffix.length >=1
      line = [name+suffix,stripped_name,'no']
    else
      line = [name,stripped_name,'yes']
    end
    line.join(',')
 end

 OptionParser.new do |parser|
  parser.banner = "Usage: update_mapping --fasta FASTA_FILE --mashmap_table TABLE_TSV [ --debug | --new_mapping NEW_MAPPING_TSV | --help | --chromosome_list NEW_CHROMOSOME_LIST]"
  parser.on("-f", "--fasta FASTA_FILE") { |f| fasta = f }
  parser.on("-t", "--mashmap_table TABLE_TSV") { |f| table = f }
  parser.on("-d", "--debug") { debug = true }
  parser.on("-n", "--new_mapping NEW_MAPPING_TSV") { |f| new_mapping = f }
  parser.on("-c", "--chromosome_name NEW_CHROMOSOME_LIST") {|f| chromosome_list = f}
  parser.on("-h", "--help") do
    puts parser
    exit
  end
 end.parse!

 hap2_hap1_mapping = get_mapping(table)

 count=1
 new_table = File.new(new_mapping,'w') if new_mapping
 new_chrom_file = File.new(chromosome_list,'w') if chromosome_list

 File.new(fasta).each_line{|line|
    if />(SUPER_\d+)(_unloc_\d+)*/.match(line)
      name   = "#{$1}"
      suffix = "#{$2}"
      if hap2_hap1_mapping.has_key?(name)
        line = line.sub(name, hap2_hap1_mapping[name])
        $stderr.puts("renaming #{name}#{suffix} => #{hap2_hap1_mapping[name]}#{suffix}") if debug
        new_table.puts([name+suffix, hap2_hap1_mapping[name]+suffix].join("\t")) if new_mapping
        new_chrom_file.puts(create_chromosme_list_line(hap2_hap1_mapping[name]+suffix)) if chromosome_list
      else
        while hap2_hap1_mapping.reverse_each.to_h.has_key?("SUPER_#{count}")
          count+=1
        end
        id = "SUPER_#{count}"
        line = line.sub(name,id)
        hap2_hap1_mapping[id]=name
        $stderr.puts("renaming #{name}#{suffix} => #{id}#{suffix}") if debug
        new_table.puts( [name+suffix,hap2_hap1_mapping[name]+suffix].join("\t")) if new_mapping
        new_chrom_file.puts(create_chromosme_list_line(hap2_hap1_mapping[name]+suffix)) if chromosome_list
      end
    elsif />(SUPER_\w+)(_unloc_\d+)*/.match(line)
      name   = "#{$1}"
      suffix = "#{$2}"
      new_chrom_file.puts(create_chromosme_list_line(name+suffix)) if chromosome_list
    end
    print line
 }
	#!/bin/bash

	# Tom Mathers.

	# Script to map hap2 chromosome (SUPER) IDs to hap1 IDs for genomeark assemblies.

	# Script aligns hap2 to hap1 with mashmap and selects the longest alignment per chrom. Only scaffolds with SUPER IDs are included in the output mapping.

	## NOTE - the script filters out sex known sex chromsomes (X, Y, Z and W). These are not reported in the final mapping file.

	## NOTE 2 - the script filters align ments on min 96% mash identity. If extremee haplotype divergence is epected check results carefully and adjust as needed.

	# Output file is a tsv with HAP2 id in column 1 and the corresponding HAP1 ID in column 2.

	# Input files are hap1 and hap2 fasta files generated by rapid_join.pl (ie. with SUPER IDs).

	# Always give <hap1_fasta> as first argument and <hap2_fasta> as second argument.

	# Run on farm with at least 16 cores and 16 Gb ram.

	if [ $# -ne 2 ]; then
	echo $0: usage: sh hap2_hap1_ID_mapping.sh hap1_fasta hap2_fasta
	exit 1
	fi

	hap1=$1
	hap2=$2

	# Map hap 2 to hap 1


	mashmap -r ${hap1} -q ${hap2} -f one-to-one -t 16;

	# Make results table

	awk '{print $1}' mashmap.out \| grep -v SCAFFOLD \| grep -v unloc \| grep -v X \| grep -v Y \| grep -v Z \| grep -v W \| uniq > tmp; while read id; do awk -v val=${id} '$1==val && $10 >= 96' mashmap.out \| awk '{print $0 "\t" $9 - $8}' \| sort -nrk11,11 \| head -n1 ; done < tmp \| awk '{print $1 "\t" $6}' > hap2_hap1.tsv

	# Sanity checking of results

	var1=$(cut -f1 hap2_hap1.tsv \| sort -u \| wc -l)

	var2=$(cut -f2 hap2_hap1.tsv \| sort -u \| wc -l)

	echo "$var1 $var2" \| awk '{if ($1 != $2) print "\n\nWARNING duplicate chromosome names detected!\n\nPlease check results.\n\n"}'

	var3=$(cat tmp \| wc -l)

	var4=$(cat hap2_hap1.tsv \| wc -l)

	echo "$var3 $var4" \| awk '{if ($1 > $2) print "\n\nWARNING not all autosomes are included in hap2_hap1.tsv\n\n"}'

	# Clean up

	rm tmp
	#!/usr/bin/env ruby

	require 'optparse'

	fasta = ''
	table = ''
	debug = false
	new_mapping = false
	chromosome_list = false

	def get_mapping(file)
	mapping = Hash.new
	File.new(file).each_line{\|line\|
	line.chomp!
	c=line.split
	next unless c[1] =~ /SUPER_\d+/ # do not map sex chromosomes across
	next if mapping.values.include?(c[1]) # create an artificial 1:1 mapping
	next if mapping.keys.include?(c[0]) # create an artificial 1:1 mapping
	mapping[c[0]]=c[1]
	}
	mapping
	end

	def create_chromosme_list_line(id)
	/(SUPER_[A-Z0-9]+)(_unloc_\d+)*/.match(id)
	name = "#{$1}"
	suffix = "#{$2}"
	stripped_name = name.sub("SUPER_",'')
	line = []
	if suffix.length >=1
	line = [name+suffix,stripped_name,'no']
	else
	line = [name,stripped_name,'yes']
	end
	line.join(',')
	end

	OptionParser.new do \|parser\|
	parser.banner = "Usage: update_mapping --fasta FASTA_FILE --mashmap_table TABLE_TSV [ --debug \| --new_mapping NEW_MAPPING_TSV \| --help \| --chromosome_list NEW_CHROMOSOME_LIST]"
	parser.on("-f", "--fasta FASTA_FILE") { \|f\| fasta = f }
	parser.on("-t", "--mashmap_table TABLE_TSV") { \|f\| table = f }
	parser.on("-d", "--debug") { debug = true }
	parser.on("-n", "--new_mapping NEW_MAPPING_TSV") { \|f\| new_mapping = f }
	parser.on("-c", "--chromosome_name NEW_CHROMOSOME_LIST") {\|f\| chromosome_list = f}
	parser.on("-h", "--help") do
	puts parser
	exit
	end
	end.parse!

	hap2_hap1_mapping = get_mapping(table)

	count=1
	new_table = File.new(new_mapping,'w') if new_mapping
	new_chrom_file = File.new(chromosome_list,'w') if chromosome_list

	File.new(fasta).each_line{\|line\|
	if />(SUPER_\d+)(_unloc_\d+)*/.match(line)
	name = "#{$1}"
	suffix = "#{$2}"
	if hap2_hap1_mapping.has_key?(name)
	line = line.sub(name, hap2_hap1_mapping[name])
	$stderr.puts("renaming #{name}#{suffix} => #{hap2_hap1_mapping[name]}#{suffix}") if debug
	new_table.puts([name+suffix, hap2_hap1_mapping[name]+suffix].join("\t")) if new_mapping
	new_chrom_file.puts(create_chromosme_list_line(hap2_hap1_mapping[name]+suffix)) if chromosome_list
	else
	while hap2_hap1_mapping.reverse_each.to_h.has_key?("SUPER_#{count}")
	count+=1
	end
	id = "SUPER_#{count}"
	line = line.sub(name,id)
	hap2_hap1_mapping[id]=name
	$stderr.puts("renaming #{name}#{suffix} => #{id}#{suffix}") if debug
	new_table.puts( [name+suffix,hap2_hap1_mapping[name]+suffix].join("\t")) if new_mapping
	new_chrom_file.puts(create_chromosme_list_line(hap2_hap1_mapping[name]+suffix)) if chromosome_list
	end
	elsif />(SUPER_\w+)(_unloc_\d+)*/.match(line)
	name = "#{$1}"
	suffix = "#{$2}"
	new_chrom_file.puts(create_chromosme_list_line(name+suffix)) if chromosome_list
	end
	print line
	}