Last active
June 12, 2018 07:36
-
-
Save raphink/a23b406ba7e62cf12b3adf06576d4979 to your computer and use it in GitHub Desktop.
A script to prepare FTDNA Chromosome Browser CSV extract for Gephi, inspired by http://twigsofyore.blogspot.com/2018/03/triangulation-is-icing-not-cake.html
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
#
# Prepare an FTDNA Chromosome Browser CSV extract for Gephi.
#
# Reads the CSV file named on the command line and writes two files into the
# current directory:
#
#   nodes.csv - one row per match; "Modularity Class" holds the match's
#               total cM over the segments that pass the thresholds
#   edges.csv - one row per pair of matches whose segments overlap; "Weight"
#               is the summed overlap length in positions
#
# A segment is counted only if it meets both --min-cm and --min-snp. Matches
# whose counted total is below --min-total-cm are dropped.

require 'optparse'
require 'csv'

# Thresholds used when the corresponding command-line option is not given.
DEFAULT_OPTIONS = {
  min_cm: 7,
  min_snp: 700,
  min_total_cm: 20
}.freeze

# Builds the command-line parser. Parsed values are stored into +options+.
#
# @param options [Hash] mutable hash receiving :min_cm, :min_snp, :min_total_cm
# @return [OptionParser]
def build_option_parser(options)
  OptionParser.new do |opts|
    opts.banner = "Usage: #{$PROGRAM_NAME} [options] <file>.csv"

    opts.on('--min-cm CM', Integer, 'Set min cM for segment filtering') do |value|
      options[:min_cm] = value
    end

    opts.on('--min-snp SNP', Integer, 'Set min SNPs for segment filtering') do |value|
      options[:min_snp] = value
    end

    opts.on('--min-total-cm CM', Integer, 'Ignore nodes with cM under value') do |value|
      options[:min_total_cm] = value
    end
  end
end

# Groups data rows (header already removed) by match name.
#
# Expected columns:
# NAME,MATCHNAME,CHROMOSOME,START LOCATION,END LOCATION,CENTIMORGANS,MATCHING SNPS
#
# @param rows [Array<Array<String, nil>>]
# @return [Hash{String => Hash}] match name => { id:, segments: [...] };
#   ids are assigned from 1 in order of first appearance
def collect_nodes(rows)
  next_id = 0

  rows.each_with_object({}) do |row, nodes|
    name = row[1]
    # Rows without a match name (e.g. blank lines) would create a nil node.
    next if name.nil? || name.strip.empty?
    # TBD entries are not useful
    next if name == 'TBD TBD'

    nodes[name] ||= { id: (next_id += 1), segments: [] }
    nodes[name][:segments] << {
      chromosome: row[2].to_i,
      start: row[3].to_i,
      end: row[4].to_i,
      # NOTE(review): to_i drops any fractional part of the cM value -
      # confirm this is intended before relying on the totals.
      cm: row[5].to_i,
      snp: row[6].to_i
    }
  end
end

# Whether a segment meets both the cM and the SNP threshold.
def significant?(segment, options)
  segment[:cm] >= options[:min_cm] && segment[:snp] >= options[:min_snp]
end

# Sum of cM over the segments that pass the thresholds.
def total_cm(segments, options)
  segments.inject(0) do |sum, segment|
    significant?(segment, options) ? sum + segment[:cm] : sum
  end
end

# Stores :total_cm on every node and returns only the nodes whose total
# reaches options[:min_total_cm].
#
# @param nodes [Hash{String => Hash}] as returned by collect_nodes
# @return [Hash{String => Hash}]
def select_nodes(nodes, options)
  nodes.each_value { |node| node[:total_cm] = total_cm(node[:segments], options) }
  nodes.reject { |_name, node| node[:total_cm] < options[:min_total_cm] }
end

# Length of the position range shared by two segments.
#
# Returns 0 when the segments are on different chromosomes or merely touch.
# Uses min(end) - max(start) so that a segment fully contained in the other
# contributes only its own length.
def overlap_length(first, second)
  return 0 unless first[:chromosome] == second[:chromosome]

  shared = [first[:end], second[:end]].min - [first[:start], second[:start]].max
  [shared, 0].max
end

# Builds one undirected edge per pair of nodes with overlapping segments.
#
# Only segments passing the thresholds are compared. The weight is the sum of
# overlap lengths over all segment pairs. This still only uses positions, so
# it is a rough measure.
#
# @param nodes [Hash{String => Hash}] nodes carrying :id and :segments
# @return [Array<Hash>] edges with :source, :target, :type, :id, :weight;
#   ids are assigned from 1 in emission order
def build_edges(nodes, options)
  # Filter each node's segments once instead of once per comparison.
  candidates = nodes.map do |_name, node|
    [node[:id], node[:segments].select { |segment| significant?(segment, options) }]
  end

  edges = []
  # combination(2) visits every unordered pair exactly once, in file order.
  candidates.combination(2) do |(source, source_segments), (target, target_segments)|
    weight = source_segments.inject(0) do |sum, a|
      target_segments.inject(sum) { |inner, b| inner + overlap_length(a, b) }
    end
    next unless weight > 0

    edges << {
      source: source,
      target: target,
      type: 'Undirected',
      id: edges.size + 1,
      weight: weight
    }
  end
  edges
end

# Writes the node table in the column layout Gephi imports.
def write_nodes(nodes, path = 'nodes.csv')
  CSV.open(path, 'wb') do |csv|
    csv << ['Id', 'Label', 'Interval', 'Modularity Class']
    nodes.each do |name, node|
      csv << [node[:id], name, '', node[:total_cm]]
    end
  end
end

# Writes the edge table in the column layout Gephi imports.
def write_edges(edges, path = 'edges.csv')
  CSV.open(path, 'wb') do |csv|
    csv << ['Source', 'Target', 'Type', 'Id', 'Label', 'Interval', 'Weight']
    edges.each do |edge|
      csv << [edge[:source], edge[:target], edge[:type], edge[:id], '', '', edge[:weight]]
    end
  end
end

# Script entry point: parses options, reads the CSV and writes both outputs.
#
# With no file argument the usage text is printed and nothing is written
# (previously this crashed inside CSV.read).
#
# @param argv [Array<String>] command-line arguments; options are consumed
def main(argv)
  options = DEFAULT_OPTIONS.dup
  parser = build_option_parser(options)
  parser.parse!(argv)

  file = argv.first
  if file.nil?
    warn parser.help
    return
  end

  rows = CSV.read(file)
  # Remove headers
  rows.shift

  nodes = select_nodes(collect_nodes(rows), options)
  write_nodes(nodes)
  write_edges(build_edges(nodes, options))
end

main(ARGV) if __FILE__ == $PROGRAM_NAME
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment