Skip to content

Instantly share code, notes, and snippets.

@raphink
Last active June 12, 2018 07:36
Show Gist options
  • Save raphink/a23b406ba7e62cf12b3adf06576d4979 to your computer and use it in GitHub Desktop.
Save raphink/a23b406ba7e62cf12b3adf06576d4979 to your computer and use it in GitHub Desktop.
A script to prepare FTDNA Chromosome Browser CSV extract for Gephi, inspired by http://twigsofyore.blogspot.com/2018/03/triangulation-is-icing-not-cake.html
#!/usr/bin/env ruby
require 'optparse'
require 'csv'
# Filtering thresholds. These are the defaults; each can be overridden with
# the matching command-line switch declared below.
options = {
  min_cm: 7,
  min_snp: 700,
  min_total_cm: 20,
}

# Declare the switches one by one; every value is coerced to Integer by
# OptionParser before it is stored.
parser = OptionParser.new
parser.banner = "Usage: #{$0} [options] <file>.csv"
parser.on("--min-cm CM", Integer, "Set min cM for segment filtering") { |value| options[:min_cm] = value }
parser.on("--min-snp SNP", Integer, "Set min SNPs for segment filtering") { |value| options[:min_snp] = value }
parser.on("--min-total-cm CM", Integer, "Ignore nodes with cM under value") { |value| options[:min_total_cm] = value }

# parse! removes the recognised switches from ARGV, leaving positional arguments.
parser.parse!
# Read the CSV export named by the first positional argument and group its
# segments by match name. Result: `nodes`, a hash of
#   match name => { :id => sequential Integer, :segments => [segment hashes] }
file = ARGV[0]
# Without a file argument CSV.read(nil) fails with a low-level error; print the
# usage line and exit non-zero instead.
abort "Usage: #{$0} [options] <file>.csv" if file.nil?

nodes = {}
id = 0
data = CSV.read(file)
# Remove headers
# NAME,MATCHNAME,CHROMOSOME,START LOCATION,END LOCATION,CENTIMORGANS,MATCHING SNPS
data.shift

# One node per distinct MATCHNAME; every row contributes one segment to it.
data.each do |row|
  name = row[1]
  # Blank lines are parsed as empty rows (no MATCHNAME); they carry no segment.
  next if name.nil? || name.empty?
  # TBD entries are not useful
  next if name == 'TBD TBD'

  # Ids are handed out in order of first appearance, starting at 1.
  node = (nodes[name] ||= { :id => (id += 1), :segments => [] })

  # NOTE(review): to_i drops any fractional part (e.g. "7.9" becomes 7) and
  # turns non-numeric text into 0. Left as-is because the summed cM is later
  # exported as the 'Modularity Class' column; confirm whether to_f is wanted.
  node[:segments] << {
    :chromosome => row[2].to_i,
    :start => row[3].to_i,
    :end => row[4].to_i,
    :cm => row[5].to_i,
    :snp => row[6].to_i,
  }
end
# Compute each node's total cM, counting only the segments that meet both the
# per-segment cM threshold and the per-segment SNP threshold.
nodes.each_value do |node|
  qualifying = node[:segments].select do |segment|
    segment[:cm] >= options[:min_cm] && segment[:snp] >= options[:min_snp]
  end
  node[:total_cm] = qualifying.inject(0) { |total, segment| total + segment[:cm] }
end

# Discard nodes whose total stays below the node-level threshold.
nodes.delete_if { |_name, node| node[:total_cm] < options[:min_total_cm] }
# Write nodes
# One row per remaining node: its id, its match name as the label, an empty
# 'Interval', and its total cM in the 'Modularity Class' column.
CSV.open('nodes.csv', 'wb') do |out|
  out << ['Id', 'Label', 'Interval', 'Modularity Class']
  nodes.each_pair do |label, node|
    out << [node[:id], label, '', node[:total_cm]]
  end
end
# Generate edges
#
# Two nodes are linked when they hold segments on the same chromosome whose
# start/end ranges overlap, considering only segments that meet both the
# min cM and min SNP thresholds. The edge weight is the overlap length summed
# over every overlapping pair of segments.
#
# The weight is still measured in start/end position units only (not cM).
edges = []
id = 0

# Segments below either threshold can never contribute to an edge, so filter
# them once per node rather than inside the innermost loop.
candidates = nodes.map do |_name, node|
  usable = node[:segments].select do |segment|
    segment[:cm] >= options[:min_cm] && segment[:snp] >= options[:min_snp]
  end
  [node, usable]
end

# combination(2) yields every unordered pair of nodes exactly once, earlier
# node first, so no bookkeeping is needed to avoid emitting A-B and B-A.
candidates.combination(2) do |(node_a, segments_a), (node_b, segments_b)|
  weight = 0
  segments_a.each do |s|
    segments_b.each do |ss|
      next if s[:chromosome] != ss[:chromosome]
      # Length of the intersection of the two ranges. Unlike subtracting one
      # segment's start from the other's end, this is also correct when one
      # segment lies entirely inside the other. Non-positive means no overlap.
      overlap = [s[:end], ss[:end]].min - [s[:start], ss[:start]].max
      weight += overlap if overlap > 0
    end
  end
  next unless weight > 0

  edges << {
    :source => node_a[:id],
    :target => node_b[:id],
    :type => 'Undirected',
    :id => (id += 1),
    :weight => weight,
  }
end
# Write edges
# One row per edge with a positive weight; 'Label' and 'Interval' are left
# empty and every edge is written as 'Undirected'.
CSV.open('edges.csv', 'wb') do |out|
  out << %w[Source Target Type Id Label Interval Weight]
  edges.each do |edge|
    next unless edge[:weight] > 0
    out << [edge[:source], edge[:target], 'Undirected', edge[:id], '', '', edge[:weight]]
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment