Created
June 14, 2010 13:32
-
-
Save keithshep/437676 to your computer and use it in GitHub Desktop.
liftover and sort a CSV file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/ruby -w | |
################################################################################ | |
# Created By: Keith Sheppard ([email protected]) | |
# Modification History: | |
# July 7, 2008: Initial revision created to run the liftover utility on | |
# the CGD (cgd.jax.org) imputed SNP data in comma-separated format | |
# | |
# This is a ruby script for running liftover to map SNPs in a source | |
# comma-separated file to a destination file using a different NCBI genome | |
# coordinate system. Run the script with no parameters to see usage | |
################################################################################ | |
################################################################################ | |
# a little function for printing usage and quitting the script | |
################################################################################ | |
def printusageandexit(exit_status=1) | |
STDERR.print( | |
"Usage: liftover-csv.rb <input.csv|input-dir> [chromosome-column-name] " + | |
"[base-pair-position-column-name] chain-file <output.csv|output-dir>\n") | |
STDERR.print " default chromosome-column-name=ChrID\n" | |
STDERR.print " default base-pair-position-column-name=build 36 bp Position\n" | |
STDERR.print " NOTE: this script expects the \"liftover\" executable to be\n" | |
STDERR.print " on the path\n" | |
STDERR.print " NOTE: the output will only be sorted correctly if the \n" | |
STDERR.print " input file is for a single chromosome. this is because\n" | |
STDERR.print " the output is sorted on base-pair position only\n" | |
Process.exit(exit_status) | |
end | |
################################################################################ | |
# get the index of the column with the given name assuming that the header | |
# passed in is comma separated. If we can't find it, return -1 | |
################################################################################ | |
def findcolumnindex(header_line, column_name) | |
i = 0 | |
header_line.split(',').each do |each_column_name| | |
if each_column_name == column_name | |
return i | |
else | |
i = i + 1 | |
end | |
end | |
# we couldn't find it... | |
return -1 | |
end | |
################################################################################ | |
# sort and save the genotype data to the given file. this function returns | |
# an opened read-only reference to the file that it creates | |
################################################################################ | |
def sortandsave(data_hash, file_name) | |
out_file = File.open(file_name, "w") | |
data_hash.sort.each do |key, value| | |
# join the data with a ',' so that we follow a CSV format, and write it | |
# to the file | |
out_file.puts(value.join(",")) | |
end | |
# OK, we're done.. close the file | |
out_file.close | |
# Now reopen the file as read-only and return it | |
return File.open(file_name, "r") | |
end | |
################################################################################ | |
# Lift the given SNPs using the liftover command line program. The liftover | |
# application should be in the users PATH and should be executable by typing | |
# "liftover" | |
# in_file_name: | |
# the path of the input file. this file should be a CSV file with a header | |
# row. the header row should contain a "ChrID" column that contains | |
# chromosome ID's in BED format (so that liftover can use them) and all of | |
# the chromosomes should be the same. | |
# chromosome_column_name_arg: | |
# the name of the column that contains the chromosome name | |
# base_pair_column_name_name: | |
# the name of the column that contains the SNPs | |
# chain_file_name: | |
# the path to the chain file to use for the liftover command | |
# out_file_name: | |
# the path to the output file that we should create for the lifted data | |
################################################################################ | |
def lift_snps( | |
in_file_name, | |
chromosome_column_name_arg, | |
base_pair_column_name_name, | |
chain_file_name, | |
out_file_name) | |
# These variables can be modified (or maybe parameterized) to match what you | |
# need them to be. | |
# The name of the liftover executable | |
liftover_executable = "liftover" | |
# If you're running out of memory you should reduce the | |
# max_rows_in_memory variable. | |
max_rows_in_memory = 100000 | |
# do a couple of sanity checks before we start | |
if File.exists?(out_file_name) | |
STDERR.print( | |
"Refusing to run because output file " + out_file_name + " exists\n") | |
Process.exit(1) | |
end | |
# intermediate/temporary files used in the invokation of liftover | |
temp_dir_name = "temp-dir" | |
if File.exists?(temp_dir_name) | |
STDERR.print( | |
"Refusing to run because " + temp_dir_name + " already exists." + | |
" If you don't need it, delete it and re-run the command.\n") | |
Process.exit(1) | |
end | |
Dir.mkdir(temp_dir_name) | |
temp_bed_file_name = temp_dir_name + File::SEPARATOR + "delete-me.bed" | |
temp_lifted_bed_file_name = temp_dir_name + File::SEPARATOR + "delete-me-also.bed" | |
temp_unmapped_file_name = temp_dir_name + File::SEPARATOR + "delete-me.unmapped" | |
# open up the input genotype file | |
in_file = File.open(in_file_name, "r") | |
# blow through and comments or empty lines to get to the CSV header | |
header_line = "" | |
in_file.each_line do |each_header_line| | |
each_header_line = each_header_line.strip | |
if (!each_header_line.empty?) && each_header_line[0..0] != '#' | |
header_line = each_header_line | |
break | |
end | |
end | |
# find the column where the base pair value is kept | |
base_pair_column_index = findcolumnindex(header_line, base_pair_column_name_name) | |
if(base_pair_column_index == -1) | |
STDERR.print( | |
"Failed to locate column named " + base_pair_column_name_name + | |
" in file " + in_file_name + "\n\n") | |
printusageandexit | |
end | |
# find the column where the chromosome identifier is kept | |
chr_id_column_index = findcolumnindex(header_line, chromosome_column_name_arg) | |
if(chr_id_column_index == -1) | |
STDERR.print( | |
"Failed to locate column named " + chromosome_column_name_arg + | |
" in file " + in_file_name + "\n\n") | |
printusageandexit | |
end | |
# create a BED formatted file so that liftover can convert the data. Also | |
# create a mapping from old_position=>old_row | |
print( | |
"building temporary BED file for SNP data from " + | |
in_file_name + " ...\n") | |
temp_bed_file = File.open(temp_bed_file_name, "w") | |
in_file.each_line do |line| | |
line = line.strip | |
line_fields = line.split(','); | |
base_pair_position = line_fields[base_pair_column_index] | |
base_pair_plus_one_position = (base_pair_position.to_i + 1).to_s | |
chromosome_name = line_fields[chr_id_column_index] | |
# liftover will pass the line through the 4th position unmodified, so we'll | |
# be able to rebuild the rows | |
temp_bed_file.puts( | |
chromosome_name + "\t" + | |
base_pair_position + "\t" + | |
base_pair_plus_one_position + "\t" + | |
line) | |
end | |
# close the BED files | |
in_file.close | |
temp_bed_file.close | |
# make sure that the chain file that we were given exists | |
unless File.exists?(chain_file_name) | |
STDERR.print "Can't find chain file: " + chain_file_name + " ...\n\n" | |
printusageandexit | |
end | |
# OK... try to invoke the liftover command | |
print "running liftover on the BED file ...\n" | |
print( | |
liftover_executable + " " + | |
temp_bed_file_name + " " + | |
chain_file_name + " " + | |
temp_lifted_bed_file_name + " " + | |
temp_unmapped_file_name + "\n") | |
unless system( | |
liftover_executable, | |
temp_bed_file_name, | |
chain_file_name, | |
temp_lifted_bed_file_name, | |
temp_unmapped_file_name) | |
STDERR.print "liftover command failed: " + $?.to_s + "\n\n" | |
Process.exit(1) | |
end | |
print "restoring CSV format to liftover output ...\n" | |
# turn the lifted bed file into a position map where new_position=>old_row | |
temp_lifted_bed_file = File.open(temp_lifted_bed_file_name, "r") | |
new_positions = {} | |
sorted_files = [] | |
temp_lifted_bed_file.each_line do |line| | |
bed_tokens = line.strip.split(' ') | |
# the remapped position will be the 1st token of the bed file | |
new_position = bed_tokens[1] | |
# the old csv line should have been passed through unchanged to the | |
# 3rd token. Tokenize this line and replace the old position | |
# with the new position | |
old_csv_line = bed_tokens[3] | |
csv_tokens = old_csv_line.split(',') | |
csv_tokens[base_pair_column_index] = new_position | |
# map the tokens by base pair position | |
new_positions[new_position.to_i] = csv_tokens | |
# don't let the map grow past "max rows" value. dump the current values | |
# to an intermediate file | |
if new_positions.length == max_rows_in_memory | |
temp_sorted_file_name = | |
temp_dir_name + File::SEPARATOR + File.basename(out_file_name) + "." + | |
sorted_files.length.to_s | |
print( | |
" sorting and saving " + new_positions.length.to_s + " rows to " + | |
temp_sorted_file_name + " ...\n") | |
sorted_files[sorted_files.length] = sortandsave( | |
new_positions, | |
temp_sorted_file_name) | |
new_positions.replace({}) | |
end | |
end | |
# dump any remaining rows to the final intermediate file | |
unless new_positions.empty? | |
temp_sorted_file_name = | |
temp_dir_name + File::SEPARATOR + File.basename(out_file_name) + "." + | |
sorted_files.length.to_s | |
print( | |
" sorting and saving " + new_positions.length.to_s + " rows to " + | |
temp_sorted_file_name + " ...\n") | |
sorted_files[sorted_files.length] = sortandsave( | |
new_positions, | |
temp_sorted_file_name) | |
new_positions.replace({}) | |
end | |
# merge all of the partial results into our final output file | |
print " merging results to " + out_file_name + " ...\n" | |
sorted_file_map = {} | |
out_file = File.open(out_file_name, "w") | |
out_file.puts(header_line) | |
sorted_files.each do |curr_file| | |
line = curr_file.gets | |
if line != nil | |
sorted_file_map[curr_file] = line.strip.split(',') | |
end | |
end | |
# keep track of the prev snp position for a sanity check | |
prev_snp_position = -1 | |
until sorted_file_map.empty? | |
min_file = sorted_file_map.keys[0] | |
min_position = sorted_file_map.values[0][base_pair_column_index].to_i | |
sorted_file_map.each do |file_key, value| | |
curr_position = value[base_pair_column_index].to_i | |
if curr_position < min_position | |
min_position = curr_position | |
min_file = file_key | |
end | |
end | |
# do sanity check to make sure that SNP positions are being output | |
# in order | |
if min_position < prev_snp_position | |
STDERR.print "Error detected: bad order for SNP output" | |
Process.exit(1) | |
else | |
prev_snp_position = min_position | |
end | |
out_file.puts(sorted_file_map[min_file].join(',')) | |
new_value = min_file.gets | |
if new_value == nil | |
sorted_file_map.delete(min_file) | |
min_file.close | |
else | |
sorted_file_map[min_file] = new_value.strip.split(',') | |
end | |
end | |
temp_lifted_bed_file.close | |
out_file.close | |
# delete all of the temporary files | |
Dir.entries(temp_dir_name).each do |temp_file| | |
begin | |
File.delete( | |
temp_dir_name + File::SEPARATOR + | |
temp_file) | |
rescue | |
# don't care... if our Dir delete fails, then we're in trouble | |
end | |
end | |
Dir.delete(temp_dir_name) | |
end | |
################################################################################ | |
# Main entry point of script. | |
# Run the script without any parameters to get usage. | |
################################################################################ | |
# assign meaningful names to the arguments | |
if ARGV.length == 3 | |
in_file_arg = ARGV[0] | |
chromosome_column_name_arg = "ChrID" | |
base_pair_column_name_arg = "build 36 bp Position" | |
chain_file_arg = ARGV[1] | |
out_file_arg = ARGV[2] | |
elsif ARGV.length == 5 | |
in_file_arg = ARGV[0] | |
chromosome_column_name_arg = ARGV[1] | |
base_pair_column_name_arg = ARGV[2] | |
chain_file_arg = ARGV[3] | |
out_file_arg = ARGV[4] | |
else | |
# the args don't match what we expect... quit | |
printusageandexit | |
end | |
if File.directory?(in_file_arg) | |
unless File.directory?(out_file_arg) | |
Dir.mkdir(out_file_arg) | |
end | |
print "converting directory " + in_file_arg + " ...\n" | |
Dir.entries(in_file_arg).each do |curr_file| | |
if File.extname(curr_file) == ".csv" | |
curr_in_file_name = in_file_arg + File::SEPARATOR + curr_file | |
curr_out_file_name = out_file_arg + File::SEPARATOR + File.basename(curr_file) | |
print "========================================\n" | |
print( | |
"converting " + curr_in_file_name + " to " + | |
curr_out_file_name + " ...\n") | |
lift_snps( | |
curr_in_file_name, | |
chromosome_column_name_arg, | |
base_pair_column_name_arg, | |
chain_file_arg, | |
curr_out_file_name) | |
end | |
end | |
else | |
lift_snps( | |
in_file_arg, | |
chromosome_column_name_arg, | |
base_pair_column_name_arg, | |
chain_file_arg, | |
out_file_arg) | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment