Convert Bird genotype data to BED PoMoCNV

The input is the genotype data from Supplementary file S13 from the study by Edwards et al. There are 14112 genes for the 94 samples. I assume the first part of the sample name AA_SRR23446543#1 is the population ID and the last letter is the haplotype ID. So there are 15 samples for AI (A. insularis), 15 for AW (A. woodehouseii), 14 for AC (A. coerulescens), and 1 for each of AA (A. californica), CY (Cyanocorax yucatanicus), and CS (Cyanocitta cristata).

Note that coordiantes are arbitary, 100 bases per each gene. If we have the gene length, we could make it accurate, probably shouldn't matter. We could double check whether CNV length matters in the PoMoCNV framework.

f= "data_S13.csv"
f_read= open(f,'r')
sample_names_haps=[]
cnv_dic={}
for line in f_read: 
    if line.startswith("Gene"):
        #l1=line 
        # assert sample_names[0]=='Gene'
        sample_names_haps= line.strip().split(",")[1:]
        for sample_name in sample_names_haps:
            cnv_dic[sample_name]={}
    else:
        line_split=line.strip().split(",")
        gene_name=line_split[0]
        genotype=line_split[1:]
        for sample_idx, sample_name in enumerate(sample_names_haps):
            cnv_dic[sample_name][gene_name]=int(genotype[sample_idx])
        #print(read_name, cnv)

print(len(cnv_dic))

cooridnates=[]
gene_aribtary_size=100
gene_coordinates={}
for gene_idx, gene_name in enumerate(gene_names):
    gene_coordinates[gene_name]=[gene_idx*gene_aribtary_size, (gene_idx+1)*gene_aribtary_size]

gene_names= list(cnv_dic[sample_name].keys())

bed_rows=[]
for sample_name in sample_names:
    gene_dict_hap1=cnv_dic[sample_name+"#1"]
    gene_dict_hap2=cnv_dic[sample_name+"#2" ] 
    # #for sample_name_hap, gene_dict in cnv_dic.items():
    for gene_name in gene_names:
        genotype=gene_dict_hap1[gene_name]+gene_dict_hap2[gene_name]
        start,end=gene_coordinates[gene_name]
        population_id=sample_name[:2]
        bed_row=["chr",start,end, genotype, sample_name, population_id ]
        bed_rows.append(bed_row)
len(bed_rows), bed_rows[:2]

output_file="bird_cnv.bed"
output_file=open(output_file,'w')
for bed_row in bed_rows:
    bed_row_str_list=[str(i) for i in bed_row]
    output_file.write("\t".join(bed_row_str_list)+"\n")
output_file.close()

Then we could use the output BED with PoMoCNV.

# chr	start	end	copy_number	individual_id	population_id

$ head bird_cnv.bed 
chr	0	100	2	AI_1603_79315	AI
chr	100	200	2	AI_1603_79315	AI
chr	200	300	2	AI_1603_79315	AI
chr	300	400	2	AI_1603_79315	AI
chr	400	500	2	AI_1603_79315	AI
chr	500	600	2	AI_1603_79315	AI
.
.
.
chr	1410700	1410800	0	CS_1	CS
chr	1410800	1410900	2	CS_1	CS
chr	1410900	1411000	0	CS_1	CS
chr	1411000	1411100	2	CS_1	CS
chr	1411100	1411200	0	CS_1	CS

sinamajidian/README.MD