Skip to content

Instantly share code, notes, and snippets.

@zacharysyoung
Last active November 4, 2021 21:29
Show Gist options
  • Save zacharysyoung/c1b33239378253bfdc3fab1e66f4874a to your computer and use it in GitHub Desktop.
Save zacharysyoung/c1b33239378253bfdc3fab1e66f4874a to your computer and use it in GitHub Desktop.
name POS id Ref ALT Frequency CDS Start End sequence_cc
chrM 41 . C T 0.002498 CDS 3307 4262 -3265
chrM 42 rs377245343 T TC 0.001562 CDS 3307 4262 -3264
chrM 55 . TA T 0.00406 CDS 4470 5511 -4414
chrM 55 . T C 0.001874 CDS 4470 5511 -4414
Col1 Col2 Subject Lower Upper
chrM ENSEMBL CDS 3307 4262
chrM ENSEMBL CDS 4470 5511
chrM ENSEMBL CDS 5904 7445
chrM ENSEMBL CDS 7586 8266
chrM ENSEMBL exon 100 200
chrM ENSEMBL exon 300 400
chrM ENSEMBL exon 700 750
name POS id Ref ALT Frequency
chrM 41 . C T 0.002498
chrM 42 rs377245343 T TC 0.001562
chrM 55 . TA T 0.00406
chrM 55 . T C 0.001874
import csv
from collections import defaultdict
# Group the gene rows by their subject column (index 2): each key is a
# subject string and each value is the list of full rows with that subject.
subject_col = 2
chrM_rows_by_subject = defaultdict(list)
with open('chrM_genes.csv', newline='') as genes_file:
    gene_reader = csv.reader(genes_file)
    next(gene_reader)  # discard the header row
    for gene_row in gene_reader:
        # Blank rows separate the subjects in the file; only real rows are kept.
        if gene_row:
            chrM_rows_by_subject[gene_row[subject_col]].append(gene_row)
def match_gene_row(subject, pos, rows_by_subject=None):
    """Return the first row for `subject` whose range contains `pos`, or None.

    The range bounds are read from columns 3 (start) and 4 (end) of each row
    and are inclusive on both ends.

    Args:
        subject: Key selecting which group of rows to search.
        pos: Position to look up; an int or a string of digits as read
            from a CSV file.
        rows_by_subject: Optional mapping of subject -> list of rows.
            Defaults to the module-level `chrM_rows_by_subject`.

    Returns:
        The matching row, unmodified (its fields are still the original
        values), or None when no row of that subject contains `pos`.

    Raises:
        ValueError: If `pos` or a row's start/end is not an integer.
    """
    if rows_by_subject is None:
        rows_by_subject = chrM_rows_by_subject

    # CSV fields are strings, and comparing strings is lexicographic:
    # "41" >= "3307" and "41" <= "4262" are both True. Compare as integers.
    pos = int(pos)

    # .get() instead of [] so that an unknown subject does not silently add
    # an empty entry when the mapping is a defaultdict.
    for row in rows_by_subject.get(subject, ()):
        if int(row[3]) <= pos <= int(row[4]):
            return row
    return None
final_rows = []  # accumulate all output rows here, for writing later
with open('chrM_location.csv', newline='') as f:
    reader = csv.reader(f)

    # Modify header: add the names of the four columns appended below
    header = next(reader)
    header.extend(['CDS', 'Start', 'End', 'sequence_cc'])
    final_rows.append(header)

    # Read rows and match to genes
    pos_column = 1
    for row in reader:
        # csv.reader yields [] for a blank line; there is no POS to read
        # from it, so skip it (same rule as for the genes file).
        if not row:
            continue

        pos = row[pos_column]
        matched_row = match_gene_row('CDS', pos)
        if matched_row is not None:
            subj, start, end = matched_row[2:5]
            # Offset of pos from the matched start, counting start itself
            # as 1. A negative value means pos is below start, i.e. the
            # row returned by match_gene_row does not actually contain pos.
            computed = str(int(pos) - int(start) + 1)
            row.extend([subj, start, end, computed])

        # NOTE(review): rows without a match are kept as-is (without the
        # four extra columns) — confirm that is the intended output.
        final_rows.append(row)
# Last step: dump every accumulated row to the output CSV in one call.
with open('final.csv', 'w', newline='') as out_file:
    csv.writer(out_file).writerows(final_rows)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment