Last active
November 4, 2021 21:29
-
-
Save zacharysyoung/c1b33239378253bfdc3fab1e66f4874a to your computer and use it in GitHub Desktop.
Answer for SO (https://stackoverflow.com/q/69836764/246801)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name | POS | id | Ref | ALT | Frequency | CDS | Start | End | sequence_cc | |
---|---|---|---|---|---|---|---|---|---|---|
chrM | 41 | . | C | T | 0.002498 | CDS | 3307 | 4262 | -3265 | |
chrM | 42 | rs377245343 | T | TC | 0.001562 | CDS | 3307 | 4262 | -3264 | |
chrM | 55 | . | TA | T | 0.00406 | CDS | 4470 | 5511 | -4414 | |
chrM | 55 | . | T | C | 0.001874 | CDS | 4470 | 5511 | -4414 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Col1 | Col2 | Subject | Lower | Upper | |
---|---|---|---|---|---|
chrM | ENSEMBL | CDS | 3307 | 4262 | |
chrM | ENSEMBL | CDS | 4470 | 5511 | |
chrM | ENSEMBL | CDS | 5904 | 7445 | |
chrM | ENSEMBL | CDS | 7586 | 8266 | |
chrM | ENSEMBL | exon | 100 | 200 | |
chrM | ENSEMBL | exon | 300 | 400 | |
chrM | ENSEMBL | exon | 700 | 750 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name | POS | id | Ref | ALT | Frequency | |
---|---|---|---|---|---|---|
chrM | 41 | . | C | T | 0.002498 | |
chrM | 42 | rs377245343 | T | TC | 0.001562 | |
chrM | 55 | . | TA | T | 0.00406 | |
chrM | 55 | . | T | C | 0.001874 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
from collections import defaultdict | |
# Group the gene rows by the value in their subject column: every key is a
# subject and every value is the list of rows carrying that subject.
chrM_rows_by_subject = defaultdict(list)

with open('chrM_genes.csv', newline='') as genes_file:
    gene_reader = csv.reader(genes_file)
    next(gene_reader)  # discard the header line

    subject_col = 2
    for gene_row in gene_reader:
        # Empty rows only act as dividers between subjects; keep real rows.
        if gene_row != []:
            chrM_rows_by_subject[gene_row[subject_col]].append(gene_row)
def match_gene_row(subject, pos):
    """Return the first gene row for `subject` whose range contains `pos`.

    Rows are looked up in the module-level ``chrM_rows_by_subject`` mapping.
    Columns 3 and 4 of each row are used as the inclusive start and end of
    the range.

    Args:
        subject: Key into ``chrM_rows_by_subject`` (for example ``'CDS'``).
        pos: Position to look up, as an int or a numeric string such as the
            raw value of a CSV cell.

    Returns:
        The first matching row, or None when no row matches.

    Raises:
        ValueError: If `pos` or a row's start/end cannot be parsed as an
            integer.
    """
    # CSV cells are strings, and comparing strings is lexicographic
    # ('41' >= '3307' is True), so compare as integers instead.
    pos = int(pos)
    # .get() keeps a lookup for an unknown subject from inserting an empty
    # list into the mapping.
    for row in chrM_rows_by_subject.get(subject, ()):
        if int(row[3]) <= pos <= int(row[4]):
            return row
    return None
# Every output row (header first) is collected here and written in one go.
final_rows = []

with open('chrM_location.csv', newline='') as locations_file:
    location_reader = csv.reader(locations_file)

    # The output header is the input header plus the four appended columns.
    header = next(location_reader)
    header += ['CDS', 'Start', 'End', 'sequence_cc']
    final_rows.append(header)

    # Look up each location row among the 'CDS' gene rows.
    pos_column = 1
    for location in location_reader:
        pos = location[pos_column]
        gene = match_gene_row('CDS', pos)
        if gene is not None:
            subj, start, end = gene[2:5]
            # pos relative to the matched row's start, counting start as 1
            offset = int(pos) - int(start) + 1
            location += [subj, start, end, str(offset)]
        # Rows without a match are kept as they were read.
        final_rows.append(location)

# Write everything that was accumulated.
with open('final.csv', 'w', newline='') as out_file:
    csv.writer(out_file).writerows(final_rows)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment