zacharysyoung · November 4, 2021 21:29
diff --git a/chrM_final.csv b/chrM_final.csv
diff --git a/chrM_genes.csv b/chrM_genes.csv
diff --git a/chrM_location.csv b/chrM_location.csv
diff --git a/main.py b/main.py
 import csv

 from collections import defaultdict

 # Index the gene rows by subject: each key is the value found in the
 # subject column, each value is the list of complete rows sharing it.
 chrM_rows_by_subject = defaultdict(list)


 with open('chrM_genes.csv', newline='') as genes_file:
    gene_reader = csv.reader(genes_file)
    next(gene_reader)  # the first line is a header; discard it

    subject_col = 2
    for gene_row in gene_reader:
        # The input reportedly uses empty rows to divide subjects;
        # csv yields those as empty lists, which hold nothing to index.
        if gene_row:
            chrM_rows_by_subject[gene_row[subject_col]].append(gene_row)


 # Return the first row for `subject` whose start..end range contains `pos`; or None.
 def match_gene_row(subject, pos):
    """Find the first row stored under *subject* whose range contains *pos*.

    The range bounds are read from columns 3 (start) and 4 (end) of each
    row and are inclusive on both sides.

    Args:
        subject: Key into ``chrM_rows_by_subject``.
        pos: Position to look up; an int or a string holding an integer.

    Returns:
        The matching row, unmodified (its fields are still whatever the
        stored row holds), or ``None`` when no row's range contains *pos*.

    Raises:
        ValueError: If *pos* or a row's start/end is not an integer literal.
    """
    # Values read from CSV are strings. Comparing them directly is
    # lexicographic ('41' >= '3307' and '41' <= '4262' are both True), which
    # matched positions far outside the range. Compare numerically instead.
    pos = int(pos)

    for row in chrM_rows_by_subject[subject]:
        if int(row[3]) <= pos <= int(row[4]):
            return row

    return None


 final_rows = []  # output rows (header first), collected here and written later

 with open('chrM_location.csv', newline='') as locations_file:
    location_reader = csv.reader(locations_file)

    # The output header is the input header plus four appended columns
    header = next(location_reader)
    header += ['CDS', 'Start', 'End', 'sequence_cc']
    final_rows.append(header)

    # Look up each location row among the 'CDS' gene rows
    pos_column = 1
    for location in location_reader:
        pos = location[pos_column]
        gene = match_gene_row('CDS', pos)
        if gene is None:
            # rows without a match are left out of the output
            continue

        subj, start, end = gene[2:5]
        # Offset of pos from the matched row's start, counting start as 1.
        # A negative value means the matched start lies beyond pos.
        offset = str(int(pos) - int(start) + 1)
        final_rows.append(location + [subj, start, end, offset])


 # Dump everything accumulated in final_rows to final.csv in one call
 with open('final.csv', 'w', newline='') as out_file:
    csv.writer(out_file).writerows(final_rows)
name	POS	id	Ref	ALT	Frequency	CDS	Start	End	sequence_cc
chrM	41	.	C	T	0.002498	CDS	3307	4262	-3265
chrM	42	rs377245343	T	TC	0.001562	CDS	3307	4262	-3264
chrM	55	.	TA	T	0.00406	CDS	4470	5511	-4414
chrM	55	.	T	C	0.001874	CDS	4470	5511	-4414
Col1	Col2	Subject	Lower	Upper
chrM	ENSEMBL	CDS	3307	4262
chrM	ENSEMBL	CDS	4470	5511
chrM	ENSEMBL	CDS	5904	7445
chrM	ENSEMBL	CDS	7586	8266
chrM	ENSEMBL	exon	100	200
chrM	ENSEMBL	exon	300	400
chrM	ENSEMBL	exon	700	750
	import csv

	from collections import defaultdict

	# Build a dictionary keyed by subject; each value is the list of
	# complete rows from chrM_genes.csv that share that subject.
	chrM_rows_by_subject = defaultdict(list)


	with open('chrM_genes.csv', newline='') as f:
	    reader = csv.reader(f)
	    next(reader)  # read (skip) header

	    subject_col = 2
	    for row in reader:
	        # The input reportedly uses empty rows to divide subjects;
	        # csv yields those as empty lists, so skip them.
	        if not row:
	            continue

	        subject = row[subject_col]
	        chrM_rows_by_subject[subject].append(row)


	# Return the first row for `subject` whose start..end range contains `pos`; or None.
	def match_gene_row(subject, pos):
	    """Find the first row stored under *subject* whose range contains *pos*.

	    The range bounds are read from columns 3 (start) and 4 (end) of each
	    row and are inclusive on both sides.

	    Args:
	        subject: Key into ``chrM_rows_by_subject``.
	        pos: Position to look up; an int or a string holding an integer.

	    Returns:
	        The matching row, unmodified, or ``None`` when no row's range
	        contains *pos*.

	    Raises:
	        ValueError: If *pos* or a row's start/end is not an integer literal.
	    """
	    # Values read from CSV are strings. Comparing them directly is
	    # lexicographic ('41' >= '3307' and '41' <= '4262' are both True), which
	    # matched positions far outside the range. Compare numerically instead.
	    pos = int(pos)

	    for row in chrM_rows_by_subject[subject]:
	        if int(row[3]) <= pos <= int(row[4]):
	            return row

	    return None


	final_rows = []  # accumulate all output rows (header first) here, for writing later

	with open('chrM_location.csv', newline='') as f:
	    reader = csv.reader(f)

	    # The output header is the input header plus four appended columns
	    header = next(reader)
	    header.extend(['CDS', 'Start', 'End', 'sequence_cc'])
	    final_rows.append(header)

	    # Read rows and match each one against the 'CDS' gene rows
	    pos_column = 1
	    for row in reader:
	        pos = row[pos_column]
	        matched_row = match_gene_row('CDS', pos)

	        # rows without a match are left out of the output
	        if matched_row is not None:
	            subj, start, end = matched_row[2:5]
	            # Offset of pos from the matched row's start, counting start as 1.
	            # A negative value means the matched start lies beyond pos.
	            computed = str(int(pos) - int(start) + 1)
	            row.extend([subj, start, end, computed])
	            final_rows.append(row)


	# Finally, write everything accumulated in final_rows to final.csv
	with open('final.csv', 'w', newline='') as f:
	    writer = csv.writer(f)
	    writer.writerows(final_rows)