Created
July 15, 2022 06:49
-
-
Save alexpreynolds/635a357ac52047027bfc99a9fb48a7ba to your computer and use it in GitHub Desktop.
Aggregation of epilogos score data per Hilbert curve order
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import sys | |
import math | |
import pandas as pd | |
import numpy as np | |
from scipy import stats | |
import json | |
from json import JSONEncoder | |
import zipfile | |
import zlib | |
from io import BytesIO | |
from datetime import datetime | |
class NumpyArrayEncoder(JSONEncoder): | |
def default(self, obj): | |
if isinstance(obj, np.ndarray): | |
return obj.tolist() | |
return JSONEncoder.default(self, obj) | |
SAMPLE_MULTIPLIER = 8192 | |
VIEW_ORDER = int(sys.argv[1]) | |
DATATYPE = "epilogos" | |
PUBLICATION = "Boix_et_al_833_sample" | |
ASSEMBLY = "hg38" | |
STATES = 18 | |
GROUP = "All_833_biosamples" | |
SALIENCY = "S1" | |
total_samples = VIEW_ORDER * SAMPLE_MULTIPLIER | |
chromosome_sizes = { "hg38": | |
[ | |
{ | |
"name": 1, | |
"size": 248956422 | |
}, | |
{ | |
"name": 2, | |
"size": 242193529 | |
}, | |
{ | |
"name": 3, | |
"size": 198295559 | |
}, | |
{ | |
"name": 4, | |
"size": 190214555 | |
}, | |
{ | |
"name": 5, | |
"size": 181538259 | |
}, | |
{ | |
"name": 6, | |
"size": 170805979 | |
}, | |
{ | |
"name": 7, | |
"size": 159345973 | |
}, | |
{ | |
"name": 8, | |
"size": 145138636 | |
}, | |
{ | |
"name": 9, | |
"size": 138394717 | |
}, | |
{ | |
"name": 10, | |
"size": 133797422 | |
}, | |
{ | |
"name": 11, | |
"size": 135086622 | |
}, | |
{ | |
"name": 12, | |
"size": 133275309 | |
}, | |
{ | |
"name": 13, | |
"size": 114364328 | |
}, | |
{ | |
"name": 14, | |
"size": 107043718 | |
}, | |
{ | |
"name": 15, | |
"size": 101991189 | |
}, | |
{ | |
"name": 16, | |
"size": 90338345 | |
}, | |
{ | |
"name": 17, | |
"size": 83257441 | |
}, | |
{ | |
"name": 18, | |
"size": 80373285 | |
}, | |
{ | |
"name": 19, | |
"size": 58617616 | |
}, | |
{ | |
"name": 20, | |
"size": 64444167 | |
}, | |
{ | |
"name": 21, | |
"size": 46709983 | |
}, | |
{ | |
"name": 22, | |
"size": 50818468 | |
}, | |
{ | |
"name": "X", | |
"size": 156040895 | |
}, | |
{ | |
"name": "Y", | |
"size": 57227415 | |
} | |
] | |
} | |
''' | |
Get total assembly length | |
''' | |
chromosome_size_acc = 0 | |
for cse in chromosome_sizes[ASSEMBLY]: | |
chromosome_size_acc += cse['size'] | |
''' | |
Build table of true samples to aggregate from score data per-chromosome | |
''' | |
samples = {} | |
for cse in chromosome_sizes[ASSEMBLY]: | |
chrom = 'chr{}'.format(cse['name']) | |
samples_per_chromosome = math.floor((total_samples * cse['size']) / chromosome_size_acc) | |
samples[chrom] = samples_per_chromosome | |
''' | |
For each chromosome, build a state vector from abs max of state scores per bin | |
''' | |
sv = {} | |
for cse in chromosome_sizes[ASSEMBLY]: | |
chrom = 'chr{}'.format(cse['name']) | |
print(chrom) | |
df = pd.read_csv( | |
'scores.{}.txt.gz'.format(chrom), | |
compression='gzip', | |
delimiter='\t', | |
lineterminator='\n', | |
header=None | |
) | |
# if the sum of a row is zero, we promote the last state | |
# as the max-scoring state in order to blank it out in | |
# rendering later on | |
pd.set_option('mode.chained_assignment', None) | |
df[df.iloc[:, 3:].sum(axis=1) == 0].iloc[:, -1] = sys.maxsize | |
# IMPORTANT: generate zero-based state values | |
df['agg'] = df.iloc[:, 3:].idxmax(axis=1) - 3 | |
agg = df['agg'].to_numpy() | |
r = float(len(agg)) | |
n = samples[chrom] | |
''' | |
np.array_split can generate subarrays of unequal length if the split | |
value does not divide the parent array length equally, so it is necessary | |
to truncate such subarrays where they are longer than the desired length | |
''' | |
a = np.array_split(agg, n) | |
d = int(math.floor(r / n)) | |
for i in range(len(a)): | |
e = a[i] | |
if len(e) > d: | |
diff = len(e) - d | |
a[i] = e[diff:] | |
if len(e) < d: | |
raise ValueError("Investigate subarrays further") | |
mode_states_vector = stats.mode(a, axis=1)[0].reshape(-1) | |
sv[chrom] = mode_states_vector | |
now = datetime.now() | |
sv['metadata'] = { | |
"created": now.isoformat(), | |
"datatype": DATATYPE, | |
"publication": PUBLICATION, | |
"assembly": ASSEMBLY, | |
"states": STATES, | |
"group": GROUP, | |
"saliency": SALIENCY, | |
"view_order": VIEW_ORDER, | |
"sample_multiplier": SAMPLE_MULTIPLIER, | |
} | |
ofn = 'hilbert-genome.{}.{}.{}.{}.{}.{}.{}.{}.zip'.format( | |
DATATYPE, | |
PUBLICATION, | |
ASSEMBLY, | |
STATES, | |
GROUP, | |
SALIENCY, | |
VIEW_ORDER, | |
SAMPLE_MULTIPLIER | |
) | |
dfn = 'data.json' | |
encoded_sv = json.dumps(sv, cls=NumpyArrayEncoder) | |
archive = BytesIO() | |
with zipfile.ZipFile(archive, 'w', zipfile.ZIP_DEFLATED) as ozah: | |
f = zipfile.ZipInfo(dfn) | |
ozah.writestr( | |
f, | |
encoded_sv.encode('utf-8'), | |
compress_type=zipfile.ZIP_DEFLATED | |
) | |
with open(ofn, 'wb') as ofh: | |
ofh.write(archive.getbuffer()) | |
archive.close() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment