@alexpreynolds
Created July 15, 2022 06:49
Aggregation of epilogos score data per Hilbert curve order
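This is a standalone script: it reads gzip-compressed, per-chromosome epilogos score matrices named scores.chrN.txt.gz from the working directory, collapses each chromosome into a mode-derived state vector, and writes the result as data.json inside a zip archive. The Hilbert curve view order is passed as the only command-line argument, e.g. python aggregate.py 16 (the script filename here is illustrative).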
#!/usr/bin/env python
import sys
import math
import pandas as pd
import numpy as np
from scipy import stats
import json
from json import JSONEncoder
import zipfile
import zlib
from io import BytesIO
from datetime import datetime
class NumpyArrayEncoder(JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return JSONEncoder.default(self, obj)
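# e.g. json.dumps({"s": np.array([1, 2])}, cls=NumpyArrayEncoder) yields '{"s": [1, 2]}'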
SAMPLE_MULTIPLIER = 8192
VIEW_ORDER = int(sys.argv[1])
DATATYPE = "epilogos"
PUBLICATION = "Boix_et_al_833_sample"
ASSEMBLY = "hg38"
STATES = 18
GROUP = "All_833_biosamples"
SALIENCY = "S1"
total_samples = VIEW_ORDER * SAMPLE_MULTIPLIER
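# e.g. VIEW_ORDER = 16 yields 16 * 8192 = 131072 samples distributed across the assembly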
chromosome_sizes = {
    "hg38": [
        {"name": 1, "size": 248956422},
        {"name": 2, "size": 242193529},
        {"name": 3, "size": 198295559},
        {"name": 4, "size": 190214555},
        {"name": 5, "size": 181538259},
        {"name": 6, "size": 170805979},
        {"name": 7, "size": 159345973},
        {"name": 8, "size": 145138636},
        {"name": 9, "size": 138394717},
        {"name": 10, "size": 133797422},
        {"name": 11, "size": 135086622},
        {"name": 12, "size": 133275309},
        {"name": 13, "size": 114364328},
        {"name": 14, "size": 107043718},
        {"name": 15, "size": 101991189},
        {"name": 16, "size": 90338345},
        {"name": 17, "size": 83257441},
        {"name": 18, "size": 80373285},
        {"name": 19, "size": 58617616},
        {"name": 20, "size": 64444167},
        {"name": 21, "size": 46709983},
        {"name": 22, "size": 50818468},
        {"name": "X", "size": 156040895},
        {"name": "Y", "size": 57227415}
    ]
}
'''
Get total assembly length
'''
chromosome_size_acc = 0
for cse in chromosome_sizes[ASSEMBLY]:
    chromosome_size_acc += cse['size']
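# for the hg38 sizes above, chromosome_size_acc = 3088269832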
'''
Build table of true samples to aggregate from score data per-chromosome
'''
samples = {}
for cse in chromosome_sizes[ASSEMBLY]:
    chrom = 'chr{}'.format(cse['name'])
    samples_per_chromosome = math.floor((total_samples * cse['size']) / chromosome_size_acc)
    samples[chrom] = samples_per_chromosome
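    # e.g. chr1 covers ~8.1% of hg38, so with total_samples = 131072 it receives floor(131072 * 248956422 / 3088269832) = 10566 samples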
'''
For each chromosome, build a state vector from abs max of state scores per bin
'''
sv = {}
for cse in chromosome_sizes[ASSEMBLY]:
    chrom = 'chr{}'.format(cse['name'])
    print(chrom)
    df = pd.read_csv(
        'scores.{}.txt.gz'.format(chrom),
        compression='gzip',
        delimiter='\t',
        lineterminator='\n',
        header=None
    )
    # if the sum of a row is zero, we promote the last state
    # as the max-scoring state in order to blank it out in
    # rendering later on
    pd.set_option('mode.chained_assignment', None)
    # assign via .loc so the write applies to df itself, not to a temporary copy
    df.loc[df.iloc[:, 3:].sum(axis=1) == 0, df.columns[-1]] = sys.maxsize
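    # columns 0-2 are assumed to be chrom/start/stop, so the per-state score columns begin at index 3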
    # IMPORTANT: generate zero-based state values
    df['agg'] = df.iloc[:, 3:].idxmax(axis=1) - 3
    agg = df['agg'].to_numpy()
    r = float(len(agg))
    n = samples[chrom]
    '''
    np.array_split can generate subarrays of unequal length if the split
    value does not divide the parent array length equally, so it is necessary
    to truncate such subarrays where they are longer than the desired length
    '''
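    # e.g. np.array_split(np.arange(10), 3) yields subarrays of length 4, 3, 3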
    a = np.array_split(agg, n)
    d = int(math.floor(r / n))
    for i in range(len(a)):
        e = a[i]
        if len(e) > d:
            diff = len(e) - d
            a[i] = e[diff:]
        if len(e) < d:
            raise ValueError("Investigate subarrays further")
    mode_states_vector = stats.mode(a, axis=1)[0].reshape(-1)
    sv[chrom] = mode_states_vector
now = datetime.now()
sv['metadata'] = {
    "created": now.isoformat(),
    "datatype": DATATYPE,
    "publication": PUBLICATION,
    "assembly": ASSEMBLY,
    "states": STATES,
    "group": GROUP,
    "saliency": SALIENCY,
    "view_order": VIEW_ORDER,
    "sample_multiplier": SAMPLE_MULTIPLIER,
}
ofn = 'hilbert-genome.{}.{}.{}.{}.{}.{}.{}.{}.zip'.format(
    DATATYPE,
    PUBLICATION,
    ASSEMBLY,
    STATES,
    GROUP,
    SALIENCY,
    VIEW_ORDER,
    SAMPLE_MULTIPLIER
)
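# e.g. hilbert-genome.epilogos.Boix_et_al_833_sample.hg38.18.All_833_biosamples.S1.16.8192.zip for VIEW_ORDER = 16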
dfn = 'data.json'
encoded_sv = json.dumps(sv, cls=NumpyArrayEncoder)
archive = BytesIO()
with zipfile.ZipFile(archive, 'w', zipfile.ZIP_DEFLATED) as ozah:
    f = zipfile.ZipInfo(dfn)
    ozah.writestr(
        f,
        encoded_sv.encode('utf-8'),
        compress_type=zipfile.ZIP_DEFLATED
    )
with open(ofn, 'wb') as ofh:
    ofh.write(archive.getbuffer())
archive.close()
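For reference, a minimal read-back sketch (the archive name below assumes VIEW_ORDER = 16; adjust it to match the file actually produced):

import json
import zipfile

with zipfile.ZipFile('hilbert-genome.epilogos.Boix_et_al_833_sample.hg38.18.All_833_biosamples.S1.16.8192.zip') as izah:
    with izah.open('data.json') as ifh:
        sv = json.load(ifh)

print(sv['metadata'])
print(len(sv['chr1']))  # number of aggregated bins assigned to chr1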