alexpreynolds · July 15, 2022 06:49
diff --git a/aggregate_scores.py b/aggregate_scores.py
 #!/usr/bin/env python

 import sys
 import math
 import pandas as pd
 import numpy as np
 from scipy import stats
 import json
 from json import JSONEncoder
 import zipfile
 import zlib
 from io import BytesIO
 from datetime import datetime

 class NumpyArrayEncoder(JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return JSONEncoder.default(self, obj)

 SAMPLE_MULTIPLIER = 8192
 VIEW_ORDER = int(sys.argv[1])
 DATATYPE = "epilogos"
 PUBLICATION = "Boix_et_al_833_sample"
 ASSEMBLY = "hg38"
 STATES = 18
 GROUP = "All_833_biosamples"
 SALIENCY = "S1"

 total_samples = VIEW_ORDER * SAMPLE_MULTIPLIER
 chromosome_sizes = { "hg38": 
    [
        {
            "name": 1,
            "size": 248956422
        },
        {
            "name": 2,
            "size": 242193529
        },
        {
            "name": 3,
            "size": 198295559
        },
        {
            "name": 4,
            "size": 190214555
        },
        {
            "name": 5,
            "size": 181538259
        },
        {
            "name": 6,
            "size": 170805979
        },
        {
            "name": 7,
            "size": 159345973
        },
        {
            "name": 8,
            "size": 145138636
        },
        {
            "name": 9,
            "size": 138394717
        },
        {
            "name": 10,
            "size": 133797422
        },
        {
            "name": 11,
            "size": 135086622
        },
        {
            "name": 12,
            "size": 133275309
        },
        {
            "name": 13,
            "size": 114364328
        },
        {
            "name": 14,
            "size": 107043718
        },
        {
            "name": 15,
            "size": 101991189
        },
        {
            "name": 16,
            "size": 90338345
        },
        {
            "name": 17,
            "size": 83257441
        },
        {
            "name": 18,
            "size": 80373285
        },
        {
            "name": 19,
            "size": 58617616
        },
        {
            "name": 20,
            "size": 64444167
        },
        {
            "name": 21,
            "size": 46709983
        },
        {
            "name": 22,
            "size": 50818468
        },
        {
            "name": "X",
            "size": 156040895
        },
        {
            "name": "Y",
            "size": 57227415
        }
    ] 
 }

 '''
 Get total assembly length
 '''
 chromosome_size_acc = 0
 for cse in chromosome_sizes[ASSEMBLY]:
    chromosome_size_acc += cse['size']

 '''
 Build table of true samples to aggregate from score data per-chromosome
 '''
 samples = {}
 for cse in chromosome_sizes[ASSEMBLY]:
    chrom = 'chr{}'.format(cse['name'])
    samples_per_chromosome = math.floor((total_samples * cse['size']) / chromosome_size_acc)
    samples[chrom] = samples_per_chromosome

 '''
 For each chromosome, build a state vector from abs max of state scores per bin
 '''
 sv = {}
 for cse in chromosome_sizes[ASSEMBLY]:
    chrom = 'chr{}'.format(cse['name'])
    print(chrom)
    df = pd.read_csv(
        'scores.{}.txt.gz'.format(chrom), 
        compression='gzip', 
        delimiter='\t', 
        lineterminator='\n',
        header=None
    )
    
    # if the sum of a row is zero, we promote the last state 
    # as the max-scoring state in order to blank it out in 
    # rendering later on
    pd.set_option('mode.chained_assignment', None)
    df[df.iloc[:, 3:].sum(axis=1) == 0].iloc[:, -1] = sys.maxsize

    # IMPORTANT: generate zero-based state values
    df['agg'] = df.iloc[:, 3:].idxmax(axis=1) - 3
    agg = df['agg'].to_numpy()
    r = float(len(agg))
    n = samples[chrom]
    
    '''
    np.array_split can generate subarrays of unequal length if the split
    value does not divide the parent array length equally, so it is necessary
    to truncate such subarrays where they are longer than the desired length
    '''
    a = np.array_split(agg, n)
    d = int(math.floor(r / n))
    for i in range(len(a)):
        e = a[i]
        if len(e) > d:
            diff = len(e) - d
            a[i] = e[diff:]
        if len(e) < d:
            raise ValueError("Investigate subarrays further")
    mode_states_vector = stats.mode(a, axis=1)[0].reshape(-1)
    sv[chrom] = mode_states_vector

 now = datetime.now()

 sv['metadata'] = {
    "created": now.isoformat(),
    "datatype": DATATYPE,
    "publication": PUBLICATION,
    "assembly": ASSEMBLY,
    "states": STATES,
    "group": GROUP,
    "saliency": SALIENCY,
    "view_order": VIEW_ORDER,
    "sample_multiplier": SAMPLE_MULTIPLIER,
 }

 ofn = 'hilbert-genome.{}.{}.{}.{}.{}.{}.{}.{}.zip'.format(
    DATATYPE,
    PUBLICATION,
    ASSEMBLY,
    STATES,
    GROUP,
    SALIENCY,
    VIEW_ORDER,
    SAMPLE_MULTIPLIER
 )

 dfn = 'data.json'

 encoded_sv = json.dumps(sv, cls=NumpyArrayEncoder)

 archive = BytesIO()
 with zipfile.ZipFile(archive, 'w', zipfile.ZIP_DEFLATED) as ozah:
    f = zipfile.ZipInfo(dfn)
    ozah.writestr(
        f, 
        encoded_sv.encode('utf-8'), 
        compress_type=zipfile.ZIP_DEFLATED
    )
 with open(ofn, 'wb') as ofh:
    ofh.write(archive.getbuffer())
 archive.close()
	#!/usr/bin/env python

	import sys
	import math
	import pandas as pd
	import numpy as np
	from scipy import stats
	import json
	from json import JSONEncoder
	import zipfile
	import zlib
	from io import BytesIO
	from datetime import datetime

	class NumpyArrayEncoder(JSONEncoder):
	def default(self, obj):
	if isinstance(obj, np.ndarray):
	return obj.tolist()
	return JSONEncoder.default(self, obj)

	SAMPLE_MULTIPLIER = 8192
	VIEW_ORDER = int(sys.argv[1])
	DATATYPE = "epilogos"
	PUBLICATION = "Boix_et_al_833_sample"
	ASSEMBLY = "hg38"
	STATES = 18
	GROUP = "All_833_biosamples"
	SALIENCY = "S1"

	total_samples = VIEW_ORDER * SAMPLE_MULTIPLIER
	chromosome_sizes = { "hg38":
	[
	{
	"name": 1,
	"size": 248956422
	},
	{
	"name": 2,
	"size": 242193529
	},
	{
	"name": 3,
	"size": 198295559
	},
	{
	"name": 4,
	"size": 190214555
	},
	{
	"name": 5,
	"size": 181538259
	},
	{
	"name": 6,
	"size": 170805979
	},
	{
	"name": 7,
	"size": 159345973
	},
	{
	"name": 8,
	"size": 145138636
	},
	{
	"name": 9,
	"size": 138394717
	},
	{
	"name": 10,
	"size": 133797422
	},
	{
	"name": 11,
	"size": 135086622
	},
	{
	"name": 12,
	"size": 133275309
	},
	{
	"name": 13,
	"size": 114364328
	},
	{
	"name": 14,
	"size": 107043718
	},
	{
	"name": 15,
	"size": 101991189
	},
	{
	"name": 16,
	"size": 90338345
	},
	{
	"name": 17,
	"size": 83257441
	},
	{
	"name": 18,
	"size": 80373285
	},
	{
	"name": 19,
	"size": 58617616
	},
	{
	"name": 20,
	"size": 64444167
	},
	{
	"name": 21,
	"size": 46709983
	},
	{
	"name": 22,
	"size": 50818468
	},
	{
	"name": "X",
	"size": 156040895
	},
	{
	"name": "Y",
	"size": 57227415
	}
	]
	}

	'''
	Get total assembly length
	'''
	chromosome_size_acc = 0
	for cse in chromosome_sizes[ASSEMBLY]:
	chromosome_size_acc += cse['size']

	'''
	Build table of true samples to aggregate from score data per-chromosome
	'''
	samples = {}
	for cse in chromosome_sizes[ASSEMBLY]:
	chrom = 'chr{}'.format(cse['name'])
	samples_per_chromosome = math.floor((total_samples * cse['size']) / chromosome_size_acc)
	samples[chrom] = samples_per_chromosome

	'''
	For each chromosome, build a state vector from abs max of state scores per bin
	'''
	sv = {}
	for cse in chromosome_sizes[ASSEMBLY]:
	chrom = 'chr{}'.format(cse['name'])
	print(chrom)
	df = pd.read_csv(
	'scores.{}.txt.gz'.format(chrom),
	compression='gzip',
	delimiter='\t',
	lineterminator='\n',
	header=None
	)

	# if the sum of a row is zero, we promote the last state
	# as the max-scoring state in order to blank it out in
	# rendering later on
	pd.set_option('mode.chained_assignment', None)
	df[df.iloc[:, 3:].sum(axis=1) == 0].iloc[:, -1] = sys.maxsize

	# IMPORTANT: generate zero-based state values
	df['agg'] = df.iloc[:, 3:].idxmax(axis=1) - 3
	agg = df['agg'].to_numpy()
	r = float(len(agg))
	n = samples[chrom]

	'''
	np.array_split can generate subarrays of unequal length if the split
	value does not divide the parent array length equally, so it is necessary
	to truncate such subarrays where they are longer than the desired length
	'''
	a = np.array_split(agg, n)
	d = int(math.floor(r / n))
	for i in range(len(a)):
	e = a[i]
	if len(e) > d:
	diff = len(e) - d
	a[i] = e[diff:]
	if len(e) < d:
	raise ValueError("Investigate subarrays further")
	mode_states_vector = stats.mode(a, axis=1)[0].reshape(-1)
	sv[chrom] = mode_states_vector

	now = datetime.now()

	sv['metadata'] = {
	"created": now.isoformat(),
	"datatype": DATATYPE,
	"publication": PUBLICATION,
	"assembly": ASSEMBLY,
	"states": STATES,
	"group": GROUP,
	"saliency": SALIENCY,
	"view_order": VIEW_ORDER,
	"sample_multiplier": SAMPLE_MULTIPLIER,
	}

	ofn = 'hilbert-genome.{}.{}.{}.{}.{}.{}.{}.{}.zip'.format(
	DATATYPE,
	PUBLICATION,
	ASSEMBLY,
	STATES,
	GROUP,
	SALIENCY,
	VIEW_ORDER,
	SAMPLE_MULTIPLIER
	)

	dfn = 'data.json'

	encoded_sv = json.dumps(sv, cls=NumpyArrayEncoder)

	archive = BytesIO()
	with zipfile.ZipFile(archive, 'w', zipfile.ZIP_DEFLATED) as ozah:
	f = zipfile.ZipInfo(dfn)
	ozah.writestr(
	f,
	encoded_sv.encode('utf-8'),
	compress_type=zipfile.ZIP_DEFLATED
	)
	with open(ofn, 'wb') as ofh:
	ofh.write(archive.getbuffer())
	archive.close()