Last active
April 23, 2018 18:08
-
-
Save pkerpedjiev/188b1676496e0264c8fe0fa01572f7f6 to your computer and use it in GitHub Desktop.
streaming_loader
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"%load_ext autoreload\n", | |
"%autoreload 2" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-01-31T19:35:31.631090Z", | |
"start_time": "2018-01-31T19:35:31.138300Z" | |
}, | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import cooler\n", | |
"from cooler.io import ContactBinner, CoolerMerger\n", | |
"import pandas as pd\n", | |
"import numpy as np" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import bresenham" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-01-31T19:35:32.261309Z", | |
"start_time": "2018-01-31T19:35:32.256140Z" | |
}, | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"#matrix_file = '../98.out.gz'\n", | |
"matrix_file = '../short_brh_10.gz'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-01-31T19:35:33.197169Z", | |
"start_time": "2018-01-31T19:35:32.885505Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CM000663.2\t248946422\t0\t100000\t+\tCM000663.2\t248946422\t0\t100000\t99.9942\t87\t87\t197999\r\n", | |
"CM000663.2\t248946422\t100000\t200000\t+\tCM000663.2\t248946422\t100000\t200000\t99.9942\t87\t87\t197999\r\n", | |
"CM000663.2\t248946422\t200000\t300000\t+\tCM000663.2\t248946422\t200000\t300000\t99.9942\t87\t87\t197999\r\n", | |
"CM000663.2\t248946422\t100000\t200000\t+\tCM000663.2\t248946422\t0\t100000\t98.4586\t51\t80\t27999\r\n", | |
"CM000663.2\t248946422\t200000\t300000\t+\tCM000663.2\t248946422\t0\t100000\t98.4586\t51\t80\t27999\r\n", | |
"CM000663.2\t248946422\t600000\t700000\t+\tCM000663.2\t248946422\t0\t100000\t98.0156\t46\t87\t5999\r\n", | |
"CM000663.2\t248946422\t200000\t300000\t+\tCM000663.2\t248946422\t0\t100000\t98.9146\t77\t85\t32999\r\n", | |
"CM000663.2\t248946422\t200000\t300000\t+\tCM000663.2\t248946422\t100000\t200000\t98.9146\t77\t85\t32999\r\n", | |
"CM000663.2\t248946422\t223900000\t224000000\t+\tCM000663.2\t248946422\t0\t100000\t98.1158\t61\t86\t43999\r\n", | |
"CM000663.2\t248946422\t223900000\t224000000\t+\tCM000663.2\t248946422\t100000\t200000\t98.1158\t61\t86\t43999\r\n" | |
] | |
} | |
], | |
"source": [ | |
"!gzcat {matrix_file} | head" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-01-31T19:34:41.547674Z", | |
"start_time": "2018-01-31T19:33:01.642302Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" 17\r\n" | |
] | |
} | |
], | |
"source": [ | |
"!gzcat {matrix_file} | wc -l" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-01-31T19:35:34.909247Z", | |
"start_time": "2018-01-31T19:35:34.896192Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style>\n", | |
" .dataframe thead tr:only-child th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: left;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>chrom1</th>\n", | |
" <th>chrom1_len</th>\n", | |
" <th>start1</th>\n", | |
" <th>end1</th>\n", | |
" <th>strand</th>\n", | |
" <th>chrom2</th>\n", | |
" <th>chrom2_len</th>\n", | |
" <th>start2</th>\n", | |
" <th>end2</th>\n", | |
" <th>identity</th>\n", | |
" <th>foo</th>\n", | |
" <th>bar</th>\n", | |
" <th>count</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>CM000663.2</td>\n", | |
" <td>248946422</td>\n", | |
" <td>0</td>\n", | |
" <td>100000</td>\n", | |
" <td>+</td>\n", | |
" <td>CM000663.2</td>\n", | |
" <td>248946422</td>\n", | |
" <td>0</td>\n", | |
" <td>100000</td>\n", | |
" <td>99.9942</td>\n", | |
" <td>87</td>\n", | |
" <td>87</td>\n", | |
" <td>197999</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>CM000663.2</td>\n", | |
" <td>248946422</td>\n", | |
" <td>100000</td>\n", | |
" <td>200000</td>\n", | |
" <td>+</td>\n", | |
" <td>CM000663.2</td>\n", | |
" <td>248946422</td>\n", | |
" <td>100000</td>\n", | |
" <td>200000</td>\n", | |
" <td>99.9942</td>\n", | |
" <td>87</td>\n", | |
" <td>87</td>\n", | |
" <td>197999</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>CM000663.2</td>\n", | |
" <td>248946422</td>\n", | |
" <td>200000</td>\n", | |
" <td>300000</td>\n", | |
" <td>+</td>\n", | |
" <td>CM000663.2</td>\n", | |
" <td>248946422</td>\n", | |
" <td>200000</td>\n", | |
" <td>300000</td>\n", | |
" <td>99.9942</td>\n", | |
" <td>87</td>\n", | |
" <td>87</td>\n", | |
" <td>197999</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>CM000663.2</td>\n", | |
" <td>248946422</td>\n", | |
" <td>100000</td>\n", | |
" <td>200000</td>\n", | |
" <td>+</td>\n", | |
" <td>CM000663.2</td>\n", | |
" <td>248946422</td>\n", | |
" <td>0</td>\n", | |
" <td>100000</td>\n", | |
" <td>98.4586</td>\n", | |
" <td>51</td>\n", | |
" <td>80</td>\n", | |
" <td>27999</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>CM000663.2</td>\n", | |
" <td>248946422</td>\n", | |
" <td>200000</td>\n", | |
" <td>300000</td>\n", | |
" <td>+</td>\n", | |
" <td>CM000663.2</td>\n", | |
" <td>248946422</td>\n", | |
" <td>0</td>\n", | |
" <td>100000</td>\n", | |
" <td>98.4586</td>\n", | |
" <td>51</td>\n", | |
" <td>80</td>\n", | |
" <td>27999</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>CM000663.2</td>\n", | |
" <td>248946422</td>\n", | |
" <td>600000</td>\n", | |
" <td>700000</td>\n", | |
" <td>+</td>\n", | |
" <td>CM000663.2</td>\n", | |
" <td>248946422</td>\n", | |
" <td>0</td>\n", | |
" <td>100000</td>\n", | |
" <td>98.0156</td>\n", | |
" <td>46</td>\n", | |
" <td>87</td>\n", | |
" <td>5999</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>CM000663.2</td>\n", | |
" <td>248946422</td>\n", | |
" <td>200000</td>\n", | |
" <td>300000</td>\n", | |
" <td>+</td>\n", | |
" <td>CM000663.2</td>\n", | |
" <td>248946422</td>\n", | |
" <td>0</td>\n", | |
" <td>100000</td>\n", | |
" <td>98.9146</td>\n", | |
" <td>77</td>\n", | |
" <td>85</td>\n", | |
" <td>32999</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>CM000663.2</td>\n", | |
" <td>248946422</td>\n", | |
" <td>200000</td>\n", | |
" <td>300000</td>\n", | |
" <td>+</td>\n", | |
" <td>CM000663.2</td>\n", | |
" <td>248946422</td>\n", | |
" <td>100000</td>\n", | |
" <td>200000</td>\n", | |
" <td>98.9146</td>\n", | |
" <td>77</td>\n", | |
" <td>85</td>\n", | |
" <td>32999</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>CM000663.2</td>\n", | |
" <td>248946422</td>\n", | |
" <td>223900000</td>\n", | |
" <td>224000000</td>\n", | |
" <td>+</td>\n", | |
" <td>CM000663.2</td>\n", | |
" <td>248946422</td>\n", | |
" <td>0</td>\n", | |
" <td>100000</td>\n", | |
" <td>98.1158</td>\n", | |
" <td>61</td>\n", | |
" <td>86</td>\n", | |
" <td>43999</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>CM000663.2</td>\n", | |
" <td>248946422</td>\n", | |
" <td>223900000</td>\n", | |
" <td>224000000</td>\n", | |
" <td>+</td>\n", | |
" <td>CM000663.2</td>\n", | |
" <td>248946422</td>\n", | |
" <td>100000</td>\n", | |
" <td>200000</td>\n", | |
" <td>98.1158</td>\n", | |
" <td>61</td>\n", | |
" <td>86</td>\n", | |
" <td>43999</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" chrom1 chrom1_len start1 end1 strand chrom2 \\\n", | |
"0 CM000663.2 248946422 0 100000 + CM000663.2 \n", | |
"1 CM000663.2 248946422 100000 200000 + CM000663.2 \n", | |
"2 CM000663.2 248946422 200000 300000 + CM000663.2 \n", | |
"3 CM000663.2 248946422 100000 200000 + CM000663.2 \n", | |
"4 CM000663.2 248946422 200000 300000 + CM000663.2 \n", | |
"5 CM000663.2 248946422 600000 700000 + CM000663.2 \n", | |
"6 CM000663.2 248946422 200000 300000 + CM000663.2 \n", | |
"7 CM000663.2 248946422 200000 300000 + CM000663.2 \n", | |
"8 CM000663.2 248946422 223900000 224000000 + CM000663.2 \n", | |
"9 CM000663.2 248946422 223900000 224000000 + CM000663.2 \n", | |
"\n", | |
" chrom2_len start2 end2 identity foo bar count \n", | |
"0 248946422 0 100000 99.9942 87 87 197999 \n", | |
"1 248946422 100000 200000 99.9942 87 87 197999 \n", | |
"2 248946422 200000 300000 99.9942 87 87 197999 \n", | |
"3 248946422 0 100000 98.4586 51 80 27999 \n", | |
"4 248946422 0 100000 98.4586 51 80 27999 \n", | |
"5 248946422 0 100000 98.0156 46 87 5999 \n", | |
"6 248946422 0 100000 98.9146 77 85 32999 \n", | |
"7 248946422 100000 200000 98.9146 77 85 32999 \n", | |
"8 248946422 0 100000 98.1158 61 86 43999 \n", | |
"9 248946422 100000 200000 98.1158 61 86 43999 " | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"fields = [('chrom1', object),\n", | |
" ('chrom1_len', int), \n", | |
" ('start1', int),\n", | |
" ('end1', int),\n", | |
" ('strand', object),\n", | |
" ('chrom2', object),\n", | |
" ('chrom2_len', int),\n", | |
" ('start2', int),\n", | |
" ('end2', int),\n", | |
" ('identity', float),\n", | |
" ('foo', int),\n", | |
" ('bar', int),\n", | |
" ('count', int)]\n", | |
"\n", | |
"reader = pd.read_csv(matrix_file, iterator=True, sep='\\t', \n", | |
" names=[f[0] for f in fields],\n", | |
" dtype=dict(fields))\n", | |
"reader.read(10)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 54, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-01-31T19:36:36.833388Z", | |
"start_time": "2018-01-31T19:36:36.523384Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"fields: [('chrom1', <class 'object'>), ('chrom1_len', <class 'int'>), ('start1', <class 'int'>), ('end1', <class 'int'>), ('strand', <class 'object'>), ('chrom2', <class 'object'>), ('chrom2_len', <class 'int'>), ('start2', <class 'int'>), ('end2', <class 'int'>), ('identity', <class 'float'>), ('foo', <class 'int'>), ('bar', <class 'int'>), ('count', <class 'int'>)]\n" | |
] | |
} | |
], | |
"source": [ | |
"from cooler._binning import GenomeSegmentation\n", | |
"from collections import OrderedDict\n", | |
"import bresenham as brh\n", | |
"import functools as ft\n", | |
"import tempfile\n", | |
"import os\n", | |
"\n", | |
"\n", | |
"# add field number to the list of fields\n", | |
"'''\n", | |
"fields_and_nums = [tuple(list(f[:2)] + [i]) \n", | |
" for i, f in enumerate(fields)]\n", | |
"'''\n", | |
"print(\"fields:\", fields)\n", | |
"\n", | |
"\n", | |
"def load_contact_matrix(inpath, outpath, bins, chunksize, binsize, count_as_float=False, \n", | |
" read_options=None, sep='\\t', **kwargs):\n", | |
" \"\"\"\n", | |
" Load any flat contact matrix file into a cooler.\n", | |
" \n", | |
" Parameters\n", | |
" ----------\n", | |
" filepath : str\n", | |
" Path to the file\n", | |
" outpath : str\n", | |
" Cooler URI\n", | |
" bins : data frame\n", | |
" bin table\n", | |
" chunksize : int\n", | |
" Number of lines to process to generate each temp file\n", | |
" \"\"\"\n", | |
" if read_options is None:\n", | |
" read_options = {}\n", | |
" \n", | |
" print('dtypes', [np.dtype(f[1]) for f in fields])\n", | |
" \n", | |
" reader = pd.read_csv(\n", | |
" inpath, \n", | |
" iterator=True, \n", | |
" names=[f[0] for f in fields],\n", | |
" dtype=dict(fields),\n", | |
" sep=sep,\n", | |
" **read_options)\n", | |
" \n", | |
" reader.chunksize = chunksize\n", | |
" gs = GenomeSegmentation(chromsizes, bins)\n", | |
" bins =gs.bins.copy()\n", | |
" bins['chrom'] = bins['chrom'].astype(object)\n", | |
" bins['bin'] = bins.index\n", | |
" out_columns = ['bin1_id', 'bin2_id', 'count' ]\n", | |
" binsize = binsize\n", | |
" \n", | |
" temp_files = []\n", | |
" for df in reader:\n", | |
" # check for flips XXX - TODO: just flip everything\n", | |
" idmap = gs.idmap\n", | |
" chrom1_ids = df['chrom1'].apply(idmap.__getitem__)\n", | |
" chrom2_ids = df['chrom2'].apply(idmap.__getitem__)\n", | |
" assert np.all(chrom1_ids <= chrom2_ids)\n", | |
" \n", | |
" # print(\"df.head:\", df.head())\n", | |
" print(\"bins:\", bins.head())\n", | |
"\n", | |
" # assign bin IDs from bin table\n", | |
" df = (df.merge(bins, \n", | |
" left_on=['chrom1', 'start1', 'end1'], \n", | |
" right_on=['chrom', 'start', 'end'])\n", | |
" .merge(bins, \n", | |
" left_on=['chrom2', 'start2', 'end2'], \n", | |
" right_on=['chrom', 'start', 'end'], \n", | |
" suffixes=('1', '2'))\n", | |
" .rename(columns={'bin1': 'bin1_id', \n", | |
" 'bin2': 'bin2_id'}))\n", | |
" \n", | |
" # ensure output is sorted\n", | |
" df = (df[out_columns]\n", | |
" .sort_values(['bin1_id', 'bin2_id']))\n", | |
" \n", | |
" tf = tempfile.NamedTemporaryFile(suffix='.cool')\n", | |
" temp_files.append(tf)\n", | |
"\n", | |
" print('Creating', tf.name)\n", | |
" #pixels = StreamingLoader(bins, reader, chunksize=chunksize, binsize=binsize)\n", | |
" print(\"pixels\")\n", | |
" cooler.io.create(tf.name, bins, df)\n", | |
" print(\"flal\")\n", | |
"\n", | |
"\n", | |
" print('Merging...')\n", | |
" pixels = CoolerMerger([cooler.Cooler(tf.name) for tf in temp_files], chunksize)\n", | |
" cooler.io.create(outpath, bins, pixels, **kwargs)\n", | |
"\n", | |
" #print('Deleting temp files...')\n", | |
" #for tf in temp_files:\n", | |
" # os.remove(tf.name)\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 55, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"binsize = 100000\n", | |
"#chromsizes = cooler.util.fetch_chromsizes('mm9')\n", | |
"#print(\"chromsizes\", chromsizes)\n", | |
"chromsizes = pd.read_table('../chromInfo_chr1.txt', delimiter='\\t', \n", | |
" names=['chromname', 'length'],\n", | |
" dtype=dict([(\"chromname\", object), ('length', int)]),\n", | |
" index_col='chromname')['length']\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 56, | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2018-01-31T19:36:37.085Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"dtypes [dtype('O'), dtype('int64'), dtype('int64'), dtype('int64'), dtype('O'), dtype('O'), dtype('int64'), dtype('int64'), dtype('int64'), dtype('float64'), dtype('int64'), dtype('int64'), dtype('int64')]\n", | |
"bins: chrom start end bin\n", | |
"0 CM000663.2 0 100000 0\n", | |
"1 CM000663.2 100000 200000 1\n", | |
"2 CM000663.2 200000 300000 2\n", | |
"3 CM000663.2 300000 400000 3\n", | |
"4 CM000663.2 400000 500000 4\n", | |
"Creating /var/folders/yb/3jwdnlf17gd1dcq65th786d00000gn/T/tmpgmq5f_ne.cool\n", | |
"pixels\n", | |
"creating\n", | |
"flal\n", | |
"Merging...\n", | |
"creating\n" | |
] | |
} | |
], | |
"source": [ | |
"\n", | |
"bins = cooler.binnify(chromsizes, binsize)\n", | |
"load_contact_matrix(matrix_file, 'out.cool', bins, \n", | |
" count_as_float=False, \n", | |
" chunksize=50000000, binsize=binsize, sep='\\t',\n", | |
" aggregate=\"max\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"a = pd.DataFrame({\"a\": [1,2]})\n", | |
"a.apply(lambda x: pd.DataFrame({\"x\": [x['a']+1,x['a']+2]}), axis=1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"gist": { | |
"data": { | |
"description": "streaming_loader.ipynb", | |
"public": false | |
}, | |
"id": "" | |
}, | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.2" | |
}, | |
"toc": { | |
"nav_menu": {}, | |
"number_sections": true, | |
"sideBar": true, | |
"skip_h1_title": false, | |
"toc_cell": false, | |
"toc_position": {}, | |
"toc_section_display": "block", | |
"toc_window_display": false | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment