Skip to content

Instantly share code, notes, and snippets.

@nvictus
Last active April 7, 2017 18:17
Show Gist options
  • Save nvictus/c39b0704db99d12dc1c11b08ded3063a to your computer and use it in GitHub Desktop.
Save nvictus/c39b0704db99d12dc1c11b08ded3063a to your computer and use it in GitHub Desktop.
cooler-from-sparse-text.ipynb
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-04-07T14:17:23.334971",
"end_time": "2017-04-07T14:17:23.654004"
},
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "import six\nimport numpy as np\nimport pandas\nimport h5py\nimport cooler",
"execution_count": 1,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-04-07T14:17:23.655156",
"end_time": "2017-04-07T14:17:23.766220"
},
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "# The input must be sorted by chrom1, start1, chrom2, start2.\n# Within each record the first bin must not come after the second (\"upper triangular\"),\n# judged by the chromosome order listed in the next cell.\n!head test.1000kb.tsv",
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"text": "chr1\t0\t1000000\tchr1\t0\t1000000\t2\r\nchr1\t0\t1000000\tchr1\t18000000\t19000000\t1\r\nchr1\t0\t1000000\tchr1\t24000000\t25000000\t1\r\nchr1\t0\t1000000\tchr1\t38000000\t39000000\t1\r\nchr1\t0\t1000000\tchr1\t78000000\t79000000\t1\r\nchr1\t0\t1000000\tchr1\t91000000\t92000000\t1\r\nchr1\t0\t1000000\tchr2\t217000000\t218000000\t1\r\nchr1\t0\t1000000\tchr2\t236000000\t237000000\t1\r\nchr1\t0\t1000000\tchr3\t197000000\t198000000\t1\r\nchr1\t0\t1000000\tchr4\t44000000\t45000000\t1\r\n",
"name": "stdout"
}
]
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-04-07T14:17:23.767941",
"end_time": "2017-04-07T14:17:23.879118"
},
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "# Chromosome names and order are important\n# The order will be respected and chromosomes not specified will be ignored\n!cat hg19.chrom.sizes.select",
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"text": "chr1\t249250621\r\nchr2\t243199373\r\nchr3\t198022430\r\nchr4\t191154276\r\nchr5\t180915260\r\nchr6\t171115067\r\nchr7\t159138663\r\nchr8\t146364022\r\nchr9\t141213431\r\nchr10\t135534747\r\nchr11\t135006516\r\nchr12\t133851895\r\nchr13\t115169878\r\nchr14\t107349540\r\nchr15\t102531392\r\nchr16\t90354753\r\nchr17\t81195210\r\nchr18\t78077248\r\nchr19\t59128983\r\nchr20\t63025520\r\nchr21\t48129895\r\nchr22\t51304566\r\nchrX\t155270560\r\nchrY\t59373566\r\nchrM\t16571\r\n",
"name": "stdout"
}
]
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-04-07T14:17:23.880918",
"end_time": "2017-04-07T14:17:23.892032"
},
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "def iter_sparse(filepath, bins, chunksize):\n    \"\"\"\n    Contact iterator for a sparse tsv Hi-C matrix with fields:\n    \"chrom1, start1, end1, chrom2, start2, end2, count\"\n\n    The file has no header row. The fields are assumed to be defined and\n    records assumed to be sorted consistently with the bin table provided.\n    Records whose bins are not found in the bin table are dropped.\n\n    Parameters\n    ----------\n    filepath : str\n        Path to tsv file\n    bins : DataFrame\n        A bin table dataframe with 'chrom', 'start' and 'end' columns;\n        its index supplies the bin IDs. It is not modified.\n    chunksize : int\n        Number of rows of the matrix file to read at a time\n\n    Yields\n    ------\n    dict\n        'bin1_id', 'bin2_id' and 'count' arrays for one chunk, sorted\n        by (bin1_id, bin2_id).\n\n    \"\"\"\n    # chunksize has to reach read_csv; otherwise the reader is not chunked\n    # and the first iteration loads the whole file.\n    reader = pandas.read_csv(\n        filepath, sep='\\t', chunksize=chunksize,\n        names=['chrom1', 'start1', 'end1',\n               'chrom2', 'start2', 'end2', 'count'],\n        # keep chromosome names as text even if they look numeric\n        dtype={'chrom1': str, 'chrom2': str})\n\n    # Work on a copy so the caller's bin table does not gain a 'bin_id'\n    # column as a side effect.\n    lookup = bins[['chrom', 'start', 'end']].copy()\n    lookup['chrom'] = lookup['chrom'].astype(str)\n    lookup['bin_id'] = lookup.index\n\n    # One pre-renamed lookup per side: merging the same table twice with\n    # suffixes ('1', '2') yields names that collide with the chunk's own\n    # chrom1/start1/end1 columns.\n    side1 = lookup.rename(columns={'chrom': 'chrom1', 'start': 'start1',\n                                   'end': 'end1', 'bin_id': 'bin1_id'})\n    side2 = lookup.rename(columns={'chrom': 'chrom2', 'start': 'start2',\n                                   'end': 'end2', 'bin_id': 'bin2_id'})\n\n    for chunk in reader:\n        # assign bin IDs from bin table\n        df = (chunk.merge(side1, on=['chrom1', 'start1', 'end1'])\n                   .merge(side2, on=['chrom2', 'start2', 'end2'])\n                   .sort_values(['bin1_id', 'bin2_id']))\n        yield {name: df[name].values\n               for name in ('bin1_id', 'bin2_id', 'count')}",
"execution_count": 4,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-04-07T14:17:23.893270",
"end_time": "2017-04-07T14:17:24.143257"
},
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "# Bin width in base pairs (1 Mb)\nbinsize = 1000000\n\n# all_names=True: keep every chromosome listed in the file (see cooler docs)\nchromsizes = cooler.read_chromsizes(\n    'hg19.chrom.sizes.select', all_names=True)\nbins = cooler.binnify(chromsizes, binsize)\n\n# Number of text records to read per chunk (10 million)\nchunksize = int(10e6)\n\npixels = iter_sparse('test.1000kb.tsv', bins, chunksize)\ncooler.io.create(\n    'test2.1000kb.cool', chromsizes, bins, pixels,\n    assembly='hg19')",
"execution_count": 5,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-04-07T14:17:24.144512",
"end_time": "2017-04-07T14:17:25.068230"
},
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "!cooler dump -t pixels --join test2.1000kb.cool | head",
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"text": "chr1\t0\t1000000\tchr1\t0\t1000000\t2\r\nchr1\t0\t1000000\tchr1\t18000000\t19000000\t1\r\nchr1\t0\t1000000\tchr1\t24000000\t25000000\t1\r\nchr1\t0\t1000000\tchr1\t38000000\t39000000\t1\r\nchr1\t0\t1000000\tchr1\t78000000\t79000000\t1\r\nchr1\t0\t1000000\tchr1\t91000000\t92000000\t1\r\nchr1\t0\t1000000\tchr2\t217000000\t218000000\t1\r\nchr1\t0\t1000000\tchr2\t236000000\t237000000\t1\r\nchr1\t0\t1000000\tchr3\t197000000\t198000000\t1\r\nchr1\t0\t1000000\tchr4\t44000000\t45000000\t1\r\n",
"name": "stdout"
}
]
},
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "",
"execution_count": null,
"outputs": []
}
],
"metadata": {
"kernelspec": {
"name": "python3",
"display_name": "Python 3",
"language": "python"
},
"language_info": {
"file_extension": ".py",
"pygments_lexer": "ipython3",
"codemirror_mode": {
"version": 3,
"name": "ipython"
},
"mimetype": "text/x-python",
"version": "3.5.2",
"nbconvert_exporter": "python",
"name": "python"
},
"toc": {
"toc_threshold": 6,
"toc_number_sections": true,
"toc_cell": false,
"toc_window_display": false
},
"gist": {
"id": "c39b0704db99d12dc1c11b08ded3063a",
"data": {
"description": "cooler-from-sparse-text.ipynb",
"public": false
}
},
"_draft": {
"nbviewer_url": "https://gist.github.com/c39b0704db99d12dc1c11b08ded3063a"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment