nvictus · April 7, 2017 18:17
diff --git a/cooler-from-sparse-text.ipynb b/cooler-from-sparse-text.ipynb
 {
  "cells": [
    {
      "metadata": {
        "ExecuteTime": {
          "start_time": "2017-04-07T14:17:23.334971",
          "end_time": "2017-04-07T14:17:23.654004"
        },
        "trusted": true,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "import six\nimport numpy as np\nimport pandas\nimport h5py\nimport cooler",
      "execution_count": 1,
      "outputs": []
    },
    {
      "metadata": {
        "ExecuteTime": {
          "start_time": "2017-04-07T14:17:23.655156",
          "end_time": "2017-04-07T14:17:23.766220"
        },
        "trusted": true,
        "collapsed": false
      },
      "cell_type": "code",
      "source": "# Must be sorted by chrom1, start1, chrom2, start2\n# Order of bins in each pixel must be \"upper triangular\" in agreement with the chromosome order below\n!head test.1000kb.tsv",
      "execution_count": 2,
      "outputs": [
        {
          "output_type": "stream",
          "text": "chr1\t0\t1000000\tchr1\t0\t1000000\t2\r\nchr1\t0\t1000000\tchr1\t18000000\t19000000\t1\r\nchr1\t0\t1000000\tchr1\t24000000\t25000000\t1\r\nchr1\t0\t1000000\tchr1\t38000000\t39000000\t1\r\nchr1\t0\t1000000\tchr1\t78000000\t79000000\t1\r\nchr1\t0\t1000000\tchr1\t91000000\t92000000\t1\r\nchr1\t0\t1000000\tchr2\t217000000\t218000000\t1\r\nchr1\t0\t1000000\tchr2\t236000000\t237000000\t1\r\nchr1\t0\t1000000\tchr3\t197000000\t198000000\t1\r\nchr1\t0\t1000000\tchr4\t44000000\t45000000\t1\r\n",
          "name": "stdout"
        }
      ]
    },
    {
      "metadata": {
        "ExecuteTime": {
          "start_time": "2017-04-07T14:17:23.767941",
          "end_time": "2017-04-07T14:17:23.879118"
        },
        "trusted": true,
        "collapsed": false
      },
      "cell_type": "code",
      "source": "# Chromosome names and order are important\n# The order will be respected and chromosomes not specified will be ignored\n!cat hg19.chrom.sizes.select",
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "stream",
          "text": "chr1\t249250621\r\nchr2\t243199373\r\nchr3\t198022430\r\nchr4\t191154276\r\nchr5\t180915260\r\nchr6\t171115067\r\nchr7\t159138663\r\nchr8\t146364022\r\nchr9\t141213431\r\nchr10\t135534747\r\nchr11\t135006516\r\nchr12\t133851895\r\nchr13\t115169878\r\nchr14\t107349540\r\nchr15\t102531392\r\nchr16\t90354753\r\nchr17\t81195210\r\nchr18\t78077248\r\nchr19\t59128983\r\nchr20\t63025520\r\nchr21\t48129895\r\nchr22\t51304566\r\nchrX\t155270560\r\nchrY\t59373566\r\nchrM\t16571\r\n",
          "name": "stdout"
        }
      ]
    },
    {
      "metadata": {
        "ExecuteTime": {
          "start_time": "2017-04-07T14:17:23.880918",
          "end_time": "2017-04-07T14:17:23.892032"
        },
        "trusted": true,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "def iter_sparse(filepath, bins, chunksize):\n    \"\"\"\n    Contact iterator for a sparse tsv Hi-C matrix with fields:\n        \"chrom1, start1, end1, chrom2, start2, end2, count\"\n    \n    The fields are assumed to be defined and records assumed to \n    be sorted consistently with the bin table provided.\n    \n    Parameters\n    ----------\n    filepath : str\n        Path to tsv file\n    bins : DataFrame\n        A bin table dataframe\n    chunksize : number of rows of the matrix file to read at a time\n    \n    \"\"\"\n    iterator = pandas.read_csv(filepath, sep='\\t', iterator=True, \n                               names=['chrom1', 'start1', 'end1', \n                                      'chrom2', 'start2', 'end2', 'count'])\n    bins['bin_id'] = bins.index\n    \n    for chunk in iterator:\n        # assign bin IDs from bin table\n        df = (chunk.merge(bins, \n                          left_on=['chrom1', 'start1', 'end1'], \n                          right_on=['chrom', 'start', 'end'])\n                   .merge(bins, \n                          left_on=['chrom2', 'start2', 'end2'], \n                          right_on=['chrom', 'start', 'end'], \n                          suffixes=('1', '2')))\n        df = (df[['bin_id1', 'bin_id2', 'count']]\n                  .rename(columns={'bin_id1': 'bin1_id', \n                                   'bin_id2': 'bin2_id'})\n                  .sort_values(['bin1_id', 'bin2_id']))\n        yield {k: v.values for k,v in six.iteritems(df)}",
      "execution_count": 4,
      "outputs": []
    },
    {
      "metadata": {
        "ExecuteTime": {
          "start_time": "2017-04-07T14:17:23.893270",
          "end_time": "2017-04-07T14:17:24.143257"
        },
        "trusted": true,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "binsize = 1000000\nchromsizes = cooler.read_chromsizes('hg19.chrom.sizes.select', all_names=True)\nbins = cooler.binnify(chromsizes, binsize)\nchunksize = int(10e6)\ncooler.io.create('test2.1000kb.cool', chromsizes, bins, iter_sparse('test.1000kb.tsv', bins, chunksize), assembly='hg19')",
      "execution_count": 5,
      "outputs": []
    },
    {
      "metadata": {
        "ExecuteTime": {
          "start_time": "2017-04-07T14:17:24.144512",
          "end_time": "2017-04-07T14:17:25.068230"
        },
        "trusted": true,
        "collapsed": false
      },
      "cell_type": "code",
      "source": "!cooler dump -t pixels --join test2.1000kb.cool | head",
      "execution_count": 6,
      "outputs": [
        {
          "output_type": "stream",
          "text": "chr1\t0\t1000000\tchr1\t0\t1000000\t2\r\nchr1\t0\t1000000\tchr1\t18000000\t19000000\t1\r\nchr1\t0\t1000000\tchr1\t24000000\t25000000\t1\r\nchr1\t0\t1000000\tchr1\t38000000\t39000000\t1\r\nchr1\t0\t1000000\tchr1\t78000000\t79000000\t1\r\nchr1\t0\t1000000\tchr1\t91000000\t92000000\t1\r\nchr1\t0\t1000000\tchr2\t217000000\t218000000\t1\r\nchr1\t0\t1000000\tchr2\t236000000\t237000000\t1\r\nchr1\t0\t1000000\tchr3\t197000000\t198000000\t1\r\nchr1\t0\t1000000\tchr4\t44000000\t45000000\t1\r\n",
          "name": "stdout"
        }
      ]
    },
    {
      "metadata": {
        "trusted": true,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "",
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3",
      "language": "python"
    },
    "language_info": {
      "file_extension": ".py",
      "pygments_lexer": "ipython3",
      "codemirror_mode": {
        "version": 3,
        "name": "ipython"
      },
      "mimetype": "text/x-python",
      "version": "3.5.2",
      "nbconvert_exporter": "python",
      "name": "python"
    },
    "toc": {
      "toc_threshold": 6,
      "toc_number_sections": true,
      "toc_cell": false,
      "toc_window_display": false
    },
    "gist": {
      "id": "c39b0704db99d12dc1c11b08ded3063a",
      "data": {
        "description": "cooler-from-sparse-text.ipynb",
        "public": false
      }
    },
    "_draft": {
      "nbviewer_url": "https://gist.github.com/c39b0704db99d12dc1c11b08ded3063a"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
 }
	{
	"cells": [
	{
	"metadata": {
	"ExecuteTime": {
	"start_time": "2017-04-07T14:17:23.334971",
	"end_time": "2017-04-07T14:17:23.654004"
	},
	"trusted": true,
	"collapsed": true
	},
	"cell_type": "code",
	"source": "import six\nimport numpy as np\nimport pandas\nimport h5py\nimport cooler",
	"execution_count": 1,
	"outputs": []
	},
	{
	"metadata": {
	"ExecuteTime": {
	"start_time": "2017-04-07T14:17:23.655156",
	"end_time": "2017-04-07T14:17:23.766220"
	},
	"trusted": true,
	"collapsed": false
	},
	"cell_type": "code",
	"source": "# Must be sorted by chrom1, start1, chrom2, start2\n# Order of bins in each pixel must be \"upper triangular\" in agreement with the chromosome order below\n!head test.1000kb.tsv",
	"execution_count": 2,
	"outputs": [
	{
	"output_type": "stream",
	"text": "chr1\t0\t1000000\tchr1\t0\t1000000\t2\r\nchr1\t0\t1000000\tchr1\t18000000\t19000000\t1\r\nchr1\t0\t1000000\tchr1\t24000000\t25000000\t1\r\nchr1\t0\t1000000\tchr1\t38000000\t39000000\t1\r\nchr1\t0\t1000000\tchr1\t78000000\t79000000\t1\r\nchr1\t0\t1000000\tchr1\t91000000\t92000000\t1\r\nchr1\t0\t1000000\tchr2\t217000000\t218000000\t1\r\nchr1\t0\t1000000\tchr2\t236000000\t237000000\t1\r\nchr1\t0\t1000000\tchr3\t197000000\t198000000\t1\r\nchr1\t0\t1000000\tchr4\t44000000\t45000000\t1\r\n",
	"name": "stdout"
	}
	]
	},
	{
	"metadata": {
	"ExecuteTime": {
	"start_time": "2017-04-07T14:17:23.767941",
	"end_time": "2017-04-07T14:17:23.879118"
	},
	"trusted": true,
	"collapsed": false
	},
	"cell_type": "code",
	"source": "# Chromosome names and order are important\n# The order will be respected and chromosomes not specified will be ignored\n!cat hg19.chrom.sizes.select",
	"execution_count": 3,
	"outputs": [
	{
	"output_type": "stream",
	"text": "chr1\t249250621\r\nchr2\t243199373\r\nchr3\t198022430\r\nchr4\t191154276\r\nchr5\t180915260\r\nchr6\t171115067\r\nchr7\t159138663\r\nchr8\t146364022\r\nchr9\t141213431\r\nchr10\t135534747\r\nchr11\t135006516\r\nchr12\t133851895\r\nchr13\t115169878\r\nchr14\t107349540\r\nchr15\t102531392\r\nchr16\t90354753\r\nchr17\t81195210\r\nchr18\t78077248\r\nchr19\t59128983\r\nchr20\t63025520\r\nchr21\t48129895\r\nchr22\t51304566\r\nchrX\t155270560\r\nchrY\t59373566\r\nchrM\t16571\r\n",
	"name": "stdout"
	}
	]
	},
	{
	"metadata": {
	"ExecuteTime": {
	"start_time": "2017-04-07T14:17:23.880918",
	"end_time": "2017-04-07T14:17:23.892032"
	},
	"trusted": true,
	"collapsed": true
	},
	"cell_type": "code",
	"source": "def iter_sparse(filepath, bins, chunksize):\n \"\"\"\n Contact iterator for a sparse tsv Hi-C matrix with fields:\n \"chrom1, start1, end1, chrom2, start2, end2, count\"\n \n The fields are assumed to be defined and records assumed to \n be sorted consistently with the bin table provided.\n \n Parameters\n ----------\n filepath : str\n Path to tsv file\n bins : DataFrame\n A bin table dataframe\n chunksize : number of rows of the matrix file to read at a time\n \n \"\"\"\n iterator = pandas.read_csv(filepath, sep='\\t', iterator=True, \n names=['chrom1', 'start1', 'end1', \n 'chrom2', 'start2', 'end2', 'count'])\n bins['bin_id'] = bins.index\n \n for chunk in iterator:\n # assign bin IDs from bin table\n df = (chunk.merge(bins, \n left_on=['chrom1', 'start1', 'end1'], \n right_on=['chrom', 'start', 'end'])\n .merge(bins, \n left_on=['chrom2', 'start2', 'end2'], \n right_on=['chrom', 'start', 'end'], \n suffixes=('1', '2')))\n df = (df[['bin_id1', 'bin_id2', 'count']]\n .rename(columns={'bin_id1': 'bin1_id', \n 'bin_id2': 'bin2_id'})\n .sort_values(['bin1_id', 'bin2_id']))\n yield {k: v.values for k,v in six.iteritems(df)}",
	"execution_count": 4,
	"outputs": []
	},
	{
	"metadata": {
	"ExecuteTime": {
	"start_time": "2017-04-07T14:17:23.893270",
	"end_time": "2017-04-07T14:17:24.143257"
	},
	"trusted": true,
	"collapsed": true
	},
	"cell_type": "code",
	"source": "binsize = 1000000\nchromsizes = cooler.read_chromsizes('hg19.chrom.sizes.select', all_names=True)\nbins = cooler.binnify(chromsizes, binsize)\nchunksize = int(10e6)\ncooler.io.create('test2.1000kb.cool', chromsizes, bins, iter_sparse('test.1000kb.tsv', bins, chunksize), assembly='hg19')",
	"execution_count": 5,
	"outputs": []
	},
	{
	"metadata": {
	"ExecuteTime": {
	"start_time": "2017-04-07T14:17:24.144512",
	"end_time": "2017-04-07T14:17:25.068230"
	},
	"trusted": true,
	"collapsed": false
	},
	"cell_type": "code",
	"source": "!cooler dump -t pixels --join test2.1000kb.cool \| head",
	"execution_count": 6,
	"outputs": [
	{
	"output_type": "stream",
	"text": "chr1\t0\t1000000\tchr1\t0\t1000000\t2\r\nchr1\t0\t1000000\tchr1\t18000000\t19000000\t1\r\nchr1\t0\t1000000\tchr1\t24000000\t25000000\t1\r\nchr1\t0\t1000000\tchr1\t38000000\t39000000\t1\r\nchr1\t0\t1000000\tchr1\t78000000\t79000000\t1\r\nchr1\t0\t1000000\tchr1\t91000000\t92000000\t1\r\nchr1\t0\t1000000\tchr2\t217000000\t218000000\t1\r\nchr1\t0\t1000000\tchr2\t236000000\t237000000\t1\r\nchr1\t0\t1000000\tchr3\t197000000\t198000000\t1\r\nchr1\t0\t1000000\tchr4\t44000000\t45000000\t1\r\n",
	"name": "stdout"
	}
	]
	},
	{
	"metadata": {
	"trusted": true,
	"collapsed": true
	},
	"cell_type": "code",
	"source": "",
	"execution_count": null,
	"outputs": []
	}
	],
	"metadata": {
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3",
	"language": "python"
	},
	"language_info": {
	"file_extension": ".py",
	"pygments_lexer": "ipython3",
	"codemirror_mode": {
	"version": 3,
	"name": "ipython"
	},
	"mimetype": "text/x-python",
	"version": "3.5.2",
	"nbconvert_exporter": "python",
	"name": "python"
	},
	"toc": {
	"toc_threshold": 6,
	"toc_number_sections": true,
	"toc_cell": false,
	"toc_window_display": false
	},
	"gist": {
	"id": "c39b0704db99d12dc1c11b08ded3063a",
	"data": {
	"description": "cooler-from-sparse-text.ipynb",
	"public": false
	}
	},
	"_draft": {
	"nbviewer_url": "https://gist.github.com/c39b0704db99d12dc1c11b08ded3063a"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}