Skip to content

Instantly share code, notes, and snippets.

@myui
Created April 25, 2020 08:14
Show Gist options
  • Save myui/8724902af53491201a34dba01071c88e to your computer and use it in GitHub Desktop.
Save myui/8724902af53491201a34dba01071c88e to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "fancs_sparse_matrix.ipynb",
"version": "0.3.2",
"provenance": [],
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/myui/7dc30ac8ad4b192f34d41b97f032c82f/fancs_sparse_matrix.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"metadata": {
"id": "Dj4twnBTjyEL",
"colab_type": "code",
"outputId": "5d48133a-7562-411e-c92f-64c67973e1b3",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 170
}
},
"source": [
"!pip install td-client\n",
"!pip install memory_profiler\n",
"%load_ext memory_profiler"
],
"execution_count": 16,
"outputs": [
{
"output_type": "stream",
"text": [
"Requirement already satisfied: td-client in /usr/local/lib/python3.6/dist-packages (0.13.0)\n",
"Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from td-client) (1.12.0)\n",
"Requirement already satisfied: python-dateutil in /usr/local/lib/python3.6/dist-packages (from td-client) (2.5.3)\n",
"Requirement already satisfied: urllib3 in /usr/local/lib/python3.6/dist-packages (from td-client) (1.24.3)\n",
"Requirement already satisfied: msgpack in /usr/local/lib/python3.6/dist-packages (from td-client) (0.5.6)\n",
"Requirement already satisfied: memory_profiler in /usr/local/lib/python3.6/dist-packages (0.55.0)\n",
"Requirement already satisfied: psutil in /usr/local/lib/python3.6/dist-packages (from memory_profiler) (5.4.8)\n",
"The memory_profiler extension is already loaded. To reload it, use:\n",
" %reload_ext memory_profiler\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "rQsECqr5vw15",
"colab_type": "code",
"outputId": "9d03ac45-fe7c-44bc-a9fc-1d77fa4fdc1f",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 51
}
},
"source": [
"from getpass import getpass\n",
"\n",
"td_api_key = getpass('Enter TD API KEY here')\n",
"print(\"Succeeded.\")"
],
"execution_count": 17,
"outputs": [
{
"output_type": "stream",
"text": [
"Enter TD API KEY here··········\n",
"Succeeded.\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "SjKboSoEsCoM",
"colab_type": "code",
"colab": {}
},
"source": [
"from scipy.sparse import csr_matrix\n",
"from memory_profiler import profile\n",
"\n",
"def convert_to_csr_matrix(job):\n",
" indptr = [0]\n",
" indices = []\n",
" data = []\n",
" vocabulary = {}\n",
" \n",
" for row in job.result():\n",
" for col in row[0]:\n",
" fv = col.split(':')\n",
" f = fv[0]\n",
" v = fv[1] if len(fv) > 1 else 1.0\n",
" index = vocabulary.setdefault(f, len(vocabulary))\n",
" indices.append(index)\n",
" data.append(v)\n",
" indptr.append(len(indices))\n",
" \n",
" return vocabulary, csr_matrix((data, indices, indptr), dtype=float)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab_type": "code",
"outputId": "6ad1b879-3063-41ec-9d3e-59a84da7eab2",
"id": "T5q0TVKYn_Go",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 751
}
},
"source": [
"import tdclient\n",
"\n",
"with tdclient.Client(td_api_key) as td:\n",
" job = td.query(\"work1\", \"SELECT features FROM kddcup_test LIMIT 10\", type=\"presto\")\n",
" job.wait()\n",
" \n",
" %memit vocabulary, csr = convert_to_csr_matrix(job)\n",
" print(vocabulary)\n",
" print(csr.toarray())"
],
"execution_count": 23,
"outputs": [
{
"output_type": "stream",
"text": [
"peak memory: 121.40 MiB, increment: 0.00 MiB\n",
"{'1981531': 0, '16250865': 1, '10331277': 2, '6113737': 3, '3961389': 4, '13638576': 5, '16737420': 6, '813223': 7, '197254': 8, '5091135': 9, '0': 10, '486624': 11, '8546950': 12, '897060': 13, '32967': 14, '14111284': 15, '1328807': 16, '6023668': 17, '13170561': 18, '8186092': 19, '6340403': 20, '10582526': 21, '15942785': 22, '442046': 23, '10635989': 24, '13067590': 25, '5030921': 26, '391998': 27, '16686133': 28, '8186170': 29, '14373278': 30, '11668384': 31, '14411221': 32, '15182339': 33, '13959010': 34, '12687772': 35, '2486891': 36, '7780400': 37, '10868501': 38, '13695281': 39, '5312010': 40, '1551595': 41, '13660719': 42, '6068107': 43, '659238': 44, '10502959': 45, '10512490': 46, '9362879': 47, '2528618': 48, '15334256': 49, '5431954': 50, '9204208': 51, '13210102': 52, '608773': 53, '14646961': 54, '5722703': 55, '1593653': 56, '6985745': 57, '11120830': 58, '12214482': 59, '15933125': 60, '15338484': 61, '3910131': 62, '15899168': 63, '12316049': 64}\n",
"[[1. 1. 1. 1. 1. 1. 1. 1. 0.5 0.5 1. 0. 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ]\n",
" [0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1.\n",
" 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ]\n",
" [0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0.\n",
" 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ]\n",
" [0. 0. 0. 0. 0. 0. 0. 1. 0.5 0. 1. 0. 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ]\n",
" [0. 0. 0. 0. 0. 0. 0. 1. 0.5 0. 1. 0. 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 1. 1. 1. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ]\n",
" [0. 0. 0. 0. 0. 0. 0. 1. 1. 0.5 1. 0. 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1.\n",
" 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ]\n",
" [0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ]\n",
" [0. 0. 0. 0. 0. 0. 0. 1. 0.5 0. 1. 0. 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1.\n",
" 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ]\n",
" [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.\n",
" 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. ]\n",
" [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. ]]\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "fh4ae4U9r2Fy",
"colab_type": "code",
"colab": {}
},
"source": [
""
],
"execution_count": 0,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment