Last active
February 27, 2018 04:14
-
-
Save xhochy/36dfcdef072bf1d4c7cba2afb4b62b90 to your computer and use it in GitHub Desktop.
Apache Arrow in Jupyter with xeus-cling
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Load the Arrow headers and shared libraries into the context." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#pragma cling load(\"arrow\")\n", | |
"#include <arrow/api.h>\n", | |
"\n", | |
"using namespace arrow;" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Add a convenience function to check the `Status` object returned by Arrow methods" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"void check_status(const Status& status) {\n", | |
" if (!status.ok()) {\n", | |
" std::cerr << status.message() << std::endl;\n", | |
" }\n", | |
"}" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"We can now build Arrow arrays that are assembled later on into a `Table`." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"Int8Builder builder;\n", | |
"check_status(builder.Append(1));\n", | |
"check_status(builder.Append(-1));" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"std::shared_ptr<Array> i8_array;\n", | |
"check_status(builder.Finish(&i8_array))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"StringBuilder str_builder;\n", | |
"check_status(str_builder.Append(\"test\"));\n", | |
"check_status(str_builder.Append(\"another test\"));" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"std::shared_ptr<Array> str_array;\n", | |
"check_status(str_builder.Finish(&str_array))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"One important part of a notebook environment is that you can interactively inspect the objects you have at hand." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"(std::shared_ptr<arrow::Array> &) @0x7f98a2ff22f8\n" | |
] | |
} | |
], | |
"source": [ | |
"str_array" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"(arrow::Array &) @0x6a498d0\n" | |
] | |
} | |
], | |
"source": [ | |
"*str_array" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"(std::string) \"[\"test\", \"another test\"]\"\n" | |
] | |
} | |
], | |
"source": [ | |
"str_array->ToString()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"To build a `Table`, we also need to define a `Schema` that describes the expected data types." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"(std::shared_ptr<Schema> &) @0x7f98a2ff2318\n" | |
] | |
} | |
], | |
"source": [ | |
"auto tbl_schema = arrow::schema({\n", | |
" arrow::field(\"int_column\", int8()),\n", | |
" arrow::field(\"str_column\", utf8())\n", | |
"})" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"int_column: int8\n", | |
"str_column: string" | |
] | |
} | |
], | |
"source": [ | |
"std::cout << tbl_schema->ToString();" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Given the `tbl_schema` and the arrays, we can now construct a `Table` instance." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"(std::shared_ptr<arrow::Table> &) @0x7f98a2ff2338\n" | |
] | |
} | |
], | |
"source": [ | |
"std::shared_ptr<Table> tbl = Table::Make(tbl_schema, {i8_array, str_array})" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"(int) 2\n" | |
] | |
} | |
], | |
"source": [ | |
"tbl->num_columns()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"(long) 2\n" | |
] | |
} | |
], | |
"source": [ | |
"tbl->num_rows()" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "xeus C++11", | |
"language": "", | |
"name": "xeus-cling-cpp11" | |
}, | |
"language_info": { | |
"codemirror_mode": "text/x-c++src", | |
"file_extension": ".cpp", | |
"mimetype": "text/x-c++src", | |
"name": "c++", | |
"version": "" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment