Created
January 3, 2021 19:33
-
-
Save vaclavdekanovsky/245bbb0158f3480295b817ba88b77702 to your computer and use it in GitHub Desktop.
How to read some fixed width file with julia CSV.jl
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Reading a fixed with File with CSV.jl\n", | |
| "You can ignore the padding blank spaces using `ignorerepeated=true`, but only if there's no delimiter. " | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "using CSV, DataFrames" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<table class=\"data-frame\"><thead><tr><th></th><th>A</th><th>B</th><th>C</th></tr><tr><th></th><th>Int64</th><th>Float64</th><th>String</th></tr></thead><tbody><p>1 rows × 3 columns</p><tr><th>1</th><td>1</td><td>2.0</td><td>X</td></tr></tbody></table>" | |
| ], | |
| "text/latex": [ | |
| "\\begin{tabular}{r|ccc}\n", | |
| "\t& A & B & C\\\\\n", | |
| "\t\\hline\n", | |
| "\t& Int64 & Float64 & String\\\\\n", | |
| "\t\\hline\n", | |
| "\t1 & 1 & 2.0 & X \\\\\n", | |
| "\\end{tabular}\n" | |
| ], | |
| "text/plain": [ | |
| "1×3 DataFrame\n", | |
| "│ Row │ A │ B │ C │\n", | |
| "│ │ \u001b[90mInt64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mString\u001b[39m │\n", | |
| "├─────┼───────┼─────────┼────────┤\n", | |
| "│ 1 │ 1 │ 2.0 │ X │" | |
| ] | |
| }, | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "data = \"\"\"\n", | |
| "A B C \n", | |
| "1 2.0 \"X\"\n", | |
| "\"\"\"\n", | |
| "\n", | |
| "df = CSV.read(IOBuffer(data), DataFrame; \n", | |
| " delim=\" \",\n", | |
| " ignorerepeated=true)\n", | |
| "df" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "The column names are really 1 character long (without blank spaces)." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "3-element Array{String,1}:\n", | |
| " \"A\"\n", | |
| " \"B\"\n", | |
| " \"C\"" | |
| ] | |
| }, | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "names(df)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Also the column content doesn't contain the pads." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<table class=\"data-frame\"><thead><tr><th></th><th>A</th><th>B</th><th>C</th></tr><tr><th></th><th>Int64</th><th>Int64</th><th>Int64</th></tr></thead><tbody><p>1 rows × 3 columns</p><tr><th>1</th><td>1</td><td>1</td><td>1</td></tr></tbody></table>" | |
| ], | |
| "text/latex": [ | |
| "\\begin{tabular}{r|ccc}\n", | |
| "\t& A & B & C\\\\\n", | |
| "\t\\hline\n", | |
| "\t& Int64 & Int64 & Int64\\\\\n", | |
| "\t\\hline\n", | |
| "\t1 & 1 & 1 & 1 \\\\\n", | |
| "\\end{tabular}\n" | |
| ], | |
| "text/plain": [ | |
| "1×3 DataFrame\n", | |
| "│ Row │ A │ B │ C │\n", | |
| "│ │ \u001b[90mInt64\u001b[39m │ \u001b[90mInt64\u001b[39m │ \u001b[90mInt64\u001b[39m │\n", | |
| "├─────┼───────┼───────┼───────┤\n", | |
| "│ 1 │ 1 │ 1 │ 1 │" | |
| ] | |
| }, | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "length.(df)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Dataframes with different delimiter\n", | |
| "Some tools, like databases usually export data into text format using delimiters. In such a case `ignorerepeated=true` won't help." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<table class=\"data-frame\"><thead><tr><th></th><th> A </th><th> B </th><th> C </th></tr><tr><th></th><th>Int64</th><th>Float64</th><th>String</th></tr></thead><tbody><p>1 rows × 3 columns</p><tr><th>1</th><td>1</td><td>2.0</td><td> X </td></tr></tbody></table>" | |
| ], | |
| "text/latex": [ | |
| "\\begin{tabular}{r|ccc}\n", | |
| "\t& A & B & C \\\\\n", | |
| "\t\\hline\n", | |
| "\t& Int64 & Float64 & String\\\\\n", | |
| "\t\\hline\n", | |
| "\t1 & 1 & 2.0 & X \\\\\n", | |
| "\\end{tabular}\n" | |
| ], | |
| "text/plain": [ | |
| "1×3 DataFrame\n", | |
| "│ Row │ A │ B │ C │\n", | |
| "│ │ \u001b[90mInt64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mString\u001b[39m │\n", | |
| "├─────┼───────┼─────────┼────────┤\n", | |
| "│ 1 │ 1 │ 2.0 │ X │" | |
| ] | |
| }, | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "data = \"\"\"\n", | |
| "| A | B | C \n", | |
| "| 1 | 2.0 | X \n", | |
| "\"\"\"\n", | |
| "\n", | |
| "df = CSV.read(IOBuffer(data), DataFrame; \n", | |
| " delim=\"|\",\n", | |
| " ignorerepeated=true)\n", | |
| "df" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Since the extra strings are blank spaces \" \" and not the pipe delimiters, nothing is removed using `ignorerepeated=true`. We need to remove the extra spaces manually." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "3-element Array{String,1}:\n", | |
| " \" A \"\n", | |
| " \" B \"\n", | |
| " \" C \"" | |
| ] | |
| }, | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# header names contain blank spaces\n", | |
| "names(df)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<table class=\"data-frame\"><thead><tr><th></th><th> A </th><th> B </th><th> C </th></tr><tr><th></th><th>Int64</th><th>Int64</th><th>Int64</th></tr></thead><tbody><p>1 rows × 3 columns</p><tr><th>1</th><td>1</td><td>1</td><td>3</td></tr></tbody></table>" | |
| ], | |
| "text/latex": [ | |
| "\\begin{tabular}{r|ccc}\n", | |
| "\t& A & B & C \\\\\n", | |
| "\t\\hline\n", | |
| "\t& Int64 & Int64 & Int64\\\\\n", | |
| "\t\\hline\n", | |
| "\t1 & 1 & 1 & 3 \\\\\n", | |
| "\\end{tabular}\n" | |
| ], | |
| "text/plain": [ | |
| "1×3 DataFrame\n", | |
| "│ Row │ A │ B │ C │\n", | |
| "│ │ \u001b[90mInt64\u001b[39m │ \u001b[90mInt64\u001b[39m │ \u001b[90mInt64\u001b[39m │\n", | |
| "├─────┼───────┼────────┼───────┤\n", | |
| "│ 1 │ 1 │ 1 │ 3 │" | |
| ] | |
| }, | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# also the lenght of string values can be increated by the blank spaces\n", | |
| "length.(df)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "To solve these things, you have to perform additional operations, like stripping the white spaces from the strings." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "3-element Array{SubString{String},1}:\n", | |
| " \"A\"\n", | |
| " \"B\"\n", | |
| " \"C\"" | |
| ] | |
| }, | |
| "execution_count": 9, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# applying strip (trim) element-wise (using . dot operator)\n", | |
| "strip.(names(df))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<table class=\"data-frame\"><thead><tr><th></th><th>A</th><th>B</th><th>C</th></tr><tr><th></th><th>Int64</th><th>Float64</th><th>String</th></tr></thead><tbody><p>1 rows × 3 columns</p><tr><th>1</th><td>1</td><td>2.0</td><td> X </td></tr></tbody></table>" | |
| ], | |
| "text/latex": [ | |
| "\\begin{tabular}{r|ccc}\n", | |
| "\t& A & B & C\\\\\n", | |
| "\t\\hline\n", | |
| "\t& Int64 & Float64 & String\\\\\n", | |
| "\t\\hline\n", | |
| "\t1 & 1 & 2.0 & X \\\\\n", | |
| "\\end{tabular}\n" | |
| ], | |
| "text/plain": [ | |
| "1×3 DataFrame\n", | |
| "│ Row │ A │ B │ C │\n", | |
| "│ │ \u001b[90mInt64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mString\u001b[39m │\n", | |
| "├─────┼───────┼─────────┼────────┤\n", | |
| "│ 1 │ 1 │ 2.0 │ X │" | |
| ] | |
| }, | |
| "execution_count": 10, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# rename the names, by applying `strip` element wise to the column names vector\n", | |
| "rename!(df, strip.(names(df)))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "3-element Array{String,1}:\n", | |
| " \"A\"\n", | |
| " \"B\"\n", | |
| " \"C\"" | |
| ] | |
| }, | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# now the names are stripped\n", | |
| "names(df)" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Julia 1.4.1", | |
| "language": "julia", | |
| "name": "julia-1.4" | |
| }, | |
| "language_info": { | |
| "file_extension": ".jl", | |
| "mimetype": "application/julia", | |
| "name": "julia", | |
| "version": "1.4.1" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 4 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment