Last active
September 8, 2022 03:34
-
-
Save zverok/5438eb69da9ac34d791c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This is a show-the-point demo for my article
# "On DataFrame datatype in Ruby" http://zverok.github.io/blog/2016-01-10-dataframe.html | |
require 'time' # provides Time.parse, used for the date-based index
require 'good_data_frame' # `require': cannot load such file -- good_data_frame (LoadError)
# Initialization (with default index): hashes of column: values | |
# Values of each column are homogeneous
# The first and most important thing is "what the columns are"
table = GDF.new( | |
manager: ['Tom', 'Jerry', 'Magda'], | |
salary: [1000.0, 800.0, 960.0], | |
sales: [20_000, 6_000, 15_000] | |
) | |
# => prints in IRB: | |
# | manager | salary | sales | | |
# --+---------+--------+----------+ | |
# 0 | Tom | 1000.0 | 20'000.0 | | |
# 1 | Jerry | 800.0 | 6'000.0 | | |
# 2 | Magda | 960.0 | 15'000.0 | | |
# ...and if there are 5+ lines, it just prints "...", and then the last 2-3 lines
# Initialization: with an index other than 0,1,2...
# Build a frame with an explicit index instead of the default 0,1,2...
# The index is supplied under the `index:` key, next to the ordinary columns;
# here it is one timestamp per row.
table2 = GDF.new(
  salary: [1000.0, 800.0, 960.0],
  sales: [20_000, 6_000, 15_000], # comma was missing here, which made `index:` a syntax error
  # from "human" point-of-view, index is just a special column, isn't it?
  index: [Time.parse('2015-01-01'), Time.parse('2015-02-01'), Time.parse('2015-03-01')]
)
# DataFrame behaves like your good old Array/Enumerable
table.count # => 3 (rows) | |
table.first(2) | |
# => | |
# | manager | salary | sales | | |
# --+---------+--------+----------+ | |
# 0 | Tom | 1000.0 | 20'000.0 | | |
# 1 | Jerry | 800.0 | 6'000.0 | | |
table.last(1) | |
# => | |
# | manager | salary | sales | | |
# --+---------+--------+----------+ | |
# 2 | Magda | 960.0 | 15'000.0 | | |
table.each do |row| | |
p [row.class, # GDF::Row, unlike column, is a virtual object for iteration only | |
row.index, # It is more like Struct/Hash/Hashie, | |
row.manager] # with fields corresponding to columns | |
end | |
# Also #sort, #select, #reject work as usual, but return an instance of
# GoodDataFrame
table.reject do |row| | |
row.salary < 900 | |
end | |
# | manager | salary | sales | | |
# --+---------+--------+----------+ | |
# 0 | Tom | 1000.0 | 20'000.0 | | |
# 2 | Magda | 960.0 | 15'000.0 | | |
# Adding data: performs check for column count and data type for each column | |
table << ['Jim', 950.0, 18_000] | |
# | manager | salary | sales | | |
# --+---------+--------+----------+ | |
# 0 | Tom | 1000.0 | 20'000.0 | | |
# 1 | Jerry | 800.0 | 6'000.0 | | |
# 2 | Magda | 960.0 | 15'000.0 | | |
# 3 | Jim | 950.0 | 18'000.0 | | |
# Working with columns: column IS an object | |
table.columns # => GDF::Columns proxy array | |
table.columns.push(bonus: [300.0, 120.0, 230.0]) | |
# => | |
# | manager | salary | sales | bonus | | |
# --+---------+--------+----------+-------+ | |
# 0 | Tom | 1000.0 | 20'000.0 | 300.0 | | |
# 1 | Jerry | 800.0 | 6'000.0 | 120.0 | | |
# 2 | Magda | 960.0 | 15'000.0 | 230.0 | | |
# or even: | |
table.columns[:bonus] = [300.0, 120.0, 230.0] | |
table.columns.delete(:salary) | |
# => | |
# | manager | sales | bonus | | |
# --+---------+----------+-------+ | |
# .... | |
table.columns.reorder(:bonus, :salary) | |
# => | |
# | manager | bonus | salary | | |
# --+---------+-------+--------+ | |
# .... | |
bonus = table.columns[:bonus] | |
# => | |
# 0 | 1 | 2 | |
# 300.0 | 120.0 | 230.0 | |
# | |
# It's GDF::Column type, Array-ish, yet not necessarily indexed 0..N, and | |
# with additional operations, making it more math-y: | |
bonus * 2 | |
# => | |
# 0 | 1 | 2 | |
# 600.0 | 240.0 | 460.0 | |
table.columns[:bonus] / table.columns[:salary] | |
# => | |
# 0 | 1 | 2 | |
# 0.30 | 0.15 | 0.24 | |
# "Views": | |
table.view(0..1, :salary) | |
# | salary | | |
# --+--------+ | |
# 0 | 1000.0 | | |
# 1 | 800.0 | | |
# 2 | 960.0 | | |
# Assign one value to every cell selected by the view; the table printed
# right after this shows the change landing in `table` itself.
# Ruby does not parse `receiver.method(args) = value`, so the assignment goes
# through the view's `values=` writer, the same form this script uses for
# `table.view(&:nil?).values = 0.0`.
table.view(0..1, :salary).values = 950.0 # equality forever!
table | |
# => | |
# | manager | salary | sales | bonus | | |
# --+---------+--------+----------+-------+ | |
# 0 | Tom | 950.0 | 20'000.0 | 300.0 | | |
# 1 | Jerry | 950.0 | 6'000.0 | 120.0 | | |
# 2 | Magda | 950.0 | 15'000.0 | 230.0 | | |
table = GDF.new( | |
manager: ['Tom', 'Jerry', 'Magda'], | |
salary: [1000.0, nil, 960.0], | |
sales: [20_000, 6_000, nil] | |
) | |
table.view(&:nil?) | |
# | salary | sales | | |
# --+--------+-------+ | |
# 1 | nil | | | |
# 2 | | nil | | |
table.view(&:nil?).values = 0.0 | |
table.view(nil, :salary){|s| s < 1000}.values += 100 | |
# or? | |
table.select{|r| r.salary < 1000}.columns[:salary].values += 100 | |
# neither is cool :( | |
table.view{|val, r, c| c == :salary && val < 1000}.values += 100 | |
# ↑ ok, that's cute! | |
# Composite indexes | |
table = GoodDataFrame.new( | |
age: [30, 23, 38, 40, 22, 28], | |
salary: [1000.0, 980.0, 860.0, 950.0, 720.0, 1050.0], | |
# let's try the simplest thing possible | |
index: { | |
department_A: [:John, :Jeff, :Maggy], | |
department_B: [:Jake, :Mary, :Andrew] | |
} | |
) | |
# | age | salary | | |
# -------------+--------+-----+--------+ | |
# department_A | John | 30 | 1000.0 | | |
# | Jeff | 23 | 980.0 | | |
# | Maggy | 38 | 860.0 | | |
# -------------+--------+-----+--------+ | |
# department_B | Jake | 40 | 950.0 | | |
# | Mary | 22 | 720.0 | | |
# | Andrew | 28 | 1050.0 | | |
table[:department_A] | |
# | age | salary | | |
# -------------+--------+-----+--------+ | |
# department_A | John | 30 | 1000.0 | | |
# | Jeff | 23 | 980.0 | | |
# | Maggy | 38 | 860.0 | | |
table[:department_A, :John] | |
# | age | salary | | |
# -------------+--------+-----+--------+ | |
# department_A | John | 30 | 1000.0 | | |
# Enough for most of the cool stats and pivots and stuff
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment