zverok · September 8, 2022 03:34
diff --git a/good_data_frame.rb b/good_data_frame.rb
 # It is show-the-point demo for my article
 # "On DataFrame datatype in Ruby" http://zverok.github.io/blog/2016-01-10-dataframe.html

 require 'good_data_frame' # `require': cannot load such file -- good_data_frame (LoadError)

 # Initialization (with default index): a hash of column name => values
 # Values of each column are homogeneous
 # The first and most important thing is "what the columns are"
 table = GDF.new(
  manager: ['Tom', 'Jerry', 'Magda'],
  salary:  [1000.0, 800.0,  960.0],
  sales:   [20_000, 6_000, 15_000]
 )
 # => prints in IRB:
 #   | manager | salary | sales    |
 # --+---------+--------+----------+
 # 0 | Tom     | 1000.0 | 20'000.0 |
 # 1 | Jerry   |  800.0 |  6'000.0 |
 # 2 | Magda   |  960.0 | 15'000.0 |
 # ...and if here's 5+ lines, just prints "...", and then last 2-3 lines

 # Initialization: with an index other than 0, 1, 2...
 table2 = GDF.new(
  salary:  [1000.0, 800.0,  960.0],
  sales:   [20_000, 6_000, 15_000],

  # from "human" point-of-view, index is just a special column, isn't it?
  index: [Time.parse('2015-01-01'), Time.parse('2015-02-01'), Time.parse('2015-03-01')]
 )

 # DataFrame like your olde good Array/Enumerable
 table.count # => 3 (rows)

 table.first(2)
 # => 
 #   | manager | salary | sales    |
 # --+---------+--------+----------+
 # 0 | Tom     | 1000.0 | 20'000.0 |
 # 1 | Jerry   |  800.0 |  6'000.0 |

 table.last(1)
 # => 
 #   | manager | salary | sales    |
 # --+---------+--------+----------+
 # 2 | Magda   |  960.0 | 15'000.0 |

 table.each do |row|
  p [row.class,  # GDF::Row, unlike column, is a virtual object for iteration only
    row.index,   # It is more like Struct/Hash/Hashie, 
    row.manager] # with fields corresponding to columns
 end

 # Also #sort, #select, #reject work as usual, but returning instance of
 # GoodDataFrame
 table.reject do |row|
  row.salary < 900
 end
 #   | manager | salary | sales    |
 # --+---------+--------+----------+
 # 0 | Tom     | 1000.0 | 20'000.0 |
 # 2 | Magda   |  960.0 | 15'000.0 |

 # Adding data: performs check for column count and data type for each column
 table << ['Jim', 950.0, 18_000]
 #   | manager | salary | sales    |
 # --+---------+--------+----------+
 # 0 | Tom     | 1000.0 | 20'000.0 |
 # 1 | Jerry   |  800.0 |  6'000.0 |
 # 2 | Magda   |  960.0 | 15'000.0 |
 # 3 | Jim     |  950.0 | 18'000.0 |

 # Working with columns: column IS an object
 table.columns # => GDF::Columns proxy array

 table.columns.push(bonus: [300.0, 120.0, 230.0])
 # =>
 #   | manager | salary | sales    | bonus |
 # --+---------+--------+----------+-------+
 # 0 | Tom     | 1000.0 | 20'000.0 | 300.0 |
 # 1 | Jerry   |  800.0 |  6'000.0 | 120.0 |
 # 2 | Magda   |  960.0 | 15'000.0 | 230.0 |

 # or even:
 table.columns[:bonus] = [300.0, 120.0, 230.0]

 table.columns.delete(:salary)
 # =>
 #   | manager | sales    | bonus |
 # --+---------+----------+-------+
 # ....

 table.columns.reorder(:bonus, :salary)
 # =>
 #   | manager | bonus | salary |
 # --+---------+-------+--------+
 # ....

 bonus = table.columns[:bonus]
 # =>
 #     0 |     1 |     2
 # 300.0 | 120.0 | 230.0
 #
 # It's GDF::Column type, Array-ish, yet not necessarily indexed 0..N, and
 # with additional operations, making it more math-y:

 bonus * 2
 # =>
 #     0 |     1 |     2
 # 600.0 | 240.0 | 460.0

 table.columns[:bonus] / table.columns[:salary]
 # =>
 #    0 |    1 |    2
 # 0.30 | 0.15 | 0.24

 # "Views":
 table.view(0..1, :salary)
 #   | salary |
 # --+--------+
 # 0 | 1000.0 |
 # 1 |  800.0 |
 # 2 |  960.0 |

 # A method call with arguments cannot be an assignment target in Ruby,
 # so the view is written through its #values= writer.
 table.view(0..1, :salary).values = 950.0 # equality forever!
 table
 # =>
 #   | manager | salary | sales    | bonus |
 # --+---------+--------+----------+-------+
 # 0 | Tom     |  950.0 | 20'000.0 | 300.0 |
 # 1 | Jerry   |  950.0 |  6'000.0 | 120.0 |
 # 2 | Magda   |  950.0 | 15'000.0 | 230.0 |

 table = GDF.new(
  manager: ['Tom', 'Jerry', 'Magda'],
  salary:  [1000.0, nil,  960.0],
  sales:   [20_000, 6_000, nil]
 )
 table.view(&:nil?)
 #   | salary | sales |
 # --+--------+-------+
 # 1 |    nil |       |
 # 2 |        |   nil |

 table.view(&:nil?).values = 0.0

 table.view(nil, :salary){|s| s < 1000}.values += 100
 # or?
 table.select{|r| r.salary < 1000}.columns[:salary].values += 100
 # neither is cool :(
 table.view{|val, r, c| c == :salary && val < 1000}.values += 100
 # ↑ ok, that's cute!

 # Composite indexes
 table = GoodDataFrame.new(
  age:    [30, 23, 38, 40, 22, 28],
  salary: [1000.0, 980.0, 860.0, 950.0, 720.0, 1050.0],

  # let's try the simplest thing possible
  index: {
    department_A: [:John, :Jeff, :Maggy],
    department_B: [:Jake, :Mary, :Andrew]
  }
 )
 #                       | age | salary |
 # -------------+--------+-----+--------+
 # department_A | John   |  30 | 1000.0 |
 #              | Jeff   |  23 |  980.0 |
 #              | Maggy  |  38 |  860.0 |
 # -------------+--------+-----+--------+
 # department_B | Jake   |  40 |  950.0 |
 #              | Mary   |  22 |  720.0 |
 #              | Andrew |  28 | 1050.0 |

 table[:department_A]
 #                       | age | salary |
 # -------------+--------+-----+--------+
 # department_A | John   |  30 | 1000.0 |
 #              | Jeff   |  23 |  980.0 |
 #              | Maggy  |  38 |  860.0 |

 table[:department_A, :John]
 #                       | age | salary |
 # -------------+--------+-----+--------+
 # department_A | John   |  30 | 1000.0 |


 # Enough for most of cool stats and pivots and stuff
	# It is show-the-point demo for my article
	# "On DataFrame datatype in Ruby" http://zverok.github.io/blog/2016-01-10-dataframe.html

	require 'good_data_frame' # `require': cannot load such file -- good_data_frame (LoadError)

	# Initialization (with default index): a hash of column name => values
	# Values of each column are homogeneous
	# The first and most important thing is "what the columns are"
	table = GDF.new(
	manager: ['Tom', 'Jerry', 'Magda'],
	salary: [1000.0, 800.0, 960.0],
	sales: [20_000, 6_000, 15_000]
	)
	# => prints in IRB:
	#   | manager | salary | sales    |
	# --+---------+--------+----------+
	# 0 | Tom     | 1000.0 | 20'000.0 |
	# 1 | Jerry   |  800.0 |  6'000.0 |
	# 2 | Magda   |  960.0 | 15'000.0 |
	# ...and if here's 5+ lines, just prints "...", and then last 2-3 lines

	# Initialization: with an index other than 0, 1, 2...
	table2 = GDF.new(
	  salary: [1000.0, 800.0, 960.0],
	  sales: [20_000, 6_000, 15_000],

	  # from "human" point-of-view, index is just a special column, isn't it?
	  index: [Time.parse('2015-01-01'), Time.parse('2015-02-01'), Time.parse('2015-03-01')]
	)

	# DataFrame like your olde good Array/Enumerable
	table.count # => 3 (rows)

	table.first(2)
	# =>
	#   | manager | salary | sales    |
	# --+---------+--------+----------+
	# 0 | Tom     | 1000.0 | 20'000.0 |
	# 1 | Jerry   |  800.0 |  6'000.0 |

	table.last(1)
	# =>
	#   | manager | salary | sales    |
	# --+---------+--------+----------+
	# 2 | Magda   |  960.0 | 15'000.0 |

	# Iterate over the table row by row.
	table.each do |row|
	  p [row.class,  # GDF::Row, unlike column, is a virtual object for iteration only
	    row.index,   # It is more like Struct/Hash/Hashie,
	    row.manager] # with fields corresponding to columns
	end

	# Also #sort, #select, #reject work as usual, but returning instance of
	# GoodDataFrame
	table.reject do |row|
	  row.salary < 900
	end
	#   | manager | salary | sales    |
	# --+---------+--------+----------+
	# 0 | Tom     | 1000.0 | 20'000.0 |
	# 2 | Magda   |  960.0 | 15'000.0 |

	# Adding data: performs check for column count and data type for each column
	table << ['Jim', 950.0, 18_000]
	#   | manager | salary | sales    |
	# --+---------+--------+----------+
	# 0 | Tom     | 1000.0 | 20'000.0 |
	# 1 | Jerry   |  800.0 |  6'000.0 |
	# 2 | Magda   |  960.0 | 15'000.0 |
	# 3 | Jim     |  950.0 | 18'000.0 |

	# Working with columns: column IS an object
	table.columns # => GDF::Columns proxy array

	table.columns.push(bonus: [300.0, 120.0, 230.0])
	# =>
	#   | manager | salary | sales    | bonus |
	# --+---------+--------+----------+-------+
	# 0 | Tom     | 1000.0 | 20'000.0 | 300.0 |
	# 1 | Jerry   |  800.0 |  6'000.0 | 120.0 |
	# 2 | Magda   |  960.0 | 15'000.0 | 230.0 |

	# or even:
	table.columns[:bonus] = [300.0, 120.0, 230.0]

	table.columns.delete(:salary)
	# =>
	#   | manager | sales    | bonus |
	# --+---------+----------+-------+
	# ....

	table.columns.reorder(:bonus, :salary)
	# =>
	#   | manager | bonus | salary |
	# --+---------+-------+--------+
	# ....

	bonus = table.columns[:bonus]
	# =>
	#     0 |     1 |     2
	# 300.0 | 120.0 | 230.0
	#
	# It's GDF::Column type, Array-ish, yet not necessarily indexed 0..N, and
	# with additional operations, making it more math-y:

	bonus * 2
	# =>
	#     0 |     1 |     2
	# 600.0 | 240.0 | 460.0

	table.columns[:bonus] / table.columns[:salary]
	# =>
	#    0 |    1 |    2
	# 0.30 | 0.15 | 0.24

	# "Views":
	table.view(0..1, :salary)
	#   | salary |
	# --+--------+
	# 0 | 1000.0 |
	# 1 |  800.0 |
	# 2 |  960.0 |

	# A method call with arguments cannot be an assignment target in Ruby,
	# so the view is written through its #values= writer.
	table.view(0..1, :salary).values = 950.0 # equality forever!
	table
	# =>
	#   | manager | salary | sales    | bonus |
	# --+---------+--------+----------+-------+
	# 0 | Tom     |  950.0 | 20'000.0 | 300.0 |
	# 1 | Jerry   |  950.0 |  6'000.0 | 120.0 |
	# 2 | Magda   |  950.0 | 15'000.0 | 230.0 |

	table = GDF.new(
	manager: ['Tom', 'Jerry', 'Magda'],
	salary: [1000.0, nil, 960.0],
	sales: [20_000, 6_000, nil]
	)
	table.view(&:nil?)
	#   | salary | sales |
	# --+--------+-------+
	# 1 |    nil |       |
	# 2 |        |   nil |

	table.view(&:nil?).values = 0.0

	# Three candidate spellings for "add 100 to every salary below 1000".
	table.view(nil, :salary) { |s| s < 1000 }.values += 100
	# or?
	table.select { |r| r.salary < 1000 }.columns[:salary].values += 100
	# neither is cool :(
	table.view { |val, r, c| c == :salary && val < 1000 }.values += 100
	# ↑ ok, that's cute!

	# Composite indexes
	table = GoodDataFrame.new(
	age: [30, 23, 38, 40, 22, 28],
	salary: [1000.0, 980.0, 860.0, 950.0, 720.0, 1050.0],

	# let's try the simplest thing possible
	index: {
	department_A: [:John, :Jeff, :Maggy],
	department_B: [:Jake, :Mary, :Andrew]
	}
	)
	#                       | age | salary |
	# -------------+--------+-----+--------+
	# department_A | John   |  30 | 1000.0 |
	#              | Jeff   |  23 |  980.0 |
	#              | Maggy  |  38 |  860.0 |
	# -------------+--------+-----+--------+
	# department_B | Jake   |  40 |  950.0 |
	#              | Mary   |  22 |  720.0 |
	#              | Andrew |  28 | 1050.0 |

	table[:department_A]
	#                       | age | salary |
	# -------------+--------+-----+--------+
	# department_A | John   |  30 | 1000.0 |
	#              | Jeff   |  23 |  980.0 |
	#              | Maggy  |  38 |  860.0 |

	table[:department_A, :John]
	#                       | age | salary |
	# -------------+--------+-----+--------+
	# department_A | John   |  30 | 1000.0 |


	# Enough for most of cool stats and pivots and stuff