Last active
November 17, 2020 14:04
-
-
Save mesejo/15dbc9132ca0f3d15f35f0f7ad2ff190 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
arr = pd.read_csv("test.txt", delim_whitespace=True).values | |
print(arr) | |
class Cube: | |
def __init__(self, row_index, col_index, data): | |
self.row_index = {r: i for i, r in enumerate(row_index)} | |
self.col_index = {c: i for i, c in enumerate(col_index)} | |
self.data = data | |
def __getitem__(self, item): | |
row, col = item | |
return self.data[self.row_index[row], self.col_index[col]] | |
def __repr__(self): | |
return repr(self.data) | |
def _x_tab(rows, columns, values): | |
"""Function for computing the cross-tab of simple arrays""" | |
unique_values_all_cols, idx = zip(*(np.unique(col, return_inverse=True) for col in [rows, columns])) | |
shape_xt = [uniq_vals_col.size for uniq_vals_col in unique_values_all_cols] | |
xt = np.zeros(shape_xt, dtype=np.float) | |
np.add.at(xt, idx, values) | |
return unique_values_all_cols, xt | |
def make_index(a, r): | |
"""Make array of tuples""" | |
l = [tuple(row) for row in a[:, r]] | |
return make_object_array(l) | |
def make_object_array(l): | |
a = np.empty(len(l), dtype=object) | |
a[:] = l | |
return a | |
def fill_label(ar, le): | |
"""Fill missing parts with ALL label""" | |
missing = tuple(["ALL"] * le) | |
return [(e + missing)[:le] for e in ar] | |
def x_tab(rows, cols, values): | |
"""Main function for cross tabulation""" | |
_, l_cols = rows.shape | |
total_agg = [] | |
total_idx = [] | |
for i in range(l_cols + 1): | |
(idx, _), agg = _x_tab(make_index(rows, list(range(i))), cols, values) | |
total_idx.extend(fill_label(idx, l_cols)) | |
total_agg.append(agg) | |
stacked_agg = np.vstack(total_agg) | |
stacked_agg_total = stacked_agg.sum(axis=1).reshape(-1, 1) | |
return Cube(total_idx, list(dict.fromkeys(cols)), np.concatenate((stacked_agg, stacked_agg_total), axis=1)) | |
result = x_tab(arr[:, [0, 1]], arr[:, 2], arr[:, 3]) | |
print(result) | |
print(result[('Dante', 'ALL'), 2001]) | |
print(result[('Dante', 'Inferno'), 2001]) | |
print(result[('Shakespeare', 'Hamlet'), 2000]) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
AUTHOR BOOK YEAR SALES | |
Shakespeare Hamlet 2000 104.2 | |
Shakespeare Hamlet 2001 99.0 | |
Shakespeare Romeo 2000 27.0 | |
Shakespeare Romeo 2001 19.0 | |
Dante Inferno 2000 11.6 | |
Dante Inferno 2001 12.6 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment