Useful links:
- https://cran.rstudio.com/web/packages/dplyr/vignettes/introduction.html
- https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf
- https://s3.amazonaws.com/assets.datacamp.com/img/blog/data+table+cheat+sheet.pdf
- http://pandas.pydata.org/pandas-docs/stable/comparison_with_r.html
- https://dato.com/products/create/docs/generated/graphlab.SFrame.html

Insert rows

SQL:
INSERT INTO table (col1, col2, col3)
VALUES ('aa', 'bb', 'cc');

dplyr:
bind_rows(table, data.frame(col1="aa", col2="bb", col3="cc"))

data.table:
rbind(table, data.table(col1="aa", col2="bb", col3="cc"))

pandas:
table.append({'col1': 'aa', 'col2': 'bb', 'col3': 'cc'}, ignore_index=True)

Append tables

SQL:
SELECT * FROM table1
UNION ALL
SELECT * FROM table2;

dplyr:
bind_rows(table1, table2)

data.table:
rbind(table1, table2)

pandas:
pandas.concat([table1, table2]) # or
table1.append(table2)

SFrame:
table1.append(table2)

Sort table by col1 ascending, col2 descending

SQL:
SELECT *
FROM table
ORDER BY col1, col2 DESC;

dplyr:
table %>% arrange(col1, desc(col2))

data.table:
table[order(col1, -col2)] # or
setorder(table, col1, -col2) # operates in-place

pandas:
table.sort_values(['col1', 'col2'], ascending=[True, False])

SFrame:
table.sort([('col1', True), ('col2', False)])

Filter rows

SQL:
SELECT *
FROM table
WHERE col1 = 'aa' AND col2 = 'bb';

dplyr:
table %>% filter(col1 == 'aa', col2 == 'bb')

data.table:
table[col1 == 'aa' & col2 == 'bb']

pandas (SFrame accepts the same expression):
table[(table['col1'] == 'aa') & (table['col2'] == 'bb')]

Add a computed column

SQL:
SELECT *, col1 - col2 AS col3
FROM table;

dplyr:
table %>% mutate(col3 = col1 - col2)

data.table:
table[, col3 := col1 - col2]

pandas:
table['col3'] = table['col1'] - table['col2']

Wide to long (aka stack/melt)

SQL:
No standard way, apart from abusing UNION ALL (a sketch follows), but this performs multiple scans of the table.
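A minimal sketch of the UNION ALL approach, assuming the same id and col1-col3 columns as the unnest example below; each branch scans the table once:
SELECT id, 'col1' AS colname, col1 AS value FROM table
UNION ALL
SELECT id, 'col2', col2 FROM table
UNION ALL
SELECT id, 'col3', col3 FROM table;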
Postgres can use unnest:
SELECT id,
unnest(array['col1', 'col2', 'col3']) AS colname,
unnest(array[col1, col2, col3]) AS value
FROM table;

dplyr (with tidyr):
table %>% gather(colname, value, col1:col3)

data.table:
melt(table, "id", c("col1","col2","col3"), "colname", "value")

pandas:
pandas.melt(table, ['id'], ['col1','col2','col3'], 'colname', 'value')
table.stack() may also work for simple cases.

SFrame:
stack operates on a single dict (or list) column, so this first needs to be created:
table.pack_columns(column_prefix='col', dtype=dict,
    new_column_name='colpack').stack('colpack', new_column_name=['colname', 'value'])

Long to wide (aka unstack/cast)

SQL:
No standard method, though it can be emulated using lots of joins (a sketch follows).
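A minimal sketch of the join-based emulation, assuming a long table with the id, colname, and value columns produced by the melt examples above; each output column costs one self-join:
SELECT t1.id, t1.value AS col1, t2.value AS col2, t3.value AS col3
FROM table t1
JOIN table t2 ON t2.id = t1.id AND t2.colname = 'col2'
JOIN table t3 ON t3.id = t1.id AND t3.colname = 'col3'
WHERE t1.colname = 'col1';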
Postgres offers crosstab, though the number of values and their types need to be known in advance:
SELECT *
FROM crosstab3('
select id, colname, value
from table
order by 1,2
');

dplyr (with tidyr):
table %>% spread(colname, value)

data.table:
dcast.data.table(table, id ~ colname, value.var="value")

pandas:
Can be somewhat emulated:
table.pivot_table(values='value', index='id', columns='colname', aggfunc='sum')
however this requires an aggregation function. Without one:
table.set_index(['id','colname'])['value'].unstack()

SFrame:
table.unstack(['colname','value'], new_column_name='colpack').unpack('colpack', column_name_prefix="")

Top-n rows per group

SQL:
Not part of every dialect, but can be done with Postgres window functions; the window alias cannot be filtered directly in WHERE, so wrap it in a subquery:
SELECT * FROM (
    SELECT *, rank() OVER (PARTITION BY grp ORDER BY col DESC) AS pos
    FROM table
) ranked
WHERE pos <= 5;

dplyr:
table %>% group_by(grp) %>% top_n(5, col)
(may need ungroup() at the end?)

data.table:
table[, .SD[order(-col)[1:5]], by=grp]

pandas:
table.sort_values(['grp','col'], ascending=[True, False]).groupby('grp').head(5)

SFrame:
No builtin. The following is based on this how-to:
import graphlab as gl
table = table.sort(['grp', 'col'], ascending=False)
table = table.add_row_number('id')
mins = table.groupby('grp', gl.aggregate.MIN('id'))  # per-group minimum row number
table = table.join(mins, 'grp', how='left')
table['rank'] = table['id'] - table['Min of id']  # 0-based rank within each group
top_k = table[table['rank'] < 5]

Cumulative sum per group

SQL:
Not part of every dialect, but can be done with Postgres window functions:
SELECT grp, col, value,
    sum(value) OVER (PARTITION BY grp ORDER BY col) AS cum_value
FROM table
ORDER BY grp, col;

dplyr:
table %>% group_by(grp) %>% mutate(cum_value = cumsum(value))

data.table:
table[, cum_value := cumsum(value), by=grp]

pandas:
table['cum_value'] = table.groupby('grp')['value'].cumsum()

SFrame:
No builtin; it may be possible to use the numpy integration described at http://forum.dato.com/discussion/comment/3566#Comment_3566