Benchmark different codecs in a ClickHouse table
// I am trying to benchmark different ways to store stack traces in a ClickHouse DB and compare the compression of different codecs.
// The way it works is:
// 1. drop/create the table with the selected codecs
// 2. insert Array(UInt32) (or Array(UUID)) rows that mimic valid stack traces, similar to pprof's list of LocationIDs.
//    minCallStackDepth/maxCallStackDepth define the height range of the call stack, and maxLocationCount defines the LocationID cardinality.
//    The default of 10_000 might be OK-ish, since for a mid-sized application there will probably not be more than 10_000 unique
//    functions/LocationIDs. This is an assumption, though.
// 3. optimize the table and run the stats query to see column stats. The most important number is the ratio. For example, with (T64+ZSTD) on Array(UInt32) I get
//    ~2.3x compression on my local machine.
// This is how I ran ClickHouse on my M1:
// docker run \
//   --platform linux/amd64 \
//   -ti \
//   --ulimit nofile=262144:262144 \
//   --volume=$HOME/clickhouse-db:/var/lib/clickhouse \
//   -p 8123:8123 \
//   -p 9000:9000 \
//   clickhouse/clickhouse-server
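// Once the container is up, a quick sanity check of the HTTP interface
// (assuming the default port mapping above) is:
//   curl http://localhost:8123/ping   # should print "Ok."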
package main

import (
	"context"
	"fmt"
	"log"
	"math/rand"
	"time"

	// Note: the clickhouse.Open/Options API used below is the v2 native
	// interface, so the /v2 module path is required.
	"github.com/ClickHouse/clickhouse-go/v2"
)
const tableName = `t31`
const minCallStackDepth = 1
const maxCallStackDepth = 100
const maxLocationCount = 10_000

var ddl = fmt.Sprintf("CREATE TABLE %s (A Array(UInt32) CODEC(T64, ZSTD)) ENGINE = MergeTree ORDER BY A;", tableName)
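// Other codec combinations worth swapping into the DDL above for comparison
// (all are standard ClickHouse column codecs; results will vary with the data):
//   CODEC(LZ4)               -- the default: fastest, usually weakest compression
//   CODEC(ZSTD)              -- general-purpose baseline
//   CODEC(Delta, ZSTD)       -- delta-encode before compressing
//   CODEC(DoubleDelta, ZSTD)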
// Row type for reading the Array(UInt32) column back; the UUID variant
// below mirrors it. The stats query in main uses its own result struct.
var result []struct {
	A []uint32 `ch:"A"`
}
var statsSql = fmt.Sprintf(`SELECT
    database,
    table,
    column,
    type,
    sum(rows) AS rows,
    sum(column_data_compressed_bytes) AS compressed_bytes,
    formatReadableSize(compressed_bytes) AS compressed,
    formatReadableSize(sum(column_data_uncompressed_bytes)) AS uncompressed,
    sum(column_data_uncompressed_bytes) / compressed_bytes AS ratio,
    any(compression_codec) AS codec
FROM system.parts_columns AS pc
LEFT JOIN system.columns AS c
    ON (pc.database = c.database) AND (c.table = pc.table) AND (c.name = pc.column)
WHERE (database LIKE '%%') AND (table LIKE '%s%%') AND active
GROUP BY
    database,
    table,
    column,
    type
ORDER BY ratio DESC`, tableName)
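// ratio above is uncompressed bytes / compressed bytes per column, so higher
// is better: a ratio of ~2.3 means the codec shrank the column to roughly
// 43% of its raw size.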
func randInt(min int, max int) int {
	return min + rand.Intn(max-min)
}

// getRandomSlice generates a pseudo-random call stack for row i. Seeding with
// the row index makes the data deterministic across runs, so different codec
// runs compress identical input and the ratios stay comparable.
func getRandomSlice(i int64) []uint32 {
	// todo: use generics?
	rand.Seed(i)
	slice := make([]uint32, randInt(minCallStackDepth, maxCallStackDepth))
	for i := 0; i < len(slice); i++ {
		slice[i] = uint32(rand.Intn(maxLocationCount))
	}
	return slice
}
// Alternative payload: Array(UUID) (requires the github.com/google/uuid
// package and a matching Array(UUID) DDL):
// var result []struct {
// 	A []uuid.UUID `ch:"A"`
// }
// func getRandomUUIDSlice() []uuid.UUID {
// 	rand.Seed(1)
// 	slice := make([]uuid.UUID, randInt(minCallStackDepth, maxCallStackDepth))
// 	for i := 0; i < len(slice); i++ {
// 		slice[i] = uuid.New()
// 	}
// 	return slice
// }
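// A matching DDL for the UUID variant might look like the following (the
// codec choice is illustrative; note that T64 only applies to integer-like
// types, so it cannot be used on a UUID column):
// CREATE TABLE t31 (A Array(UUID) CODEC(ZSTD)) ENGINE = MergeTree ORDER BY A;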
func main() {
	var (
		ctx       = context.Background()
		conn, err = clickhouse.Open(&clickhouse.Options{
			Addr: []string{"127.0.0.1:9000"},
			Auth: clickhouse.Auth{
				Database: "default",
				Username: "default",
				Password: "",
			},
			// Debug: true,
			DialTimeout:     time.Second,
			MaxOpenConns:    10,
			MaxIdleConns:    5,
			ConnMaxLifetime: time.Hour,
		})
	)
	if err != nil {
		log.Fatal(err)
	}
	// step 1: drop/create the table with the selected codecs
	if err := conn.Exec(ctx, fmt.Sprintf("DROP TABLE IF EXISTS %s SYNC", tableName)); err != nil {
		log.Fatal(err)
	}
	if err := conn.Exec(ctx, ddl); err != nil {
		log.Fatal(err)
	}

	// step 2: insert 100_000 synthetic stack traces in a single batch
	batch, err := conn.PrepareBatch(ctx, fmt.Sprintf("INSERT INTO %s", tableName))
	if err != nil {
		log.Fatal(err)
	}
	for i := 0; i < 100_000; i++ {
		err := batch.Append(getRandomSlice(int64(i)))
		if err != nil {
			log.Fatal(err)
		}
	}
	err = batch.Send()
	if err != nil {
		log.Fatal(err)
	}
	// step 3: optimize the table so parts are merged before measuring
	if err = conn.Exec(ctx, fmt.Sprintf("OPTIMIZE TABLE %s FINAL", tableName)); err != nil {
		log.Fatal(err)
	}
	// retrieve column stats
	var result []struct {
		Database        string  `ch:"database"`
		Table           string  `ch:"table"`
		Column          string  `ch:"column"`
		Type            string  `ch:"type"`
		Rows            uint64  `ch:"rows"`
		CompressedBytes uint64  `ch:"compressed_bytes"`
		Compressed      string  `ch:"compressed"`
		Uncompressed    string  `ch:"uncompressed"`
		Ratio           float64 `ch:"ratio"`
		Codec           string  `ch:"codec"`
	}
	if err = conn.Select(ctx, &result, statsSql); err != nil {
		log.Fatal(err)
	}
	log.Println(result)
}
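// With the container from above listening on 9000, this can be run as, e.g.,
// `go run main.go`; each printed row corresponds to one column of the table,
// with its compressed/uncompressed sizes and the achieved compression ratio.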