Skip to content

Instantly share code, notes, and snippets.

@macobo
Created January 10, 2022 09:46
Show Gist options
  • Select an option

  • Save macobo/6563e830a3f1e2d724313e470ae1073f to your computer and use it in GitHub Desktop.

Select an option

Save macobo/6563e830a3f1e2d724313e470ae1073f to your computer and use it in GitHub Desktop.
-- breakdown1-baseline: top-25 person-property 'name' values by PushEvent count.
-- The value is extracted from the joined person's properties JSON at query time.
SELECT groupArray(value)
FROM (
SELECT trim(BOTH '"' FROM JSONExtractRaw(person_props, 'name')) AS value,
count(*) as count
FROM events e
-- latest non-deleted person_id per distinct_id (rows are versioned)
INNER JOIN (
SELECT distinct_id,
argMax(person_id, version) as person_id
FROM person_distinct_id2
WHERE team_id = 1
GROUP BY distinct_id
HAVING argMax(is_deleted, version) = 0
) AS pdi
ON e.distinct_id = pdi.distinct_id
-- latest properties blob per person; max(is_deleted) = 0 keeps only
-- persons never marked deleted
INNER JOIN (
SELECT id,
argMax(properties, _timestamp) as person_props
FROM person
WHERE team_id = 1
GROUP BY id
HAVING max(is_deleted) = 0
) person
ON pdi.person_id = person.id
WHERE team_id = 1
AND event = 'PushEvent'
AND timestamp >= '2011-01-01 00:00:00'
AND timestamp <= '2022-01-07 23:59:59'
GROUP BY value
ORDER BY count DESC
LIMIT 25
OFFSET 0
)
SETTINGS max_memory_usage=80000000000
-- clickhouse-flamegraphs/benchmark-1B-events/breakdown1-baseline.sql
-- breakdown1-baseline2: same breakdown, but the JSON extraction is pushed
-- inside the person subquery so only the extracted value (not the full
-- properties blob) crosses the join. Note the higher memory cap (90 GB).
SELECT groupArray(value)
FROM (
SELECT person.value AS value,
count(*) as count
FROM events e
-- latest non-deleted person_id per distinct_id
INNER JOIN (
SELECT distinct_id,
argMax(person_id, version) as person_id
FROM person_distinct_id2
WHERE team_id = 1
GROUP BY distinct_id
HAVING argMax(is_deleted, version) = 0
) AS pdi
ON e.distinct_id = pdi.distinct_id
INNER JOIN (
SELECT id,
-- Note: Fetching all properties always OOMs
trim(BOTH '"' FROM JSONExtractRaw(argMax(properties, _timestamp), 'name')) AS value
FROM person
WHERE team_id = 1
GROUP BY id
HAVING max(is_deleted) = 0
) person
ON pdi.person_id = person.id
WHERE team_id = 1
AND event = 'PushEvent'
AND timestamp >= '2011-01-01 00:00:00'
AND timestamp <= '2022-01-07 23:59:59'
GROUP BY value
ORDER BY count DESC
LIMIT 25
OFFSET 0
)
SETTINGS max_memory_usage=90000000000
-- clickhouse-flamegraphs/benchmark-1B-events/breakdown1-baseline2.sql
-- breakdown1-mat: same breakdown, reading the materialized column pmat_name
-- on the person table instead of extracting from the JSON blob.
SELECT groupArray(value)
FROM (
SELECT person.value AS value,
count(*) as count
FROM events e
-- latest non-deleted person_id per distinct_id
INNER JOIN (
SELECT distinct_id,
argMax(person_id, version) as person_id
FROM person_distinct_id2
WHERE team_id = 1
GROUP BY distinct_id
HAVING argMax(is_deleted, version) = 0
) AS pdi
ON e.distinct_id = pdi.distinct_id
INNER JOIN (
SELECT id,
-- pmat_name is declared via ALTER TABLE person ADD COLUMN ... MATERIALIZED
argMax(pmat_name, _timestamp) AS value
FROM person
WHERE team_id = 1
GROUP BY id
HAVING max(is_deleted) = 0
) person
ON pdi.person_id = person.id
WHERE team_id = 1
AND event = 'PushEvent'
AND timestamp >= '2011-01-01 00:00:00'
AND timestamp <= '2022-01-07 23:59:59'
GROUP BY value
ORDER BY count DESC
LIMIT 25
OFFSET 0
)
SETTINGS max_memory_usage=80000000000
-- clickhouse-flamegraphs/benchmark-1B-events/breakdown1-mat.sql
-- OOM
-- breakdown1-old: identical shape to breakdown1-baseline2 but capped at 80 GB
-- (the "-- OOM" note above records that this variant ran out of memory).
SELECT groupArray(value)
FROM (
SELECT person.value AS value,
count(*) as count
FROM events e
-- latest non-deleted person_id per distinct_id
INNER JOIN (
SELECT distinct_id,
argMax(person_id, version) as person_id
FROM person_distinct_id2
WHERE team_id = 1
GROUP BY distinct_id
HAVING argMax(is_deleted, version) = 0
) AS pdi
ON e.distinct_id = pdi.distinct_id
INNER JOIN (
SELECT id,
-- Note: Fetching all properties always OOMs
trim(BOTH '"' FROM JSONExtractRaw(argMax(properties, _timestamp), 'name')) AS value
FROM person
WHERE team_id = 1
GROUP BY id
HAVING max(is_deleted) = 0
) person
ON pdi.person_id = person.id
WHERE team_id = 1
AND event = 'PushEvent'
AND timestamp >= '2011-01-01 00:00:00'
AND timestamp <= '2022-01-07 23:59:59'
GROUP BY value
ORDER BY count DESC
LIMIT 25
OFFSET 0
)
SETTINGS max_memory_usage=80000000000
-- clickhouse-flamegraphs/benchmark-1B-events/breakdown1-old.sql
-- breakdown1-person_id-mat: no joins at all — reads the materialized
-- pmat_name column denormalized directly onto the events table.
SELECT groupArray(value)
FROM (
SELECT pmat_name AS value,
count(*) as count
FROM events e
WHERE team_id = 1
AND event = 'PushEvent'
AND timestamp >= '2011-01-01 00:00:00'
AND timestamp <= '2022-01-07 23:59:59'
GROUP BY value
ORDER BY count DESC
LIMIT 25
OFFSET 0
)
SETTINGS max_memory_usage=80000000000
-- clickhouse-flamegraphs/benchmark-1B-events/breakdown1-person_id-mat.sql
-- 1 rows in set. Elapsed: 66.893 sec. Processed 2.06 billion rows, 137.56 GB (30.82 million rows/s., 2.06 GB/s.)
-- Memory usage: 19.06 GiB
-- breakdown1-person_id: no joins — extracts 'name' at query time from the
-- denormalized person_properties JSON column on events.
SELECT groupArray(value)
FROM (
SELECT trim(BOTH '"' FROM JSONExtractRaw(person_properties, 'name')) AS value,
count(*) as count
FROM events e
WHERE team_id = 1
AND event = 'PushEvent'
AND timestamp >= '2011-01-01 00:00:00'
AND timestamp <= '2022-01-07 23:59:59'
GROUP BY value
ORDER BY count DESC
LIMIT 25
OFFSET 0
)
SETTINGS max_memory_usage=80000000000
-- clickhouse-flamegraphs/benchmark-1B-events/breakdown1-person_id.sql
-- 1 rows in set. Elapsed: 304.919 sec. Processed 2.06 billion rows, 597.37 GB (6.75 million rows/s., 1.96 GB/s.)
-- Memory: 18.82 GiB
-- dau-baseline: unique persons doing PushEvent per month (buckets use
-- toStartOfMonth despite the day_start naming). Zero-filler rows are
-- generated per DAY, so each month gets many zero rows that SUM() absorbs.
SELECT groupArray(day_start) as date,
groupArray(count) as data
FROM (
SELECT SUM(total) AS count,
day_start
from (
-- zero-fill branch: one (0, month) row per day in the date range
SELECT toUInt16(0) AS total,
toStartOfMonth(toDateTime('2022-01-05 23:59:59') - toIntervalDay(number)) AS day_start
FROM numbers(dateDiff('day', toDateTime('2011-01-01 00:00:00'), toDateTime('2022-01-05 23:59:59')))
UNION ALL SELECT toUInt16(0) AS total,
toStartOfMonth(toDateTime('2011-01-01 00:00:00'))
-- data branch: UNION ALL aligns by position, so (data, date) map onto
-- (total, day_start) from the first branch
UNION ALL SELECT count(DISTINCT person_id) as data,
toDateTime(toStartOfMonth(timestamp), 'UTC') as date
FROM (
SELECT e.timestamp as timestamp,
pdi.person_id as person_id
FROM events e
-- latest non-deleted person_id per distinct_id (note: _id3 table here)
INNER JOIN (
SELECT distinct_id,
argMax(person_id, version) as person_id
FROM person_distinct_id3
WHERE team_id = 1
GROUP BY distinct_id
HAVING argMax(is_deleted, version) = 0
) AS pdi
ON e.distinct_id = pdi.distinct_id
WHERE team_id = 1
AND event = 'PushEvent'
AND toStartOfMonth(timestamp) >= toStartOfMonth(toDateTime('2011-01-01 00:00:00'))
AND timestamp <= '2022-01-05 23:59:59'
)
GROUP BY toStartOfMonth(timestamp)
)
group by day_start
order by day_start
)
SETTINGS max_memory_usage=80000000000
-- clickhouse-flamegraphs/benchmark-1B-events/dau-baseline.sql
-- 1 rows in set. Elapsed: 93.549 sec. Processed 2.11 billion rows, 141.55 GB (22.54 million rows/s., 1.51 GB/s.)
-- Memory usage: 22.51 GiB
-- dau-old: textually identical to dau-baseline above (monthly unique persons
-- via the person_distinct_id3 join); kept as a separate benchmark file.
SELECT groupArray(day_start) as date,
groupArray(count) as data
FROM (
SELECT SUM(total) AS count,
day_start
from (
-- zero-fill branch: one (0, month) row per day in the date range
SELECT toUInt16(0) AS total,
toStartOfMonth(toDateTime('2022-01-05 23:59:59') - toIntervalDay(number)) AS day_start
FROM numbers(dateDiff('day', toDateTime('2011-01-01 00:00:00'), toDateTime('2022-01-05 23:59:59')))
UNION ALL SELECT toUInt16(0) AS total,
toStartOfMonth(toDateTime('2011-01-01 00:00:00'))
UNION ALL SELECT count(DISTINCT person_id) as data,
toDateTime(toStartOfMonth(timestamp), 'UTC') as date
FROM (
SELECT e.timestamp as timestamp,
pdi.person_id as person_id
FROM events e
INNER JOIN (
SELECT distinct_id,
argMax(person_id, version) as person_id
FROM person_distinct_id3
WHERE team_id = 1
GROUP BY distinct_id
HAVING argMax(is_deleted, version) = 0
) AS pdi
ON e.distinct_id = pdi.distinct_id
WHERE team_id = 1
AND event = 'PushEvent'
AND toStartOfMonth(timestamp) >= toStartOfMonth(toDateTime('2011-01-01 00:00:00'))
AND timestamp <= '2022-01-05 23:59:59'
)
GROUP BY toStartOfMonth(timestamp)
)
group by day_start
order by day_start
)
SETTINGS max_memory_usage=80000000000
-- clickhouse-flamegraphs/benchmark-1B-events/dau-old.sql
-- 1 rows in set. Elapsed: 97.868 sec. Processed 2.11 billion rows, 141.55 GB (21.54 million rows/s., 1.45 GB/s.)
-- dau-person_id: join-free variant — uses distinct_id directly as the person
-- identifier. No SETTINGS clause on this one (server default memory cap).
SELECT groupArray(day_start) as date,
groupArray(count) as data
FROM (
SELECT SUM(total) AS count,
day_start
from (
-- zero-fill branch: one (0, month) row per day in the date range
SELECT toUInt16(0) AS total,
toStartOfMonth(toDateTime('2022-01-05 23:59:59') - toIntervalDay(number)) AS day_start
FROM numbers(dateDiff('day', toDateTime('2011-01-01 00:00:00'), toDateTime('2022-01-05 23:59:59')))
UNION ALL SELECT toUInt16(0) AS total,
toStartOfMonth(toDateTime('2011-01-01 00:00:00'))
UNION ALL SELECT count(DISTINCT person_id) as data,
toDateTime(toStartOfMonth(timestamp), 'UTC') as date
FROM (
SELECT e.timestamp as timestamp,
distinct_id as person_id
FROM events e
WHERE team_id = 1
AND event = 'PushEvent'
AND toStartOfMonth(timestamp) >= toStartOfMonth(toDateTime('2011-01-01 00:00:00'))
AND timestamp <= '2022-01-05 23:59:59'
)
GROUP BY toStartOfMonth(timestamp)
)
group by day_start
order by day_start
)
-- clickhouse-flamegraphs/benchmark-1B-events/dau-person_id.sql
-- 1 rows in set. Elapsed: 69.739 sec. Processed 2.06 billion rows, 137.33 GB (29.52 million rows/s., 1.97 GB/s.)
-- Memory usage: 20.44 GB
-- clickhouse-flamegraphs/benchmark-1B-events/funnel.sql
-- funnel: 2-step CreateEvent -> PullRequestEvent conversion within 14 days,
-- per person_id resolved through the person_distinct_id2 join, on events_old.
-- Uses window functions (min OVER / max OVER) to find each target's best step.
SELECT countIf(steps = 1) step_1,
countIf(steps = 2) step_2,
avg(step_1_average_conversion_time_inner) step_1_average_conversion_time,
median(step_1_median_conversion_time_inner) step_1_median_conversion_time
FROM (
SELECT aggregation_target,
steps,
avg(step_1_conversion_time) step_1_average_conversion_time_inner,
median(step_1_conversion_time) step_1_median_conversion_time_inner
FROM (
SELECT aggregation_target,
steps,
-- highest step this target reached across all its rows
max(steps) over (PARTITION BY aggregation_target) as max_steps,
step_1_conversion_time
FROM (
SELECT *,
if(latest_0 < latest_1 AND latest_1 <= latest_0 + INTERVAL 14 DAY, 2, 1) AS steps ,
if(isNotNull(latest_1) AND latest_1 <= latest_0 + INTERVAL 14 DAY, dateDiff('second', toDateTime(latest_0), toDateTime(latest_1)), NULL) step_1_conversion_time
FROM (
SELECT aggregation_target,
timestamp,
step_0,
latest_0,
step_1,
-- earliest step-1 timestamp at or after this row (timestamp DESC frame)
min(latest_1) over (PARTITION by aggregation_target ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_1
FROM (
SELECT aggregation_target,
timestamp,
if(event = 'CreateEvent', 1, 0) as step_0,
if(step_0 = 1, timestamp, null) as latest_0,
if(event = 'PullRequestEvent', 1, 0) as step_1,
if(step_1 = 1, timestamp, null) as latest_1
FROM (
SELECT e.event as event,
e.team_id as team_id,
e.distinct_id as distinct_id,
e.timestamp as timestamp,
pdi.person_id as aggregation_target
FROM events_old e
-- latest non-deleted person_id per distinct_id
INNER JOIN (
SELECT distinct_id,
argMax(person_id, version) as person_id
FROM person_distinct_id2
WHERE team_id = 1
GROUP BY distinct_id
HAVING argMax(is_deleted, version) = 0
) AS pdi
ON e.distinct_id = pdi.distinct_id
WHERE team_id = 1
AND event IN ['CreateEvent', 'PullRequestEvent']
AND timestamp >= '2019-01-01 00:00:00'
AND timestamp <= '2022-01-07 23:59:59'
) events
WHERE (step_0 = 1 OR step_1 = 1)
)
)
WHERE step_0 = 1 SETTINGS allow_experimental_window_functions = 1
)
)
GROUP BY aggregation_target,
steps
HAVING steps = max_steps SETTINGS allow_experimental_window_functions = 1
)
SETTINGS max_memory_usage=80000000000
-- 1 rows in set. Elapsed: 240.976 sec. Processed 1.38 billion rows, 96.56 GB (5.74 million rows/s., 400.71 MB/s.)
-- funnel variant on events_old: same 2-step funnel, but aggregation_target is
-- the raw distinct_id (no person_distinct_id join).
SELECT countIf(steps = 1) step_1,
countIf(steps = 2) step_2,
avg(step_1_average_conversion_time_inner) step_1_average_conversion_time,
median(step_1_median_conversion_time_inner) step_1_median_conversion_time
FROM (
SELECT aggregation_target,
steps,
avg(step_1_conversion_time) step_1_average_conversion_time_inner,
median(step_1_conversion_time) step_1_median_conversion_time_inner
FROM (
SELECT aggregation_target,
steps,
max(steps) over (PARTITION BY aggregation_target) as max_steps,
step_1_conversion_time
FROM (
SELECT *,
if(latest_0 < latest_1 AND latest_1 <= latest_0 + INTERVAL 14 DAY, 2, 1) AS steps ,
if(isNotNull(latest_1) AND latest_1 <= latest_0 + INTERVAL 14 DAY, dateDiff('second', toDateTime(latest_0), toDateTime(latest_1)), NULL) step_1_conversion_time
FROM (
SELECT aggregation_target,
timestamp,
step_0,
latest_0,
step_1,
min(latest_1) over (PARTITION by aggregation_target ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_1
FROM (
SELECT aggregation_target,
timestamp,
if(event = 'CreateEvent', 1, 0) as step_0,
if(step_0 = 1, timestamp, null) as latest_0,
if(event = 'PullRequestEvent', 1, 0) as step_1,
if(step_1 = 1, timestamp, null) as latest_1
FROM (
SELECT e.event as event,
e.team_id as team_id,
e.distinct_id as distinct_id,
e.timestamp as timestamp,
distinct_id as aggregation_target
FROM events_old e
WHERE team_id = 1
AND event IN ['CreateEvent', 'PullRequestEvent']
AND timestamp >= '2019-01-01 00:00:00'
AND timestamp <= '2022-01-07 23:59:59'
) events
WHERE (step_0 = 1 OR step_1 = 1)
)
)
WHERE step_0 = 1 SETTINGS allow_experimental_window_functions = 1
)
)
GROUP BY aggregation_target,
steps
HAVING steps = max_steps SETTINGS allow_experimental_window_functions = 1
) SETTINGS allow_experimental_window_functions = 1
-- funnel variant on the new events table: aggregation_target is the raw
-- distinct_id (no person join).
SELECT countIf(steps = 1) step_1,
countIf(steps = 2) step_2,
avg(step_1_average_conversion_time_inner) step_1_average_conversion_time,
median(step_1_median_conversion_time_inner) step_1_median_conversion_time
FROM (
SELECT aggregation_target,
steps,
avg(step_1_conversion_time) step_1_average_conversion_time_inner,
median(step_1_conversion_time) step_1_median_conversion_time_inner
FROM (
SELECT aggregation_target,
steps,
max(steps) over (PARTITION BY aggregation_target) as max_steps,
step_1_conversion_time
FROM (
SELECT *,
if(latest_0 < latest_1 AND latest_1 <= latest_0 + INTERVAL 14 DAY, 2, 1) AS steps ,
if(isNotNull(latest_1) AND latest_1 <= latest_0 + INTERVAL 14 DAY, dateDiff('second', toDateTime(latest_0), toDateTime(latest_1)), NULL) step_1_conversion_time
FROM (
SELECT aggregation_target,
timestamp,
step_0,
latest_0,
step_1,
min(latest_1) over (PARTITION by aggregation_target ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_1
FROM (
SELECT aggregation_target,
timestamp,
if(event = 'CreateEvent', 1, 0) as step_0,
if(step_0 = 1, timestamp, null) as latest_0,
if(event = 'PullRequestEvent', 1, 0) as step_1,
if(step_1 = 1, timestamp, null) as latest_1
FROM (
SELECT e.event as event,
e.team_id as team_id,
e.distinct_id as distinct_id,
e.timestamp as timestamp,
distinct_id as aggregation_target
FROM events e
WHERE team_id = 1
AND event IN ['CreateEvent', 'PullRequestEvent']
AND timestamp >= '2019-01-01 00:00:00'
AND timestamp <= '2022-01-07 23:59:59'
) events
WHERE (step_0 = 1 OR step_1 = 1)
)
)
WHERE step_0 = 1 SETTINGS allow_experimental_window_functions = 1
)
)
GROUP BY aggregation_target,
steps
HAVING steps = max_steps SETTINGS allow_experimental_window_functions = 1
) SETTINGS allow_experimental_window_functions = 1
-- 1 rows in set. Elapsed: 140.824 sec. Processed 278.36 million rows, 19.03 GB (1.98 million rows/s., 135.15 MB/s.)
-- clickhouse-flamegraphs/benchmark-1B-events/funnel.sql
-- funnel variant on the new events table with the person_distinct_id2 join
-- (aggregation_target = resolved person_id).
SELECT countIf(steps = 1) step_1,
countIf(steps = 2) step_2,
avg(step_1_average_conversion_time_inner) step_1_average_conversion_time,
median(step_1_median_conversion_time_inner) step_1_median_conversion_time
FROM (
SELECT aggregation_target,
steps,
avg(step_1_conversion_time) step_1_average_conversion_time_inner,
median(step_1_conversion_time) step_1_median_conversion_time_inner
FROM (
SELECT aggregation_target,
steps,
max(steps) over (PARTITION BY aggregation_target) as max_steps,
step_1_conversion_time
FROM (
SELECT *,
if(latest_0 < latest_1 AND latest_1 <= latest_0 + INTERVAL 14 DAY, 2, 1) AS steps ,
if(isNotNull(latest_1) AND latest_1 <= latest_0 + INTERVAL 14 DAY, dateDiff('second', toDateTime(latest_0), toDateTime(latest_1)), NULL) step_1_conversion_time
FROM (
SELECT aggregation_target,
timestamp,
step_0,
latest_0,
step_1,
min(latest_1) over (PARTITION by aggregation_target ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_1
FROM (
SELECT aggregation_target,
timestamp,
if(event = 'CreateEvent', 1, 0) as step_0,
if(step_0 = 1, timestamp, null) as latest_0,
if(event = 'PullRequestEvent', 1, 0) as step_1,
if(step_1 = 1, timestamp, null) as latest_1
FROM (
SELECT e.event as event,
e.team_id as team_id,
e.distinct_id as distinct_id,
e.timestamp as timestamp,
pdi.person_id as aggregation_target
FROM events e
-- latest non-deleted person_id per distinct_id
INNER JOIN (
SELECT distinct_id,
argMax(person_id, version) as person_id
FROM person_distinct_id2
WHERE team_id = 1
GROUP BY distinct_id
HAVING argMax(is_deleted, version) = 0
) AS pdi
ON e.distinct_id = pdi.distinct_id
WHERE team_id = 1
AND event IN ['CreateEvent', 'PullRequestEvent']
AND timestamp >= '2019-01-01 00:00:00'
AND timestamp <= '2022-01-07 23:59:59'
) events
WHERE (step_0 = 1 OR step_1 = 1)
)
)
WHERE step_0 = 1 SETTINGS allow_experimental_window_functions = 1
)
)
GROUP BY aggregation_target,
steps
HAVING steps = max_steps SETTINGS allow_experimental_window_functions = 1
)
SETTINGS max_memory_usage=80000000000
-- 1 rows in set. Elapsed: 201.670 sec. Processed 423.82 million rows, 31.33 GB (2.10 million rows/s., 155.35 MB/s.)
-- events: main event store. ReplacingMergeTree(_timestamp) dedups rows with
-- the same sorting key at merge time, keeping the latest _timestamp.
-- $group_0..4 are MATERIALIZED columns extracted from the properties JSON.
CREATE TABLE events
(
`uuid` UUID,
`event` String,
`properties` String,
`person_properties` String,
`timestamp` DateTime64(6, 'UTC'),
`team_id` Int64,
`distinct_id` String,
`elements_chain` String,
`created_at` DateTime64(6, 'UTC'),
`$group_0` String MATERIALIZED replaceRegexpAll(JSONExtractRaw(properties, '$group_0'), concat('^[', regexpQuoteMeta('"'), ']*|[', regexpQuoteMeta('"'), ']*$'), '') COMMENT 'column_materializer::$group_0',
`$group_1` String MATERIALIZED replaceRegexpAll(JSONExtractRaw(properties, '$group_1'), concat('^[', regexpQuoteMeta('"'), ']*|[', regexpQuoteMeta('"'), ']*$'), '') COMMENT 'column_materializer::$group_1',
`$group_2` String MATERIALIZED replaceRegexpAll(JSONExtractRaw(properties, '$group_2'), concat('^[', regexpQuoteMeta('"'), ']*|[', regexpQuoteMeta('"'), ']*$'), '') COMMENT 'column_materializer::$group_2',
`$group_3` String MATERIALIZED replaceRegexpAll(JSONExtractRaw(properties, '$group_3'), concat('^[', regexpQuoteMeta('"'), ']*|[', regexpQuoteMeta('"'), ']*$'), '') COMMENT 'column_materializer::$group_3',
`$group_4` String MATERIALIZED replaceRegexpAll(JSONExtractRaw(properties, '$group_4'), concat('^[', regexpQuoteMeta('"'), ']*|[', regexpQuoteMeta('"'), ']*$'), '') COMMENT 'column_materializer::$group_4',
`_timestamp` DateTime,
`_offset` UInt64
)
ENGINE = ReplacingMergeTree(_timestamp)
PARTITION BY toYYYYMM(timestamp)
-- SAMPLE BY key must be part of the sorting key (cityHash64(distinct_id) is)
ORDER BY (team_id, toDate(timestamp), event, cityHash64(distinct_id), cityHash64(uuid))
SAMPLE BY cityHash64(distinct_id)
SETTINGS index_granularity = 8192
-- person: person rows, versioned by _timestamp; queries take the latest row
-- per (team_id, id) with argMax(..., _timestamp).
CREATE TABLE person
(
`id` String,
`created_at` DateTime64(3),
`team_id` Int64,
`properties` String,
`is_identified` Int8,
`is_deleted` Int8 DEFAULT 0,
`_timestamp` DateTime,
`_offset` UInt64
)
ENGINE = ReplacingMergeTree(_timestamp)
ORDER BY (team_id, id)
SETTINGS index_granularity = 8192
-- person_distinct_id2: distinct_id -> person_id mapping, versioned; queries
-- take argMax(person_id, version). Small index_granularity (512) for point
-- lookups by distinct_id.
CREATE TABLE person_distinct_id2
(
`team_id` Int64,
`distinct_id` String,
`person_id` String,
`is_deleted` Int8,
`version` Int64 DEFAULT 1,
`_timestamp` DateTime,
`_offset` UInt64
)
ENGINE = ReplacingMergeTree(version)
ORDER BY (team_id, distinct_id)
SETTINGS index_granularity = 512
-- Seed the id mapping: one row per repo, with repo_name doubling as both
-- distinct_id and person_id (1:1 mapping for this benchmark).
INSERT INTO person_distinct_id2 (team_id, distinct_id, person_id)
SELECT 1 as team_id, repo_name as distinct_id, repo_name as person_id
FROM github_events
GROUP BY repo_name
SETTINGS max_memory_usage=40000000000
-- Seed person rows: one person per repo, properties built as a JSON map of
-- aggregates over that repo's Create/PullRequest events.
INSERT INTO person (team_id, id, properties)
SELECT
1 as team_id,
repo_name as id,
toJSONString(map(
'name', repo_name,
'comment_id', toString(any(comment_id)),
'created_at', toString(min(created_at)),
'creator_user_login', toString(any(creator_user_login)),
'comments', toString(sum(comments)),
'events', toString(count()),
'updated_at', toString(max(created_at)),
'refs', toString(any(ref)),
'additions', toString(sum(additions)),
'deletions', toString(sum(deletions)),
'changed_files', toString(sum(changed_files))
-- 'users', toString(count(DISTINCT actor_login))
)) as properties
FROM github_events
WHERE event_type IN ('CreateEvent', 'PullRequestEvent')
GROUP BY repo_name
SETTINGS max_memory_usage=80000000000
-- events_view: adapts the github_events dataset to the events schema — every
-- source column is stringified into a properties JSON map, plus a smaller
-- person_properties map. Used by the clickhouse-client piping commands below.
CREATE VIEW events_view AS
SELECT
1 as team_id,
generateUUIDv4() as uuid,
event_type as event,
repo_name as distinct_id,
created_at as timestamp,
created_at as _timestamp,
toJSONString(map(
'file_time', toString(file_time),
'event_type', toString(event_type),
'actor_login', toString(actor_login),
'repo_name', toString(repo_name),
'created_at', toString(created_at),
'updated_at', toString(updated_at),
'action', toString(action),
'comment_id', toString(comment_id),
'body', toString(body),
'path', toString(path),
'position', toString(position),
'line', toString(line),
'ref', toString(ref),
'ref_type', toString(ref_type),
'creator_user_login', toString(creator_user_login),
'number', toString(number),
'title', toString(title),
'labels', toString(labels),
'state', toString(state),
'locked', toString(locked),
'assignee', toString(assignee),
'assignees', toString(assignees),
'comments', toString(comments),
'author_association', toString(author_association),
'closed_at', toString(closed_at),
'merged_at', toString(merged_at),
'merge_commit_sha', toString(merge_commit_sha),
'requested_reviewers', toString(requested_reviewers),
'requested_teams', toString(requested_teams),
'head_ref', toString(head_ref),
'head_sha', toString(head_sha),
'base_ref', toString(base_ref),
'base_sha', toString(base_sha),
'merged', toString(merged),
'mergeable', toString(mergeable),
'rebaseable', toString(rebaseable),
'mergeable_state', toString(mergeable_state),
'merged_by', toString(merged_by),
'review_comments', toString(review_comments),
'commits', toString(commits),
'additions', toString(additions),
'deletions', toString(deletions),
'changed_files', toString(changed_files),
'diff_hunk', toString(diff_hunk),
'original_position', toString(original_position),
'commit_id', toString(commit_id),
'original_commit_id', toString(original_commit_id),
'push_size', toString(push_size),
'push_distinct_size', toString(push_distinct_size),
'member_login', toString(member_login),
'release_tag_name', toString(release_tag_name),
'release_name', toString(release_name),
'review_state', toString(review_state)
)) AS properties,
toJSONString(map(
'name', repo_name,
'comment_id', toString(comment_id),
'created_at', toString(created_at),
'creator_user_login', toString(creator_user_login),
'comments', toString(comments),
'events', toString(1),
'updated_at', toString(created_at),
'refs', toString(ref),
'additions', toString(additions),
'deletions', toString(deletions),
'changed_files', toString(changed_files)
-- 'users', toString(count(DISTINCT actor_login))
)) as person_properties
FROM github_events
ORDER BY toDate(timestamp), event
-- events_view1: same mapping as events_view, restricted to PushEvent rows
-- (the bulk of the dataset), without the final ORDER BY.
CREATE VIEW events_view1 AS
SELECT
1 as team_id,
generateUUIDv4() as uuid,
event_type as event,
repo_name as distinct_id,
created_at as timestamp,
created_at as _timestamp,
toJSONString(map(
'file_time', toString(file_time),
'event_type', toString(event_type),
'actor_login', toString(actor_login),
'repo_name', toString(repo_name),
'created_at', toString(created_at),
'updated_at', toString(updated_at),
'action', toString(action),
'comment_id', toString(comment_id),
'body', toString(body),
'path', toString(path),
'position', toString(position),
'line', toString(line),
'ref', toString(ref),
'ref_type', toString(ref_type),
'creator_user_login', toString(creator_user_login),
'number', toString(number),
'title', toString(title),
'labels', toString(labels),
'state', toString(state),
'locked', toString(locked),
'assignee', toString(assignee),
'assignees', toString(assignees),
'comments', toString(comments),
'author_association', toString(author_association),
'closed_at', toString(closed_at),
'merged_at', toString(merged_at),
'merge_commit_sha', toString(merge_commit_sha),
'requested_reviewers', toString(requested_reviewers),
'requested_teams', toString(requested_teams),
'head_ref', toString(head_ref),
'head_sha', toString(head_sha),
'base_ref', toString(base_ref),
'base_sha', toString(base_sha),
'merged', toString(merged),
'mergeable', toString(mergeable),
'rebaseable', toString(rebaseable),
'mergeable_state', toString(mergeable_state),
'merged_by', toString(merged_by),
'review_comments', toString(review_comments),
'commits', toString(commits),
'additions', toString(additions),
'deletions', toString(deletions),
'changed_files', toString(changed_files),
'diff_hunk', toString(diff_hunk),
'original_position', toString(original_position),
'commit_id', toString(commit_id),
'original_commit_id', toString(original_commit_id),
'push_size', toString(push_size),
'push_distinct_size', toString(push_distinct_size),
'member_login', toString(member_login),
'release_tag_name', toString(release_tag_name),
'release_name', toString(release_name),
'review_state', toString(review_state)
)) AS properties,
toJSONString(map(
'name', repo_name,
'comment_id', toString(comment_id),
'created_at', toString(created_at),
'creator_user_login', toString(creator_user_login),
'comments', toString(comments),
'events', toString(1),
'updated_at', toString(created_at),
'refs', toString(ref),
'additions', toString(additions),
'deletions', toString(deletions),
'changed_files', toString(changed_files)
-- 'users', toString(count(DISTINCT actor_login))
)) as person_properties
FROM github_events
WHERE event_type = 'PushEvent'
-- events_view2: complement of events_view1 — same mapping, all non-PushEvent
-- rows. view1 + view2 together cover the whole dataset.
CREATE VIEW events_view2 AS
SELECT
1 as team_id,
generateUUIDv4() as uuid,
event_type as event,
repo_name as distinct_id,
created_at as timestamp,
created_at as _timestamp,
toJSONString(map(
'file_time', toString(file_time),
'event_type', toString(event_type),
'actor_login', toString(actor_login),
'repo_name', toString(repo_name),
'created_at', toString(created_at),
'updated_at', toString(updated_at),
'action', toString(action),
'comment_id', toString(comment_id),
'body', toString(body),
'path', toString(path),
'position', toString(position),
'line', toString(line),
'ref', toString(ref),
'ref_type', toString(ref_type),
'creator_user_login', toString(creator_user_login),
'number', toString(number),
'title', toString(title),
'labels', toString(labels),
'state', toString(state),
'locked', toString(locked),
'assignee', toString(assignee),
'assignees', toString(assignees),
'comments', toString(comments),
'author_association', toString(author_association),
'closed_at', toString(closed_at),
'merged_at', toString(merged_at),
'merge_commit_sha', toString(merge_commit_sha),
'requested_reviewers', toString(requested_reviewers),
'requested_teams', toString(requested_teams),
'head_ref', toString(head_ref),
'head_sha', toString(head_sha),
'base_ref', toString(base_ref),
'base_sha', toString(base_sha),
'merged', toString(merged),
'mergeable', toString(mergeable),
'rebaseable', toString(rebaseable),
'mergeable_state', toString(mergeable_state),
'merged_by', toString(merged_by),
'review_comments', toString(review_comments),
'commits', toString(commits),
'additions', toString(additions),
'deletions', toString(deletions),
'changed_files', toString(changed_files),
'diff_hunk', toString(diff_hunk),
'original_position', toString(original_position),
'commit_id', toString(commit_id),
'original_commit_id', toString(original_commit_id),
'push_size', toString(push_size),
'push_distinct_size', toString(push_distinct_size),
'member_login', toString(member_login),
'release_tag_name', toString(release_tag_name),
'release_name', toString(release_name),
'review_state', toString(review_state)
)) AS properties,
toJSONString(map(
'name', repo_name,
'comment_id', toString(comment_id),
'created_at', toString(created_at),
'creator_user_login', toString(creator_user_login),
'comments', toString(comments),
'events', toString(1),
'updated_at', toString(created_at),
'refs', toString(ref),
'additions', toString(additions),
'deletions', toString(deletions),
'changed_files', toString(changed_files)
-- 'users', toString(count(DISTINCT actor_login))
)) as person_properties
FROM github_events
WHERE event_type != 'PushEvent'
-- Bulk-load events_old2 with the same row mapping as events_view, reading
-- from github_events2 (note the different source table).
INSERT INTO events_old2 (team_id, uuid, event, distinct_id, timestamp, _timestamp, properties, person_properties)
SELECT
1 as team_id,
generateUUIDv4() as uuid,
event_type as event,
repo_name as distinct_id,
created_at as timestamp,
created_at as _timestamp,
toJSONString(map(
'file_time', toString(file_time),
'event_type', toString(event_type),
'actor_login', toString(actor_login),
'repo_name', toString(repo_name),
'created_at', toString(created_at),
'updated_at', toString(updated_at),
'action', toString(action),
'comment_id', toString(comment_id),
'body', toString(body),
'path', toString(path),
'position', toString(position),
'line', toString(line),
'ref', toString(ref),
'ref_type', toString(ref_type),
'creator_user_login', toString(creator_user_login),
'number', toString(number),
'title', toString(title),
'labels', toString(labels),
'state', toString(state),
'locked', toString(locked),
'assignee', toString(assignee),
'assignees', toString(assignees),
'comments', toString(comments),
'author_association', toString(author_association),
'closed_at', toString(closed_at),
'merged_at', toString(merged_at),
'merge_commit_sha', toString(merge_commit_sha),
'requested_reviewers', toString(requested_reviewers),
'requested_teams', toString(requested_teams),
'head_ref', toString(head_ref),
'head_sha', toString(head_sha),
'base_ref', toString(base_ref),
'base_sha', toString(base_sha),
'merged', toString(merged),
'mergeable', toString(mergeable),
'rebaseable', toString(rebaseable),
'mergeable_state', toString(mergeable_state),
'merged_by', toString(merged_by),
'review_comments', toString(review_comments),
'commits', toString(commits),
'additions', toString(additions),
'deletions', toString(deletions),
'changed_files', toString(changed_files),
'diff_hunk', toString(diff_hunk),
'original_position', toString(original_position),
'commit_id', toString(commit_id),
'original_commit_id', toString(original_commit_id),
'push_size', toString(push_size),
'push_distinct_size', toString(push_distinct_size),
'member_login', toString(member_login),
'release_tag_name', toString(release_tag_name),
'release_name', toString(release_name),
'review_state', toString(review_state)
)) AS properties,
toJSONString(map(
'name', repo_name,
'comment_id', toString(comment_id),
'created_at', toString(created_at),
'creator_user_login', toString(creator_user_login),
'comments', toString(comments),
'events', toString(1),
'updated_at', toString(created_at),
'refs', toString(ref),
'additions', toString(additions),
'deletions', toString(deletions),
'changed_files', toString(changed_files)
-- 'users', toString(count(DISTINCT actor_login))
)) as person_properties
FROM github_events2
-- clickhouse-client --query "SELECT * FROM events_view FORMAT JSONEachRow" | clickhouse-client --query "INSERT INTO events FORMAT JSONEachRow"
-- clickhouse-client --query "SELECT * FROM events_view1 FORMAT JSONEachRow" --progress | clickhouse-client --query "INSERT INTO events FORMAT JSONEachRow SETTINGS max_partitions_per_insert_block=100000"
-- clickhouse-client --query "SELECT * FROM events_view2 FORMAT JSONEachRow" --progress | clickhouse-client --query "INSERT INTO events FORMAT JSONEachRow SETTINGS max_partitions_per_insert_block=100000"
-- clickhouse-client --query "SELECT * FROM events_view WHERE event = 'PushEvent' FORMAT JSONEachRow" --progress | clickhouse-client --query "INSERT INTO events FORMAT JSONEachRow SETTINGS max_partitions_per_insert_block=100000"
-- clickhouse-client --query "SELECT * FROM events_view WHERE event != 'PushEvent' FORMAT JSONEachRow" --progress | clickhouse-client --query "INSERT INTO events FORMAT JSONEachRow SETTINGS max_partitions_per_insert_block=100000"
-- clickhouse-client --query "SELECT * FROM events_view1 LIMIT 500000000 OFFSET 0 FORMAT JSONEachRow" --progress | clickhouse-client --query "INSERT INTO events FORMAT JSONEachRow SETTINGS max_partitions_per_insert_block=100000"
-- clickhouse-client --query "SELECT * FROM events_view1 LIMIT 500000000 OFFSET 500000000 FORMAT JSONEachRow" --progress | clickhouse-client --query "INSERT INTO events FORMAT JSONEachRow SETTINGS max_partitions_per_insert_block=100000"
-- clickhouse-client --query "SELECT * FROM events_view1 LIMIT 500000000 OFFSET 1000000000 FORMAT JSONEachRow" --progress | clickhouse-client --query "INSERT INTO events FORMAT JSONEachRow SETTINGS max_partitions_per_insert_block=100000"
-- z
-- clickhouse-client --query "SELECT * FROM events_view2 LIMIT 500000000 OFFSET 0 FORMAT JSONEachRow" --progress | clickhouse-client --query "INSERT INTO events FORMAT JSONEachRow SETTINGS max_partitions_per_insert_block=100000"
-- Add materialized property-extraction columns (mat_* from properties,
-- pmat_* from person_properties), then MATERIALIZE COLUMN to backfill them
-- for existing parts on events, person and events_old.
ALTER TABLE events ADD COLUMN mat_file_name String Materialized trim(BOTH '\"' FROM JSONExtractRaw(properties, 'file_name'))
ALTER TABLE events ADD COLUMN pmat_name String Materialized trim(BOTH '\"' FROM JSONExtractRaw(person_properties, 'name'))
ALTER TABLE events ADD COLUMN pmat_creator_user_login String Materialized trim(BOTH '\"' FROM JSONExtractRaw(person_properties, 'creator_user_login'))
ALTER TABLE events MATERIALIZE COLUMN mat_file_name
ALTER TABLE events MATERIALIZE COLUMN pmat_name
ALTER TABLE events MATERIALIZE COLUMN pmat_creator_user_login
ALTER TABLE person ADD COLUMN pmat_name String Materialized trim(BOTH '\"' FROM JSONExtractRaw(properties, 'name'))
ALTER TABLE person MATERIALIZE COLUMN pmat_name
ALTER TABLE events_old ADD COLUMN mat_file_name String Materialized trim(BOTH '\"' FROM JSONExtractRaw(properties, 'file_name'))
ALTER TABLE events_old ADD COLUMN pmat_name String Materialized trim(BOTH '\"' FROM JSONExtractRaw(person_properties, 'name'))
ALTER TABLE events_old ADD COLUMN pmat_creator_user_login String Materialized trim(BOTH '\"' FROM JSONExtractRaw(person_properties, 'creator_user_login'))
ALTER TABLE events_old MATERIALIZE COLUMN mat_file_name
ALTER TABLE events_old MATERIALIZE COLUMN pmat_name
ALTER TABLE events_old MATERIALIZE COLUMN pmat_creator_user_login
# EC2 benchmark box setup: i3en.3xlarge, ~1.36 USD per hour.
ssh ec2-user@35.175.197.89 -i ~/Downloads/karl.pem
# Mount data dir https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/add-instance-store-volumes.html
sudo mkfs -t xfs /dev/nvme1n1
sudo mkdir /data
sudo mount /dev/nvme1n1 /data
sudo chown ec2-user:ec2-user /data
mkdir -p /data/default
# Install clickhouse
sudo yum install yum-utils
sudo rpm --import https://repo.clickhouse.com/CLICKHOUSE-KEY.GPG
sudo yum-config-manager --add-repo https://repo.clickhouse.com/rpm/stable/x86_64
sudo yum install clickhouse-server clickhouse-client asciidoc libarchive-devel xz-devel git autoconf automake gcc
# Install pixz (parallel xz decompressor used to stream the dataset below).
# Build as the regular user; only the install step needs root.
git clone https://github.com/vasi/pixz
cd pixz
./configure --prefix=/usr/local
make
sudo make install
# Edit data dir path to /data/default
sudo vim /etc/clickhouse-server/config.xml
sudo service clickhouse-server start
# Follow https://ghe.clickhouse.tech/#download-the-dataset-directly-into-clickhouse
cd /data
wget https://datasets.clickhouse.tech/github_events_v2.native.xz
pixz -d < github_events_v2.native.xz | clickhouse-client --query "INSERT INTO github_events FORMAT Native" --progress
clickhouse-client --query "SELECT * FROM events_view FORMAT JSONEachRow" --progress | clickhouse-client --query "INSERT INTO events FORMAT JSONEachRow SETTINGS max_partitions_per_insert_block=100000"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment