The following provides the details required to reproduce the demo given on the Real-time SQL analytics at scale: A story of open-source GitHub activity using ClickHouse + Grafana webinar.
This provides a Grafana dashboard showing an analysis of the history of the Grafana repository, including common commiters, most popular issues, largest commits and files with the longest history.
- All steps require a ClickHouse and Grafana instance. Examples are most easily reproduced with ClickHouse Cloud and Grafana Cloud.
- Some steps require a local instance of ClickHouse e.g. if you wish to regenerate more recent versions of data files.
CREATE TABLE github_events
(
`file_time` DateTime,
`event_type` Enum8('CommitCommentEvent' = 1, 'CreateEvent' = 2, 'DeleteEvent' = 3, 'ForkEvent' = 4, 'GollumEvent' = 5, 'IssueCommentEvent' = 6, 'IssuesEvent' = 7, 'MemberEvent' = 8, 'PublicEvent' = 9, 'PullRequestEvent' = 10, 'PullRequestReviewCommentEvent' = 11, 'PushEvent' = 12, 'ReleaseEvent' = 13, 'SponsorshipEvent' = 14, 'WatchEvent' = 15, 'GistEvent' = 16, 'FollowEvent' = 17, 'DownloadEvent' = 18, 'PullRequestReviewEvent' = 19, 'ForkApplyEvent' = 20, 'Event' = 21, 'TeamAddEvent' = 22),
`actor_login` LowCardinality(String),
`repo_name` LowCardinality(String),
`created_at` DateTime,
`updated_at` DateTime,
`action` Enum8('none' = 0, 'created' = 1, 'added' = 2, 'edited' = 3, 'deleted' = 4, 'opened' = 5, 'closed' = 6, 'reopened' = 7, 'assigned' = 8, 'unassigned' = 9, 'labeled' = 10, 'unlabeled' = 11, 'review_requested' = 12, 'review_request_removed' = 13, 'synchronize' = 14, 'started' = 15, 'published' = 16, 'update' = 17, 'create' = 18, 'fork' = 19, 'merged' = 20),
`comment_id` UInt64,
`body` String,
`path` String,
`position` Int32,
`line` Int32,
`ref` LowCardinality(String),
`ref_type` Enum8('none' = 0, 'branch' = 1, 'tag' = 2, 'repository' = 3, 'unknown' = 4),
`creator_user_login` LowCardinality(String),
`number` UInt32,
`title` String,
`labels` Array(LowCardinality(String)),
`state` Enum8('none' = 0, 'open' = 1, 'closed' = 2),
`locked` UInt8,
`assignee` LowCardinality(String),
`assignees` Array(LowCardinality(String)),
`comments` UInt32,
`author_association` Enum8('NONE' = 0, 'CONTRIBUTOR' = 1, 'OWNER' = 2, 'COLLABORATOR' = 3, 'MEMBER' = 4, 'MANNEQUIN' = 5),
`closed_at` DateTime,
`merged_at` DateTime,
`merge_commit_sha` String,
`requested_reviewers` Array(LowCardinality(String)),
`requested_teams` Array(LowCardinality(String)),
`head_ref` LowCardinality(String),
`head_sha` String,
`base_ref` LowCardinality(String),
`base_sha` String,
`merged` UInt8,
`mergeable` UInt8,
`rebaseable` UInt8,
`mergeable_state` Enum8('unknown' = 0, 'dirty' = 1, 'clean' = 2, 'unstable' = 3, 'draft' = 4),
`merged_by` LowCardinality(String),
`review_comments` UInt32,
`maintainer_can_modify` UInt8,
`commits` UInt32,
`additions` UInt32,
`deletions` UInt32,
`changed_files` UInt32,
`diff_hunk` String,
`original_position` UInt32,
`commit_id` String,
`original_commit_id` String,
`push_size` UInt32,
`push_distinct_size` UInt32,
`member_login` LowCardinality(String),
`release_tag_name` String,
`release_name` String,
`review_state` Enum8('none' = 0, 'approved' = 1, 'changes_requested' = 2, 'commented' = 3, 'dismissed' = 4, 'pending' = 5)
)
ENGINE = MergeTree
ORDER BY (event_type, repo_name, created_at)
The following will populate the table with just the grafana and ClickHouse repository histories. This will allow you to load most visualizations but will mean you cannot perform a cohort analysis i.e. who stars Grafana, stars what other repositories (Affinity by issues and PRs)? This dataset is also only up-to-date as of 8/2/2023.
INSERT INTO github_events SELECT * FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/github/grafana/grafana_github_events.csv.gz', 'CSVWithNames')
Load the entire dataset. Full instructions here. Remember this is 5.5billion events as of Feb 2023 and increasing.
CREATE TABLE commits
(
`hash` String,
`author` LowCardinality(String),
`time` DateTime,
`message` String,
`files_added` UInt32,
`files_deleted` UInt32,
`files_renamed` UInt32,
`files_modified` UInt32,
`lines_added` UInt32,
`lines_deleted` UInt32,
`hunks_added` UInt32,
`hunks_removed` UInt32,
`hunks_changed` UInt32
)
ENGINE = MergeTree
ORDER BY time
CREATE TABLE file_changes
(
`change_type` Enum8('Add' = 1, 'Delete' = 2, 'Modify' = 3, 'Rename' = 4, 'Copy' = 5, 'Type' = 6),
`path` LowCardinality(String),
`old_path` LowCardinality(String),
`file_extension` LowCardinality(String),
`lines_added` UInt32,
`lines_deleted` UInt32,
`hunks_added` UInt32,
`hunks_removed` UInt32,
`hunks_changed` UInt32,
`commit_hash` String,
`author` LowCardinality(String),
`time` DateTime,
`commit_message` String,
`commit_files_added` UInt32,
`commit_files_deleted` UInt32,
`commit_files_renamed` UInt32,
`commit_files_modified` UInt32,
`commit_lines_added` UInt32,
`commit_lines_deleted` UInt32,
`commit_hunks_added` UInt32,
`commit_hunks_removed` UInt32,
`commit_hunks_changed` UInt32
)
ENGINE = MergeTree
ORDER BY time
CREATE TABLE line_changes
(
sign Int8,
line_number_old UInt32,
line_number_new UInt32,
hunk_num UInt32,
hunk_start_line_number_old UInt32,
hunk_start_line_number_new UInt32,
hunk_lines_added UInt32,
hunk_lines_deleted UInt32,
hunk_context LowCardinality(String),
line LowCardinality(String),
indent UInt8,
line_type Enum('Empty' = 0, 'Comment' = 1, 'Punct' = 2, 'Code' = 3),
prev_commit_hash String,
prev_author LowCardinality(String),
prev_time DateTime,
file_change_type Enum('Add' = 1, 'Delete' = 2, 'Modify' = 3, 'Rename' = 4, 'Copy' = 5, 'Type' = 6),
path LowCardinality(String),
old_path LowCardinality(String),
file_extension LowCardinality(String),
file_lines_added UInt32,
file_lines_deleted UInt32,
file_hunks_added UInt32,
file_hunks_removed UInt32,
file_hunks_changed UInt32,
commit_hash String,
author LowCardinality(String),
time DateTime,
commit_message String,
commit_files_added UInt32,
commit_files_deleted UInt32,
commit_files_renamed UInt32,
commit_files_modified UInt32,
commit_lines_added UInt32,
commit_lines_deleted UInt32,
commit_hunks_added UInt32,
commit_hunks_removed UInt32,
commit_hunks_changed UInt32
) ENGINE = MergeTree ORDER BY time
Utilise the prepared files accurate as of 9/02/2022.
INSERT INTO commits SELECT *
FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/github/grafana/commits.tsv.gz', 'TSV')
INSERT INTO file_changes SELECT *
FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/github/grafana/file_changes.tsv.gz', 'TSV')
INSERT INTO line_changes SELECT *
FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/github/grafana/line_changes.tsv.gz', 'TSV')
Generate and load the files yourself. Requires a local installation of ClickHouse.
git clone [email protected]:grafana/grafana.git
cd grafana
clickhouse git-import
~/clickhouse client --host <host> --secure --port <port> --password <password --query "INSERT INTO line_changes FORMAT TSV" < line_changes.tsv
~/clickhouse client --host <host> --secure --port <port> --password <password --query "INSERT INTO file_changes FORMAT TSV" < file_changes.tsv
~/clickhouse client --host <host> --secure --port <port> --password <password --query "INSERT INTO commits FORMAT TSV" < commits.tsv
CREATE TABLE stopwords
(
`token` LowCardinality(String)
)
ENGINE = MergeTree
ORDER BY token
INSERT INTO stopwords SELECT * FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/github/grafana/stop_words.txt', 'CSV')
As shown in the webinar recording configure Grafana to use your ClickHouse instance by adding a datasource.
Dashboard is published here through id 18065
.
Loading instructions can be found here