Skip to content

Instantly share code, notes, and snippets.

@felix021
Created November 27, 2024 12:27
Show Gist options
  • Save felix021/b04cc2f7990fa85943819f3570d93bdd to your computer and use it in GitHub Desktop.
Save felix021/b04cc2f7990fa85943819f3570d93bdd to your computer and use it in GitHub Desktop.
ClickHouse query_log pattern analyzer
#!/usr/bin/python3
import sys
import re
try:
import pandas as pd
except:
print("Please install pandas using 'pip3 install --user pandas'")
exit(1)
try:
import sqlparse
from sqlparse.sql import Token, TokenList
except:
print("Please install sqlparse using 'pip3 install --user sqlparse'")
exit(1)
total = 0
current = 0
def replace_constants_with_placeholder(sql):
parsed = sqlparse.parse(sql)
statement = parsed[0]
def replace_tokens(token_list):
for token in token_list.tokens:
if token.ttype in (sqlparse.tokens.Literal.Number.Integer,
sqlparse.tokens.Literal.Number.Float,
sqlparse.tokens.Literal.String.Single):
token.value = '?'
elif isinstance(token, TokenList):
replace_tokens(token)
replace_tokens(statement)
return str(statement)
def parse_query(query):
global total, current
current += 1
if current % 10 == 0:
print("\r%10d/%10d" % (current, total), end='', file=sys.stderr)
# unescape backslash encoded string
query = query.encode('latin-1', 'backslashreplace').decode('unicode-escape')
# replace constant tokens with ?
query = replace_constants_with_placeholder(query)
# Replace `IN (x, y, z)` with `IN (?)`
query = re.sub(r'\bin\s*\((\s|,|\?)*\)', 'IN (?)', query, flags=re.IGNORECASE)
return query.replace('\n', '\\n').replace('\t', '\\t')
def aggregate_queries(file_path):
global total
# Read the tab-separated file
df = pd.read_csv(file_path, sep='\t')
total = len(df.index)
print("total queries: %d" % (total), file=sys.stderr)
# Parse queries to get query patterns
df['query_pattern'] = df['query'].apply(parse_query)
print("\nall parsed", file=sys.stderr)
# Aggregate duration_second, memory_usage, user_cpu and execution count by query pattern
aggregation = df.groupby('query_pattern').agg({
'duration_second': 'sum',
'memory_usage': 'sum',
'user_cpu': 'sum',
'query_pattern': 'count'
}).rename(columns={'query_pattern': 'execution_count'}).reset_index()
# Sort by the most resource-consuming queries
aggregation = aggregation.sort_values(by=['duration_second', 'memory_usage', 'user_cpu'], ascending=False)
return aggregation
if __name__ == "__main__":
if len(sys.argv) < 2:
print("Usage: %s <query_log>", sys.argv[0])
sys.exit(1)
file_path = sys.argv[1]
output_path = file_path + ".tsv"
result = aggregate_queries(file_path)
result.to_csv(output_path, index=False, sep="\t")
print(result)
print("\n[DONE] details in: " + output_path + "\n")
#!/bin/bash
cd `dirname $0`
file=query_log
HOST=127.0.0.1
USER=username
PASS=123456
begin=$1
end=$2
if [ -z "$begin" -o -z "$end" ]; then
echo "Usage: $0 <BEGIN_TIME> <END_TIME>"
exit 1
fi
echo "导出 $begin ~ $end 的查询..."
set -x
clickhouse client --host $HOST --user $USER --password $PASS --query "
select
query_start_time,
query_duration_ms / 1000 as duration_second,
memory_usage,
ProfileEvents.Values[indexOf(ProfileEvents.Names, 'UserTimeMicroseconds')] AS user_cpu,
read_rows,
read_bytes,
written_rows,
written_bytes,
query
from system.query_log
where query_start_time between '$begin' and '$end'
and type = 'QueryFinish'
-- and read_rows > 1000
order by query_start_time
FORMAT TabSeparatedWithNames;
" > $file
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment