Last active
June 30, 2021 02:44
-
-
Save mikeda/6326042 to your computer and use it in GitHub Desktop.
アクセスログ集計スクリプトのとりあえず版
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/ruby | |
# アクセスログを1時間ごとに集計。アクセスタイプごとのアクセス数をTSV形式で出力する | |
# usage : analyze_access_log.rb <YYYYMMDD> <access_log>... | |
require 'json' | |
require 'time' | |
require 'pp' | |
# Date to aggregate, taken from the first command-line argument (YYYYMMDD).
# It is removed from ARGV here, so only the log file names remain in ARGV.
ANALYZE_DATE = ARGV.shift

# Pattern for one access-log line. The pieces are concatenated unchanged, so
# the resulting regexp source is exactly the original single-string pattern.
LOG_REGEXP = Regexp.new(
  [
    '^(?<host>[^ ]*) [^ ]* (?<user>[^ ]*) ',                     # host, one skipped field, user
    '\[(?<time>[^\]]*)\] ',                                      # [timestamp]
    '"(?<method>\S+) +(?<path>[^ ]+)( +(?<protocol>\S*))?" ',    # "METHOD path [protocol]"
    '(?<status>[^ ]*) (?<size>[^ ]*) ',                          # status and size
    '"(?<referer>[^\"]*)" ',                                     # "referer"
    '"(?<agent>.*)"',                                            # "user agent"
    '( (?<response_time>[^ ]*))?$'                               # optional trailing response time
  ].join
)

# Access types and the path pattern that selects each one. Insertion order is
# significant when the hash is scanned in order and the first match is taken.
# Frozen so the classification table cannot be modified by accident.
COUNT_PATTERNS = {
  bcon: %r{^/bcon/},
  reco: %r{^/recommend/},
  review: %r{^/review/},
  admin: %r{^/admin/},
  static: /\.(js|css|swf|xml|jpe?g|gif|ico|txt)$/
}.freeze
# Parses one access-log line.
#
# line - a String holding one log line; a trailing newline is ignored. The
#        argument itself is never modified, so frozen strings are accepted.
#
# Returns a Hash keyed by the LOG_REGEXP capture names (as Strings), where
# 'time' is replaced by a Time, 'path' has its query string removed and the
# query string (or nil) is stored under 'query'.
# Returns nil, after reporting on STDERR, when the line does not match
# LOG_REGEXP or its timestamp cannot be parsed. Returns nil silently when the
# entry's date differs from ANALYZE_DATE.
def parse(line)
  # Work on a copy: chomp! would alter the caller's string and raise on a
  # frozen one.
  line = line.chomp

  m = LOG_REGEXP.match(line)
  unless m
    STDERR.puts "not match: #{line}"
    return
  end

  field = {}
  LOG_REGEXP.names.each { |name| field[name] = m[name] }

  begin
    field['time'] = Time.strptime(field['time'], '%d/%b/%Y:%H:%M:%S %z')
  rescue ArgumentError
    # A single malformed timestamp should skip the line, not abort the run.
    STDERR.puts "invalid time: #{line}"
    return
  end

  # Drop the few entries from other days that get mixed into the log.
  return unless field['time'].strftime('%Y%m%d') == ANALYZE_DATE

  field['path'], field['query'] = field['path'].split('?', 2)
  field
end
# Tally requests per hour and per access type, then print one TSV row per hour.
#
# Lines are read from ARGF. Lines for which parse returns nil are skipped.
# counts maps an hour key ("YYYYMMDD_HH") to a Hash of access type => count,
# with missing types defaulting to 0.
counts = {}
ARGF.each_line do |line|
  field = parse(line)
  next unless field

  # Only 200 OK and 304 Not Modified are classified by path; any other status
  # is counted as :error. The first pattern in COUNT_PATTERNS that matches the
  # path decides the type; a path matching none is counted as :unknown.
  match_type =
    if ['200', '304'].include?(field['status'])
      matched = COUNT_PATTERNS.find { |_type, regexp| regexp.match(field['path']) }
      matched ? matched.first : :unknown
    else
      :error
    end

  hour = field['time'].strftime('%Y%m%d_%H')
  counts[hour] ||= Hash.new(0)
  counts[hour][match_type] += 1
end

# Emit rows in hour order. Insertion order is not reliable here: when the
# input is several logs concatenated, an hour first seen in a later log would
# otherwise be printed after the later hours of the earlier logs.
#
# NOTE(review): the :cm column has no corresponding classification in this
# loop unless COUNT_PATTERNS defines it, and tallies for any other types in
# COUNT_PATTERNS are not printed. Columns are left as they were; confirm this
# is intended.
counts.sort_by { |hour_key, _tally| hour_key }.each do |time, count|
  puts [time, *[:bcon, :reco, :cm, :unknown, :error].map { |k| count[k] }].join("\t")
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/ruby | |
# 指定したサービスのアクセスログを集計する | |
# usage : analyze_service.rb <service> | |
# input: | |
# 以下のようにアクセスログが配置されてることが前提 | |
# access_log/<service>/<host>/*.access_log.<YYYYMMDD>.gz | |
# output: | |
# 以下のファイルに出力 | |
# 集計結果:output/<service>/<service>_<YYYYMMDD>.tsv | |
# エラーログ:output/<service>/<service>_<YYYYMMDD>.log | |
require 'fileutils'
require 'find'
require 'json'
require 'pp'
require 'shellwords'
require 'time'

require 'parallel'
# Aggregates every dated access log of one service, one day per parallel job.
#
# The service name is the first command-line argument; without it the script
# prints the usage line on STDERR and exits with status 1.
service = ARGV.shift or abort('usage: analyze_service.rb <service>')

base_dir       = '/home/mikeda/analyze_access_log'
access_log_dir = "#{base_dir}/access_log/#{service}"
output_dir     = "#{base_dir}/output/#{service}"
analyzer       = "#{base_dir}/analyze_access_log.rb"

# Group the log files by the date embedded in their names
# (anything ending in "log.<digits>.gz").
file_map = {}
Find.find(access_log_dir) do |file|
  m = /log\.(\d+)\.gz$/.match(file)
  next unless m

  (file_map[m[1]] ||= []) << file
end

# mkdir_p tolerates an existing directory (re-runs) and creates missing parents.
FileUtils.mkdir_p(output_dir)

# Aggregate the access logs one day at a time; each day is one Parallel job.
# (The original note says the jobs run in parallel, one per CPU core.)
Parallel.map(file_map) do |time, files|
  output_file = "#{output_dir}/#{service}_#{time}.tsv"
  log_file    = "#{output_dir}/#{service}_#{time}.log"

  # Every path is shell-escaped so names containing spaces or shell
  # metacharacters cannot break or alter the command. time is digits only.
  command = "zcat #{Shellwords.join(files)} | #{Shellwords.escape(analyzer)} #{time} " \
            "> #{Shellwords.escape(output_file)} 2> #{Shellwords.escape(log_file)}"

  puts "start #{service}:#{time}"
  succeeded = system(command)
  # Make a failed job visible instead of discarding the result of system.
  STDERR.puts "failed #{service}:#{time}" unless succeeded
  puts "end #{service}:#{time}"
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment