Skip to content

Instantly share code, notes, and snippets.

@mikeda
Last active June 30, 2021 02:44
Show Gist options
  • Save mikeda/6326042 to your computer and use it in GitHub Desktop.
Save mikeda/6326042 to your computer and use it in GitHub Desktop.
アクセスログ集計スクリプトのとりあえず版
#!/usr/local/bin/ruby
# アクセスログを1時間ごとに集計。アクセスタイプごとのアクセス数をTSV形式で出力する
# usage : analyze_access_log.rb <YYYYMMDD> <access_log>...
require 'json'
require 'time'
require 'pp'
# Date to analyze (YYYYMMDD), taken from the first command-line argument.
ANALYZE_DATE = ARGV.shift

# Access-log line pattern, assembled from one fragment per group of fields.
# The trailing response-time field is optional.
LOG_REGEXP = Regexp.new(
  [
    '^(?<host>[^ ]*) [^ ]* (?<user>[^ ]*) ',                  # client host, ident, user
    '\[(?<time>[^\]]*)\] ',                                   # [timestamp]
    '"(?<method>\S+) +(?<path>[^ ]+)( +(?<protocol>\S*))?" ', # "request line"
    '(?<status>[^ ]*) (?<size>[^ ]*) ',                       # status code and size
    '"(?<referer>[^\"]*)" "(?<agent>.*)"',                    # "referer" "user agent"
    '( (?<response_time>[^ ]*))?$'                            # optional response time
  ].join
)

# Access types and the request-path pattern that selects each one.
# Patterns are tried in this order and the first match wins.
COUNT_PATTERNS = {
  bcon: %r{^/bcon/},
  reco: %r{^/recommend/},
  review: %r{^/review/},
  admin: %r{^/admin/},
  static: /\.(js|css|swf|xml|jpe?g|gif|ico|txt)$/
}
# Parse one access-log line into a hash of its fields.
#
# The hash has one string key per named group of LOG_REGEXP. 'time' is
# converted to a Time, 'path' is stripped of its query string, and the
# query string (or nil when there is none) is stored under 'query'.
#
# The caller's string is left untouched, so frozen input is accepted.
#
# @param line [String] a raw log line, with or without a trailing newline
# @return [Hash, nil] the parsed fields, or nil when the line does not match
#   LOG_REGEXP, has an unparsable timestamp, or is not dated ANALYZE_DATE
def parse(line)
  line = line.chomp
  m = LOG_REGEXP.match(line)
  unless m
    STDERR.puts "not match: #{line}"
    return
  end

  field = LOG_REGEXP.names.each_with_object({}) { |name, h| h[name] = m[name] }

  begin
    field['time'] = Time.strptime(field['time'], '%d/%b/%Y:%H:%M:%S %z')
  rescue ArgumentError
    # The regexp accepts any text between the brackets; report the bad
    # timestamp and skip this line rather than aborting the whole run.
    STDERR.puts "invalid time: #{line}"
    return
  end

  # Drop the few entries from neighbouring days that end up in a daily log.
  return unless field['time'].strftime('%Y%m%d') == ANALYZE_DATE

  field['path'], field['query'] = field['path'].split('?', 2)
  field
end
# Tally requests per hour and per access type, then print one TSV row per hour.
#
# Row layout: <YYYYMMDD_HH>, one count per COUNT_PATTERNS key (in definition
# order), then the :unknown and :error counts.
counts = Hash.new { |hash, hour| hash[hour] = Hash.new(0) }

ARGF.each_line do |line|
  field = parse(line)
  next unless field

  match_type =
    if %w[200 304].include?(field['status'])
      # Only 200 OK and 304 Not Modified are classified by path;
      # the first matching pattern wins.
      found = COUNT_PATTERNS.find { |_type, regexp| regexp.match(field['path']) }
      found ? found.first : :unknown
    else
      :error
    end

  hour = field['time'].strftime('%Y%m%d_%H')
  counts[hour][match_type] += 1
end

# Derive the columns from COUNT_PATTERNS so that every counted type is
# reported; a hand-written list can drift from the pattern table and silently
# drop counts (the previous list named :cm, which no pattern produces).
columns = COUNT_PATTERNS.keys + [:unknown, :error]
counts.each do |hour, count|
  puts [hour, *columns.map { |type| count[type] }].join("\t")
end
#!/usr/local/bin/ruby
# 指定したサービスのアクセスログを集計する
# usage : analyze_service.rb <service>
# input:
# 以下のようにアクセスログが配置されてることが前提
# access_log/<service>/<host>/*.access_log.<YYYYMMDD>.gz
# output:
# 以下のファイルに出力
# 集計結果:output/<service>/<service>_<YYYYMMDD>.tsv
# エラーログ:output/<service>/<service>_<YYYYMMDD>.log
require 'json'
require 'time'
require 'parallel'
require 'find'
require 'pp'
# Aggregate all access logs of one service, running one analyzer job per day.
require 'fileutils'
require 'shellwords'

service = ARGV.shift
abort('usage : analyze_service.rb <service>') unless service

base_dir = '/home/mikeda/analyze_access_log'
access_log_dir = "#{base_dir}/access_log/#{service}"
output_dir = "#{base_dir}/output/#{service}"
analyzer = "#{base_dir}/analyze_access_log.rb"

# Group the log files by the date embedded in their name (...log.<YYYYMMDD>.gz).
file_map = Hash.new { |hash, date| hash[date] = [] }
Find.find(access_log_dir) do |file|
  date = file[/log\.(\d+)\.gz$/, 1]
  next unless date
  file_map[date] << file
end

# mkdir_p tolerates an existing directory (re-runs) and creates missing parents.
FileUtils.mkdir_p(output_dir)

# Aggregate one day at a time; Parallel.map runs the days concurrently.
Parallel.map(file_map) do |time, files|
  output_file = "#{output_dir}/#{service}_#{time}.tsv"
  log_file = "#{output_dir}/#{service}_#{time}.log"
  # Escape every interpolated word so unusual file or service names cannot
  # break, or inject into, the shell pipeline.
  command = "zcat #{Shellwords.join(files)} | " \
            "#{Shellwords.escape(analyzer)} #{Shellwords.escape(time)} " \
            "> #{Shellwords.escape(output_file)} 2> #{Shellwords.escape(log_file)}"
  puts "start #{service}:#{time}"
  succeeded = system(command)
  STDERR.puts "failed #{service}:#{time} (see #{log_file})" unless succeeded
  puts "end #{service}:#{time}"
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment