Skip to content

Instantly share code, notes, and snippets.

@dbushong
Created June 30, 2020 01:56
Show Gist options
  • Select an option

  • Save dbushong/f7dbec959c7158efc915562066dddac1 to your computer and use it in GitHub Desktop.

Select an option

Save dbushong/f7dbec959c7158efc915562066dddac1 to your computer and use it in GitHub Desktop.
require "option_parser"
require "http/params"
# Field groups, keyed by how the main loop converts each regex capture.
# Captures that may hold a literal "-" placeholder.
DASH_FIELDS = ["ident", "user", "referrer", "ua", "bytes"]
# Captures converted to integers.
INT_FIELDS = ["day", "year", "hour", "min", "sec", "code", "tzhrs", "tzmins"]
# Captures copied as strings (may be missing when the request is "-").
STR_FIELDS = ["ip", "method", "path", "http"]

# Default column list, in output order, as a comma-separated string.
def_fields = %w(
  ip ident user year mon day hour min sec method path query
  http code bytes referrer ua
).join(",")

# Run-time settings; the command-line option handlers overwrite these.
fields_str = def_fields
output = STDOUT
quiet = false
print_header = false
# Pattern for one access-log line, with a named capture per field.
#
# Written in extended (x) mode: whitespace inside the literal is ignored, so
# every literal space is spelled `[ ]`. The pattern is anchored at the start
# of the line only.
#
# In order it matches:
#   ip        dotted-quad IPv4 address, or the literal `::1`
#   ident, user   non-space tokens
#   [day/mon/year:hour:min:sec tzsign tzhrs tzmins]   bracketed timestamp
#   "method path[?query] HTTP/http"   quoted request line, or a bare "-"
#             (in which case method/path/query/http do not participate)
#   code      numeric status
#   bytes     digits or "-"
#   "referrer" "ua"   quoted strings containing no double quote
#
# NOTE: `mon` is matched by three character classes, so it accepts any
# three-letter combination drawn from them, not only real month names.
APACHE_RE = %r{^
(?<ip>\d+\.\d+\.\d+\.\d+|::1) [ ]
(?<ident>\S+) [ ]
(?<user>\S+) [ ]
\[
(?<day>\d\d) / (?<mon>[JFMASOND][aepuco][nbrylgptvc]) / (?<year>\d{4})
: (?<hour>\d\d) : (?<min>\d\d) : (?<sec>\d\d)
[ ] (?<tzsign>[+-]) (?<tzhrs>\d\d) (?<tzmins>\d\d)
\] [ ]
" (?:
(?<method>[A-Z]+) [ ]
(?<path>[^?"]+) (?: \? (?<query>[^"]*) )? [ ]
HTTP/(?<http>[\d.]+)
|
-
)
" [ ]
(?<code>\d+) [ ]
(?<bytes>\d+ | -) [ ]
" (?<referrer>[^"]*) " [ ]
" (?<ua>[^"]*) "
}x
# Three-letter month abbreviations in calendar order, and a lookup table
# from abbreviation to 1-based month number ("Jan" => 1 ... "Dec" => 12).
MONS = %w(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)
MON = Hash(String, Int32).new
MONS.each_with_index(1) do |abbrev, number|
  MON[abbrev] = number
end
# Write `p` (the usage text; anything `puts` accepts) to standard output
# and terminate the program with exit status 1.
def usage(p)
  STDOUT.puts p
  exit(1)
end
# Parse command-line flags into the top-level settings (quiet, print_header,
# output, fields_str). The configured parser is kept in `option_parser` so
# later validation code can print the usage text.
option_parser = OptionParser.parse do |parser|
  parser.banner = "Usage: apache2tsv [-q] [-d] [-f field[,field[,...]]] [-o outfile] [file [...]]"
  parser.on("-q", "--quiet", "no warnings about unparseable lines") { quiet = true }
  parser.on("-d", "--header", "output header as first line") { print_header = true }
  parser.on("-o FILE", "--output=FILE", "write to FILE (instead of stdout)") do |f|
    output = File.new(f, "w")
  end
  parser.on("-f FIELDS", "--fields=FIELDS", "specify fields (and order) to emit; default is all real fields, or:\n" +
                                            " #{def_fields}\n" +
                                            " additionally available are dynamic fields, including:\n" +
                                            " ts: unix seconds-since-epoch of timestamp\n" +
                                            " query:<field name>: a single field of the query string\n" +
                                            " query:*: all query string args as key1\\tval1\\tkey2...\n" +
                                            " note: this can be specified only once, and must be last") { |f| fields_str = f }
  parser.on("-h", "--help", "Show this help") { usage parser }
  # Without these handlers an unknown flag or a flag missing its argument
  # raises an unhandled exception; report the problem with the usage instead.
  parser.invalid_option do |flag|
    STDERR.puts "ERROR: #{flag} is not a valid option."
    STDERR.puts parser
    exit 1
  end
  parser.missing_option do |flag|
    STDERR.puts "ERROR: #{flag} requires a value."
    STDERR.puts parser
    exit 1
  end
end
# Turn the comma-separated field spec into:
#   fields      - the fixed output columns, in order
#   query_star  - true when the spec ended with the virtual "query:*" field
#   split_query - the names requested through "query:<name>" fields
query_star = false
split_query = [] of String
fields = fields_str.split(',')

# "query:*" may appear only once and only as the last field. Validate and
# strip it before iterating, so the array is never modified mid-iteration.
star_at = fields.index("query:*")
if star_at
  usage(option_parser) if star_at < fields.size - 1
  query_star = true
  fields.pop
end

fields.each do |f|
  if md = f.match(/^query:(.+)/)
    split_query << md[1]
  end
end
# Emit the header row when requested. The virtual "query:*" field was removed
# from `fields` during parsing, so it is appended again for display only.
if print_header
  header_fields = query_star ? fields + ["query:*"] : fields
  output.puts header_fields.join('\t')
end
# Convert every input line (from the files named on the command line, or
# stdin) into one tab-separated output row. Lines that cannot be parsed are
# reported on STDERR (unless quiet) and skipped.
ARGF.each_line do |line|
  match = line.match(APACHE_RE)
  unless match
    STDERR.puts "bad line: #{line}" unless quiet
    next
  end

  # APACHE_RE accepts month-like tokens that are not real months (e.g. "Jab");
  # treat those as unparseable instead of raising KeyError.
  mon = MON[match["mon"]]?
  unless mon
    STDERR.puts "bad line: #{line}" unless quiet
    next
  end

  row = {} of String => (String | Int32 | Int64)

  # basic string fields that might have '-'
  DASH_FIELDS.each do |f|
    val = match[f]
    row[f] = val == "-" ? "" : val
  end
  # most int fields
  INT_FIELDS.each { |f| row[f] = match[f].to_i }
  # remaining fields (absent when the request is a bare "-")
  STR_FIELDS.each { |f| row[f] = match[f]? || "" }
  # raw query string; previously never stored, so the "query" column was
  # always empty
  row["query"] = match["query"]? || ""
  # bytes: numeric, with "-" meaning zero (overrides the DASH_FIELDS value)
  bytes = match["bytes"]
  row["bytes"] = bytes == "-" ? 0_i64 : bytes.to_i64
  row["mon"] = mon

  # ts: unix seconds of the timestamp, honouring the logged zone offset
  off_secs = match["tzhrs"].to_i * 3600 + match["tzmins"].to_i * 60
  off_secs = -off_secs if match["tzsign"] == "-"
  begin
    row["ts"] = Time.local(
      match["year"].to_i, mon, match["day"].to_i,
      match["hour"].to_i, match["min"].to_i, match["sec"].to_i,
      location: Time::Location.fixed(off_secs)
    ).to_unix
  rescue ArgumentError
    # impossible calendar date/time (e.g. day 31 of a 30-day month)
    STDERR.puts "bad line: #{line}" unless quiet
    next
  end

  # copy for additions from query:*
  row_fields = fields.dup

  # split out query args if requested
  query = match["query"]?
  if query && (query_star || !split_query.empty?)
    q = HTTP::Params.parse(query)
    unless q.empty?
      wanted = split_query.dup
      if query_star
        q.to_h.keys.sort.each do |k|
          row_fields << "query-name:#{k}" << "query:#{k}"
          row["query-name:#{k}"] = k
          wanted << k
        end
      end
      # q[k] raises KeyError when this line lacks a requested parameter;
      # emit an empty column instead
      wanted.each { |k| row["query:#{k}"] = q[k]? || "" }
    end
  end

  # write the row; tabs inside values would corrupt the TSV, so they are
  # replaced by spaces (warning once per line)
  cleaned = false
  last_i = row_fields.size - 1
  row_fields.each_with_index do |f, i|
    val = row[f]?.to_s # unknown/unset fields become ""
    if val.includes?('\t')
      STDERR.puts "removing \\t from output" unless cleaned
      cleaned = true
      val = val.gsub('\t', ' ')
    end
    output << val << (i < last_i ? '\t' : '\n')
  end
end
output.close
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment