Created
June 30, 2020 01:56
-
-
Save dbushong/f7dbec959c7158efc915562066dddac1 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| require "option_parser" | |
| require "http/params" | |
# Comma-separated list of every field captured directly from a log line,
# in the order they are emitted when -f is not given.
def_fields = [
  "ip,ident,user",
  "year,mon,day,hour,min,sec",
  "method,path,query",
  "http,code,bytes,referrer,ua",
].join(',')

# Run-time settings; the option parser below may overwrite each of these.
fields_str = def_fields
quiet = false
print_header = false
output = STDOUT

# Captures in which a lone "-" stands for "no value".
DASH_FIELDS = ["ident", "user", "referrer", "ua", "bytes"]
# Captures that are converted to integers.
INT_FIELDS = ["day", "year", "hour", "min", "sec", "code", "tzhrs", "tzmins"]
# Captures copied as plain strings (empty when the group did not match).
STR_FIELDS = ["ip", "method", "path", "http"]
# Matches one access-log line, anchored at its start, in this layout:
# client ip, ident, user, a bracketed timestamp (dd/Mon/yyyy:hh:mm:ss with a
# signed four-digit zone offset), the quoted request (or a bare "-"), status
# code, byte count (or "-"), quoted referrer and quoted user agent.
#
# The pattern is compiled with the x flag, so whitespace inside it is
# ignored; each literal space in the input is therefore written as `[ ]`.
#
# Things to keep in mind when reading the captures:
# * `ip` accepts only dotted-quad IPv4 addresses or the literal `::1`.
# * `mon` is matched by three character classes, so it also accepts letter
#   combinations that are not real month abbreviations.
# * `method`, `path`, `query` and `http` stay unmatched when the request is
#   "-"; `query` is also unmatched when the path carries no "?".
APACHE_RE = %r{^
  (?<ip>\d+\.\d+\.\d+\.\d+|::1) [ ]
  (?<ident>\S+) [ ]
  (?<user>\S+) [ ]
  \[
  (?<day>\d\d) / (?<mon>[JFMASOND][aepuco][nbrylgptvc]) / (?<year>\d{4})
  : (?<hour>\d\d) : (?<min>\d\d) : (?<sec>\d\d)
  [ ] (?<tzsign>[+-]) (?<tzhrs>\d\d) (?<tzmins>\d\d)
  \] [ ]
  " (?:
  (?<method>[A-Z]+) [ ]
  (?<path>[^?"]+) (?: \? (?<query>[^"]*) )? [ ]
  HTTP/(?<http>[\d.]+)
  |
  -
  )
  " [ ]
  (?<code>\d+) [ ]
  (?<bytes>\d+ | -) [ ]
  " (?<referrer>[^"]*) " [ ]
  " (?<ua>[^"]*) "
}x
# Three-letter month abbreviations in calendar order.
MONS = %w(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)
# Month abbreviation -> month number (1-based), derived from the position in MONS.
MON = MONS.map_with_index { |name, idx| {name, idx + 1} }.to_h
# Writes `p` to standard output and terminates the process with exit status 1.
# Callers in this script pass the option parser, so its string form is what gets printed.
def usage(p)
  STDOUT.puts(p)
  exit(1)
end
# Parse the command line. Each handler overwrites one of the settings
# declared above; the parser itself is kept so that usage() can print it later.
option_parser = OptionParser.parse do |parser|
  # Help text for -f, assembled up front because it embeds the default field list.
  fields_help = "specify fields (and order) to emit; default is all real fields, or:\n" +
                " #{def_fields}\n" +
                " additionally available are dynamic fields, including:\n" +
                " ts: unix seconds-since-epoch of timestamp\n" +
                " query:<field name>: a single field of the query string\n" +
                " query:*: all query string args as key1\\tval1\\tkey2...\n" +
                " note: this can be specified only once, and must be last"

  parser.banner = "Usage: apache2tsv [-q] [-d] [-f field[,field[,...]]] [-o outfile] [file [...]]"

  parser.on("-q", "--quiet", "no warnings about unparseable lines") do
    quiet = true
  end

  parser.on("-d", "--header", "output header as first line") do
    print_header = true
  end

  parser.on("-o FILE", "--output=FILE", "write to FILE (instead of stdout)") do |path|
    output = File.new(path, "w")
  end

  parser.on("-f FIELDS", "--fields=FIELDS", fields_help) do |list|
    fields_str = list
  end

  parser.on("-h", "--help", "Show this help") do
    usage(parser)
  end
end
# Turn the requested field list into:
#   fields      - the concrete columns to emit for every row
#   split_query - names of individual query-string args requested as query:<name>
#   query_star  - whether the virtual trailing query:* column was requested
split_query = [] of String
fields = fields_str.split(',')

# query:* is only accepted as the very last field; checking the first
# occurrence is enough, because a second one would make the first non-final.
star_at = fields.index("query:*")
query_star = !star_at.nil?
if star_at
  usage(option_parser) if star_at != fields.size - 1
  # drop the virtual field; its columns are appended per row instead
  fields.pop
end

fields.each do |name|
  if md = /^query:(.+)/.match(name)
    split_query << md[1]
  end
end

# Emit the header line if requested, putting the virtual query:* column back at the end.
if print_header
  header = fields.dup
  header << "query:*" if query_star
  output.puts header.join('\t')
end
# Convert every input line (from the files named on the command line, or
# stdin) into one tab-separated output row, then close the output.
#
# Lines that do not match APACHE_RE, or whose month is not a known
# abbreviation, are reported on stderr (unless quiet) and skipped.
ARGF.each_line do |line|
  match = line.match(APACHE_RE)
  unless match
    STDERR.puts "bad line: #{line}" unless quiet
    next
  end

  # The month pattern in APACHE_RE is looser than the set of real
  # abbreviations, so look the month up without raising and treat an
  # unknown one like any other unparseable line instead of aborting the run.
  mon = MON[match["mon"]]?
  unless mon
    STDERR.puts "bad line: #{line}" unless quiet
    next
  end

  row = {} of String => (String | Int32 | Int64)

  # basic string fields that might have '-'
  DASH_FIELDS.each do |f|
    val = match[f]
    row[f] = val == "-" ? "" : val
  end

  # most int fields
  INT_FIELDS.each { |f| row[f] = match[f].to_i }

  # remaining fields; these groups are unmatched when the request is "-"
  STR_FIELDS.each { |f| row[f] = match[f]? || "" }

  # bytes: overrides the DASH_FIELDS value so that "-" becomes 0
  bytes = match["bytes"]
  row["bytes"] = bytes == "-" ? 0_i64 : bytes.to_i64

  # ts: unix seconds, computed in a fixed zone built from the logged offset
  row["mon"] = mon
  off_secs = row["tzhrs"].to_i * 3600 + row["tzmins"].to_i * 60
  off_secs = -off_secs if match["tzsign"] == "-"
  loc = Time::Location.fixed(off_secs)
  row["ts"] = Time.local(
    row["year"].as(Int32), row["mon"].as(Int32), row["day"].as(Int32),
    row["hour"].as(Int32), row["min"].as(Int32), row["sec"].as(Int32),
    location: loc
  ).to_unix

  # copy for additions from query:*
  row_fields = fields.dup

  # split out query args if requested
  query = match["query"]?
  if query && (query_star || !split_query.empty?)
    q = HTTP::Params.parse(query)
    unless q.empty?
      row_split_query = split_query.dup
      if query_star
        # append a name/value column pair for every arg, sorted by name
        q.to_h.keys.sort.each do |k|
          row_fields << "query-name:#{k}" << "query:#{k}"
          row["query-name:#{k}"] = k
          row_split_query << k
        end
      end
      row_split_query.each do |k|
        # q[k] raises KeyError when the arg is absent from this line's query
        # string; a missing arg must simply yield an empty cell
        if value = q[k]?
          row["query:#{k}"] = value
        end
      end
    end
  end

  # write the row; tabs inside values would break the format, so they are
  # replaced by spaces, with at most one warning per row
  cleaned = false
  last_i = row_fields.size - 1
  row_fields.each_with_index do |f, i|
    # fields with no value for this row (nil) stringify to ""
    val = row[f]?.to_s
    if val.includes?('\t')
      STDERR.puts "removing \\t from output" unless cleaned
      cleaned = true
      val = val.gsub('\t', ' ')
    end
    output << val << (i < last_i ? '\t' : '\n')
  end
end

output.close
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment