Skip to content

Instantly share code, notes, and snippets.

@aerostitch
Last active January 22, 2018 21:53
Show Gist options
  • Select an option

  • Save aerostitch/bc9d879fa38a840c0003220d1a8c37fd to your computer and use it in GitHub Desktop.

Select an option

Save aerostitch/bc9d879fa38a840c0003220d1a8c37fd to your computer and use it in GitHub Desktop.
package main
// Parses classic ELB access logs and puts them inside a MySQL/MariaDB table
// Easy way to get a DB:
// docker run --name some-mariadb -e MYSQL_ROOT_PASSWORD=my-secret-pw -e MYSQL_DATABASE=accesslogs -p 3306:3306 -d mariadb:latest
//
// The bulk load your files:
// for f in ~/Downloads/*.txt ; do go run aws_elb_log_analyzer.go -db-create-table -db-host "tcp(172.17.0.2)" -db-name accesslogs -db-user root -db-pwd my-secret-pw -db-table bla -file-path $f; done
// Or:
// TBL=bla ; find /tmp/${TBL} -type f -name '*.log' -o -name '*.txt' | while read f; do echo "Processing $f"; go run aws_elb_log_analyzer.go -db-create-table -db-host "tcp(172.17.0.2)" -db-name accesslogs -db-user root -db-pwd my-secret-pw -db-table ${TBL} -file-path $f; done
//
// And do reports:
// - By day and IP
// select year, month, day, sourceIP, count(*) as nbrcalls from bla group by year, month, day, sourceIP order by nbrcalls;
// - By uri
// select SUBSTRING_INDEX(uri, '?', 1), count(*) as nbrcalls from bla group by SUBSTRING_INDEX(uri, '?', 1) order by nbrcalls;
// - By userAgent
// select SUBSTRING_INDEX(userAgent, ' (', 1), count(*) as nbrcalls from bla group by SUBSTRING_INDEX(userAgent, ' (', 1) order by nbrcalls;
// - A bit of filtering
// select year, month, day, hour, SUBSTRING_INDEX(userAgent, ' (', 1) as agent, SUBSTRING_INDEX(uri, '?', 1) as uri, count(*) as nbrcalls from bla where userAgent not like 'Pingdom%' and userAgent != 'ZmEu' group by year, month, day, hour, SUBSTRING_INDEX(userAgent, ' (', 1), SUBSTRING_INDEX(uri, '?', 1) order by year, month, day, hour, nbrcalls;
// Usage example in e a script:
/*
#!/bin/bash
if [ $# -ne 1 ]; then
echo "argument required"
exit 1
fi
TBL=$1
BUCKET=my-elb-logs
aws s3 cp --recursive --exclude "*" --include "*2018/*" s3://${BUCKET}/${TBL}/AWSLogs/ /tmp/${TBL}
find /tmp/${TBL} -type f -name '*.log' -o -name '*.txt' | while read f; do
echo "Processing $f"
go run aws_elb_log_analyzer.go -db-create-table -db-host "tcp(172.17.0.2)" -db-name accesslogs -db-user root -db-pwd my-secret-pw -db-table ${TBL} -file-path $f
done
mysql -h 172.17.0.2 -u root --password=my-secret-pw --database accesslogs -e "select CONCAT(year, '-', month, '-', day) as date, SUBSTRING_INDEX(userAgent, ' ', 1) as agent, SUBSTRING_INDEX(SUBSTRING_INDEX(REPLACE(uri,'//','/'), '?', 1), '/', 3) as shorturi, count(*) as nbrcalls from \`${TBL}\` where userAgent not like 'Pingdom%' and userAgent != 'ZmEu' group by year, month, day, SUBSTRING_INDEX(userAgent, ' ', 1), SUBSTRING_INDEX(SUBSTRING_INDEX(REPLACE(uri,'//','/'), '?', 1), '/', 3) order by year, month, day, nbrcalls" -B > /tmp/${TBL}_short.tsv
echo "Requests per day" > /tmp/${TBL}_summary.tsv
mysql -h 172.17.0.2 -u root --password=my-secret-pw --database accesslogs -e "select CONCAT(year, '-', month, '-', day) as date, count(*) as nbrcalls from \`${TBL}\` where userAgent not like 'Pingdom%' and userAgent != 'ZmEu' group by year, month, day order by year, month, day, nbrcalls" -B >> /tmp/${TBL}_summary.tsv
echo "" >> /tmp/${TBL}_summary.tsv
echo "Requests per method and scheme" >> /tmp/${TBL}_summary.tsv
mysql -h 172.17.0.2 -u root --password=my-secret-pw --database accesslogs -e "select method, scheme, count(*) as nbrcalls from \`${TBL}\` where userAgent not like 'Pingdom%' and userAgent != 'ZmEu' group by method, scheme order by nbrcalls" -B >> /tmp/${TBL}_summary.tsv
echo "" >> /tmp/${TBL}_summary.tsv
echo "Top 10 source IP" >> /tmp/${TBL}_summary.tsv
mysql -h 172.17.0.2 -u root --password=my-secret-pw --database accesslogs -e "select * from (select sourceIP, count(*) as nbrcalls from \`${TBL}\` where userAgent not like 'Pingdom%' and userAgent != 'ZmEu' group by sourceIP order by nbrcalls desc) t limit 10;" -B >> /tmp/${TBL}_summary.tsv
echo "" >> /tmp/${TBL}_summary.tsv
echo "Top 10 full user agent" >> /tmp/${TBL}_summary.tsv
mysql -h 172.17.0.2 -u root --password=my-secret-pw --database accesslogs -e "select * from (select userAgent, count(*) as nbrcalls from \`${TBL}\` where userAgent not like 'Pingdom%' and userAgent != 'ZmEu' group by userAgent order by nbrcalls desc) t limit 10;" -B >> /tmp/${TBL}_summary.tsv
echo "" >> /tmp/${TBL}_summary.tsv
echo "Top 10 short user agent" >> /tmp/${TBL}_summary.tsv
mysql -h 172.17.0.2 -u root --password=my-secret-pw --database accesslogs -e "select * from (select SUBSTRING_INDEX(SUBSTRING_INDEX(userAgent, ' ', 1),'(',1) as userAgent, count(*) as nbrcalls from \`${TBL}\` where userAgent not like 'Pingdom%' and userAgent != 'ZmEu' group by SUBSTRING_INDEX(SUBSTRING_INDEX(userAgent, ' ', 1),'(',1) order by nbrcalls desc) t limit 10;" -B >> /tmp/${TBL}_summary.tsv
echo "" >> /tmp/${TBL}_summary.tsv
echo "Top 10 root uri path" >> /tmp/${TBL}_summary.tsv
mysql -h 172.17.0.2 -u root --password=my-secret-pw --database accesslogs -e "select * from (select SUBSTRING_INDEX(SUBSTRING_INDEX(REPLACE(uri,'//','/'), '?', 1), '/', 2) as root_uri, count(*) as nbrcalls from \`${TBL}\` where userAgent not like 'Pingdom%' and userAgent != 'ZmEu' group by SUBSTRING_INDEX(SUBSTRING_INDEX(REPLACE(uri,'//','/'), '?', 1), '/', 2) order by nbrcalls desc) t limit 10;" -B >> /tmp/${TBL}_summary.tsv
echo "" >> /tmp/${TBL}_summary.tsv
echo "Top 10 short uri path" >> /tmp/${TBL}_summary.tsv
mysql -h 172.17.0.2 -u root --password=my-secret-pw --database accesslogs -e "select * from (select SUBSTRING_INDEX(SUBSTRING_INDEX(REPLACE(uri,'//','/'), '?', 1), '/', 3) as short_uri, count(*) as nbrcalls from \`${TBL}\` where userAgent not like 'Pingdom%' and userAgent != 'ZmEu' group by SUBSTRING_INDEX(SUBSTRING_INDEX(REPLACE(uri,'//','/'), '?', 1), '/', 3) order by nbrcalls desc) t limit 10;" -B >> /tmp/${TBL}_summary.tsv
echo "" >> /tmp/${TBL}_summary.tsv
echo "Top 10 raw uri path" >> /tmp/${TBL}_summary.tsv
mysql -h 172.17.0.2 -u root --password=my-secret-pw --database accesslogs -e "select * from (select SUBSTRING_INDEX(uri,'?', 1) as uri, count(*) as nbrcalls from \`${TBL}\` where userAgent not like 'Pingdom%' and userAgent != 'ZmEu' group by SUBSTRING_INDEX(uri, '?', 1) order by nbrcalls desc) t limit 10;" -B >> /tmp/${TBL}_summary.tsv
rm -rf /tmp/${TBL}/
*/
import (
"bufio"
"flag"
"fmt"
"net/url"
"os"
"regexp"
"sync"
"time"
"database/sql"
_ "github.com/go-sql-driver/mysql"
"github.com/gobike/envflag"
)
var wg sync.WaitGroup
type accessLogEntry struct {
year, month, day, hour int
sourceIP, method, domain, scheme, uri, userAgent string
}
// processLine takes a line and the compiled regex and returns a accessLogEntry
func processLine(re *regexp.Regexp, line string) *accessLogEntry {
entry := accessLogEntry{}
result := re.FindStringSubmatch(line)
// fmt.Printf("%s --->>> %v\n", line, result)
// do not process incorrect lines
if len(result) < 18 {
return nil
}
layout := "2006-01-02T15:04:05.000000Z"
mDate, err := time.Parse(layout, result[1])
if err != nil {
fmt.Println(err)
}
entry.year = mDate.Year()
entry.month = int(mDate.Month())
entry.day = mDate.Day()
entry.hour = mDate.Hour()
entry.sourceIP = result[3]
entry.method = result[14]
u, err := url.Parse(result[15])
if err != nil {
fmt.Println(err)
} else {
entry.domain = u.Hostname()
entry.scheme = u.Scheme
entry.uri = u.RequestURI()
}
entry.userAgent = result[17]
// fmt.Println(entry)
return &entry
}
// processFile reads a file and process each of the lines and sends them to the
// given open channel
func processFile(path string, dataPipe chan *accessLogEntry) {
inFile, _ := os.Open(path)
defer inFile.Close()
scanner := bufio.NewScanner(inFile)
scanner.Split(bufio.ScanLines)
pattern := regexp.MustCompile(`^([^ ]*) ([^ ]*) ([^ ]*):([0-9]*) ([^ ]*)[:\-]([0-9]*) ([-.0-9]*) ([-.0-9]*) ([-.0-9]*) (|[-0-9]*) (-|[-0-9]*) ([-0-9]*) ([-0-9]*) "([^ ]*) ([^ ]*) (- |[^ ]*)" "([^"]*)" ([A-Z0-9-]+) ([A-Za-z0-9.-]*)$`)
for scanner.Scan() {
dataPipe <- processLine(pattern, scanner.Text())
}
close(dataPipe)
}
// Takes the data out of the given channel and pushes it to the given mysql
// table
func channelToDB(user, pwd, host, database, tableName string, createTbl bool, dataPipe chan *accessLogEntry) {
db, err := sql.Open("mysql", fmt.Sprintf("%s:%s@%s/%s?charset=utf8", user, pwd, host, database))
if err != nil {
panic(err)
}
defer db.Close()
if createTbl {
crStmt, err := db.Prepare(fmt.Sprintf("CREATE TABLE IF NOT EXISTS `%s` (`year` INT(4), `month` INT(2), `day` INT(2), `hour` INT(2), `sourceIP` VARCHAR(128), `method` VARCHAR(8), `domain` VARCHAR(256), `scheme` VARCHAR(8), `uri` VARCHAR(512), `userAgent` VARCHAR(512))", tableName))
if err != nil {
fmt.Println(err)
}
_, err = crStmt.Exec()
if err != nil {
fmt.Println(err)
}
crStmt.Close()
}
var (
tx *sql.Tx
stmt *sql.Stmt
)
flagIdx := 0
for elem := range dataPipe {
if elem == nil {
continue
}
if flagIdx == 0 {
tx, err = db.Begin()
if err != nil {
fmt.Println(err)
}
stmt, err = tx.Prepare(fmt.Sprintf("insert into `%s` (`year`, `month`, `day`, `hour`, `sourceIP`, `method`, `domain`, `scheme`, `uri`, `userAgent`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", tableName))
if err != nil {
fmt.Println(err)
}
defer stmt.Close()
}
uriLen := len(elem.uri)
if uriLen > 511 {
uriLen = 511
}
agentLen := len(elem.userAgent)
if agentLen > 511 {
agentLen = 511
}
_, err = stmt.Exec(elem.year, elem.month, elem.day, elem.hour, elem.sourceIP, elem.method, elem.domain, elem.scheme, elem.uri[:uriLen], elem.userAgent[:agentLen])
if err != nil {
fmt.Println(err)
}
flagIdx++
if flagIdx > 10000 {
err = tx.Commit()
if err != nil {
fmt.Println(err)
}
flagIdx = 0
}
}
if flagIdx != 0 {
err = tx.Commit()
}
if err != nil {
fmt.Println(err)
}
wg.Done()
}
func main() {
var (
fPath, dbName, dbHost, dbUser, dbPassword, dbTable string
dbCreateTable bool
)
flag.StringVar(&fPath, "file-path", "text", "Path to the log file. Environment variable: FILE_PATH")
flag.StringVar(&dbName, "db-name", "accesslogs", "Name of the DB to connect to. Environment variable: DB_NAME")
flag.StringVar(&dbHost, "db-host", "", "Name of the DB server to connect to. Environment variable: DB_HOST")
flag.StringVar(&dbUser, "db-user", "", "User name to use to connect to the DB. Environment variable: DB_USER")
flag.StringVar(&dbPassword, "db-pwd", "", "Password to use to connect to the DB. Environment variable: DB_PWD")
flag.StringVar(&dbTable, "db-table", "", "Name of the table to import the data in. Environment variable: DB_TABLE")
flag.BoolVar(&dbCreateTable, "db-create-table", false, "Whether to create the table if it does not exists. Environment variable: DB_CREATE_TABLE")
envflag.Parse()
dp := make(chan *accessLogEntry)
wg.Add(1)
go channelToDB(dbUser, dbPassword, dbHost, dbName, dbTable, dbCreateTable, dp)
processFile(fPath, dp)
wg.Wait()
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment