Last active
January 22, 2018 21:53
-
-
Save aerostitch/bc9d879fa38a840c0003220d1a8c37fd to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| package main | |
| // Parses classic ELB access logs and puts them inside a MySQL/MariaDB table | |
| // Easy way to get a DB: | |
| // docker run --name some-mariadb -e MYSQL_ROOT_PASSWORD=my-secret-pw -e MYSQL_DATABASE=accesslogs -p 3306:3306 -d mariadb:latest | |
| // | |
| // The bulk load your files: | |
| // for f in ~/Downloads/*.txt ; do go run aws_elb_log_analyzer.go -db-create-table -db-host "tcp(172.17.0.2)" -db-name accesslogs -db-user root -db-pwd my-secret-pw -db-table bla -file-path $f; done | |
| // Or: | |
| // TBL=bla ; find /tmp/${TBL} -type f -name '*.log' -o -name '*.txt' | while read f; do echo "Processing $f"; go run aws_elb_log_analyzer.go -db-create-table -db-host "tcp(172.17.0.2)" -db-name accesslogs -db-user root -db-pwd my-secret-pw -db-table ${TBL} -file-path $f; done | |
| // | |
| // And do reports: | |
| // - By day and IP | |
| // select year, month, day, sourceIP, count(*) as nbrcalls from bla group by year, month, day, sourceIP order by nbrcalls; | |
| // - By uri | |
| // select SUBSTRING_INDEX(uri, '?', 1), count(*) as nbrcalls from bla group by SUBSTRING_INDEX(uri, '?', 1) order by nbrcalls; | |
| // - By userAgent | |
| // select SUBSTRING_INDEX(userAgent, ' (', 1), count(*) as nbrcalls from bla group by SUBSTRING_INDEX(userAgent, ' (', 1) order by nbrcalls; | |
| // - A bit of filtering | |
| // select year, month, day, hour, SUBSTRING_INDEX(userAgent, ' (', 1) as agent, SUBSTRING_INDEX(uri, '?', 1) as uri, count(*) as nbrcalls from bla where userAgent not like 'Pingdom%' and userAgent != 'ZmEu' group by year, month, day, hour, SUBSTRING_INDEX(userAgent, ' (', 1), SUBSTRING_INDEX(uri, '?', 1) order by year, month, day, hour, nbrcalls; | |
| // Usage example in e a script: | |
| /* | |
| #!/bin/bash | |
| if [ $# -ne 1 ]; then | |
| echo "argument required" | |
| exit 1 | |
| fi | |
| TBL=$1 | |
| BUCKET=my-elb-logs | |
| aws s3 cp --recursive --exclude "*" --include "*2018/*" s3://${BUCKET}/${TBL}/AWSLogs/ /tmp/${TBL} | |
| find /tmp/${TBL} -type f -name '*.log' -o -name '*.txt' | while read f; do | |
| echo "Processing $f" | |
| go run aws_elb_log_analyzer.go -db-create-table -db-host "tcp(172.17.0.2)" -db-name accesslogs -db-user root -db-pwd my-secret-pw -db-table ${TBL} -file-path $f | |
| done | |
| mysql -h 172.17.0.2 -u root --password=my-secret-pw --database accesslogs -e "select CONCAT(year, '-', month, '-', day) as date, SUBSTRING_INDEX(userAgent, ' ', 1) as agent, SUBSTRING_INDEX(SUBSTRING_INDEX(REPLACE(uri,'//','/'), '?', 1), '/', 3) as shorturi, count(*) as nbrcalls from \`${TBL}\` where userAgent not like 'Pingdom%' and userAgent != 'ZmEu' group by year, month, day, SUBSTRING_INDEX(userAgent, ' ', 1), SUBSTRING_INDEX(SUBSTRING_INDEX(REPLACE(uri,'//','/'), '?', 1), '/', 3) order by year, month, day, nbrcalls" -B > /tmp/${TBL}_short.tsv | |
| echo "Requests per day" > /tmp/${TBL}_summary.tsv | |
| mysql -h 172.17.0.2 -u root --password=my-secret-pw --database accesslogs -e "select CONCAT(year, '-', month, '-', day) as date, count(*) as nbrcalls from \`${TBL}\` where userAgent not like 'Pingdom%' and userAgent != 'ZmEu' group by year, month, day order by year, month, day, nbrcalls" -B >> /tmp/${TBL}_summary.tsv | |
| echo "" >> /tmp/${TBL}_summary.tsv | |
| echo "Requests per method and scheme" >> /tmp/${TBL}_summary.tsv | |
| mysql -h 172.17.0.2 -u root --password=my-secret-pw --database accesslogs -e "select method, scheme, count(*) as nbrcalls from \`${TBL}\` where userAgent not like 'Pingdom%' and userAgent != 'ZmEu' group by method, scheme order by nbrcalls" -B >> /tmp/${TBL}_summary.tsv | |
| echo "" >> /tmp/${TBL}_summary.tsv | |
| echo "Top 10 source IP" >> /tmp/${TBL}_summary.tsv | |
| mysql -h 172.17.0.2 -u root --password=my-secret-pw --database accesslogs -e "select * from (select sourceIP, count(*) as nbrcalls from \`${TBL}\` where userAgent not like 'Pingdom%' and userAgent != 'ZmEu' group by sourceIP order by nbrcalls desc) t limit 10;" -B >> /tmp/${TBL}_summary.tsv | |
| echo "" >> /tmp/${TBL}_summary.tsv | |
| echo "Top 10 full user agent" >> /tmp/${TBL}_summary.tsv | |
| mysql -h 172.17.0.2 -u root --password=my-secret-pw --database accesslogs -e "select * from (select userAgent, count(*) as nbrcalls from \`${TBL}\` where userAgent not like 'Pingdom%' and userAgent != 'ZmEu' group by userAgent order by nbrcalls desc) t limit 10;" -B >> /tmp/${TBL}_summary.tsv | |
| echo "" >> /tmp/${TBL}_summary.tsv | |
| echo "Top 10 short user agent" >> /tmp/${TBL}_summary.tsv | |
| mysql -h 172.17.0.2 -u root --password=my-secret-pw --database accesslogs -e "select * from (select SUBSTRING_INDEX(SUBSTRING_INDEX(userAgent, ' ', 1),'(',1) as userAgent, count(*) as nbrcalls from \`${TBL}\` where userAgent not like 'Pingdom%' and userAgent != 'ZmEu' group by SUBSTRING_INDEX(SUBSTRING_INDEX(userAgent, ' ', 1),'(',1) order by nbrcalls desc) t limit 10;" -B >> /tmp/${TBL}_summary.tsv | |
| echo "" >> /tmp/${TBL}_summary.tsv | |
| echo "Top 10 root uri path" >> /tmp/${TBL}_summary.tsv | |
| mysql -h 172.17.0.2 -u root --password=my-secret-pw --database accesslogs -e "select * from (select SUBSTRING_INDEX(SUBSTRING_INDEX(REPLACE(uri,'//','/'), '?', 1), '/', 2) as root_uri, count(*) as nbrcalls from \`${TBL}\` where userAgent not like 'Pingdom%' and userAgent != 'ZmEu' group by SUBSTRING_INDEX(SUBSTRING_INDEX(REPLACE(uri,'//','/'), '?', 1), '/', 2) order by nbrcalls desc) t limit 10;" -B >> /tmp/${TBL}_summary.tsv | |
| echo "" >> /tmp/${TBL}_summary.tsv | |
| echo "Top 10 short uri path" >> /tmp/${TBL}_summary.tsv | |
| mysql -h 172.17.0.2 -u root --password=my-secret-pw --database accesslogs -e "select * from (select SUBSTRING_INDEX(SUBSTRING_INDEX(REPLACE(uri,'//','/'), '?', 1), '/', 3) as short_uri, count(*) as nbrcalls from \`${TBL}\` where userAgent not like 'Pingdom%' and userAgent != 'ZmEu' group by SUBSTRING_INDEX(SUBSTRING_INDEX(REPLACE(uri,'//','/'), '?', 1), '/', 3) order by nbrcalls desc) t limit 10;" -B >> /tmp/${TBL}_summary.tsv | |
| echo "" >> /tmp/${TBL}_summary.tsv | |
| echo "Top 10 raw uri path" >> /tmp/${TBL}_summary.tsv | |
| mysql -h 172.17.0.2 -u root --password=my-secret-pw --database accesslogs -e "select * from (select SUBSTRING_INDEX(uri,'?', 1) as uri, count(*) as nbrcalls from \`${TBL}\` where userAgent not like 'Pingdom%' and userAgent != 'ZmEu' group by SUBSTRING_INDEX(uri, '?', 1) order by nbrcalls desc) t limit 10;" -B >> /tmp/${TBL}_summary.tsv | |
| rm -rf /tmp/${TBL}/ | |
| */ | |
| import ( | |
| "bufio" | |
| "flag" | |
| "fmt" | |
| "net/url" | |
| "os" | |
| "regexp" | |
| "sync" | |
| "time" | |
| "database/sql" | |
| _ "github.com/go-sql-driver/mysql" | |
| "github.com/gobike/envflag" | |
| ) | |
| var wg sync.WaitGroup | |
| type accessLogEntry struct { | |
| year, month, day, hour int | |
| sourceIP, method, domain, scheme, uri, userAgent string | |
| } | |
| // processLine takes a line and the compiled regex and returns a accessLogEntry | |
| func processLine(re *regexp.Regexp, line string) *accessLogEntry { | |
| entry := accessLogEntry{} | |
| result := re.FindStringSubmatch(line) | |
| // fmt.Printf("%s --->>> %v\n", line, result) | |
| // do not process incorrect lines | |
| if len(result) < 18 { | |
| return nil | |
| } | |
| layout := "2006-01-02T15:04:05.000000Z" | |
| mDate, err := time.Parse(layout, result[1]) | |
| if err != nil { | |
| fmt.Println(err) | |
| } | |
| entry.year = mDate.Year() | |
| entry.month = int(mDate.Month()) | |
| entry.day = mDate.Day() | |
| entry.hour = mDate.Hour() | |
| entry.sourceIP = result[3] | |
| entry.method = result[14] | |
| u, err := url.Parse(result[15]) | |
| if err != nil { | |
| fmt.Println(err) | |
| } else { | |
| entry.domain = u.Hostname() | |
| entry.scheme = u.Scheme | |
| entry.uri = u.RequestURI() | |
| } | |
| entry.userAgent = result[17] | |
| // fmt.Println(entry) | |
| return &entry | |
| } | |
| // processFile reads a file and process each of the lines and sends them to the | |
| // given open channel | |
| func processFile(path string, dataPipe chan *accessLogEntry) { | |
| inFile, _ := os.Open(path) | |
| defer inFile.Close() | |
| scanner := bufio.NewScanner(inFile) | |
| scanner.Split(bufio.ScanLines) | |
| pattern := regexp.MustCompile(`^([^ ]*) ([^ ]*) ([^ ]*):([0-9]*) ([^ ]*)[:\-]([0-9]*) ([-.0-9]*) ([-.0-9]*) ([-.0-9]*) (|[-0-9]*) (-|[-0-9]*) ([-0-9]*) ([-0-9]*) "([^ ]*) ([^ ]*) (- |[^ ]*)" "([^"]*)" ([A-Z0-9-]+) ([A-Za-z0-9.-]*)$`) | |
| for scanner.Scan() { | |
| dataPipe <- processLine(pattern, scanner.Text()) | |
| } | |
| close(dataPipe) | |
| } | |
| // Takes the data out of the given channel and pushes it to the given mysql | |
| // table | |
| func channelToDB(user, pwd, host, database, tableName string, createTbl bool, dataPipe chan *accessLogEntry) { | |
| db, err := sql.Open("mysql", fmt.Sprintf("%s:%s@%s/%s?charset=utf8", user, pwd, host, database)) | |
| if err != nil { | |
| panic(err) | |
| } | |
| defer db.Close() | |
| if createTbl { | |
| crStmt, err := db.Prepare(fmt.Sprintf("CREATE TABLE IF NOT EXISTS `%s` (`year` INT(4), `month` INT(2), `day` INT(2), `hour` INT(2), `sourceIP` VARCHAR(128), `method` VARCHAR(8), `domain` VARCHAR(256), `scheme` VARCHAR(8), `uri` VARCHAR(512), `userAgent` VARCHAR(512))", tableName)) | |
| if err != nil { | |
| fmt.Println(err) | |
| } | |
| _, err = crStmt.Exec() | |
| if err != nil { | |
| fmt.Println(err) | |
| } | |
| crStmt.Close() | |
| } | |
| var ( | |
| tx *sql.Tx | |
| stmt *sql.Stmt | |
| ) | |
| flagIdx := 0 | |
| for elem := range dataPipe { | |
| if elem == nil { | |
| continue | |
| } | |
| if flagIdx == 0 { | |
| tx, err = db.Begin() | |
| if err != nil { | |
| fmt.Println(err) | |
| } | |
| stmt, err = tx.Prepare(fmt.Sprintf("insert into `%s` (`year`, `month`, `day`, `hour`, `sourceIP`, `method`, `domain`, `scheme`, `uri`, `userAgent`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", tableName)) | |
| if err != nil { | |
| fmt.Println(err) | |
| } | |
| defer stmt.Close() | |
| } | |
| uriLen := len(elem.uri) | |
| if uriLen > 511 { | |
| uriLen = 511 | |
| } | |
| agentLen := len(elem.userAgent) | |
| if agentLen > 511 { | |
| agentLen = 511 | |
| } | |
| _, err = stmt.Exec(elem.year, elem.month, elem.day, elem.hour, elem.sourceIP, elem.method, elem.domain, elem.scheme, elem.uri[:uriLen], elem.userAgent[:agentLen]) | |
| if err != nil { | |
| fmt.Println(err) | |
| } | |
| flagIdx++ | |
| if flagIdx > 10000 { | |
| err = tx.Commit() | |
| if err != nil { | |
| fmt.Println(err) | |
| } | |
| flagIdx = 0 | |
| } | |
| } | |
| if flagIdx != 0 { | |
| err = tx.Commit() | |
| } | |
| if err != nil { | |
| fmt.Println(err) | |
| } | |
| wg.Done() | |
| } | |
| func main() { | |
| var ( | |
| fPath, dbName, dbHost, dbUser, dbPassword, dbTable string | |
| dbCreateTable bool | |
| ) | |
| flag.StringVar(&fPath, "file-path", "text", "Path to the log file. Environment variable: FILE_PATH") | |
| flag.StringVar(&dbName, "db-name", "accesslogs", "Name of the DB to connect to. Environment variable: DB_NAME") | |
| flag.StringVar(&dbHost, "db-host", "", "Name of the DB server to connect to. Environment variable: DB_HOST") | |
| flag.StringVar(&dbUser, "db-user", "", "User name to use to connect to the DB. Environment variable: DB_USER") | |
| flag.StringVar(&dbPassword, "db-pwd", "", "Password to use to connect to the DB. Environment variable: DB_PWD") | |
| flag.StringVar(&dbTable, "db-table", "", "Name of the table to import the data in. Environment variable: DB_TABLE") | |
| flag.BoolVar(&dbCreateTable, "db-create-table", false, "Whether to create the table if it does not exists. Environment variable: DB_CREATE_TABLE") | |
| envflag.Parse() | |
| dp := make(chan *accessLogEntry) | |
| wg.Add(1) | |
| go channelToDB(dbUser, dbPassword, dbHost, dbName, dbTable, dbCreateTable, dp) | |
| processFile(fPath, dp) | |
| wg.Wait() | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment