Skip to content

Instantly share code, notes, and snippets.

@jackywyz
Created October 9, 2011 09:58
Show Gist options
  • Save jackywyz/1273514 to your computer and use it in GitHub Desktop.
Save jackywyz/1273514 to your computer and use it in GitHub Desktop.
Analyze nginx logs with Python
# -*- coding: utf-8 -*-
__version__ = '0.2.0'
# Nginx log analysis: ranks songs by popularity.
# Cron schedule for mting_rank.py (edit with `crontab -e`):
# 1) 06:00 on the 1st of every month ->
# 0 6 1 * * mting_rank.py
# 2) 04:00 every Monday ->
# 0 4 * * 1 mting_rank.py
#
import os
import fileinput
import re
from tornado import web,ioloop,httpclient as http
import xlwt
import xlrd
from xlutils.copy import copy
import datetime
# Short alias for xlwt.easyxf, used below to build cell styles.
ezxf = xlwt.easyxf
# Directory that holds the log files.
dir_log = r"/logs"
# URL prefix to which a song id is appended to look up song details.
# NOTE(review): the value looks masked/redacted here -- confirm the real endpoint.
songurl = "**********************************"
# Client IP field, e.g. 203.208.60.230
ipP = r"?P<ip>[\d.]*";
# Timestamp field, e.g. [21/Jan/2011:15:04:41 +0800]
# The pattern is compiled with re.VERBOSE, so the text after each '#' inside
# the string is a regex comment: starts with '[', then any characters other
# than '[' or ']' (so the match cannot run into a later bracketed field),
# then ends with ']'.
timeP = r"""?P<time>\[ #以[开始
[^\[\]]* #除[]以外的任意字符 防止匹配上下个[]项目(也可以使用非贪婪匹配*?) 不在中括号里的.可以匹配换行外的任意字符 *这样地重复是"贪婪的“ 表达式引擎会试着重复尽可能多的次数。
\] #以]结束
"""
# Request field: starts with a double quote, then any characters other than a
# double quote (so the match cannot run into a later quoted field), then ends
# with a double quote.
requestP = r"""?P<request>\" #以"开始
[^\"]* #除双引号以外的任意字符 防止匹配上下个""项目(也可以使用非贪婪匹配*?)
\" #以"结束
"""
# Everything remaining on the line after the request field.
rests = r""".+"""
# Principle: the fields are told apart by the spaces and '-' separators, and
# each field supplies its own sub-pattern. Each "(%s)" wraps one sub-pattern,
# turning the leading "?P<name>" into a named group.
nginxLogPattern = re.compile(r"(%s)\ -\ -\ (%s)\ (%s)\ (%s)" %(ipP, timeP, requestP, rests), re.VERBOSE)
def getValue(value):
    """Return the text of an element, or '0' when there is none.

    value: an object exposing a ``text`` attribute (the callers pass the
    result of an ElementTree ``find``), or None.

    Returns '0' when ``value`` is None or its text is empty/None;
    otherwise returns ``value.text`` unchanged.
    """
    if value is None:
        return '0'
    text = value.text
    if text:
        return text
    return '0'
def getBody(url):
    """Fetch ``url`` with a blocking tornado HTTP client and return the body.

    Returns the response body on success. When the fetch raises tornado's
    HTTPError the error is printed and None is returned, so callers must be
    prepared for a missing body.
    """
    client = http.HTTPClient()
    try:
        return client.fetch(url).body
    except http.HTTPError as e:
        print("Error: %s" % (e,))
        return None
    finally:
        # Release the client's resources whether or not the fetch succeeded.
        client.close()
def write_xls(file_name, sheet_name, headings, data, heading_xf, data_xfs):
    """Add a sheet of tabular data to a workbook and save it as ``file_name``.

    The workbook is seeded from ``file_name + ".swf"`` when that file can be
    opened; otherwise a new UTF-8 workbook is created.

    file_name:  path the workbook is saved to.
    sheet_name: name of the sheet that is added.
    headings:   UTF-8 encoded byte strings written to the first row.
    data:       iterable of rows; each row is an iterable of cell values.
    heading_xf: style applied to every heading cell.
    data_xfs:   per-column styles, indexed by column number.
    """
    try:
        workbook = copy(
            xlrd.open_workbook(file_name + ".swf", formatting_info=True))
    except IOError:
        workbook = xlwt.Workbook(encoding="utf-8")

    sheet = workbook.add_sheet(sheet_name)

    header_row = 0
    for column, heading in enumerate(headings):
        sheet.write(header_row, column, unicode(heading, 'utf-8'), heading_xf)

    # Freeze the heading row rather than splitting the pane, and make sure
    # no split is left behind if the user unfreezes it.
    sheet.set_panes_frozen(True)
    sheet.set_horz_split_pos(header_row + 1)
    sheet.set_remove_splits(True)

    for row_index, row in enumerate(data, header_row + 1):
        for column, cell in enumerate(row):
            sheet.write(row_index, column, cell, data_xfs[column])

    workbook.save(file_name)
def _select_log_files(dir_proc, today):
    """Return the names of the entries in ``dir_proc`` that are due today."""
    selected = []
    if today.day == 1:
        # Monthly run: every entry whose name contains last month's YYYYMM.
        if today.month == 1:
            month_tag = str(today.year - 1) + '12'
        else:
            month_tag = datetime.datetime(
                today.year, today.month - 1, 1).strftime('%Y%m')
        selected = [name for name in os.listdir(dir_proc) if month_tag in name]
    if today.weekday() == 0:
        # Weekly run (Monday): the seven days before today, oldest first.
        yesterday = today - datetime.timedelta(1)
        for days_back in range(6, -1, -1):
            day = yesterday - datetime.timedelta(days_back)
            selected.append('media.zx.access.' + day.strftime('%Y%m%d'))
    return selected


def _process_log_file(path):
    """Feed every line of the log file at ``path`` into the song statistics."""
    # The with-block closes the file even when a line fails to parse.
    with open(path) as log:
        for line in log:
            match = nginxLogPattern.match(line)
            if match is None:
                raise Exception(
                    "unparsable log line in %s: %r" % (path, line))
            ip = match.group('ip')
            request = match.group('request')
            # Second-to-last '/'-separated piece of the request; the part
            # before its first '_' is used as the song id.
            song_info = request.split("/")[-2]
            GetResponseStatusCount(song_info.split("_")[0], ip)


def processDir(dir_proc):
    """Decompress and analyse the log files in ``dir_proc`` that are due today.

    On the 1st of a month the previous month's files are processed; on a
    Monday the seven daily files before today are processed (both apply when
    the 1st is a Monday). Selected entries that are directories are processed
    recursively, and entries still ending in ".gz" are skipped with a warning.

    Raises Exception when a log line does not match nginxLogPattern.
    """
    # close() waits for gzip to exit, so the decompressed files are in place
    # before the directory is listed.
    os.popen('gzip -d ' + dir_proc + '/*.gz').close()
    today = datetime.datetime.now()
    for name in _select_log_files(dir_proc, today):
        path = os.path.join(dir_proc, name)
        if os.path.isdir(path):
            print("WARN:%s is a directory" % (name,))
            processDir(path)
            continue
        if name.endswith(".gz"):
            print("WARN:%s is not a log file" % (name,))
            continue
        print("INFO:process file %s" % (name,))
        _process_log_file(path)
# Maps a song id to the set of distinct client IPs that requested it.
allSongDict = {}
# NOTE(review): appears unused apart from commented-out code -- confirm
# before removing.
reportInfos = {}


def GetResponseStatusCount(songId, ip):
    """Record that client ``ip`` requested the song ``songId``.

    The IP is added to the song's set in ``allSongDict``; the set is created
    on the first request for a song, and that first IP is counted too.
    """
    allSongDict.setdefault(songId, set()).add(ip)
def formatLog(info):
    """Build the descriptive report columns for one song.

    info: a ``(song_id, ip_set)`` pair, as produced by
    ``allSongDict.items()``.

    Returns ``[title, singer_names, user_count]`` where ``singer_names`` is
    the artists' names joined with ", " and ``user_count`` is the number of
    IPs in ``ip_set``. When the song details cannot be fetched, the title is
    '0' and the singer list is empty.
    """
    import xml.etree.ElementTree as ET

    user_count = len(info[1])
    body = getBody(songurl + info[0])
    if body is None:
        # getBody has already printed the error; keep the row in the report
        # with placeholder details instead of failing on a missing body.
        return ['0', '', user_count]

    root = ET.fromstring(body)
    title = getValue(root.find("song/title"))
    # join() puts the separator only between names, with no trailing ", ".
    singer_names = ', '.join(
        getValue(artist.find("name"))
        for artist in root.findall("song/artists/artist"))
    return [title, singer_names, user_count]
def _save_report(filename, sheet_name, headings, rows, heading_xf, data_xfs):
    """Write one report workbook, then rename it to its '.swf' name."""
    write_xls(filename, sheet_name, headings, rows, heading_xf, data_xfs)
    # write_xls seeds the next run from filename + '.swf', so the freshly
    # saved workbook is moved there.
    os.rename(filename, filename + '.swf')


def reportLog(count):
    """Write the ranking of the ``count`` most requested songs to Excel.

    Songs are ranked by their number of distinct client IPs. On the 1st of a
    month the sheet goes into the yearly workbook ``/report/rank_<year>``
    (the previous year when run on 1 January); on a Monday it goes into
    ``/report/rank_<yyyymm>``. Each workbook name is built from the same
    base, so a day that is both writes two independent files. Nothing is
    written on other days.
    """
    ranked = sorted(allSongDict.items(),
                    key=lambda item: len(item[1]), reverse=True)
    reports = [[song[0]] + formatLog(song) for song in ranked[:count]]

    # Column headings: song id, song title, artist, number of users.
    hdngs = ['歌曲ID','歌曲名','歌手', '用户数']
    kinds = 'text text text int'.split()
    heading_xf = ezxf('font: bold on; align: wrap on, vert centre, horiz center')
    kind_to_xf_map = {
        'text': ezxf(),
        'int': ezxf(num_format_str='#,##0'),
    }
    data_xfs = [kind_to_xf_map[k] for k in kinds]

    today = datetime.datetime.now()
    sheet_name = today.strftime("%Y%m%d")
    base = '/report/rank_'
    if today.day == 1:
        year = today.year - 1 if today.month == 1 else today.year
        _save_report(base + str(year), sheet_name, hdngs, reports,
                     heading_xf, data_xfs)
    if today.weekday() == 0:
        _save_report(base + today.strftime("%Y%m"), sheet_name, hdngs,
                     reports, heading_xf, data_xfs)
def _main():
    """Analyse the logs under ``dir_log`` and write the top-100 report."""
    processDir(dir_log)
    reportLog(100)
    print("done, python is great!")


if __name__ == "__main__":
    _main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment