Analyze nginx logs with Python
# -*- coding: utf-8 -*-
__version__ = '0.2.0'
# Nginx log analysis: rank songs by popularity
# crontab -e schedule for mting_rank.py:
# 1) 06:00 on the 1st of every month ->
#    0 6 1 * * mting_rank.py
# 2) 04:00 every Monday ->
#    0 4 * * 1 mting_rank.py
#
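# Note: the script relies on the third-party packages imported below; they are
# assumed to be installed already, e.g. via
#   pip install tornado xlwt xlrd xlutils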
import os
import fileinput
import re
from tornado import web, ioloop, httpclient as http
import xlwt
import xlrd
from xlutils.copy import copy
import datetime
ezxf = xlwt.easyxf
# location of the nginx log files
dir_log = r"/logs"
songurl = "**********************************"
# 203.208.60.230
ipP = r"?P<ip>[\d.]*"
# [21/Jan/2011:15:04:41 +0800]
timeP = r"""?P<time>\[  # starts with [
        [^\[\]]*        # any characters except [ and ], so we never run into the next [...] item (a non-greedy *? would also work); outside a character class, . matches anything but a newline, and * is greedy, i.e. the engine repeats it as many times as it can
        \]              # ends with ]
        """
requestP = r"""?P<request>\"  # starts with "
           [^\"]*             # any characters except double quotes, so we never run into the next "..." item (a non-greedy *? would also work)
           \"                 # ends with "
           """
rests = r""".+"""
# Idea: the fields are separated by spaces and "-", and each field carries its own sub-pattern
nginxLogPattern = re.compile(r"(%s)\ -\ -\ (%s)\ (%s)\ (%s)" % (ipP, timeP, requestP, rests), re.VERBOSE)
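# A quick sketch of what the compiled pattern captures, using an illustrative
# (assumed, not real) access-log line:
#   sample = '203.208.60.230 - - [21/Jan/2011:15:04:41 +0800] "GET /song/123_64.mp3 HTTP/1.1" 200 1024'
#   m = nginxLogPattern.match(sample)
#   m.group('ip')      -> '203.208.60.230'
#   m.group('time')    -> '[21/Jan/2011:15:04:41 +0800]'
#   m.group('request') -> '"GET /song/123_64.mp3 HTTP/1.1"'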
def getValue(value):
    return value is not None and value.text or '0'

def getBody(url):
    client = http.HTTPClient()
    try:
        res = client.fetch(url)
        return res.body
    except http.HTTPError, e:
        print "Error:", e
def write_xls(file_name, sheet_name, headings, data, heading_xf, data_xfs):
    book = None
    try:
        book = xlrd.open_workbook(file_name + ".swf", formatting_info=True)
        book = copy(book)
    except IOError:
        book = xlwt.Workbook(encoding="utf-8")
    sheet = book.add_sheet(sheet_name)
    rowx = 0
    for colx, value in enumerate(headings):
        sheet.write(rowx, colx, unicode(value, 'utf-8'), heading_xf)
    sheet.set_panes_frozen(True)        # frozen headings instead of split panes
    sheet.set_horz_split_pos(rowx + 1)  # in general, freeze after last heading row
    sheet.set_remove_splits(True)       # if user does unfreeze, don't leave a split there
    for row in data:
        rowx += 1
        for colx, value in enumerate(row):
            sheet.write(rowx, colx, value, data_xfs[colx])
    book.save(file_name)
def processDir(dir_proc):
    os.popen('gzip -d ' + dir_proc + '/*.gz')
    date = datetime.datetime.now()
    includes = []
    if date.day == 1:
        # monthly run: pick up all of last month's log files
        includes = filter(lambda x: (date.month == 1 and str(date.year - 1) + '12' or datetime.datetime(date.year, date.month - 1, date.day).strftime('%Y%m')) in x, os.listdir(dir_proc))
    if date.weekday() == 0:
        b = date - datetime.timedelta(date.weekday() + 1)
        for i in range(6, -1, -1):
            c = b - datetime.timedelta(i)
            includes.append('media.zx.access.' + c.strftime('%Y%m%d'))
    for file in includes:
        if os.path.isdir(os.path.join(dir_proc, file)):
            print "WARN:%s is a directory" % (file)
            processDir(os.path.join(dir_proc, file))
            continue
        if file.endswith(".gz"):
            print "WARN:%s is not a log file" % (file)
            continue
        print "INFO:process file %s" % (file)
        for line in fileinput.input(os.path.join(dir_proc, file)):
            matchs = nginxLogPattern.match(line)
            if matchs != None:
                allGroups = matchs.groups()
                ip = allGroups[0]
                time = allGroups[1]
                request = allGroups[2]
                songInfo = request.split("/")[-2]
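                # Illustration of how the song id is derived (the URL layout is an
                # assumed example, not taken from real traffic):
                #   request                 -> '"GET /song/123_64.mp3 HTTP/1.1"'
                #   request.split("/")[-2]  -> '123_64.mp3 HTTP'
                #   songInfo.split("_")[0]  -> '123'  (passed below as the song id)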
                # record this request's client IP against the song id (per-song unique visitors)
                GetResponseStatusCount(songInfo.split("_")[0], ip)
                # add any other analysis needed here
            else:
                raise Exception
        fileinput.close()
allSongDict = {}
reportInfos = {}

def GetResponseStatusCount(songId, ip):
    if allSongDict.has_key(songId):
        allSongDict[songId].add(ip)
    else:
        # remember the first visitor too, so the per-song count is not off by one
        allSongDict[songId] = set([ip])
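# Shape of the accumulator after a few calls (values are illustrative):
#   allSongDict == {'123': set(['203.208.60.230', '10.0.0.7']), '456': set(['10.0.0.7'])}
# reportLog() below sorts songs by the size of these IP sets, i.e. by unique listeners.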
def formatLog(info):
    body = getBody(songurl + info[0])
    import xml.etree.ElementTree as ET
    root = ET.fromstring(body)
    title = getValue(root.find("song/title"))
    artists = root.findall("song/artists/artist")
    singerNames = ''
    for artist in artists:
        singerNames += getValue(artist.find("name")) + ', '
    return [title, singerNames, len(info[1])]
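# formatLog expects the song service (songurl is masked above) to return XML
# roughly shaped like the following -- this layout is inferred from the element
# paths used above, not from the service's documentation:
#   <response>
#     <song>
#       <title>...</title>
#       <artists>
#         <artist><name>...</name></artist>
#       </artists>
#     </song>
#   </response>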
def reportLog(count):
    reports = []
    # take the top `count` songs ranked by the number of distinct IPs
    for i in sorted(allSongDict.items(), key=lambda d: len(d[1]), reverse=True)[0:count]:
        #reportInfos[i[0]] = formatLog(i)
        reports += [[i[0]] + formatLog(i)]
    # column headings: song id, song title, artist, user count
    hdngs = ['歌曲ID', '歌曲名', '歌手', '用户数']
    kinds = 'text text text int'.split()
    heading_xf = ezxf('font: bold on; align: wrap on, vert centre, horiz center')
    kind_to_xf_map = {
        'text': ezxf(),
        'int': ezxf(num_format_str='#,##0'),
    }
    data_xfs = [kind_to_xf_map[k] for k in kinds]
    date = datetime.datetime.now()
    time = date.strftime("%Y%m%d")
    filename = '/report/rank_'
    if date.day == 1:
        filename = filename + str(date.month == 1 and date.year - 1 or date.year)
        write_xls(filename, time, hdngs, reports, heading_xf, data_xfs)
    if date.weekday() == 0:
        filename = filename + date.strftime("%Y%m")
        write_xls(filename, time, hdngs, reports, heading_xf, data_xfs)
    os.popen('mv ' + filename + ' ' + filename + '.swf')
if __name__ == "__main__":
    processDir(dir_log)
    reportLog(100)
    #print reportInfos
    print "done, python is great!"