Analyze nginx logs with Python
# -*- coding: utf-8 -*-
__version__ = '0.2.0'
# Nginx log analysis: rank songs by popularity
# crontab -e schedule for mting_rank.py:
# 1) 06:00 on the 1st of every month ->
#    0 6 1 * * mting_rank.py
# 2) 04:00 every Monday ->
#    0 4 * * 1 mting_rank.py
#
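# Note: the script relies on the third-party packages imported below; they are
# assumed to be installed already, e.g. via
#   pip install tornado xlwt xlrd xlutils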
import os
import fileinput
import re
from tornado import web, ioloop, httpclient as http
import xlwt
import xlrd
from xlutils.copy import copy
import datetime
ezxf = xlwt.easyxf
# location of the nginx log files
dir_log = r"/logs"
songurl = "**********************************"
# 203.208.60.230
ipP = r"?P<ip>[\d.]*"
# [21/Jan/2011:15:04:41 +0800]
timeP = r"""?P<time>\[  # starts with [
        [^\[\]]*        # any characters except [ and ], so we never run into the next [...] item (a non-greedy *? would also work); outside a character class, . matches anything but a newline, and * is greedy, i.e. the engine repeats it as many times as it can
        \]              # ends with ]
        """
requestP = r"""?P<request>\"  # starts with "
           [^\"]*             # any characters except double quotes, so we never run into the next "..." item (a non-greedy *? would also work)
           \"                 # ends with "
           """
rests = r""".+"""
# Idea: the fields are separated by spaces and "-", and each field carries its own sub-pattern
nginxLogPattern = re.compile(r"(%s)\ -\ -\ (%s)\ (%s)\ (%s)" % (ipP, timeP, requestP, rests), re.VERBOSE)
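# A quick sketch of what the compiled pattern captures, using an illustrative
# (assumed, not real) access-log line:
#   sample = '203.208.60.230 - - [21/Jan/2011:15:04:41 +0800] "GET /song/123_64.mp3 HTTP/1.1" 200 1024'
#   m = nginxLogPattern.match(sample)
#   m.group('ip')      -> '203.208.60.230'
#   m.group('time')    -> '[21/Jan/2011:15:04:41 +0800]'
#   m.group('request') -> '"GET /song/123_64.mp3 HTTP/1.1"'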
def getValue(value):
    return value is not None and value.text or '0'

def getBody(url):
    client = http.HTTPClient()
    try:
        res = client.fetch(url)
        return res.body
    except http.HTTPError, e:
        print "Error:", e
def write_xls(file_name, sheet_name, headings, data, heading_xf, data_xfs):
    book = None
    try:
        book = xlrd.open_workbook(file_name + ".swf", formatting_info=True)
        book = copy(book)
    except IOError:
        book = xlwt.Workbook(encoding="utf-8")
    sheet = book.add_sheet(sheet_name)
    rowx = 0
    for colx, value in enumerate(headings):
        sheet.write(rowx, colx, unicode(value, 'utf-8'), heading_xf)
    sheet.set_panes_frozen(True)        # frozen headings instead of split panes
    sheet.set_horz_split_pos(rowx + 1)  # in general, freeze after last heading row
    sheet.set_remove_splits(True)       # if user does unfreeze, don't leave a split there
    for row in data:
        rowx += 1
        for colx, value in enumerate(row):
            sheet.write(rowx, colx, value, data_xfs[colx])
    book.save(file_name)
def processDir(dir_proc):
    os.popen('gzip -d ' + dir_proc + '/*.gz')
    date = datetime.datetime.now()
    includes = []
    if date.day == 1:
        # monthly run: pick up all of last month's log files
        includes = filter(lambda x: (date.month == 1 and str(date.year - 1) + '12' or datetime.datetime(date.year, date.month - 1, date.day).strftime('%Y%m')) in x, os.listdir(dir_proc))
    if date.weekday() == 0:
        b = date - datetime.timedelta(date.weekday() + 1)
        for i in range(6, -1, -1):
            c = b - datetime.timedelta(i)
            includes.append('media.zx.access.' + c.strftime('%Y%m%d'))
    for file in includes:
        if os.path.isdir(os.path.join(dir_proc, file)):
            print "WARN:%s is a directory" % (file)
            processDir(os.path.join(dir_proc, file))
            continue
        if file.endswith(".gz"):
            print "WARN:%s is not a log file" % (file)
            continue
        print "INFO:process file %s" % (file)
        for line in fileinput.input(os.path.join(dir_proc, file)):
            matchs = nginxLogPattern.match(line)
            if matchs != None:
                allGroups = matchs.groups()
                ip = allGroups[0]
                time = allGroups[1]
                request = allGroups[2]
                songInfo = request.split("/")[-2]
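                # Illustration of how the song id is derived (the URL layout is an
                # assumed example, not taken from real traffic):
                #   request                 -> '"GET /song/123_64.mp3 HTTP/1.1"'
                #   request.split("/")[-2]  -> '123_64.mp3 HTTP'
                #   songInfo.split("_")[0]  -> '123'  (passed below as the song id)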
                # record this request's client IP against the song id (per-song unique visitors)
                GetResponseStatusCount(songInfo.split("_")[0], ip)
                # add any other analysis needed here
            else:
                raise Exception
        fileinput.close()
allSongDict = {}
reportInfos = {}

def GetResponseStatusCount(songId, ip):
    if allSongDict.has_key(songId):
        allSongDict[songId].add(ip)
    else:
        # remember the first visitor too, so the per-song count is not off by one
        allSongDict[songId] = set([ip])
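# Shape of the accumulator after a few calls (values are illustrative):
#   allSongDict == {'123': set(['203.208.60.230', '10.0.0.7']), '456': set(['10.0.0.7'])}
# reportLog() below sorts songs by the size of these IP sets, i.e. by unique listeners.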
def formatLog(info):
    body = getBody(songurl + info[0])
    import xml.etree.ElementTree as ET
    root = ET.fromstring(body)
    title = getValue(root.find("song/title"))
    artists = root.findall("song/artists/artist")
    singerNames = ''
    for artist in artists:
        singerNames += getValue(artist.find("name")) + ', '
    return [title, singerNames, len(info[1])]
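# formatLog expects the song service (songurl is masked above) to return XML
# roughly shaped like the following -- this layout is inferred from the element
# paths used above, not from the service's documentation:
#   <response>
#     <song>
#       <title>...</title>
#       <artists>
#         <artist><name>...</name></artist>
#       </artists>
#     </song>
#   </response>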
def reportLog(count):
    reports = []
    # take the top `count` songs ranked by the number of distinct IPs
    for i in sorted(allSongDict.items(), key=lambda d: len(d[1]), reverse=True)[0:count]:
        #reportInfos[i[0]] = formatLog(i)
        reports += [[i[0]] + formatLog(i)]
    # column headings: song id, song title, artist, user count
    hdngs = ['歌曲ID', '歌曲名', '歌手', '用户数']
    kinds = 'text text text int'.split()
    heading_xf = ezxf('font: bold on; align: wrap on, vert centre, horiz center')
    kind_to_xf_map = {
        'text': ezxf(),
        'int': ezxf(num_format_str='#,##0'),
    }
    data_xfs = [kind_to_xf_map[k] for k in kinds]
    date = datetime.datetime.now()
    time = date.strftime("%Y%m%d")
    filename = '/report/rank_'
    if date.day == 1:
        filename = filename + str(date.month == 1 and date.year - 1 or date.year)
        write_xls(filename, time, hdngs, reports, heading_xf, data_xfs)
    if date.weekday() == 0:
        filename = filename + date.strftime("%Y%m")
        write_xls(filename, time, hdngs, reports, heading_xf, data_xfs)
    os.popen('mv ' + filename + ' ' + filename + '.swf')
if __name__ == "__main__":
    processDir(dir_log)
    reportLog(100)
    #print reportInfos
    print "done, python is great!"