Skip to content

Instantly share code, notes, and snippets.

@chyikwei
Created August 27, 2013 19:09
Show Gist options
  • Select an option

  • Save chyikwei/6357723 to your computer and use it in GitHub Desktop.

Select an option

Save chyikwei/6357723 to your computer and use it in GitHub Desktop.
pull objects feeds
import bz2
import os
import re
import random
import csv
from urlparse import urlparse
# Previous location of the bz2 logs, kept for reference:
#path = '/logs/prod/nginx/2013/06'
# Root directory that get_files() walks looking for access logs.
path = '/var/log/remote/prod/nginx/2013/08'
# Numbers of the two-digit subdirectories of `path` to scan (1 through 24);
# presumably days of the month, given the year/month layout of `path`.
dates = range(1, 25)
def get_files(root=None, days=None):
    """Collect guide.getglue.com access-log paths under selected directories.

    Walks *root* and keeps every file whose name starts with
    'guide.getglue.com-access_log' and whose containing directory path
    starts with '<root>/<DD>/' for some DD in *days* (zero-padded to two
    digits). Because of the trailing slash in that prefix, files sitting
    directly in '<root>/<DD>' are not matched; only files in directories
    below it are.

    Args:
        root: Directory to walk. Defaults to the module-level ``path``.
        days: Iterable of integers naming the two-digit subdirectories to
            include. Defaults to the module-level ``dates``.

    Returns:
        A list of '<directory>/<filename>' strings, in os.walk order.
    """
    if root is None:
        root = path
    if days is None:
        days = dates

    # Build the accepted directory prefixes once; str.startswith accepts a
    # tuple, so each directory needs a single call instead of a loop.
    day_prefixes = tuple('%s/%02d/' % (root, day) for day in days)

    logs = []
    for root_dir, _dirnames, filenames in os.walk(root):
        # The directory test does not depend on the file, so do it once per
        # directory rather than once per file.
        if not root_dir.startswith(day_prefixes):
            continue
        for name in filenames:
            if name.startswith('guide.getglue.com-access_log'):
                logs.append('%s/%s' % (root_dir, name))
    return logs
def _is_objects_feed_request(line):
    """Return True if *line* is an objects-stream GET that should be kept.

    A line is kept when it contains '"GET /v4/stream/objects' and contains
    neither '/count' nor 'app=Widget'.
    """
    # Plain substring tests: none of these patterns contains a regex
    # metacharacter, so this accepts exactly the lines the equivalent
    # re.search calls would.
    return ('"GET /v4/stream/objects' in line
            and '/count' not in line
            and 'app=Widget' not in line)


def main():
    """Extract objects-stream requests from the bz2 access logs.

    Processes the paths returned by get_files() in reverse-sorted order.
    Each log is written to '/mnt/log_objects_<a>-<b>-<c>.out', where
    <a>, <b>, <c> are the path components at positions -5..-3 of the log
    path. The first log that maps to a given output file truncates it;
    following logs that map to the same file append to it.

    Within one log, lines are stripped and consecutive duplicate lines are
    dropped before filtering.

    Written for Python 2, where BZ2File yields ``str`` lines.
    """
    last_log = ''
    for log in sorted(get_files(), reverse=True):
        out_log = '/mnt/log_objects_' + '-'.join(log.split('/')[-5:-2]) + '.out'
        # Reverse sorting keeps logs that share an output file adjacent, so
        # comparing against the previous output name is enough to decide
        # between starting the file over and appending to it.
        mode = 'w' if last_log != out_log else 'a'

        # `with` guarantees both handles are closed even if reading or
        # writing raises part-way through a log.
        with open(out_log, mode) as out_fh:
            if mode == 'a':
                print('append file %s' % out_log)
            print(log)
            with bz2.BZ2File(log, 'r') as src:
                prev_line = ''
                for line in src:
                    line = line.strip()
                    if line == prev_line:
                        continue
                    prev_line = line
                    if not _is_objects_feed_request(line):
                        continue
                    # NOTE: to keep only ~10% of the matches, skip here
                    # when random.random() < 0.9.
                    out_fh.write(line + '\n')
        last_log = out_log
# Script entry point: run the extraction only when this file is executed
# directly, not when it is imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment