Skip to content

Instantly share code, notes, and snippets.

@LenKIM
Created September 17, 2018 02:48
Show Gist options
  • Save LenKIM/add4966cf0f96bf71f92a49e4b092f80 to your computer and use it in GitHub Desktop.
Save LenKIM/add4966cf0f96bf71f92a49e4b092f80 to your computer and use it in GitHub Desktop.
시간별로 로그 파일 분할하기
# -*- coding: utf-8 -*-
# !/usr/bin/env python3
class PreprocessorHelper:
    """Split raw access-log files into per-day, two-hour CSV buckets.

    Every log line is tokenised by :meth:`custom_log_parser`; rows with
    exactly ``field_count`` fields whose timestamp falls on one of
    ``valid_days`` are appended, ``|``-separated, to
    ``<out_dir>/<day>/day_<start>_to_<end>hour.csv``.
    """

    # Glob patterns of the raw log files to process.  Further days (for
    # example the 20180824 and 20180827 directories) can be added here or
    # passed to the constructor.
    log_patterns = ('/Users/len/log-analyer-assignment/logdata/20180828/*.txt',)
    # Root directory below which the per-day output directories live.
    out_dir = '/Users/len/log-analyer-assignment/out'
    # Days of the month whose rows are kept; all other rows are ignored.
    valid_days = (24, 27, 28)
    # Number of fields a row must have to be written out.
    field_count = 14

    def __init__(self, log_patterns=None, out_dir=None) -> None:
        """Create a helper, optionally overriding the default locations.

        Args:
            log_patterns: Iterable of glob patterns selecting input files;
                ``None`` keeps the class default.
            out_dir: Root output directory; ``None`` keeps the class default.
        """
        super().__init__()
        if log_patterns is not None:
            self.log_patterns = tuple(log_patterns)
        if out_dir is not None:
            self.out_dir = out_dir

    def file_classification_by_datetime(self):
        """Hand every input file matched by ``log_patterns`` to :meth:`abc`."""
        for pattern in self.log_patterns:
            for file_path in glob.glob(pattern):
                print(file_path)
                # NOTE(review): `multithreading` is defined outside this
                # file; the call is kept exactly as it was.  Passing `self`
                # in the argument list next to the bound method `self.abc`
                # looks suspicious -- confirm against the helper's contract.
                multithreading(self.abc, [self, file_path], 4)

    def abc(self, file_list):
        """Parse one log file and route each well-formed row to its bucket.

        Args:
            file_list: Path of a single UTF-8 log file (despite the name).

        Raises:
            ValueError: If a row's timestamp field does not match
                ``%d/%b/%Y:%H:%M:%S %z``.
        """
        file_list = str(file_list)
        with open(file_list, 'r', encoding='utf8') as infile:
            # Read everything up front so the progress bar knows the total.
            lines = infile.readlines()
        for line in tqdm(lines):
            row_list = self.custom_log_parser(line)
            # Only rows with exactly `field_count` fields are ever written,
            # so report and skip everything else here.  This also stops
            # short or blank lines from failing on the timestamp lookup.
            if len(row_list) != self.field_count:
                print(row_list)
                continue
            user_datetime = datetime.datetime.strptime(
                row_list[INDEX_OF_DATETIME_IN_LOG()], '%d/%b/%Y:%H:%M:%S %z')
            self.make_files_valid_datetime(user_datetime, row_list)

    def make_files_valid_datetime(self, _datetime: datetime.datetime, row: List):
        """Forward ``row`` to :meth:`make_file_by_hour` if its day is wanted.

        Rows whose day of the month is not in ``valid_days`` are dropped.
        """
        if _datetime.day in self.valid_days:
            self.make_file_by_hour(_datetime, row)

    def make_file_by_hour(self, _datetime: datetime.datetime, row: List):
        """Append ``row`` to the two-hour bucket containing ``_datetime``.

        The bucket is derived from the timestamp's own wall-clock hour
        (00-02, 02-04, ..., 22-24), so every instant of the day maps to
        exactly one bucket and the result does not depend on the date the
        script happens to run on.
        NOTE(review): this uses the offset carried by the timestamp itself;
        the logs are presumably recorded in Asia/Seoul time -- confirm.
        """
        bucket_start = _datetime.hour // 2 * 2
        self.make_file_by_time(_datetime.day, bucket_start, bucket_start + 2, row)

    def make_file_by_time(self, day, s_time, e_time, row):
        """Append ``row`` to ``<out_dir>/<day>/day_<s_time>_to_<e_time>hour.csv``.

        Rows that do not have exactly ``field_count`` fields are ignored
        and no file is touched for them.  The day directory is created on
        demand.

        Args:
            day: Day of the month, used as the sub-directory name.
            s_time: First hour of the bucket (inclusive).
            e_time: Last hour of the bucket (exclusive).
            row: Parsed log fields; written joined with ``|``.
        """
        if len(row) != self.field_count:
            return
        import os  # local import: the file's import block is not visible here

        day_dir = os.path.join(self.out_dir, str(day))
        # exist_ok tolerates several workers creating the directory at once.
        os.makedirs(day_dir, exist_ok=True)
        file_name = 'day_' + str(s_time) + '_to_' + str(e_time) + 'hour.csv'
        with open(os.path.join(day_dir, file_name), 'a', encoding='utf8') as outfile:
            outfile.write('|'.join(row) + '\n')

    def today_at(self, dy, hr, min=0, sec=0, micros=0):
        """Return an Asia/Seoul datetime in August of the *current* year.

        Kept for backward compatibility; :meth:`make_file_by_hour` no
        longer relies on it.

        Args:
            dy: Day of the month.
            hr: Hour of the day.  For ``hr == 23`` the result is pinned to
                23:59:59 and ``min`` / ``sec`` are ignored.
            min: Minute (ignored when ``hr == 23``).
            sec: Second (ignored when ``hr == 23``).
            micros: Microsecond.
        """
        localtz = pytz.timezone('Asia/Seoul')
        now = localtz.localize(datetime.datetime.now())
        if hr == 23:
            return now.replace(month=8, day=dy, hour=hr, minute=59, second=59, microsecond=micros)
        return now.replace(month=8, day=dy, hour=hr, minute=min, second=sec, microsecond=micros)

    def custom_log_parser(self, string) -> List:
        """Split one log line into fields.

        Fields are separated by single spaces.  A field starting with ``"``
        runs to the next token ending in an unescaped ``"``; a field
        starting with ``[`` runs to the next token ending in ``]``.  The
        delimiters are stripped and ``\\"`` / ``\\]`` inside the field are
        unescaped.  Consecutive spaces outside a quoted field produce empty
        fields.  A quoted field that is never closed is discarded.

        Args:
            string: Raw log line; CR and LF characters are removed first.

        Returns:
            The list of field strings.
        """
        row = []
        quote_part = []
        quote_end = ''
        for token in re.sub('[\r\n]', '', string).split(' '):
            opened_here = False
            if quote_part:
                quote_part.append(token)
            elif token == '':
                row.append('')
            elif token[0] == '"':
                quote_part = [token]
                quote_end = '"'
                opened_here = True
            elif token[0] == '[':
                quote_part = [token]
                quote_end = ']'
                opened_here = True
            else:
                row.append(token)

            if not quote_part or not token.endswith(quote_end):
                continue
            # A lone quote character that just opened the field cannot be
            # its closing delimiter as well.
            if opened_here and len(token) == 1:
                continue
            # An odd number of backslashes right before the delimiter means
            # the delimiter itself is escaped and the field continues.
            before_end = token[:-1]
            backslashes = len(before_end) - len(before_end.rstrip('\\'))
            if backslashes % 2:
                continue
            row.append(' '.join(quote_part)[1:-1].replace('\\' + quote_end, quote_end))
            quote_part = []
            quote_end = ''
        return row
# Script entry point: build a helper and immediately start splitting the log
# files.  These statements run at module level, so they also execute whenever
# this file is imported.
a = PreprocessorHelper()
a.file_classification_by_datetime()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment