Created
September 17, 2018 02:48
-
-
Save LenKIM/add4966cf0f96bf71f92a49e4b092f80 to your computer and use it in GitHub Desktop.
시간별로 로그 파일 분할하기
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
#!/usr/bin/env python3
class PreprocessorHelper:
    """Split raw access-log files into per-day, two-hour CSV bucket files.

    Each input line is tokenised by :meth:`custom_log_parser`; rows with
    exactly ``EXPECTED_FIELD_COUNT`` fields whose timestamp falls on one of
    ``VALID_DAYS`` are appended, pipe-separated, to
    ``<OUT_ROOT><day>/day_<start>_to_<end>hour.csv``.
    """

    # Glob patterns of the raw log files to process.  Further day
    # directories (e.g. 20180824, 20180827) can be added here.
    LOG_GLOBS = (
        '/Users/len/log-analyer-assignment/logdata/20180828/*.txt',
    )
    # Directory prefix (with trailing separator) under which output is written.
    OUT_ROOT = '/Users/len/log-analyer-assignment/out/'
    # Days of the month whose rows are kept; all other rows are ignored.
    VALID_DAYS = (24, 27, 28)
    # Only rows with exactly this many parsed fields are written out.
    EXPECTED_FIELD_COUNT = 14
    # Number of worker threads used to process input files.
    WORKERS = 4

    def __init__(self) -> None:
        super().__init__()

    def file_classification_by_datetime(self):
        """Process every file matched by ``LOG_GLOBS`` on a thread pool.

        Each matched path is printed and then handed to :meth:`abc`.  An
        exception raised while processing any file is re-raised here.
        """
        # Local import: replaces the project-level ``multithreading`` helper,
        # which was being handed ``[self, file_list]`` for an already-bound
        # method and was invoked once per file instead of once for all files.
        from concurrent.futures import ThreadPoolExecutor

        log_files = []
        for pattern in self.LOG_GLOBS:
            log_files.extend(glob.glob(pattern))
        for path in log_files:
            print(path)
        if not log_files:
            return
        with ThreadPoolExecutor(max_workers=self.WORKERS) as pool:
            # Consume the iterator so worker exceptions surface to the caller.
            list(pool.map(self.abc, log_files))

    def abc(self, file_list):
        """Parse one log file and route each valid row to its bucket file.

        :param file_list: path of a single log file (converted with ``str``).

        Rows that do not have exactly ``EXPECTED_FIELD_COUNT`` fields, or
        whose timestamp cannot be parsed, are printed and skipped.
        """
        path = str(file_list)
        with open(path, 'r', encoding='utf8') as infile:
            lines = infile.readlines()
        for line in tqdm(lines):
            row_list = self.custom_log_parser(line)
            # Short rows used to reach the datetime lookup and could raise;
            # they would never have been written anyway.
            if len(row_list) != self.EXPECTED_FIELD_COUNT:
                print(row_list)
                continue
            try:
                user_datetime = datetime.datetime.strptime(
                    row_list[INDEX_OF_DATETIME_IN_LOG()],
                    '%d/%b/%Y:%H:%M:%S %z')
            except ValueError:
                # One malformed timestamp should not abort the whole file.
                print(row_list)
                continue
            self.make_files_valid_datetime(user_datetime, row_list)

    def make_files_valid_datetime(self, _datetime: datetime.datetime, row: List):
        """Forward *row* to the hourly bucketing if its day is in ``VALID_DAYS``.

        Only the day of the month is checked; rows on other days are dropped.
        """
        if _datetime.day in self.VALID_DAYS:
            self.make_file_by_hour(_datetime, row)

    @staticmethod
    def _hour_bucket(hour, width=2):
        """Return ``(start, end)`` of the *width*-hour window containing *hour*."""
        start = hour - hour % width
        return start, start + width

    def make_file_by_hour(self, _datetime: datetime.datetime, row: List):
        """Append *row* to the two-hour bucket file matching *_datetime*.

        The bucket is derived from the timestamp's own wall-clock hour, so
        every hour 0-23 maps to exactly one of 0-2, 2-4, ..., 22-24.  (The
        previous elif chain sent 6-8 rows to the 4-6 file, named the last
        bucket 20-24, and compared against windows built from the current
        year rather than the log's year.)
        """
        start, end = self._hour_bucket(_datetime.hour)
        self.make_file_by_time(_datetime.day, start, end, row)

    def make_file_by_time(self, day, s_time, e_time, row):
        """Append *row*, joined with ``|``, to the bucket file for *day*.

        :param day: day of the month; used as the output sub-directory name.
        :param s_time: first hour of the bucket (used in the file name).
        :param e_time: end hour of the bucket (used in the file name).
        :param row: parsed log fields; ignored unless it has exactly
            ``EXPECTED_FIELD_COUNT`` entries.
        """
        if len(row) != self.EXPECTED_FIELD_COUNT:
            return  # nothing to write, so do not create an empty file
        import os  # local import keeps the block self-contained

        path = f'{self.OUT_ROOT}{day}/day_{s_time}_to_{e_time}hour.csv'
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(path, 'a+', encoding='utf8') as outfile:
            outfile.write('|'.join(row) + '\n')

    def today_at(self, dy, hr, min=0, sec=0, micros=0):
        """Return an Asia/Seoul datetime in August of the current year.

        :param dy: day of the month.
        :param hr: hour; when it is 23 the result is pinned to 23:59:59 and
            *min* / *sec* are ignored.
        :param min: minute (name kept for keyword compatibility).
        :param sec: second.
        :param micros: microsecond.

        NOTE(review): the year comes from ``datetime.now()``, so the result
        is only comparable with log timestamps from the current year.  The
        method is no longer used for bucketing and is kept for compatibility.
        """
        localtz = pytz.timezone('Asia/Seoul')
        now = localtz.localize(datetime.datetime.now())
        if hr == 23:
            return now.replace(month=8, day=dy, hour=hr, minute=59, second=59, microsecond=micros)
        return now.replace(month=8, day=dy, hour=hr, minute=min, second=sec, microsecond=micros)

    def custom_log_parser(self, string) -> List:
        """Split one log line on spaces, keeping quoted/bracketed fields whole.

        Fields starting with ``"`` run to the next token ending in ``"``;
        fields starting with ``[`` run to the next token ending in ``]``.
        The delimiters are stripped, and ``\\"`` / ``\\]`` inside such a
        field are unescaped.  CR/LF characters are removed first.  Consecutive
        spaces outside a quoted field yield empty-string fields.  Tokens of a
        field that is never terminated are discarded.

        :param string: a raw log line.
        :return: list of field strings.
        """
        row = []
        quote_part = None
        quote_end = None
        for token in re.sub('[\r\n]', '', string).split(' '):
            if quote_part:
                quote_part.append(token)
            elif token == '':
                row.append('')
            elif token[0] == '"':
                quote_part = [token]
                quote_end = '"'
            elif token[0] == '[':
                quote_part = [token]
                quote_end = ']'
            else:
                row.append(token)

            if token and token[-1] == quote_end:
                # A terminator preceded by an odd number of backslashes is
                # escaped and must not close the field.  (The old guard
                # compared the terminator itself with a backslash and was
                # therefore always true.)
                before = token[:-1]
                backslashes = len(before) - len(before.rstrip('\\'))
                if backslashes % 2 == 0:
                    joined = ' '.join(quote_part)[1:-1]
                    row.append(joined.replace('\\' + quote_end, quote_end))
                    quote_end = quote_part = None
        return row
# Run the pipeline only when executed as a script, so that importing this
# module does not start reading and writing log files.
if __name__ == '__main__':
    a = PreprocessorHelper()
    a.file_classification_by_datetime()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment