Last active
December 17, 2015 03:38
-
-
Save lu911/5544443 to your computer and use it in GitHub Desktop.
identify_timezone
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#-*-coding:utf8-*- | |
from study.models import * | |
from BeautifulSoup import BeautifulSoup | |
import re,datetime | |
# Substrings that disqualify a regex match in get_time(): a match whose joined
# groups contain any of these is skipped.
exclude_words = [u'1:1', u'명', u'문장', u'시간', u'PT', u'세', u'일', u'월']
# Patterns for time expressions; get_time() tries them in list order and stops
# at the first pattern that yields an accepted match.  Shared pieces:
#   marker    - 저녁/아침 (optionally followed by 시간), 오전/오후, and in some
#               patterns AM/PM
#   hour      - [0-2]?\d
#   minutes   - [0-5]?\d with optional 분, or 반
#   separator - one of  -  –  ~  부터  또는
# 1) Range "<time> <separator> <time>" with every marker optional; the second
#    hour must be followed by 시 or ':'.
time_regex = [ur'((?:저녁|아침)(?:시간)?|AM|PM|오(?:전|후))?\s*([0-2]?\d)\s*([시:])?\s*((?:[0-5]?\d분?|반))?\s*((?:저녁|아침)(?:시간)?|명|AM|PM|오(?:전|후))?\s*((?:-|–|~|부터|또는))\s*((?:저녁|아침)(?:시간?)?|AM|PM|오(?:전|후))?\s*([0-2]?\d)\s*([시:])\s*((?:[0-5]?\d분?|반))?\s*((?:저녁|아침)(?:시간)?|AM|PM|오(?:전|후))?',
# 2) Range whose first time is preceded by a required Korean marker.
ur'((?:저녁|아침)(?:시간)?|오(?:전|후))\s*([0-2]?\d)\s*([시:])?\s*((?:[0-5]?\d분?|반))?\s*((?:-|–|~|부터|또는))\s*((?:저녁|아침)(?:시간?)?|오(?:전|후))?\s*([0-2]?\d)\s*([시:])?\s*((?:[0-5]?\d분?|반))?',
# 3) Range whose second time is preceded by a required Korean marker.
ur'((?:저녁|아침)(?:시간)?|오(?:전|후))?\s*([0-2]?\d)\s*([시:])?\s*((?:[0-5]?\d분?|반))?\s*((?:-|–|~|부터|또는))\s*((?:저녁|아침)(?:시간?)?|오(?:전|후))\s*([0-2]?\d)\s*([시:])?\s*((?:[0-5]?\d분?|반))?',
# 4) Range whose first time is followed by a required AM/PM.
ur'([0-2]?\d)\s*([시:])?\s*((?:[0-5]?\d분?|반))?\s*(AM|PM)\s*((?:-|–|~|부터|또는))\s*([0-2]?\d)\s*([시:])?\s*((?:[0-5]?\d분?|반))?\s*(AM|PM)?',
# 5) Range whose second time is followed by a required AM/PM.
ur'([0-2]?\d)\s*([시:])?\s*((?:[0-5]?\d분?|반))?\s*(AM|PM)?\s*((?:-|–|~|부터|또는))\s*([0-2]?\d)\s*([시:])?\s*((?:[0-5]?\d분?|반))?\s*(AM|PM)',
# 6) Single time: hour, then 시 or ':', then minutes written with 분 (or 반);
#    markers on either side are optional.
ur'((?:저녁|아침)(?:시간)?|AM|PM|오(?:전|후))?\s*([0-2]?\d)\s*([시:]\s*(?:[0-5]?\d분|반))\s*((?:저녁|아침)(?:시간)?|AM|PM|오(?:전|후))?',
# 7) Single hour followed by 시 (or 시간); markers on either side are optional.
#    A "N시간" match contains the exclude word 시간 and is rejected by get_time().
ur'((?:저녁|아침)(?:시간)?|AM|PM|오(?:전|후))?\s*([0-2]?\d시간?)\s*((?:저녁|아침)(?:시간)?|AM|PM|오(?:전|후))?']
def get_clean_content(study):
    """Return ``study.content`` as plain text prepared for time matching.

    Steps, in order:
      1. ``<br>`` tags are turned into newlines.
      2. The remaining markup is dropped by taking BeautifulSoup's ``.text``.
      3. Every space character is removed.
      4. En dashes (U+2013) are normalised to ``-``.
      5. Lines consisting solely of a Korean-style phone number are blanked,
         presumably so their digits are not picked up as times.

    :param study: object exposing the HTML body as ``study.content``.
    :returns: the cleaned text.
    """
    html = re.sub(r'<br\s*(>)*(/>)*', '\n', study.content)
    text = BeautifulSoup(html).text.replace(' ', '')
    # The en dash is spelled as a unicode escape: a byte-string pattern
    # holding its UTF-8 bytes does not match U+2013 in unicode text.
    text = text.replace(u'\u2013', u'-')
    # re.sub's 4th positional argument is ``count``, not ``flags``; passing
    # re.M there silently disabled MULTILINE, so ^/$ only anchored the whole
    # text.  Compiling with the flag makes the per-line anchors effective.
    phone_line = re.compile(
        r'^(?:01[016789]{1}|02|0[3-9]{1}[0-9]{1})[-.]?[0-9]{3,4}[-.\s]?[0-9]{4}$',
        re.M)
    return phone_line.sub('', text)
def get_study_list(write_time=None):
    """Return the ``Study`` objects whose ``write_time`` equals *write_time*.

    :param write_time: ``datetime.datetime`` to match exactly.  Defaults to
        2013-05-03 00:00, the value that was previously hard-coded.
    :returns: the result of ``Study.objects.filter(...)`` (presumably a
        Django queryset -- confirm against ``study.models``).
    """
    if write_time is None:
        # ``day=3`` rather than ``day=03``: a leading zero makes an octal
        # literal on Python 2 and is a syntax error on Python 3.
        write_time = datetime.datetime(year=2013, month=5, day=3)
    return Study.objects.filter(write_time=write_time)
def identify_time_zone(time):
    """Classify a matched time expression as a coarse part of the day.

    Only the first hour found in *time* is considered.  Rules:

    * an AM marker (오전, 아침, AM) always yields ``'ante meridiem'``;
    * a PM marker (오후, 저녁, PM) makes hours below 12 count as 12 + hour;
    * without a marker, 8-11 is ``'ante meridiem'`` and hours below 8 are
      read as afternoon/evening (12 + hour);
    * 12-17 is ``'post meridiem'``, 18-23 is ``'evening'``;
    * anything else, including text without digits, is ``'etc'``.

    :param time: text of the matched time expression.
    :returns: ``'ante meridiem'``, ``'post meridiem'``, ``'evening'`` or
        ``'etc'``.
    """
    compact = re.sub(r'\s+', '', time)
    hour_match = re.search(r'[0-2]?\d', compact)
    if hour_match is None:
        # No digits at all: the old code fell through to int([]) and died
        # with an uncaught TypeError.
        return 'etc'
    hour = int(hour_match.group())

    # Callers match with re.I, so markers may arrive in any letter case.
    markers = compact.upper()
    if any(word in markers for word in (u'오전', u'AM', u'아침')):
        return 'ante meridiem'
    has_pm_marker = any(word in markers for word in (u'오후', u'PM', u'저녁'))

    # The morning guess applies only when nothing says PM; previously it ran
    # before the PM checks, so "오후 9시" was reported as 'ante meridiem'.
    if not has_pm_marker and 8 <= hour < 12:
        return 'ante meridiem'
    if hour < 12:
        # 12-hour style value (explicit PM, or an unmarked small hour).
        hour += 12
    if 12 <= hour < 18:
        return 'post meridiem'
    if 18 <= hour < 24:
        return 'evening'
    return 'etc'
def get_time(study):
    """Find the first acceptable time expression in *study* and report it.

    The patterns in ``time_regex`` are tried in order.  For each match the
    captured groups are joined into one string; a match containing any of
    ``exclude_words`` is skipped.  On the first accepted match the global
    ``study_count`` is incremented, the study id, the matched text and its
    ``identify_time_zone`` label are printed, and the search stops.  Nothing
    is printed or counted when no pattern yields an accepted match.

    :param study: object with ``id`` and the ``content`` read by
        ``get_clean_content``.
    :returns: None.
    """
    global study_count
    content = get_clean_content(study)
    for regex in time_regex:
        for groups in re.findall(regex, content, re.I):
            # findall yields one tuple of groups per match; join it once
            # instead of once per exclude word.
            time = ''.join(groups)
            if any(word in time for word in exclude_words):
                continue
            # Tolerate being called before test1()/test2() initialised the
            # counter instead of raising NameError.
            study_count = globals().get('study_count', 0) + 1
            time_zone = identify_time_zone(time)
            print('Study ID : %s' % study.id)
            print('Time : %s' % time)
            print('Meridiem : %s' % time_zone)
            return
def test1():
    """Run get_time over every study from get_study_list and print the hit rate.

    Resets the global ``study_count``, lets ``get_time`` increment it, then
    prints the count and the percentage of studies in which a time was found.
    """
    global study_count
    study_count = 0
    study_list = get_study_list()
    for study in study_list:
        get_time(study)
    total = len(study_list)
    # An empty result set used to raise ZeroDivisionError; report 0% instead.
    percent = (float(study_count) / float(total)) * 100.0 if total else 0.0
    print("study_count : %d Percent : %.2f" % (study_count, percent))
def test2(start, end):
    """Run get_time over ``get_study_list()[start:end]`` and print the hit rate.

    :param start: index of the first study to process (inclusive).
    :param end: index at which to stop (exclusive).

    Resets the global ``study_count``, lets ``get_time`` increment it, then
    prints the count and the percentage of processed studies in which a time
    was found.
    """
    global study_count
    study_count = 0
    studies = list(get_study_list()[start:end])
    for study in studies:
        get_time(study)
    # Divide by the number of studies actually processed.  The previous
    # denominator, (end - 1) - start, was one short of the slice length, hit
    # zero when end - start == 1, and ignored slices truncated by the end of
    # the list.
    total = len(studies)
    percent = (float(study_count) / float(total)) * 100.0 if total else 0.0
    print("study_count : %d Percent : %.2f" % (study_count, percent))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment