Created
March 31, 2020 02:56
-
-
Save juice500ml/24e27cd8d395b6635fdf6acec9bb67d2 to your computer and use it in GitHub Desktop.
mma_crawler.service
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
requests
beautifulsoup4
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pathlib | |
import pickle | |
import socket | |
import time | |
import traceback | |
import requests | |
import bs4 | |
class PickleDatabase():
    """Small persistent set of already-seen items, backed by a pickle file.

    The whole set is loaded into memory on construction; changes made with
    :meth:`update` only reach the disk when :meth:`commit` is called.

    NOTE: the file is read with ``pickle.load``. Only point this class at a
    file written by this program, never at data from an untrusted source.
    """

    def __init__(self, filename='db.pkl'):
        """Load the set stored in *filename*, creating an empty one if absent.

        Args:
            filename: Path of the pickle file that holds the set.
        """
        # Seed a fresh database file so the load below always has input.
        if not pathlib.Path(filename).is_file():
            with open(filename, 'wb') as f:
                pickle.dump(set(), f)
        with open(filename, 'rb') as f:
            self._db = pickle.load(f)
        self._filename = filename

    def commit(self):
        """Write the in-memory set back to the database file.

        The data is first written to a sibling ``<filename>.tmp`` file and
        then renamed over the real file, so an interruption in the middle of
        the write does not leave a truncated database behind (the rename is
        atomic on POSIX systems).
        """
        tmp_path = pathlib.Path(str(self._filename) + '.tmp')
        with tmp_path.open('wb') as f:
            pickle.dump(self._db, f)
        tmp_path.replace(self._filename)

    def update(self, value):
        """Add *value* (must be hashable) to the in-memory set."""
        self._db.add(value)

    def find(self, value):
        """Return True if *value* is already in the set, else False."""
        return value in self._db
def parse_board(timeout=30):
    """Fetch the board listing page and return the posts it shows.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely, which would stall
            the polling loop that calls this function.

    Returns:
        A list of ``(name, url)`` tuples, one per row of the listing table.
        ``name`` is the stripped text of the link in the row's first cell
        and ``url`` is that link's raw ``onclick`` attribute.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    resp = requests.post(
        url='https://work.mma.go.kr/caisBYIS/board/boardList.do',
        data={
            'gesipan_gbcd': '13',
            'tmpl_id': '1',
            'menu_id': 'm_m8_6',
            'pageUnit': '100',  # presumably rows per page -- TODO confirm
        },
        timeout=timeout,
    )
    # Fail with a clear HTTP error instead of trying to parse an error page.
    resp.raise_for_status()

    soup = bs4.BeautifulSoup(resp.text, 'html.parser')
    rows = soup.find('table', class_='brd_list_n').tbody.find_all('tr')

    ret = []
    for row in rows:
        # The post link lives in the first cell of each row.
        link = row.find_all('td')[0].a
        ret.append((link.text.strip(), link['onclick']))
    return ret
def parse_url(url):
    """Turn a board link's ``onclick`` JavaScript call into form parameters.

    HACK: the listing page exposes each post only as a JavaScript call, so
    the call's argument list is extracted by plain string handling.

    Example:
        ``"javascript:fnBoardView('m_m8_6','13','2000127135','','','1','10');"``
        yields the arguments
        ``['m_m8_6', '13', '2000127135', '', '', '1', '10']``.

    Args:
        url: The ``onclick`` attribute value of a post link.

    Returns:
        A dict mapping the seven form field names to the call's first seven
        arguments, in order. Extra arguments are ignored.

    Raises:
        ValueError: If *url* has no parenthesised argument list or fewer
            than seven arguments.
    """
    keys = (
        'menu_id',
        'gesipan_gbcd',
        'ilryeon_no',
        'searchCondition',
        'searchKeyword',
        'pageIndex',
        'pageUnit',
    )

    start = url.find('(')
    end = url.rfind(')')
    if start == -1 or end < start:
        raise ValueError('no argument list found in {!r}'.format(url))

    # Strip surrounding whitespace before the quotes so that both
    # "'a','b'" and "'a', 'b'" parse to the same values.
    params = [
        part.strip().strip('\'"')
        for part in url[start + 1:end].split(',')
    ]
    if len(params) < len(keys):
        raise ValueError(
            'expected {} arguments in {!r}, found {}'.format(
                len(keys), url, len(params)))

    return dict(zip(keys, params))
def parse_page(data, timeout=30):
    """Fetch a single post and extract its fields.

    Args:
        data: Form parameters identifying the post, posted as-is to the
            board's view endpoint.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the caller forever.

    Returns:
        A dict with keys ``title``, ``writer``, ``date``, ``content`` and
        ``files``. The first five rows of the post table are read
        positionally in the order title, writer, date, attachments, body.
        ``files`` is a list of ``(link text, absolute URL)`` tuples.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    resp = requests.post(
        url='https://work.mma.go.kr/caisBYIS/board/boardView.do',
        data=data,
        timeout=timeout,
    )
    # Fail with a clear HTTP error instead of trying to parse an error page.
    resp.raise_for_status()

    soup = bs4.BeautifulSoup(resp.text, 'html.parser')
    rows = soup.find('table', class_='brd_view').find_all('tr')

    # Attachment hrefs are prefixed with the site origin to make them
    # absolute links.
    files = [
        (a.text.strip(), 'https://work.mma.go.kr' + a['href'])
        for a in rows[3].td.find_all('a')
    ]
    return {
        'title': rows[0].td.text.strip(),
        'writer': rows[1].td.text.strip(),
        'date': rows[2].td.text.strip(),
        'content': '\n'.join(rows[4].td.strings),
        'files': files,
    }
def send_to_slack(title, writer, date, content, files,
                  webhook_url='SLACK_URL_HOOK', timeout=30):
    """Post one board entry to Slack as a message attachment.

    Args:
        title: Post title, used as the attachment title.
        writer: Post author; shown together with *date* as the author line.
        date: Post date string.
        content: Post body, used as the attachment text.
        files: Iterable of ``(name, url)`` tuples; each becomes one
            attachment field rendered as a Slack link.
        webhook_url: Slack webhook URL to post to. The default is a
            placeholder that must be replaced with a real hook.
        timeout: Seconds to wait for Slack before giving up.

    Raises:
        requests.RequestException: On connection failure, timeout, or when
            Slack answers with an HTTP error status. Raising here lets the
            caller avoid recording an undelivered post as sent.
    """
    resp = requests.post(
        webhook_url,
        json={
            'attachments': [
                {
                    'title': title,
                    'author_name': writer + ' ' + date,
                    'author_link': 'https://work.mma.go.kr/caisBYIS/main.do',
                    'text': content,
                    'fields': [
                        {
                            'value': '<{}|{}>'.format(file_url, file_name),
                            'short': False,
                        }
                        for file_name, file_url in files
                    ],
                }
            ],
        },
        timeout=timeout,
    )
    resp.raise_for_status()
def send_error_to_slack(content, webhook_url='SLACK_URL_HOOK', timeout=30):
    """Report an error to Slack, tagged with this machine's hostname.

    Delivery is best-effort: the HTTP status of Slack's answer is not
    checked.

    Args:
        content: Text to show in the attachment (for example a formatted
            traceback).
        webhook_url: Slack webhook URL to post to. The default is a
            placeholder that must be replaced with a real hook.
        timeout: Seconds to wait for Slack before giving up, so that error
            reporting cannot hang the process indefinitely.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    requests.post(
        webhook_url,
        json={
            'attachments': [
                {
                    'title': 'Exception raised',
                    'author_name': socket.gethostname(),
                    'text': content,
                }
            ],
        },
        timeout=timeout,
    )
def main():
    """Announce every board post that has not been announced before.

    Each post on the listing page is looked up in the pickle database by
    its ``(name, url)`` pair. Unknown posts are fetched, sent to Slack and
    then recorded.
    """
    db = PickleDatabase()
    for name, url in parse_board():
        key = (name, url)
        if db.find(key):
            continue
        send_to_slack(**parse_page(parse_url(url)))
        db.update(key)
        # Persist right after each delivery: if a later post in this loop
        # fails, the ones already sent are not announced again on the next
        # run.
        db.commit()
if __name__ == '__main__':
    # Poll the board every 10 minutes until interrupted with Ctrl-C.
    try:
        while True:
            try:
                main()
            except Exception:
                report = traceback.format_exc()
                try:
                    send_error_to_slack(report)
                except Exception:
                    # Reporting can fail for the same reason main() did
                    # (e.g. the network is down). Keep the loop alive and
                    # fall back to stderr; the chained traceback printed
                    # here includes the original error as context.
                    traceback.print_exc()
            time.sleep(600)
    except KeyboardInterrupt:
        # Covers Ctrl-C during both main() and the sleep.
        pass
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment