Skip to content

Instantly share code, notes, and snippets.

@juice500ml
Created March 31, 2020 02:56
Show Gist options
  • Save juice500ml/24e27cd8d395b6635fdf6acec9bb67d2 to your computer and use it in GitHub Desktop.
Save juice500ml/24e27cd8d395b6635fdf6acec9bb67d2 to your computer and use it in GitHub Desktop.
mma_crawler.service
requests
beautifulsoup4
import pathlib
import pickle
import socket
import time
import traceback
import requests
import bs4
class PickleDatabase():
    """A tiny persistent set of hashable values backed by a pickle file.

    The whole set is loaded into memory on construction; changes made with
    ``update`` are only persisted when ``commit`` is called.

    NOTE: the file is read with ``pickle.load``, which can execute arbitrary
    code. Only point this class at a file the program itself has written.
    """

    def __init__(self, filename='db.pkl'):
        """Load the set stored in *filename*, creating an empty one if absent.

        Args:
            filename: Path of the pickle file holding the set.
        """
        self._filename = filename
        if not pathlib.Path(filename).is_file():
            self._write(set())
        with open(filename, 'rb') as f:
            self._db = pickle.load(f)

    def _write(self, data):
        """Pickle *data* to the database file without ever leaving it partial."""
        target = pathlib.Path(self._filename)
        # Write to a sibling temp file first and then swap it in: opening the
        # real file with 'wb' truncates it immediately, so an interruption
        # mid-dump would leave a file that pickle.load can no longer read.
        scratch = target.with_name(target.name + '.tmp')
        with open(scratch, 'wb') as f:
            pickle.dump(data, f)
        scratch.replace(target)

    def commit(self):
        """Persist the current in-memory set to disk. Returns ``None``."""
        self._write(self._db)

    def update(self, value):
        """Add *value* to the in-memory set (not persisted until ``commit``)."""
        self._db.add(value)

    def find(self, value):
        """Return ``True`` if *value* is in the set, ``False`` otherwise."""
        return value in self._db
def parse_board(timeout=30):
    """Fetch the board listing page and return the posts found in it.

    POSTs the fixed board parameters (``pageUnit`` of ``'100'``) to the
    board-list endpoint and reads the rows of the ``brd_list_n`` table.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller indefinitely.

    Returns:
        A list of ``(name, url)`` tuples, one per table row. ``name`` is the
        stripped text of the link in the row's first cell and ``url`` is that
        link's raw ``onclick`` attribute.

    Raises:
        requests.RequestException: On timeout, connection failure, or an
            HTTP error status.
    """
    response = requests.post(
        url='https://work.mma.go.kr/caisBYIS/board/boardList.do',
        data={
            'gesipan_gbcd': '13',
            'tmpl_id': '1',
            'menu_id': 'm_m8_6',
            'pageUnit': '100',
        },
        timeout=timeout,
    )
    # Fail with a clear HTTP error instead of trying to parse an error page.
    response.raise_for_status()

    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    rows = soup.find('table', class_='brd_list_n').tbody.find_all('tr')

    posts = []
    for row in rows:
        link = row.find_all('td')[0].a
        posts.append((link.text.strip(), link['onclick']))
    return posts
def parse_url(url):
    """Turn a board link's ``onclick`` JavaScript call into POST parameters.

    Example:
        ``"javascript:fnBoardView('m_m8_6','13','2000127135','','','1','10');"``
        yields the arguments
        ``['m_m8_6', '13', '2000127135', '', '', '1', '10']``, which are then
        mapped, in order, onto the form field names.

    Args:
        url: The ``onclick`` attribute text, containing a call whose
            arguments are single-quoted strings.

    Returns:
        A dict with the keys ``menu_id``, ``gesipan_gbcd``, ``ilryeon_no``,
        ``searchCondition``, ``searchKeyword``, ``pageIndex`` and
        ``pageUnit``. Arguments beyond the seventh are ignored.

    Raises:
        IndexError: If fewer than seven quoted arguments are present.
    """
    # Function-local import so the file-level import block is left untouched.
    import re

    keys = (
        'menu_id',
        'gesipan_gbcd',
        'ilryeon_no',
        'searchCondition',
        'searchKeyword',
        'pageIndex',
        'pageUnit',
    )

    # Restrict the search to the text between the outermost parentheses.
    start = url.find('(')
    end = url.rfind(')')
    arguments = url[start + 1:end] if 0 <= start < end else ''

    # Pull out each quoted argument directly. Splitting on ',' and slicing
    # off the first/last character breaks as soon as there is whitespace
    # after a comma or a comma inside a quoted value.
    params = re.findall(r"'([^']*)'", arguments)

    if len(params) < len(keys):
        raise IndexError(
            'expected {} quoted arguments in {!r}, found {}'.format(
                len(keys), url, len(params)
            )
        )
    return dict(zip(keys, params))
def parse_page(data, timeout=30):
    """Fetch a single board post and extract its fields.

    POSTs *data* to the board-view endpoint and reads the rows of the
    ``brd_view`` table by position: rows 0-2 supply title, writer and date,
    row 3 the attachment links, and row 4 the body text.

    Args:
        data: Form fields identifying the post (as produced by
            ``parse_url``).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller indefinitely.

    Returns:
        A dict with the keys ``title``, ``writer``, ``date``, ``content``
        and ``files``. ``files`` is a list of ``(link_text, absolute_url)``
        tuples; ``content`` joins the body cell's text fragments with
        newlines.

    Raises:
        requests.RequestException: On timeout, connection failure, or an
            HTTP error status.
    """
    response = requests.post(
        url='https://work.mma.go.kr/caisBYIS/board/boardView.do',
        data=data,
        timeout=timeout,
    )
    # Fail with a clear HTTP error instead of trying to parse an error page.
    response.raise_for_status()

    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    rows = soup.find('table', class_='brd_view').find_all('tr')

    # Attachment hrefs are site-relative, so prefix the host.
    files = [
        (anchor.text.strip(), 'https://work.mma.go.kr' + anchor['href'])
        for anchor in rows[3].td.find_all('a')
    ]

    return {
        'title': rows[0].td.text.strip(),
        'writer': rows[1].td.text.strip(),
        'date': rows[2].td.text.strip(),
        'content': '\n'.join(rows[4].td.strings),
        'files': files,
    }
def send_to_slack(title, writer, date, content, files, timeout=30):
    """Post one board entry to the Slack webhook as a message attachment.

    Args:
        title: Post title, used as the attachment title.
        writer: Post author; shown together with *date* as the author name.
        date: Post date text.
        content: Post body, used as the attachment text.
        files: Iterable of ``(file_name, file_url)`` pairs; each becomes one
            attachment field rendered as a Slack link.
        timeout: Seconds to wait for the webhook before giving up.

    Raises:
        requests.RequestException: On timeout, connection failure, or an
            HTTP error status. Raising matters here: ``main`` records a post
            as seen only after this function returns, so a rejected webhook
            call must not look like a successful delivery.
    """
    # Distinct names inside the comprehension so the ``title`` parameter is
    # not shadowed by the per-file name.
    file_fields = [
        {'value': '<{}|{}>'.format(file_url, file_name), 'short': False}
        for file_name, file_url in files
    ]

    response = requests.post(
        'SLACK_URL_HOOK',
        json={
            'attachments': [
                {
                    'title': title,
                    'author_name': writer + ' ' + date,
                    'author_link': 'https://work.mma.go.kr/caisBYIS/main.do',
                    'text': content,
                    'fields': file_fields,
                }
            ],
        },
        timeout=timeout,
    )
    response.raise_for_status()
def send_error_to_slack(content, timeout=10):
    """Report an error to the Slack webhook, on a best-effort basis.

    The message is titled ``'Exception raised'`` and carries this machine's
    hostname as the author name.

    This function is called from an exception handler, so it must not raise
    on delivery problems: if the webhook cannot be reached or rejects the
    request, *content* and the delivery failure are written to stderr
    instead.

    Args:
        content: Text of the error report (typically a formatted traceback).
        timeout: Seconds to wait for the webhook before giving up.
    """
    # Function-local import so the file-level import block is left untouched.
    import sys

    try:
        response = requests.post(
            'SLACK_URL_HOOK',
            json={
                'attachments': [
                    {
                        'title': 'Exception raised',
                        'author_name': socket.gethostname(),
                        'text': content,
                    }
                ],
            },
            timeout=timeout,
        )
        response.raise_for_status()
    except requests.RequestException:
        # The outage being reported often also prevents reaching Slack;
        # keep the report visible locally rather than losing it or crashing.
        print(content, file=sys.stderr)
        traceback.print_exc()
def main():
    """Run one polling pass: notify Slack about every post not seen before.

    Loads the set of already-notified ``(name, url)`` pairs, walks the
    current board listing, and for each unseen post fetches its page, sends
    it to Slack, and records it.

    Any exception from fetching, parsing or sending propagates to the
    caller; posts handled before the failure stay recorded.
    """
    db = PickleDatabase()
    for name, url in parse_board():
        key = (name, url)
        if db.find(key):
            continue
        send_to_slack(**parse_page(parse_url(url)))
        db.update(key)
        # Persist right after each successful notification so that a failure
        # on a later post cannot cause already-sent posts to be sent again
        # on the next pass.
        db.commit()
if __name__ == '__main__':
    # Poll forever: one pass every 600 seconds until interrupted.
    try:
        while True:
            try:
                main()
            except Exception:
                report = traceback.format_exc()
                try:
                    send_error_to_slack(report)
                except Exception:
                    # Reporting failed too (e.g. the network is down). Log
                    # locally and keep polling rather than let the reporter
                    # terminate the loop.
                    traceback.print_exc()
            time.sleep(600)
    except KeyboardInterrupt:
        # Ctrl-C exits quietly whether it arrives during a pass or during
        # the sleep between passes.
        pass
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment