Last active
October 30, 2016 00:01
-
-
Save FinanceData/ebc7216b42f92e1d8285fd5670cbaf80 to your computer and use it in GitHub Desktop.
DART 06 - 보고서 PDF 다운로드
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# DART 06 보고서 PDF 문서\n", | |
"\n", | |
"<img src=\"https://dart.fss.or.kr/images/common/logo.gif\" >\n", | |
"\n", | |
"#### 2016 이승준 fb.com/plusjune " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import sqlite3\n", | |
"import pandas as pd\n", | |
"from datetime import date, timedelta" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Open a connection to the SQLite database file 'dart.db' (path is relative to the working directory).\n", | |
"con = sqlite3.connect('dart.db')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"count= 27\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>corp_name</th>\n", | |
" <th>doc_id</th>\n", | |
" <th>date</th>\n", | |
" <th>link</th>\n", | |
" <th>title</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>삼성전자</td>\n", | |
" <td>20160715000235</td>\n", | |
" <td>2016-07-15 16:10</td>\n", | |
" <td>http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20...</td>\n", | |
" <td>자기주식취득결과보고서</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>삼성전자</td>\n", | |
" <td>20160516003174</td>\n", | |
" <td>2016-05-16 17:06</td>\n", | |
" <td>http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20...</td>\n", | |
" <td>분기보고서 (2016.03)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>삼성전자</td>\n", | |
" <td>20160428000005</td>\n", | |
" <td>2016-04-28 08:35</td>\n", | |
" <td>http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20...</td>\n", | |
" <td>주요사항보고서(자기주식취득결정)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>삼성전자</td>\n", | |
" <td>20160418000349</td>\n", | |
" <td>2016-04-18 16:46</td>\n", | |
" <td>http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20...</td>\n", | |
" <td>자기주식취득결과보고서</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>삼성전자</td>\n", | |
" <td>20160330003536</td>\n", | |
" <td>2016-03-30 16:55</td>\n", | |
" <td>http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20...</td>\n", | |
" <td>사업보고서 (2015.12)</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" corp_name doc_id date \\\n", | |
"0 삼성전자 20160715000235 2016-07-15 16:10 \n", | |
"1 삼성전자 20160516003174 2016-05-16 17:06 \n", | |
"2 삼성전자 20160428000005 2016-04-28 08:35 \n", | |
"3 삼성전자 20160418000349 2016-04-18 16:46 \n", | |
"4 삼성전자 20160330003536 2016-03-30 16:55 \n", | |
"\n", | |
" link title \n", | |
"0 http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20... 자기주식취득결과보고서 \n", | |
"1 http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20... 분기보고서 (2016.03) \n", | |
"2 http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20... 주요사항보고서(자기주식취득결정) \n", | |
"3 http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20... 자기주식취득결과보고서 \n", | |
"4 http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20... 사업보고서 (2015.12) " | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# All Samsung Electronics (삼성전자) filings dated 2014 or later whose title contains '보고서' (report).\n", | |
"# The values are bound through '?' placeholders rather than %-formatted into the SQL text,\n", | |
"# so a name or keyword containing quotes or percent signs cannot break or alter the statement.\n", | |
"\n", | |
"sql = \"\"\"\n", | |
"    SELECT corp_name, doc_id, date, link, title\n", | |
"    FROM report\n", | |
"    WHERE corp_name = ? AND title LIKE ? AND date >= ?\n", | |
"    ORDER BY date DESC\n", | |
"    \"\"\"\n", | |
"# (company name, LIKE pattern for the title, lower bound compared against the text 'date' column)\n", | |
"query_params = ('삼성전자', '%보고서%', '2014')\n", | |
"\n", | |
"df = pd.read_sql(sql, con, params=query_params)\n", | |
"print ('count=', len(df))\n", | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20160715000235\n", | |
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20160516003174\n", | |
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20160428000005\n", | |
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20160418000349\n", | |
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20160330003536\n", | |
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20160314000622\n", | |
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20160314000623\n", | |
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20160226800754\n", | |
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20160128000009\n", | |
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20160115000355\n", | |
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20151116000976\n", | |
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20151029000003\n", | |
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20150817000859\n", | |
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20150515001379\n", | |
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20150331002915\n", | |
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20150326000677\n", | |
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20150326000676\n", | |
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20150305800561\n", | |
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20150129000015\n", | |
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20141126000232\n", | |
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20141114000755\n", | |
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20140814000743\n", | |
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20140515001057\n", | |
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20140331002427\n", | |
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20140327000954\n", | |
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20140327000956\n", | |
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20140306801032\n" | |
] | |
} | |
], | |
"source": [ | |
"# Print the report viewer URL stored in the 'link' column of every row.\n", | |
"for link in df['link']:\n", | |
"    print (link)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## 보고서 페이지\n", | |
"* http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20160516003174\n", | |
"\n", | |
"```javascript\n", | |
"\tvar treeRoot = new Tree.TreeNode({\n", | |
"\t\ttext: \"전체\",\n", | |
"\t\tid: \"root\",\n", | |
"\t\thref: \"javascript: viewDoc('20160516003174', '5146351', null, null, null, 'dart3.xsd')\"\n", | |
"\t});\n", | |
"\n", | |
"\twest = new Tree.TreePanel({\n", | |
"\t\tid:\"west-panel\",\n", | |
"\t\ttitle:\"문서목차\",\n", | |
"\t\tregion:\"west\",\n", | |
"```\n", | |
"\n", | |
"## 다운로드 페이지\n", | |
"* 다운로드 버튼 → 팝업의 \"분기보고서(5146351.pdf)\" 링크\n", | |
"* http://dart.fss.or.kr/pdf/download/pdf.do?rcp_no=20160516003174&dcm_no=5146351\n", | |
"* dcm_no를 얻으면 PDF 문서의 URL을 얻을 수 있다" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import requests\n", | |
"import os" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Extract the substring lying between two marker strings.\n", | |
"def find_between( s, first, last ):\n", | |
"    \"\"\"Return the part of `s` between the last occurrence of `first` and the next `last`.\n", | |
"\n", | |
"    s:     text to search.\n", | |
"    first: opening marker; its LAST occurrence in `s` is used.\n", | |
"    last:  closing marker; searched for after the end of `first`.\n", | |
"\n", | |
"    Raises ValueError when either marker cannot be found.\n", | |
"    \"\"\"\n", | |
"    marker_pos = s.rfind( first )\n", | |
"    if marker_pos < 0:\n", | |
"        # rfind() reports a miss as -1; without this check the slice would silently\n", | |
"        # start at len(first) - 1 and return unrelated text.\n", | |
"        raise ValueError('start marker not found: %r' % (first,))\n", | |
"    start = marker_pos + len( first )\n", | |
"    end = s.index( last, start )  # raises ValueError itself when `last` is missing\n", | |
"    return s[start:end]\n", | |
"\n", | |
"# wget: download `url` and save it to the local file `to`.\n", | |
"def wget(url, to=None):\n", | |
"    \"\"\"Stream `url` into a local file and return that file's name.\n", | |
"\n", | |
"    url: address fetched with requests.get(..., stream=True).\n", | |
"    to:  destination path; when omitted or empty, the last '/'-separated piece of `url` is used.\n", | |
"\n", | |
"    The HTTP status is not checked: whatever body the server sends (possibly nothing)\n", | |
"    is written as-is, so callers can detect a failed download by testing the file size.\n", | |
"    \"\"\"\n", | |
"    local_filename = to if to else url.split('/')[-1]\n", | |
"    r = requests.get(url, stream=True)\n", | |
"    try:\n", | |
"        # 'with' closes the file even if the transfer fails part-way through.\n", | |
"        with open(local_filename, 'wb') as f:\n", | |
"            for chunk in r.iter_content(chunk_size=1024):\n", | |
"                if chunk:\n", | |
"                    f.write(chunk)\n", | |
"    finally:\n", | |
"        # A streamed response keeps its connection open until it is closed explicitly.\n", | |
"        r.close()\n", | |
"    return local_filename" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"5222460 → 삼성전자_20160715000235.pdf\n", | |
"5146351 → 삼성전자_20160516003174.pdf\n", | |
"5101443 → 삼성전자_20160428000005.pdf\n", | |
"5090035 → 삼성전자_20160418000349.pdf\n", | |
"5026126 → 삼성전자_20160330003536.pdf\n", | |
"4981053 → 삼성전자_20160314000622.pdf\n", | |
"4981054 → 삼성전자_20160314000623.pdf\n", | |
"4958618 → 삼성전자_20160226800754.pdf\n", | |
"4958618 (zip) → 삼성전자_20160226800754.zip\n", | |
"4927972 → 삼성전자_20160128000009.pdf\n", | |
"4918477 → 삼성전자_20160115000355.pdf\n", | |
"4854164 → 삼성전자_20151116000976.pdf\n", | |
"4836008 → 삼성전자_20151029000003.pdf\n", | |
"4770098 → 삼성전자_20150817000859.pdf\n", | |
"4669876 → 삼성전자_20150515001379.pdf\n", | |
"4556583 → 삼성전자_20150331002915.pdf\n", | |
"4533078 → 삼성전자_20150326000677.pdf\n", | |
"4533077 → 삼성전자_20150326000676.pdf\n", | |
"4496515 → 삼성전자_20150305800561.pdf\n", | |
"4496515 (zip) → 삼성전자_20150305800561.zip\n", | |
"4459567 → 삼성전자_20150129000015.pdf\n", | |
"4395396 → 삼성전자_20141126000232.pdf\n", | |
"4384460 → 삼성전자_20141114000755.pdf\n", | |
"4299541 → 삼성전자_20140814000743.pdf\n", | |
"4215971 → 삼성전자_20140515001057.pdf\n", | |
"4114631 → 삼성전자_20140331002427.pdf\n", | |
"4096190 → 삼성전자_20140327000954.pdf\n", | |
"4096193 → 삼성전자_20140327000956.pdf\n", | |
"4060798 → 삼성전자_20140306801032.pdf\n", | |
"4060798 (zip) → 삼성전자_20140306801032.zip\n" | |
] | |
} | |
], | |
"source": [ | |
"# Download URL templates; both take (rcp_no, dcm_no).\n", | |
"pdf_link_tmpl = \"http://dart.fss.or.kr/pdf/download/pdf.do?rcp_no=%s&dcm_no=%s\"\n", | |
"zip_link_tmpl = \"http://dart.fss.or.kr/pdf/download/zip.do?rcp_no=%s&dcm_no=%s\"\n", | |
"\n", | |
"for ix, row in df.iterrows():\n", | |
"    rcp_no = row['doc_id']\n", | |
"\n", | |
"    # The viewer page contains a script call of the form\n", | |
"    #   viewDoc('<rcp_no>', '<dcm_no>', null, null, null, ...)\n", | |
"    # from which the document number (dcm_no) is cut out.\n", | |
"    r = requests.get(row['link'])\n", | |
"    start_str = \"javascript: viewDoc('\" + rcp_no + \"', '\"\n", | |
"    end_str = \"', null, null, null,\"\n", | |
"    dcm_no = find_between (r.text , start_str, end_str)\n", | |
"\n", | |
"    # First attempt: the PDF download.\n", | |
"    pdf_link = pdf_link_tmpl % (rcp_no, dcm_no)\n", | |
"    to = row['corp_name'] + '_' + rcp_no + '.pdf'\n", | |
"    print (dcm_no, ' → ', to)\n", | |
"    wget(pdf_link, to)\n", | |
"    if os.path.getsize(to) <= 0:\n", | |
"        # The PDF came back empty: discard it and fall back to the ZIP download.\n", | |
"        os.remove(to)\n", | |
"        zip_link = zip_link_tmpl % (rcp_no, dcm_no)\n", | |
"        to = row['corp_name'] + '_' + rcp_no + '.zip'\n", | |
"        print (dcm_no, '(zip) → ', to)\n", | |
"        # Must fetch zip_link here, not pdf_link: the PDF endpoint already returned nothing.\n", | |
"        wget(zip_link, to)\n", | |
"        if os.path.getsize(to) <= 0:\n", | |
"            os.remove(to)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment