Skip to content

Instantly share code, notes, and snippets.

@FinanceData
Last active October 30, 2016 00:01
Show Gist options
  • Save FinanceData/ebc7216b42f92e1d8285fd5670cbaf80 to your computer and use it in GitHub Desktop.
Save FinanceData/ebc7216b42f92e1d8285fd5670cbaf80 to your computer and use it in GitHub Desktop.
DART 06 - 보고서 PDF 다운로드
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# DART 06 보고서 PDF 문서\n",
"\n",
"<img src=\"https://dart.fss.or.kr/images/common/logo.gif\" >\n",
"\n",
"#### 2016 이승준 fb.com/plusjune "
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import sqlite3\n",
"import pandas as pd\n",
"from datetime import date, timedelta"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"con = sqlite3.connect('dart.db')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"count= 27\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>corp_name</th>\n",
" <th>doc_id</th>\n",
" <th>date</th>\n",
" <th>link</th>\n",
" <th>title</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>삼성전자</td>\n",
" <td>20160715000235</td>\n",
" <td>2016-07-15 16:10</td>\n",
" <td>http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20...</td>\n",
" <td>자기주식취득결과보고서</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>삼성전자</td>\n",
" <td>20160516003174</td>\n",
" <td>2016-05-16 17:06</td>\n",
" <td>http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20...</td>\n",
" <td>분기보고서 (2016.03)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>삼성전자</td>\n",
" <td>20160428000005</td>\n",
" <td>2016-04-28 08:35</td>\n",
" <td>http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20...</td>\n",
" <td>주요사항보고서(자기주식취득결정)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>삼성전자</td>\n",
" <td>20160418000349</td>\n",
" <td>2016-04-18 16:46</td>\n",
" <td>http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20...</td>\n",
" <td>자기주식취득결과보고서</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>삼성전자</td>\n",
" <td>20160330003536</td>\n",
" <td>2016-03-30 16:55</td>\n",
" <td>http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20...</td>\n",
" <td>사업보고서 (2015.12)</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" corp_name doc_id date \\\n",
"0 삼성전자 20160715000235 2016-07-15 16:10 \n",
"1 삼성전자 20160516003174 2016-05-16 17:06 \n",
"2 삼성전자 20160428000005 2016-04-28 08:35 \n",
"3 삼성전자 20160418000349 2016-04-18 16:46 \n",
"4 삼성전자 20160330003536 2016-03-30 16:55 \n",
"\n",
" link title \n",
"0 http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20... 자기주식취득결과보고서 \n",
"1 http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20... 분기보고서 (2016.03) \n",
"2 http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20... 주요사항보고서(자기주식취득결정) \n",
"3 http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20... 자기주식취득결과보고서 \n",
"4 http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20... 사업보고서 (2015.12) "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 삼성전자, 2014년 이후 제목에 '보고서'를 포함한 모든 공시\n",
"\n",
"sql = \"\"\"\n",
" SELECT corp_name, doc_id, date, link, title\n",
" FROM report \n",
" WHERE corp_name='%s' and title like '%%%s%%' and date >= '2014'\n",
" ORDER BY date DESC\n",
" \"\"\" % ('삼성전자', '보고서')\n",
"\n",
"df = pd.read_sql(sql, con) \n",
"print ('count=', len(df))\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20160715000235\n",
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20160516003174\n",
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20160428000005\n",
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20160418000349\n",
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20160330003536\n",
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20160314000622\n",
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20160314000623\n",
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20160226800754\n",
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20160128000009\n",
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20160115000355\n",
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20151116000976\n",
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20151029000003\n",
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20150817000859\n",
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20150515001379\n",
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20150331002915\n",
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20150326000677\n",
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20150326000676\n",
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20150305800561\n",
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20150129000015\n",
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20141126000232\n",
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20141114000755\n",
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20140814000743\n",
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20140515001057\n",
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20140331002427\n",
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20140327000954\n",
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20140327000956\n",
"http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20140306801032\n"
]
}
],
"source": [
"for ix, r in df.iterrows():\n",
" print (r['link'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 보고서 페이지\n",
"* http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20160516003174\n",
"\n",
"```javascript\n",
"\tvar treeRoot = new Tree.TreeNode({\n",
"\t\ttext: \"전체\",\n",
"\t\tid: \"root\",\n",
"\t\thref: \"javascript: viewDoc('20160516003174', '5146351', null, null, null, 'dart3.xsd')\"\n",
"\t});\n",
"\n",
"\twest = new Tree.TreePanel({\n",
"\t\tid:\"west-panel\",\n",
"\t\ttitle:\"문서목차\",\n",
"\t\tregion:\"west\",\n",
"```\n",
"\n",
"## 다운로드 페이지\n",
"* 다운로드 버튼 &rarr; 팝업의 \"분기보고서(5146351.pdf)\" 링크\n",
"* http://dart.fss.or.kr/pdf/download/pdf.do?rcp_no=20160516003174&dcm_no=5146351\n",
"* dcm_no을 얻으면 PDF 문서의 URL을 얻을 수 있다"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import requests\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# 사이 문자열 잘라내기\n",
"def find_between( s, first, last ):\n",
" start = s.rfind( first ) + len( first )\n",
" end = s.index( last, start )\n",
" return s[start:end]\n",
"\n",
"# wget: URL을 to로 저장\n",
"def wget(url, to=None):\n",
" local_filename = url.split('/')[-1]\n",
" if to:\n",
" local_filename = to\n",
" r = requests.get(url, stream=True)\n",
" f = open(local_filename, 'wb')\n",
" for chunk in r.iter_content(chunk_size=1024): \n",
" if chunk:\n",
" f.write(chunk)\n",
" f.flush()\n",
" return local_filename"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"5222460 → 삼성전자_20160715000235.pdf\n",
"5146351 → 삼성전자_20160516003174.pdf\n",
"5101443 → 삼성전자_20160428000005.pdf\n",
"5090035 → 삼성전자_20160418000349.pdf\n",
"5026126 → 삼성전자_20160330003536.pdf\n",
"4981053 → 삼성전자_20160314000622.pdf\n",
"4981054 → 삼성전자_20160314000623.pdf\n",
"4958618 → 삼성전자_20160226800754.pdf\n",
"4958618 (zip) → 삼성전자_20160226800754.zip\n",
"4927972 → 삼성전자_20160128000009.pdf\n",
"4918477 → 삼성전자_20160115000355.pdf\n",
"4854164 → 삼성전자_20151116000976.pdf\n",
"4836008 → 삼성전자_20151029000003.pdf\n",
"4770098 → 삼성전자_20150817000859.pdf\n",
"4669876 → 삼성전자_20150515001379.pdf\n",
"4556583 → 삼성전자_20150331002915.pdf\n",
"4533078 → 삼성전자_20150326000677.pdf\n",
"4533077 → 삼성전자_20150326000676.pdf\n",
"4496515 → 삼성전자_20150305800561.pdf\n",
"4496515 (zip) → 삼성전자_20150305800561.zip\n",
"4459567 → 삼성전자_20150129000015.pdf\n",
"4395396 → 삼성전자_20141126000232.pdf\n",
"4384460 → 삼성전자_20141114000755.pdf\n",
"4299541 → 삼성전자_20140814000743.pdf\n",
"4215971 → 삼성전자_20140515001057.pdf\n",
"4114631 → 삼성전자_20140331002427.pdf\n",
"4096190 → 삼성전자_20140327000954.pdf\n",
"4096193 → 삼성전자_20140327000956.pdf\n",
"4060798 → 삼성전자_20140306801032.pdf\n",
"4060798 (zip) → 삼성전자_20140306801032.zip\n"
]
}
],
"source": [
"pdf_link_tmpl = \"http://dart.fss.or.kr/pdf/download/pdf.do?rcp_no=%s&dcm_no=%s\"\n",
"zip_link_tmpl = \"http://dart.fss.or.kr/pdf/download/zip.do?rcp_no=%s&dcm_no=%s\"\n",
"\n",
"for ix, row in df.iterrows():\n",
" #print (row['doc_id'], row['link'])\n",
" rcp_no = row['doc_id']\n",
" r = requests.get(row['link'])\n",
" start_str = \"javascript: viewDoc('\" + row['doc_id'] + \"', '\"\n",
" end_str = \"', null, null, null,\"\n",
" dcm_no = find_between (r.text , start_str, end_str)\n",
" pdf_link = pdf_link_tmpl % (rcp_no, dcm_no)\n",
" to = row['corp_name'] + '_' + rcp_no + '.pdf'\n",
" print (dcm_no, ' → ', to)\n",
" wget(pdf_link, to)\n",
" if os.path.getsize(to) <= 0:\n",
" os.remove(to)\n",
" zip_link = zip_link_tmpl % (rcp_no, dcm_no)\n",
" to = row['corp_name'] + '_' + rcp_no + '.zip'\n",
" print (dcm_no, '(zip) → ', to)\n",
" wget(pdf_link, to)\n",
" if os.path.getsize(to) <= 0:\n",
" os.remove(to)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment