Skip to content

Instantly share code, notes, and snippets.

@QuantTraderEd
Last active August 29, 2015 14:18
Show Gist options
  • Select an option

  • Save QuantTraderEd/f29ab9336a8e5e4db456 to your computer and use it in GitHub Desktop.

Select an option

Save QuantTraderEd/f29ab9336a8e5e4db456 to your computer and use it in GitHub Desktop.
web scraping
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 30 12:45:04 2015
@author: assa
"""
def read_shortcd(filename='kospi.text'):
    """Read the short codes stored in the first column of a tab-separated file.

    The first line of the file is treated as a header and skipped. For every
    remaining non-blank line, the text before the first tab is collected.

    Args:
        filename: Path of the tab-separated text file to read.

    Returns:
        A list holding the first field of every data line, in file order.
        An empty or header-only file yields an empty list.
    """
    # NOTE(review): an exclusion list ['A018260', 'A028260', 'A088980']
    # (commented as Samsung SDS, Cheil Industries, Macquarie Infra) used to be
    # declared here but was never applied, so no codes are filtered out.
    # Confirm whether those codes were meant to be skipped.
    shortcd_lst = []
    with open(filename) as f:
        next(f, None)  # skip the header line; tolerate an empty file
        for line in f:
            # Strip only the line terminator. Slicing a fixed number of
            # characters would eat real data on single-column rows or on a
            # last line that has no trailing newline.
            line = line.rstrip('\r\n')
            if not line:
                continue  # a blank line carries no code
            shortcd_lst.append(line.split('\t')[0])
    return shortcd_lst
__author__ = 'assa'
import pandas as pd
import pandas.io.data as web
import sqlite3 as lite
import datetime
import pdb
import matplotlib.pyplot as plt
import matplotlib.style as style
import statsmodels.formula.api as sm
# Regress daily stock returns on the day-over-day change in the number of
# board posts stored in the FinNaverBoard table, then plot data and fit.
filename = 'finance_naver.db'
start = datetime.datetime(2014, 4, 5)
end = datetime.datetime(2015, 4, 3)
shortcd = '015760' #KEPCO
#shortcd = '000660' #SKHynics
#shortcd = '005380' #hyundaimotors
#shortcd = '005930' #samsungelec

df_stock = web.DataReader('%s.KS' % shortcd, 'yahoo', start, end)
# Keep only days with a non-zero traded volume.
df_stock = df_stock[df_stock['Volume'] > 0]
date_lst = df_stock.index

# Values are bound as parameters instead of being formatted into the SQL text.
sqltext = """SELECT * FROM FinNaverBoard
WHERE ShortCD = ? and Time > ? and Time < ? """

# The list starts with a 0 placeholder; one post count is appended for every
# pair of consecutive dates.
social_factor_list = [0]
conn = lite.connect(filename)
try:
    for i, (nowdate, nextdate) in enumerate(zip(date_lst[:-1], date_lst[1:])):
        # Window: 15:00 on the current date up to 15:00 on the next date.
        str_nowdate = nowdate.strftime('%Y.%m.%d 15:00')
        str_nextdate = nextdate.strftime('%Y.%m.%d 15:00')
        df = pd.read_sql(sqltext, conn,
                         params=(shortcd, str_nowdate, str_nextdate))
        print('%s %s %s' % (str_nextdate, df_stock['Close'].iloc[i], len(df)))
        social_factor_list.append(len(df))
finally:
    # Release the database handle even when a query fails.
    conn.close()

# Copy the slice so the column assignments below write to an independent frame.
df_stock = df_stock[1:].copy()
# NOTE(review): the series is indexed by date_lst[:-1] while df_stock now
# starts at the second date, so the final date has no entry in the series --
# confirm this alignment is intended.
social_factor = pd.Series(social_factor_list[:-1], index=date_lst[:-1])
df_stock['social_factor'] = social_factor
df_stock['Ret'] = df_stock['Close'].pct_change()
df_stock['SF_Ret'] = df_stock['social_factor'].pct_change()
# The first row has no previous row to compute a change from.
df_stock = df_stock[1:]
# Keep only rows whose post-count change is below 3 (i.e. below +300%).
df_stock = df_stock[df_stock['SF_Ret'] < 3]

model = sm.ols(formula='Ret ~ SF_Ret', data=df_stock)
res = model.fit()
print(res.summary())

style.use('ggplot')
plt.plot(df_stock['SF_Ret'], df_stock['Ret'], 'bo')
plt.plot(df_stock['SF_Ret'], res.fittedvalues, 'r')
plt.legend(['Data', 'Fitted model'])
plt.xlabel('SF_Ret')
plt.ylabel('Stock_Ret')
plt.show()
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 17 19:16:40 2013
@author: assa
"""
import urllib2
import bs4
# Download one CZCE daily-data page and print the cells of rows 1-14 of the
# table whose id is "senfe", one cell per line.
url = 'http://www.czce.com.cn/portal/exchange/2013/datadaily/20131114.htm'
html = urllib2.urlopen(url)
try:
    htmltext = html.read()
finally:
    # Close the HTTP response as soon as the body has been read.
    html.close()
soup = bs4.BeautifulSoup(htmltext)
table = soup.find('table', attrs={"id": "senfe"})
if table is None:
    # find() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on None further down.
    raise ValueError('table with id "senfe" not found at %s' % url)
for row in table.find_all("tr")[1:15]:
    for td in row.find_all("td"):
        td_text = td.get_text()
        # A cell holding only a non-breaking space is printed as an empty line.
        print('' if td_text == u'\xa0' else td_text)
    # Blank line after each row (placement assumed: the source indentation
    # was lost -- confirm).
    print('')
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 17 19:16:40 2013
@author: assa
"""
import urllib2
import bs4
# Download the first board page for stock code 005930 and print the text of
# the "title" cell of every centre-aligned row in the "type2" table.
url = 'http://finance.naver.com/item/board.nhn?code=005930'
response = urllib2.urlopen(url)
try:
    htmltext = response.read()
finally:
    # Close the HTTP response as soon as the body has been read.
    response.close()
soup = bs4.BeautifulSoup(htmltext)
table = soup.find('table', attrs={"class": "type2"})
if table is None:
    # find() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on None further down.
    raise ValueError('table with class "type2" not found at %s' % url)
for line in table.find_all('tr', attrs={'align': 'center'}):
    title = line.find('td', attrs={"class": "title"})
    text = title.get_text()
    print(text)
# -*- coding: utf-8 -*-
"""
Created on Fri Apr 03 09:46:25 2015
@author: assa
"""
import urllib2
import bs4
import sqlite3 as lite
import pdb
def get_table(shortcd,page_num):
    """Download one board page for a stock and return its "type2" table.

    Args:
        shortcd: Stock code placed in the ``code`` query parameter.
        page_num: Integer page number placed in the ``page`` query parameter.

    Returns:
        The first ``<table class="type2">`` element BeautifulSoup finds in
        the page, or ``None`` when the page contains no such table.
    """
    url = 'http://finance.naver.com/item/board.nhn?code=%s&page=%d'%(shortcd,page_num)
    response = urllib2.urlopen(url)
    try:
        htmltext = response.read()
    finally:
        # Close the HTTP response even if reading the body fails.
        response.close()
    soup = bs4.BeautifulSoup(htmltext)
    return soup.find('table', attrs={'class': 'type2'})
def chk_sqltable(conn):
    """Report whether the ``FinNaverBoard`` table exists in the database.

    Args:
        conn: Open sqlite3 connection to inspect.

    Returns:
        True when ``sqlite_master`` lists a table named ``FinNaverBoard``,
        False otherwise.
    """
    lookup = """SELECT name FROM sqlite_master
WHERE type='table' and name='FinNaverBoard'"""
    # fetchone() yields None when the query matched no row.
    return conn.execute(lookup).fetchone() is not None
def initDB(conn):
    """Create the ``FinNaverBoard`` table unless it is already present.

    Args:
        conn: Open sqlite3 connection on which the table is created.

    Returns:
        None. Nothing is executed when ``chk_sqltable`` reports that the
        table already exists.
    """
    if not chk_sqltable(conn):
        create_stmt = """CREATE TABLE FinNaverBoard (Id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
ShortCD TEXT,
Time TEXT,
Title TEXT,
Reply TEXT,
Rating TEXT,
USR_ID TEXT,
Query TEXT,
Like TEXT)"""
        conn.execute(create_stmt)
if __name__ == "__main__":
    # Scrape board pages 1-363 for one stock code and store one database row
    # per post row found on each page.
    filedbname = 'finance_naver.db'
    shortcd = '015760'
    # The statement never changes, so it is built once instead of once per row.
    sqltext = """INSERT INTO FinNaverBoard(ShortCD,Time,Title,Reply,Rating,USR_ID,Query,Like)
VALUES(?, ?, ?, ? ,?, ?, ?, ?)"""
    conn = lite.connect(filedbname)
    try:
        initDB(conn)
        for i in xrange(1,364):
            table = get_table(shortcd,i)
            for line in table.find_all('tr', attrs={'align': 'center'}):
                # The text of the row's first cell is stored as the post time.
                timestamp = line.find('td').get_text()
                reply = line.find('span', attrs={'class': 'tah p9'})
                if reply is None:
                    # No reply span on this row: store a count of '0'.
                    reply_num = '0'
                else:
                    # Drop the first and last character of the span text
                    # (presumably surrounding brackets -- confirm).
                    reply_num = reply.get_text()[1:-1]
                rating = line.find('td', attrs={'class': 'tc'}).get_text()
                title_cell = line.find('td', attrs={'class': 'title'})
                # The post title is read from the link's "title" attribute.
                text = title_cell.find('a').get('title')
                usr_id = line.find('td', attrs={'class': 'p11'}).get_text()
                usr_id = usr_id.strip('\r\n\t\t\t\t')
                # The second 'tah p10 gray03' span is stored in the Query column.
                query_num = line.find_all('span', attrs={'class':'tah p10 gray03'})[1].get_text()
                like_num = line.find('strong').get_text()
                print('%s %s %s' % (shortcd, timestamp, len(text)))
                row = (shortcd, timestamp, text, reply_num, rating, usr_id, query_num, like_num)
                conn.execute(sqltext, row)
            # One commit per scraped page.
            conn.commit()
    finally:
        # Close the connection even when a page fails to download or parse.
        conn.close()
# -*- coding: utf-8 -*-
"""
Created on Mon May 11 15:15:57 2015
@author: assa
"""
import urllib2
import bs4
import re
import time
def _read_url(url):
    """Return the body of *url*, closing the HTTP response afterwards."""
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        response.close()


def getdailydata(shortcd, conn):
    """Scrape every daily-price page of one stock and store each row.

    The first page is fetched to read the page count from the link inside
    the ``pgRR`` cell of the ``Nnavi`` table. Every page from 1 to that count
    is then fetched, and each price row is passed to ``insertdb``.

    Args:
        shortcd: Stock code whose first character is dropped before it is
            put in the URL (e.g. 'A130960' is requested as code=130960).
            The full value is stored with each row.
        conn: Open database connection handed to ``insertdb``.

    Returns:
        None. Prints the code and page count, the code and page number of
        every page that could not be downloaded, and the elapsed time.

    Raises:
        ValueError: If the navigation link carries no ``page=<number>``.
    """
    code = shortcd[1:]
    url = 'http://finance.naver.com/item/sise_day.nhn?code=%s&page=1' % code
    soup = bs4.BeautifulSoup(_read_url(url))
    navi_table = soup.find('table', attrs={'class': 'Nnavi'})
    pgRR = navi_table.find('td', attrs={'class': 'pgRR'})
    link_text = pgRR.find('a', href=True)['href']
    # Capture only the digits after "page=" so that anything following the
    # number in the href cannot break the int() conversion.
    m = re.search(r'page=(\d+)', link_text)
    if m is None:
        raise ValueError('no page number in navigation link: %r' % link_text)
    page_count = int(m.group(1))
    print('%s %s' % (shortcd, page_count))

    begin = time.time()
    for i in xrange(1, page_count+1):
        url = 'http://finance.naver.com/item/sise_day.nhn?code=%s&page=%d' % (code, i)
        try:
            htmltext = _read_url(url)
        except Exception:
            # Best effort: report the page that failed and move on.
            # KeyboardInterrupt and SystemExit are no longer swallowed.
            print('%s %s' % (shortcd, i))
            continue
        soup = bs4.BeautifulSoup(htmltext)
        table = soup.find('table', attrs={"class": "type2"})
        for line in table.find_all('tr', attrs={'onmouseout': 'mouseOut(this)'}):
            date_line = line.find('td', attrs={'align': 'center'})
            # Dates are stored with '-' in place of '.'.
            date_text = date_line.get_text().replace('.', '-')
            OHLC_lines = line.find_all('td', attrs={'class': 'num'})
            # Cells 0, 2, 3, 4, 5 are close, open, high, low, volume; cell 1
            # is not stored. Thousands separators are removed.
            close_text, open_text, high_text, low_text, volume_text = [
                OHLC_lines[k].get_text().replace(',', '')
                for k in (0, 2, 3, 4, 5)
            ]
            priceitem = (shortcd, date_text, open_text, high_text, low_text, close_text, volume_text)
            insertdb(priceitem, conn)
    print('elapsed time:  %s' % (time.time() - begin))
def insertdb(priceitem, conn):
    """Insert one price row into ``StockPriceData`` and commit immediately.

    Args:
        priceitem: Sequence of seven values bound, in order, to the columns
            ShortCd, Date, Open, High, Low, Close and Volume.
        conn: Open sqlite3 connection that receives the row.

    Returns:
        None.
    """
    conn.execute(
        """INSERT INTO StockPriceData (ShortCd, Date, Open, High, Low, Close, Volume)
VALUES(?, ?, ?, ?, ?, ?, ?)""",
        priceitem,
    )
    # Every row is committed on its own.
    conn.commit()
if __name__ == '__main__':
    import sqlite3 as lite
    import os
    from read_shortcd import read_shortcd

    # Download the daily prices of every code listed in kosdaq100.text into
    # the StockPriceData table of KOSDAQ100.db.
    dbname = 'KOSDAQ100.db'
    # The table is created only when the database file does not exist yet,
    # so the check is made before connecting.
    needs_schema = not os.path.isfile(dbname)
    conn = lite.connect(dbname)
    try:
        if needs_schema:
            conn.execute("""CREATE TABLE StockPriceData (Id INTEGER NOT NULL PRIMARY KEY,
ShortCD TEXT,
Date TEXT,
Open TEXT,
High TEXT,
Low TEXT,
Close TEXT,
Volume TEXT)""")
        shortcd_lst = read_shortcd('kosdaq100.text')
        for shortcd in shortcd_lst:
            # shortcd = 'A130960'
            getdailydata(shortcd, conn)
    finally:
        # Close the connection whether or not the scrape finished.
        conn.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment