Last active
August 29, 2015 14:18
-
-
Save QuantTraderEd/f29ab9336a8e5e4db456 to your computer and use it in GitHub Desktop.
web scraping
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # -*- coding: utf-8 -*- | |
| """ | |
| Created on Thu Apr 30 12:45:04 2015 | |
| @author: assa | |
| """ | |
def read_shortcd(filename='kospi.text', exclude=()):
    """Read the short codes from the first column of a tab-separated listing.

    The first line of the file is treated as a header and skipped. Every
    following non-blank line contributes its first tab-separated field.

    Args:
        filename: Path of the listing file to read.
        exclude: Optional iterable of short codes to leave out of the result.
            Defaults to excluding nothing.

    Returns:
        A list of short-code strings in file order.
    """
    # The original built a list of 'A018260', 'A028260', 'A088980'
    # (Samsung SDS, Cheil Industries, Macquarie Infrastructure) but never
    # applied it; pass those codes via ``exclude`` to actually drop them.
    excluded = frozenset(exclude)
    shortcd_lst = []
    with open(filename) as f:
        next(f, None)  # skip the header row (no-op on an empty file)
        for line in f:
            # Strip only the line terminator. Chopping a fixed number of
            # characters cuts into the data when a row has a single column
            # or when the last line has no trailing newline.
            line = line.rstrip('\r\n')
            if not line:
                continue
            shortcd = line.split('\t')[0]
            if shortcd not in excluded:
                shortcd_lst.append(shortcd)
    return shortcd_lst
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| __author__ = 'assa' | |
| import pandas as pd | |
| import pandas.io.data as web | |
| import sqlite3 as lite | |
| import datetime | |
| import pdb | |
| import matplotlib.pyplot as plt | |
| import matplotlib.style as style | |
| import statsmodels.formula.api as sm | |
# Regress a stock's daily return on the day-over-day change in the number of
# board posts stored in the FinNaverBoard table of finance_naver.db.
filename = 'finance_naver.db'
conn = lite.connect(filename)

start = datetime.datetime(2014, 4, 5)
end = datetime.datetime(2015, 4, 3)

shortcd = '015760'  # KEPCO
# Other codes the author tried: '000660' (SK Hynix), '005380' (Hyundai Motor),
# '005930' (Samsung Electronics).

df_stock = web.DataReader('%s.KS' % shortcd, 'yahoo', start, end)
df_stock = df_stock[df_stock['Volume'] > 0]  # keep only rows with non-zero volume
date_lst = df_stock.index

# Values are bound as parameters instead of being %-formatted into the SQL.
sqltext = """SELECT * FROM FinNaverBoard
             WHERE ShortCD = ? and Time > ? and Time < ? """

social_factor_list = [0]
try:
    # Count the posts between 15:00 on each date and 15:00 on the next date.
    for i, (nowdate, nextdate) in enumerate(zip(date_lst[:-1], date_lst[1:])):
        str_nowdate = nowdate.strftime('%Y.%m.%d 15:00')
        str_nextdate = nextdate.strftime('%Y.%m.%d 15:00')
        df = pd.read_sql(sqltext, conn,
                         params=(shortcd, str_nowdate, str_nextdate))
        print('%s %s %s' % (str_nextdate, df_stock['Close'].iloc[i], len(df)))
        social_factor_list.append(len(df))
finally:
    # Release the database handle even when a query fails.
    conn.close()

# NOTE: the leftover pdb.set_trace() breakpoint was removed here; it stopped
# every run of the script in the debugger.

# .copy() so the column assignments below write to an independent frame
# rather than to a slice of the downloaded data.
df_stock = df_stock[1:].copy()
# The series holds all but the last count, indexed by all but the last date;
# assignment aligns on the date index.
social_factor = pd.Series(social_factor_list[:-1], index=date_lst[:-1])
df_stock['social_factor'] = social_factor
df_stock['Ret'] = df_stock['Close'].pct_change()
df_stock['SF_Ret'] = df_stock['social_factor'].pct_change()
df_stock = df_stock[1:]
df_stock = df_stock[df_stock['SF_Ret'] < 3]  # drop rows whose post-count change is 3 or more

model = sm.ols(formula='Ret ~ SF_Ret', data=df_stock)
res = model.fit()
print(res.summary())

style.use('ggplot')
plt.plot(df_stock['SF_Ret'], df_stock['Ret'], 'bo')
plt.plot(df_stock['SF_Ret'], res.fittedvalues, 'r')
plt.legend(['Data', 'Fitted model'])
plt.xlabel('SF_Ret')
plt.ylabel('Stock_Ret')
plt.show()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # -*- coding: utf-8 -*- | |
| """ | |
| Created on Sun Nov 17 19:16:40 2013 | |
| @author: assa | |
| """ | |
| import urllib2 | |
| import bs4 | |
# Download one CZCE daily-data page and print every cell of table rows 1-14
# (the first row of the table is skipped), one cell per line.
url = 'http://www.czce.com.cn/portal/exchange/2013/datadaily/20131114.htm'
page_source = urllib2.urlopen(url).read()
document = bs4.BeautifulSoup(page_source)
data_table = document.find('table', attrs={"id": "senfe"})

selected_rows = data_table.find_all("tr")[1:15]
for table_row in selected_rows:
    for cell in table_row.find_all("td"):
        cell_text = cell.get_text()
        # A cell holding only a non-breaking space is printed as an empty line.
        print('' if cell_text == u'\xa0' else cell_text)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # -*- coding: utf-8 -*- | |
| """ | |
| Created on Sun Nov 17 19:16:40 2013 | |
| @author: assa | |
| """ | |
| import urllib2 | |
| import bs4 | |
# Print the text of the title cell of every centre-aligned row in the board
# table of one Naver Finance board page.
url = 'http://finance.naver.com/item/board.nhn?code=005930'
page_source = urllib2.urlopen(url).read()
board = bs4.BeautifulSoup(page_source).find('table', attrs={"class": "type2"})

for post_row in board.find_all('tr', attrs={'align': 'center'}):
    title_cell = post_row.find('td', attrs={"class": "title"})
    print(title_cell.get_text())
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # -*- coding: utf-8 -*- | |
| """ | |
| Created on Fri Apr 03 09:46:25 2015 | |
| @author: assa | |
| """ | |
| import urllib2 | |
| import bs4 | |
| import sqlite3 as lite | |
| import pdb | |
def get_table(shortcd, page_num):
    """Fetch one Naver Finance board page and return its post table.

    Args:
        shortcd: Stock code substituted into the ``code`` query parameter.
        page_num: Integer page number substituted into the ``page`` query
            parameter.

    Returns:
        The first ``<table class="type2">`` element found by BeautifulSoup,
        or ``None`` when the page contains no such table.
    """
    url = 'http://finance.naver.com/item/board.nhn?code=%s&page=%d' % (shortcd, page_num)
    response = urllib2.urlopen(url)
    try:
        htmltext = response.read()
    finally:
        # Close the HTTP response even if reading the body fails.
        response.close()
    soup = bs4.BeautifulSoup(htmltext)
    return soup.find('table', attrs={'class': 'type2'})
def chk_sqltable(conn):
    """Return True when a table named FinNaverBoard exists behind *conn*.

    The lookup goes through ``sqlite_master``; *conn* only needs an
    ``execute`` method that returns a cursor.
    """
    lookup = """SELECT name FROM sqlite_master
                WHERE type='table' and name='FinNaverBoard'"""
    # fetchone() gives None when the catalogue has no matching entry.
    return conn.execute(lookup).fetchone() is not None
def initDB(conn):
    """Create the FinNaverBoard table on *conn* unless it already exists.

    Existence is decided by ``chk_sqltable``; when the table is present the
    function does nothing.
    """
    if not chk_sqltable(conn):
        conn.execute("""CREATE TABLE FinNaverBoard (Id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
                        ShortCD TEXT,
                        Time TEXT,
                        Title TEXT,
                        Reply TEXT,
                        Rating TEXT,
                        USR_ID TEXT,
                        Query TEXT,
                        Like TEXT)""")
if __name__ == "__main__":
    # Scrape board pages 1..363 for one stock and store one row per post in
    # the FinNaverBoard table of finance_naver.db.
    filedbname = 'finance_naver.db'
    shortcd = '015760'
    # Built once; the statement is the same for every row.
    sqltext = """INSERT INTO FinNaverBoard(ShortCD,Time,Title,Reply,Rating,USR_ID,Query,Like)
                 VALUES(?, ?, ?, ? ,?, ?, ?, ?)"""

    conn = lite.connect(filedbname)
    try:
        initDB(conn)
        for i in range(1, 364):
            table = get_table(shortcd, i)
            if table is None:
                # The page has no post table; nothing to store for it.
                continue
            for line in table.find_all('tr', attrs={'align': 'center'}):
                title_cell = line.find('td', attrs={'class': 'title'})
                # The first cell of the row is stored in the Time column.
                timestamp = line.find('td').get_text()

                reply = line.find('span', attrs={'class': 'tah p9'})
                if not reply:
                    reply_num = '0'
                else:
                    # Drop the first and last character of the reply text
                    # (presumably surrounding brackets -- confirm on the page).
                    reply_num = reply.get_text()[1:-1]

                rating = line.find('td', attrs={'class': 'tc'}).get_text()
                text = title_cell.find('a').get('title')

                usr_id = line.find('td', attrs={'class': 'p11'}).get_text()
                # strip() treats its argument as a set of characters to trim.
                usr_id = usr_id.strip('\r\n\t\t\t\t')

                # The second 'tah p10 gray03' span is stored in the Query column.
                query_num = line.find_all('span', attrs={'class': 'tah p10 gray03'})[1].get_text()
                like_num = line.find('strong').get_text()

                print('%s %s %s' % (shortcd, timestamp, len(text)))
                row = (shortcd, timestamp, text, reply_num, rating,
                       usr_id, query_num, like_num)
                conn.execute(sqltext, row)
                conn.commit()
    finally:
        # Close the database even when a page fails to download or parse.
        conn.close()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # -*- coding: utf-8 -*- | |
| """ | |
| Created on Mon May 11 15:15:57 2015 | |
| @author: assa | |
| """ | |
| import urllib2 | |
| import bs4 | |
| import re | |
| import time | |
def getdailydata(shortcd, conn):
    """Scrape every daily-price page of one stock and store each row.

    The number of pages is read from the pager on page 1. Each page is then
    downloaded in turn and every price row is handed to ``insertdb``. A page
    that cannot be downloaded is reported and skipped.

    Args:
        shortcd: Short code with a one-character prefix (e.g. 'A130960').
            The prefix is dropped to build the ``code`` query parameter; the
            full value is what gets stored.
        conn: Database connection passed through to ``insertdb``.
    """
    code = shortcd[1:]
    url_template = 'http://finance.naver.com/item/sise_day.nhn?code=%s&page=%d'

    page_count = _last_page_number(_fetch_soup(url_template % (code, 1)))
    print('%s %s' % (shortcd, page_count))

    begin = time.time()
    for i in range(1, page_count + 1):
        try:
            soup = _fetch_soup(url_template % (code, i))
        except Exception:
            # Best effort: report the failed page and move on. Unlike a bare
            # ``except:``, this still lets Ctrl-C and SystemExit through.
            print('%s %s' % (shortcd, i))
            continue

        table = soup.find('table', attrs={"class": "type2"})
        if table is None:
            # No price table on this page; nothing to store.
            continue

        for line in table.find_all('tr', attrs={'onmouseout': 'mouseOut(this)'}):
            date_text = line.find('td', attrs={'align': 'center'}).get_text()
            date_text = date_text.replace('.', '-')

            # Remove thousands separators from every numeric cell.
            nums = [td.get_text().replace(',', '')
                    for td in line.find_all('td', attrs={'class': 'num'})]
            # Cell 1 is not stored (presumably the day-over-day change).
            close_text = nums[0]
            open_text, high_text, low_text, volume_text = nums[2:6]

            priceitem = (shortcd, date_text, open_text, high_text,
                         low_text, close_text, volume_text)
            insertdb(priceitem, conn)
    print('elapsed time:  %s' % (time.time() - begin))


def _fetch_soup(url):
    """Download *url* and return its parsed document, closing the response."""
    response = urllib2.urlopen(url)
    try:
        htmltext = response.read()
    finally:
        response.close()
    return bs4.BeautifulSoup(htmltext)


def _last_page_number(soup):
    """Return the page number in the pager's 'pgRR' link, or 1 if absent."""
    navi_table = soup.find('table', attrs={'class': 'Nnavi'})
    pgRR = navi_table.find('td', attrs={'class': 'pgRR'}) if navi_table is not None else None
    link = pgRR.find('a', href=True) if pgRR is not None else None
    if link is None:
        # NOTE(review): assumes a missing 'pgRR' link means the listing has
        # a single page -- confirm against a short listing.
        return 1
    # Capture only the digits so anything after page=N in the link is ignored.
    match = re.search(r'page=(\d+)', link['href'])
    return int(match.group(1)) if match else 1
def insertdb(priceitem, conn):
    """Insert one price row into StockPriceData and commit it immediately.

    Args:
        priceitem: Seven values in the order ShortCd, Date, Open, High, Low,
            Close, Volume.
        conn: Database connection used for the insert and the commit.
    """
    conn.execute(
        """INSERT INTO StockPriceData (ShortCd, Date, Open, High, Low, Close, Volume)
        VALUES(?, ?, ?, ?, ?, ?, ?)""",
        priceitem,
    )
    conn.commit()
if __name__ == '__main__':
    # Scrape daily prices for every code listed in kosdaq100.text into the
    # StockPriceData table of KOSDAQ100.db.
    import sqlite3 as lite
    from read_shortcd import read_shortcd

    conn = lite.connect('KOSDAQ100.db')
    try:
        # IF NOT EXISTS keys table creation on the table itself rather than
        # on whether the database file happens to exist.
        conn.execute("""CREATE TABLE IF NOT EXISTS StockPriceData (Id INTEGER NOT NULL PRIMARY KEY,
                        ShortCD TEXT,
                        Date TEXT,
                        Open TEXT,
                        High TEXT,
                        Low TEXT,
                        Close TEXT,
                        Volume TEXT)""")
        shortcd_lst = read_shortcd('kosdaq100.text')
        for shortcd in shortcd_lst:
            getdailydata(shortcd, conn)
    finally:
        # Close the database even if a scrape fails part-way through.
        conn.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment