QuantTraderEd · August 29, 2015 14:18
diff --git a/read_shortcd.py b/read_shortcd.py
 # -*- coding: utf-8 -*-
 """
 Created on Thu Apr 30 12:45:04 2015

 @author: assa
 """


 def read_shortcd(filename='kospi.text'):
    """Read the short stock codes from a tab-separated listing file.

    The first line of the file is treated as a header and skipped.  For
    every following line the text before the first tab is collected.

    Args:
        filename: Path of the tab-separated text file to read.

    Returns:
        A list holding the first column of each non-blank data line, in
        file order.
    """
    # NOTE(review): the original defined, but never applied, an exception
    # list for Samsung SDS, Cheil Industries and Macquarie Infra
    # ('A018260', 'A028260', 'A088980'); no filtering is done here either.
    shortcd_lst = []

    with open(filename) as f:
        next(f, None)  # discard the header line (a no-op on an empty file)
        for line in f:
            # Strip only the line terminator.  Blindly slicing characters
            # off the end corrupts single-column lines and a final line
            # that has no trailing newline.
            shortcd = line.rstrip('\r\n').split('\t')[0]
            if shortcd:  # ignore blank lines
                shortcd_lst.append(shortcd)

    return shortcd_lst
diff --git a/web_scraping_analysis.py b/web_scraping_analysis.py
 __author__ = 'assa'

 import pandas as pd
 import pandas.io.data as web
 import sqlite3 as lite
 import datetime
 import pdb
 import matplotlib.pyplot as plt
 import matplotlib.style as style
 import statsmodels.formula.api as sm


 # SQLite database that is queried below for rows of the FinNaverBoard table.
 filename = 'finance_naver.db'
 conn = lite.connect(filename)

 # Date window requested from the price data source.
 start = datetime.datetime(2014, 4, 5)
 end = datetime.datetime(2015, 4, 3)

 shortcd = '015760'  # KEPCO
 #shortcd = '000660'  # SK Hynix
 #shortcd = '005380'  # Hyundai Motor
 #shortcd = '005930'  # Samsung Electronics

 # Daily prices for the '<shortcd>.KS' ticker from the 'yahoo' source;
 # rows whose Volume is not positive are dropped.
 df_stock = web.DataReader('%s.KS'%shortcd, 'yahoo', start, end)
 df_stock = df_stock[df_stock['Volume'] > 0]
 date_lst = df_stock.index

 # social_factor_list[k] is the number of board rows found between
 # date_lst[k-1] 15:00 and date_lst[k] 15:00; the first entry is seeded with 0.
 social_factor_list = [0]
 for i in xrange(len(date_lst)-1):
    nowdate = date_lst[i]
    nextdate = date_lst[i+1]

    # Window bounds rendered as 'YYYY.MM.DD 15:00' strings.
    # NOTE(review): the query compares Time as text, so this presumably relies
    # on the stored timestamps using the same format -- confirm.
    str_nowdate = nowdate.strftime('%Y.%m.%d 15:00')
    str_nextdate = nextdate.strftime('%Y.%m.%d 15:00')

    sqltext = """SELECT * FROM FinNaverBoard
                WHERE ShortCD = '%s' and Time > '%s' and Time < '%s' """%(shortcd, str_nowdate, str_nextdate)
    df = pd.read_sql(sqltext, conn)

    # Progress line: window end, a close price and the row count of the window.
    print str_nextdate ,df_stock['Close'][i],len(df)
    social_factor_list.append(len(df))

 conn.close()
 # NOTE(review): this breakpoint stops every run in the debugger.
 pdb.set_trace()

 # Drop the first trading day, then attach the row counts by date.
 # NOTE(review): the series is built from social_factor_list[:-1] on
 # date_lst[:-1], so the last trading day has no matching entry and
 # presumably ends up as NaN after index alignment -- confirm this is intended.
 df_stock = df_stock[1:]
 social_factor = pd.Series(social_factor_list[:-1],index=date_lst[:-1])
 df_stock['social_factor'] = social_factor
 # Row-over-row percentage change of the close price and of the row count.
 df_stock['Ret'] = df_stock['Close'].pct_change()
 df_stock['SF_Ret'] = df_stock['social_factor'].pct_change()

 # Skip the first remaining row, keep only rows with SF_Ret below 3, then fit
 # an ordinary-least-squares model of Ret on SF_Ret.
 df_stock = df_stock[1:]
 df_stock = df_stock[df_stock['SF_Ret'] < 3]
 model = sm.ols(formula='Ret ~ SF_Ret', data=df_stock)
 res = model.fit()

 print res.summary()

 style.use('ggplot')

 # Plot the observations as blue dots and the fitted values as a red line.
 #df_stock.plot(kind='scatter', x='SF_Ret', y='Ret')
 plt.plot(df_stock['SF_Ret'], df_stock['Ret'],'bo')
 plt.plot(df_stock['SF_Ret'], res.fittedvalues,'r')
 plt.legend(['Data', 'Fitted model'])
 plt.xlabel('SF_Ret')
 plt.ylabel('Stock_Ret')
 plt.show()



diff --git a/web_scraping_test1.py b/web_scraping_test1.py
 # -*- coding: utf-8 -*-
 """
 Created on Sun Nov 17 19:16:40 2013

 @author: assa
 """

 import urllib2
 import bs4

 url = 'http://www.czce.com.cn/portal/exchange/2013/datadaily/20131114.htm'
 html = urllib2.urlopen(url)
 htmltext = html.read()

 soup = bs4.BeautifulSoup(htmltext)
 table = soup.find('table', attrs={"id": "senfe"})

 for row in table.find_all("tr")[1:15]:
    for td in row.find_all("td"):
        td_text = td.get_text()
        if td_text == u'\xa0':
            print ''
        else:
            print td_text
    print
diff --git a/web_scraping_test2.py b/web_scraping_test2.py
 # -*- coding: utf-8 -*-
 """
 Created on Sun Nov 17 19:16:40 2013

 @author: assa
 """

 import urllib2
 import bs4

 url = 'http://finance.naver.com/item/board.nhn?code=005930'
 response = urllib2.urlopen(url)
 htmltext = response.read()

 soup = bs4.BeautifulSoup(htmltext)
 table = soup.find('table', attrs={"class": "type2"})

 for line in table.find_all('tr', attrs={'align': 'center'}):
    title = line.find('td', attrs={"class": "title"})
    text = title.get_text()
    print text
diff --git a/web_scraping_test3.py b/web_scraping_test3.py
 # -*- coding: utf-8 -*-
 """
 Created on Fri Apr 03 09:46:25 2015

 @author: assa
 """

 import urllib2
 import bs4
 import sqlite3 as lite
 import pdb


 def get_table(shortcd,page_num):
    """Download one board page and return its 'type2' table element.

    Args:
        shortcd: Stock code placed in the ``code`` query parameter.
        page_num: Integer page number placed in the ``page`` parameter.

    Returns:
        The first <table class="type2"> element found by BeautifulSoup,
        or None when the page has no such table.
    """
    url = 'http://finance.naver.com/item/board.nhn?code=%s&page=%d'%(shortcd,page_num)
    response = urllib2.urlopen(url)
    try:
        htmltext = response.read()
    finally:
        # Release the HTTP connection even when reading fails.
        response.close()
    soup = bs4.BeautifulSoup(htmltext)
    return soup.find('table', attrs={'class': 'type2'})

 def chk_sqltable(conn):
    """Report whether the FinNaverBoard table exists in the database.

    Args:
        conn: Open sqlite3 connection to inspect.

    Returns:
        True when sqlite_master lists a table named 'FinNaverBoard',
        otherwise False.
    """
    sqltext = """SELECT name FROM sqlite_master 
                    WHERE type='table' and name='FinNaverBoard'"""
    found = conn.execute(sqltext).fetchone()
    return bool(found)
        
 def initDB(conn):
    """Create the FinNaverBoard table unless the database already has it.

    Calling this repeatedly is harmless: an existing table and its rows
    are left untouched.

    Args:
        conn: Open sqlite3 connection; the table is created in its database.
    """
    # IF NOT EXISTS lets SQLite perform the existence check and the creation
    # in one statement, so no separate sqlite_master lookup is needed.
    conn.execute("""CREATE TABLE IF NOT EXISTS FinNaverBoard (
                        Id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
                        ShortCD TEXT,
                        Time TEXT,
                        Title TEXT,
                        Reply TEXT,
                        Rating TEXT,
                        USR_ID TEXT,
                        Query TEXT,
                        Like TEXT)""")


 if __name__ == "__main__":    
    
    filedbname = 'finance_naver.db'    
    conn = lite.connect(filedbname)
    initDB(conn)
    shortcd = '015760'
    
    for i in xrange(1,364):
        table = get_table(shortcd,i)
        for line in table.find_all('tr', attrs={'align': 'center'}):
            item = line.find('td', attrs={'class': 'title'})
            timestamp = line.find('td').get_text()
            reply = line.find('span', attrs={'class': 'tah p9'})    
            if not reply:
                reply_num = '0'
            else:
                reply_num = reply.get_text()
                reply_num = reply_num[1:-1]
            
            rating = line.find('td', attrs={'class': 'tc'})
            rating = rating.get_text()
            
            title = item.find('a')
            text = title.get('title')
            #text = text.encode('utf-8')
            
            usr_id = line.find('td', attrs={'class': 'p11'})
            usr_id = usr_id.get_text()
            usr_id = usr_id.strip('\r\n\t\t\t\t')
            
            query = line.find_all('span', attrs={'class':'tah p10 gray03'})[1]
            query_num = query.get_text()
            
            like = line.find('strong')
            like_num = like.get_text()
            #print timestamp, text, reply_num, rating, usr_id, query_num, like_num
            print shortcd, timestamp, len(text)
            item = (shortcd, timestamp, text, reply_num, rating, usr_id, query_num, like_num)            
            sqltext = """INSERT INTO FinNaverBoard(ShortCD,Time,Title,Reply,Rating,USR_ID,Query,Like) 
                                               VALUES(?, ?, ?, ? ,?, ?, ?, ?)""" 
            conn.execute(sqltext,item)
            conn.commit()
        pass
    
    conn.close()
diff --git a/web_scraping_test5.py b/web_scraping_test5.py
 # -*- coding: utf-8 -*-
 """
 Created on Mon May 11 15:15:57 2015

 @author: assa
 """

 import urllib2
 import bs4
 import re
 import time


 def getdailydata(shortcd, conn):
    # shortcd = 'A130960'
    # url = 'http://finance.naver.com/item/sise.nhn?code=130960'
    url = 'http://finance.naver.com/item/sise_day.nhn?code=%s&page=1' % shortcd[1:]
    response = urllib2.urlopen(url)
    htmltext = response.read()
    soup = bs4.BeautifulSoup(htmltext)
    
    navi_table = soup.find('table', attrs={'class': 'Nnavi'})
    pgRR = navi_table.find('td', attrs={'class': 'pgRR'})
    result = pgRR.find('a', href=True)
    link_text = result['href']
    p = re.compile('page=')
    m = p.search(link_text)
    page_count = int(link_text[m.end():])
    
    print shortcd, page_count
    
    begin = time.time()

    for i in xrange(1, page_count+1):
        url = 'http://finance.naver.com/item/sise_day.nhn?code=%s&page=%d' % (shortcd[1:],i)
        try:
            response = urllib2.urlopen(url)
        except :
            print shortcd, i
            continue
        htmltext = response.read()
        soup = bs4.BeautifulSoup(htmltext)
        table = soup.find('table', attrs={"class": "type2"})
        
        for line in table.find_all('tr', attrs={'onmouseout': 'mouseOut(this)'}):
            date_line = line.find('td', attrs={'align': 'center'})
            date_text = date_line.get_text()
            date_text = date_text.replace('.','-')
            OHLC_lines = line.find_all('td', attrs={'class': 'num'})
            close_text = OHLC_lines[0].get_text()
            open_text = OHLC_lines[2].get_text()
            high_text = OHLC_lines[3].get_text()
            low_text = OHLC_lines[4].get_text()
            volume_text = OHLC_lines[5].get_text()
            close_text = close_text.replace(',','')            
            open_text = open_text.replace(',','')            
            high_text = high_text.replace(',','')            
            low_text = low_text.replace(',','')            
            volume_text = volume_text.replace(',','')            
            # print shortcd, date_text, open_text, high_text, low_text, close_text, volume_text
            priceitem = (shortcd, date_text, open_text, high_text, low_text, close_text, volume_text)
            insertdb(priceitem, conn)
            pass

    print 'elapsed time: ', time.time() - begin
    pass


 def insertdb(priceitem, conn):
    """Insert one price row into StockPriceData and commit it.

    Args:
        priceitem: Sequence of seven values in the order ShortCd, Date,
            Open, High, Low, Close, Volume.
        conn: Open sqlite3 connection whose database has the
            StockPriceData table.
    """
    conn.execute(
        """INSERT INTO StockPriceData (ShortCd, Date, Open, High, Low, Close, Volume)
                                        VALUES(?, ?, ?, ?, ?, ?, ?)""",
        priceitem)
    conn.commit()


 if __name__ == '__main__':
    import sqlite3 as lite
    import os
    from read_shortcd import read_shortcd

    # lite.connect() creates the database file when it is missing, so one
    # connect serves both cases.  IF NOT EXISTS keys the table creation on
    # the table itself rather than on the file, which also covers an
    # existing file that does not contain the table yet.
    conn = lite.connect('KOSDAQ100.db')
    try:
        conn.execute("""CREATE TABLE IF NOT EXISTS StockPriceData (Id INTEGER NOT NULL PRIMARY KEY,
                                                    ShortCD TEXT,
                                                    Date TEXT,
                                                    Open TEXT,
                                                    High TEXT,
                                                    Low TEXT,
                                                    Close TEXT,
                                                    Volume TEXT)""")

        # Scrape every code listed in the KOSDAQ 100 listing file.
        shortcd_lst = read_shortcd('kosdaq100.text')
        for shortcd in shortcd_lst:
            # shortcd = 'A130960'
            getdailydata(shortcd, conn)
    finally:
        # Close the database even when scraping stops with an error.
        conn.close()
	# -*- coding: utf-8 -*-
	"""
	Created on Thu Apr 30 12:45:04 2015

	@author: assa
	"""


	def read_shortcd(filename='kospi.text'):
	    """Read the short stock codes from a tab-separated listing file.

	    The first line of the file is treated as a header and skipped.  For
	    every following line the text before the first tab is collected.

	    Args:
	        filename: Path of the tab-separated text file to read.

	    Returns:
	        A list holding the first column of each non-blank data line, in
	        file order.
	    """
	    # NOTE(review): the original defined, but never applied, an exception
	    # list for Samsung SDS, Cheil Industries and Macquarie Infra
	    # ('A018260', 'A028260', 'A088980'); no filtering is done here either.
	    shortcd_lst = []

	    with open(filename) as f:
	        next(f, None)  # discard the header line (a no-op on an empty file)
	        for line in f:
	            # Strip only the line terminator.  Blindly slicing characters
	            # off the end corrupts single-column lines and a final line
	            # that has no trailing newline.
	            shortcd = line.rstrip('\r\n').split('\t')[0]
	            if shortcd:  # ignore blank lines
	                shortcd_lst.append(shortcd)

	    return shortcd_lst
	__author__ = 'assa'

	import pandas as pd
	import pandas.io.data as web
	import sqlite3 as lite
	import datetime
	import pdb
	import matplotlib.pyplot as plt
	import matplotlib.style as style
	import statsmodels.formula.api as sm


	filename = 'finance_naver.db'
	conn = lite.connect(filename)

	start = datetime.datetime(2014, 4, 5)
	end = datetime.datetime(2015, 4, 3)

	shortcd = '015760' #KEPCO
	#shortcd = '000660' #SKHynics
	#shortcd = '005380' #hyundaimotors
	#shortcd = '005930' #samsungelec

	df_stock = web.DataReader('%s.KS'%shortcd, 'yahoo', start, end)
	df_stock = df_stock[df_stock['Volume'] > 0]
	date_lst = df_stock.index

	social_factor_list = [0]
	for i in xrange(len(date_lst)-1):
	nowdate = date_lst[i]
	nextdate = date_lst[i+1]

	str_nowdate = nowdate.strftime('%Y.%m.%d 15:00')
	str_nextdate = nextdate.strftime('%Y.%m.%d 15:00')

	sqltext = """SELECT * FROM FinNaverBoard
	WHERE ShortCD = '%s' and Time > '%s' and Time < '%s' """%(shortcd, str_nowdate, str_nextdate)
	df = pd.read_sql(sqltext, conn)

	print str_nextdate ,df_stock['Close'][i],len(df)
	social_factor_list.append(len(df))

	conn.close()
	pdb.set_trace()

	df_stock = df_stock[1:]
	social_factor = pd.Series(social_factor_list[:-1],index=date_lst[:-1])
	df_stock['social_factor'] = social_factor
	df_stock['Ret'] = df_stock['Close'].pct_change()
	df_stock['SF_Ret'] = df_stock['social_factor'].pct_change()

	df_stock = df_stock[1:]
	df_stock = df_stock[df_stock['SF_Ret'] < 3]
	model = sm.ols(formula='Ret ~ SF_Ret', data=df_stock)
	res = model.fit()

	print res.summary()

	style.use('ggplot')

	#df_stock.plot(kind='scatter', x='SF_Ret', y='Ret')
	plt.plot(df_stock['SF_Ret'], df_stock['Ret'],'bo')
	plt.plot(df_stock['SF_Ret'], res.fittedvalues,'r')
	plt.legend(['Data', 'Fitted model'])
	plt.xlabel('SF_Ret')
	plt.ylabel('Stock_Ret')
	plt.show()
	# -*- coding: utf-8 -*-
	"""
	Created on Sun Nov 17 19:16:40 2013

	@author: assa
	"""

	import urllib2
	import bs4

	url = 'http://www.czce.com.cn/portal/exchange/2013/datadaily/20131114.htm'
	html = urllib2.urlopen(url)
	htmltext = html.read()

	soup = bs4.BeautifulSoup(htmltext)
	table = soup.find('table', attrs={"id": "senfe"})

	for row in table.find_all("tr")[1:15]:
	for td in row.find_all("td"):
	td_text = td.get_text()
	if td_text == u'\xa0':
	print ''
	else:
	print td_text
	print
	# -*- coding: utf-8 -*-
	"""
	Created on Fri Apr 03 09:46:25 2015

	@author: assa
	"""

	import urllib2
	import bs4
	import sqlite3 as lite
	import pdb


	def get_table(shortcd,page_num):
	    """Download one board page and return its 'type2' table element.

	    Args:
	        shortcd: Stock code placed in the ``code`` query parameter.
	        page_num: Integer page number placed in the ``page`` parameter.

	    Returns:
	        The first <table class="type2"> element found by BeautifulSoup,
	        or None when the page has no such table.
	    """
	    url = 'http://finance.naver.com/item/board.nhn?code=%s&page=%d'%(shortcd,page_num)
	    response = urllib2.urlopen(url)
	    try:
	        htmltext = response.read()
	    finally:
	        # Release the HTTP connection even when reading fails.
	        response.close()
	    soup = bs4.BeautifulSoup(htmltext)
	    return soup.find('table', attrs={'class': 'type2'})

	def chk_sqltable(conn):
	    """Report whether the FinNaverBoard table exists in the database.

	    Args:
	        conn: Open sqlite3 connection to inspect.

	    Returns:
	        True when sqlite_master lists a table named 'FinNaverBoard',
	        otherwise False.
	    """
	    sqltext = """SELECT name FROM sqlite_master
	                    WHERE type='table' and name='FinNaverBoard'"""
	    found = conn.execute(sqltext).fetchone()
	    return bool(found)

	def initDB(conn):
	    """Create the FinNaverBoard table unless the database already has it.

	    Calling this repeatedly is harmless: an existing table and its rows
	    are left untouched.

	    Args:
	        conn: Open sqlite3 connection; the table is created in its database.
	    """
	    # IF NOT EXISTS lets SQLite perform the existence check and the creation
	    # in one statement, so no separate sqlite_master lookup is needed.
	    conn.execute("""CREATE TABLE IF NOT EXISTS FinNaverBoard (
	                        Id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
	                        ShortCD TEXT,
	                        Time TEXT,
	                        Title TEXT,
	                        Reply TEXT,
	                        Rating TEXT,
	                        USR_ID TEXT,
	                        Query TEXT,
	                        Like TEXT)""")


	if __name__ == "__main__":

	filedbname = 'finance_naver.db'
	conn = lite.connect(filedbname)
	initDB(conn)
	shortcd = '015760'

	for i in xrange(1,364):
	table = get_table(shortcd,i)
	for line in table.find_all('tr', attrs={'align': 'center'}):
	item = line.find('td', attrs={'class': 'title'})
	timestamp = line.find('td').get_text()
	reply = line.find('span', attrs={'class': 'tah p9'})
	if not reply:
	reply_num = '0'
	else:
	reply_num = reply.get_text()
	reply_num = reply_num[1:-1]

	rating = line.find('td', attrs={'class': 'tc'})
	rating = rating.get_text()

	title = item.find('a')
	text = title.get('title')
	#text = text.encode('utf-8')

	usr_id = line.find('td', attrs={'class': 'p11'})
	usr_id = usr_id.get_text()
	usr_id = usr_id.strip('\r\n\t\t\t\t')

	query = line.find_all('span', attrs={'class':'tah p10 gray03'})[1]
	query_num = query.get_text()

	like = line.find('strong')
	like_num = like.get_text()
	#print timestamp, text, reply_num, rating, usr_id, query_num, like_num
	print shortcd, timestamp, len(text)
	item = (shortcd, timestamp, text, reply_num, rating, usr_id, query_num, like_num)
	sqltext = """INSERT INTO FinNaverBoard(ShortCD,Time,Title,Reply,Rating,USR_ID,Query,Like)
	VALUES(?, ?, ?, ? ,?, ?, ?, ?)"""
	conn.execute(sqltext,item)
	conn.commit()
	pass

	conn.close()
	# -*- coding: utf-8 -*-
	"""
	Created on Mon May 11 15:15:57 2015

	@author: assa
	"""

	import urllib2
	import bs4
	import re
	import time


	def getdailydata(shortcd, conn):
	# shortcd = 'A130960'
	# url = 'http://finance.naver.com/item/sise.nhn?code=130960'
	url = 'http://finance.naver.com/item/sise_day.nhn?code=%s&page=1' % shortcd[1:]
	response = urllib2.urlopen(url)
	htmltext = response.read()
	soup = bs4.BeautifulSoup(htmltext)

	navi_table = soup.find('table', attrs={'class': 'Nnavi'})
	pgRR = navi_table.find('td', attrs={'class': 'pgRR'})
	result = pgRR.find('a', href=True)
	link_text = result['href']
	p = re.compile('page=')
	m = p.search(link_text)
	page_count = int(link_text[m.end():])

	print shortcd, page_count

	begin = time.time()

	for i in xrange(1, page_count+1):
	url = 'http://finance.naver.com/item/sise_day.nhn?code=%s&page=%d' % (shortcd[1:],i)
	try:
	response = urllib2.urlopen(url)
	except :
	print shortcd, i
	continue
	htmltext = response.read()
	soup = bs4.BeautifulSoup(htmltext)
	table = soup.find('table', attrs={"class": "type2"})

	for line in table.find_all('tr', attrs={'onmouseout': 'mouseOut(this)'}):
	date_line = line.find('td', attrs={'align': 'center'})
	date_text = date_line.get_text()
	date_text = date_text.replace('.','-')
	OHLC_lines = line.find_all('td', attrs={'class': 'num'})
	close_text = OHLC_lines[0].get_text()
	open_text = OHLC_lines[2].get_text()
	high_text = OHLC_lines[3].get_text()
	low_text = OHLC_lines[4].get_text()
	volume_text = OHLC_lines[5].get_text()
	close_text = close_text.replace(',','')
	open_text = open_text.replace(',','')
	high_text = high_text.replace(',','')
	low_text = low_text.replace(',','')
	volume_text = volume_text.replace(',','')
	# print shortcd, date_text, open_text, high_text, low_text, close_text, volume_text
	priceitem = (shortcd, date_text, open_text, high_text, low_text, close_text, volume_text)
	insertdb(priceitem, conn)
	pass

	print 'elapsed time: ', time.time() - begin
	pass


	def insertdb(priceitem, conn):
	    """Insert one price row into StockPriceData and commit it.

	    Args:
	        priceitem: Sequence of seven values in the order ShortCd, Date,
	            Open, High, Low, Close, Volume.
	        conn: Open sqlite3 connection whose database has the
	            StockPriceData table.
	    """
	    sqltext = """INSERT INTO StockPriceData (ShortCd, Date, Open, High, Low, Close, Volume)
	                 VALUES(?, ?, ?, ?, ?, ?, ?)"""
	    conn.execute(sqltext, priceitem)
	    conn.commit()


	if __name__ == '__main__':
	    import sqlite3 as lite
	    import os
	    from read_shortcd import read_shortcd

	    # lite.connect() creates the database file when it is missing, so one
	    # connect serves both cases.  IF NOT EXISTS keys the table creation on
	    # the table itself rather than on the file, which also covers an
	    # existing file that does not contain the table yet.
	    conn = lite.connect('KOSDAQ100.db')
	    try:
	        conn.execute("""CREATE TABLE IF NOT EXISTS StockPriceData (Id INTEGER NOT NULL PRIMARY KEY,
	                            ShortCD TEXT,
	                            Date TEXT,
	                            Open TEXT,
	                            High TEXT,
	                            Low TEXT,
	                            Close TEXT,
	                            Volume TEXT)""")

	        # Scrape every code listed in the KOSDAQ 100 listing file.
	        shortcd_lst = read_shortcd('kosdaq100.text')
	        for shortcd in shortcd_lst:
	            # shortcd = 'A130960'
	            getdailydata(shortcd, conn)
	    finally:
	        # Close the database even when scraping stops with an error.
	        conn.close()