Skip to content

Instantly share code, notes, and snippets.

@Curt-Park
Last active January 22, 2024 21:43
Show Gist options
  • Save Curt-Park/7f0cfe9712d0e405499b406356c2173a to your computer and use it in GitHub Desktop.
Save Curt-Park/7f0cfe9712d0e405499b406356c2173a to your computer and use it in GitHub Desktop.
Scrape finance data from Naver Finance
# -*- coding: utf-8 -*-
"""Scrape data from Naver Finance.
Author:
- Name: Jinwoo Park
- Email: [email protected]
Prerequisites:
pip install beautifulsoup4
pip install html5lib
pip install lxml
pip install pandas
pip install matplotlib
"""
from http.client import HTTPResponse
from multiprocessing import Pool, cpu_count
from typing import List, Tuple
from urllib.request import build_opener
import matplotlib.pyplot as plt
import pandas as pd
from bs4 import BeautifulSoup
class PriceParser:
    """Daily price parser for one item code on Naver Finance.

    The constructor looks up how many result pages exist for the code;
    calling the instance downloads every page in a process pool and returns
    the merged price table.
    """

    BASE_URL: str = "https://finance.naver.com/item/sise_day.nhn"
    HEADERS: Tuple[str, str] = ("User-Agent", "Mozilla/5.0")
    # Korean table headers -> English names of the columns kept in the result.
    COLUMNS = {
        "날짜": "Date",
        "시가": "Open",
        "고가": "High",
        "저가": "Low",
        "종가": "Close",
        "거래량": "Volume",
    }

    def __init__(self, code: str, n_processes: int = 4) -> None:
        """Store the settings and look up the number of result pages.

        Args:
            code: Item code appended to the request as the ``code`` query value.
            n_processes: Size of the process pool used when parsing pages.
        """
        print(f"code: {code}, n_processes: {n_processes}")
        self.code = code
        self.n_processes = n_processes

        # get the total page number
        url = PriceParser.BASE_URL + f"?code={code}"
        # BeautifulSoup reads the whole response while constructing the tree,
        # so the connection can be released as soon as parsing is done.
        with PriceParser.open_url(url) as response:
            html = BeautifulSoup(response, "lxml")

        last_page_cell = html.find("td", class_="pgRR")
        if last_page_cell is None or last_page_cell.a is None:
            # No "last page" link found. Presumably the listing fits on one
            # page (TODO confirm against the site); fall back to a single
            # page rather than failing with an opaque AttributeError.
            self.last_page = 1
        else:
            # The page number is the value after the final "=" in the href.
            self.last_page = int(last_page_cell.a["href"].split("=")[-1])

    def __call__(self) -> pd.DataFrame:
        """Parse all prices from Naver Finance.

        Returns:
            A frame indexed by the parsed date, sorted by date, holding the
            columns named by the values of ``COLUMNS``.
        """
        with Pool(self.n_processes) as pool:
            prices: List[pd.DataFrame] = pool.starmap(
                PriceParser.parse_page_info,
                [(self.code, i) for i in range(1, self.last_page + 1)],
            )
        return PriceParser._combine_pages(prices)

    @staticmethod
    def _combine_pages(pages: List[pd.DataFrame]) -> pd.DataFrame:
        """Merge per-page tables into one cleaned, date-indexed frame."""
        wanted = list(PriceParser.COLUMNS.values())
        if not pages:
            # pd.concat refuses an empty list; return an empty result instead.
            return pd.DataFrame(columns=wanted)

        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        prices_df = pd.concat(pages)

        # post processing
        prices_df = prices_df.dropna()  # drop every row that has a missing cell
        prices_df = prices_df.rename(columns=PriceParser.COLUMNS)
        prices_df = prices_df.sort_values(by="Date")
        prices_df.index = pd.to_datetime(prices_df.Date)
        return prices_df[wanted]

    @staticmethod
    def parse_page_info(code: str, page: int) -> pd.DataFrame:
        """Parse price info from a single page.

        Args:
            code: Item code to query.
            page: 1-based page number to download.

        Returns:
            The first table found on the page, with its first row as header.
        """
        print(f"start parsing page {page}")
        url = PriceParser.BASE_URL + f"?code={code}&page={page}"
        # Close the connection once the table has been read.
        with PriceParser.open_url(url) as response:
            return pd.read_html(response, header=0)[0]

    @staticmethod
    def open_url(url: str) -> HTTPResponse:
        """Open the input url, sending the class-level ``HEADERS`` entry.

        The caller owns the returned response and should close it (for
        example by using it in a ``with`` statement).
        """
        opener = build_opener()
        opener.addheaders = [PriceParser.HEADERS]
        response = opener.open(url)
        return response
if __name__ == "__main__":
    # Example run for KODEX200 (code 069500), with the pool size taken
    # from cpu_count().
    kodex200_parser = PriceParser(n_processes=cpu_count(), code="069500")
    price_history = kodex200_parser()
    print(price_history)

    # Plot the closing price against the date index.
    plt.figure(figsize=(9, 7))
    plt.xticks(rotation=90)
    closing_prices = price_history["Close"]
    plt.plot(price_history.index, closing_prices)
    plt.grid(True)
    plt.show()
    plt.close()
@Curt-Park
Copy link
Author

Result: KODEX200 from 2002

code: 069500, n_processes: 8
start parsing page 1
start parsing page 16
start parsing page 31
start parsing page 46
start parsing page 61
start parsing page 76
start parsing page 91
start parsing page 106
...
start parsing page 433
start parsing page 404
start parsing page 463
start parsing page 448
start parsing page 390
start parsing page 375
start parsing page 419
start parsing page 405
start parsing page 464
start parsing page 434
start parsing page 449
start parsing page 420
start parsing page 465
start parsing page 450
start parsing page 435
               Open     High      Low    Close     Volume
Date
2002-10-14   7800.0   8000.0   7590.0   7750.0  2577230.0
2002-10-15   7850.0   7980.0   7700.0   7940.0  2288769.0
2002-10-16   8040.0   8040.0   7910.0   7970.0  2039150.0
2002-10-17   7980.0   8180.0   7870.0   8080.0  2091777.0
2002-10-18   8300.0   8500.0   8250.0   8430.0  1321300.0
...             ...      ...      ...      ...        ...
2021-08-25  41385.0  41610.0  41125.0  41405.0  3821022.0
2021-08-26  41470.0  41595.0  41025.0  41095.0  3769213.0
2021-08-27  40945.0  41235.0  40765.0  41070.0  5072400.0
2021-08-30  41480.0  41560.0  41070.0  41170.0  4219723.0
2021-08-31  41205.0  41985.0  41005.0  41985.0  3682317.0

[4673 rows x 5 columns]

스크린샷 2021-09-01 오전 12 28 30

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment