Last active
January 22, 2024 21:43
-
-
Save Curt-Park/7f0cfe9712d0e405499b406356c2173a to your computer and use it in GitHub Desktop.
Scrape finance data from Naver Finance
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
"""Scrap data from Naver Finance. | |
Author: | |
- Name: Jinwoo Park | |
- Email: [email protected] | |
Prerequisites: | |
pip install beautifulsoup4 | |
pip install html5lib | |
pip install pandas | |
pip install matplotlib | |
""" | |
from http.client import HTTPResponse | |
from multiprocessing import Pool, cpu_count | |
from typing import List, Tuple | |
from urllib.request import build_opener | |
import matplotlib.pyplot as plt | |
import pandas as pd | |
from bs4 import BeautifulSoup | |
class PriceParser:
    """Daily price parser for a single ticker on Naver Finance.

    Downloads every page of the daily-price table (``sise_day.nhn``) in
    parallel and returns the result as a single pandas DataFrame.
    """

    # Endpoint serving the paginated daily-price table.
    BASE_URL: str = "https://finance.naver.com/item/sise_day.nhn"
    # Naver Finance rejects requests without a browser-like User-Agent.
    HEADERS: Tuple[str, str] = ("User-Agent", "Mozilla/5.0")

    def __init__(self, code: str, n_processes: int = 4) -> None:
        """Initialize the parser and discover the total page count.

        Args:
            code: Ticker code on Naver Finance (e.g. "069500").
            n_processes: Number of worker processes used for fetching pages.

        Raises:
            ValueError: If the paging element cannot be found, which
                typically means the ticker code is unknown.
        """
        print(f"code: {code}, n_processes: {n_processes}")
        self.code = code
        self.n_processes = n_processes

        # The first page contains a "last page" link (td.pgRR) whose href
        # ends with "...&page=<last>"; parse the page count out of it.
        url = PriceParser.BASE_URL + f"?code={code}"
        response = PriceParser.open_url(url)
        # html5lib matches the declared prerequisites (no lxml dependency).
        html = BeautifulSoup(response, "html5lib")
        last_page_cell = html.find("td", class_="pgRR")
        if last_page_cell is None:
            # Fail loudly with context instead of an opaque AttributeError.
            raise ValueError(f"No paging info found for code {code!r}")
        self.last_page = int(last_page_cell.a["href"].split("=")[-1])

    def __call__(self) -> pd.DataFrame:
        """Parse all prices from Naver Finance.

        Returns:
            DataFrame indexed by date with columns
            Date/Open/High/Low/Close/Volume, sorted ascending by date.
        """
        with Pool(self.n_processes) as pool:
            prices: List[pd.DataFrame] = pool.starmap(
                PriceParser.parse_page_info,
                [(self.code, i) for i in range(1, self.last_page + 1)],
            )
        # DataFrame.append was removed in pandas 2.0; concat is the
        # supported way to combine the per-page frames.
        prices_df = pd.concat(prices, ignore_index=True)

        # Post-processing: translate Korean headers, drop separator rows
        # (parsed as NaN), sort chronologically, and index by date.
        columns = {
            "날짜": "Date",
            "시가": "Open",
            "고가": "High",
            "저가": "Low",
            "종가": "Close",
            "거래량": "Volume",
        }
        prices_df = prices_df.dropna()
        prices_df = prices_df.rename(columns=columns).sort_values(by="Date")
        prices_df.index = pd.to_datetime(prices_df.Date)
        prices_df = prices_df[columns.values()]
        return prices_df

    @staticmethod
    def parse_page_info(code: str, page: int) -> pd.DataFrame:
        """Parse price info from a single page.

        Args:
            code: Ticker code on Naver Finance.
            page: 1-based page number of the daily-price table.

        Returns:
            Raw DataFrame of the first HTML table on the page.
        """
        print(f"start parsing page {page}")
        url = PriceParser.BASE_URL + f"?code={code}&page={page}"
        response = PriceParser.open_url(url)
        return pd.read_html(response, header=0)[0]

    @staticmethod
    def open_url(url: str) -> HTTPResponse:
        """Open the input url with a browser-like User-Agent header."""
        opener = build_opener()
        opener.addheaders = [PriceParser.HEADERS]
        response = opener.open(url)
        return response
if __name__ == "__main__":
    # Fetch the full daily price history of KODEX200 (ETF code 069500),
    # using one worker process per CPU core.
    price_parser = PriceParser(code="069500", n_processes=cpu_count())
    price_history = price_parser()
    print(price_history)

    # Render the closing-price time series.
    plt.figure(figsize=(9, 7))
    plt.xticks(rotation=90)
    plt.grid(True)
    plt.plot(price_history.index, price_history["Close"])
    plt.show()
    plt.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Result: KODEX200 from 2002