Last active
January 22, 2024 21:43
-
-
Save Curt-Park/7f0cfe9712d0e405499b406356c2173a to your computer and use it in GitHub Desktop.
Scrape finance data from Naver Finance
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
"""Scrap data from Naver Finance. | |
Author: | |
- Name: Jinwoo Park | |
- Email: [email protected] | |
Prerequisites: | |
pip install beautifulsoup4 | |
pip install html5lib | |
pip install pandas | |
pip install matplotlib | |
""" | |
from http.client import HTTPResponse | |
from multiprocessing import Pool, cpu_count | |
from typing import List, Tuple | |
from urllib.request import build_opener | |
import matplotlib.pyplot as plt | |
import pandas as pd | |
from bs4 import BeautifulSoup | |
class PriceParser:
    """Daily price parser for a single ticker on Naver Finance.

    Downloads every page of the daily-price table (``sise_day.nhn``) in
    parallel and returns the result as a single pandas DataFrame.
    """

    # Endpoint serving the paginated daily-price table.
    BASE_URL: str = "https://finance.naver.com/item/sise_day.nhn"
    # Naver Finance rejects requests without a browser-like User-Agent.
    HEADERS: Tuple[str, str] = ("User-Agent", "Mozilla/5.0")

    def __init__(self, code: str, n_processes: int = 4) -> None:
        """Initialize the parser and discover the total page count.

        Args:
            code: Ticker code on Naver Finance (e.g. "069500").
            n_processes: Number of worker processes used for fetching pages.

        Raises:
            ValueError: If the paging element cannot be found, which
                typically means the ticker code is unknown.
        """
        print(f"code: {code}, n_processes: {n_processes}")
        self.code = code
        self.n_processes = n_processes

        # The first page contains a "last page" link (td.pgRR) whose href
        # ends with "...&page=<last>"; parse the page count out of it.
        url = PriceParser.BASE_URL + f"?code={code}"
        response = PriceParser.open_url(url)
        # html5lib matches the declared prerequisites (no lxml dependency).
        html = BeautifulSoup(response, "html5lib")
        last_page_cell = html.find("td", class_="pgRR")
        if last_page_cell is None:
            # Fail loudly with context instead of an opaque AttributeError.
            raise ValueError(f"No paging info found for code {code!r}")
        self.last_page = int(last_page_cell.a["href"].split("=")[-1])

    def __call__(self) -> pd.DataFrame:
        """Parse all prices from Naver Finance.

        Returns:
            DataFrame indexed by date with columns
            Date/Open/High/Low/Close/Volume, sorted ascending by date.
        """
        with Pool(self.n_processes) as pool:
            prices: List[pd.DataFrame] = pool.starmap(
                PriceParser.parse_page_info,
                [(self.code, i) for i in range(1, self.last_page + 1)],
            )
        # DataFrame.append was removed in pandas 2.0; concat is the
        # supported way to combine the per-page frames.
        prices_df = pd.concat(prices, ignore_index=True)

        # Post-processing: translate Korean headers, drop separator rows
        # (parsed as NaN), sort chronologically, and index by date.
        columns = {
            "날짜": "Date",
            "시가": "Open",
            "고가": "High",
            "저가": "Low",
            "종가": "Close",
            "거래량": "Volume",
        }
        prices_df = prices_df.dropna()
        prices_df = prices_df.rename(columns=columns).sort_values(by="Date")
        prices_df.index = pd.to_datetime(prices_df.Date)
        prices_df = prices_df[columns.values()]
        return prices_df

    @staticmethod
    def parse_page_info(code: str, page: int) -> pd.DataFrame:
        """Parse price info from a single page.

        Args:
            code: Ticker code on Naver Finance.
            page: 1-based page number of the daily-price table.

        Returns:
            Raw DataFrame of the first HTML table on the page.
        """
        print(f"start parsing page {page}")
        url = PriceParser.BASE_URL + f"?code={code}&page={page}"
        response = PriceParser.open_url(url)
        return pd.read_html(response, header=0)[0]

    @staticmethod
    def open_url(url: str) -> HTTPResponse:
        """Open the input url with a browser-like User-Agent header."""
        opener = build_opener()
        opener.addheaders = [PriceParser.HEADERS]
        response = opener.open(url)
        return response
if __name__ == "__main__":
    # Fetch the full daily price history of KODEX200 (ETF code 069500),
    # using one worker process per CPU core.
    price_parser = PriceParser(code="069500", n_processes=cpu_count())
    price_history = price_parser()
    print(price_history)

    # Render the closing-price time series.
    plt.figure(figsize=(9, 7))
    plt.xticks(rotation=90)
    plt.grid(True)
    plt.plot(price_history.index, price_history["Close"])
    plt.show()
    plt.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Result: KODEX200 from 2002