Last active
October 17, 2024 17:38
-
-
Save scubamut/71c8b0648f079e2f9a9b4519bee4d912 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ######################################################################### | |
| # US_stock_bundle | |
| ############### | |
| # ~/.zipline/extensions.py | |
| #from zipline.data.bundles import register, US_stock_data | |
| # | |
| #register('US_stock_bundle', US_stock_data.US_stock_data, calendar_name='NYSE') | |
| ######################################################################### | |
| import pandas as pd | |
| from os import listdir | |
# Directory holding one CSV file per symbol. Change it to where you have
# your data.
try:
    # The import is only a probe: google.colab is expected to be importable
    # on Colab only, so success selects the Colab data location.
    import google.colab  # noqa: F401
except ImportError:
    # Not on Colab: fall back to the local data directory.
    path = '/home/scubamut1/MEGAsync/10_DATA/daily/US_stock_data/'
else:
    # for COLAB
    path = '/content/data/daily/US_stock_data/'
| """ | |
| The ingest function needs to have this exact signature, | |
| meaning these arguments passed, as shown below. | |
| """ | |
def US_stock_data(environ,
                  asset_db_writer,
                  minute_bar_writer,
                  daily_bar_writer,
                  adjustment_writer,
                  calendar,
                  start_session,
                  end_session,
                  cache,
                  show_progress,
                  output_dir):
    """Ingest the CSV files found under ``path`` as a daily equity bundle.

    The ingest function needs to keep this exact signature. Only
    ``asset_db_writer``, ``daily_bar_writer``, ``adjustment_writer``,
    ``calendar``, ``start_session`` and ``end_session`` are used here; the
    remaining arguments are accepted and ignored.

    Raises:
        ValueError: if ``path`` contains no ``.csv`` files.
    """
    # One symbol per CSV file: 'example.csv'[:-4] == 'example'.
    # Entries without a '.csv' suffix are skipped (process_stocks opens
    # '<symbol>.csv', so anything else could not be read anyway), and the
    # names are sorted so that the SID given to each symbol, which is its
    # position in this list, does not depend on directory listing order.
    symbols = sorted(
        name[:-4] for name in listdir(path) if name.endswith('.csv')
    )
    if not symbols:
        raise ValueError("No symbols found in folder.")

    # Empty frames handed to the writers; column names are the ones the
    # rest of this function and process_stocks rely on.
    divs = pd.DataFrame(columns=['sid',
                                 'amount',
                                 'ex_date',
                                 'record_date',
                                 'declared_date',
                                 'pay_date'])
    splits = pd.DataFrame(columns=['sid',
                                   'ratio',
                                   'effective_date'])
    metadata = pd.DataFrame(columns=('start_date',
                                     'end_date',
                                     'auto_close_date',
                                     'symbol',
                                     'exchange'))

    # Valid trading dates according to the selected exchange calendar.
    # NOTE: this works for zipline.calendar_utils, not for
    # pandas_market_calendars.
    sessions = calendar.sessions_in_range(start_session, end_session)

    # process_stocks is a generator: the writer pulls one (sid, frame) pair
    # per symbol, and each step also adds that symbol's row to `metadata`.
    # The metadata must therefore be written only after this call returns.
    daily_bar_writer.write(
        process_stocks(symbols, sessions, metadata, divs)
    )

    # Write the metadata together with an explicit exchange table,
    # see https://github.com/quantopian/zipline/issues/2517
    exchange_df = pd.DataFrame(
        data=[["NYSE", "NYSE", "US"]],
        columns=["exchange", "canonical_name", "country_code"],
    )
    asset_db_writer.write(equities=metadata, exchanges=exchange_df)

    # Write splits and dividends.
    adjustment_writer.write(splits=splits,
                            dividends=divs)
| """ | |
| Generator function to iterate stocks, | |
| build historical data, metadata | |
| and dividend data | |
| """ | |
def process_stocks(symbols, sessions, metadata, divs):
    """Yield ``(sid, DataFrame)`` pairs of daily bars, one per symbol.

    For every symbol ``<path>/<symbol>.csv`` is read, aligned to
    ``sessions`` and forward-filled. As a side effect, one row per symbol is
    added to ``metadata`` and one row per non-zero dividend is added to
    ``divs``. Both frames are modified in place, so the caller sees the new
    rows once the generator has been consumed.

    Args:
        symbols: symbol names; the position in this sequence becomes the
            Security ID (SID).
        sessions: DatetimeIndex of valid trading sessions.
        metadata: frame with the columns start_date, end_date,
            auto_close_date, symbol, exchange (in that order).
        divs: frame with the columns sid, amount, ex_date, record_date,
            declared_date, pay_date. New rows are labelled ``len(divs)``,
            i.e. numbering continues where the frame left off.

    Raises:
        ValueError: if a CSV file contains no rows.
    """
    # Loop-invariant: drop any timezone from the sessions once, so they can
    # be matched against the dates parsed from the CSV files.
    naive_sessions = sessions.tz_localize(None)

    # Loop the stocks, setting a unique Security ID (SID).
    for sid, symbol in enumerate(symbols):
        print('Loading {}...'.format(symbol))

        # Read the stock data from csv file; the first column is the date.
        df = pd.read_csv('{}/{}.csv'.format(path, symbol),
                         index_col=[0], parse_dates=[0])
        if df.empty:
            raise ValueError('No rows found in {}.csv'.format(symbol))

        # First and last date are taken by position, so make sure the rows
        # are in chronological order.
        df = df.sort_index()
        start_date = df.index[0]
        end_date = df.index[-1]

        # Synch to the official exchange calendar.
        df = df.reindex(naive_sessions).loc[start_date:end_date]

        has_dividends = 'dividend' in df.columns
        if has_dividends:
            # A missing dividend means "no payout". Without this the forward
            # fill below would repeat the previous payout on every gap.
            df = df.fillna({'dividend': 0.0})

        # Forward fill missing data, then drop whatever is still NaN.
        df = df.ffill().dropna()

        # The auto_close date is the day after the last trade.
        ac_date = end_date + pd.Timedelta(days=1)

        # Add a row to the metadata DataFrame, including the exchange field.
        metadata.loc[sid] = start_date, end_date, ac_date, symbol, "NYSE"

        # If there's dividend data, add it to the dividend DataFrame.
        if has_dividends:
            # Keep only the days with dividends.
            payouts = df.loc[df['dividend'] != 0.0, 'dividend']
            for ex_date, amount in payouts.items():
                record = {
                    'sid': sid,
                    'amount': amount,
                    'ex_date': ex_date,
                    # Empty values as we don't have this data for now.
                    'record_date': pd.NaT,
                    'declared_date': pd.NaT,
                    'pay_date': pd.NaT,
                }
                # Assign through .loc so the caller's frame itself grows;
                # rebinding the local name would leave it empty.
                divs.loc[len(divs)] = [record.get(col) for col in divs.columns]

        yield sid, df
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment