Last active
October 25, 2019 15:29
-
-
Save DeastinY/02f17d539ff7008d2b8dde1f9a6a9480 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import timeit
from pathlib import Path

import numpy as np
import pandas as pd
def _write_random_frame(path, rows):
    """Write a ``rows`` x 1 frame of random ints in [0, 100) to ``path``."""
    frame = pd.DataFrame(np.random.randint(0, 100, size=(rows, 1)))
    # Column labels are converted to strings before writing; presumably
    # feather rejected the default integer labels -- TODO confirm for the
    # pandas version in use.
    frame.columns = frame.columns.astype(str)
    frame.to_feather(path)


def create_test_data(large=False, small=False, root="tmp",
                     large_rows=800_000_000, small_files=20_000,
                     small_rows=40_000):
    """Generate the feather files used by the read benchmark.

    Note that a large_number by 1 dataframe is not really representative.

    Args:
        large: If true, write one file ``large_df.feather`` with
            ``large_rows`` rows.
        small: If true, write ``small_files`` files named
            ``small_df_<i>.feather`` with ``small_rows`` rows each.
        root: Output directory; created (with parents) if missing.
        large_rows: Row count of the single large frame.
        small_files: Number of small files to write.
        small_rows: Row count of each small frame.
    """
    root = Path(root)
    root.mkdir(parents=True, exist_ok=True)
    if large:
        _write_random_frame(root / "large_df.feather", large_rows)
    if small:
        for i in range(small_files):
            _write_random_frame(root / f"small_df_{i}.feather", small_rows)
def test_speed(large=False, small=False, root="tmp", number=10):
    """Time reading the benchmark feather files and print the results.

    Each printed value is the total wall time in seconds for ``number``
    repetitions (what ``timeit.timeit`` returns), not a per-run average.

    Args:
        large: If true, time reading ``<root>/large_df.feather``.
        small: If true, time reading every ``<root>/small_*.feather`` file
            in one pass.
        root: Directory holding the files.
        number: How many times each measurement is repeated.
    """
    root = Path(root)
    if large:
        large_path = root / "large_df.feather"
        print(timeit.timeit(lambda: pd.read_feather(large_path),
                            number=number))
    if small:
        def read_all_small():
            # The glob is re-evaluated on every repetition, so directory
            # listing time is part of the measurement.
            for feather_file in root.glob("small_*.feather"):
                pd.read_feather(feather_file)

        print(timeit.timeit(read_all_small, number=number))
if __name__ == '__main__':
    # Generate the many-small-files data set, then time reading it back.
    # Flip the flags to benchmark the single large file instead.
    create_test_data(large=False, small=True)
    test_speed(large=False, small=True)
    # Reported results (each is the total over the 10 timeit repetitions,
    # i.e. ten times the per-run average):
    #   - Loading a single 6 GB file takes roughly 15.2 seconds.
    #   - Loading 20,000 files of about 300 kB each takes far longer:
    #     roughly 296.8 seconds.
Thanks for running it! Did you change the timeit number? I messed that up in the original version ^^
Nope, I ran it as is, i.e. with number=10; I only changed the boolean values.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Just FYI I get 8 seconds for large and 195.2 seconds for small