mookerji · May 7, 2018 17:08
diff --git a/bug_categories.py b/bug_categories.py
 import pandas as pd

 # Reproduction script: checks whether categorical columns keep their dtype
 # after being written to Parquet and read back.

 # Step 1: build the plain (object dtype) frame df_orig and persist it.

 size = 1000
 labels = ['foo', 'bar', 'baz']
 df_orig = pd.DataFrame(
     data={name: size * labels for name in ('a', 'b', 'c')},
 )
 preview = df_orig.head(10)
 print(preview, '\n')
 df_orig.to_parquet(fname='df_orig.prq', engine='fastparquet')

 # Step 2: df_cat0 is a copy of df_orig with every column cast to 'category'.

 df_cat0 = df_orig.copy()
 for col in df_cat0:
     df_cat0[col] = df_cat0[col].astype('category')

 # Step 3: round-trip the categorical frame through a Parquet file.
 # NOTE(review): the write pins engine='fastparquet' while the read leaves the
 # engine at its default, so a different Parquet engine may perform the read --
 # confirm whether that is what drops the category dtype.

 df_cat0.to_parquet(fname='df_cat0.prq', engine='fastparquet')
 df_cat1 = pd.read_parquet(path='df_cat0.prq')

 # Step 4: print dtypes and deep memory usage of all three frames, in order.

 for title, frame in (
     ('df_orig - original all dtype object', df_orig),
     ('df_cat0 - all dtype categories', df_cat0),
     ('df_cat1 - df_cat0 read from disk - all dtype object', df_cat1),
 ):
     print(title)
     frame.info(memory_usage='deep')
     print()
diff --git a/Script output b/Script output
 # python3 bug_categories.py

     a    b    c
 0  foo  foo  foo
 1  bar  bar  bar
 2  baz  baz  baz
 3  foo  foo  foo
 4  bar  bar  bar
 5  baz  baz  baz
 6  foo  foo  foo
 7  bar  bar  bar
 8  baz  baz  baz
 9  foo  foo  foo

 df_orig - original all dtype object
 <class 'pandas.core.frame.DataFrame'>
 RangeIndex: 3000 entries, 0 to 2999
 Data columns (total 3 columns):
 a    3000 non-null object
 b    3000 non-null object
 c    3000 non-null object
 dtypes: object(3)
 memory usage: 527.4 KB

 df_cat0 - all dtype categories
 <class 'pandas.core.frame.DataFrame'>
 RangeIndex: 3000 entries, 0 to 2999
 Data columns (total 3 columns):
 a    3000 non-null category
 b    3000 non-null category
 c    3000 non-null category
 dtypes: category(3)
 memory usage: 9.6 KB

 df_cat1 - df_cat0 read from disk - all dtype object
 <class 'pandas.core.frame.DataFrame'>
 RangeIndex: 3000 entries, 0 to 2999
 Data columns (total 3 columns):
 a    3000 non-null object
 b    3000 non-null object
 c    3000 non-null object
 dtypes: object(3)
 memory usage: 527.4 KB
diff --git a/Shell file sizes b/Shell file sizes
 -rw-r--r--   1 mookerji  staff   1.8K May  7 09:56 df_cat0.prq
 -rw-r--r--   1 mookerji  staff   3.8K May  7 09:56 df_orig.prq
diff --git a/versions b/versions
 >>> import pandas as pd
 >>> pd.show_versions()

 INSTALLED VERSIONS
 ------------------
 commit: None
 python: 3.6.5.final.0
 python-bits: 64
 OS: Darwin
 OS-release: 17.5.0
 machine: x86_64
 processor: i386
 byteorder: little
 LC_ALL: None
 LANG: en_US.UTF-8
 LOCALE: en_US.UTF-8

 pandas: 0.22.0
 pytest: 3.5.0
 pip: 9.0.3
 setuptools: 39.0.1
 Cython: None
 numpy: 1.14.2
 scipy: 1.1.0rc1
 pyarrow: 0.7.1
 xarray: None
 IPython: 6.3.1
 sphinx: None
 patsy: None
 dateutil: 2.6.1
 pytz: 2018.4
 blosc: None
 bottleneck: None
 tables: 3.4.2
 numexpr: 2.6.4
 feather: None
 matplotlib: 2.1.2
 openpyxl: None
 xlrd: None
 xlwt: None
 xlsxwriter: None
 lxml: None
 bs4: None
 html5lib: 1.0.1
 sqlalchemy: 1.2.7
 pymysql: None
 psycopg2: 2.7.4 (dt dec pq3 ext lo64)
 jinja2: 2.8.1
 s3fs: 0.1.0
 fastparquet: 0.1.5
 pandas_gbq: None
 pandas_datareader: None
	import pandas as pd

	# Reproduction script: checks whether categorical columns keep their dtype
	# after being written to Parquet and read back.

	# Create a DataFrame df_orig (three object-dtype columns) and write it to Parquet

	size = 1000
	df_orig = pd.DataFrame(
	    data={
	        'a': size*['foo', 'bar', 'baz'],
	        'b': size*['foo', 'bar', 'baz'],
	        'c': size*['foo', 'bar', 'baz'],
	    })
	print(df_orig.head(10),'\n')
	df_orig.to_parquet(fname='df_orig.prq', engine='fastparquet')

	# Coerce every column of a copy of df_orig to categorical dtype: df_cat0

	df_cat0 = df_orig.copy()
	for col in df_cat0.columns:
	    # The loop body must be indented under the `for`; the flattened paste
	    # had lost this indentation, which made the script a syntax error.
	    df_cat0[col] = df_cat0[col].astype('category')

	# Roundtripping through Parquet
	# NOTE(review): the write pins engine='fastparquet' while the read leaves the
	# engine at its default, so a different Parquet engine may perform the read --
	# confirm whether that is what drops the category dtype.

	df_cat0.to_parquet(fname='df_cat0.prq', engine='fastparquet')
	df_cat1 = pd.read_parquet(path='df_cat0.prq')

	# Examine the frames: dtypes and deep memory usage before and after the roundtrip

	print('df_orig - original all dtype object')
	df_orig.info(memory_usage='deep')
	print()

	print('df_cat0 - all dtype categories')
	df_cat0.info(memory_usage='deep')
	print()

	print('df_cat1 - df_cat0 read from disk - all dtype object')
	df_cat1.info(memory_usage='deep')
	print()
	# python3 bug_categories.py

	a b c
	0 foo foo foo
	1 bar bar bar
	2 baz baz baz
	3 foo foo foo
	4 bar bar bar
	5 baz baz baz
	6 foo foo foo
	7 bar bar bar
	8 baz baz baz
	9 foo foo foo

	df_orig - original all dtype object
	<class 'pandas.core.frame.DataFrame'>
	RangeIndex: 3000 entries, 0 to 2999
	Data columns (total 3 columns):
	a 3000 non-null object
	b 3000 non-null object
	c 3000 non-null object
	dtypes: object(3)
	memory usage: 527.4 KB

	df_cat0 - all dtype categories
	<class 'pandas.core.frame.DataFrame'>
	RangeIndex: 3000 entries, 0 to 2999
	Data columns (total 3 columns):
	a 3000 non-null category
	b 3000 non-null category
	c 3000 non-null category
	dtypes: category(3)
	memory usage: 9.6 KB

	df_cat1 - df_cat0 read from disk - all dtype object
	<class 'pandas.core.frame.DataFrame'>
	RangeIndex: 3000 entries, 0 to 2999
	Data columns (total 3 columns):
	a 3000 non-null object
	b 3000 non-null object
	c 3000 non-null object
	dtypes: object(3)
	memory usage: 527.4 KB
	-rw-r--r-- 1 mookerji staff 1.8K May 7 09:56 df_cat0.prq
	-rw-r--r-- 1 mookerji staff 3.8K May 7 09:56 df_orig.prq
	>>> import pandas as pd
	>>> pd.show_versions()

	INSTALLED VERSIONS
	------------------
	commit: None
	python: 3.6.5.final.0
	python-bits: 64
	OS: Darwin
	OS-release: 17.5.0
	machine: x86_64
	processor: i386
	byteorder: little
	LC_ALL: None
	LANG: en_US.UTF-8
	LOCALE: en_US.UTF-8

	pandas: 0.22.0
	pytest: 3.5.0
	pip: 9.0.3
	setuptools: 39.0.1
	Cython: None
	numpy: 1.14.2
	scipy: 1.1.0rc1
	pyarrow: 0.7.1
	xarray: None
	IPython: 6.3.1
	sphinx: None
	patsy: None
	dateutil: 2.6.1
	pytz: 2018.4
	blosc: None
	bottleneck: None
	tables: 3.4.2
	numexpr: 2.6.4
	feather: None
	matplotlib: 2.1.2
	openpyxl: None
	xlrd: None
	xlwt: None
	xlsxwriter: None
	lxml: None
	bs4: None
	html5lib: 1.0.1
	sqlalchemy: 1.2.7
	pymysql: None
	psycopg2: 2.7.4 (dt dec pq3 ext lo64)
	jinja2: 2.8.1
	s3fs: 0.1.0
	fastparquet: 0.1.5
	pandas_gbq: None
	pandas_datareader: None