Created
October 9, 2019 10:37
-
-
Save jeremy-rutman/4d154451498e040811aebb662dc570d3 to your computer and use it in GitHub Desktop.
pandas read csv of multiple formats
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Uses chardet if encoding is not specified, reading the first kB to determine encoding. | |
| # This avoids the unfortunate situation wherein reading a UTF-16 file as UTF-8 does not generally throw an error, | |
| # but will munge up the data. | |
| import chardet | |
| import pandas as pd | |
def read_csv_multiformat(f, **kwargs):
    """Read a CSV via ``pd.read_csv``, guessing the encoding if none is given.

    When the caller does not pass ``encoding``, the first 1000 bytes of the
    file are handed to ``chardet.detect`` and the detected encoding is used
    if its confidence exceeds 0.6.

    Args:
        f: Path of the CSV file, or an already-open file-like object.
            File-like objects are forwarded to pandas without sniffing.
        **kwargs: Forwarded unchanged to ``pd.read_csv``. An explicit
            ``encoding`` entry disables detection.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    # An object with ``read`` cannot be re-opened by name, so only sniff
    # when we were given something ``open`` can take.
    if 'encoding' not in kwargs and not hasattr(f, 'read'):
        with open(f, 'rb') as fp:
            bin_dat = fp.read(1000)
        char_info = chardet.detect(bin_dat)
        print(char_info)
        encoding = char_info.get('encoding')
        confidence = char_info.get('confidence') or 0.0
        # A pure-ASCII sample says nothing about the rest of the file:
        # forcing 'ascii' would make the read fail on a later non-ASCII
        # byte that pandas' default decoding could have handled, so an
        # ASCII guess is treated as "no information".
        if encoding and confidence > 0.6 and encoding.lower() != 'ascii':
            kwargs['encoding'] = encoding
    return pd.read_csv(f, **kwargs)
if __name__ == "__main__":
    # Demo: read a pipe-separated file, letting the helper pick the encoding.
    # Guarded so that importing this module does not try to read 'utf16.csv'.
    f = 'utf16.csv'
    df = read_csv_multiformat(f, sep='|')
    print(df.head())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment