Created
May 13, 2017 00:33
-
-
Save tony/0218d6dfe51e37fc02f697a9405382a8 to your computer and use it in GitHub Desktop.
create datapackage for unihan-tabular
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf8 -*- | |
""" | |
Based off http://frictionlessdata.io/guides/creating-tabular-data-packages-in-python/ | |
For use on https://github.com/cihai/unihan-tabular | |
License: MIT | |
""" | |
from __future__ import (absolute_import, division, print_function, | |
unicode_literals, with_statement) | |
import io | |
import csv | |
import os | |
from jsontableschema import infer | |
import datapackage | |
# this and name/title descriptors can be replaced with your package name/title | |
about = {} | |
about_file = os.path.join( | |
os.path.dirname(__file__), 'unihan_tabular', '__about__.py') | |
with open(about_file) as fp: | |
exec(fp.read(), about) | |
dp = datapackage.DataPackage() | |
dp.descriptor['name'] = about['__title__'] | |
dp.descriptor['title'] = about['__title__'] | |
filepath = './data/unihan.csv' | |
# On Python 2, this would crash due to poor Unicode support (was using this | |
# on a unicode-rich CSV. Python 3 is slow, so had to chop off the top | |
with io.open(filepath) as stream: | |
headers = stream.readline().rstrip('\n').split(',') | |
values = csv.reader(stream) | |
schema = infer(headers, values) | |
dp.descriptor['resources'] = [ | |
{ | |
'name': 'data', | |
'path': filepath, | |
'schema': schema | |
} | |
] | |
# datapackage Version: 0.8.8 would wrongly attribute a date-like type | |
# to many fields, so had to find-replace them with "string" | |
with open('datapackage.json', 'w') as f: | |
f.write(dp.to_json()) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment