Created
April 28, 2018 18:10
-
-
Save maartenbreddels/e41afb646dbaf557a61131202e1a546c to your computer and use it in GitHub Desktop.
Makefile for converting Gaia DR2 CSV files to a single hdf5 file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Makefile for converting the CSV files from http://cdn.gea.esac.esa.int/Gaia/gdr2/gaia_source/csv/
# to a single (vaex) hdf5 file
# * https://docs.vaex.io
# * https://github.com/maartenbreddels/vaex/
# It is multistage to work around opening 60 000 files at once.
# Strategy is
# * stage1: convert every csv.gz to csv and then to hdf5
#   * do this via xargs and calling make again, since gmake has trouble matching 60 000 rules
# * stage2: create part-<NUMBER>.txt files, each listing at most FILES_PER_PART hdf5 files
# * stage3: convert each list of hdf5 files to a single hdf5 file (part-<NUMBER>.hdf5)
# * stage4: convert the partial files to a single file (SINGLE_FILE)
# Possible use
# $ make stage1
# $ make stage2
# $ make stage3 -j8
# $ make stage4
# Maximum number of .hdf5 files listed in each part-<NUMBER>.txt (used by stage2).
FILES_PER_PART = 10

# Lists of the stage1 inputs and outputs: <name>.csv.gz -> <name>.csv ->
# <name>.csv.hdf5.  They stay recursively expanded (`=`) on purpose: nothing
# in the rules below references them, so the `find` is never executed unless
# one of them is actually used.
ZIPPED = $(shell find . -maxdepth 1 -type f -name '*.csv.gz')
CSVS = $(patsubst %.csv.gz,%.csv,$(ZIPPED))
HDF5S = $(patsubst %.csv,%.csv.hdf5,$(CSVS))

# part-<NUMBER>.txt lists found in the current directory, and the matching
# part-<NUMBER>.hdf5 files.  Simply expanded (`:=`) so the `find` runs once per
# make invocation instead of once for every rule that mentions $(PARTS).
PARTS_TXT := $(shell find . -maxdepth 1 -type f -name 'part*.txt')
PARTS := $(patsubst %.txt,%.hdf5,$(PARTS_TXT))

# Name of the final merged output file.
SINGLE_FILE = gaia-dr2-sort-by-source_id.hdf5
# `make all` builds nothing; it only points the user at the usage notes in
# this file.  Declared phony so a file named `all` cannot shadow it.
.PHONY: all
all:
	echo "Read the comments in this makefile"
# stage1: build one <name>.csv.hdf5 for every *.csv.gz found below the current
# directory.  The target names are handed to sub-makes in batches of 30 via
# xargs instead of being listed as prerequisites here.
#  * change -P8 to ~number of cores on your system
#  * the sed expression is anchored so only the trailing ".gz" is rewritten,
#    never an earlier "gz" somewhere in the path
#  * $(MAKE) rather than a literal `make`, so the sub-makes use the same make
#    binary and honour flags such as -n
.PHONY: stage1
stage1:
	find . -iname '*csv.gz' | sed 's/\.gz$$/.hdf5/' | xargs -n30 -P8 $(MAKE) -j8
# Directory searched by stage2.  Defaults to the current directory when it is
# not set elsewhere or on the command line (`find` without a start directory
# is not portable).
INPUT_DIR ?= .

# stage2: write the part-<NUMBER>.txt list files, each naming at most
# FILES_PER_PART .hdf5 files (parts are numbered from 0).
#  * part-*.hdf5 and $(SINGLE_FILE) are excluded, so re-running stage2 after
#    stage3/stage4 does not feed the outputs of those stages back in as inputs
#  * $$0 reaches awk as $0; a bare $0 would be expanded (to nothing) by make
#  * each list file is closed as soon as the next one starts, so awk never
#    holds more than one output file open
.PHONY: stage2
stage2:
	find $(INPUT_DIR) -iname '*.hdf5' ! -name 'part-*.hdf5' ! -name '$(SINGLE_FILE)' | \
	awk -v n=$(FILES_PER_PART) '{ f = "part-" int((NR - 1) / n) ".txt" } f != prev { if (prev != "") close(prev); prev = f } { print $$0 > f }'
# stage3: build the part-<NUMBER>.hdf5 file for every part list that existed
#         when make was started ($(PARTS) is computed while the makefile is read).
# stage4: build the final merged file $(SINGLE_FILE).
# Both are command names, not files, hence phony.
.PHONY: stage3 stage4
stage3: $(PARTS)
stage4: $(SINGLE_FILE)
# Convert one decompressed CSV file with vaex:
#   <name>.csv  ->  <name>.csv.hdf5
%.csv.hdf5: %.csv
	vaex convert file $(<) $(@)
# Decompress <name>.csv.gz into <name>.csv; the .gz file itself is kept.
# The data is written under a temporary name and renamed only after gunzip
# has succeeded, so a failed or interrupted run cannot leave behind a
# truncated .csv that make would consider up to date.
%.csv: %.csv.gz
	gunzip -c $< > $@.tmp && mv -f $@.tmp $@
# Shared recipe: convert the files named in a list file ($<, passed to vaex
# with a leading @) into the single target file $@, sorted on source_id.
define convert_sorted_list
vaex convert --progress --sort=source_id file @$< $@
endef

# part-<NUMBER>.txt  ->  part-<NUMBER>.hdf5
part-%.hdf5: part-%.txt
	$(convert_sorted_list)

# group_<NAME>.txt  ->  group_<NAME>.hdf5
group_%.hdf5: group_%.txt
	$(convert_sorted_list)
# Merge all part files into $(SINGLE_FILE), sorted on source_id.
# single.txt is derived from the part*.txt lists in exactly the way $(PARTS)
# is, so it names the prerequisites of this rule and nothing else.  Searching
# for part*.hdf5 directly would also pick up stale part files that no longer
# have a list, and files in subdirectories.
$(SINGLE_FILE): $(PARTS)
	find . -maxdepth 1 -type f -name 'part*.txt' | sed 's/\.txt$$/.hdf5/' > single.txt
	vaex convert --progress --sort=source_id file @single.txt $@
# Earlier, pattern-based attempts at keeping the generated files (disabled):
#.PRECIOUS: %.csv
#.PRECIOUS: %.csv.hdf5
# don't delete intermediate files
# NOTE(review): the .PRECIOUS line below names no targets, so it does not
# protect anything; make still removes files it built only as intermediate
# chain steps (here: the decompressed .csv files).  If keeping them really is
# the intent, `.SECONDARY:` with no prerequisites would do that, at the cost
# of storing every decompressed CSV on disk.  Confirm the intent before
# changing it.  The trailing `|` looks like a copy/paste artifact - TODO confirm.
.PRECIOUS: |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment