Skip to content

Instantly share code, notes, and snippets.

@maartenbreddels
Created April 28, 2018 18:10
Show Gist options
  • Save maartenbreddels/e41afb646dbaf557a61131202e1a546c to your computer and use it in GitHub Desktop.
Save maartenbreddels/e41afb646dbaf557a61131202e1a546c to your computer and use it in GitHub Desktop.
Makefile for converting Gaia DR2 CSV files to a single HDF5 file
# Makefile for converting the CSV files from http://cdn.gea.esac.esa.int/Gaia/gdr2/gaia_source/csv/
# to a single (vaex) hdf5 file
# * https://docs.vaex.io
# * https://github.com/maartenbreddels/vaex/
# It is multistage to work around opening 60 000 files at once.
# Strategy is
# * stage1: decompress every csv.gz to csv, then convert each csv to its own hdf5
# * do this via xargs and calling make again, since gmake has trouble matching 60 000 rules
# * stage2: Create part-<NUMBER>.txt files containing max FILES_PER_PART per file
# * stage3: convert the list of hdf5 files to single hdf5 files (part-<NUMBER>.hdf5)
# * stage4: convert the partial files, to a single file (SINGLE_FILE)
# Possible use
# $ make stage1
# $ make stage2
# $ make stage3 -j8
# $ make stage4
# Maximum number of per-CSV hdf5 files listed in each part-<NUMBER>.txt (stage2).
FILES_PER_PART = 10
# Directory that stage2 searches for the per-CSV hdf5 files; override on the
# command line if they do not live next to this Makefile.
INPUT_DIR ?= .
# The three lists below are deliberately recursive (`=`): no rule references
# them, so the `find` over all input files is never executed unless something
# actually expands them. With `:=` every sub-make started by stage1 would pay
# for that scan at parse time.
ZIPPED = $(shell find . -maxdepth 1 -type f -name '*.csv.gz')
CSVS = $(patsubst %.csv.gz,%.csv,$(ZIPPED))
HDF5S = $(patsubst %.csv,%.csv.hdf5,$(CSVS))
# PARTS is expanded in rule headers (stage3 and the final file), so evaluate
# the `find` exactly once with `:=` instead of on every reference.
PARTS_TXT := $(shell find . -maxdepth 1 -type f -name 'part*.txt')
PARTS := $(patsubst %.txt,%.hdf5,$(PARTS_TXT))
# Name of the final, merged output file produced by stage4.
SINGLE_FILE = gaia-dr2-sort-by-source_id.hdf5
# Default goal: does no work, only points the user at the staged workflow
# described in the comments at the top of this file.
.PHONY: all
all:
	echo "Read the comments in this makefile"
# stage1: build one <name>.csv.hdf5 per <name>.csv.gz.
# The target names are generated with find/sed and handed to sub-makes in
# batches of 30 (xargs -n30), so that no single make invocation has to match
# all of the targets at once.
# Change -P8 to ~number of cores on your system.
# The sed expression is anchored to the trailing ".gz" (written `$$` so that
# make passes a literal `$` to the shell); an unanchored s/gz/hdf5/ would
# rewrite the first "gz" found anywhere in the path.
# $(MAKE) is used instead of a literal `make` so the sub-makes run the same
# make binary as the parent.
.PHONY: stage1
stage1:
	find . -name '*.csv.gz' | sed 's/\.gz$$/.hdf5/' | xargs -n30 -P8 $(MAKE) -j8
# stage2: write part-<NUMBER>.txt files, each listing at most FILES_PER_PART
# of the per-CSV hdf5 files found under INPUT_DIR (defaults to "." when the
# variable is unset or empty).
# * Only '*.csv.hdf5' is matched, so re-running stage2 after stage3/stage4 does
#   not feed part-*.hdf5 or the final file back into the part lists.
# * awk's bare `print` writes the current line; this avoids `$0`, which make
#   would expand itself (to an empty string) unless it were written `$$0`.
# * The counter c advances on every line whose number is a multiple of
#   FILES_PER_PART, so part-0.txt holds one entry fewer than the others.
# * Each list file is closed before the next one is started, so awk never holds
#   more than one output file open.
.PHONY: stage2
stage2:
	find $(if $(INPUT_DIR),$(INPUT_DIR),.) -name '*.csv.hdf5' | awk -v c=0 -v n=$(FILES_PER_PART) 'NR % n == 0 { close(out); c++ } { out = "part-" c ".txt"; print > out }'
# stage3 and stage4 are command names, not files; declaring them phony keeps a
# stray file called "stage3" or "stage4" from making them look up to date.
.PHONY: stage3 stage4
# stage3: build every part-<NUMBER>.hdf5 that has a matching part-<NUMBER>.txt.
# The list is taken when make starts, so run stage2 in an earlier invocation.
stage3: $(PARTS)
# stage4: merge the partial files into the single output file.
stage4: $(SINGLE_FILE)
# Convert one decompressed CSV file ($<) into its own hdf5 file ($@) with vaex.
%.csv.hdf5: %.csv
	vaex convert file $< $@
# Decompress one gzipped CSV; `gunzip -c` writes to stdout and leaves the .gz
# file in place.
# The data is written to a temporary name and renamed only after gunzip
# succeeded, so an interrupted or failed run never leaves a truncated .csv
# that make would consider up to date.
%.csv: %.csv.gz
	gunzip -c $< > $@.tmp
	mv -f $@.tmp $@
# Build one partial hdf5 file from the files listed in part-<NUMBER>.txt,
# sorted by source_id. The list file is passed to vaex as @<file>.
part-%.hdf5: part-%.txt
	vaex convert --progress --sort=source_id file @$< $@
# Same conversion as for part-<NUMBER>.txt, for hand-made lists named
# group_<NAME>.txt: combines the listed files into group_<NAME>.hdf5, sorted by
# source_id.
group_%.hdf5: group_%.txt
	vaex convert --progress --sort=source_id file @$< $@
# Merge all partial files into the final output, sorted by source_id.
# single.txt is written from the rule's own prerequisites ($^), one name per
# line, rather than from a fresh `find`; the merge therefore uses exactly the
# part files make has just brought up to date and ignores stale part*.hdf5
# files left over from an earlier run.
$(SINGLE_FILE): $(PARTS)
	printf '%s\n' $^ > single.txt
	vaex convert --progress --sort=source_id file @single.txt $@
# Don't delete intermediate files (e.g. the decompressed .csv that is only a
# chain step between .csv.gz and .csv.hdf5).
# `.PRECIOUS:` with no prerequisites protects nothing; `.SECONDARY:` with no
# prerequisites makes make treat every target as secondary, i.e. none is
# removed for being an intermediate file.
.SECONDARY:
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment