maartenbreddels · April 28, 2018 18:10
diff --git a/Makefile b/Makefile
 # Makefile for converting the CSV files from http://cdn.gea.esac.esa.int/Gaia/gdr2/gaia_source/csv/
 # to a single (vaex) hdf5 file
 #  * https://docs.vaex.io
 #  * https://github.com/maartenbreddels/vaex/
 # It is multistage to work around opening 60 000 files at once.
 # Strategy is
 #  * stage1: convert all csv.gz to csv to hdf5
 #   * do this via xargs and calling make again, since gmake has trouble matching 60 000 rules
 #  * stage2: Create part-<NUMBER>.txt files containing max FILES_PER_PART per file  
 #  * stage3: convert the list of hdf5 files to single hdf5 files (part-<NUMBER>.hdf5)
 #  * stage4: convert the partial files, to a single file (SINGLE_FILE)
 # Possible use
 #   $ make stage1
 #   $ make stage2
 #   $ make stage3 -j8
 #   $ make stage4
 # Number of per-file hdf5 names written to each part-<NUMBER>.txt by stage2.
 FILES_PER_PART = 10

 # Compressed inputs in the current directory and the names derived from them:
 #   foo.csv.gz -> foo.csv -> foo.csv.hdf5
 # ':=' makes each find run once, at parse time, instead of on every expansion.
 ZIPPED := $(shell find . -maxdepth 1 -type f -name '*.csv.gz')
 CSVS   := $(patsubst %.csv.gz,%.csv,$(ZIPPED))
 HDF5S  := $(patsubst %.csv,%.csv.hdf5,$(CSVS))

 # part-<NUMBER>.txt lists present when make starts, and the part-<NUMBER>.hdf5
 # file that corresponds to each of them.
 PARTS_TXT := $(shell find . -maxdepth 1 -type f -name 'part*.txt')
 PARTS := $(patsubst %.txt,%.hdf5,$(PARTS_TXT))
 # Name of the final merged output.
 SINGLE_FILE = gaia-dr2-sort-by-source_id.hdf5

 # Default goal: performs no work, it only points the user at the comments.
 # Declared phony so a file named 'all' cannot shadow it.
 .PHONY: all
 all:
 	echo "Read the comments in this makefile"

 # stage1: turn the name of every *.csv.gz found below the current directory
 # into the matching *.csv.hdf5 name and hand those names, 30 at a time, to
 # sub-makes that build them through the pattern rules of this makefile.
 # Change -P8 (number of parallel sub-makes) to ~number of cores on your system.
 # The sed expression is anchored so only the trailing '.gz' is rewritten, and
 # $(MAKE) is used so the sub-makes are the same make that runs this file.
 .PHONY: stage1
 stage1:
 	find . -iname '*csv.gz' | sed 's/\.gz$$/.hdf5/' | xargs -n30 -P8 $(MAKE) -j8
 # stage2: write the names of the per-file hdf5 files into part-<NUMBER>.txt
 # lists (numbered from 0), FILES_PER_PART names per list.
 #  * INPUT_DIR may be given on the command line; the current directory is
 #    searched when it is empty or unset.
 #  * part-*.hdf5 and $(SINGLE_FILE) are skipped so that re-running this stage
 #    after stage3/stage4 does not feed their outputs back in.
 #  * '$$0' passes a literal $0 to awk (a bare $0 would be expanded by make).
 #  * each list is closed before the next one is opened, so the number of
 #    simultaneously open files stays at one.
 .PHONY: stage2
 stage2:
 	find $(if $(strip $(INPUT_DIR)),$(INPUT_DIR),.) -iname '*.hdf5' ! -name 'part-*.hdf5' ! -name '$(SINGLE_FILE)' | awk -v n=$(FILES_PER_PART) '(NR - 1) % n == 0 { if (out != "") close(out); out = "part-" int((NR - 1) / n) ".txt" } { print $$0 > out }'

 # stage3 builds every file in $(PARTS); stage4 builds $(SINGLE_FILE).
 # Both are command names, not files, hence phony.
 .PHONY: stage3 stage4
 stage3: $(PARTS)
 stage4: $(SINGLE_FILE)


 # Convert one decompressed csv file ($<) into its hdf5 counterpart ($@).
 %.csv.hdf5: %.csv
 	vaex convert file $< $@

 # Decompress one input file. The data goes to a temporary name and is renamed
 # only after gunzip succeeds, so a failed or interrupted run cannot leave a
 # truncated .csv that make would consider up to date.
 %.csv : %.csv.gz
 	gunzip -c $< > $@.tmp && mv -f $@.tmp $@ || { rm -f $@.tmp; exit 1; }

 # Shared recipe: run vaex convert on the file list named by the first
 # prerequisite (passed as @<list>), sorting by source_id, writing the target.
 define merge_sorted
 vaex convert --progress --sort=source_id file @$< $@
 endef

 part-%.hdf5: part-%.txt
 	$(merge_sorted)

 group_%.hdf5: group_%.txt
 	$(merge_sorted)

 # Merge the partial files into the final output. The list handed to vaex is
 # collected into single.txt; the search is limited to regular files in the
 # current directory, the same scope used to discover the part-*.txt lists
 # that $(PARTS) is derived from.
 $(SINGLE_FILE): $(PARTS)
 	find . -maxdepth 1 -type f -iname 'part*.hdf5' > single.txt
 	vaex convert --progress --sort=source_id file @single.txt $@


 #.PRECIOUS: %.csv
 #.PRECIOUS:  %.csv.hdf5
 # Intent: don't delete intermediate files.
 # NOTE(review): the GNU make manual describes .PRECIOUS only in terms of the
 # targets listed as its prerequisites; with the list empty, as here, it
 # presumably protects nothing. Confirm, and if the intermediates must be kept
 # re-enable one of the pattern lines above or use '.SECONDARY:'.
 .PRECIOUS:
	# Makefile for converting the CSV files from http://cdn.gea.esac.esa.int/Gaia/gdr2/gaia_source/csv/
	# to a single (vaex) hdf5 file
	# * https://docs.vaex.io
	# * https://github.com/maartenbreddels/vaex/
	# It is multistage to work around opening 60 000 files at once.
	# Strategy is
	# * stage1: convert all csv.gz to csv to hdf5
	# * do this via xargs and calling make again, since gmake has trouble matching 60 000 rules
	# * stage2: Create part-<NUMBER>.txt files containing max FILES_PER_PART per file
	# * stage3: convert the list of hdf5 files to single hdf5 files (part-<NUMBER>.hdf5)
	# * stage4: convert the partial files, to a single file (SINGLE_FILE)
	# Possible use
	# $ make stage1
	# $ make stage2
	# $ make stage3 -j8
	# $ make stage4
	# Number of per-file hdf5 names written to each part-<NUMBER>.txt by stage2.
	FILES_PER_PART = 10

	# Compressed inputs in the current directory and the names derived from them:
	#   foo.csv.gz -> foo.csv -> foo.csv.hdf5
	# ':=' makes each find run once, at parse time, instead of on every expansion.
	ZIPPED := $(shell find . -maxdepth 1 -type f -name '*.csv.gz')
	CSVS := $(patsubst %.csv.gz,%.csv,$(ZIPPED))
	HDF5S := $(patsubst %.csv,%.csv.hdf5,$(CSVS))

	# part-<NUMBER>.txt lists present when make starts, and the part-<NUMBER>.hdf5
	# file that corresponds to each of them.
	PARTS_TXT := $(shell find . -maxdepth 1 -type f -name 'part*.txt')
	PARTS := $(patsubst %.txt,%.hdf5,$(PARTS_TXT))
	# Name of the final merged output.
	SINGLE_FILE = gaia-dr2-sort-by-source_id.hdf5

	# Default goal: performs no work, it only points the user at the comments.
	# Declared phony so a file named 'all' cannot shadow it.
	.PHONY: all
	all:
		echo "Read the comments in this makefile"

	# stage1: turn the name of every *.csv.gz found below the current directory
	# into the matching *.csv.hdf5 name and hand those names, 30 at a time, to
	# sub-makes that build them through the pattern rules of this makefile.
	# Change -P8 (number of parallel sub-makes) to ~number of cores on your system.
	# The pipes are plain '|' (an escaped '\|' would reach find as an argument),
	# the sed expression is anchored so only the trailing '.gz' is rewritten, and
	# $(MAKE) is used so the sub-makes are the same make that runs this file.
	.PHONY: stage1
	stage1:
		find . -iname '*csv.gz' | sed 's/\.gz$$/.hdf5/' | xargs -n30 -P8 $(MAKE) -j8
	# stage2: write the names of the per-file hdf5 files into part-<NUMBER>.txt
	# lists (numbered from 0), FILES_PER_PART names per list.
	#  * INPUT_DIR may be given on the command line; the current directory is
	#    searched when it is empty or unset.
	#  * part-*.hdf5 and $(SINGLE_FILE) are skipped so that re-running this stage
	#    after stage3/stage4 does not feed their outputs back in.
	#  * the pipe is a plain '|' and '$$0' passes a literal $0 to awk.
	#  * each list is closed before the next one is opened, so the number of
	#    simultaneously open files stays at one.
	.PHONY: stage2
	stage2:
		find $(if $(strip $(INPUT_DIR)),$(INPUT_DIR),.) -iname '*.hdf5' ! -name 'part-*.hdf5' ! -name '$(SINGLE_FILE)' | awk -v n=$(FILES_PER_PART) '(NR - 1) % n == 0 { if (out != "") close(out); out = "part-" int((NR - 1) / n) ".txt" } { print $$0 > out }'

	# stage3 builds every file in $(PARTS); stage4 builds $(SINGLE_FILE).
	# Both are command names, not files, hence phony.
	.PHONY: stage3 stage4
	stage3: $(PARTS)
	stage4: $(SINGLE_FILE)


	# Convert one decompressed csv file ($<) into its hdf5 counterpart ($@).
	%.csv.hdf5: %.csv
		vaex convert file $< $@

	# Decompress one input file. The data goes to a temporary name and is renamed
	# only after gunzip succeeds, so a failed or interrupted run cannot leave a
	# truncated .csv that make would consider up to date.
	%.csv : %.csv.gz
		gunzip -c $< > $@.tmp && mv -f $@.tmp $@ || { rm -f $@.tmp; exit 1; }

	# Shared recipe: run vaex convert on the file list named by the first
	# prerequisite (passed as @<list>), sorting by source_id, writing the target.
	define merge_sorted
	vaex convert --progress --sort=source_id file @$< $@
	endef

	part-%.hdf5: part-%.txt
		$(merge_sorted)

	group_%.hdf5: group_%.txt
		$(merge_sorted)

	# Merge the partial files into the final output. The list handed to vaex is
	# collected into single.txt; the search is limited to regular files in the
	# current directory, the same scope used to discover the part-*.txt lists
	# that $(PARTS) is derived from.
	$(SINGLE_FILE): $(PARTS)
		find . -maxdepth 1 -type f -iname 'part*.hdf5' > single.txt
		vaex convert --progress --sort=source_id file @single.txt $@


	#.PRECIOUS: %.csv
	#.PRECIOUS: %.csv.hdf5
	# Intent: don't delete intermediate files.
	# NOTE(review): the GNU make manual describes .PRECIOUS only in terms of the
	# targets listed as its prerequisites; with the list empty, as here, it
	# presumably protects nothing. Confirm, and if the intermediates must be kept
	# re-enable one of the pattern lines above or use '.SECONDARY:'.
	.PRECIOUS: