robinkraft · December 14, 2015 22:18
diff --git a/gistfile1.clj b/gistfile1.clj
 (use 'gulo.views-test)
 (in-ns 'gulo.views-test)

 ;; Build some test data with mk-test-data (from the views-test ns).
 ;; mk-test-data uses to-pail, which calls .absorb on the temporary
 ;; pail it creates, moving that data into PAIL-PATH.

 (mk-test-data)

 ;; Consolidate the pail from its root. Called with no arguments,
 ;; .consolidate uses its default maximum file size of 128mb.
 (.consolidate (Pail. PAIL-PATH))

 ;; Whoops! That consolidated all the RecordProperty records into one 
 ;; file in vn-test-tmp/prop/RecordProperty, and all the OrganizationProperty 
 ;; records into vn-test-tmp/OrganizationProperty. Yep, this destroys 
 ;; the beautiful vertical partitioning schema we're using

 ;; Ok, let's start over (delete the pail first), adding a bit more data
 ;; this time by generating the test data three times in a row.

 (dotimes [_ 3]
   (mk-test-data))

 ;; To retain the vertical partitioning, .consolidate has to be called on
 ;; each RecordProperty subdirectory separately instead of on the pail root.
 ;; The subdirectories are processed in the order listed.

 (doseq [subdir ["Event" "GeologicalContext" "Identification"
                 "Location" "MeasurementOrFact" "Occurrence"
                 "RecordLevel" "ResourceRelationship" "Taxon"]]
   (.consolidate (Pail. (str PAIL-PATH "/prop/RecordProperty/" subdir))))

 ;; This produces a new randomly named directory with large files of records 
 ;; inside the directory given. If you've already consolidated once then
 ;; add records, you can consolidate again. This produces another randomly  
 ;; named directory with big files filled with records. The old randomly named
 ;; directory is now empty.

 ;; As a result, pulling records out of the /prop/RecordProperty/Taxon directory, for example 
 ;; still works as expected, since Hadoop will look for data inside each subdirectory 
 ;; that has been created. The empty ones don't get in the way, and the subdirectory that
 ;; does contain data will have the appropriate records (e.g. Taxon).

 ;; HOWEVER: using (take 1 (Pail. PAIL-PATH)) on a consolidated pail doesn't seem to work,
 ;; at least not using a naive approach. It seems Pail. doesn't like having data in those subdirectories
 ;; instead of in the expected /prop/RecordProperty/Taxon. Fortunately, Cascalog queries don't
 ;; seem to have a problem with that, so it's not such a big deal.

 ;; It's still annoying not being able to use our normal partitioning schema,
 ;; and having to consolidate each subdirectory separately.

 ;; Ask for a test record from the prop/RecordProperty/Taxon partition.
 ;; NOTE(review): presumably the nil below illustrates the read problem
 ;; described above for consolidated pails -- confirm against get-test-record.
 (get-test-record ["prop" "RecordProperty" "Taxon"])
 ;=> nil

 ;; references:
 ;; http://www.manning-sandbox.com/thread.jspa?threadID=48985
 ;; https://groups.google.com/forum/#!topic/cascalog-user/y7SWXANaM4k
	(use 'gulo.views-test)
	(in-ns 'gulo.views-test)

	;; Build some test data with mk-test-data (from the views-test ns).
	;; mk-test-data uses to-pail, which calls .absorb on the temporary
	;; pail it creates, moving that data into PAIL-PATH.

	(mk-test-data)

	;; Consolidate the pail from its root. Called with no arguments,
	;; .consolidate uses its default maximum file size of 128mb.
	(.consolidate (Pail. PAIL-PATH))

	;; Whoops! That consolidated all the RecordProperty records into one
	;; file in vn-test-tmp/prop/RecordProperty, and all the OrganizationProperty
	;; records into vn-test-tmp/OrganizationProperty. Yep, this destroys
	;; the beautiful vertical partitioning schema we're using

	;; Ok, let's start over (delete the pail first), adding a bit more data
	;; this time by generating the test data three times in a row.

	(dotimes [_ 3]
	  (mk-test-data))

	;; To retain the vertical partitioning, .consolidate has to be called on
	;; each RecordProperty subdirectory separately instead of on the pail root.
	;; The subdirectories are processed in the order listed.

	(doseq [subdir ["Event" "GeologicalContext" "Identification"
	                "Location" "MeasurementOrFact" "Occurrence"
	                "RecordLevel" "ResourceRelationship" "Taxon"]]
	  (.consolidate (Pail. (str PAIL-PATH "/prop/RecordProperty/" subdir))))

	;; This produces a new randomly named directory with large files of records
	;; inside the directory given. If you've already consolidated once then
	;; add records, you can consolidate again. This produces another randomly
	;; named directory with big files filled with records. The old randomly named
	;; directory is now empty.

	;; As a result, pulling records out of the /prop/RecordProperty/Taxon directory, for example
	;; still works as expected, since Hadoop will look for data inside each subdirectory
	;; that has been created. The empty ones don't get in the way, and the subdirectory that
	;; does contain data will have the appropriate records (e.g. Taxon).

	;; HOWEVER: using (take 1 (Pail. PAIL-PATH)) on a consolidated pail doesn't seem to work,
	;; at least not using a naive approach. It seems Pail. doesn't like having data in those subdirectories
	;; instead of in the expected /prop/RecordProperty/Taxon. Fortunately, Cascalog queries don't
	;; seem to have a problem with that, so it's not such a big deal.

	;; It's still annoying not being able to use our normal partitioning schema,
	;; and having to consolidate each subdirectory separately.

	;; Ask for a test record from the prop/RecordProperty/Taxon partition.
	;; NOTE(review): presumably the nil below illustrates the read problem
	;; described above for consolidated pails -- confirm against get-test-record.
	(get-test-record ["prop" "RecordProperty" "Taxon"])
	;=> nil

	;; references:
	;; http://www.manning-sandbox.com/thread.jspa?threadID=48985
	;; https://groups.google.com/forum/#!topic/cascalog-user/y7SWXANaM4k
No results found