Last active
November 9, 2015 22:44
-
-
Save c-garcia/85d32ea8a21d6dc25cb6 to your computer and use it in GitHub Desktop.
Some experiments with HDFS MapFiles and hdfs-clj
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns hdp-clj-wb.core | |
"Some tests with hadoop mapfiles. Example usage. | |
* Create a file with 1000 keys (from 000 to 999) | |
(hdp-clj-wb.core/make-map-file \"hdfs://localhost:9000/user/me/map3\" 1000) | |
* Get the MD5 of the key 234 of 1000 | |
(hdp-clj-wb.core/get-md5 \"hdfs://localhost:9000/user/me/map3\" 234 1000) | |
* Get 4 items from the key 990 of 1000 | |
(hdp-clj-wb.core/get-some-md5 \"hdfs://localhost:9000/user/me/map3\" 990 1000 4) | |
" | |
(:require [hdfs.core :as hdfs]) | |
(:import (org.apache.hadoop.io Text MapFile MapFile$Writer SequenceFile$Writer$Option) | |
(org.apache.hadoop.io MapFile$Reader SequenceFile$Reader$Option) | |
(java.security MessageDigest))) | |
(defn format-key
  "Returns idx rendered as a zero-padded decimal string.

  The width is the number of decimal digits of the largest index in a
  collection of `total` records, i.e. of (dec total), with a minimum of one
  digit. All keys for the same `total` therefore have the same length and
  sort lexicographically in numeric order.

  idx   - integer index to format
  total - number of records the index belongs to

  Example: (format-key 7 1000) => 007"
  [idx total]
  ;; Count digits on the integer itself instead of using log(total)/log(10):
  ;; the logarithm yields a width of 0 for total = 1 (an invalid format
  ;; string) and depends on floating-point rounding at exact powers of ten.
  (let [largest-idx (max 0 (dec (long total)))
        num-digits  (count (str largest-idx))
        format-str  (str "%0" num-digits "d")]
    (format format-str idx)))
(defn make-map-file
  "Creates a Hadoop MapFile at path-str with num-records consecutive records.

  Each key is the zero-padded decimal index of the record, starting at 0
  (000, 001, ... 999 for 1000 records), padded by format-key with as many
  zeros as needed for all the keys to sort lexicographically.
  Each value is the MD5 digest of the key bytes, rendered as colon-separated
  two-digit lowercase hex values: a0:12:32: ... :33

  path-str    - location of the MapFile, e.g. an hdfs:// URL
  num-records - number of records to write

  Returns nil."
  [path-str num-records]
  (with-open [mf (MapFile$Writer.
                  (hdfs/configuration)
                  (hdfs/make-path path-str)
                  (into-array SequenceFile$Writer$Option
                              [(MapFile$Writer/keyClass Text)
                               (MapFile$Writer/valueClass Text)]))]
    ;; One digest engine is reused for every record; it is reset before use.
    (let [md5-engine (MessageDigest/getInstance "MD5")]
      (doseq [item (range num-records)]
        (let [k   (Text. (format-key item num-records))
              ;; Text.getBytes returns the backing array, of which only the
              ;; first getLength bytes are valid, so bound the digest input.
              _   (doto md5-engine
                    .reset
                    (.update (.getBytes k) 0 (.getLength k)))
              ;; Built with core functions only: the ns form does not
              ;; require clojure.string.
              txt (apply str (interpose ":" (map (partial format "%02x")
                                                 (.digest md5-engine))))
              v   (Text. txt)]
          (.append mf k v))))))
(defn get-md5
  "Looks up a single record in a MapFile written by make-map-file.

  The key is rebuilt from idx and total with format-key, so total must be
  the record count the file was created with. Returns a [key value] vector
  of strings when the key is present, nil otherwise."
  [path-str idx total]
  (let [no-options (into-array SequenceFile$Reader$Option [])]
    (with-open [reader (MapFile$Reader.
                        (hdfs/make-path path-str)
                        (hdfs/configuration)
                        no-options)]
      (let [wanted (Text. (format-key idx total))
            holder (Text.)]
        ;; .get fills holder in place and returns nil when the key is absent.
        (if (.get reader wanted holder)
          [(str wanted) (str holder)]
          nil)))))
(defn get-some-md5
  "Scans a MapFile written by make-map-file, starting at a given key.

  The start key is built from idx and total with format-key. Returns a
  vector of up to num-records [key value] string pairs, beginning with the
  start key and continuing in file order. The vector is empty when the
  start key is not in the file."
  [path-str idx total num-records]
  (let [no-options (into-array SequenceFile$Reader$Option [])]
    (with-open [reader (MapFile$Reader.
                        (hdfs/make-path path-str)
                        (hdfs/configuration)
                        no-options)]
      ;; The same two Text objects are reused for the whole scan: .get and
      ;; .next overwrite them in place, so each pair is copied out as
      ;; strings before the reader advances.
      (let [cur-key (Text. (format-key idx total))
            cur-val (Text.)]
        (loop [more?  (.get reader cur-key cur-val)
               budget (dec num-records)
               acc    []]
          (if (and more? (not (neg? budget)))
            (let [entry [(str cur-key) (str cur-val)]]
              (recur (.next reader cur-key cur-val)
                     (dec budget)
                     (conj acc entry)))
            acc))))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment