jcpsantiago · November 18, 2021 18:04 · jcpsantiago · Oct 5, 2021
diff --git a/bulgogi.clj b/bulgogi.clj
 (ns bulgogi.main
  "
  A bulgogi prototype. Not seasoned yet. ⚠️
  
  An exploration on how a simple feature engineering system could look like in Clojure.
  
  Features are defined as pure functions and take an input-data map as input:
  {:input-data {:email \"[email protected]\"
                :current-amount 3455
                :previous-amount 2344}}
  "
  (:require
    [clojure.edn :as edn]
    [clojure.string :as s]))


 ;; ---- utilities -----
 (defn boolean->int
  "Cast a boolean to 1/0 integer indicator"
  [b]
  (when boolean? b
        (if (true? b) 1 0)))


 ;; ---- features -----
 ;; Features are variables used in Machine Learning models. They're like fn args, if a model was just
 ;; a function: `(model feature1 feature2 ...)`.
 ;; Becacuse of referential transparency, here they are just pure functions. 
 ;; In this prototype, they all follow the same contract taking
 ;; an input-map as input, extracting what they need and returning some value.
 ;; The following features are representative of the types I've used in production.

 (defn email-name
  "Lower-cased name of an email address (the bit before @)"
  [{email :email}]
  (-> email
      s/lower-case
      (s/replace-first #"@.*" "")))


 (defn n-digits-in-email-name
  "Number of digits in the email name"
  [input-data]
  ;; we get dependency management for free because everything is just functions
  (->> (email-name input-data)
       (re-seq #"\d")
       count))


 (defn n-chars-in-email-mail
  "Number of characters in the email name i.e. length of the email name"
  [input-data]
  (-> (email-name input-data)
      count))


 (defn diff-eur-previous-order
  "Difference in euros between the current order and the previous one."
  [{current-amount :current-amount previous-amount :previous-amount}]
  (- current-amount previous-amount))


 (defn risky-item?
  "Boolean depending on whether an item is risky or not"
  [{brand :brand}]
  (->> brand
       s/lower-case
       (re-seq #"baz corp")
       some?
       boolean->int))


 (defn contains-risky-item
  "Indicator 1/0 depending on whether a risky item is present in the cart"
  [{items :items}]
  (->> items
       (map #(risky-item? %))
       (some #(= 1 %))
       boolean->int))


 ;; ---- main functions part of the actual infrastructure, not features ----
 (defn preprocessed
  "
  Takes a request map with keys :input-data and :features.
  The first key contains an input-data map with the actual data needed to calculate features;
  The second key contains a vector with the names of the features requested.
  Looks for the features (aka functions) in the namespace and applies them to the input-data
  in parallel. 
  
  Returns a map of feature-keys and feature-values.
  "
  [req]
  (let [{:keys [input-data features]} req
        fns (->> features
                 (map #(-> % symbol resolve)))
        fn-ks (map keyword features)]
    (->> (pmap #(% input-data) fns)
         (zipmap fn-ks))))


 (defn response
  "Bundles the calculated features into a consumable response"
  [input-map preprocessed-map]
  (->> preprocessed-map
       (assoc {:request input-map} :preprocessed)))


 (comment
  ;; try it in the REPL 
  (def req
    "Example request. :input-data should be as flat as possible"
    {:input-data {:current-amount 700
                  :previous-amount 400
                  :email "[email protected]"
                  :items [{:brand "Foo Industries" :value 1234}
                          {:brand "Baz Corp" :value 35345}]}
     :features ["n-digits-in-email-name" 
                "diff-eur-previous-order"]}))


 (let [req (edn/read *in*)
      res (->> req
               preprocessed
               (response req))]
  (future
    (println "Saving response to file...")
    (spit "bulgogi_response.edn" res :append true))
  res)
	(ns bulgogi.main
	"
	A bulgogi prototype. Not seasoned yet. ⚠️

	An exploration on how a simple feature engineering system could look like in Clojure.

	Features are defined as pure functions and take an input-data map as input:
	{:input-data {:email \"[email protected]\"
	:current-amount 3455
	:previous-amount 2344}}
	"
	(:require
	[clojure.edn :as edn]
	[clojure.string :as s]))


	;; ---- utilities -----
	(defn boolean->int
	"Cast a boolean to 1/0 integer indicator"
	[b]
	(when boolean? b
	(if (true? b) 1 0)))


	;; ---- features -----
	;; Features are variables used in Machine Learning models. They're like fn args, if a model was just
	;; a function: `(model feature1 feature2 ...)`.
	;; Becacuse of referential transparency, here they are just pure functions.
	;; In this prototype, they all follow the same contract taking
	;; an input-map as input, extracting what they need and returning some value.
	;; The following features are representative of the types I've used in production.

	(defn email-name
	"Lower-cased name of an email address (the bit before @)"
	[{email :email}]
	(-> email
	s/lower-case
	(s/replace-first #"@.*" "")))


	(defn n-digits-in-email-name
	"Number of digits in the email name"
	[input-data]
	;; we get dependency management for free because everything is just functions
	(->> (email-name input-data)
	(re-seq #"\d")
	count))


	(defn n-chars-in-email-mail
	"Number of characters in the email name i.e. length of the email name"
	[input-data]
	(-> (email-name input-data)
	count))


	(defn diff-eur-previous-order
	"Difference in euros between the current order and the previous one."
	[{current-amount :current-amount previous-amount :previous-amount}]
	(- current-amount previous-amount))


	(defn risky-item?
	"Boolean depending on whether an item is risky or not"
	[{brand :brand}]
	(->> brand
	s/lower-case
	(re-seq #"baz corp")
	some?
	boolean->int))


	(defn contains-risky-item
	"Indicator 1/0 depending on whether a risky item is present in the cart"
	[{items :items}]
	(->> items
	(map #(risky-item? %))
	(some #(= 1 %))
	boolean->int))


	;; ---- main functions part of the actual infrastructure, not features ----
	(defn preprocessed
	"
	Takes a request map with keys :input-data and :features.
	The first key contains an input-data map with the actual data needed to calculate features;
	The second key contains a vector with the names of the features requested.
	Looks for the features (aka functions) in the namespace and applies them to the input-data
	in parallel.

	Returns a map of feature-keys and feature-values.
	"
	[req]
	(let [{:keys [input-data features]} req
	fns (->> features
	(map #(-> % symbol resolve)))
	fn-ks (map keyword features)]
	(->> (pmap #(% input-data) fns)
	(zipmap fn-ks))))


	(defn response
	"Bundles the calculated features into a consumable response"
	[input-map preprocessed-map]
	(->> preprocessed-map
	(assoc {:request input-map} :preprocessed)))


	(comment
	;; try it in the REPL
	(def req
	"Example request. :input-data should be as flat as possible"
	{:input-data {:current-amount 700
	:previous-amount 400
	:email "[email protected]"
	:items [{:brand "Foo Industries" :value 1234}
	{:brand "Baz Corp" :value 35345}]}
	:features ["n-digits-in-email-name"
	"diff-eur-previous-order"]}))


	(let [req (edn/read in)
	res (->> req
	preprocessed
	(response req))]
	(future
	(println "Saving response to file...")
	(spit "bulgogi_response.edn" res :append true))
	res)
No results found