Last active
November 18, 2021 18:04
-
-
Save jcpsantiago/320e3665a9bd749fc25ede0341c6323c to your computer and use it in GitHub Desktop.
Proof of concept for bulgogi/feature processing system
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns bulgogi.main | |
" | |
A bulgogi prototype. Not seasoned yet. ⚠️ | |
An exploration on how a simple feature engineering system could look like in Clojure. | |
Features are defined as pure functions and take an input-data map as input: | |
{:input-data {:email \"[email protected]\" | |
:current-amount 3455 | |
:previous-amount 2344}} | |
" | |
(:require | |
[clojure.edn :as edn] | |
[clojure.string :as s])) | |
;; ---- utilities ----- | |
(defn boolean->int | |
"Cast a boolean to 1/0 integer indicator" | |
[b] | |
(when boolean? b | |
(if (true? b) 1 0))) | |
;; ---- features ----- | |
;; Features are variables used in Machine Learning models. They're like fn args, if a model was just | |
;; a function: `(model feature1 feature2 ...)`. | |
;; Becacuse of referential transparency, here they are just pure functions. | |
;; In this prototype, they all follow the same contract taking | |
;; an input-map as input, extracting what they need and returning some value. | |
;; The following features are representative of the types I've used in production. | |
(defn email-name | |
"Lower-cased name of an email address (the bit before @)" | |
[{email :email}] | |
s/lower-case | |
(s/replace-first #"@.*" ""))) | |
(defn n-digits-in-email-name | |
"Number of digits in the email name" | |
[input-data] | |
;; we get dependency management for free because everything is just functions | |
(->> (email-name input-data) | |
(re-seq #"\d") | |
count)) | |
(defn n-chars-in-email-mail | |
"Number of characters in the email name i.e. length of the email name" | |
[input-data] | |
(-> (email-name input-data) | |
count)) | |
(defn diff-eur-previous-order | |
"Difference in euros between the current order and the previous one." | |
[{current-amount :current-amount previous-amount :previous-amount}] | |
(- current-amount previous-amount)) | |
(defn risky-item? | |
"Boolean depending on whether an item is risky or not" | |
[{brand :brand}] | |
(->> brand | |
s/lower-case | |
(re-seq #"baz corp") | |
some? | |
boolean->int)) | |
(defn contains-risky-item | |
"Indicator 1/0 depending on whether a risky item is present in the cart" | |
[{items :items}] | |
(->> items | |
(map #(risky-item? %)) | |
(some #(= 1 %)) | |
boolean->int)) | |
;; ---- main functions part of the actual infrastructure, not features ---- | |
(defn preprocessed | |
" | |
Takes a request map with keys :input-data and :features. | |
The first key contains an input-data map with the actual data needed to calculate features; | |
The second key contains a vector with the names of the features requested. | |
Looks for the features (aka functions) in the namespace and applies them to the input-data | |
in parallel. | |
Returns a map of feature-keys and feature-values. | |
" | |
[req] | |
(let [{:keys [input-data features]} req | |
fns (->> features | |
(map #(-> % symbol resolve))) | |
fn-ks (map keyword features)] | |
(->> (pmap #(% input-data) fns) | |
(zipmap fn-ks)))) | |
(defn response | |
"Bundles the calculated features into a consumable response" | |
[input-map preprocessed-map] | |
(->> preprocessed-map | |
(assoc {:request input-map} :preprocessed))) | |
(comment | |
;; try it in the REPL | |
(def req | |
"Example request. :input-data should be as flat as possible" | |
{:input-data {:current-amount 700 | |
:previous-amount 400 | |
:email "[email protected]" | |
:items [{:brand "Foo Industries" :value 1234} | |
{:brand "Baz Corp" :value 35345}]} | |
:features ["n-digits-in-email-name" | |
"diff-eur-previous-order"]})) | |
(let [req (edn/read *in*) | |
res (->> req | |
preprocessed | |
(response req))] | |
(future | |
(println "Saving response to file...") | |
(spit "bulgogi_response.edn" res :append true)) | |
res) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@daslu asked when would side-effects happen (i.e. saving all results to a database). I think there are two options, of which I prefer the latter:
defn-feature
which wrapsdefn
and stores its return value in some global atom, which is then flushed when responding back to the client. Clear issue is the fact we would need to manage state, which always adds complexity.In this Gist example, the affected feature would be
email-name
in case we only care aboutn-digits-in-email
, because it's only a dependency and not a "final feature". Either option above would result inemail-name
also being stored to persistent storage.