Goals: Add links to reasonable, well-written explanations of how things work. No hype and, if possible, no vendor content. Practical first-hand accounts of models in production are eagerly sought.

// Zed settings | |
// | |
// For information on how to configure Zed, see the Zed | |
// documentation: https://zed.dev/docs/configuring-zed | |
// | |
// To see all of Zed's default settings without changing your | |
// custom settings, run the `open default settings` command | |
// from the command palette or from the `Zed` application menu. | |
{ | |
// The settings for slash commands. |
import com.twitter.scalding._ | |
import com.twitter.algebird._ | |
/** | |
* More sensible aggregation with Monoids. | |
* Use SketchMap to get only the top words that we are interested in. | |
* SketchMap is a generalization of the CountMinSketch in Algebird. It holds a list of the top items. | |
* The size of the CMS does not grow, so this will not run out of memory. | |
*/ | |
class WordCount5(args: Args) extends Job(args) { |
#!/public/spark-0.9.1/bin/pyspark | |
import os | |
import sys | |
# Set the path for spark installation | |
# this is the path where you have built spark using sbt/sbt assembly | |
os.environ['SPARK_HOME'] = "/public/spark-0.9.1" | |
# os.environ['SPARK_HOME'] = "/home/jie/d2/spark-0.9.1" | |
# Append to PYTHONPATH so that pyspark can be found |
import com.twitter.scalding._ | |
import com.twitter.algebird.{ MinHasher, MinHasher32, MinHashSignature } | |
/** | |
* Computes similar items (with a string itemId), based on approximate | |
* Jaccard similarity, using LSH. | |
* | |
* Assumes an input data TSV file of the following format: | |
* | |
* itemId userId |
(defn cascalog-map | |
[op-var output-fields & {:keys [stateful?]}] | |
(let [ser (KryoService/serialize (ops/fn-spec op-var))] | |
(proxy [BaseOperation Function] [^Fields output-fields] | |
(prepare [^FlowProcess flow-process ^OperationCall op-call] | |
(let [op (Util/bootFn (KryoService/deserialize ser))] | |
(-> op-call | |
(.setContext [op (if stateful? (op))])))) | |
(operate [^FlowProcess flow-process ^FunctionCall fn-call] | |
(let [[op] (.getContext fn-call) |