Skip to content

Instantly share code, notes, and snippets.

View remeniuk's full-sized avatar

Vasil Remeniuk remeniuk

View GitHub Profile
/**
* The K-means clustering algorithm requires the input to be represented as vectors.
* In our case, the vector itself represents the player, where the other users the player has played with are
* the vector axes/features (the weight of a feature is the number of games played together)
* User: remeniuk
*/
class VectorBuilder(args: Args) extends Job(args) {
import Dictionary._
class Indexer(args: Args) extends Job(args) {
val output = WritableSequenceFile(args("output"), classOf[Text], classOf[IntWritable],
'userId -> 'idx)
TextLine(args("input")).read
.map(('offset -> 'line) -> ('userId -> 'idx)) {
// dictionary lines are read with indices from the TextLine source
// out of the box. For some reason, in my case, the indices were multiplied by 5, so I had to divide them back
tuple: (Int, String) => (new Text(tuple._2.toString) -> new IntWritable((tuple._1 / 5)))
// extract user ID from hand history record
val userId = (playerHistory: PlayerHandHistory) =>
new Text(playerHistory.getUserId.toString)
// Builds a basic dictionary (an enumeration, in fact) of all the players that participated in the selected subset of hand
// history records
class Builder(args: Args) extends Job(args) {
// input tap is an HTable with hand history entries: hand history id -> hand history record, serialized with ProtoBuf
val input = new HBaseSource("hand", args("hbasehost"), 'handId, Array("d"), Array('blob))
VL-0{n=5 c=[1003:3.400, 1006:3.400, 1008:3.200, 1009:3.200, 1012:3.200] r=[1003:1.744, 1006:1.744, 1008:1.600, 1009:1.600, 1012:1.600]}
Top Terms:
1006 => 3.4
1003 => 3.4
1012 => 3.2
1009 => 3.2
1008 => 3.2
VL-15{n=1 c=[1016:4.000, 1019:3.000, 1020:3.000, 1021:3.000, 1022:3.000, 1023:3.000, 1024:3.000, 1025:3.000] r=[]}
@remeniuk
remeniuk / 1.scala
Created August 10, 2012 11:00
Poker Collusion Detector
val conf = new Configuration
conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
+ "org.apache.hadoop.io.serializer.WritableSerialization")
// the path, where the vectors will be stored to
val vectorsPath = new Path("job/vectors")
// enumeration of all users involved in a selected subset of hand history records
val dictionaryPath = new Path("job/dictionary")
// text file with the dictionary size
val dictionarySizePath = new Path("job/dictionary-size")
@remeniuk
remeniuk / run.sh
Created May 6, 2012 17:17
CLI in SBT
sbt console
object DslRepl extends Build {
val host = SettingKey[String]("host", "The host that the REPL will connect to.")
val port = SettingKey[Int]("port", "The port that the REPL will connect to.")
lazy val root = Project("root", file(".")) settings (
libraryDependencies ++= Seq(
"com.vasilrem" % "somedsl" % "0.1"
),
host := "127.0.0.1",
port := 7777,
Lens<Person, Address> personAddressLens = new Lens<Person, Address>() {
@Override
public Address get(@Nullable Person person) {
return person.getAddress();
}
@Override
public Person set(Person person, Address address) {
return new Person(person.getFirstName(), person.getLastName(), address);
}
Lens<Address, Integer> addressZipCodeLens = lens(
new Function<Address, Integer>() {
@Override
public Integer apply(@Nullable Address address) {
return address.getZipCode();
}
},
new Function2<Address, Integer, Address>() {
@Override
public Address apply(Address address, Integer zipCode) {
public class TestAssemblyFlow extends Flow {
...
public String fooOperation() {
StringBuffer buffer = new StringBuffer("");
buffer.append(((TestComponentA) getAssembly().
getComponentByType("TestComponentA")).fooComponentOperation());
buffer.append(((TestComponentB) getAssembly().
getComponentByType("TestComponentB")).fooComponentOperation());