#### Scaling Up All Pairs Similarity Search
#### Paper understanding: maxweight_i(V) is the maximum weight of column (feature) i over all vectors in V; maxweight(x) is the maximum weight over all entries of x.
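A minimal sketch of these two definitions, treating a sparse vector as a map from feature id to weight (the names `SparseVec`, `maxweight`, and `maxweightPerFeature` are mine, not from the paper or its code):

```scala
// The two quantities used by the All-Pairs pruning bounds.
type SparseVec = Map[Int, Double]   // featureId -> weight

// maxweight(x): the largest weight appearing in a single vector x.
def maxweight(x: SparseVec): Double =
  if (x.isEmpty) 0.0 else x.values.max

// maxweight_i(V): for each feature (column) i, the largest weight of that feature
// over all vectors in the dataset V.
def maxweightPerFeature(vectors: Seq[SparseVec]): Map[Int, Double] =
  vectors.flatten
    .groupBy { case (featureId, _) => featureId }
    .map { case (featureId, entries) => featureId -> entries.map(_._2).max }
```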
#### Code reading:
Google's open-source implementation is the All-Pairs-Binary variant.
- Several important data structures (a rough sketch of the central one, the inverted index, follows below)
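The central structure in the All-Pairs family is an inverted index from feature id to the vectors indexed under that feature. The sketch below is simplified for the binary case: the class and method names are mine, and it indexes every feature, whereas the real algorithm only indexes a suffix of each vector.

```scala
import scala.collection.mutable

// Simplified inverted index for binary vectors: featureId -> ids of vectors containing it.
class InvertedIndex {
  private val postings = mutable.Map.empty[Int, mutable.ArrayBuffer[Int]]

  def add(vectorId: Int, features: Seq[Int]): Unit =
    features.foreach { f =>
      postings.getOrElseUpdate(f, mutable.ArrayBuffer.empty[Int]) += vectorId
    }

  // Candidate generation: previously indexed vectors sharing at least one feature,
  // together with the number of shared features (the dot product for binary data).
  def candidates(features: Seq[Int]): Map[Int, Int] =
    features
      .flatMap(f => postings.getOrElse(f, mutable.ArrayBuffer.empty[Int]))
      .groupBy(identity)
      .map { case (id, hits) => id -> hits.size }
}
```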
//generate tagId-tagId-cos similarity result
import spark.util.Vector
import scala.math.sqrt
import java.io._

val word_vec_size = 150

// Each input line is "<id> <v1> <v2> ..."; keep the word_vec_size values after the id.
def parseVector(line: String): Vector = {
  new Vector(line.split(' ').slice(1, word_vec_size + 1).map(_.toDouble))
}
/*
This script parses a text file like:
3 {"list":[[8702,9630,3],[192470,8502,3],[25234,4160,3]]}
into:
3\t8702\t192470\t25234
and sorts by tagId count ascendingly, to fit the format described in the paper
"Scaling Up All Pairs Similarity Search"
and to calculate item-to-item similarity.
*/
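A rough sketch (not the original script) of the transform the comment describes, assuming an existing SparkContext `sc` and placeholder HDFS paths; a regular expression stands in for a real JSON parser:

```scala
// Input lines look like:  3 {"list":[[8702,9630,3],[192470,8502,3],[25234,4160,3]]}
// Output lines look like: 3\t8702\t192470\t25234
val inputPath = "hdfs://.../tag_lists"   // placeholder path
val tripleRe = """\[(\d+),(\d+),(\d+)\]""".r

// Take the leading key and the first element of every inner triple (the tagId, per the comment above).
def parseLine(line: String): (String, Seq[Long]) = {
  val key = line.takeWhile(c => !c.isWhitespace)
  val tagIds = tripleRe.findAllMatchIn(line).map(_.group(1).toLong).toSeq
  (key, tagIds)
}

val records = sc.textFile(inputPath).map(parseLine).cache()

// Order the tags inside each record by overall frequency, ascending, which is the
// ordering the comment above refers to; collectAsMap is fine for a modest tag vocabulary.
val tagCounts = records.flatMap(_._2).map(t => (t, 1L)).reduceByKey(_ + _).collectAsMap()
val formatted = records.map { case (key, tags) =>
  (key +: tags.sortBy(t => tagCounts.getOrElse(t, 0L)).map(_.toString)).mkString("\t")
}
formatted.saveAsTextFile("hdfs://.../tag_lists_sorted")   // placeholder path
```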
##### lshkit data format and its load code
void Matrix<T>::load (std::istream &is)
{
    unsigned header[3]; /* entry size, row, col */
    assert(sizeof header == 3*4);
    is.read((char *)header, sizeof header);
    BOOST_VERIFY(is);
    BOOST_VERIFY(header[0] == sizeof(T));
    reset(header[2], header[1]);
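For reference, the same file layout can be read from the JVM side. The sketch below is illustrative (not part of lshkit) and assumes 4-byte float entries in a little-endian file, as written on x86:

```scala
import java.io.{DataInputStream, FileInputStream}
import java.nio.{ByteBuffer, ByteOrder}

// Reads the format shown above: a 12-byte header of three unsigned 32-bit ints
// (entry size, number of rows, dimension) followed by the entries in row-major order.
def loadLshkitMatrix(path: String): Array[Array[Float]] = {
  val in = new DataInputStream(new FileInputStream(path))
  try {
    def readBytes(n: Int): ByteBuffer = {
      val buf = new Array[Byte](n)
      in.readFully(buf)
      ByteBuffer.wrap(buf).order(ByteOrder.LITTLE_ENDIAN)
    }
    val header = readBytes(12)
    val entrySize = header.getInt()
    val rows      = header.getInt()
    val dim       = header.getInt()
    require(entrySize == 4, s"expected 4-byte float entries, got entry size $entrySize")
    Array.fill(rows) {
      val row = readBytes(entrySize * dim)
      Array.fill(dim)(row.getFloat())
    }
  } finally in.close()
}
```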
// main.cpp : Defines the entry point for the console application.
/*
g++ step1-ocr.cpp -I /home/ataosky/software/cvblobs8.3_linux -L /home/ataosky/software/cvblobs8.3_linux -lopencv_core -lopencv_ml -lopencv_imgproc -lopencv_highgui -lblob -o step2-ocr
*/
//
//#include "stdafx.h" //AO
#include "opencv/cv.h"
#include "opencv/highgui.h"
#include "opencv/ml.h"
#convert mahout data set format to scikit-learn
#mahout descriptor: -d N 3 C 2 N C 4 N C 8 N 2 C 19 N L
#scikit-learn: features must be numeric
import sys
import argparse
import numpy
from sklearn.cross_validation import cross_val_score  # sklearn.model_selection in newer scikit-learn
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
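As a side note, the Mahout `describe` descriptor above uses `N` for a numeric column, `C` for a categorical column, `L` for the label, and a number to repeat the following type. A small sketch of expanding it into one tag per column (written in Scala to stay consistent with the other snippets in these notes; the helper name is mine):

```scala
// Expand a Mahout "describe" descriptor such as "N 3 C 2 N C 4 N C 8 N 2 C 19 N L"
// into one tag per column ("N" numeric, "C" categorical, "L" label).
def expandDescriptor(desc: String): List[String] = {
  def go(tokens: List[String]): List[String] = tokens match {
    case Nil => Nil
    case count :: tpe :: rest if count.forall(_.isDigit) =>
      List.fill(count.toInt)(tpe) ::: go(rest)
    case tpe :: rest => tpe :: go(rest)
  }
  go(desc.trim.split("\\s+").toList)
}

// The descriptor above expands to 41 feature columns plus one trailing "L" label column;
// the "C" columns are the ones that need integer or one-hot encoding for scikit-learn.
```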
import spark.util.Vector
import scala.math.sqrt

// Cosine similarity: dot(a, b) / (||a|| * ||b||); each norm is computed as the square
// root of the squared distance from the zero vector.
def cosineDist(a: Vector, b: Vector): Double = {
  require(a.length == b.length, "vectors must have the same dimension")
  (a dot b) / sqrt(a.squaredDist(Vector.zeros(a.length)) * b.squaredDist(Vector.zeros(b.length)))
}
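A small usage sketch, assuming an existing SparkContext `sc`; this is the brute-force O(n²) pairing that the All-Pairs algorithm is meant to avoid, so it is only practical for small inputs (the ids and values are made up):

```scala
val vecs = sc.parallelize(Seq(
  (1L, new Vector(Array(1.0, 0.0, 2.0))),
  (2L, new Vector(Array(0.5, 0.0, 1.0))),
  (3L, new Vector(Array(0.0, 3.0, 0.0)))
))

// Every unordered pair exactly once, with its cosine similarity.
val sims = vecs.cartesian(vecs)
  .filter { case ((id1, _), (id2, _)) => id1 < id2 }
  .map { case ((id1, v1), (id2, v2)) => (id1, id2, cosineDist(v1, v2)) }

sims.collect().foreach(println)   // (1,2,1.0), (1,3,0.0), (2,3,0.0)
```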
import spark.util.Vector
val word_vec_size = 150
def parseVector(line: String): Vector = {
  new Vector(line.split(' ').slice(1, word_vec_size + 1).map(_.toDouble))
}
// Return the index of the center nearest to p (by squared Euclidean distance).
def closestPoint(p: Vector, centers: Array[Vector]): Int = {
  var bestIndex = 0
  var closest = Double.PositiveInfinity
  for (i <- centers.indices) {
    val d = p.squaredDist(centers(i))
    if (d < closest) { closest = d; bestIndex = i }
  }
  bestIndex
}
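A sketch of how `closestPoint` is typically used in a single k-means assignment/update step; `points` (an RDD of vectors parsed with `parseVector`) and `centers` (the current centroids) are assumed to exist and are not defined here:

```scala
// Assign each point to its nearest center, then recompute every center as the mean
// of the points assigned to it.
val assigned = points.map(p => (closestPoint(p, centers), (p, 1)))
val newCenters = assigned
  .reduceByKey { case ((sum1, n1), (sum2, n2)) => (sum1 + sum2, n1 + n2) }
  .map { case (idx, (sum, n)) => (idx, sum / n) }
  .collectAsMap()
```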
/**
DBSCAN(D, eps, MinPts)
   C = 0
   for each unvisited point P in dataset D
      mark P as visited
      NeighborPts = regionQuery(P, eps)
      if sizeof(NeighborPts) < MinPts
         mark P as NOISE
      else
         C = next cluster
         expandCluster(P, NeighborPts, C, eps, MinPts)
*/
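A compact, runnable sketch of this pseudocode in plain Scala, over an in-memory array of points with Euclidean distance; all names (`DbscanSketch`, `regionQuery`, `dbscan`) are illustrative, and cluster id 0 means noise/unassigned:

```scala
object DbscanSketch {
  type Point = Array[Double]

  def distance(a: Point, b: Point): Double =
    math.sqrt(a.zip(b).map { case (x, y) => (x - y) * (x - y) }.sum)

  // regionQuery(P, eps): indices of all points within eps of points(i), including i itself.
  def regionQuery(points: Array[Point], i: Int, eps: Double): Seq[Int] =
    points.indices.filter(j => distance(points(i), points(j)) <= eps)

  // Returns one cluster id per point; 0 = noise/unassigned, clusters are numbered from 1.
  def dbscan(points: Array[Point], eps: Double, minPts: Int): Array[Int] = {
    val labels = Array.fill(points.length)(0)
    val visited = Array.fill(points.length)(false)
    var c = 0
    for (p <- points.indices if !visited(p)) {
      visited(p) = true
      val neighbors = regionQuery(points, p, eps)
      if (neighbors.size < minPts) {
        // density too low: P stays labelled 0 (NOISE) unless a later cluster claims it
      } else {
        c += 1                                    // C = next cluster
        labels(p) = c
        val queue = scala.collection.mutable.Queue(neighbors: _*)
        while (queue.nonEmpty) {                  // expandCluster
          val q = queue.dequeue()
          if (!visited(q)) {
            visited(q) = true
            val qNeighbors = regionQuery(points, q, eps)
            if (qNeighbors.size >= minPts) queue ++= qNeighbors
          }
          if (labels(q) == 0) labels(q) = c       // claim unassigned / noise points
        }
      }
    }
    labels
  }

  def main(args: Array[String]): Unit = {
    val pts: Array[Point] = Array(
      Array(1.0, 1.0), Array(1.1, 1.0), Array(0.9, 1.1),   // dense blob -> cluster 1
      Array(8.0, 8.0), Array(8.1, 7.9), Array(7.9, 8.1),   // dense blob -> cluster 2
      Array(4.5, 4.5)                                       // isolated point -> noise (0)
    )
    println(dbscan(pts, eps = 0.5, minPts = 3).mkString(" "))   // prints: 1 1 1 2 2 2 0
  }
}
```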
val tags = file
  .filter(line => label_name_map.contains(line.split("\t")(0).toLong))
  .map(line => (line.split("\t")(1).toLong -> label_name_map(line.split("\t")(0).toLong)))
  .sortByKey(false)
tags.saveAsTextFile("hdfs://finger-test2:54310/home/TagHierarchy/tag_count_sorted")