#### Scaling Up All Pairs Similarity Search
#### Paper understanding: maxweight_i(V) is the maximum weight of column (feature) i over all vectors in V; maxweight(x) is the maximum weight over all entries of x.
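A minimal sketch of these two definitions, treating a sparse vector as a map from feature id to weight (the names `SparseVec`, `maxweight`, and `maxweightPerFeature` are mine, not from the paper or its code):

```scala
// The two quantities used by the All-Pairs pruning bounds.
type SparseVec = Map[Int, Double]   // featureId -> weight

// maxweight(x): the largest weight appearing in a single vector x.
def maxweight(x: SparseVec): Double =
  if (x.isEmpty) 0.0 else x.values.max

// maxweight_i(V): for each feature (column) i, the largest weight of that feature
// over all vectors in the dataset V.
def maxweightPerFeature(vectors: Seq[SparseVec]): Map[Int, Double] =
  vectors.flatten
    .groupBy { case (featureId, _) => featureId }
    .map { case (featureId, entries) => featureId -> entries.map(_._2).max }
```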
#### Code reading:
Google's open-source implementation is the All-Pairs-Binary variant.
- Several important data structures (a rough sketch of the central one, the inverted index, follows below)
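The central structure in the All-Pairs family is an inverted index from feature id to the vectors indexed under that feature. The sketch below is simplified for the binary case: the class and method names are mine, and it indexes every feature, whereas the real algorithm only indexes a suffix of each vector.

```scala
import scala.collection.mutable

// Simplified inverted index for binary vectors: featureId -> ids of vectors containing it.
class InvertedIndex {
  private val postings = mutable.Map.empty[Int, mutable.ArrayBuffer[Int]]

  def add(vectorId: Int, features: Seq[Int]): Unit =
    features.foreach { f =>
      postings.getOrElseUpdate(f, mutable.ArrayBuffer.empty[Int]) += vectorId
    }

  // Candidate generation: previously indexed vectors sharing at least one feature,
  // together with the number of shared features (the dot product for binary data).
  def candidates(features: Seq[Int]): Map[Int, Int] =
    features
      .flatMap(f => postings.getOrElse(f, mutable.ArrayBuffer.empty[Int]))
      .groupBy(identity)
      .map { case (id, hits) => id -> hits.size }
}
```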
//generate tagId-tagId-cos similarity result
import spark.util.Vector
import scala.math.sqrt
import java.io._

val word_vec_size = 150

// Each input line is "<id> <v1> <v2> ..."; keep the word_vec_size values after the id.
def parseVector(line: String): Vector = {
  new Vector(line.split(' ').slice(1, word_vec_size + 1).map(_.toDouble))
}
/*
This script parses a text file like:
3 {"list":[[8702,9630,3],[192470,8502,3],[25234,4160,3]]}
into:
3\t8702\t192470\t25234
and sorts by tagId count ascendingly, to fit the format described in the paper
"Scaling Up All Pairs Similarity Search"
and to calculate item-to-item similarity.
*/
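A rough sketch (not the original script) of the transform the comment describes, assuming an existing SparkContext `sc` and placeholder HDFS paths; a regular expression stands in for a real JSON parser:

```scala
// Input lines look like:  3 {"list":[[8702,9630,3],[192470,8502,3],[25234,4160,3]]}
// Output lines look like: 3\t8702\t192470\t25234
val inputPath = "hdfs://.../tag_lists"   // placeholder path
val tripleRe = """\[(\d+),(\d+),(\d+)\]""".r

// Take the leading key and the first element of every inner triple (the tagId, per the comment above).
def parseLine(line: String): (String, Seq[Long]) = {
  val key = line.takeWhile(c => !c.isWhitespace)
  val tagIds = tripleRe.findAllMatchIn(line).map(_.group(1).toLong).toSeq
  (key, tagIds)
}

val records = sc.textFile(inputPath).map(parseLine).cache()

// Order the tags inside each record by overall frequency, ascending, which is the
// ordering the comment above refers to; collectAsMap is fine for a modest tag vocabulary.
val tagCounts = records.flatMap(_._2).map(t => (t, 1L)).reduceByKey(_ + _).collectAsMap()
val formatted = records.map { case (key, tags) =>
  (key +: tags.sortBy(t => tagCounts.getOrElse(t, 0L)).map(_.toString)).mkString("\t")
}
formatted.saveAsTextFile("hdfs://.../tag_lists_sorted")   // placeholder path
```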
##### lshkit data format and its load code
void Matrix<T>::load (std::istream &is)
{
    unsigned header[3]; /* entry size, row, col */
    assert(sizeof header == 3*4);
    is.read((char *)header, sizeof header);
    BOOST_VERIFY(is);
    BOOST_VERIFY(header[0] == sizeof(T));
    reset(header[2], header[1]);
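For reference, the same file layout can be read from the JVM side. The sketch below is illustrative (not part of lshkit) and assumes 4-byte float entries in a little-endian file, as written on x86:

```scala
import java.io.{DataInputStream, FileInputStream}
import java.nio.{ByteBuffer, ByteOrder}

// Reads the format shown above: a 12-byte header of three unsigned 32-bit ints
// (entry size, number of rows, dimension) followed by the entries in row-major order.
def loadLshkitMatrix(path: String): Array[Array[Float]] = {
  val in = new DataInputStream(new FileInputStream(path))
  try {
    def readBytes(n: Int): ByteBuffer = {
      val buf = new Array[Byte](n)
      in.readFully(buf)
      ByteBuffer.wrap(buf).order(ByteOrder.LITTLE_ENDIAN)
    }
    val header = readBytes(12)
    val entrySize = header.getInt()
    val rows      = header.getInt()
    val dim       = header.getInt()
    require(entrySize == 4, s"expected 4-byte float entries, got entry size $entrySize")
    Array.fill(rows) {
      val row = readBytes(entrySize * dim)
      Array.fill(dim)(row.getFloat())
    }
  } finally in.close()
}
```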
// main.cpp : Defines the entry point for the console application.
/*
g++ step1-ocr.cpp -I /home/ataosky/software/cvblobs8.3_linux -L /home/ataosky/software/cvblobs8.3_linux -lopencv_core -lopencv_ml -lopencv_imgproc -lopencv_highgui -lblob -o step2-ocr
*/
//
//#include "stdafx.h" //AO
#include "opencv/cv.h"
#include "opencv/highgui.h"
#include "opencv/ml.h"
#convert mahout data set format to scikit-learn
#mahout descriptor: -d N 3 C 2 N C 4 N C 8 N 2 C 19 N L
#scikit-learn: features must be numeric
import sys
import argparse
import numpy
from sklearn.cross_validation import cross_val_score  # sklearn.model_selection in newer scikit-learn
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
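As a side note, the Mahout `describe` descriptor above uses `N` for a numeric column, `C` for a categorical column, `L` for the label, and a number to repeat the following type. A small sketch of expanding it into one tag per column (written in Scala to stay consistent with the other snippets in these notes; the helper name is mine):

```scala
// Expand a Mahout "describe" descriptor such as "N 3 C 2 N C 4 N C 8 N 2 C 19 N L"
// into one tag per column ("N" numeric, "C" categorical, "L" label).
def expandDescriptor(desc: String): List[String] = {
  def go(tokens: List[String]): List[String] = tokens match {
    case Nil => Nil
    case count :: tpe :: rest if count.forall(_.isDigit) =>
      List.fill(count.toInt)(tpe) ::: go(rest)
    case tpe :: rest => tpe :: go(rest)
  }
  go(desc.trim.split("\\s+").toList)
}

// The descriptor above expands to 41 feature columns plus one trailing "L" label column;
// the "C" columns are the ones that need integer or one-hot encoding for scikit-learn.
```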
import spark.util.Vector
import scala.math.sqrt

// Cosine similarity: dot(a, b) / (||a|| * ||b||); each norm is computed as the square
// root of the squared distance from the zero vector.
def cosineDist(a: Vector, b: Vector): Double = {
  require(a.length == b.length, "vectors must have the same dimension")
  (a dot b) / sqrt(a.squaredDist(Vector.zeros(a.length)) * b.squaredDist(Vector.zeros(b.length)))
}
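A small usage sketch, assuming an existing SparkContext `sc`; this is the brute-force O(n²) pairing that the All-Pairs algorithm is meant to avoid, so it is only practical for small inputs (the ids and values are made up):

```scala
val vecs = sc.parallelize(Seq(
  (1L, new Vector(Array(1.0, 0.0, 2.0))),
  (2L, new Vector(Array(0.5, 0.0, 1.0))),
  (3L, new Vector(Array(0.0, 3.0, 0.0)))
))

// Every unordered pair exactly once, with its cosine similarity.
val sims = vecs.cartesian(vecs)
  .filter { case ((id1, _), (id2, _)) => id1 < id2 }
  .map { case ((id1, v1), (id2, v2)) => (id1, id2, cosineDist(v1, v2)) }

sims.collect().foreach(println)   // (1,2,1.0), (1,3,0.0), (2,3,0.0)
```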
import spark.util.Vector
val word_vec_size = 150
def parseVector(line: String): Vector = {
  new Vector(line.split(' ').slice(1, word_vec_size + 1).map(_.toDouble))
}
// Return the index of the center nearest to p (by squared Euclidean distance).
def closestPoint(p: Vector, centers: Array[Vector]): Int = {
  var bestIndex = 0
  var closest = Double.PositiveInfinity
  for (i <- centers.indices) {
    val d = p.squaredDist(centers(i))
    if (d < closest) { closest = d; bestIndex = i }
  }
  bestIndex
}
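A sketch of how `closestPoint` is typically used in a single k-means assignment/update step; `points` (an RDD of vectors parsed with `parseVector`) and `centers` (the current centroids) are assumed to exist and are not defined here:

```scala
// Assign each point to its nearest center, then recompute every center as the mean
// of the points assigned to it.
val assigned = points.map(p => (closestPoint(p, centers), (p, 1)))
val newCenters = assigned
  .reduceByKey { case ((sum1, n1), (sum2, n2)) => (sum1 + sum2, n1 + n2) }
  .map { case (idx, (sum, n)) => (idx, sum / n) }
  .collectAsMap()
```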
/**
DBSCAN(D, eps, MinPts)
   C = 0
   for each unvisited point P in dataset D
      mark P as visited
      NeighborPts = regionQuery(P, eps)
      if sizeof(NeighborPts) < MinPts
         mark P as NOISE
      else
         C = next cluster
         expandCluster(P, NeighborPts, C, eps, MinPts)
*/
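A compact, runnable sketch of this pseudocode in plain Scala, over an in-memory array of points with Euclidean distance; all names (`DbscanSketch`, `regionQuery`, `dbscan`) are illustrative, and cluster id 0 means noise/unassigned:

```scala
object DbscanSketch {
  type Point = Array[Double]

  def distance(a: Point, b: Point): Double =
    math.sqrt(a.zip(b).map { case (x, y) => (x - y) * (x - y) }.sum)

  // regionQuery(P, eps): indices of all points within eps of points(i), including i itself.
  def regionQuery(points: Array[Point], i: Int, eps: Double): Seq[Int] =
    points.indices.filter(j => distance(points(i), points(j)) <= eps)

  // Returns one cluster id per point; 0 = noise/unassigned, clusters are numbered from 1.
  def dbscan(points: Array[Point], eps: Double, minPts: Int): Array[Int] = {
    val labels = Array.fill(points.length)(0)
    val visited = Array.fill(points.length)(false)
    var c = 0
    for (p <- points.indices if !visited(p)) {
      visited(p) = true
      val neighbors = regionQuery(points, p, eps)
      if (neighbors.size < minPts) {
        // density too low: P stays labelled 0 (NOISE) unless a later cluster claims it
      } else {
        c += 1                                    // C = next cluster
        labels(p) = c
        val queue = scala.collection.mutable.Queue(neighbors: _*)
        while (queue.nonEmpty) {                  // expandCluster
          val q = queue.dequeue()
          if (!visited(q)) {
            visited(q) = true
            val qNeighbors = regionQuery(points, q, eps)
            if (qNeighbors.size >= minPts) queue ++= qNeighbors
          }
          if (labels(q) == 0) labels(q) = c       // claim unassigned / noise points
        }
      }
    }
    labels
  }

  def main(args: Array[String]): Unit = {
    val pts: Array[Point] = Array(
      Array(1.0, 1.0), Array(1.1, 1.0), Array(0.9, 1.1),   // dense blob -> cluster 1
      Array(8.0, 8.0), Array(8.1, 7.9), Array(7.9, 8.1),   // dense blob -> cluster 2
      Array(4.5, 4.5)                                       // isolated point -> noise (0)
    )
    println(dbscan(pts, eps = 0.5, minPts = 3).mkString(" "))   // prints: 1 1 1 2 2 2 0
  }
}
```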
val tags = file
  .filter(line => label_name_map.contains(line.split("\t")(0).toLong))
  .map(line => (line.split("\t")(1).toLong -> label_name_map(line.split("\t")(0).toLong)))
  .sortByKey(false)
tags.saveAsTextFile("hdfs://finger-test2:54310/home/TagHierarchy/tag_count_sorted")