Bryan Yang bryanyang0528

Probabilistic Data Structures for Web Analytics and Data Mining: A great overview of the space of probabilistic data structures and how they are used to implement approximation algorithms.
Models and Issues in Data Stream Systems
Philippe Flajolet’s contribution to streaming algorithms: A presentation by Jérémie Lumbroso that visits some of the historical perspectives and shows how it all began with Flajolet.
Approximate Frequency Counts over Data Streams by Gurmeet Singh Manku & Rajeev Motwani : One of the early papers on the subject.
[Methods for Finding Frequent Items in Data Streams](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.187.9800&rep=rep1&type=pdf)

	__reference__ = 'http://stackoverflow.com/questions/16852655/convert-a-tsv-file-to-xls-xlsx-using-python'
	__author__ = 'jmcnamara'
	__contributor__ = 'bryan yang'

	import sys
	import csv
	from xlsxwriter.workbook import Workbook

	class Tsvtoxlsx(object):

	import numpy as np
	import operator
	point = [0,0]
	k = 3
	listOfPoint=[(1,1),(1,0),(2,1),(0,-1),(2,2)]


	def knn(point, k, lists):
	# Create a dict to store the distance for each point
	dic={}

	input = [-3,-1, 0, 1, 2 ,3]
	res = []
	def sumOfZero(input, target, part=[]):
	s = 0
	if part:
	for i in part:
	s+=i

	if s == target:
	res.append(part)

	import numpy as np
	def roc(actual, pred):
	fpr=np.array([1.0])
	tpr=np.array([1.0])
	n=float(len(actual)-sum(actual))
	p=float(sum(actual))
	for i in np.arange(min(pred), max(pred), 1.0/len(pred)):
	TP=0.0
	FP=0.0
	for j in range(len(pred)):

	http://d.stavrovski.net/blog/post/how-to-install-and-setup-oracle-java-jdk-in-centos-6

	# rpm
	wget --no-cookies \
	--no-check-certificate \
	--header "Cookie: oraclelicense=accept-securebackup-cookie" \
	"http://download.oracle.com/otn-pub/java/jdk/7u55-b13/jdk-7u55-linux-x64.rpm" \
	-O jdk-7-linux-x64.rpm

	# ubuntu

	core-site.xml
	=================================
	<property>
	<name>fs.defaultFS</name>
	<value>hdfs://ec2-54-148-213-237.us-west-2.compute.amazonaws.com</value>
	</property>
	<property>
	<name>hadoop.tmp.dir</name>
	<value>/home/hadoop/local/var/hadoop/tmp/hadoop-${user.name}</value>
	</property>

	# coding=utf-8
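	# Goal: parse house information for each district from websites.
	# For each district, write 「土地區段位置或建物區門牌」,「建物型態」,「建物現況格局」,「坪數」,「屋齡」,「總價元」,「資料來源」 into a CSV file
	# (land-section location or building address, building type, current layout, floor area in ping, building age, total price, data source).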

	# Procedure:
	# 1. Get the number of pages for each district by parsing the first HTML page.
	# 2. For each district, put all HTML pages together, parse the content with htmlparser, and save the data to a file.

	import sys
	import math