|
% Usage: |
|
% * download the full book from: http://infolab.stanford.edu/~ullman/mmds/book0n.pdf |
|
% * store this file as 'out.tex' and compile with 'pdflatex out.tex'
|
% * rename the output file, e.g. to

%   'Mining of Massive Datasets - Jure Leskovec, Anand Rajaraman, Jeffrey Ullman.pdf'
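
% For example, a minimal sketch of those three steps (assuming a Unix-like
% shell with wget and pdflatex available; adjust paths and names to taste):
%
%   wget http://infolab.stanford.edu/~ullman/mmds/book0n.pdf
%   pdflatex out.tex
%   mv out.pdf 'Mining of Massive Datasets - Jure Leskovec, Anand Rajaraman, Jeffrey Ullman.pdf'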
|
|
|
\documentclass{article} |
|
\usepackage[utf8]{inputenc} |
|
|
|
\usepackage{geometry} |
|
|
|
\usepackage{pdfpages} |
|
\usepackage[ |
|
pdfpagelabels=true, |
|
pdftitle={Mining of Massive Datasets}, |
|
pdfauthor={Jure Leskovec, Anand Rajaraman, Jeffrey D. Ullman}, |
|
pdfsubject={Data Mining},

pdfkeywords={data mining, machine learning},
|
unicode=true, |
|
]{hyperref} |
|
\usepackage{bookmark} |
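
% The bookmark package provides \bookmark with an explicit page= target, so an
% outline can be attached to the pages pulled in by \includepdf below without
% any sectioning commands of our own.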
|
|
|
\begin{document} |
|
|
|
|
|
\pagenumbering{arabic} |
|
\setcounter{page}{1} |
|
\includepdf[pages=1-]{book0n.pdf} |
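
% Note on the entries below: page= values are absolute pages of book0n.pdf,
% not the book's printed page numbers; in this copy, printed page N sits on
% PDF page N+20 (e.g. Chapter 1 starts on printed page 1 = PDF page 21).
% Levels: 0 = chapter, 1 = section, 2 = subsection; exercise subsections are
% placed at level 3 so they fold under the preceding subsection.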
|
|
|
\bookmark[page=1,level=0]{Cover} |
|
\bookmark[page=3,level=0]{Preface} |
|
\bookmark[page=7,level=0]{Table of Contents} |
|
|
|
\bookmark[page=21,level=0]{1 Data Mining} |
|
\bookmark[page=21,level=1]{1.1 What is Data Mining?} |
|
\bookmark[page=21,level=2]{1.1.1 Modeling} |
|
\bookmark[page=22,level=2]{1.1.2 Statistical Modeling} |
|
\bookmark[page=22,level=2]{1.1.3 Machine Learning} |
|
\bookmark[page=23,level=2]{1.1.4 Computational Approaches to Modeling} |
|
\bookmark[page=24,level=2]{1.1.5 Summarization} |
|
\bookmark[page=25,level=2]{1.1.6 Feature Extraction} |
|
\bookmark[page=25,level=1]{1.2 Statistical Limits on Data Mining} |
|
\bookmark[page=26,level=2]{1.2.1 Total Information Awareness} |
|
\bookmark[page=26,level=2]{1.2.2 Bonferroni’s Principle} |
|
\bookmark[page=27,level=2]{1.2.3 An Example of Bonferroni’s Principle} |
|
\bookmark[page=28,level=3]{1.2.4 Exercises for Section 1.2} |
|
\bookmark[page=28,level=1]{1.3 Things Useful to Know} |
|
\bookmark[page=29,level=2]{1.3.1 Importance of Words in Documents} |
|
\bookmark[page=30,level=2]{1.3.2 Hash Functions} |
|
\bookmark[page=31,level=2]{1.3.3 Indexes} |
|
\bookmark[page=33,level=2]{1.3.4 Secondary Storage} |
|
\bookmark[page=33,level=2]{1.3.5 The Base of Natural Logarithms} |
|
\bookmark[page=34,level=2]{1.3.6 Power Laws} |
|
\bookmark[page=36,level=3]{1.3.7 Exercises for Section 1.3} |
|
\bookmark[page=37,level=1]{1.4 Outline of the Book} |
|
\bookmark[page=39,level=1]{1.5 Summary of Chapter 1} |
|
\bookmark[page=39,level=1]{1.6 References for Chapter 1} |
|
\bookmark[page=41,level=0]{2 MapReduce and the New Software Stack} |
|
\bookmark[page=42,level=1]{2.1 Distributed File Systems} |
|
\bookmark[page=42,level=2]{2.1.1 Physical Organization of Compute Nodes} |
|
\bookmark[page=44,level=2]{2.1.2 Large-Scale File-System Organization} |
|
\bookmark[page=45,level=1]{2.2 MapReduce} |
|
\bookmark[page=45,level=2]{2.2.1 The Map Tasks} |
|
\bookmark[page=46,level=2]{2.2.2 Grouping by Key} |
|
\bookmark[page=47,level=2]{2.2.3 The Reduce Tasks} |
|
\bookmark[page=47,level=2]{2.2.4 Combiners} |
|
\bookmark[page=48,level=2]{2.2.5 Details of MapReduce Execution} |
|
\bookmark[page=50,level=2]{2.2.6 Coping With Node Failures} |
|
\bookmark[page=50,level=3]{2.2.7 Exercises for Section 2.2} |
|
\bookmark[page=50,level=1]{2.3 Algorithms Using MapReduce} |
|
\bookmark[page=51,level=2]{2.3.1 Matrix-Vector Multiplication by MapReduce} |
|
\bookmark[page=52,level=2]{2.3.2 If the Vector v Cannot Fit in Main Memory} |
|
\bookmark[page=53,level=2]{2.3.3 Relational-Algebra Operations} |
|
\bookmark[page=55,level=2]{2.3.4 Computing Selections by MapReduce} |
|
\bookmark[page=56,level=2]{2.3.5 Computing Projections by MapReduce} |
|
\bookmark[page=56,level=2]{2.3.6 Union, Intersection, and Difference by MapReduce} |
|
\bookmark[page=57,level=2]{2.3.7 Computing Natural Join by MapReduce} |
|
\bookmark[page=58,level=2]{2.3.8 Grouping and Aggregation by MapReduce} |
|
\bookmark[page=58,level=2]{2.3.9 Matrix Multiplication} |
|
\bookmark[page=59,level=2]{2.3.10 Matrix Multiplication with One MapReduce Step} |
|
\bookmark[page=60,level=3]{2.3.11 Exercises for Section 2.3} |
|
\bookmark[page=61,level=1]{2.4 Extensions to MapReduce} |
|
\bookmark[page=62,level=2]{2.4.1 Workflow Systems} |
|
\bookmark[page=63,level=2]{2.4.2 Spark} |
|
\bookmark[page=66,level=2]{2.4.3 Spark Implementation} |
|
\bookmark[page=68,level=2]{2.4.4 TensorFlow} |
|
\bookmark[page=69,level=2]{2.4.5 Recursive Extensions to MapReduce} |
|
\bookmark[page=71,level=2]{2.4.6 Bulk-Synchronous Systems} |
|
\bookmark[page=73,level=3]{2.4.7 Exercises for Section 2.4} |
|
\bookmark[page=73,level=1]{2.5 The Communication-Cost Model} |
|
\bookmark[page=73,level=2]{2.5.1 Communication Cost for Task Networks} |
|
\bookmark[page=75,level=2]{2.5.2 Wall-Clock Time} |
|
\bookmark[page=76,level=2]{2.5.3 Multiway Joins} |
|
\bookmark[page=79,level=3]{2.5.4 Exercises for Section 2.5} |
|
\bookmark[page=81,level=1]{2.6 Complexity Theory for MapReduce} |
|
\bookmark[page=81,level=2]{2.6.1 Reducer Size and Replication Rate} |
|
\bookmark[page=82,level=2]{2.6.2 An Example: Similarity Joins} |
|
\bookmark[page=84,level=2]{2.6.3 A Graph Model for MapReduce Problems} |
|
\bookmark[page=85,level=2]{2.6.4 Mapping Schemas} |
|
\bookmark[page=87,level=2]{2.6.5 When Not All Inputs Are Present} |
|
\bookmark[page=88,level=2]{2.6.6 Lower Bounds on Replication Rate} |
|
\bookmark[page=89,level=2]{2.6.7 Case Study: Matrix Multiplication} |
|
\bookmark[page=93,level=3]{2.6.8 Exercises for Section 2.6} |
|
\bookmark[page=94,level=1]{2.7 Summary of Chapter 2} |
|
\bookmark[page=97,level=1]{2.8 References for Chapter 2} |
|
\bookmark[page=101,level=0]{3 Finding Similar Items} |
|
\bookmark[page=102,level=1]{3.1 Applications of Set Similarity} |
|
\bookmark[page=102,level=2]{3.1.1 Jaccard Similarity of Sets} |
|
\bookmark[page=102,level=2]{3.1.2 Similarity of Documents} |
|
\bookmark[page=104,level=2]{3.1.3 Collaborative Filtering as a Similar-Sets Problem} |
|
\bookmark[page=106,level=3]{3.1.4 Exercises for Section 3.1} |
|
\bookmark[page=106,level=1]{3.2 Shingling of Documents} |
|
\bookmark[page=106,level=2]{3.2.1 k-Shingles} |
|
\bookmark[page=107,level=2]{3.2.2 Choosing the Shingle Size} |
|
\bookmark[page=107,level=2]{3.2.3 Hashing Shingles} |
|
\bookmark[page=108,level=2]{3.2.4 Shingles Built from Words} |
|
\bookmark[page=108,level=3]{3.2.5 Exercises for Section 3.2} |
|
\bookmark[page=109,level=1]{3.3 Similarity-Preserving Summaries of Sets} |
|
\bookmark[page=109,level=2]{3.3.1 Matrix Representation of Sets} |
|
\bookmark[page=110,level=2]{3.3.2 Minhashing} |
|
\bookmark[page=111,level=2]{3.3.3 Minhashing and Jaccard Similarity} |
|
\bookmark[page=111,level=2]{3.3.4 Minhash Signatures} |
|
\bookmark[page=112,level=2]{3.3.5 Computing Minhash Signatures in Practice} |
|
\bookmark[page=114,level=2]{3.3.6 Speeding Up Minhashing} |
|
\bookmark[page=115,level=2]{3.3.7 Speedup Using Hash Functions} |
|
\bookmark[page=118,level=3]{3.3.8 Exercises for Section 3.3} |
|
\bookmark[page=119,level=1]{3.4 Locality-Sensitive Hashing for Documents} |
|
\bookmark[page=120,level=2]{3.4.1 LSH for Minhash Signatures} |
|
\bookmark[page=121,level=2]{3.4.2 Analysis of the Banding Technique} |
|
\bookmark[page=123,level=2]{3.4.3 Combining the Techniques} |
|
\bookmark[page=124,level=3]{3.4.4 Exercises for Section 3.4} |
|
\bookmark[page=124,level=1]{3.5 Distance Measures} |
|
\bookmark[page=125,level=2]{3.5.1 Definition of a Distance Measure} |
|
\bookmark[page=125,level=2]{3.5.2 Euclidean Distances} |
|
\bookmark[page=126,level=2]{3.5.3 Jaccard Distance} |
|
\bookmark[page=127,level=2]{3.5.4 Cosine Distance} |
|
\bookmark[page=128,level=2]{3.5.5 Edit Distance} |
|
\bookmark[page=129,level=2]{3.5.6 Hamming Distance} |
|
\bookmark[page=129,level=3]{3.5.7 Exercises for Section 3.5} |
|
\bookmark[page=131,level=1]{3.6 The Theory of Locality-Sensitive Functions} |
|
\bookmark[page=132,level=2]{3.6.1 Locality-Sensitive Functions} |
|
\bookmark[page=132,level=2]{3.6.2 Locality-Sensitive Families for Jaccard Distance} |
|
\bookmark[page=133,level=2]{3.6.3 Amplifying a Locality-Sensitive Family} |
|
\bookmark[page=136,level=3]{3.6.4 Exercises for Section 3.6} |
|
\bookmark[page=136,level=1]{3.7 LSH Families for Other Distance Measures} |
|
\bookmark[page=137,level=2]{3.7.1 LSH Families for Hamming Distance} |
|
\bookmark[page=137,level=2]{3.7.2 Random Hyperplanes and the Cosine Distance} |
|
\bookmark[page=139,level=2]{3.7.3 Sketches} |
|
\bookmark[page=139,level=2]{3.7.4 LSH Families for Euclidean Distance} |
|
\bookmark[page=141,level=2]{3.7.5 More LSH Families for Euclidean Spaces} |
|
\bookmark[page=141,level=3]{3.7.6 Exercises for Section 3.7} |
|
\bookmark[page=142,level=1]{3.8 Applications of Locality-Sensitive Hashing} |
|
\bookmark[page=143,level=2]{3.8.1 Entity Resolution} |
|
\bookmark[page=143,level=2]{3.8.2 An Entity-Resolution Example} |
|
\bookmark[page=144,level=2]{3.8.3 Validating Record Matches} |
|
\bookmark[page=145,level=2]{3.8.4 Matching Fingerprints} |
|
\bookmark[page=146,level=2]{3.8.5 An LSH Family for Fingerprint Matching}
|
\bookmark[page=148,level=2]{3.8.6 Similar News Articles} |
|
\bookmark[page=149,level=3]{3.8.7 Exercises for Section 3.8} |
|
\bookmark[page=150,level=1]{3.9 Methods for High Degrees of Similarity} |
|
\bookmark[page=150,level=2]{3.9.1 Finding Identical Items} |
|
\bookmark[page=151,level=2]{3.9.2 Representing Sets as Strings} |
|
\bookmark[page=151,level=2]{3.9.3 Length-Based Filtering} |
|
\bookmark[page=152,level=2]{3.9.4 Prefix Indexing} |
|
\bookmark[page=153,level=2]{3.9.5 Using Position Information} |
|
\bookmark[page=155,level=2]{3.9.6 Using Position and Length in Indexes} |
|
\bookmark[page=157,level=3]{3.9.7 Exercises for Section 3.9} |
|
\bookmark[page=158,level=1]{3.10 Summary of Chapter 3} |
|
\bookmark[page=161,level=1]{3.11 References for Chapter 3} |
|
\bookmark[page=163,level=0]{4 Mining Data Streams} |
|
\bookmark[page=163,level=1]{4.1 The Stream Data Model} |
|
\bookmark[page=164,level=2]{4.1.1 A Data-Stream-Management System} |
|
\bookmark[page=165,level=2]{4.1.2 Examples of Stream Sources} |
|
\bookmark[page=166,level=2]{4.1.3 Stream Queries} |
|
\bookmark[page=167,level=2]{4.1.4 Issues in Stream Processing} |
|
\bookmark[page=168,level=1]{4.2 Sampling Data in a Stream} |
|
\bookmark[page=168,level=2]{4.2.1 A Motivating Example} |
|
\bookmark[page=169,level=2]{4.2.2 Obtaining a Representative Sample} |
|
\bookmark[page=169,level=2]{4.2.3 The General Sampling Problem} |
|
\bookmark[page=170,level=2]{4.2.4 Varying the Sample Size} |
|
\bookmark[page=170,level=3]{4.2.5 Exercises for Section 4.2} |
|
\bookmark[page=171,level=1]{4.3 Filtering Streams} |
|
\bookmark[page=171,level=2]{4.3.1 A Motivating Example} |
|
\bookmark[page=172,level=2]{4.3.2 The Bloom Filter} |
|
\bookmark[page=172,level=2]{4.3.3 Analysis of Bloom Filtering} |
|
\bookmark[page=173,level=3]{4.3.4 Exercises for Section 4.3} |
|
\bookmark[page=174,level=1]{4.4 Counting Distinct Elements in a Stream} |
|
\bookmark[page=174,level=2]{4.4.1 The Count-Distinct Problem} |
|
\bookmark[page=175,level=2]{4.4.2 The Flajolet-Martin Algorithm} |
|
\bookmark[page=176,level=2]{4.4.3 Combining Estimates} |
|
\bookmark[page=176,level=2]{4.4.4 Space Requirements} |
|
\bookmark[page=177,level=3]{4.4.5 Exercises for Section 4.4} |
|
\bookmark[page=177,level=1]{4.5 Estimating Moments} |
|
\bookmark[page=177,level=2]{4.5.1 Definition of Moments} |
|
\bookmark[page=178,level=2]{4.5.2 The Alon-Matias-Szegedy Algorithm for Second Moments} |
|
\bookmark[page=179,level=2]{4.5.3 Why the Alon-Matias-Szegedy Algorithm Works} |
|
\bookmark[page=180,level=2]{4.5.4 Higher-Order Moments} |
|
\bookmark[page=180,level=2]{4.5.5 Dealing With Infinite Streams} |
|
\bookmark[page=181,level=3]{4.5.6 Exercises for Section 4.5} |
|
\bookmark[page=182,level=1]{4.6 Counting Ones in a Window} |
|
\bookmark[page=183,level=2]{4.6.1 The Cost of Exact Counts} |
|
\bookmark[page=183,level=2]{4.6.2 The Datar-Gionis-Indyk-Motwani Algorithm} |
|
\bookmark[page=185,level=2]{4.6.3 Storage Requirements for the DGIM Algorithm} |
|
\bookmark[page=185,level=2]{4.6.4 Query Answering in the DGIM Algorithm} |
|
\bookmark[page=186,level=2]{4.6.5 Maintaining the DGIM Conditions} |
|
\bookmark[page=187,level=2]{4.6.6 Reducing the Error} |
|
\bookmark[page=188,level=2]{4.6.7 Extensions to the Counting of Ones} |
|
\bookmark[page=189,level=3]{4.6.8 Exercises for Section 4.6} |
|
\bookmark[page=189,level=1]{4.7 Decaying Windows} |
|
\bookmark[page=189,level=2]{4.7.1 The Problem of Most-Common Elements} |
|
\bookmark[page=190,level=2]{4.7.2 Definition of the Decaying Window} |
|
\bookmark[page=191,level=2]{4.7.3 Finding the Most Popular Elements} |
|
\bookmark[page=192,level=1]{4.8 Summary of Chapter 4} |
|
\bookmark[page=193,level=1]{4.9 References for Chapter 4} |
|
\bookmark[page=195,level=0]{5 Link Analysis} |
|
\bookmark[page=195,level=1]{5.1 PageRank} |
|
\bookmark[page=196,level=2]{5.1.1 Early Search Engines and Term Spam} |
|
\bookmark[page=197,level=2]{5.1.2 Definition of PageRank} |
|
\bookmark[page=201,level=2]{5.1.3 Structure of the Web} |
|
\bookmark[page=202,level=2]{5.1.4 Avoiding Dead Ends} |
|
\bookmark[page=205,level=2]{5.1.5 Spider Traps and Taxation} |
|
\bookmark[page=207,level=2]{5.1.6 Using PageRank in a Search Engine} |
|
\bookmark[page=207,level=3]{5.1.7 Exercises for Section 5.1} |
|
\bookmark[page=209,level=1]{5.2 Efficient Computation of PageRank} |
|
\bookmark[page=210,level=2]{5.2.1 Representing Transition Matrices} |
|
\bookmark[page=211,level=2]{5.2.2 PageRank Iteration Using MapReduce} |
|
\bookmark[page=211,level=2]{5.2.3 Use of Combiners to Consolidate the Result Vector} |
|
\bookmark[page=212,level=2]{5.2.4 Representing Blocks of the Transition Matrix} |
|
\bookmark[page=213,level=2]{5.2.5 Other Efficient Approaches to PageRank Iteration} |
|
\bookmark[page=215,level=3]{5.2.6 Exercises for Section 5.2} |
|
\bookmark[page=215,level=1]{5.3 Topic-Sensitive PageRank} |
|
\bookmark[page=215,level=2]{5.3.1 Motivation for Topic-Sensitive PageRank}
|
\bookmark[page=216,level=2]{5.3.2 Biased Random Walks} |
|
\bookmark[page=217,level=2]{5.3.3 Using Topic-Sensitive PageRank} |
|
\bookmark[page=218,level=2]{5.3.4 Inferring Topics from Words} |
|
\bookmark[page=219,level=3]{5.3.5 Exercises for Section 5.3} |
|
\bookmark[page=219,level=1]{5.4 Link Spam} |
|
\bookmark[page=219,level=2]{5.4.1 Architecture of a Spam Farm} |
|
\bookmark[page=221,level=2]{5.4.2 Analysis of a Spam Farm} |
|
\bookmark[page=222,level=2]{5.4.3 Combating Link Spam} |
|
\bookmark[page=222,level=2]{5.4.4 TrustRank} |
|
\bookmark[page=223,level=2]{5.4.5 Spam Mass} |
|
\bookmark[page=223,level=3]{5.4.6 Exercises for Section 5.4} |
|
\bookmark[page=224,level=1]{5.5 Hubs and Authorities} |
|
\bookmark[page=224,level=2]{5.5.1 The Intuition Behind HITS} |
|
\bookmark[page=225,level=2]{5.5.2 Formalizing Hubbiness and Authority} |
|
\bookmark[page=228,level=3]{5.5.3 Exercises for Section 5.5} |
|
\bookmark[page=228,level=1]{5.6 Summary of Chapter 5} |
|
\bookmark[page=232,level=1]{5.7 References for Chapter 5} |
|
\bookmark[page=233,level=0]{6 Frequent Itemsets} |
|
\bookmark[page=234,level=1]{6.1 The Market-Basket Model} |
|
\bookmark[page=234,level=2]{6.1.1 Definition of Frequent Itemsets} |
|
\bookmark[page=236,level=2]{6.1.2 Applications of Frequent Itemsets} |
|
\bookmark[page=237,level=2]{6.1.3 Association Rules} |
|
\bookmark[page=239,level=2]{6.1.4 Finding Association Rules with High Confidence} |
|
\bookmark[page=239,level=3]{6.1.5 Exercises for Section 6.1} |
|
\bookmark[page=241,level=1]{6.2 Market Baskets and the A-Priori Algorithm} |
|
\bookmark[page=241,level=2]{6.2.1 Representation of Market-Basket Data} |
|
\bookmark[page=242,level=2]{6.2.2 Use of Main Memory for Itemset Counting} |
|
\bookmark[page=244,level=2]{6.2.3 Monotonicity of Itemsets} |
|
\bookmark[page=245,level=2]{6.2.4 Tyranny of Counting Pairs} |
|
\bookmark[page=245,level=2]{6.2.5 The A-Priori Algorithm} |
|
\bookmark[page=246,level=2]{6.2.6 A-Priori for All Frequent Itemsets} |
|
\bookmark[page=249,level=3]{6.2.7 Exercises for Section 6.2} |
|
\bookmark[page=250,level=1]{6.3 Handling Larger Datasets in Main Memory} |
|
\bookmark[page=250,level=2]{6.3.1 The Algorithm of Park, Chen, and Yu} |
|
\bookmark[page=252,level=2]{6.3.2 The Multistage Algorithm} |
|
\bookmark[page=254,level=2]{6.3.3 The Multihash Algorithm} |
|
\bookmark[page=256,level=3]{6.3.4 Exercises for Section 6.3} |
|
\bookmark[page=258,level=1]{6.4 Limited-Pass Algorithms} |
|
\bookmark[page=258,level=2]{6.4.1 The Simple, Randomized Algorithm} |
|
\bookmark[page=259,level=2]{6.4.2 Avoiding Errors in Sampling Algorithms} |
|
\bookmark[page=260,level=2]{6.4.3 The Algorithm of Savasere, Omiecinski, and Navathe} |
|
\bookmark[page=261,level=2]{6.4.4 The SON Algorithm and MapReduce} |
|
\bookmark[page=262,level=2]{6.4.5 Toivonen’s Algorithm} |
|
\bookmark[page=263,level=2]{6.4.6 Why Toivonen’s Algorithm Works} |
|
\bookmark[page=264,level=3]{6.4.7 Exercises for Section 6.4} |
|
\bookmark[page=264,level=1]{6.5 Counting Frequent Items in a Stream} |
|
\bookmark[page=265,level=2]{6.5.1 Sampling Methods for Streams} |
|
\bookmark[page=266,level=2]{6.5.2 Frequent Itemsets in Decaying Windows} |
|
\bookmark[page=267,level=2]{6.5.3 Hybrid Methods} |
|
\bookmark[page=267,level=3]{6.5.4 Exercises for Section 6.5} |
|
\bookmark[page=268,level=1]{6.6 Summary of Chapter 6} |
|
\bookmark[page=270,level=1]{6.7 References for Chapter 6} |
|
\bookmark[page=273,level=0]{7 Clustering} |
|
\bookmark[page=273,level=1]{7.1 Introduction to Clustering Techniques} |
|
\bookmark[page=273,level=2]{7.1.1 Points, Spaces, and Distances} |
|
\bookmark[page=275,level=2]{7.1.2 Clustering Strategies} |
|
\bookmark[page=276,level=2]{7.1.3 The Curse of Dimensionality} |
|
\bookmark[page=277,level=3]{7.1.4 Exercises for Section 7.1} |
|
\bookmark[page=277,level=1]{7.2 Hierarchical Clustering} |
|
\bookmark[page=278,level=2]{7.2.1 Hierarchical Clustering in a Euclidean Space} |
|
\bookmark[page=280,level=2]{7.2.2 Efficiency of Hierarchical Clustering} |
|
\bookmark[page=281,level=2]{7.2.3 Alternative Rules for Controlling Hierarchical Clustering} |
|
\bookmark[page=284,level=2]{7.2.4 Hierarchical Clustering in Non-Euclidean Spaces} |
|
\bookmark[page=285,level=3]{7.2.5 Exercises for Section 7.2} |
|
\bookmark[page=286,level=1]{7.3 K-means Algorithms} |
|
\bookmark[page=287,level=2]{7.3.1 K-Means Basics} |
|
\bookmark[page=287,level=2]{7.3.2 Initializing Clusters for K-Means} |
|
\bookmark[page=288,level=2]{7.3.3 Picking the Right Value of k} |
|
\bookmark[page=289,level=2]{7.3.4 The Algorithm of Bradley, Fayyad, and Reina} |
|
\bookmark[page=291,level=2]{7.3.5 Processing Data in the BFR Algorithm} |
|
\bookmark[page=294,level=3]{7.3.6 Exercises for Section 7.3} |
|
\bookmark[page=294,level=1]{7.4 The CURE Algorithm} |
|
\bookmark[page=295,level=2]{7.4.1 Initialization in CURE} |
|
\bookmark[page=296,level=2]{7.4.2 Completion of the CURE Algorithm} |
|
\bookmark[page=297,level=3]{7.4.3 Exercises for Section 7.4} |
|
\bookmark[page=298,level=1]{7.5 Clustering in Non-Euclidean Spaces} |
|
\bookmark[page=298,level=2]{7.5.1 Representing Clusters in the GRGPF Algorithm} |
|
\bookmark[page=299,level=2]{7.5.2 Initializing the Cluster Tree} |
|
\bookmark[page=300,level=2]{7.5.3 Adding Points in the GRGPF Algorithm} |
|
\bookmark[page=301,level=2]{7.5.4 Splitting and Merging Clusters} |
|
\bookmark[page=302,level=3]{7.5.5 Exercises for Section 7.5} |
|
\bookmark[page=302,level=1]{7.6 Clustering for Streams and Parallelism} |
|
\bookmark[page=303,level=2]{7.6.1 The Stream-Computing Model} |
|
\bookmark[page=303,level=2]{7.6.2 A Stream-Clustering Algorithm} |
|
\bookmark[page=304,level=2]{7.6.3 Initializing Buckets} |
|
\bookmark[page=304,level=2]{7.6.4 Merging Buckets} |
|
\bookmark[page=307,level=2]{7.6.5 Answering Queries} |
|
\bookmark[page=307,level=2]{7.6.6 Clustering in a Parallel Environment} |
|
\bookmark[page=308,level=3]{7.6.7 Exercises for Section 7.6} |
|
\bookmark[page=308,level=1]{7.7 Summary of Chapter 7} |
|
\bookmark[page=312,level=1]{7.8 References for Chapter 7} |
|
\bookmark[page=313,level=0]{8 Advertising on the Web} |
|
\bookmark[page=313,level=1]{8.1 Issues in On-Line Advertising} |
|
\bookmark[page=313,level=2]{8.1.1 Advertising Opportunities} |
|
\bookmark[page=314,level=2]{8.1.2 Direct Placement of Ads} |
|
\bookmark[page=315,level=2]{8.1.3 Issues for Display Ads} |
|
\bookmark[page=316,level=1]{8.2 On-Line Algorithms} |
|
\bookmark[page=316,level=2]{8.2.1 On-Line and Off-Line Algorithms} |
|
\bookmark[page=317,level=2]{8.2.2 Greedy Algorithms} |
|
\bookmark[page=318,level=2]{8.2.3 The Competitive Ratio} |
|
\bookmark[page=318,level=3]{8.2.4 Exercises for Section 8.2} |
|
\bookmark[page=319,level=1]{8.3 The Matching Problem} |
|
\bookmark[page=319,level=2]{8.3.1 Matches and Perfect Matches} |
|
\bookmark[page=320,level=2]{8.3.2 The Greedy Algorithm for Maximal Matching} |
|
\bookmark[page=321,level=2]{8.3.3 Competitive Ratio for Greedy Matching} |
|
\bookmark[page=322,level=3]{8.3.4 Exercises for Section 8.3} |
|
\bookmark[page=322,level=1]{8.4 The Adwords Problem} |
|
\bookmark[page=323,level=2]{8.4.1 History of Search Advertising} |
|
\bookmark[page=323,level=2]{8.4.2 Definition of the Adwords Problem} |
|
\bookmark[page=324,level=2]{8.4.3 The Greedy Approach to the Adwords Problem} |
|
\bookmark[page=325,level=2]{8.4.4 The Balance Algorithm} |
|
\bookmark[page=326,level=2]{8.4.5 A Lower Bound on Competitive Ratio for Balance} |
|
\bookmark[page=328,level=2]{8.4.6 The Balance Algorithm with Many Bidders} |
|
\bookmark[page=329,level=2]{8.4.7 The Generalized Balance Algorithm} |
|
\bookmark[page=330,level=2]{8.4.8 Final Observations About the Adwords Problem} |
|
\bookmark[page=331,level=3]{8.4.9 Exercises for Section 8.4} |
|
\bookmark[page=331,level=1]{8.5 Adwords Implementation} |
|
\bookmark[page=332,level=2]{8.5.1 Matching Bids and Search Queries} |
|
\bookmark[page=332,level=2]{8.5.2 More Complex Matching Problems} |
|
\bookmark[page=333,level=2]{8.5.3 A Matching Algorithm for Documents and Bids} |
|
\bookmark[page=335,level=1]{8.6 Summary of Chapter 8} |
|
\bookmark[page=337,level=1]{8.7 References for Chapter 8} |
|
\bookmark[page=339,level=0]{9 Recommendation Systems} |
|
\bookmark[page=339,level=1]{9.1 A Model for Recommendation Systems} |
|
\bookmark[page=340,level=2]{9.1.1 The Utility Matrix} |
|
\bookmark[page=341,level=2]{9.1.2 The Long Tail} |
|
\bookmark[page=341,level=2]{9.1.3 Applications of Recommendation Systems} |
|
\bookmark[page=343,level=2]{9.1.4 Populating the Utility Matrix} |
|
\bookmark[page=344,level=1]{9.2 Content-Based Recommendations} |
|
\bookmark[page=344,level=2]{9.2.1 Item Profiles} |
|
\bookmark[page=345,level=2]{9.2.2 Discovering Features of Documents} |
|
\bookmark[page=346,level=2]{9.2.3 Obtaining Item Features From Tags} |
|
\bookmark[page=347,level=2]{9.2.4 Representing Item Profiles} |
|
\bookmark[page=348,level=2]{9.2.5 User Profiles} |
|
\bookmark[page=349,level=2]{9.2.6 Recommending Items to Users Based on Content} |
|
\bookmark[page=350,level=2]{9.2.7 Classification Algorithms} |
|
\bookmark[page=352,level=3]{9.2.8 Exercises for Section 9.2} |
|
\bookmark[page=353,level=1]{9.3 Collaborative Filtering} |
|
\bookmark[page=354,level=2]{9.3.1 Measuring Similarity} |
|
\bookmark[page=356,level=2]{9.3.2 The Duality of Similarity} |
|
\bookmark[page=357,level=2]{9.3.3 Clustering Users and Items} |
|
\bookmark[page=359,level=3]{9.3.4 Exercises for Section 9.3} |
|
\bookmark[page=360,level=1]{9.4 Dimensionality Reduction} |
|
\bookmark[page=360,level=2]{9.4.1 UV-Decomposition} |
|
\bookmark[page=361,level=2]{9.4.2 Root-Mean-Square Error} |
|
\bookmark[page=362,level=2]{9.4.3 Incremental Computation of a UV-Decomposition} |
|
\bookmark[page=364,level=2]{9.4.4 Optimizing an Arbitrary Element} |
|
\bookmark[page=366,level=2]{9.4.5 Building a Complete UV-Decomposition Algorithm} |
|
\bookmark[page=368,level=3]{9.4.6 Exercises for Section 9.4} |
|
\bookmark[page=369,level=1]{9.5 The Netflix Challenge} |
|
\bookmark[page=370,level=1]{9.6 Summary of Chapter 9} |
|
\bookmark[page=372,level=1]{9.7 References for Chapter 9} |
|
\bookmark[page=375,level=0]{10 Mining Social-Network Graphs} |
|
\bookmark[page=376,level=1]{10.1 Social Networks as Graphs} |
|
\bookmark[page=376,level=2]{10.1.1 What is a Social Network?} |
|
\bookmark[page=376,level=2]{10.1.2 Social Networks as Graphs} |
|
\bookmark[page=378,level=2]{10.1.3 Varieties of Social Networks} |
|
\bookmark[page=379,level=2]{10.1.4 Graphs With Several Node Types} |
|
\bookmark[page=380,level=3]{10.1.5 Exercises for Section 10.1} |
|
\bookmark[page=381,level=1]{10.2 Clustering of Social-Network Graphs} |
|
\bookmark[page=381,level=2]{10.2.1 Distance Measures for Social-Network Graphs} |
|
\bookmark[page=381,level=2]{10.2.2 Applying Standard Clustering Methods} |
|
\bookmark[page=383,level=2]{10.2.3 Betweenness} |
|
\bookmark[page=383,level=2]{10.2.4 The Girvan-Newman Algorithm} |
|
\bookmark[page=386,level=2]{10.2.5 Using Betweenness to Find Communities} |
|
\bookmark[page=387,level=3]{10.2.6 Exercises for Section 10.2} |
|
\bookmark[page=388,level=1]{10.3 Direct Discovery of Communities} |
|
\bookmark[page=389,level=2]{10.3.1 Finding Cliques} |
|
\bookmark[page=389,level=2]{10.3.2 Complete Bipartite Graphs} |
|
\bookmark[page=390,level=2]{10.3.3 Finding Complete Bipartite Subgraphs} |
|
\bookmark[page=391,level=2]{10.3.4 Why Complete Bipartite Graphs Must Exist} |
|
\bookmark[page=393,level=3]{10.3.5 Exercises for Section 10.3} |
|
\bookmark[page=394,level=1]{10.4 Partitioning of Graphs} |
|
\bookmark[page=394,level=2]{10.4.1 What Makes a Good Partition?} |
|
\bookmark[page=395,level=2]{10.4.2 Normalized Cuts} |
|
\bookmark[page=395,level=2]{10.4.3 Some Matrices That Describe Graphs} |
|
\bookmark[page=397,level=2]{10.4.4 Eigenvalues of the Laplacian Matrix} |
|
\bookmark[page=399,level=2]{10.4.5 Alternative Partitioning Methods} |
|
\bookmark[page=400,level=3]{10.4.6 Exercises for Section 10.4} |
|
\bookmark[page=401,level=1]{10.5 Finding Overlapping Communities} |
|
\bookmark[page=401,level=2]{10.5.1 The Nature of Communities} |
|
\bookmark[page=401,level=2]{10.5.2 Maximum-Likelihood Estimation} |
|
\bookmark[page=403,level=2]{10.5.3 The Affiliation-Graph Model} |
|
\bookmark[page=405,level=2]{10.5.4 Discrete Optimization of Community Assignments} |
|
\bookmark[page=406,level=2]{10.5.5 Avoiding the Use of Discrete Membership Changes} |
|
\bookmark[page=408,level=3]{10.5.6 Exercises for Section 10.5} |
|
\bookmark[page=409,level=1]{10.6 Simrank} |
|
\bookmark[page=410,level=2]{10.6.1 Random Walkers on a Social Graph} |
|
\bookmark[page=411,level=2]{10.6.2 Random Walks with Restart} |
|
\bookmark[page=413,level=2]{10.6.3 Approximate Simrank} |
|
\bookmark[page=415,level=2]{10.6.4 Why Approximate Simrank Works} |
|
\bookmark[page=416,level=2]{10.6.5 Application of Simrank to Finding Communities} |
|
\bookmark[page=417,level=3]{10.6.6 Exercises for Section 10.6} |
|
\bookmark[page=419,level=1]{10.7 Counting Triangles} |
|
\bookmark[page=419,level=2]{10.7.1 Why Count Triangles?} |
|
\bookmark[page=420,level=2]{10.7.2 An Algorithm for Finding Triangles} |
|
\bookmark[page=421,level=2]{10.7.3 Optimality of the Triangle-Finding Algorithm} |
|
\bookmark[page=421,level=2]{10.7.4 Finding Triangles Using MapReduce} |
|
\bookmark[page=423,level=2]{10.7.5 Using Fewer Reduce Tasks} |
|
\bookmark[page=424,level=3]{10.7.6 Exercises for Section 10.7} |
|
\bookmark[page=425,level=1]{10.8 Neighborhood Properties of Graphs} |
|
\bookmark[page=425,level=2]{10.8.1 Directed Graphs and Neighborhoods} |
|
\bookmark[page=426,level=2]{10.8.2 The Diameter of a Graph} |
|
\bookmark[page=427,level=2]{10.8.3 Transitive Closure and Reachability} |
|
\bookmark[page=428,level=2]{10.8.4 Reachability Via MapReduce} |
|
\bookmark[page=429,level=2]{10.8.5 Seminaive Evaluation} |
|
\bookmark[page=430,level=2]{10.8.6 Linear Transitive Closure} |
|
\bookmark[page=431,level=2]{10.8.7 Transitive Closure by Recursive Doubling} |
|
\bookmark[page=433,level=2]{10.8.8 Smart Transitive Closure} |
|
\bookmark[page=435,level=2]{10.8.9 Comparison of Methods} |
|
\bookmark[page=437,level=2]{10.8.10 Transitive Closure by Graph Reduction}

\bookmark[page=438,level=2]{10.8.11 Approximating the Sizes of Neighborhoods}

\bookmark[page=440,level=3]{10.8.12 Exercises for Section 10.8}
|
\bookmark[page=442,level=1]{10.9 Summary of Chapter 10} |
|
\bookmark[page=445,level=1]{10.10 References for Chapter 10}
|
\bookmark[page=449,level=0]{11 Dimensionality Reduction} |
|
\bookmark[page=450,level=1]{11.1 Eigenvalues and Eigenvectors of Symmetric Matrices} |
|
\bookmark[page=450,level=2]{11.1.1 Definitions} |
|
\bookmark[page=451,level=2]{11.1.2 Computing Eigenvalues and Eigenvectors} |
|
\bookmark[page=452,level=2]{11.1.3 Finding Eigenpairs by Power Iteration} |
|
\bookmark[page=455,level=2]{11.1.4 The Matrix of Eigenvectors} |
|
\bookmark[page=455,level=3]{11.1.5 Exercises for Section 11.1} |
|
\bookmark[page=456,level=1]{11.2 Principal-Component Analysis} |
|
\bookmark[page=457,level=2]{11.2.1 An Illustrative Example} |
|
\bookmark[page=460,level=2]{11.2.2 Using Eigenvectors for Dimensionality Reduction} |
|
\bookmark[page=461,level=2]{11.2.3 The Matrix of Distances} |
|
\bookmark[page=462,level=3]{11.2.4 Exercises for Section 11.2} |
|
\bookmark[page=462,level=1]{11.3 Singular-Value Decomposition} |
|
\bookmark[page=462,level=2]{11.3.1 Definition of SVD} |
|
\bookmark[page=464,level=2]{11.3.2 Interpretation of SVD} |
|
\bookmark[page=466,level=2]{11.3.3 Dimensionality Reduction Using SVD} |
|
\bookmark[page=467,level=2]{11.3.4 Why Zeroing Low Singular Values Works} |
|
\bookmark[page=469,level=2]{11.3.5 Querying Using Concepts} |
|
\bookmark[page=470,level=2]{11.3.6 Computing the SVD of a Matrix} |
|
\bookmark[page=471,level=3]{11.3.7 Exercises for Section 11.3} |
|
\bookmark[page=472,level=1]{11.4 CUR Decomposition} |
|
\bookmark[page=473,level=2]{11.4.1 Definition of CUR} |
|
\bookmark[page=474,level=2]{11.4.2 Choosing Rows and Columns Properly} |
|
\bookmark[page=475,level=2]{11.4.3 Constructing the Middle Matrix} |
|
\bookmark[page=476,level=2]{11.4.4 The Complete CUR Decomposition} |
|
\bookmark[page=477,level=2]{11.4.5 Eliminating Duplicate Rows and Columns} |
|
\bookmark[page=478,level=3]{11.4.6 Exercises for Section 11.4} |
|
\bookmark[page=478,level=1]{11.5 Summary of Chapter 11} |
|
\bookmark[page=480,level=1]{11.6 References for Chapter 11} |
|
\bookmark[page=483,level=0]{12 Large-Scale Machine Learning} |
|
\bookmark[page=484,level=1]{12.1 The Machine-Learning Model} |
|
\bookmark[page=484,level=2]{12.1.1 Training Sets} |
|
\bookmark[page=484,level=2]{12.1.2 Some Illustrative Examples} |
|
\bookmark[page=487,level=2]{12.1.3 Approaches to Machine Learning} |
|
\bookmark[page=488,level=2]{12.1.4 Machine-Learning Architecture} |
|
\bookmark[page=491,level=3]{12.1.5 Exercises for Section 12.1} |
|
\bookmark[page=491,level=1]{12.2 Perceptrons} |
|
\bookmark[page=491,level=2]{12.2.1 Training a Perceptron with Zero Threshold} |
|
\bookmark[page=495,level=2]{12.2.2 Convergence of Perceptrons} |
|
\bookmark[page=495,level=2]{12.2.3 The Winnow Algorithm} |
|
\bookmark[page=497,level=2]{12.2.4 Allowing the Threshold to Vary} |
|
\bookmark[page=499,level=2]{12.2.5 Multiclass Perceptrons} |
|
\bookmark[page=500,level=2]{12.2.6 Transforming the Training Set} |
|
\bookmark[page=501,level=2]{12.2.7 Problems With Perceptrons} |
|
\bookmark[page=502,level=2]{12.2.8 Parallel Implementation of Perceptrons} |
|
\bookmark[page=504,level=3]{12.2.9 Exercises for Section 12.2} |
|
\bookmark[page=505,level=1]{12.3 Support-Vector Machines} |
|
\bookmark[page=506,level=2]{12.3.1 The Mechanics of an SVM} |
|
\bookmark[page=507,level=2]{12.3.2 Normalizing the Hyperplane} |
|
\bookmark[page=509,level=2]{12.3.3 Finding Optimal Approximate Separators} |
|
\bookmark[page=512,level=2]{12.3.4 SVM Solutions by Gradient Descent} |
|
\bookmark[page=515,level=2]{12.3.5 Stochastic Gradient Descent} |
|
\bookmark[page=516,level=2]{12.3.6 Parallel Implementation of SVM} |
|
\bookmark[page=517,level=3]{12.3.7 Exercises for Section 12.3} |
|
\bookmark[page=517,level=1]{12.4 Learning from Nearest Neighbors} |
|
\bookmark[page=518,level=2]{12.4.1 The Framework for Nearest-Neighbor Calculations} |
|
\bookmark[page=518,level=2]{12.4.2 Learning with One Nearest Neighbor} |
|
\bookmark[page=519,level=2]{12.4.3 Learning One-Dimensional Functions} |
|
\bookmark[page=522,level=2]{12.4.4 Kernel Regression} |
|
\bookmark[page=522,level=2]{12.4.5 Dealing with High-Dimensional Euclidean Data} |
|
\bookmark[page=524,level=2]{12.4.6 Dealing with Non-Euclidean Distances} |
|
\bookmark[page=524,level=3]{12.4.7 Exercises for Section 12.4} |
|
\bookmark[page=525,level=1]{12.5 Decision Trees} |
|
\bookmark[page=526,level=2]{12.5.1 Using a Decision Tree} |
|
\bookmark[page=527,level=2]{12.5.2 Impurity Measures} |
|
\bookmark[page=528,level=2]{12.5.3 Designing a Decision-Tree Node} |
|
\bookmark[page=529,level=2]{12.5.4 Selecting a Test Using a Numerical Feature} |
|
\bookmark[page=531,level=2]{12.5.5 Selecting a Test Using a Categorical Feature} |
|
\bookmark[page=533,level=2]{12.5.6 Parallel Design of Decision Trees} |
|
\bookmark[page=534,level=2]{12.5.7 Node Pruning} |
|
\bookmark[page=535,level=2]{12.5.8 Decision Forests} |
|
\bookmark[page=536,level=3]{12.5.9 Exercises for Section 12.5} |
|
\bookmark[page=537,level=1]{12.6 Comparison of Learning Methods} |
|
\bookmark[page=538,level=1]{12.7 Summary of Chapter 12} |
|
\bookmark[page=540,level=1]{12.8 References for Chapter 12} |
|
\bookmark[page=543,level=0]{13 Neural Nets and Deep Learning} |
|
\bookmark[page=544,level=1]{13.1 Introduction to Neural Nets} |
|
\bookmark[page=545,level=2]{13.1.1 Neural Nets, in General} |
|
\bookmark[page=547,level=2]{13.1.2 Interconnections Among Nodes} |
|
\bookmark[page=547,level=2]{13.1.3 Convolutional Neural Networks} |
|
\bookmark[page=548,level=2]{13.1.4 Design Issues for Neural Nets} |
|
\bookmark[page=548,level=3]{13.1.5 Exercises for Section 13.1} |
|
\bookmark[page=549,level=1]{13.2 Dense Feedforward Networks} |
|
\bookmark[page=549,level=2]{13.2.1 Linear Algebra Notation} |
|
\bookmark[page=551,level=2]{13.2.2 Activation Functions} |
|
\bookmark[page=552,level=2]{13.2.3 The Sigmoid} |
|
\bookmark[page=552,level=2]{13.2.4 The Hyperbolic Tangent} |
|
\bookmark[page=553,level=2]{13.2.5 Softmax} |
|
\bookmark[page=554,level=2]{13.2.6 Rectified Linear Unit}
|
\bookmark[page=555,level=2]{13.2.7 Loss Functions} |
|
\bookmark[page=556,level=2]{13.2.8 Regression Loss} |
|
\bookmark[page=557,level=2]{13.2.9 Classification Loss} |
|
\bookmark[page=558,level=3]{13.2.10 Exercises for Section 13.2}
|
\bookmark[page=559,level=1]{13.3 Backpropagation and Gradient Descent} |
|
\bookmark[page=560,level=2]{13.3.1 Compute Graphs} |
|
\bookmark[page=561,level=2]{13.3.2 Gradients, Jacobians, and the Chain Rule} |
|
\bookmark[page=562,level=2]{13.3.3 The Backpropagation Algorithm} |
|
\bookmark[page=565,level=2]{13.3.4 Iterating Gradient Descent} |
|
\bookmark[page=566,level=2]{13.3.5 Tensors} |
|
\bookmark[page=568,level=3]{13.3.6 Exercises for Section 13.3} |
|
\bookmark[page=568,level=1]{13.4 Convolutional Neural Networks} |
|
\bookmark[page=569,level=2]{13.4.1 Convolutional Layers} |
|
\bookmark[page=572,level=2]{13.4.2 Convolution and Cross-Correlation} |
|
\bookmark[page=573,level=2]{13.4.3 Pooling Layers} |
|
\bookmark[page=573,level=2]{13.4.4 CNN Architecture} |
|
\bookmark[page=575,level=2]{13.4.5 Implementation and Training} |
|
\bookmark[page=576,level=3]{13.4.6 Exercises for Section 13.4} |
|
\bookmark[page=577,level=1]{13.5 Recurrent Neural Networks} |
|
\bookmark[page=579,level=2]{13.5.1 Training RNN’s} |
|
\bookmark[page=581,level=2]{13.5.2 Vanishing and Exploding Gradients} |
|
\bookmark[page=582,level=2]{13.5.3 Long Short-Term Memory (LSTM)} |
|
\bookmark[page=584,level=3]{13.5.4 Exercises for Section 13.5} |
|
\bookmark[page=585,level=1]{13.6 Regularization} |
|
\bookmark[page=585,level=2]{13.6.1 Norm Penalties} |
|
\bookmark[page=586,level=2]{13.6.2 Dropout} |
|
\bookmark[page=586,level=2]{13.6.3 Early Stopping} |
|
\bookmark[page=587,level=2]{13.6.4 Dataset Augmentation} |
|
\bookmark[page=587,level=1]{13.7 Summary of Chapter 13} |
|
\bookmark[page=589,level=1]{13.8 References for Chapter 13} |
|
|
|
|
|
\end{document} |