Last active
February 13, 2018 16:21
-
-
Save pwrliang/33941501766a56140e590b22d805adbb to your computer and use it in GitHub Desktop.
Implementing PageRank Algorithm in SQL
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- The graph data and algorithm source from the book "Mining of Massive Datasets", P175, http://infolab.stanford.edu/~ullman/mmds/book.pdf | |
-- This script has been verified the correctness in SQL Server 2017 Linux Version. | |
DROP TABLE Node; | |
DROP TABLE Edge; | |
DROP TABLE OutDegree; | |
DROP TABLE PageRank; | |
CREATE TABLE Node(id int PRIMARY KEY); | |
CREATE TABLE Edge(src int,dst int, PRIMARY KEY (src, dst)); | |
CREATE TABLE OutDegree(id int PRIMARY KEY, degree int); | |
CREATE TABLE PageRank(id int PRIMARY KEY, rank float); | |
CREATE TABLE TmpRank(id int PRIMARY KEY, rank float); | |
--delete all records | |
DELETE FROM Node; | |
DELETE FROM Edge; | |
DELETE FROM OutDegree; | |
DELETE FROM PageRank; | |
DELETE FROM TmpRank; | |
--init basic tables | |
INSERT INTO Node VALUES (0); | |
INSERT INTO Node VALUES (1); | |
INSERT INTO Node VALUES (2); | |
INSERT INTO Node VALUES (3); | |
INSERT INTO Edge VALUES (0, 1); | |
INSERT INTO Edge VALUES (0, 2); | |
INSERT INTO Edge VALUES (0, 3); | |
INSERT INTO Edge VALUES (1, 0); | |
INSERT INTO Edge VALUES (1, 3); | |
INSERT INTO Edge VALUES (2, 2); | |
INSERT INTO Edge VALUES (3, 1); | |
INSERT INTO Edge VALUES (3, 2); | |
--compute out-degree | |
INSERT INTO OutDegree | |
SELECT Node.id, COUNT(Edge.src) --Count(Edge.src) instead of Count(*) for count no out-degree edge | |
FROM Node LEFT OUTER JOIN Edge | |
ON Node.id = Edge.src | |
GROUP BY Node.id; | |
--WARN: There's no special process for node with out-degree, This may cause wrong result | |
-- Please to make sure every node in graph has out-degree | |
DECLARE @ALPHA float = 0.8; | |
DECLARE @Node_Num int; | |
SELECT @Node_Num = COUNT(*) FROM Node; | |
--PageRank Init Value | |
INSERT INTO PageRank | |
SELECT Node.id, rank = (1 - @ALPHA) / @Node_Num | |
FROM Node INNER JOIN OutDegree | |
ON Node.id = OutDegree.id | |
/* | |
--For Debugging | |
SELECT * FROM Node; | |
SELECT * FROM Edge; | |
SELECT * FROM OutDegree; | |
SELECT * FROM PageRank; | |
SELECT * FROM TmpRank; | |
*/ | |
DECLARE @Iteration int = 0; | |
WHILE @Iteration < 50 | |
BEGIN | |
--Iteration Style | |
SET @Iteration = @Iteration + 1 | |
INSERT INTO TmpRank | |
SELECT Edge.dst, rank = SUM(@ALPHA * PageRank.rank / OutDegree.degree) + (1 - @ALPHA) / @Node_Num | |
FROM PageRank | |
INNER JOIN Edge ON PageRank.id = Edge.src | |
INNER JOIN OutDegree ON PageRank.id = OutDegree.id | |
GROUP BY Edge.dst | |
DELETE FROM PageRank; | |
INSERT INTO PageRank | |
SELECT * FROM TmpRank; | |
DELETE FROM TmpRank; | |
END | |
SELECT * FROM PageRank; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- The graph data and algorithm source from the book "Mining of Massive Datasets", P175, http://infolab.stanford.edu/~ullman/mmds/book.pdf | |
-- This script has been verified the correctness on SQL Server 2017 Linux Version. | |
DROP TABLE Node; | |
DROP TABLE Edge; | |
DROP TABLE OutDegree; | |
DROP TABLE PageRankDelta; | |
DROP TABLE TmpRankDelta; | |
CREATE TABLE Node(id int PRIMARY KEY); | |
CREATE TABLE Edge(src int,dst int, PRIMARY KEY (src, dst)); | |
CREATE TABLE OutDegree(id int PRIMARY KEY, degree int); | |
CREATE TABLE PageRankDelta(id int PRIMARY KEY, rank float, delta float); | |
CREATE TABLE TmpRankDelta(id int PRIMARY KEY, delta float); | |
--delete all records | |
DELETE FROM Node; | |
DELETE FROM Edge; | |
DELETE FROM OutDegree; | |
DELETE FROM PageRankDelta; | |
DELETE FROM TmpRankDelta; | |
--init basic tables | |
INSERT INTO Node VALUES (0); | |
INSERT INTO Node VALUES (1); | |
INSERT INTO Node VALUES (2); | |
INSERT INTO Node VALUES (3); | |
INSERT INTO Edge VALUES (0, 1); | |
INSERT INTO Edge VALUES (0, 2); | |
INSERT INTO Edge VALUES (0, 3); | |
INSERT INTO Edge VALUES (1, 0); | |
INSERT INTO Edge VALUES (1, 3); | |
INSERT INTO Edge VALUES (2, 2); | |
INSERT INTO Edge VALUES (3, 1); | |
INSERT INTO Edge VALUES (3, 2); | |
--compute out-degree | |
INSERT INTO OutDegree | |
SELECT Node.id, COUNT(Edge.src) --Count(Edge.src) instead of Count(*) for count no out-degree edge | |
FROM Node LEFT OUTER JOIN Edge | |
ON Node.id = Edge.src | |
GROUP BY Node.id; | |
--WARN: There's no special process for node with out-degree, This may cause wrong result | |
-- Please to make sure every node in graph has out-degree | |
DECLARE @ALPHA float = 0.8; | |
DECLARE @Node_Num int; | |
SELECT @Node_Num = COUNT(*) FROM Node; | |
--PageRank Init Value | |
INSERT INTO PageRankDelta | |
SELECT Node.id, rank = 0, delta = (1 - @ALPHA) / @Node_Num | |
FROM Node INNER JOIN OutDegree | |
ON Node.id = OutDegree.id | |
/* | |
--For Debugging | |
SELECT * FROM Node; | |
SELECT * FROM Edge; | |
SELECT * FROM OutDegree; | |
SELECT * FROM PageRankDelta; | |
SELECT * FROM TmpRankDelta; | |
*/ | |
--Iteration Style | |
DECLARE @Iteration int = 0; | |
WHILE @Iteration < 50 | |
BEGIN | |
SET @Iteration = @Iteration + 1 | |
INSERT INTO TmpRankDelta | |
SELECT Edge.dst, delta = SUM(@ALPHA * PageRankDelta.delta / OutDegree.degree) | |
FROM PageRankDelta | |
INNER JOIN Edge ON PageRankDelta.id = Edge.src | |
INNER JOIN OutDegree ON PageRankDelta.id = OutDegree.id | |
GROUP BY Edge.dst; | |
UPDATE PRDelta | |
SET PRDelta.rank = PRDelta.rank + PRDelta.delta, PRDelta.delta = TmpRankDelta.delta | |
FROM PageRankDelta AS PRDelta | |
INNER JOIN TmpRankDelta ON TmpRankDelta.id = PRDelta.id; | |
DELETE FROM TmpRankDelta; | |
END | |
SELECT * FROM PageRankDelta; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment