# We are going to set up the
# first-order LINE model for
# node embedding as a Python class.
# We will be using NumPy for the algebraic
# operations.
import numpy as np
from numpy.random import randint
# We will also need Numba
# to compile the core loops just in time
# and make everything a bit faster,
# as well as to parallelize the whole thing.
from numba import njit, prange
# TQDM for showing a nice loading bar
from tqdm.auto import trange
# Type hinting for optional values
from typing import Optional


@njit(fastmath=True)
def sigmoid(x: float) -> float:
    # Depending on whether `x` is greater or smaller than
    # zero, we choose a different formulation of the sigmoid to
    # avoid numerical overflows.
    if x > 0:
        return 1.0 / (1.0 + np.exp(-x))
    return np.exp(x) / (1.0 + np.exp(x))
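
# Illustrative aside (an addition, not part of the original gist): both
# branches keep the argument of np.exp non-positive, so it never overflows.
# A naive 1.0 / (1.0 + np.exp(-x)) would overflow for large negative x,
# since np.exp(1000.0) evaluates to inf (with a runtime warning), while
# np.exp(-1000.0) merely underflows to 0.0. For example:
#
#     sigmoid(1000.0)   # -> 1.0
#     sigmoid(-1000.0)  # -> 0.0, without any overflow warning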


class FirstOrderLINE:
    """First-order LINE model for the computation of
    undirected graph node embeddings.

    The model is rather simple, yet surprisingly effective.
    It learns a feature vector for each node such that the dot product
    with the vectors of connected nodes is maximized, i.e. the embedding
    of a node should be maximally similar to the embeddings of its
    neighbours, while the dot product with disconnected nodes is
    minimized, i.e. the embedding of a node should be as dissimilar as
    possible from the embeddings of disconnected nodes.
    The procedure is optimized using gradient descent.

    In this implementation we represent the graph as an edge list,
    i.e. a NumPy array with shape (number_of_edges, 2) whose rows
    are (source, destination) node ID pairs.

    This implementation is based on NumPy and Numba.
    """

    def __init__(
        self,
        embedding_size: int = 100,
        number_of_epochs: int = 2_000,
        learning_rate: float = 0.01,
        learning_rate_decay: float = 0.999,
        random_state: int = 123675,
        verbose: bool = True
    ):
        """Return new instance of first-order LINE.

        Parameters
        -----------------
        embedding_size: int = 100
            Dimensionality of the embedding.
        number_of_epochs: int = 2_000
            The number of epochs to train the model for.
        learning_rate: float = 0.01
            The learning rate used to update the embedding.
        learning_rate_decay: float = 0.999
            Rate at which the learning rate is reduced at each epoch.
        random_state: int = 123675
            The random state, to make the training reproducible.
        verbose: bool = True
            Whether to show loading bars.
        """
        self._embedding_size = embedding_size
        self._number_of_epochs = number_of_epochs
        self._learning_rate = learning_rate
        self._learning_rate_decay = learning_rate_decay
        self._random_state = random_state
        self._verbose = verbose

    def fit_transform(
        self,
        edges: np.ndarray,
        number_of_nodes: Optional[int] = None
    ) -> np.ndarray:
        """Return first-order LINE node embedding of the provided graph.

        Parameters
        -------------------
        edges: np.ndarray
            The edges of the graph to embed, as an array with shape
            (number_of_edges, 2) of (source, destination) node IDs.
        number_of_nodes: Optional[int] = None
            Number of nodes in the graph.
            If None, we use the maximum node ID we
            find in the provided edges.
        """
        assert edges.shape[1] == 2
        if number_of_nodes is None:
            number_of_nodes = edges.max() + 1
        number_of_edges = edges.shape[0]
        learning_rate = self._learning_rate
        # The scale factor is helpful to keep the magnitude
        # of the dot product under control as the dimensionality
        # of the embedding grows.
        scale_factor = np.sqrt(self._embedding_size)
        # We set the random seed to make the embedding reproducible
        # and allocate the embedding matrix, with values between
        # minus one and one.
        # Note that we use float32, so we'll require
        # number_of_nodes * embedding_size * 4B of memory.
        #
        # For large embeddings, it is possible to MMAP the
        # NumPy array to disk, but this is not convenient,
        # as this kind of embedding requires random access
        # to the various node embedding vectors, and accessing
        # random vectors from disk is extremely slow.
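        #
        # As a rough, purely illustrative figure (an addition, not from the
        # original gist): one million nodes with the default 100-dimensional
        # float32 embedding take about 1e6 * 100 * 4 B, i.e. roughly 400 MB.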
        np.random.seed(self._random_state)
        embedding = np.random.default_rng(
            self._random_state
        ).random(
            (
                number_of_nodes,
                self._embedding_size
            ),
            dtype=np.float32
        ) * 2.0 - 1.0

        @njit(parallel=True, fastmath=True)
        def compute_epoch(embedding, learning_rate):
            for edge_id in prange(number_of_edges):
                edge = edges[edge_id]
                # We build a negative (corrupted) edge by picking the source
                # and the destination from two random edges.
                false_src = edges[randint(number_of_edges)][0]
                false_dst = edges[randint(number_of_edges)][1]
                # If we sample a self-loop as the negative edge,
                # there is no way the model can predict it as negative,
                # as trivially a node embedding is maximally similar
                # to itself. Therefore, in this case we skip ahead.
                if false_src == false_dst:
                    continue
                for src, dst, label in (
                    # Negative sample
                    (false_src, false_dst, 0.0),
                    # Positive sample
                    (edge[0], edge[1], 1.0),
                ):
                    # First we compute the dot product
                    dot = (embedding[src] * embedding[dst]).sum()
                    # We scale the dot product by the scale factor
                    scaled_dot = dot / scale_factor
                    # Then, we compute the prediction using the sigmoid
                    # and, from it, the magnitude of the update.
                    variation = learning_rate * (sigmoid(scaled_dot) - label)
                    # We compute both updates before applying them, so that
                    # the second update does not see the already-updated
                    # source vector.
                    src_variation = variation * embedding[dst]
                    dst_variation = variation * embedding[src]
                    # We compute and apply the gradients racingly, so
                    # there may be (unlikely) data-race collisions between
                    # threads, but this allows for HALVING the memory
                    # requirements compared to keeping a separate buffer.
                    embedding[src] -= src_variation
                    embedding[dst] -= dst_variation

        # We start iterating over the training epochs.
        for epoch in trange(
            self._number_of_epochs,
            desc="Epochs",
            disable=not self._verbose
        ):
            compute_epoch(
                embedding,
                learning_rate * self._learning_rate_decay**epoch
            )
        # Finally, we can return the computed node embedding.
        return embedding
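

# ---------------------------------------------------------------------------
# Minimal usage sketch (an illustrative addition, not part of the original
# gist): we embed a tiny toy graph given as a (number_of_edges, 2) array of
# node IDs, using small hyper-parameters so the example runs quickly. The
# edge list below is arbitrary and only meant to show the expected input
# format.
if __name__ == "__main__":
    toy_edges = np.array([
        [0, 1],
        [1, 2],
        [2, 0],
        [3, 4],
        [4, 5],
        [5, 3],
    ])
    model = FirstOrderLINE(
        embedding_size=8,
        number_of_epochs=100,
        verbose=False
    )
    node_embedding = model.fit_transform(toy_edges)
    # One row per node, one column per embedding dimension.
    print(node_embedding.shape)  # (6, 8)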