Skip to content

Instantly share code, notes, and snippets.

@j20232
Last active January 13, 2021 13:37
Show Gist options
  • Save j20232/ba587107c7b34923d64bdec533c2fc1a to your computer and use it in GitHub Desktop.
import numpy as np
def first_order_cp(X, rank, la=0.01, eta=0.01, eps=1e-5):
"""L2-regularized cp decomposition with first order alternating gradient descent
Args:
X (np.ndarray): IxJxK matrix
rank (int): rank for decomposed matrices ([I, J] -> [I, rank], [rank, J])
la (float, optional): coefficient for regularization. Defaults to 0.01.
eta (float, optional): learning rate. Defaults to 0.01.
eps (float, optional): accuracy. Defaults to 1e-5.
"""
I, J, K = X.shape
def f(X, U, V, W):
recon = np.einsum("ir,jr,kr->ijk", U, V, W)
first = 0.5 * np.sum((X - recon) ** 2)
second = 0.5 * la * (np.sum(U ** 2) + np.sum(V ** 2) + np.sum(W ** 2))
return first + second
def update_U(X, U, V, W):
for i in range(I):
theta_i = np.einsum("jr,jk,kr->r", V, X[i], W)
U[i] = U[i] - eta * (((V.T @ V) * (W.T @ W) + la * np.identity(rank)) @ U[i] - theta_i)
return U
def update_V(X, U, V, W):
for j in range(J):
theta_j = np.einsum("ir,ik,kr->r", U, X[:, j], W)
V[j] = V[j] - eta * (((U.T @ U) * (W.T @ W) + la * np.identity(rank)) @ V[j] - theta_j)
return V
def update_W(X, U, V, W):
for k in range(K):
theta_k = np.einsum("ir,ij,jr->r", U, X[:, :, k], V)
W[k] = W[k] - eta * (((U.T @ U) * (V.T @ V) + la * np.identity(rank)) @ W[k] - theta_k)
return W
U = np.random.rand(I, rank)
V = np.random.rand(J, rank)
W = np.random.rand(K, rank)
while True:
g = f(X, U, V, W)
U = update_U(X, U, V, W)
V = update_V(X, U, V, W)
W = update_W(X, U, V, W)
if abs(g - f(X, U, V, W)) / f(X, U, V, W) < eps:
break
return U, V, W
def main():
    # Demo: decompose a random 20x10x5 tensor at rank 3 and report the fit.
    I, J, K = 20, 10, 5
    rank = 3
    X = np.random.rand(I, J, K)
    U, V, W = first_order_cp(X, rank)
    recon = np.einsum("ir,jr,kr->ijk", U, V, W)
    loss = np.sum((X - recon) ** 2)
    print("Loss: ", loss)  # \approx 0
    print("U: ", U.shape)  # (I, rank)
    print("V: ", V.shape)  # (J, rank)
    print("W: ", W.shape)  # (K, rank)


if __name__ == "__main__":
    main()
import numpy as np
def second_order_cp(X, rank, la=0.01, eta=0.01, eps=1e-5):
"""L2-regularized cp decomposition with second order alternating gradient descent
Args:
X (np.ndarray): IxJxK matrix
rank (int): rank for decomposed matrices ([I, J] -> [I, rank], [rank, J])
la (float, optional): coefficient for regularization. Defaults to 0.01.
eta (float, optional): learning rate. Defaults to 0.01.
eps (float, optional): accuracy. Defaults to 1e-5.
"""
I, J, K = X.shape
def f(X, U, V, W):
recon = np.einsum("ir,jr,kr->ijk", U, V, W)
first = 0.5 * np.sum((X - recon) ** 2)
second = 0.5 * la * (np.sum(U ** 2) + np.sum(V ** 2) + np.sum(W ** 2))
return first + second
def update_U(X, U, V, W):
for i in range(I):
theta_i = np.einsum("jr,jk,kr->r", V, X[i], W)
U[i] = (1 - eta) * U[i] + eta * (np.linalg.inv((V.T @ V) * (W.T @ W) + la * np.identity(rank)) @ theta_i)
return U
def update_V(X, U, V, W):
for j in range(J):
theta_j = np.einsum("ir,ik,kr->r", U, X[:, j], W)
V[j] = (1 - eta) * V[j] + eta * (np.linalg.inv((U.T @ U) * (W.T @ W) + la * np.identity(rank)) @ theta_j)
return V
def update_W(X, U, V, W):
for k in range(K):
theta_k = np.einsum("ir,ij,jr->r", U, X[:, :, k], V)
W[k] = (1 - eta) * W[k] + eta * (np.linalg.inv((U.T @ U) * (V.T @ V) + la * np.identity(rank)) @ theta_k)
return W
U = np.random.rand(I, rank)
V = np.random.rand(J, rank)
W = np.random.rand(K, rank)
while True:
g = f(X, U, V, W)
U = update_U(X, U, V, W)
V = update_V(X, U, V, W)
W = update_W(X, U, V, W)
if abs(g - f(X, U, V, W)) / f(X, U, V, W) < eps:
break
return U, V, W
def main():
    # Demo: decompose a random 20x10x5 tensor at rank 3 and report the fit.
    I, J, K = 20, 10, 5
    rank = 3
    X = np.random.rand(I, J, K)
    U, V, W = second_order_cp(X, rank)
    recon = np.einsum("ir,jr,kr->ijk", U, V, W)
    loss = np.sum((X - recon) ** 2)
    print("Loss: ", loss)  # \approx 0
    print("U: ", U.shape)  # (I, rank)
    print("V: ", V.shape)  # (J, rank)
    print("W: ", W.shape)  # (K, rank)


if __name__ == "__main__":
    main()
import numpy as np
def first_order_agd(X, rank, la=0.01, eta=0.01, eps=1e-5):
"""L2-regularized matrix decomposition with first order alternating gradient descent
Args:
X (np.ndarray): IxJ matrix (I: n_samples, J: n_features)
rank (int): rank for decomposed matrices ([I, J] -> [I, rank], [rank, J])
la (float, optional): coefficient for regularization. Defaults to 0.01.
eta (float, optional): learning rate. Defaults to 0.01.
eps (float, optional): accuracy. Defaults to 1e-5.
"""
def f(X, U, V):
first = 0.5 * np.sum((X - U @ V.T) ** 2)
second = 0.5 * la * np.sum((np.sum(U ** 2, axis=0) + np.sum(V ** 2, axis=0)))
return first + second
def grad_fu(X, U, V):
return - X @ V + U @ (V.T @ V + la * np.identity(V.shape[1]))
def grad_fv(X, U, V):
return - X.T @ U + V @ (U.T @ U + la * np.identity(U.shape[1]))
n_samples, n_features = X.shape
U = np.random.rand(n_samples, rank)
V = np.random.rand(n_features, rank)
while True:
g = f(X, U, V)
U = U - eta * grad_fu(X, U, V)
V = V - eta * grad_fv(X, U, V)
if abs(g - f(X, U, V)) / f(X, U, V) < eps:
break
return U, V
def second_order_agd(X, rank, la=0.01, eta=0.01, eps=1e-5):
"""L2-regularized matrix decomposition with second order alternating gradient descent
Args:
X (np.ndarray): IxJ matrix (I: n_samples, J: n_features)
rank (int): rank for decomposed matrices ([I, J] -> [I, rank], [rank, J])
la (float, optional): coefficient for regularization. Defaults to 0.01.
eta (float, optional): learning rate. Defaults to 0.01.
eps (float, optional): accuracy. Defaults to 1e-5.
"""
def f(X, U, V):
first = 0.5 * np.sum((X - U @ V.T) ** 2)
second = 0.5 * la * np.sum((np.sum(U ** 2, axis=0) + np.sum(V ** 2, axis=0)))
return first + second
n_samples, n_features = X.shape
U = np.random.rand(n_samples, rank)
V = np.random.rand(n_features, rank)
while True:
g = f(X, U, V)
U = (1 - eta) * U + eta * X @ V @ np.linalg.inv(V.T @ V + la * np.identity(V.shape[1]))
V = (1 - eta) * V + eta * X.T @ U @ np.linalg.inv(U.T @ U + la * np.identity(U.shape[1]))
if abs(g - f(X, U, V)) / f(X, U, V) < eps:
break
return U, V
def main():
    """Demo: factorize a random 100x10 matrix at rank 5 with both solvers."""
    n_samples = 100
    n_features = 10
    rank = 5
    X = np.random.rand(n_samples, n_features)
    U, V = first_order_agd(X, rank)
    # Fix: sum the *squared* residuals. The original np.sum(X - U @ V.T)
    # let positive and negative errors cancel, so it could print ~0 even
    # for a poor reconstruction.
    loss = np.sum((X - U @ V.T) ** 2)
    print("Loss: ", loss)  # small when the factorization fits well
    print("U: ", U.shape)  # (n_samples, rank)
    print("V: ", V.shape)  # (n_features, rank)
    U, V = second_order_agd(X, rank)
    loss = np.sum((X - U @ V.T) ** 2)
    print("Loss: ", loss)  # small when the factorization fits well
    print("U: ", U.shape)  # (n_samples, rank)
    print("V: ", V.shape)  # (n_features, rank)


if __name__ == "__main__":
    main()
import numpy as np
def non_negative_pgd(X, rank, la=0.01, eta=0.01, eps=1e-5):
"""L2-regularized non-negativematrix decomposition with second order alternating projected gradient descent
Args:
X (np.ndarray): IxJ matrix (I: n_samples, J: n_features)
rank (int): rank for decomposed matrices ([I, J] -> [I, rank], [rank, J])
la (float, optional): coefficient for regularization. Defaults to 0.01.
eta (float, optional): learning rate. Defaults to 0.01.
eps (float, optional): accuracy. Defaults to 1e-5.
"""
def f(X, U, V):
first = 0.5 * np.sum((X - U @ V.T) ** 2)
second = 0.5 * la * np.sum((np.sum(U ** 2, axis=0) + np.sum(V ** 2, axis=0)))
return first + second
n_samples, n_features = X.shape
U = np.random.rand(n_samples, rank)
V = np.random.rand(n_features, rank)
while True:
g = f(X, U, V)
U = (1 - eta) * U + eta * X @ V @ np.linalg.inv(V.T @ V + la * np.identity(V.shape[1]))
V = (1 - eta) * V + eta * X.T @ U @ np.linalg.inv(U.T @ U + la * np.identity(U.shape[1]))
U = np.clip(U, 0, None)
V = np.clip(V, 0, None)
if abs(g - f(X, U, V)) / f(X, U, V) < eps:
break
return U, V
def main():
    """Demo: non-negative factorization of a random 10x5 matrix at rank 3."""
    n_samples = 10
    n_features = 5
    rank = 3
    X = np.random.rand(n_samples, n_features)
    U, V = non_negative_pgd(X, rank)
    # Fix: sum the *squared* residuals. The original np.sum(X - U @ V.T)
    # let positive and negative errors cancel, hiding a poor fit.
    loss = np.sum((X - U @ V.T) ** 2)
    print("Loss: ", loss)  # small when the factorization fits well
    print("U: ", np.min(U))  # >= 0: non-negativity of U holds
    print("V: ", np.min(V))  # >= 0: non-negativity of V holds


if __name__ == "__main__":
    main()
import numpy as np
def element_sgd(X, rank, T=500, la=0.01, a=1.0, b=1.0):
"""L2-regularized decomposition with stochastic gradient descent
Args:
X (np.ndarray): IxJ matrix (I: n_samples, J: n_features)
rank (int): rank for decomposed matrices ([I, J] -> [I, rank], [rank, J])
T (int, optional): iteration. Defaults to 500.
la (float, optional): coefficient for regularization. Defaults to 0.01.
a (float, optional): factor for learning rate. Defaults to 1.0.
b (float, optional): factor for learning rate. Defaults to 1.0.
"""
n_samples, n_features = X.shape
U = np.random.rand(n_samples, rank)
V = np.random.rand(n_features, rank)
for t in range(T):
i = np.random.randint(0, n_samples)
j = np.random.randint(0, n_features)
eta = 1.0 / (a * np.sqrt(t) + b)
grad_fu = -X[i, j] * V[j] + U[i] * (V[j].T @ V[j] + la / n_features)
grad_fv = -X[i, j] * U[i] + V[j] * (U[i].T @ U[i] + la / n_samples)
U[i] = U[i] - eta * grad_fu
V[j] = V[j] - eta * grad_fv
return U, V
def main():
    """Demo: SGD factorization of a random 10x5 matrix at rank 3."""
    n_samples = 10
    n_features = 5
    rank = 3
    X = np.random.rand(n_samples, n_features)
    U, V = element_sgd(X, rank)
    # Fix: sum the *squared* residuals. The original np.sum(X - U @ V.T)
    # let positive and negative errors cancel, hiding a poor fit.
    loss = np.sum((X - U @ V.T) ** 2)
    print("Loss: ", loss)  # small when the factorization fits well


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment