Sayak Paul sayakpaul

TF-Hub text embedding modules for underrepresented languages

Mentors:

Morgan Roff
Sayak Paul
jaeyounkim

This is a summary of my GSoC 2021 project. In this project, I tried to produce text embedding modules trained on underrepresented languages like Arabic and Swahili and publish them on tfhub.dev.

	# Copyright 2021 Google LLC.
	# SPDX-License-Identifier: Apache-2.0
	import kfp
	import json
	import time
	from google.cloud import bigquery
	from google.cloud.exceptions import NotFound
	from kfp.v2.google.client import AIPlatformClient

	# Module-level BigQuery client, constructed with no explicit arguments.
	# NOTE(review): project and credentials are presumably resolved from the
	# ambient environment by the library — confirm before running elsewhere.
	client = bigquery.Client()

	# Copyright 2022 Google LLC.
	# SPDX-License-Identifier: Apache-2.0
	# Author: Maithra Raghu <[email protected]>


	def compute_distance_matrix(patch_size, num_patches, length):
	"""Helper function to compute distance matrix."""

	distance_matrix = np.zeros((num_patches, num_patches))

	import torch
	from diffusers import FluxPipeline
	from torch import nn


	class ModelOffloaderV2:
	def __init__(self, model: nn.Module, record_stream: bool = False):
	# move model to pinned memory. keep a model copy in CPU pinned memory.
	for p in model.parameters():
	p.data = p.data.cpu().pin_memory()