# MinHash LSH deduplication on a Spark cluster
import time

from pyspark.ml.feature import MinHashLSH, MinHashLSHModel, Tokenizer, HashingTF
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col
from spark_session_builder import build_spark_session

# Connect to the cluster master with all 128 cores.
spark = build_spark_session(
    master="spark://cpu128-dy-r6i-32xlarge-3:7077",
    num_cores=128,
    mem_gb=999,
)

hash_size = 100   # number of MinHash hash tables
threshold = 0.8   # Jaccard similarity cutoff for near-duplicates
start = time.time()
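
The snippet stops right after the timer starts. A minimal sketch of how these imports typically combine into an LSH near-duplicate join follows; `df` with `id` and `text` columns is a hypothetical stand-in, and note that `approxSimilarityJoin` takes a Jaccard *distance*, so a similarity cutoff of 0.8 corresponds to a distance cutoff of 1 - 0.8 = 0.2.

# Hypothetical continuation: df is assumed to have "id" and "text" columns.
tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
tf = HashingTF(inputCol="tokens", outputCol="features", numFeatures=1 << 18)
mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=hash_size)

featurized = tf.transform(tokenizer.transform(df))
model = mh.fit(featurized)

# Self-join under a Jaccard-distance cutoff of 1 - threshold.
pairs = model.approxSimilarityJoin(featurized, featurized, 1 - threshold,
                                   distCol="jaccard_dist")
dupes = pairs.filter(col("datasetA.id") < col("datasetB.id"))
print(f"{dupes.count()} near-duplicate pairs in {time.time() - start:.1f}s")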
# Convert a CodeGen checkpoint to GPT-J format
import torch
from transformers import GPTJForCausalLM, GPTJConfig
from transformers import CodeGenTokenizer, CodeGenForCausalLM

def cg2gptj(code_model):
    # Load the source CodeGen model and keep its config for the conversion.
    cg_model = CodeGenForCausalLM.from_pretrained(code_model, torch_dtype="auto")
    cg_config = cg_model.config
    # Create empty GPTJ model
    print('Creating empty GPTJ model')
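
The function is cut off after this print. A minimal sketch of the step it announces, assuming the GPT-J config simply mirrors the CodeGen model's dimensions (the field names exist on both configs, but the mapping itself is an assumption, not the author's verbatim code):

    # Assumed continuation: build an empty GPT-J with CodeGen's dimensions.
    gptj_config = GPTJConfig(
        vocab_size=cg_config.vocab_size,
        n_positions=cg_config.n_positions,
        n_embd=cg_config.n_embd,
        n_layer=cg_config.n_layer,
        n_head=cg_config.n_head,
        rotary_dim=cg_config.rotary_dim,
    )
    gptj_model = GPTJForCausalLM(gptj_config)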
# Environment setup: OpenMPI, CUDA 11.7, conda env, cuDNN
module load openmpi cuda/11.7

#CONDA_HOME=/fsx/quentin/miniconda3
CONDA_HOME=/fsx/gpt-neox/conda/envs/neox
#CONDA_HOME=/fsx/gpt-neox/conda/envs/improved-t5

CUDNN_HOME=/fsx/quentin/cudnn-linux-x86_64-8.6.0.163_cuda11-archive
export LD_LIBRARY_PATH=$CUDNN_HOME/lib:$LD_LIBRARY_PATH
export CPATH=$CUDNN_HOME/include:$CPATH
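
The script ends with the cuDNN paths. A typical continuation (assumed, since the original cuts off here) puts the chosen conda environment first on PATH and sanity-checks that PyTorch can see CUDA:

# Assumed continuation: expose the conda env and verify CUDA is usable.
export PATH=$CONDA_HOME/bin:$PATH
python -c "import torch; print(torch.__version__, torch.cuda.is_available())"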
# Check whether a URL's host appears in a dataframe of URLs
import pandas as pd
from urllib.parse import urlparse

# NOTE: the original call passed no path; "urls.csv" is a placeholder filename.
df = pd.read_csv("urls.csv")

def url_matches_dataframe(url: str, df: pd.DataFrame) -> bool:
    # Parse the given URL to get the netloc and hostname
    parsed_url = urlparse(url)
    netloc = parsed_url.netloc
    hostname = parsed_url.hostname
    # Remove "www." from the netloc and hostname
    netloc = (netloc or "").removeprefix("www.")
    hostname = (hostname or "").removeprefix("www.")
    # The original is truncated here; a plausible finish, assuming the
    # dataframe has a "url" column, is to compare stripped hostnames:
    hosts = {(urlparse(u).hostname or "").removeprefix("www.") for u in df["url"].astype(str)}
    return hostname in hosts or netloc in hosts
#!/bin/bash
#SBATCH --job-name="elm"
#SBATCH --partition=gpu
#SBATCH --mem-per-cpu=16GB       # Amount of CPU memory
#SBATCH --nodes=4
#SBATCH --ntasks-per-node=8      # Crucial - one task per GPU on each node!
#SBATCH --cpus-per-task=6        # Number of cores per task
#SBATCH --hint=nomultithread     # We get physical cores, not logical
#SBATCH --gres=gpu:8             # Number of GPUs per node
#SBATCH --output=%x_%j.out       # Set this to the dir where you want slurm outs to go
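
The header stops before any launch command. A common body for a multi-node script like this (assumed; `train.py` is a placeholder, not from the original) derives the rendezvous address from Slurm and starts one process per task:

# Assumed continuation: rendezvous on the first allocated node, then launch.
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_PORT=29500
srun python train.py   # placeholder entry point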
# Sanity-check that deepspeed imports, and print its version
import deepspeed as ds
print(ds.__version__)
#!/bin/bash
#SBATCH --job-name="elm"
#SBATCH --partition=gpu
#SBATCH --mem-per-cpu=16GB       # Amount of CPU memory
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8      # Crucial - one task per GPU on each node!
#SBATCH --cpus-per-task=6        # Number of cores per task
#SBATCH --hint=nomultithread     # We get physical cores, not logical
#SBATCH --gres=gpu:8             # Number of GPUs per node
#SBATCH --output=%x_%j.out       # Set this to the dir where you want slurm outs to go
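
Given the deepspeed import check above, a plausible body for this single-node header (assumed; the script name is a placeholder) is a deepspeed launcher run across the node's 8 GPUs:

# Assumed continuation: launch with the deepspeed CLI on all 8 local GPUs.
# (train.py is a placeholder; script-specific flags would follow it.)
deepspeed --num_gpus 8 train.py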