This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
import hts # To install: pip install scikit-hts | |
import collections | |
from scipy.optimize import lsq_linear | |
hts_df = pd.DataFrame([{'total': 14, | |
'CA': 5.4, 'TX': 1.8, 'WI': 5.9, | |
'CA_1': 0.8, 'CA_2': 0.6, 'CA_3': 0.9, 'CA_4': 0.3, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Script for fine-tuning Pegasus | |
Example usage: | |
# use XSum dataset as example, with first 1000 docs as training data | |
from datasets import load_dataset | |
dataset = load_dataset("xsum") | |
train_texts, train_labels = dataset['train']['document'][:1000], dataset['train']['summary'][:1000] | |
# use Pegasus Large model as base for fine-tuning | |
model_name = 'google/pegasus-large' | |
train_dataset, _, _, tokenizer = prepare_data(model_name, train_texts, train_labels) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
###################### | |
### Sample Reviews ### | |
###################### | |
###### Topic 1 ###### | |
"From the start our experience was bad There was only one person on check in so we had to queue Having been allcated our rooms we had to change them as we had specified adjacent or interconnecting rooms which they failed to do We then had to queue up again for the one person still on reception and 45 minutes later were allocated 2 adjacent rooms But one of the rooms had a smell of drains which I reported and which the very discourteous duty manager Thalia refused to deal with In fact she told me several times that I was wrong The rooms were small the beds very soft and the shower and toilet were part of the bedroom The smell of drains was coming from the shower For such an expensive hotel this was unacceptable especially the way the duty manager treated her customers I don t think I have ever encountered a more unpleasant manner in my many years of travelling" | |
"On arrival we only had 30 minutes to get ready We were to |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import vaex | |
import vaex.ml | |
# load iris data | |
df = vaex.ml.datasets.load_iris() | |
# perform train test split | |
df_train, df_test = df.ml.train_test_split(test_size=0.2) | |
# apply standardization transformation |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import vaex | |
import vaex.ml | |
# load titanic data | |
df_vaex = vaex.ml.datasets.load_titanic() | |
# perform train test split | |
df_train, df_test = df_vaex.ml.train_test_split(test_size=0.2) | |
# One-hot encode some features |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import numpy as np | |
from scipy.stats import uniform | |
from sklearn.datasets import load_iris | |
from sklearn.model_selection import train_test_split | |
from sklearn.model_selection import cross_validate | |
from sklearn import metrics | |
from sklearn.model_selection import ParameterSampler | |
from sklearn.ensemble import RandomForestClassifier |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
X_train, X_test, y_train, y_test = data_processing() | |
#################### 1. Setup Experiment ########################### | |
# set experiment name to organize runs | |
mlflow.set_experiment('New Experiment Name') | |
experiment = mlflow.get_experiment_by_name('New Experiment Name') | |
# set path to log data, e.g., mlruns local folder | |
mlflow.set_tracking_uri('./mlruns') |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# values.yaml to overwrite default values | |
scheduler: | |
image: | |
tag: 2.21.0 # Container image tag | |
serviceType: "LoadBalancer" | |
resources: | |
limits: | |
cpu: 1 | |
memory: 6G |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
import matplotlib | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
import missingno | |
import warnings | |
warnings.filterwarnings("ignore") | |
%matplotlib inline |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import spacy | |
import unidecode | |
from word2number import w2n | |
import contractions | |
nlp = spacy.load('en_core_web_md') | |
# exclude words from spacy stopwords list | |
deselect_stop_words = ['no', 'not'] |