This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
# vectorize words | |
from sklearn.feature_extraction.text import CountVectorizer | |
# naive bayes | |
from sklearn.naive_bayes import MultinomialNB | |
from sklearn.metrics import auc, roc_curve | |
# train test split | |
from sklearn.model_selection import train_test_split | |
# MAIN |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
tweets_df = pd.read_csv("split-data/X_train.csv") | |
target_df = pd.read_csv("split-data/y_train.csv") | |
# PREPROCESS | |
# drop the info we're not going to use | |
# id, date, flag | |
tweets_df.drop(columns=['ids', 'date', 'flag'], inplace=True) | |
# start the cleaning process | |
# lower text | |
tweets_df.loc[:,'lower_text'] = tweets_df['text'].str.lower() | |
# remove stopwords |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import nltk | |
nltk.download("stopwords") | |
from nltk.corpus import stopwords | |
import string | |
import re | |
# vectorizer | |
from sklearn.feature_extraction.text import CountVectorizer | |
STOPWORDS = stopwords.words("english") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
for epoch in 1..N_EPOCHS { | |
// generate random idxs for batch size | |
// run all the images divided in batches -> for loop | |
for i in 1..n_it { | |
let batch_idxs = generate_random_index(TRAIN_SIZE as i64, BATCH_SIZE); | |
let batch_images = train_data.index_select(0, &batch_idxs).to_device(vs.device()).to_kind(Kind::Float); | |
let batch_lbls = train_lbl.index_select(0, &batch_idxs).to_device(vs.device()).to_kind(Kind::Int64); | |
// compute the loss | |
let loss = net.forward_t(&batch_images, true).cross_entropy_for_logits(&batch_lbls); | |
opt.backward_step(&loss); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
pub fn generate_random_index(ArraySize: i64, BatchSize: i64)-> Tensor{ | |
let random_idxs = Tensor::randint(ArraySize, &[BatchSize], kind::INT64_CPU); | |
random_idxs | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use std::result::Result; | |
use std::error::Error; | |
use mnist::*; | |
use tch::{kind, Kind, Tensor, nn, nn::ModuleT, nn::OptimizerConfig, Device}; | |
use ndarray::{Array3, Array2}; | |
const LABELS: i64 = 10; // number of distinct labels | |
const HEIGHT: usize = 28; | |
const WIDTH: usize = 28; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
for epoch in 1..N_EPOCHS { | |
let loss = net.forward(&train_data).cross_entropy_for_logits(&train_lbl); | |
// backward step | |
opt.backward_step(&loss); | |
//accuracy on test | |
let val_accuracy = net.forward(&val_data).accuracy_for_logits(&val_lbl); | |
println!( | |
"epoch: {:4} train loss: {:8.5} val acc: {:5.2}%", | |
epoch, | |
f64::from(&loss), |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use std::result::Result; | |
use std::error::Error; | |
use mnist::*; | |
use tch::{kind, Kind, Tensor, nn, nn::Module, nn::OptimizerConfig, Device}; | |
use ndarray::{Array3, Array2}; | |
const LABELS: i64 = 10; // number of distinct labels | |
const HEIGHT: usize = 28; | |
const WIDTH: usize = 28; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use std::result::Result; | |
use std::error::Error; | |
use mnist::*; | |
use tch::{kind, no_grad, Kind, Tensor}; | |
use ndarray::{Array3, Array2}; | |
const LABELS: i64 = 10; // number of distinct labels | |
const HEIGHT: usize = 28; | |
const WIDTH: usize = 28; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use std::result::Result; | |
use std::error::Error; | |
use mnist::*; | |
use tch::{kind, no_grad, Kind, Tensor}; | |
use ndarray::{Array3, Array2}; | |
pub fn image_to_tensor(data:Vec<u8>, dim1:usize, dim2:usize, dim3:usize)-> Tensor{ | |
// normalize the image as well | |
let inp_data: Array3<f32> = Array3::from_shape_vec((dim1, dim2, dim3), data) |