Skip to content

Instantly share code, notes, and snippets.

View Houssem96's full-sized avatar
🎯
Focusing

Houssem_Ayed Houssem96

🎯
Focusing
  • Paris
View GitHub Profile
@Houssem96
Houssem96 / libraries.py
Created August 22, 2021 12:54
import libraries for news classification task
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datasets import list_datasets, load_dataset
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score, classification_report
# Choose the compute device once, up front: CUDA when PyTorch reports it as
# available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
@Houssem96
Houssem96 / explorations.py
Created August 22, 2021 13:11
Class distribution and text lengths for each class
# Load the dataset named by `dataset_name`. That name is not assigned in this
# snippet; the ag_news_dataset.py snippet sets it to 'ag_news'.
AG_news_dataset = load_dataset(dataset_name)
# Switch the dataset's output format to pandas so the slice below comes back
# as pandas data (presumably a DataFrame — confirm against the datasets docs).
AG_news_dataset.set_format(type="pandas")
# Take every row of the "train" split for exploration.
df = AG_news_dataset["train"][:]
def label_int2str(row, split):
    """Return the label name for the integer label ``row``.

    The conversion is delegated to the ``"label"`` feature of the given
    ``split`` of the module-level ``AG_news_dataset``.
    """
    label_feature = AG_news_dataset[split].features["label"]
    return label_feature.int2str(row)
# Add a human-readable label column: apply() calls label_int2str on each value
# of the "label" column, forwarding split="train" as a keyword argument.
df["label_name"] = df["label"].apply(label_int2str, split="train")
# Horizontal bar chart of how many rows fall in each class, counts ascending.
df["label_name"].value_counts(ascending=True).plot.barh()
plt.title("Category Counts")
@Houssem96
Houssem96 / ag_news_train.py
Created August 22, 2021 13:18
Loading and fine-tuning the DistilBERT model on the ag_news dataset
# Number of target classes. NOTE(review): not referenced anywhere in the
# visible lines — presumably consumed further down; confirm.
num_labels = 4
# Checkpoint name shared by the tokenizer and the model below.
model_name = "distilbert-base-uncased"
# `dataset_name` is not assigned in this snippet; the ag_news_dataset.py
# snippet sets it to 'ag_news'.
# Training data: the first 8000 rows of the "train" split.
AG_news_dataset_train = load_dataset(dataset_name, split='train[:8000]')
# Validation data: the last 2000 rows of the same "train" split.
AG_news_dataset_validation = load_dataset(dataset_name, split='train[-2000:]')
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): AutoModel is loaded here and num_labels is not passed to it.
# If this model is the one being fine-tuned for classification,
# AutoModelForSequenceClassification.from_pretrained(model_name,
# num_labels=num_labels) is probably what is intended — confirm against the
# rest of the script before changing.
# `device` is not assigned in this snippet; the libraries.py snippet defines it.
model = AutoModel.from_pretrained(model_name).to(device)
def tokenize(batch):
    """Tokenize the ``"text"`` entries of ``batch``.

    The module-level ``tokenizer`` is called with padding and truncation
    both enabled, and its result is returned unchanged.
    """
    texts = batch["text"]
    return tokenizer(texts, padding=True, truncation=True)
@Houssem96
Houssem96 / ag_news_dataset.py
Created August 23, 2021 08:07
loading AG_News dataset
# Identifier handed to load_dataset to select the dataset.
dataset_name = 'ag_news'
AG_news_dataset = load_dataset(dataset_name)
# Bare expression: echoes the loaded object when run in a notebook/REPL cell;
# it has no effect when this file is executed as a plain script.
AG_news_dataset