Houssem_Ayed Houssem96

🎯

Focusing

Data engineer/ scientist

Houssem96 / libraries.py

Created August 22, 2021 12:54

Import the libraries for the news classification task

	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	from datasets import list_datasets, load_dataset
	import torch
	from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, Trainer, TrainingArguments
	from sklearn.metrics import accuracy_score, f1_score, classification_report

	# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
	if torch.cuda.is_available():
	    device = torch.device("cuda")
	else:
	    device = torch.device("cpu")

Houssem96 / explorations.py

Created August 22, 2021 13:11

Class distribution and text lengths for each class

	# Load the dataset by name; no split argument is given, so the result is
	# indexed by split name below.
	AG_news_dataset = load_dataset(dataset_name)
	# Ask the dataset to hand back pandas objects when it is indexed.
	AG_news_dataset.set_format(type="pandas")
	# Materialize the whole "train" split; presumably a DataFrame given the
	# pandas format set above -- it is used like one afterwards.
	df = AG_news_dataset["train"][:]
	def label_int2str(row, split):
	    """Convert an integer class label into its string name.

	    Args:
	        row: Integer label id to convert (one value of the ``label`` column).
	        split: Name of the dataset split whose ``label`` feature performs
	            the lookup, e.g. ``"train"``.

	    Returns:
	        Whatever the ``label`` feature's ``int2str`` returns for ``row``.
	    """
	    # The body must be indented under the ``def``; at the same level as the
	    # ``def`` line Python raises an IndentationError.
	    return AG_news_dataset[split].features["label"].int2str(row)

	# Add a readable class-name column derived from the integer labels of the
	# "train" split.
	df["label_name"] = df["label"].apply(
	    lambda label_id: label_int2str(label_id, split="train")
	)
	# Count the examples per class (ascending) and draw them as horizontal bars.
	(
	    df["label_name"]
	    .value_counts(ascending=True)
	    .plot(kind="barh")
	)
	plt.title("Category Counts")

Houssem96 / ag_news_train.py

Created August 22, 2021 13:18

Loading and fine-tuning the DistilBERT model on the ag_news dataset

	# Number of output classes; sizes the classification head created below.
	num_labels = 4
	# Pretrained checkpoint used for both the tokenizer and the model.
	model_name = "distilbert-base-uncased"

	# Work on small slices of the "train" split: the first 8000 rows for
	# training and the last 2000 rows for validation.
	AG_news_dataset_train = load_dataset(dataset_name, split='train[:8000]')
	AG_news_dataset_validation = load_dataset(dataset_name, split='train[-2000:]')

	tokenizer = AutoTokenizer.from_pretrained(model_name)
	# Fine-tuning a classifier needs a sequence-classification head sized to
	# num_labels; plain AutoModel loads only the base encoder and left
	# num_labels unused.
	model = AutoModelForSequenceClassification.from_pretrained(
	    model_name, num_labels=num_labels
	).to(device)
	def tokenize(batch):
	    """Tokenize the ``"text"`` entry of a batch with the module-level tokenizer.

	    Args:
	        batch: Mapping with a ``"text"`` entry holding the texts to encode.

	    Returns:
	        The tokenizer's output for those texts, called with ``padding=True``
	        and ``truncation=True``.
	    """
	    # The body must be indented under the ``def``; at the same level as the
	    # ``def`` line Python raises an IndentationError.
	    return tokenizer(batch["text"], padding=True, truncation=True)

Houssem96 / ag_news_dataset.py

Created August 23, 2021 08:07

Loading the AG_News dataset

	# Name of the dataset handed to load_dataset.
	dataset_name = 'ag_news'
	# Load the dataset by name (no split argument is given).
	AG_news_dataset = load_dataset(dataset_name)
	# Bare expression: presumably here so a notebook cell displays the loaded
	# dataset; it has no effect when run as a plain script.
	AG_news_dataset