Skip to content

Instantly share code, notes, and snippets.

@abdelmalek13
abdelmalek13 / bilingual_tokenizer.py
Last active February 3, 2025 12:57
# Bilingual word tokenization (Arabic and English) with a custom regex
import re
import string
import spacy
import nltk
from nltk.tokenize import (TweetTokenizer, word_tokenize)
from camel_tools.tokenizers.word import simple_word_tokenize
def bi_tokenizer():