c4_filters.py
import re
import nltk
import ftfy
import multiprocessing

from datasets import load_dataset
from langdetect import detect_langs
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download("punkt")
# Unicode space and zero-width characters to collapse to a plain space
# (written as escape sequences because the literal characters are invisible).
whitespace = {
    " ",        # space
    "\u00a0",   # no-break space
    "\u2002",   # en space
    "\u2003",   # em space
    "\u2004",   # three-per-em space
    "\u2005",   # four-per-em space
    "\u2009",   # thin space
    "\u200a",   # hair space
    "\u202f",   # narrow no-break space
    "\u3000",   # ideographic space
    "\u200b",   # zero-width space
    "\ufeff",   # zero-width no-break space (BOM)
}
def is_not_empty(example):
    return len(example['text']) > 0

def is_terminal_punctuation(line):
    # Line must end with ., ?, ! or a double quote, optionally followed by whitespace.
    return bool(re.search(r'[\.\?!"]\s*$', line))

def is_valid_sentence(sentence):
    # Require at least three word tokens.
    words = word_tokenize(sentence)
    return len(words) >= 3

def contains_javascript(sentence):
    # Matches "javascript", "java script", or "JS" as a whole word, case-insensitively.
    return bool(re.search(r'\b(?:java\s*script|JS)\b', sentence, re.IGNORECASE))

def contains_lorem_ipsum(sentence):
    return bool(re.search(r'\b(?:lorem\s*ipsum)\b', sentence, re.IGNORECASE))

def contains_curly_bracket(sentence):
    return bool(re.search(r'[{}]', sentence))

def is_english(sentence):
    # Keep only text that langdetect labels English with at least 0.99 probability.
    languages = detect_langs(sentence)
    return any(lang.lang == 'en' and lang.prob >= 0.99 for lang in languages)
# def contains_warning(sentence):
#     return bool(re.search(r'WARNING:\s*THE\s*EDGAR\s*SYSTEM\s*ENCOUNTERED\s*ERROR\(S\)\s*WHILE\s*PROCESSING\s*THIS\s*SCHEDULE\.', sentence, re.IGNORECASE))
def contains_url(sentence):
    # Matches http(s)://, ftp://, and www.-style URLs.
    url_pattern = r'(?:(?:http|https|ftp):\/\/|www\.)[\w/\-?=%.]+\.[\w/\-?=%.]+'
    return bool(re.search(url_pattern, sentence))

def contains_phone_number(sentence):
    # Loose match for international and North American phone number formats.
    phone_number_pattern = r'\b(?:\+\d{1,3})?[-. (]*(?:\d{1,3})?[-. )]*(?:\d{2,5})[-. (]*(?:\d{2,5})[-. )]*(?:\d{2,5})\b'
    return bool(re.search(phone_number_pattern, sentence))

def remove_ssn(sentence):
    # Strip US social security numbers of the form 123-45-6789.
    return re.sub(r'\b\d{3}-\d{2}-\d{4}\b', '', sentence)

def remove_ip_addresses(sentence):
    # Strip dotted-quad IPv4 addresses.
    return re.sub(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', '', sentence)

def remove_credit_card_numbers(sentence):
    # Strip 16-digit card numbers grouped in fours, with optional spaces or dashes.
    return re.sub(r'\b(?:\d{4}[ -]?){3}\d{4}\b', '', sentence)
def has_min_alphanumeric_percentage(sentence, min_percentage=50):
    # Optional filter: not applied in filter_dataset below, but imported by the unit tests.
    alphanumeric_count = sum(c.isalnum() for c in sentence)
    total_count = len(sentence)
    percentage = (alphanumeric_count / total_count) * 100
    return percentage >= min_percentage
def remove_repeated_chars(sentence):
    # Collapse runs of four or more identical characters to a single character.
    return re.sub(r'(.)\1{3,}', r'\1', sentence)

def fix_encoding(text):
    # Repair mojibake and uncurl smart quotes with ftfy.
    return ftfy.fix_text(text)

def normalize_whitespace(text):
    # Map unusual Unicode spaces to a plain space, then collapse whitespace runs.
    text = "".join([char if char not in whitespace else " " for char in text])
    return re.sub(r'\s+', ' ', text).strip()

def has_min_chars(text, min_chars=3):
    return len(text) >= min_chars
def filter_dataset(example):
    text = example["text"]
    text = fix_encoding(text)
    # Drop the whole document if it contains lorem ipsum or curly brackets,
    # or is not confidently English.
    if contains_lorem_ipsum(text) or contains_curly_bracket(text) or not is_english(text):
        return {"text": ""}
    sentences = sent_tokenize(text)
    valid_sentences = []
    for sentence in sentences:
        if (is_terminal_punctuation(sentence) and
                is_valid_sentence(sentence) and
                not contains_javascript(sentence) and
                # not contains_warning(sentence) and
                not contains_phone_number(sentence) and
                not contains_url(sentence) and
                # has_min_alphanumeric_percentage(sentence) and
                has_min_chars(sentence)):
            sentence = remove_ssn(sentence)
            sentence = remove_repeated_chars(sentence)
            sentence = remove_ip_addresses(sentence)
            sentence = remove_credit_card_numbers(sentence)
            sentence = normalize_whitespace(sentence)
            valid_sentences.append(sentence)
    # Keep only documents that retain at least five valid sentences.
    if len(valid_sentences) < 5:
        return {"text": ""}
    else:
        return {"text": "\n".join(valid_sentences)}
if __name__ == "__main__":
    dataset = load_dataset("conceptofmind/test_c4_filters", split="train")
    print(dataset)
    filtered_dataset = dataset.map(
        filter_dataset,
        num_proc=multiprocessing.cpu_count()
    ).filter(is_not_empty)
    print(filtered_dataset)
    filtered_dataset.push_to_hub('uber_clean')
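For a quick local check without downloading the dataset above, filter_dataset can also be exercised on a small in-memory Dataset. The snippet below is a minimal sketch, assuming the functions in c4_filters.py are importable in the current session; the sample strings are invented for illustration.

from datasets import Dataset

sample = Dataset.from_dict({
    "text": [
        # Five short, clean sentences, so this document should be kept.
        "This is a clean paragraph. It has several sentences. Each one ends with punctuation. "
        "None contain links or digits. So the document should be kept.",
        # Dropped outright by the lorem ipsum check.
        "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
    ]
})

cleaned = sample.map(filter_dataset).filter(is_not_empty)
print(cleaned["text"])  # only the clean paragraph should remain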
The gist also includes a unit test file that exercises the filters above:
import unittest

from datasets import load_dataset

from c4_filters import (
    contains_phone_number,
    contains_javascript,
    contains_lorem_ipsum,
    contains_curly_bracket,
    is_terminal_punctuation,
    is_valid_sentence,
    has_min_chars,
    fix_encoding,
    normalize_whitespace,
    remove_repeated_chars,
    has_min_alphanumeric_percentage,
    is_english,
    remove_ssn,
    remove_credit_card_numbers,
    contains_url,
    remove_ip_addresses,
    filter_dataset
)
class TestFilters(unittest.TestCase):
    def test_contains_lorem_ipsum(self):
        test_cases = [
            ("This is a sentence with lorem ipsum text.", True),
            ("Lorem ipsum dolor sit amet, consectetur adipiscing elit.", True),
            ("This is a normal sentence without it.", False)
        ]
        for sentence, expected_result in test_cases:
            self.assertEqual(contains_lorem_ipsum(sentence), expected_result)

    def test_contains_curly_bracket(self):
        test_cases = [
            ("This is a sentence with a { curly bracket.", True),
            ("The code snippet is: function() { return true; }", True),
            ("This is a normal sentence without any curly brackets.", False)
        ]
        for sentence, expected_result in test_cases:
            self.assertEqual(contains_curly_bracket(sentence), expected_result)

    def test_is_terminal_punctuation(self):
        test_cases = [
            ("This sentence ends with a period.", True),
            ("What a great day!", True),
            ("Is this a question?", True),
            ("This sentence does not have terminal punctuation", False)
        ]
        for sentence, expected_result in test_cases:
            self.assertEqual(is_terminal_punctuation(sentence), expected_result)
    def test_is_valid_sentence(self):
        test_cases = [
            ("This sentence has at least three words.", True),
            ("Only three words.", True),
            ("One.", False)
        ]
        for sentence, expected_result in test_cases:
            self.assertEqual(is_valid_sentence(sentence), expected_result)

    def test_contains_phone_number(self):
        test_cases = [
            ("Call me at (123) 456-7890.", True),
            ("My number is +1 (555) 123-4567.", True),
            ("You can reach me at 9876543210.", True),
            ("The temperature is 100F today.", False),
            ("This is a regular sentence without a phone number.", False),
            ("The number you are trying to reach is no longer in service.", False)
        ]
        for sentence, expected_result in test_cases:
            self.assertEqual(contains_phone_number(sentence), expected_result)

    def test_contains_javascript(self):
        test_cases = [
            ("This page requires JavaScript to run properly.", True),
            ("Please enable Javascript in your browser.", True),
            ("This is a sentence with the word javascript.", True),
            ("This page uses CSS for styling.", False),
            ("This is a regular sentence without the word.", False),
            ("Java and Python are popular programming languages.", False)
        ]
        for sentence, expected_result in test_cases:
            self.assertEqual(contains_javascript(sentence), expected_result)

    def test_has_min_chars(self):
        test_cases = [
            ("This sentence has more than 20 characters.", True),
            ("This sentence has less.", True)
        ]
        for sentence, expected_result in test_cases:
            self.assertEqual(has_min_chars(sentence, 20), expected_result)
    def test_fix_encoding(self):
        test_cases = [
            ("This is a normal sentence without any encoding issues.", "This is a normal sentence without any encoding issues."),
            ("This sentence has ‘smart’ quotes.", "This sentence has 'smart' quotes."),
            ("Möbius strip is a surface with only one side.", "Möbius strip is a surface with only one side.")
        ]
        for input_text, expected_result in test_cases:
            self.assertEqual(fix_encoding(input_text), expected_result)

    def test_normalize_whitespace(self):
        test_cases = [
            ("This is a normal sentence without any unusual whitespace.", "This is a normal sentence without any unusual whitespace."),
            ("This sentence has\u2009different\u200aspaces.", "This sentence has different spaces."),
            ("This\u3000sentence\u2002has\u2003wide\u2004spaces.", "This sentence has wide spaces.")
        ]
        for input_text, expected_result in test_cases:
            self.assertEqual(normalize_whitespace(input_text), expected_result)

    def test_remove_repeated_chars(self):
        test_cases = [
            ("This is a normal sentence without any repeated characters.", "This is a normal sentence without any repeated characters."),
            ("Thiiiiis sentenceeee has soooome repeaaaated chaaaaracters.", "This sentence has some repeated characters."),
            ("AAAAAhhhhhh, I can't believe thisssss!", "Ah, I can't believe this!")
        ]
        for input_text, expected_result in test_cases:
            self.assertEqual(remove_repeated_chars(input_text), expected_result)

    def test_has_min_alpha_numeric(self):
        test_cases = [
            ("This is a normal sentence with enough alpha numeric characters.", True),
            ("$%#@!&*^", False),
            ("A sentence with 20% alpha numeric characters.", True),
            ("A12_+%# $()?", False)
        ]
        for input_text, expected_result in test_cases:
            self.assertEqual(has_min_alphanumeric_percentage(input_text, min_percentage=75), expected_result)
    def test_is_english(self):
        test_cases = [
            ("This is a normal English sentence.", True),
            ("Ceci est une phrase en français.", False),
            ("Dies ist ein Satz auf Deutsch.", False),
            ("Esta es una oración en español.", False)
        ]
        for input_text, expected_result in test_cases:
            self.assertEqual(is_english(input_text), expected_result)

    def test_remove_ssn(self):
        test_cases = [
            ("This is a normal sentence without any social security numbers.", "This is a normal sentence without any social security numbers."),
            ("My social security number is 123-45-6789.", "My social security number is ."),
            ("Another SSN is 987-65-4321, please handle it.", "Another SSN is , please handle it.")
        ]
        for input_text, expected_result in test_cases:
            self.assertEqual(remove_ssn(input_text), expected_result)

    def test_contains_url(self):
        test_cases = [
            ("Visit our website at https://www.example.com.", True),
            ("You can find the article at http://example.org/article.", True),
            ("Check out our blog: www.blog.example.net", True),
            ("My email is [email protected]", False),
            ("The price is $20,000.", False),
            ("This is a regular sentence without a URL.", False)
        ]
        for sentence, expected_result in test_cases:
            self.assertEqual(contains_url(sentence), expected_result)
    def test_remove_credit_card(self):
        test_cases = [
            ("This is a normal sentence without any credit card numbers.", "This is a normal sentence without any credit card numbers."),
            ("My credit card number is 1234-5678-9123-4567.", "My credit card number is ."),
            ("Another credit card number is 9876-5432-1098-7654, please remove it.", "Another credit card number is , please remove it.")
        ]
        for input_text, expected_result in test_cases:
            self.assertEqual(remove_credit_card_numbers(input_text), expected_result)

    def test_remove_ip(self):
        test_cases = [
            ("This is a normal sentence without any IP addresses.", "This is a normal sentence without any IP addresses."),
            ("The server IP address is 192.168.1.1.", "The server IP address is ."),
            ("Another IP address is 10.0.0.1, please remove it.", "Another IP address is , please remove it.")
        ]
        for input_text, expected_result in test_cases:
            self.assertEqual(remove_ip_addresses(input_text), expected_result)

    def test_filter_dataset(self):
        # Load a sample from the dataset
        dataset = load_dataset("conceptofmind/test_l", split="train")
        example = dataset[0]  # Get the first example from the dataset
        # Apply the filter_dataset function
        filtered_example = filter_dataset(example)
        # Perform assertions to check if the filtered_example is as expected
        # Example: Check if the filtered_example is not empty
        self.assertNotEqual(filtered_example["text"], "")
if __name__ == '__main__':
    unittest.main(argv=['first-arg-is-ignored'], exit=False)
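Note that passing argv=['first-arg-is-ignored'] and exit=False lets unittest.main() run inside a notebook or interactive session: the real command-line arguments are ignored and the interpreter is not shut down when the test run finishes. When run from a shell, a plain unittest.main() works as well.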