Last active
February 26, 2025 16:46
-
-
Save ozturkoktay/afad85d3e3b3e6a8844600580e7a1b7d to your computer and use it in GitHub Desktop.
These are helper functions for Python projects.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import json | |
import os | |
import pickle | |
import codecs | |
import numpy as np | |
from functools import reduce | |
def mean_absolute_percentage_error(val1: float, val2: float) -> float:
    """
    Calculate the symmetric absolute percentage error between two values.

    The result is ``2 * |val1 - val2| / (|val1| + |val2|)``. The denominator
    uses absolute values so the error is never negative and two values of
    opposite sign are not mistaken for identical ones.

    Args:
        val1: First value.
        val2: Second value.

    Returns:
        A float between 0.0 (equal values) and 2.0. Returns 0.0 when both
        values are zero, where the ratio is undefined.
    """
    scale = abs(val1) + abs(val2)
    if scale == 0:
        # Only reachable when both inputs are zero, i.e. they are equal.
        return 0.0
    return (abs(val1 - val2) / scale) * 2.0
def rescale_range(values: list) -> dict:
    """
    Map every value in ``values`` to its min-max normalised position.

    The smallest value maps to 0 and the largest to 1. An empty input gives
    an empty dict, and an input whose values are all equal maps each of them
    to 0.0. Duplicate values collapse into a single dict key.
    """
    if not values:
        return {}
    lowest = np.min(values)
    highest = np.max(values)
    if lowest == highest:
        # Zero-width range: avoid dividing by zero.
        return dict.fromkeys(values, 0.0)
    return {item: (item - lowest) / (highest - lowest) for item in values}
def check_file_exists(filename: str) -> str:
    """
    Return ``filename`` unchanged if it names an existing regular file.

    Raises:
        FileNotFoundError: If ``os.path.isfile(filename)`` is false.
    """
    if os.path.isfile(filename):
        return filename
    raise FileNotFoundError(f"Invalid filename: {filename}")
def read_text_file(filename: str):
    """
    Yield the non-blank lines of a UTF-8 text file one at a time.

    Every line has its leading and trailing whitespace removed (this
    includes the newline and any carriage return). Lines that are empty
    after stripping are skipped.

    Args:
        filename: Path of the text file to read.

    Yields:
        Each non-empty, stripped line as a string.

    Raises:
        FileNotFoundError: From ``check_file_exists`` when ``filename`` is
            not an existing file. Because this is a generator, the check
            runs when iteration starts, not when the function is called.
    """
    filename = check_file_exists(filename)
    # Built-in open() ends a line only at \n, \r or \r\n. codecs.open() also
    # breaks lines at characters such as U+2028 or form feed, which would
    # split a single record in two.
    with open(filename, encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if line:
                yield line
def read_hash_file(filename: str):
    """
    Yield key/value pairs from a UTF-8, tab-separated file.

    Each line is stripped of surrounding whitespace and split on tabs.
    Only lines that produce exactly two fields are yielded; every other
    line is silently skipped.

    Args:
        filename: Path of the file to read.

    Yields:
        A two-element list ``[key, value]`` for each well-formed line.

    Raises:
        FileNotFoundError: From ``check_file_exists`` when ``filename`` is
            not an existing file. Because this is a generator, the check
            runs when iteration starts, not when the function is called.
    """
    filename = check_file_exists(filename)
    # Built-in open() ends a line only at \n, \r or \r\n. codecs.open() also
    # breaks lines at characters such as U+2028 or form feed, which would
    # split a single record in two.
    with open(filename, encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split("\t")
            if len(parts) == 2:
                yield parts
def load_data(source: str, path: str, config=None):
    """
    Load the file at ``path`` into a structure selected by ``source``.

    Supported ``source`` values:
        'list'        -- list of the file's non-blank, stripped lines.
        'key-json'    -- dict built from ``key<TAB>json`` lines; the part
                         after the first tab is parsed with ``json.loads``.
        'commalist'   -- dict mapping every comma-separated word of a line
                         to the list of all words on that line.
        'dict'        -- dict of the pairs yielded by ``read_hash_file``.
        'json'        -- the parsed content of a JSON file.
        'bloomfilter' -- a ``pybloom_live.BloomFilter`` holding every line.

    Args:
        source: One of the source type names listed above.
        path: Path of the file to load.
        config: Optional mapping, read only for 'bloomfilter'. Its
            'capacity' (default 1000) and 'error_rate' (default 0.01)
            entries are passed to the BloomFilter constructor.

    Returns:
        The loaded list, dict, JSON value or BloomFilter.

    Raises:
        FileNotFoundError: If ``path`` is not an existing file.
        IndexError: If a 'key-json' line contains no tab.
        ValueError: If ``source`` is not a supported type.
    """
    path = check_file_exists(path)
    if source == 'list':
        return list(read_text_file(path))
    if source == 'key-json':
        records = {}
        for line in read_text_file(path):
            # Split on the first tab only, so tabs inside the JSON payload
            # are kept instead of truncating it.
            fields = line.split("\t", 1)
            records[fields[0]] = json.loads(fields[1])
        return records
    if source == 'commalist':
        groups = {}
        for line in read_text_file(path):
            words = line.split(",")
            for word in words:
                # Each key gets its own list so mutating one entry does not
                # affect the others.
                groups[word] = list(words)
        return groups
    if source == 'dict':
        return dict(read_hash_file(path))
    if source == 'json':
        with open(path, 'r', encoding='utf-8') as file:
            return json.load(file)
    if source == 'bloomfilter':
        from pybloom_live import BloomFilter
        # config defaults to None; fall back to an empty mapping so the
        # documented defaults apply instead of raising AttributeError.
        options = config or {}
        bloom = BloomFilter(
            capacity=options.get('capacity', 1000),
            error_rate=options.get('error_rate', 0.01),
        )
        for line in read_text_file(path):
            bloom.add(line)
        return bloom
    raise ValueError(f"Unsupported source type: {source}")
def write_pickle(data: object, filename: str):
    """
    Serialise ``data`` into ``filename`` with pickle.

    The file is opened in binary write mode, replacing any existing
    content, and the highest available pickle protocol is used.
    """
    with open(filename, 'wb') as sink:
        pickle.dump(data, sink, protocol=pickle.HIGHEST_PROTOCOL)
def read_pickle(filename: str) -> object:
    """
    Load and return the object stored in the pickle file ``filename``.

    The file is opened in binary read mode and handed to ``pickle.load``.
    """
    # NOTE(review): unpickling can execute arbitrary code; confirm that
    # callers only pass files from trusted sources.
    with open(filename, 'rb') as stream:
        payload = pickle.load(stream)
    return payload
def write_json(data: object, filename: str):
    """
    Serialise ``data`` as JSON into ``filename``.

    The file is written as UTF-8 with a four-space indent, and non-ASCII
    characters are written as-is rather than escaped.
    """
    with open(filename, 'w', encoding='utf-8') as sink:
        json.dump(data, sink, indent=4, ensure_ascii=False)
def read_json(filename: str) -> object:
    """
    Parse the UTF-8 JSON file ``filename`` and return the decoded value.
    """
    with open(filename, 'r', encoding='utf-8') as stream:
        document = stream.read()
    return json.loads(document)
def write_text_file(filename: str, data: list, mode='w'):
    """
    Write each item of ``data`` to ``filename`` as one UTF-8 line.

    Every item is formatted with an f-string and followed by a newline.
    ``mode`` is passed to ``codecs.open``; the default 'w' overwrites the
    file, while 'a' appends to it.
    """
    with codecs.open(filename, mode, "utf-8") as sink:
        # Build the whole text after the file is opened, then write it in
        # a single call.
        sink.write("".join(f"{entry}\n" for entry in data))
def append_text_file(filename: str, data: list):
    """
    Add the items of ``data`` to the end of ``filename``, one per line.

    Delegates to ``write_text_file`` using append mode.
    """
    write_text_file(filename, data, 'a')
# Regex character class matching any single character that is NOT an ASCII
# letter, one of the listed Turkish or circumflex letters, a digit, or a space.
REG_ALL_NOTCHARS = r"[^a-zşıüğçöâîûA-ZŞİÜĞÇÖÂÎÛ0-9 ]"
def replace_circumflex(text: str) -> str:
    """
    Return ``text`` with circumflex vowels replaced by unaccented letters.

    All other characters are left untouched.
    """
    # NOTE(review): 'î' maps to dotless 'ı' and 'Î' to 'I'; confirm this is
    # intended rather than 'i' / 'İ'.
    mapping = {"Â": "A", "Î": "I", "Û": "U", "â": "a", "î": "ı", "û": "u"}
    result = text
    for accented, plain in mapping.items():
        result = result.replace(accented, plain)
    return result
def to_lower(text: str) -> str:
    """
    Lowercase ``text`` using Turkish casing rules.

    The Turkish-specific capitals are replaced explicitly first (dotless
    'I' becomes 'ı' and dotted 'İ' becomes 'i'), then ``str.lower`` handles
    every remaining character.
    """
    mapping = {"Ş": "ş", "I": "ı", "Ü": "ü", "Ç": "ç", "Ö": "ö", "Ğ": "ğ", "İ": "i", "Â": "â", "Î": "î", "Û": "û"}
    lowered = text
    for upper, lower in mapping.items():
        lowered = lowered.replace(upper, lower)
    return lowered.lower()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment