Last active
December 18, 2023 14:47
-
-
Save itsthejoker/9968cf3dc54086e3ef9198d980c44649 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#################################### | |
# ISO8601 time strings + UTC offsets | |
#################################### | |
from time import strftime | |
from time import gmtime | |
from datetime import datetime | |
@staticmethod
def local_datetime(utc_offset=True):
    """
    Return the current local time as an ISO8601 formatted string.

    When ``utc_offset`` is true (the default) the local UTC offset is
    appended, producing e.g. '2016-11-11T14:56:05.004707-0500'. When it is
    false, the bare naive timestamp is returned.

    The timestamp and the offset are both derived from a single
    ``datetime.now()`` reading, so the offset always describes the instant
    that is printed (including daylight saving time) instead of being looked
    up from an unrelated UTC time struct.
    """
    now = datetime.now()
    stamp = now.isoformat()
    if not utc_offset:
        return stamp
    # astimezone() on a naive datetime interprets it as system-local time and
    # attaches the matching fixed-offset tzinfo; "%z" renders it as "+HHMM".
    return stamp + now.astimezone().strftime("%z")
#################################### | |
# Graceful Interrupt Handler | |
#################################### | |
# https://gist.github.com/itsthejoker/6b497f2098916cefd8a8e2f9a1ff7b5d
#################################### | |
# Auto Column Formatter | |
#################################### | |
# https://github.com/Samrux/Python-Scripts/blob/master/columnize.py
#################################### | |
# Recursively update values in dict | |
#################################### | |
from typing import Any | |
def replace_item( | |
obj: dict | list, key_to_replace: str, replace_value: Any | |
) -> dict | list: | |
if isinstance(obj, list): | |
temp_list = [] | |
for subitem in obj: | |
subitem = replace_item(subitem, key_to_replace, replace_value) | |
temp_list.append(subitem) | |
return temp_list | |
obj = { | |
key: replace_value if key == key_to_replace else value | |
for key, value in obj.items() | |
} | |
for key in obj.keys(): | |
if type(obj[key]) in [dict, list]: | |
obj[key] = replace_item(obj[key], key_to_replace, replace_value) | |
return obj | |
# Example usage. `thing` is sample input so the snippet runs on its own;
# replace_item builds a new structure, so `thing` itself is printed unchanged.
thing = {"Name": "widget", "IsThin": False, "Parts": [{"IsThin": False}]}
thing2 = replace_item(thing, "IsThin", True)
print(thing)
print(thing2)
#################################### | |
# Pure Python End of Month Datemath | |
#################################### | |
# Number of days in each month of a non-leap year, indexed by month number.
# Index 0 is a filler entry so that M_DAYS[1] is January and M_DAYS[12] is
# December.
M_DAYS = [
    0,  # filler; months are 1-indexed
    31,  # January
    28,  # February (non-leap)
    31,  # March
    30,  # April
    31,  # May
    30,  # June
    31,  # July
    31,  # August
    30,  # September
    31,  # October
    30,  # November
    31,  # December
]
def isleap(year):
    """Tell whether ``year`` is a leap year (True) or not (False).

    A year is a leap year when it is divisible by 4, except for century
    years, which must also be divisible by 400.
    """
    if year % 4 != 0:
        return False
    if year % 100 != 0:
        return True
    return year % 400 == 0
def days_in_month(year, month):
    """Give the number of days in ``month`` of ``year``.

    The base count is read from ``M_DAYS``; February gains one extra day
    when ``isleap(year)`` is true.
    """
    base_days = M_DAYS[month]
    if month == 2 and isleap(year):
        return base_days + 1
    return base_days
def is_monthend(ref_date):
    """Report whether ``ref_date`` falls on the last day of its month.

    ``ref_date`` must expose ``year``, ``month`` and ``day`` attributes.
    """
    final_day = days_in_month(ref_date.year, ref_date.month)
    return ref_date.day == final_day
#################### | |
# Spaceship Operator | |
#################### | |
# In Ruby, the spaceship operator `<=>` makes it easy to tell whether one number
# is greater or lesser than the other. For example, `a <=> b` returns -1 if
# a is smaller, 1 if a is larger, and 0 if they are equal.
def spaceship(a: int, b: int) -> int:
    """Three-way comparison of ``a`` and ``b``.

    Returns -1 when ``a`` is smaller than ``b``, 1 when it is larger, and 0
    otherwise.
    """
    if a < b:
        return -1
    return 1 if a > b else 0
#################### | |
# Text Normalization | |
#################### | |
def normalize_quotes(text: str) -> str:
    """Replace typographic quote look-alikes with plain ASCII quotes.

    Characters in the single-quote family become an apostrophe and
    characters in the double-quote family become an ASCII quotation mark.
    Every other character passes through untouched.
    """
    apostrophe_like = (
        "\u0027"  # APOSTROPHE
        "\u0060"  # GRAVE ACCENT
        "\u00B4"  # ACUTE ACCENT
        "\u2018"  # LEFT SINGLE QUOTATION MARK
        "\u2019"  # RIGHT SINGLE QUOTATION MARK
        "\u201A"  # SINGLE LOW-9 QUOTATION MARK
        "\u201B"  # SINGLE HIGH-REVERSED-9 QUOTATION MARK
        "\u2032"  # PRIME
        "\u2035"  # REVERSED PRIME
        "\u2039"  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
        "\u203A"  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
        "\u300C"  # LEFT CORNER BRACKET
        "\u300D"  # RIGHT CORNER BRACKET
        "\u300E"  # LEFT WHITE CORNER BRACKET
        "\u300F"  # RIGHT WHITE CORNER BRACKET
        "\uFF07"  # FULLWIDTH APOSTROPHE
        "\uFF62"  # HALFWIDTH LEFT CORNER BRACKET
        "\uFF63"  # HALFWIDTH RIGHT CORNER BRACKET
    )
    quotation_like = (
        "\u0022"  # QUOTATION MARK
        "\u201C"  # LEFT DOUBLE QUOTATION MARK
        "\u201D"  # RIGHT DOUBLE QUOTATION MARK
        "\u201E"  # DOUBLE LOW-9 QUOTATION MARK
        "\u201F"  # DOUBLE HIGH-REVERSED-9 QUOTATION MARK
        "\u301D"  # REVERSED DOUBLE PRIME QUOTATION MARK
        "\u301F"  # LOW DOUBLE PRIME QUOTATION MARK
        "\u2034"  # TRIPLE PRIME
        "\u2036"  # REVERSED DOUBLE PRIME
        "\u2037"  # REVERSED TRIPLE PRIME
        "\u301E"  # DOUBLE PRIME QUOTATION MARK
        "\uFF02"  # FULLWIDTH QUOTATION MARK
    )
    # The two families share no characters, so one combined table gives the
    # same result as translating for each family in turn.
    lookup = str.maketrans(
        apostrophe_like + quotation_like,
        "'" * len(apostrophe_like) + '"' * len(quotation_like),
    )
    return text.translate(lookup)
def normalize_hyphens(text: str) -> str:
    """Replace hyphen and dash look-alikes with the ASCII hyphen-minus.

    Every character listed below is turned into "-"; all other characters
    are returned unchanged.
    """
    dash_like = (
        "\u002D",  # HYPHEN-MINUS
        "\u007E",  # TILDE
        "\u00AD",  # SOFT HYPHEN
        "\u058A",  # ARMENIAN HYPHEN
        "\u05BE",  # HEBREW PUNCTUATION MAQAF
        "\u1173",  # HANGUL JUNGSEONG EU
        "\u1400",  # CANADIAN SYLLABICS HYPHEN
        "\u1806",  # MONGOLIAN TODO SOFT HYPHEN
        "\u2010",  # HYPHEN
        "\u2011",  # NON-BREAKING HYPHEN
        "\u2012",  # FIGURE DASH
        "\u2013",  # EN DASH
        "\u2014",  # EM DASH
        "\u2015",  # HORIZONTAL BAR
        "\u2043",  # HYPHEN BULLET
        "\u2053",  # SWUNG DASH
        "\u2E17",  # DOUBLE OBLIQUE HYPHEN
        "\u2E1A",  # HYPHEN WITH DIAERESIS
        "\u2E3A",  # TWO-EM DASH
        "\u2E3B",  # THREE-EM DASH
        "\u2E40",  # DOUBLE HYPHEN
        "\u301C",  # WAVE DASH
        "\u30FC",  # KATAKANA-HIRAGANA PROLONGED SOUND MARK
        "\u3030",  # WAVY DASH
        "\u30A0",  # KATAKANA-HIRAGANA DOUBLE HYPHEN
        "\u3161",  # HANGUL LETTER EU
        "\u4E00",  # CJK UNIFIED IDEOGRAPH-4E00
        "\uA4FE",  # LISU PUNCTUATION COMMA
        "\uFE31",  # PRESENTATION FORM FOR VERTICAL EM DASH
        "\uFE32",  # PRESENTATION FORM FOR VERTICAL EN DASH
        "\uFE58",  # SMALL EM DASH
        "\uFE63",  # SMALL HYPHEN-MINUS
        "\uFF0D",  # FULLWIDTH HYPHEN-MINUS
        "\U00010ead",  # YEZIDI HYPHENATION MARK
        "\U00010f55",  # SOGDIAN HYPHENATION MARK
        "\U00010110",  # AEGEAN NUMBER TEN
        "\U00010191",  # ROMAN SEMUNCIA SIGN
        "\U0001104B",  # BRAHMI PUNCTUATION LINE
        "\U00011052",  # BRAHMI NUMBER TEN
        "\U000110BE",  # KAITHI SECTION MARK
    )
    # str.translate accepts a mapping keyed by code point.
    lookup = dict.fromkeys(map(ord, dash_like), "-")
    return text.translate(lookup)
def text_to_ascii( | |
text: str, | |
punctuation_to_keep: Optional[str] = None, | |
keep_first_occurrence_of: str = None, | |
) -> str: | |
""" | |
Reduces text to bare ASCII with optional ability to keep some punctuation. | |
Example: "Eärendil's ship, Vingilótë" -> "Earendils ship Vingilote" | |
To remove international characters while keeping all punctuation, use like | |
this: | |
``` | |
>>> import string, unicodedata | |
>>> my_text = "Eärendil's ship, Vingilótë" | |
>>> text_to_ascii(my_text, punctuation_to_keep=string.punctuation) | |
"Earendil's ship, Vingilote" | |
``` | |
Alternatively, use the helper function `normalize_text` with the second arg | |
of "all". | |
Use the arg `keep_first_occurrence_of` to do what it says; use a string of | |
characters, like "$%" to keep the first occurrence of either of those | |
symbols in the string. | |
""" | |
punctuation = string.punctuation | |
if punctuation_to_keep: | |
punctuation = "".join([x for x in punctuation if x not in punctuation_to_keep]) | |
if keep_first_occurrence_of: | |
punctuation = "".join( | |
[x for x in punctuation if x not in keep_first_occurrence_of] | |
) | |
for char in keep_first_occurrence_of: | |
# change them all to something identifiable, save the first, nuke the rest | |
text = text.replace(char, "|||").replace("|||", char, 1).replace("|||", "") | |
text = text.strip().translate(str.maketrans("", "", punctuation)) | |
return unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode() | |
def normalize_text(
    text: str,
    punctuation_to_keep: str | None = "-.&'_",
    keep_first_occurrence_of: str | None = None,
) -> str:
    """Take unicode text and return ASCII text.

    Quote and hyphen look-alikes are first normalized to their ASCII forms
    (via `normalize_quotes` and `normalize_hyphens`), then the text is
    reduced to ASCII with `text_to_ascii`.

    By default, the following punctuation will be retained in the final
    string:
        - . & ' _
    Other punctuation will be removed. If you wish to keep all punctuation,
    pass the string "all" as the second argument. If you wish to remove all
    punctuation, pass None.

    Args:
        text: The text to normalize.
        punctuation_to_keep: Punctuation characters to retain, the string
            "all" to retain everything, or None to retain nothing.
        keep_first_occurrence_of: Passed through to `text_to_ascii`.

    Returns:
        The normalized ASCII text with surrounding whitespace removed.
    """
    if punctuation_to_keep == "all":
        # Imported locally so this path works without a module-level
        # `import string`.
        import string

        punctuation_to_keep = string.punctuation
    ascii_text = text_to_ascii(
        normalize_hyphens(normalize_quotes(text)),
        punctuation_to_keep=punctuation_to_keep,
        keep_first_occurrence_of=keep_first_occurrence_of,
    )
    # Removing punctuation can leave whitespace at the edges; trim it.
    return ascii_text.strip()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment