Skip to content

Instantly share code, notes, and snippets.

View pszemraj's full-sized avatar

Peter pszemraj

View GitHub Profile
@pszemraj
pszemraj / filter_refusals.py
Created August 29, 2024 23:35
Filter a DataFrame/dataset column for LLM refusals in instruct data
# !pip install -q sentence-splitter
import os
from sentence_splitter import split_text_into_sentences
REFUSAL_TERMS = [
"sorry",
"i can't",
"unfortunately,",
"as a language model",
"as an ai language model",
@pszemraj
pszemraj / extract_comments_and_docs.py
Created August 27, 2024 18:59
extract non-code strings from python files: comments, docstrings, string literals
import re
def extract_comments_and_docs(multiline_string):
# Pattern to match lines where the first non-whitespace character is '#'
comment_pattern = r"^\s*#(.*)"
# Pattern to match any text within triple quotes (either ''' or """)
@pszemraj
pszemraj / load_smolllm_corpus_python.py
Created August 24, 2024 22:55
Load the Python subset of smollm-corpus without AWS credentials
import boto3
import gzip
from datasets import load_dataset
from botocore import UNSIGNED
from botocore.config import Config
num_proc = 32
s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))
bucket_name = "softwareheritage"
import logging
import os
import fire
import torch
from datasets import load_dataset
from huggingface_hub import PyTorchModelHubMixin
from torch import nn
from transformers import AutoConfig, AutoModel, AutoTokenizer
@pszemraj
pszemraj / transcribe_imgdir2text.py
Last active August 12, 2024 22:28
Run OCR on a directory of images with an OpenAI model
import base64
import os
from pathlib import Path
import fire
from openai import OpenAI
from tqdm.auto import tqdm
from joblib import Memory
# Set up joblib caching
@pszemraj
pszemraj / sncl.md
Created August 5, 2024 03:26
Schrödinger's Non-Commercial License (SNCL) v1.0 draft

Schrödinger's Non-Commercial License (SNCL) v1.0

Preamble:

This license is designed to allow users to freely use, modify, and distribute the software for non-commercial purposes. It recognizes the challenges in defining what constitutes commercial activity and offers guidance and flexibility for users who are unsure about the nature of their activities.

1. Grant of License

Subject to the terms and conditions of this License, the Licensor hereby grants to the Licensee a worldwide, royalty-free, non-exclusive license to use, modify, and distribute the Software, provided that such activities are conducted for Non-Commercial Purposes, as defined below.

import streamlit as st
import pandas as pd
from datasets import load_from_disk
import textwrap
import json
# Constants
ROWS_PER_PAGE = 100
LOGO_URL = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/datasets_logo.png"
DOCS_URL = "https://huggingface.co/docs/datasets/index"
@pszemraj
pszemraj / parse_nanoT5_log.py
Last active August 10, 2024 13:56
Process log file from nanoT5
"""
parses the standard main.log from nanoT5 and makes some plots
pip install matplotlib pandas seaborn
"""
import argparse
import logging
import os
import re
from pathlib import Path
@pszemraj
pszemraj / recursive_model_summary.py
Created July 23, 2024 04:19
Print out a summary of a PyTorch model
from typing import List, Tuple, Optional, Set
import torch.nn as nn
from transformers import PreTrainedModel
def model_summary(
model: PreTrainedModel, max_depth: int = 4, show_input_size: bool = False
) -> None:
"""
Prints an accurate summary of the model, avoiding double-counting of parameters.
@pszemraj
pszemraj / is_image_url.py
Created July 13, 2024 22:07
Simple function that uses a regex to check whether a URL string points to an image file
import re
# List of common image file extensions
image_extensions = [
'jpg', 'jpeg', 'jpe', 'jif', 'jfif', 'jfi', # JPEG
'png', # PNG
'gif', # GIF
'webp', # WebP
'tiff', 'tif', # TIFF
'bmp', # BMP