Skip to content

Instantly share code, notes, and snippets.

@sizhky
sizhky / sample_gt.json
Last active November 4, 2024 16:20
sample tables for TEDS evaluation
{
"full-table": {
"html": "<html><body><table> <thead> <tr> <th>S.No</th> <th>Description</th> <th>Qty</th> <th>Unit Price ($)</th> <th>Total ($)</th> </tr> </thead> <tbody> <tr> <td>1</td> <td>Monitor 4k</td> <td>1</td> <td>320</td> <td>320</td> </tr> <tr> <td>2</td> <td>Keyboard</td> <td>1</td> <td>50</td> <td>50</td> </tr> <tr> <td>3</td> <td>LEDs</td> <td>100</td> <td>1</td> <td>100</td> </tr> <tr> <td>4</td> <td>MiniLEDs</td> <td>100</td> <td>1</td> <td>100</td> </tr> </tbody> </table> </body> </html>"
},
"missing-rows": {
"html": "<html><body><table> <thead> <tr> <th>S.No</th> <th>Description</th> <th>Qty</th> <th>Unit Price ($)</th> <th>Total ($)</th> </tr> </thead> <tbody> <tr> <td>1</td> <td>Monitor 4k</td> <td>1</td> <td>320</td> <td>320</td> </tr> <tr> <td>2</td> <td>Keyboard</td> <td>1</td> <td>50</td> <td>50</td> </tr> <tr> <td>3</td> <td>LEDs</td> <td>100</td> <td>1</td> <td>100</td> </tr> <tr> <td>4</td> <td>MiniLEDs</td> <td>100</td> <td>1</td> <td>100</td> </tr> </
@sizhky
sizhky / parse_bruno.py
Created October 14, 2024 05:23
Parse all chunks of bruno
from torch_snippets import json, readlines
import re
def parse_bruno(file):
def process_chunks(chunk):
a, *b = chunk.split('\n')
a, _b = a.split()
b = '\n'.join([_b, *b])
if a == 'body:json':
b = json.loads(b[1:-1])
@sizhky
sizhky / ticktock.py
Created December 9, 2023 19:15
ticktock timer for time profiling
from time import perf_counter
class Ticker:
def __init__(self):
self.tik = perf_counter()
def __call__(self, arg, info=None, newl=False):
tok = perf_counter()
t = round(tok-self.tik, 3)
if newl: print()
@sizhky
sizhky / clean_mem.py
Created December 8, 2023 13:00
clean memory
import traceback, gc
def clean_ipython_hist():
# Code in this function mainly copied from IPython source
if not 'get_ipython' in globals(): return
ip = get_ipython()
user_ns = ip.user_ns
ip.displayhook.flush()
pc = ip.displayhook.prompt_count + 1
for n in range(1, pc): user_ns.pop('_i'+repr(n),None)
@sizhky
sizhky / conda_utils.sh
Last active January 8, 2024 08:48
utilities for conda environments
create_conda_env() {
# Check if the environment name and Python version are provided as arguments
if [ $# -lt 2 ]; then
echo "Please provide the conda environment name and Python version as arguments."
return 1
fi
# Create conda environment with the specified Python version
conda create -n "$1" "$2"
Summary of the 20 examples of using `pd.DataFrame.query()` to filter data:
1. Filter on exact string match: `df.query('name == "Alice"')`
2. Filter on multiple conditions: `df.query('age > 30 and city == "New York"')`
3. Filter on a list of values: `df.query('age in [25, 30, 35]')`
4. Filter on a range of values: `df.query('age.between(25, 35)')`
5. Filter on string contains: `df.query('name.str.contains("an")')`
6. Filter on string starts with: `df.query('name.str.startswith("A")')`
7. Filter on string ends with: `df.query('name.str.endswith("a")')`
8. Filter on regular expression: `df.query('name.str.match("^[Aa].*a$")')`
@sizhky
sizhky / .gitignore
Created November 10, 2022 09:47
Python's gitignore
# Data
*.csv
*.png
*.jpg
*.jpeg
*.pdf
*.json
# Byte-compiled / optimized / DLL files
__pycache__/
We can make this file beautiful and searchable if this error is corrected: It looks like row 6 should actually have 17 columns, instead of 11 in line 5.
id,city,date,player_of_match,venue,neutral_venue,team1,team2,toss_winner,toss_decision,winner,result,result_margin,eliminator,method,umpire1,umpire2
335982,Bangalore,2008-04-18,BB McCullum,M Chinnaswamy Stadium,0,Royal Challengers Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,140,N,NA,Asad Rauf,RE Koertzen
335983,Chandigarh,2008-04-19,MEK Hussey,"Punjab Cricket Association Stadium, Mohali",0,Kings XI Punjab,Chennai Super Kings,Chennai Super Kings,bat,Chennai Super Kings,runs,33,N,NA,MR Benson,SL Shastri
335984,Delhi,2008-04-19,MF Maharoof,Feroz Shah Kotla,0,Delhi Daredevils,Rajasthan Royals,Rajasthan Royals,bat,Delhi Daredevils,wickets,9,N,NA,Aleem Dar,GA Pratapkumar
335985,Mumbai,2008-04-20,MV Boucher,Wankhede Stadium,0,Mumbai Indians,Royal Challengers Bangalore,Mumbai Indians,bat,Royal Challengers Bangalore,wickets,5,N,NA,SJ Davis,DJ Harper
335986,Kolkata,2008-04-20,DJ Hussey,Eden Gardens,0,Kolkata Knight Riders,Deccan Chargers,Deccan Chargers,bat,Kolkata Knigh
@sizhky
sizhky / io.py
Created November 11, 2021 15:16
io
from torch_snippets import *
def io(func):
    """Decorator that logs a function's inputs and output around each call.

    Before the call, the positional and keyword arguments are logged via
    ``Info``; after the call, the return value is logged the same way.
    The wrapped function's return value is passed through unchanged, and
    any exception it raises propagates (the output line is then not logged).

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper with the same call signature as ``func`` that carries
        ``func``'s metadata (``__name__``, ``__doc__``, ``__wrapped__``).
    """
    # Local import keeps this block self-contained; the file's star-import
    # from torch_snippets is not guaranteed to expose functools.
    from functools import wraps

    @wraps(func)  # preserve the name/docstring of the decorated function
    def inner(*args, **kwargs):
        # NOTE(review): depth=1 presumably makes the logger attribute the
        # message to the caller's frame rather than this wrapper — confirm
        # against the torch_snippets logger.
        Info(f'\nargs: {args}\nkwargs: {kwargs}', depth=1)
        o = func(*args, **kwargs)
        Info(f'\noutput: {o}', depth=1)
        return o

    return inner
def add_datepart(df, fldname, drop=True, time=False):
"Helper function that adds columns relevant to a date."
fld = df[fldname]
fld_dtype = fld.dtype
if isinstance(fld_dtype, pd.core.dtypes.dtypes.DatetimeTZDtype):
fld_dtype = np.datetime64
if not np.issubdtype(fld_dtype, np.datetime64):
df[fldname] = fld = pd.to_datetime(fld, infer_datetime_format=True)
targ_pre = re.sub('[Dd]ate$', '', fldname)