ls -lah
In vim: Esc, then :set number to show line numbers
git checkout <branch_name> -- <paths>
git ls-files --deleted -z | xargs -0 git rm
git pull --no-ff
rename a branch [https://gist.github.com/lttlrck/9628955]
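A minimal sketch of the usual rename flow (branch names are placeholders; the linked gist covers the same steps):
git branch -m <old_name> <new_name>        # rename the local branch
git push origin -u <new_name>              # push the renamed branch and set its upstream
git push origin --delete <old_name>        # delete the old branch on the remote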
check out files from another repo and maintain git history [https://blog.billyc.io/how-to-copy-one-or-more-files-from-one-git-repo-to-another-and-keep-the-git-history/]
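One common approach, sketched here with placeholder paths (the linked post may use a different method): export the history of the wanted paths as email-formatted patches in the source repo, then replay them in the destination repo.
# in the source repo
git log --pretty=email --patch-with-stat --reverse --full-index --binary -- <path/to/file> > /tmp/file_history.patch
# in the destination repo
git am < /tmp/file_history.patch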
s5cmd du --humanize '<fullpath>/*'
from fastparquet import ParquetFile
pf = ParquetFile('file.parquet')
print(pf.schema)
poetry shell
# if that doesn't work, you can do:
poetry env list --full-path
source {full path}/bin/activate
or as a one-liner: source "$( poetry env list --full-path )/bin/activate"
jq --color-output . <filename>
mkdir -p my_env && tar -xzf my_env.tar.gz -C my_env  # -C requires the target directory to exist
zip -r <targetfilename.zip> <folder> (recursive) or zip <targetfilename.zip> <space separated list of filenames>
unzip <targetfilename.zip> -d <target directory>
import zipfile
with zipfile.ZipFile(<zipfilename.zip>, 'r') as f:
    f.extractall("<target directory name>")
import shutil
shutil.make_archive(<target name>, 'zip', <source directory name>)
aws s3 cp s3://data/ . --recursive --exclude "*" --include "myfilterhere"
docker rm $(docker ps --filter status=exited -q)
docker image prune -a
cat /etc/hostname
docker cp <local path> <hostname>:<container path>   # copy a file into the container (hostname/ID from above)
docker cp <hostname>:<container path> <local path>   # copy a file out of the container
import logging
logging.basicConfig(format="%(asctime)s %(levelname)s:%(name)s:%(message)s", filename=".log", level=logging.INFO)
logger = logging.getLogger(__name__)  # basicConfig() returns None, so get a logger separately
python -m venv /path/to/new/virtual/environment
source <path>/bin/activate
from pathlib import Path
pathobj = Path(f"{foldername}/{filename}")
pathobj.touch(exist_ok=True)
import itertools
flatten1 = itertools.chain.from_iterable
list(flatten1(<your nested data>))  # from_iterable returns an iterator, so wrap in list() if you need one
import json
with open('json_list.json', 'w') as f:
    for line in dict_list:
        f.write(json.dumps(line))
        f.write('\n')
with open('json_list.json', 'r') as f:
    json_objects = [json.loads(line) for line in f]
python -c "import sys;
sys.path = sys.path[1:];
import django;
print(django.__path__)"
with open(sys.argv[1], 'r') as f:
import operator
sorted_x = sorted(x.items(), key=operator.itemgetter(1))
python -m pip install -e <local absolute path>
os.environ['API_USER'] = 'username'
os.environ['API_PASSWORD'] = 'secret'
USER = os.getenv('API_USER')
PASSWORD = os.environ.get('API_PASSWORD')
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 800)
df.isna().sum()
data.groupby(data['date'].map(lambda x: x.year))
print("%.4g"%(long number))
df.sample(1000, replace=False)
ln -s /path_to_script/myscript /usr/bin/myscript
Or, assuming the script is in a folder, e.g. ~/bin, add that folder to PATH in your .bashrc: export PATH=$PATH:~/bin
cat -n rawfile > file_with_numbered_lines
tar -zxvf <filename>
DESCRIPTION: -z: uncompress the archive with gzip. -x: extract files from the archive. -v: produce verbose output. -f: read the archive from the specified file.
Ctrl-Z to suspend, then bg to keep it running in the background; to bring it back, fg %<jobnumber> (list job numbers with jobs)
ls | wc -l
from IPython.display import display, HTML
display(HTML("<style>:root { --jp-notebook-max-width: 100% !important; }</style>"))
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 1200)
import warnings; warnings.filterwarnings('ignore')
%load_ext autoreload
%autoreload 1
%aimport <module name>
(one magic per line; they can't be chained with semicolons)
- Convert to datetime
- df.set_index(datetime_col)
- unstack() to get 1 column per feature, and fillna(0) (full sketch after the plotting loop below)
for col in plottable.columns:
    plt.plot(plottable.index, plottable[col], label=col)
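A minimal end-to-end sketch of the steps above, assuming a long-format DataFrame df with hypothetical 'date', 'feature', and 'value' columns:
import pandas as pd
import matplotlib.pyplot as plt

df['date'] = pd.to_datetime(df['date'])   # convert to datetime
df = df.set_index('date')                 # datetime index
plottable = (df.groupby([df.index, 'feature'])['value']
               .sum()
               .unstack()                 # 1 column per feature
               .fillna(0))
for col in plottable.columns:
    plt.plot(plottable.index, plottable[col], label=col)
plt.legend()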
fig = plt.gcf(); fig.savefig('test2png.pdf', dpi=300)
axes = plt.gca(); axes.set_xticklabels(listofstr, rotation=90, fontsize=22)
plt.rcParams["figure.figsize"] = (20,3)
or
fig = plt.gcf(); fig.set_size_inches(18.5, 10.5, forward=True)
index.values.astype('M8[D]')
l = plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
sklearn.model_selection.train_test_split(*arrays, test_size=None, train_size=None, random_state=None, shuffle=True, stratify=None)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=30, train_size=70, random_state=42, shuffle=True)
sklearn.preprocessing.StandardScaler(*, copy=True, with_mean=True, with_std=True)
scaler = StandardScaler()
scaler.fit(data)
print(scaler.mean_)  # e.g. [0.5 0.5]
scaled = scaler.transform(data)