Last active
April 2, 2020 17:04
-
-
Save dmpetrov/136dd5df9bcf6de90980cec22355437a to your computer and use it in GitHub Desktop.
DVC storage proposal #1487
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### BASIC SCENARIO ### | |
# Create dataset | |
# Assign dataset name `car-images`, version and version comment (not Git) | |
$ tar zxf images.tgz | |
$ du -sh images/ | |
8.1G images | |
$ dvc dataset add images/ car-images 1.0.0 -m "Import car images" | |
Dataset [email protected] was added | |
# Commit as usual. All datasets info in dvc-file. | |
$ git add images.dvc .gitignore | |
$ git commit -m 'Car images dataset' | |
# List of datasets | |
$ dvc dataset list | |
[email protected] | |
$ dvc dataset list --details # details? | |
[email protected] 8.1G 845295 Car images dataset images.dvc | |
# 845295 is number of files | |
### OUTPUT AS A DATASET ### | |
$ dvc run -d processed_cars/ -d config.yaml -d cnn/model.py -o cnn/model.p \ | |
python cnn/model.py -e 70 -p 0.3 | |
$ dvc dataset assign model.p mymodel 0.1.0 -m 'First working CNN' | |
# Open questions: | |
# - do we need a separate file for model as a dataset (like cnn/mymodel.p.dvc)? | |
# - - how cnn/mymodel.p.dvc and cnn/model.p.dvc will be connected? | |
# - should `dvc run` update\patch the output model version if assigned? Is warning enough? | |
$ git add cnn/model.p.dvc .gitignore | |
$ git commit -m 'First model' | |
$ dvc dataset list --details | |
[email protected] 8.1G 845295 Car images dataset images.dvc | |
[email protected] 218M 1 First working CNN cnn/model.p.dvc | |
$ vi cnn/model.py | |
$ dvc repro cnn/model.p.dvc | |
$ dvc dataset version cnn/mymodel.p.dvc minor -m 'Some fixes' | |
[email protected] | |
$ dvc dataset list --details | |
[email protected] 8.1G 845295 Car images dataset images.dvc | |
[email protected] 217M 1 Some fixes cnn/model.p.dvc | |
### MODIFY DATASET ### | |
$ dvc unprotect images/ | |
$ rm -rf images/ | |
$ cp -r ~/Download/images_new_version images | |
# SYNOPSIS: dvc dataset version [<newversion> | major | minor | patch ] [-m]. See `npm version`. | |
$ dvc dataset version car-images minor -m "Labels update 2019-01-18" | |
Dataset [email protected] was updated to 1.1.0 | |
$ dvc dataset list --details | |
[email protected] 8.3G 851904 Labels update 2019-01-18 images.dvc | |
[email protected] 217M 1 Some fixes cnn/model.p.dvc | |
# Retrain model | |
$ dvc repro | |
Warning: please update output dataset version 'mymodel'. Old version 0.1.0. | |
$ dvc dataset version mymodel minor -m "Retrained with [email protected]" | |
Dataset [email protected] was updated to 0.2.0 | |
$ dvc dataset list --details # Note: number of files in car-images was changed | |
[email protected] 8.3G 851904 Labels update 2019-01-18 images.dvc | |
[email protected] 223M 1 Retrained with [email protected] cnn/model.p.dvc | |
### INFORMATIONAL OPERATIONS ### | |
# Current workspace only | |
$ dvc dataset list | |
[email protected] | |
[email protected] | |
$ dvc dataset list --details | |
[email protected] 8.3G 851904 Labels update from 2019-04-02 images.dvc | |
[email protected] 218M 1 Retrained with [email protected] cnn/model.p.dvc | |
# Version history - dvc should find this from git history | |
# Open question: should we address datasets by name or dvc-files or both? | |
$ dvc dataset hist car-images # hist? | |
1.0.0 8.1G 845295 Import car images | |
1.1.0 8.3G 851904 Labels update 2019-03-18 | |
1.2.0 8.4G 861749 Labels update from 2019-04-02 # <-- new version | |
# Open question: how about versions in parallel branches? Ignore them? | |
$ dvc dataset hist car-images --branch | |
master 1.0.0 8.1G 845295 Import car images | |
master 1.1.0 8.3G 851904 Labels update 2019-03-18 | |
master 1.2.0 8.4G 861749 Labels update from 2019-04-02 | |
try_something 1.1.0 8.3G 851904 labels from 2019-03-18 | |
# Show everything | |
$ dvc dataset hist --all --branch | |
master [email protected] 8.1G 845295 Import car images | |
master [email protected] 8.3G 851904 Labels update 2019-03-18 | |
master [email protected] 8.4G 861749 Labels update from 2019-04-02 | |
try_something [email protected] 8.3G 851904 labels from 2019-03-18 | |
master [email protected] 218M 1 First model | |
master [email protected] 217M 1 Some fixes | |
master [email protected] 223M 1 Retrained with [email protected] | |
try_something [email protected] 219M 1 retrained | |
try_imagenet [email protected] 348M 1 imagenet model | |
try_imagenet [email protected] 147G 14745385 Import imagenet | |
### DIFF ### | |
# diff of the current version (1.2.0) with a previous one (1.1.0) | |
$ dvc dataset diff car-images 1.1.0 # patch version can be ignored "1.1" is enough | |
Size: 8.4G --> 8.3G | |
Files: 861749 --> 851904 | |
New files: 140234 | |
Deleted files: 18 | |
Modified files: 6434 | |
$ dvc dataset diff car-images 1.1 --new-files # or --last instead of version | |
im4325532.jpg | |
im3454534.jpg | |
... | |
# specify both versions | |
$ dvc dataset diff car-images 1.0 1.2 --modified-files | |
im4865885.jpg | |
im8234012.jpg | |
... | |
### CHECKOUT DATASET ### | |
$ dvc dataset checkout [email protected] # dvc checkout is part of this | |
[email protected] was checked out. | |
Warning: 'images/', 'images.dvc' were modified. | |
$ dvc repro cnn/model.p.dvc | |
### DVC REGISTRY ### | |
# One of the requirements is to extract common datasets into a separate repository | |
# and reuse them from different projects. | |
# Some company might keep all the datasets in a single place\project and just reuse them. | |
# Open questions: | |
# - do we need a default dir for a registry like `~/dvc/registry/` and | |
# `/usr/local/dvc/registry`? Or an environment variable DVC_REGISTRY? | |
# - do we need multiple registries (~/dvc/registry and ~/dvc/imagenet) | |
# and how to define? | |
$ cd ~/dvc | |
$ git clone https://github.com/iterative/common registry | |
$ cd registry | |
$ ls | |
imagenet.dvc coco.dvc someotherstuff.dvc | |
$ dvc pull imagenet.dvc | |
$ ls | |
imagenet.dvc imagenet/ coco.dvc someotherstuff.dvc | |
$ cd ~/src/myproject | |
$ dvc dataset list --details | |
[email protected] 8.3G 851904 Labels update from 2019-04-02 images.dvc | |
[email protected] 218M 1 Retrained with [email protected] cnn/model.p.dvc | |
~/dvc/registry/[email protected] 147G 14745385 Import imagenet | |
# Use a repo from registry | |
# By default the `last` version is copied. | |
# If a dataset has many outputs - copy all. | |
$ dvc dataset copy imagenet . | |
Dataset [email protected] was copied | |
Adding 'imagenet/' to '.gitignore'. | |
'imagenet/' is in cache '~/dvc/registry/.dvc/cache'. # <-- not in local cache. | |
Saving information to 'imagenet.dvc'. | |
To track the changes with git run: | |
git add .gitignore imagenet.dvc | |
# A repo might have its own cache (by default). | |
# A dataset dvc-file should point to that cache like "cache_path" and "modul_version": | |
# outs: | |
# - cache: true | |
# md5: ea4dec866e3f4c734e58909ac1b248a3 | |
# path: data/Posts-train.tsv | |
# cache_path: "iterative_datasets/" | |
# modul_version: last | |
# --local-cache can be used to import a dataset into a local cache. | |
$ (cd ~/dvc/registry/ && git pull) | |
$ dvc dataset update imagenet | |
Dataset [email protected] was updated to 1.2.0. | |
Saving information to 'imagenet.dvc'. | |
To track the changes with git run: | |
git add imagenet.dvc | |
# Push model to a registry | |
# Note, data/cache has to be copied (it stays in the same repo by default). | |
$ dvc dataset copy --copy-cache mymodel ~/dvc/registry/ | |
Dataset [email protected] was added to project '~/dvc/registry/' | |
Adding 'model.p' to '.gitignore'. | |
Adding 'model.p' to cache '~/dvc/registry/.dvc/cache'. | |
Saving information to 'model.p.dvc'. | |
To track the changes with git run: | |
git add .gitignore model.p.dvc | |
$ cd ~/dvc/registry/ | |
$ git add .gitignore model.p.dvc | |
$ git commit -m 'My CNN model v1.0' | |
$ git push | |
$ dvc push | |
# A useful command. | |
$ cd ~/src/myproject | |
$ dvc dataset update mymodel ~/dvc/registry/ | |
### USAGE WITH NO GIT ### | |
# It would be great if `dvc pull` can work with no Git. | |
# It is needed for deployment systems where Git might not be available. | |
# We might need a separate command for that. | |
$ wget https://raw.githubusercontent.com/iterative/dvc/r1.1/model.p.dvc | |
$ wget https://raw.githubusercontent.com/iterative/dvc/r1.1/.dvc/config | |
$ dvc pull --config config model.p.dvc | |
$ ls -l model.p | |
-rw-r--r-- 1 dmitry staff 230M May 2 2017 model.p | |
# A special command to get the same result: | |
$ dvc pull --deploy https://github.com/iterative/common/ model.p.dvc |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment