Created
December 22, 2019 22:31
-
-
Save unixpickle/0eda21a5e11e947e1f7bfb38e2e2c1f8 to your computer and use it in GitHub Desktop.
Extract ImageNet tar in-place
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Command extract_tar can be used to extract a large
// ImageNet tarball on a system that doesn't have enough
// storage for both the tarball and the untarred data.
//
// As it extracts the tarball, it truncates the original
// tar file so that it takes up less and less space.
//
// Based on this earlier gist for processing ImageNet tars:
// https://gist.github.com/unixpickle/7304c78032c9f433e28a87409f4d5aca
package main | |
import ( | |
"io" | |
"io/ioutil" | |
"log" | |
"os" | |
"sort" | |
"strconv" | |
"strings" | |
"github.com/unixpickle/essentials" | |
) | |
func main() { | |
if len(os.Args) != 2 { | |
essentials.Die("Usage: extract_tar <file.tar>") | |
} | |
f, err := os.OpenFile(os.Args[1], os.O_RDWR, 0) | |
essentials.Must(err) | |
defer f.Close() | |
offsetToPath := PathOffsets(f) | |
offsets := make([]FileOffset, 0, len(offsetToPath)) | |
for offset := range offsetToPath { | |
offsets = append(offsets, offset) | |
} | |
sort.Slice(offsets, func(i, j int) bool { | |
return offsets[i].Start > offsets[j].Start | |
}) | |
log.Println("Extracting", len(offsets), "files...") | |
for _, offset := range offsets { | |
path := offsetToPath[offset] | |
log.Println(" -", path) | |
dirname := strings.Split(path, "/")[0] | |
os.Mkdir(dirname, 0755) | |
_, err = f.Seek(offset.Start, io.SeekStart) | |
essentials.Must(err) | |
data := make([]byte, offset.Length) | |
_, err = io.ReadFull(f, data) | |
essentials.Must(err) | |
essentials.Must(ioutil.WriteFile(path, data, 0755)) | |
f.Seek(0, io.SeekStart) | |
essentials.Must(f.Truncate(offset.Start)) | |
} | |
} | |
// FileOffset identifies a contiguous run of bytes inside a file. It is a
// comparable value type, so it can be used directly as a map key.
type FileOffset struct {
	// Start is the byte offset at which the data begins.
	Start int64
	// Length is the number of bytes of data.
	Length int
}
func PathOffsets(r io.Reader) map[FileOffset]string { | |
offsets := map[FileOffset]string{} | |
ReadTar(r, func(wnid string, folderTar io.Reader, offset int64) { | |
if !strings.HasSuffix(wnid, ".tar") { | |
return | |
} | |
wnid = wnid[:len(wnid)-4] | |
log.Println("Processing wnid:", wnid) | |
ReadTar(folderTar, func(imageName string, img io.Reader, subOffset int64) { | |
imgData, err := ioutil.ReadAll(img) | |
if err != nil { | |
log.Println(err) | |
} | |
offset := FileOffset{ | |
Start: subOffset + offset, | |
Length: len(imgData), | |
} | |
offsets[offset] = wnid + "/" + imageName | |
}) | |
}) | |
return offsets | |
} | |
func ReadTar(r io.Reader, cb func(name string, data io.Reader, offset int64)) error { | |
var offset int64 | |
for { | |
name, size, err := ReadTarHeader(r) | |
if err != nil { | |
return err | |
} | |
if name == "" { | |
return nil | |
} | |
limited := io.LimitReader(r, size) | |
cb(name, limited, offset+512) | |
if _, err := io.Copy(ioutil.Discard, limited); err != nil { | |
return err | |
} | |
offset += 512 + size | |
if size%512 != 0 { | |
extra := 512 - (size % 512) | |
if _, err := io.Copy(ioutil.Discard, io.LimitReader(r, extra)); err != nil { | |
return err | |
} | |
offset += extra | |
} | |
} | |
} | |
func ReadTarHeader(r io.Reader) (name string, size int64, err error) { | |
buf := make([]byte, 512) | |
if _, err := io.ReadFull(r, buf); err != nil { | |
return "", 0, err | |
} | |
name = NullTermStr(buf[:100]) | |
size, err = strconv.ParseInt(NullTermStr(buf[124:136]), 8, 64) | |
if err != nil { | |
return "", 0, err | |
} | |
return name, size, nil | |
} | |
// NullTermStr decodes a NUL-terminated string held in a fixed-width field.
// It returns the bytes of data that precede the first zero byte, or all of
// data when it contains no zero byte.
func NullTermStr(data []byte) string {
	end := len(data)
	for i, b := range data {
		if b == 0 {
			end = i
			break
		}
	}
	return string(data[:end])
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment