Skip to content

Instantly share code, notes, and snippets.

@giuseppe
Last active February 20, 2024 12:16
Show Gist options
  • Save giuseppe/c6aed51d38d6bb53c40a6559fa988bb4 to your computer and use it in GitHub Desktop.
Save giuseppe/c6aed51d38d6bb53c40a6559fa988bb4 to your computer and use it in GitHub Desktop.
PoC: create a sparse file
package main
import (
"bytes"
"errors"
"io"
"os"
)
// state identifies which kind of byte run SparseWriter.Write is currently
// scanning.
type state int

// zerosThreshold is the minimum length a zero run must reach, when it lies
// entirely inside one Write buffer and continues no earlier run, before it is
// turned into a hole instead of being written as ordinary data.
const zerosThreshold = 1024

// The scanner states live in their own const block so that iota starts at 0
// and is not shifted by unrelated constants. They are typed so that they can
// only be used where a state is expected, and the zero value of state is the
// valid initial state, stateData.
const (
	// stateData means the scanner is inside literal (non-zero) data.
	stateData state = iota
	// stateZeros means the scanner is inside a run of zero bytes.
	stateZeros
)
// SparseWriterUnderlyingFile is the set of operations SparseWriter needs from
// its destination: writing bytes, moving the file offset, and closing.
type SparseWriterUnderlyingFile interface {
	// Write writes p at the current offset and reports how many bytes were
	// written.
	Write(p []byte) (int, error)
	// Seek moves the offset as described by whence and returns the new offset.
	Seek(offset int64, whence int) (int64, error)
	// Close releases the destination.
	Close() error
}
type SparseWriter struct {
state state
file SparseWriterUnderlyingFile
zeros int64
lastIsZero bool
}
// NewSparseWriterToFile returns a SparseWriter that writes to file. The
// writer starts in the data state with no pending zeros.
func NewSparseWriterToFile(file SparseWriterUnderlyingFile) *SparseWriter {
	sw := new(SparseWriter)
	sw.file = file
	sw.state = stateData
	return sw
}
// NewSparseWriter creates (or truncates) the file at filePath, opens it
// write-only with mode 0666, and returns a SparseWriter writing to it. The
// open error is returned unchanged if the file cannot be opened.
func NewSparseWriter(filePath string) (*SparseWriter, error) {
	const flags = os.O_CREATE | os.O_WRONLY | os.O_TRUNC

	f, err := os.OpenFile(filePath, flags, 0666)
	if err != nil {
		return nil, err
	}
	return NewSparseWriterToFile(f), nil
}
// createHole skips the pending zero bytes by seeking forward that many bytes
// from the current offset. The pending count is cleared and lastIsZero is set
// before the seek is attempted. With no pending zeros it does nothing.
func (sw *SparseWriter) createHole() error {
	if sw.zeros == 0 {
		return nil
	}
	skip := sw.zeros
	sw.zeros, sw.lastIsZero = 0, true

	_, err := sw.file.Seek(skip, io.SeekCurrent)
	return err
}
// findFirstNotZero returns the index of the first non-zero byte in b, or -1
// when b is empty or contains only zero bytes.
func findFirstNotZero(b []byte) int {
	for i := 0; i < len(b); i++ {
		if b[i] == 0 {
			continue
		}
		return i
	}
	return -1
}
// Write writes data to the file, creating holes for long sequences of zeros.
//
// Zero bytes are not written: they are counted and later skipped with a
// forward seek. A zero run shorter than zerosThreshold that lies entirely
// inside data and does not continue a run from an earlier call is written as
// ordinary data instead. A run that reaches the end of data is only counted;
// it is skipped by a later Write or by Close.
//
// On success Write returns len(data). On failure it returns the number of
// leading bytes of data that were handed to the file before the error, so the
// count always satisfies 0 <= n <= len(data) as io.Writer requires.
func (sw *SparseWriter) Write(data []byte) (int, error) {
	// data[:flushed] has already been handed to the file (written or skipped);
	// data[flushed:cursor] is literal data that was scanned but not yet written.
	flushed, cursor := 0, 0

	for cursor < len(data) {
		switch sw.state {
		case stateData:
			nextZero := bytes.IndexByte(data[cursor:], 0)
			if nextZero < 0 {
				// No zero left in this buffer: the whole tail is literal data.
				n, err := sw.file.Write(data[flushed:])
				sw.lastIsZero = false
				if err != nil {
					return flushed + n, err
				}
				return len(data), nil
			}
			cursor += nextZero
			sw.state = stateZeros

		case stateZeros:
			runLen := findFirstNotZero(data[cursor:])
			if runLen < 0 {
				// The buffer ends inside a zero run: flush the data before it
				// and remember the zeros for the next Write or Close.
				if n, err := sw.writePending(data[flushed:cursor]); err != nil {
					return flushed + n, err
				}
				sw.zeros += int64(len(data) - cursor)
				return len(data), nil
			}

			// Do not bother with too short sequences: leave them in the
			// pending data so they are written like any other byte.
			if sw.zeros == 0 && runLen < zerosThreshold {
				sw.state = stateData
				cursor += runLen
				continue
			}

			if n, err := sw.writePending(data[flushed:cursor]); err != nil {
				return flushed + n, err
			}
			sw.zeros += int64(runLen)
			if err := sw.createHole(); err != nil {
				// Everything before the zero run was written; the run was not
				// skipped.
				return cursor, err
			}
			cursor += runLen
			flushed = cursor
			// The state stays stateZeros; the next iteration sees a zero-length
			// run with nothing pending and switches back to stateData.

		default:
			// An unknown state would otherwise loop forever without advancing.
			sw.state = stateData
		}
	}
	return len(data), nil
}

// writePending writes p to the underlying file when p is non-empty, and marks
// the last file operation as a data write on success. It returns the count
// reported by the underlying Write.
func (sw *SparseWriter) writePending(p []byte) (int, error) {
	if len(p) == 0 {
		return 0, nil
	}
	n, err := sw.file.Write(p)
	if err != nil {
		return n, err
	}
	sw.lastIsZero = false
	return n, nil
}
// Close closes the SparseWriter's underlying file.
//
// Before closing it skips any zeros still pending. If the last operation on
// the file was such a skip, it also writes a single zero byte at the final
// position so that the file's contents reach that position.
//
// The underlying file is always closed and released, even when flushing
// fails; in that case the flush error is returned. Calling Close again
// returns an error.
func (sw *SparseWriter) Close() error {
	if sw.file == nil {
		return errors.New("file is already closed")
	}

	flushErr := sw.finishTrailingHole()
	closeErr := sw.file.Close()
	sw.file = nil

	if flushErr != nil {
		return flushErr
	}
	return closeErr
}

// finishTrailingHole skips the pending zeros and, when the file currently ends
// with a skipped region, steps back one byte and writes a zero there.
func (sw *SparseWriter) finishTrailingHole() error {
	if err := sw.createHole(); err != nil {
		return err
	}
	if !sw.lastIsZero {
		return nil
	}
	// NOTE(review): presumably needed because seeking past the end does not by
	// itself extend a regular file — confirm for other implementations.
	if _, err := sw.file.Seek(-1, io.SeekCurrent); err != nil {
		return err
	}
	_, err := sw.file.Write([]byte{0})
	return err
}
// main copies standard input to the file "output.sparse" through a
// SparseWriter.
//
// Close is called explicitly rather than deferred so that its error is
// checked: Close is what skips zeros left pending by the last Write, so a
// failure there means the output is incomplete.
func main() {
	writer, err := NewSparseWriter("output.sparse")
	if err != nil {
		panic(err)
	}

	buf := make([]byte, 1024*1024)
	_, copyErr := io.CopyBuffer(writer, os.Stdin, buf)
	// Close on both paths so the file is released even when the copy failed.
	closeErr := writer.Close()

	if copyErr != nil {
		panic(copyErr)
	}
	if closeErr != nil {
		panic(closeErr)
	}
}
package main
import (
"bytes"
"errors"
"io"
"testing"
)
// MemorySparseFile is an in-memory stand-in for a seekable file, used to test
// SparseWriter. Seeking past the end is allowed; the gap is filled with zero
// bytes by the next Write.
type MemorySparseFile struct {
	// buffer holds the file contents. It is only written to, never read
	// from, so buffer.Len() is the file size.
	buffer bytes.Buffer
	// pos is the current file offset.
	pos int64
}

// Seek sets the offset for the next Write, interpreting offset relative to
// the start, the current offset, or the end of the contents according to
// whence. It returns the new offset. An unknown whence or a resulting
// negative offset is an error and leaves the offset unchanged.
func (m *MemorySparseFile) Seek(offset int64, whence int) (int64, error) {
	var base int64
	switch whence {
	case io.SeekStart:
		base = 0
	case io.SeekCurrent:
		base = m.pos
	case io.SeekEnd:
		base = int64(m.buffer.Len())
	default:
		return 0, errors.New("unsupported seek whence")
	}

	newPos := base + offset
	if newPos < 0 {
		return 0, errors.New("negative position is not allowed")
	}
	m.pos = newPos
	return newPos, nil
}

// Write stores b at the current offset and advances the offset by the number
// of bytes stored. If the offset is past the end, the gap is zero-filled
// first. If the offset is inside the existing contents, those bytes are
// overwritten and anything extending past the end is appended.
func (m *MemorySparseFile) Write(b []byte) (n int, err error) {
	if gap := m.pos - int64(m.buffer.Len()); gap > 0 {
		if _, err := m.buffer.Write(make([]byte, gap)); err != nil {
			return 0, err
		}
	}

	// Bytes aliases the buffer's storage, so copying into it overwrites the
	// contents in place. After padding, m.pos is at most len(contents).
	contents := m.buffer.Bytes()
	n = copy(contents[m.pos:], b)

	if n < len(b) {
		var appended int
		appended, err = m.buffer.Write(b[n:])
		n += appended
	}
	m.pos += int64(n)
	return n, err
}

// Close does nothing and always returns nil.
func (m *MemorySparseFile) Close() error {
	return nil
}
// testInputWithWriteLen feeds input to a SparseWriter backed by a
// MemorySparseFile in consecutive chunks of at most chunkSize bytes, closes
// the writer, and fails the test unless the file contents equal input.
func testInputWithWriteLen(t *testing.T, input []byte, chunkSize int) {
	m := &MemorySparseFile{}
	sparseWriter := NewSparseWriterToFile(m)

	for offset := 0; offset < len(input); offset += chunkSize {
		chunk := input[offset:]
		if len(chunk) > chunkSize {
			chunk = chunk[:chunkSize]
		}
		if _, err := sparseWriter.Write(chunk); err != nil {
			t.Fatalf("Expected no error, got %v", err)
		}
	}

	if err := sparseWriter.Close(); err != nil {
		t.Fatalf("Expected no error, got %v", err)
	}

	if got := m.buffer.Bytes(); !bytes.Equal(input, got) {
		t.Fatalf("Incorrect output")
	}
}
// testInput runs testInputWithWriteLen on inputBytes with chunk size 1 and
// then with every larger power of two that does not exceed len(inputBytes).
func testInput(t *testing.T, inputBytes []byte) {
	// Chunk size 1 is always exercised, even for an empty input.
	testInputWithWriteLen(t, inputBytes, 1)
	for chunk := 2; chunk <= len(inputBytes); chunk <<= 1 {
		testInputWithWriteLen(t, inputBytes, chunk)
	}
}
// TestSparseWriter runs testInput on a small input, a zero-prefixed input, an
// empty input, and three 1 MiB all-zero buffers carrying "hello" at the
// beginning, at the end, and in the middle.
func TestSparseWriter(t *testing.T) {
	const size = 1024 * 1024
	hello := []byte("hello")

	// zerosWithHelloAt returns a size-byte zero buffer with "hello" copied in
	// at offset.
	zerosWithHelloAt := func(offset int) []byte {
		buf := make([]byte, size)
		copy(buf[offset:], hello)
		return buf
	}

	inputs := [][]byte{
		hello,
		append(make([]byte, 100), hello...),
		[]byte(""),
		zerosWithHelloAt(0),               // "hello" at the beginning
		zerosWithHelloAt(size - len(hello)), // "hello" at the end
		zerosWithHelloAt(size / 2),        // "hello" in the middle
	}
	for _, input := range inputs {
		testInput(t, input)
	}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment