Created
March 24, 2022 12:24
-
-
Save klauspost/8f8dbbd9745662464dfac37d00cbd5f6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
//go:generate go run gen.go -out decompress_amd64_avo.s -stubs delme.go -pkg=huff0 | |
import ( | |
"flag" | |
"io/ioutil" | |
"os" | |
"path/filepath" | |
"strconv" | |
_ "github.com/klauspost/compress" | |
. "github.com/mmcloughlin/avo/build" | |
"github.com/mmcloughlin/avo/buildtags" | |
"github.com/mmcloughlin/avo/gotypes" | |
. "github.com/mmcloughlin/avo/operand" | |
"github.com/mmcloughlin/avo/reg" | |
) | |
func main() { | |
flag.Parse() | |
out := flag.Lookup("out") | |
os.Remove(filepath.Join("..", out.Value.String())) | |
stub := flag.Lookup("stubs") | |
if stub.Value.String() != "" { | |
os.Remove(stub.Value.String()) | |
defer os.Remove(stub.Value.String()) | |
} | |
Constraint(buildtags.Not("appengine").ToConstraint()) | |
Constraint(buildtags.Not("noasm").ToConstraint()) | |
Constraint(buildtags.Term("gc").ToConstraint()) | |
Constraint(buildtags.Not("noasm").ToConstraint()) | |
decompress := decompress4x{} | |
decompress.generateProcedure("decompress4x_main_loop_x86") | |
decompress.bmi2 = true | |
decompress.generateProcedure("decompress4x_main_loop_bmi2") | |
Generate() | |
b, err := ioutil.ReadFile(out.Value.String()) | |
if err != nil { | |
panic(err) | |
} | |
const readOnly = 0444 | |
err = ioutil.WriteFile(filepath.Join("..", out.Value.String()), b, readOnly) | |
if err != nil { | |
panic(err) | |
} | |
os.Remove(out.Value.String()) | |
} | |
// decompress4x holds the options for generating the 4-stream decompression
// loop.
type decompress4x struct {
	// bmi2 selects the BMI2 shift instructions (SHLXQ/SHRXQ) when true;
	// otherwise shifts that take their count from CL are emitted.
	bmi2 bool
}
// buffoff is the distance in bytes between the output rows of two
// consecutive streams: decodeTwoValues writes stream id at displacement
// id*buffoff from the buffer base.
// See decompress.go, we're using [4][256]byte table.
const buffoff = 256
func (d decompress4x) generateProcedure(name string) { | |
Package("github.com/klauspost/compress/huff0") | |
TEXT(name, 0, "func(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted, peekBits uint8, buf *byte, tbl *dEntrySingle) uint8") | |
Doc(name+" is an x86 assembler implementation of Decompress4X when tablelog > 8.decodes a sequence", "") | |
Pragma("noescape") | |
out := reg.RAX // Fixed since we need 8H | |
offsetComp, err := ReturnIndex(0).Resolve() | |
if err != nil { | |
panic(err) | |
} | |
offP := offsetComp.Addr | |
{ | |
off := GP8() | |
XORB(off, off) // off = 0 | |
MOVB(off, offP) | |
} | |
exhausted := reg.RBX // Fixed since we need 8H | |
XORQ(exhausted.As64(), exhausted.As64()) // exhausted = false | |
peekBits := GP64() | |
buffer := GP64() | |
table := GP64() | |
Comment("Preload values") | |
{ | |
Load(Param("peekBits"), peekBits) | |
Load(Param("buf"), buffer) | |
Load(Param("tbl"), table) | |
} | |
Comment("Main loop") | |
Label("main_loop") | |
br0 := Dereference(Param("pbr0")) | |
d.decodeTwoValues(0, br0, peekBits, table, buffer, out, exhausted, offP) | |
br1 := Dereference(Param("pbr1")) | |
d.decodeTwoValues(1, br1, peekBits, table, buffer, out, exhausted, offP) | |
br2 := Dereference(Param("pbr2")) | |
d.decodeTwoValues(2, br2, peekBits, table, buffer, out, exhausted, offP) | |
br3 := Dereference(Param("pbr3")) | |
d.decodeTwoValues(3, br3, peekBits, table, buffer, out, exhausted, offP) | |
ADDB(U8(2), offP) // off += 2 | |
TESTB(exhausted.As8H(), exhausted.As8H()) // any br[i].ofs < 4? | |
JNZ(LabelRef("done")) | |
CMPQ(offP, U32(buffoff)) | |
JZ(LabelRef("main_loop")) | |
Label("done") | |
RET() | |
} | |
func (d decompress4x) decodeTwoValues(id int, br gotypes.Component, peekBits, table, buffer reg.GPVirtual, out, exhausted reg.GPPhysical, offP Mem) { | |
Commentf("br%d.fillFast()", id) | |
brOffset := GP64() | |
brBitsRead := GP64() | |
brValue := GP64() | |
Load(br.Field("bitsRead"), brBitsRead) | |
Load(br.Field("off"), brOffset) | |
Load(br.Field("value"), brValue) | |
// We must have at least 2 * max tablelog left | |
CMPQ(brBitsRead, U8(64-22)) | |
JBE(LabelRef("skip_fill" + strconv.Itoa(id))) | |
SUBQ(U8(32), brBitsRead) // b.bitsRead -= 32 | |
SUBQ(U8(4), brOffset) // b.off -= 4 | |
// v := b.in[b.off-4 : b.off] | |
// v = v[:4] | |
// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) | |
tmp := GP64() | |
Load(br.Field("in").Base(), tmp.As64()) | |
Comment("b.value |= uint64(low) << (b.bitsRead & 63)") | |
CX := reg.CL | |
addr := Mem{Base: brOffset, Index: tmp.As64(), Scale: 1} | |
if d.bmi2 { | |
SHLXQ(brBitsRead, addr, tmp.As64()) // tmp = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63) | |
} else { | |
MOVL(addr, tmp.As32()) // tmp = uint32(b.in[b.off:b.off+4]) | |
MOVQ(brBitsRead, CX.As64()) | |
SHLQ(CX, tmp.As64()) | |
} | |
ORQ(tmp.As64(), brValue) | |
Commentf("exhausted = exhausted || (br%d.off < 4)", id) | |
CMPQ(brOffset, U8(4)) | |
SETLT(exhausted.As8L()) | |
ORB(exhausted.As8L(), exhausted.As8H()) | |
Label("skip_fill" + strconv.Itoa(id)) | |
tmp = GP64() | |
Commentf("val0 := br%d.peekTopBits(peekBits)", id) | |
if d.bmi2 { | |
SHRXQ(peekBits, brValue, tmp.As64()) // tmp = (value >> peek_bits) & mask | |
} else { | |
MOVQ(brValue, tmp.As64()) | |
MOVQ(peekBits, CX.As64()) | |
SHRQ(CX, tmp.As64()) // tmp = (value >> peek_bits) & mask | |
} | |
Comment("v0 := table[val0&mask]") | |
tmp8 := reg.RDX | |
MOVW(Mem{Base: table, Index: tmp8.As64(), Scale: 2}, tmp8.As16()) // tmp - v0 | |
Commentf("br%d.advance(uint8(v0.entry)", id) | |
MOVB(tmp8.As8H(), out.As8()) // BL = uint8(v0.entry >> 8) | |
MOVBQZX(tmp8.As8(), CX.As64()) | |
if d.bmi2 { | |
SHLXQ(tmp8.As64(), brValue, brValue) // value <<= n | |
} else { | |
SHLQ(CX, brValue) // value <<= n | |
} | |
ADDQ(CX.As64(), brBitsRead) // bits_read += n | |
Commentf("val1 := br%d.peekTopBits(peekBits)", id) | |
if d.bmi2 { | |
SHRXQ(peekBits, brValue, tmp8.As64()) // tmp = (value >> peek_bits) & mask | |
} else { | |
MOVQ(peekBits, CX.As64()) | |
MOVQ(brValue, tmp8.As64()) | |
SHRQ(CX, tmp8.As64()) // tmp = (value >> peek_bits) & mask | |
} | |
Comment("v1 := table[val0&mask]") | |
MOVW(Mem{Base: table, Index: tmp8.As64(), Scale: 2}, tmp8.As16()) // tmp - v1 | |
Commentf("br%d.advance(uint8(v1.entry))", id) | |
MOVB(tmp8.As8H(), out.As8H()) // BH = uint8(v0.entry >> 8) | |
MOVBQZX(tmp8.As8(), CX.As64()) | |
if d.bmi2 { | |
SHLXQ(tmp8.As64(), brValue, brValue) // value <<= n | |
} else { | |
SHLQ(CX, brValue) // value <<= n | |
} | |
ADDQ(CX.As64(), brBitsRead) // bits_read += n | |
Comment("these two writes get coalesced") | |
Comment("buf[stream][off] = uint8(v0.entry >> 8)") | |
Comment("buf[stream][off+1] = uint8(v1.entry >> 8)") | |
off := GP64() | |
MOVBQZX(offP, off) | |
MOVW(out.As16(), Mem{Base: buffer, Index: off, Scale: 1, Disp: id * buffoff}) | |
Comment("update the bitrader reader structure") | |
Store(brBitsRead.As8(), br.Field("bitsRead")) | |
Store(brValue, br.Field("value")) | |
Store(brOffset, br.Field("value")) | |
} | |
func IfDef(def string) { | |
Commentf("#ifdef %s", def) | |
} | |
func Else() { | |
Comment("#else") | |
} | |
func EndIf() { | |
Comment("#endif") | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment