package zstd

import (
	"bytes"
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"math"
	"sort"

	"github.com/klauspost/compress/huff0"
)

// dict is a parsed zstd dictionary: the shared literal (Huffman) encoder,
// the dictionary-defined FSE sequence decoders, the three initial repeat
// offsets and the raw content used as shared history.
type dict struct {
	id uint32

	litEnc              *huff0.Scratch
	llDec, ofDec, mlDec sequenceDec
	offsets             [3]int
	content             []byte
}

// dictMagic is the 4-byte magic number that prefixes every zstd dictionary.
const dictMagic = "\x37\xa4\x30\xec"

// Maximum dictionary size for the reference implementation (1.5.3) is 2 GiB.
const dictMaxLength = 1 << 31

// ID returns the dictionary id or 0 if d is nil.
func (d *dict) ID() uint32 {
	if d == nil {
		return 0
	}
	return d.id
}

// ContentSize returns the dictionary content size or 0 if d is nil.
func (d *dict) ContentSize() int {
	if d == nil {
		return 0
	}
	return len(d.content)
}

// Content returns the dictionary content.
func (d *dict) Content() []byte {
	if d == nil {
		return nil
	}
	return d.content
}

// Offsets returns the initial offsets.
func (d *dict) Offsets() [3]int {
	if d == nil {
		return [3]int{}
	}
	return d.offsets
}

// LitEncoder returns the literal encoder.
func (d *dict) LitEncoder() *huff0.Scratch {
	if d == nil {
		return nil
	}
	return d.litEnc
}

// Load a dictionary as described in
// https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#dictionary-format
func loadDict(b []byte) (*dict, error) {
	// Check static field size: magic (4) + id (4) + three repeat offsets (3*4).
	if len(b) <= 8+(3*4) {
		return nil, io.ErrUnexpectedEOF
	}
	d := dict{
		llDec: sequenceDec{fse: &fseDecoder{}},
		ofDec: sequenceDec{fse: &fseDecoder{}},
		mlDec: sequenceDec{fse: &fseDecoder{}},
	}
	if string(b[:4]) != dictMagic {
		return nil, ErrMagicMismatch
	}
	d.id = binary.LittleEndian.Uint32(b[4:8])
	if d.id == 0 {
		return nil, errors.New("dictionaries cannot have ID 0")
	}

	// Read literal table
	var err error
	d.litEnc, b, err = huff0.ReadTable(b[8:], nil)
	if err != nil {
		return nil, fmt.Errorf("loading literal table: %w", err)
	}
	d.litEnc.Reuse = huff0.ReusePolicyMust

	br := byteReader{
		b:   b,
		off: 0,
	}

	// readDec reads one FSE table (offsets, match lengths or literal
	// lengths) from the stream into dec.
	readDec := func(i tableIndex, dec *fseDecoder) error {
		if err := dec.readNCount(&br, uint16(maxTableSymbol[i])); err != nil {
			return err
		}
		if br.overread() {
			return io.ErrUnexpectedEOF
		}
		err = dec.transform(symbolTableX[i])
		if err != nil {
			println("Transform table error:", err)
			return err
		}
		if debugDecoder || debugEncoder {
			println("Read table ok", "symbolLen:", dec.symbolLen)
		}
		// Set decoders as predefined so they aren't reused.
		dec.preDefined = true
		return nil
	}

	if err := readDec(tableOffsets, d.ofDec.fse); err != nil {
		return nil, err
	}
	if err := readDec(tableMatchLengths, d.mlDec.fse); err != nil {
		return nil, err
	}
	if err := readDec(tableLiteralLengths, d.llDec.fse); err != nil {
		return nil, err
	}
	if br.remain() < 12 {
		return nil, io.ErrUnexpectedEOF
	}

	// Three little-endian uint32 repeat offsets follow the tables.
	d.offsets[0] = int(br.Uint32())
	br.advance(4)
	d.offsets[1] = int(br.Uint32())
	br.advance(4)
	d.offsets[2] = int(br.Uint32())
	br.advance(4)
	if d.offsets[0] <= 0 || d.offsets[1] <= 0 || d.offsets[2] <= 0 {
		return nil, errors.New("invalid offset in dictionary")
	}

	// Everything remaining is the dictionary content (history).
	d.content = make([]byte, br.remain())
	copy(d.content, br.unread())
	if d.offsets[0] > len(d.content) || d.offsets[1] > len(d.content) || d.offsets[2] > len(d.content) {
		return nil, fmt.Errorf("initial offset bigger than dictionary content size %d, offsets: %v", len(d.content), d.offsets)
	}

	return &d, nil
}

// InspectDictionary loads a zstd dictionary and provides functions to inspect the content.
func InspectDictionary(b []byte) (interface {
	ID() uint32
	ContentSize() int
	Content() []byte
	Offsets() [3]int
	LitEncoder() *huff0.Scratch
}, error) {
	initPredefined()
	d, err := loadDict(b)
	return d, err
}

// BuildDictOptions contains the parameters for BuildDict.
type BuildDictOptions struct {
	// Dictionary ID.
	ID uint32

	// Content to use to create dictionary tables.
	Contents [][]byte

	// History to use for all blocks.
	History []byte

	// Offsets to use.
	Offsets [3]int

	// CompatV155 will make the dictionary compatible with Zstd v1.5.5 and earlier.
	// See https://github.com/facebook/zstd/issues/3724
	CompatV155 bool

	// Use the specified encoder level.
	// The dictionary will be built using the specified encoder level,
	// which will reflect speed and make the dictionary tailored for that level.
	// If not set SpeedBestCompression will be used.
	Level EncoderLevel

	// DebugOut will write stats and other details here if set.
	DebugOut io.Writer
}

// BuildDict builds a zstd dictionary from the provided options:
// the supplied contents are compressed against the history to gather
// literal/sequence statistics, which are turned into the dictionary's
// Huffman and FSE tables and repeat offsets.
func BuildDict(o BuildDictOptions) ([]byte, error) {
	initPredefined()
	hist := o.History
	contents := o.Contents
	debug := o.DebugOut != nil
	// Debug helpers that only emit when DebugOut is set.
	println := func(args ...interface{}) {
		if o.DebugOut != nil {
			fmt.Fprintln(o.DebugOut, args...)
		}
	}
	printf := func(s string, args ...interface{}) {
		if o.DebugOut != nil {
			fmt.Fprintf(o.DebugOut, s, args...)
		}
	}
	print := func(args ...interface{}) {
		if o.DebugOut != nil {
			fmt.Fprint(o.DebugOut, args...)
		}
	}

	if int64(len(hist)) > dictMaxLength {
		return nil, fmt.Errorf("dictionary of size %d > %d", len(hist), int64(dictMaxLength))
	}
	if len(hist) < 8 {
		return nil, fmt.Errorf("dictionary of size %d < %d", len(hist), 8)
	}
	if len(contents) == 0 {
		return nil, errors.New("no content provided")
	}
	d := dict{
		id:      o.ID,
		litEnc:  nil,
		llDec:   sequenceDec{},
		ofDec:   sequenceDec{},
		mlDec:   sequenceDec{},
		offsets: o.Offsets,
		content: hist,
	}
	block := blockEnc{lowMem: false}
	block.init()
	enc := encoder(&bestFastEncoder{fastBase: fastBase{maxMatchOff: int32(maxMatchLen), bufferReset: math.MaxInt32 - int32(maxMatchLen*2), lowMem: false}})
	if o.Level != 0 {
		eOpts := encoderOptions{
			level:      o.Level,
			blockSize:  maxMatchLen,
			windowSize: maxMatchLen,
			dict:       &d,
			lowMem:     false,
		}
		enc = eOpts.encoder()
	} else {
		o.Level = SpeedBestCompression
	}
	var (
		remain [256]int
		ll     [256]int
		ml     [256]int
		of     [256]int
	)
	addValues := func(dst *[256]int, src []byte) {
		for _, v := range src {
			dst[v]++
		}
	}
	addHist := func(dst *[256]int, src *[256]uint32) {
		for i, v := range src {
			dst[i] += int(v)
		}
	}
	seqs := 0
	nUsed := 0
	litTotal := 0
	newOffsets := make(map[uint32]int, 1000)
	for _, b := range contents {
		block.reset(nil)
		if len(b) < 8 {
			continue
		}
		nUsed++
		enc.Reset(&d, true)
		enc.Encode(&block, b)
		addValues(&remain, block.literals)
		litTotal += len(block.literals)
		if len(block.sequences) == 0 {
			continue
		}
		seqs += len(block.sequences)
		block.genCodes()
		addHist(&ll, block.coders.llEnc.Histogram())
		addHist(&ml, block.coders.mlEnc.Histogram())
		addHist(&of, block.coders.ofEnc.Histogram())
		for i, seq := range block.sequences {
			if i > 3 {
				break
			}
			offset := seq.offset
			if offset == 0 {
				continue
			}
			if int(offset) >= len(o.History) {
				continue
			}
			if offset > 3 {
				newOffsets[offset-3]++
			} else {
				newOffsets[uint32(o.Offsets[offset-1])]++
			}
		}
	}
	// Find most used offsets.
	var sortedOffsets []uint32
	for k := range newOffsets {
		sortedOffsets = append(sortedOffsets, k)
	}
	sort.Slice(sortedOffsets, func(i, j int) bool {
		// Sort by usage count, most used first.
		// (The previous version compared the offsets themselves here;
		// map keys are unique, so the tie-break never fired and equal
		// counts sorted nondeterministically.)
		a, b := newOffsets[sortedOffsets[i]], newOffsets[sortedOffsets[j]]
		if a == b {
			// Prefer the longer offset
			return sortedOffsets[i] > sortedOffsets[j]
		}
		return a > b
	})
	if len(sortedOffsets) > 3 {
		if debug {
			print("Offsets:")
			for i, v := range sortedOffsets {
				if i > 20 {
					break
				}
				printf("[%d: %d],", v, newOffsets[v])
			}
			println("")
		}
		sortedOffsets = sortedOffsets[:3]
	}
	for i, v := range sortedOffsets {
		o.Offsets[i] = int(v)
	}
	if debug {
		println("New repeat offsets", o.Offsets)
	}

	if nUsed == 0 || seqs == 0 {
		return nil, fmt.Errorf("%d blocks, %d sequences found", nUsed, seqs)
	}
	if debug {
		println("Sequences:", seqs, "Blocks:", nUsed, "Literals:", litTotal)
	}
	if seqs/nUsed < 512 {
		// Use 512 as minimum.
		nUsed = seqs / 512
		if nUsed == 0 {
			nUsed = 1
		}
	}

	// copyHist scales the collected histogram down by nUsed, ensures the
	// result is not representable as RLE, normalizes it and returns the
	// serialized FSE table.
	copyHist := func(dst *fseEncoder, src *[256]int) ([]byte, error) {
		hist := dst.Histogram()
		var maxSym uint8
		var maxCount int
		var fakeLength int
		for i, v := range src {
			if v > 0 {
				v = v / nUsed
				if v == 0 {
					v = 1
				}
			}
			if v > maxCount {
				maxCount = v
			}
			if v != 0 {
				maxSym = uint8(i)
			}
			fakeLength += v
			hist[i] = uint32(v)
		}

		// Ensure we aren't trying to represent RLE.
		if maxCount == fakeLength {
			for i := range hist {
				if uint8(i) == maxSym {
					fakeLength++
					maxSym++
					hist[i+1] = 1
					if maxSym > 1 {
						break
					}
				}
				if hist[0] == 0 {
					fakeLength++
					hist[i] = 1
					if maxSym > 1 {
						break
					}
				}
			}
		}

		dst.HistogramFinished(maxSym, maxCount)
		dst.reUsed = false
		dst.useRLE = false
		err := dst.normalizeCount(fakeLength)
		if err != nil {
			return nil, err
		}
		if debug {
			println("RAW:", dst.count[:maxSym+1], "NORM:", dst.norm[:maxSym+1], "LEN:", fakeLength)
		}
		return dst.writeCount(nil)
	}
	if debug {
		print("Literal lengths: ")
	}
	llTable, err := copyHist(block.coders.llEnc, &ll)
	if err != nil {
		return nil, err
	}
	if debug {
		print("Match lengths: ")
	}
	mlTable, err := copyHist(block.coders.mlEnc, &ml)
	if err != nil {
		return nil, err
	}
	if debug {
		print("Offsets: ")
	}
	ofTable, err := copyHist(block.coders.ofEnc, &of)
	if err != nil {
		return nil, err
	}

	// Literal table
	avgSize := litTotal
	if avgSize > huff0.BlockSizeMax/2 {
		avgSize = huff0.BlockSizeMax / 2
	}
	huffBuff := make([]byte, 0, avgSize)
	// Target size
	div := litTotal / avgSize
	if div < 1 {
		div = 1
	}
	if debug {
		println("Huffman weights:")
	}
	for i, n := range remain[:] {
		if n > 0 {
			n = n / div
			// Allow all entries to be represented.
			if n == 0 {
				n = 1
			}
			huffBuff = append(huffBuff, bytes.Repeat([]byte{byte(i)}, n)...)
			if debug {
				printf("[%d: %d], ", i, n)
			}
		}
	}
	if o.CompatV155 && remain[255]/div == 0 {
		huffBuff = append(huffBuff, 255)
	}
	scratch := &huff0.Scratch{TableLog: 11}
	for tries := 0; tries < 255; tries++ {
		scratch = &huff0.Scratch{TableLog: 11}
		_, _, err = huff0.Compress1X(huffBuff, scratch)
		if err == nil {
			break
		}
		if debug {
			printf("Try %d: Huffman error: %v\n", tries+1, err)
		}
		huffBuff = huffBuff[:0]
		if tries == 250 {
			if debug {
				println("Huffman: Bailing out with predefined table")
			}
			// Bail out.... Just generate something
			huffBuff = append(huffBuff, bytes.Repeat([]byte{255}, 10000)...)
			for i := 0; i < 128; i++ {
				huffBuff = append(huffBuff, byte(i))
			}
			continue
		}
		if errors.Is(err, huff0.ErrIncompressible) {
			// Try truncating least common.
			for i, n := range remain[:] {
				if n > 0 {
					n = n / (div * (i + 1))
					if n > 0 {
						huffBuff = append(huffBuff, bytes.Repeat([]byte{byte(i)}, n)...)
					}
				}
			}
			if o.CompatV155 && len(huffBuff) > 0 && huffBuff[len(huffBuff)-1] != 255 {
				huffBuff = append(huffBuff, 255)
			}
			if len(huffBuff) == 0 {
				huffBuff = append(huffBuff, 0, 255)
			}
		}
		if errors.Is(err, huff0.ErrUseRLE) {
			for i, n := range remain[:] {
				n = n / (div * (i + 1))
				// Allow all entries to be represented.
				if n == 0 {
					n = 1
				}
				huffBuff = append(huffBuff, bytes.Repeat([]byte{byte(i)}, n)...)
			}
		}
	}

	// Assemble the dictionary: magic, id, literal table, FSE tables
	// (offsets, match lengths, literal lengths), repeat offsets, history.
	var out bytes.Buffer
	out.Write([]byte(dictMagic))
	out.Write(binary.LittleEndian.AppendUint32(nil, o.ID))
	out.Write(scratch.OutTable)
	if debug {
		println("huff table:", len(scratch.OutTable), "bytes")
		println("of table:", len(ofTable), "bytes")
		println("ml table:", len(mlTable), "bytes")
		println("ll table:", len(llTable), "bytes")
	}
	out.Write(ofTable)
	out.Write(mlTable)
	out.Write(llTable)
	out.Write(binary.LittleEndian.AppendUint32(nil, uint32(o.Offsets[0])))
	out.Write(binary.LittleEndian.AppendUint32(nil, uint32(o.Offsets[1])))
	out.Write(binary.LittleEndian.AppendUint32(nil, uint32(o.Offsets[2])))
	out.Write(hist)
	if debug {
		// Round-trip the result and report compression stats.
		_, err := loadDict(out.Bytes())
		if err != nil {
			panic(err)
		}
		i, err := InspectDictionary(out.Bytes())
		if err != nil {
			panic(err)
		}
		println("ID:", i.ID())
		println("Content size:", i.ContentSize())
		println("Encoder:", i.LitEncoder() != nil)
		println("Offsets:", i.Offsets())
		var totalSize int
		for _, b := range contents {
			totalSize += len(b)
		}

		encWith := func(opts ...EOption) int {
			enc, err := NewWriter(nil, opts...)
			if err != nil {
				panic(err)
			}
			defer enc.Close()
			var dst []byte
			var totalSize int
			for _, b := range contents {
				dst = enc.EncodeAll(b, dst[:0])
				totalSize += len(dst)
			}
			return totalSize
		}
		plain := encWith(WithEncoderLevel(o.Level))
		withDict := encWith(WithEncoderLevel(o.Level), WithEncoderDict(out.Bytes()))
		println("Input size:", totalSize)
		println("Plain Compressed:", plain)
		println("Dict Compressed:", withDict)
		println("Saved:", plain-withDict, (plain-withDict)/len(contents), "bytes per input (rounded down)")
	}
	return out.Bytes(), nil
}