247 lines
6.7 KiB
Go
247 lines
6.7 KiB
Go
|
// Copyright 2023 the Blobloom authors
|
||
|
//
|
||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
// you may not use this file except in compliance with the License.
|
||
|
// You may obtain a copy of the License at
|
||
|
//
|
||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||
|
//
|
||
|
// Unless required by applicable law or agreed to in writing, software
|
||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
// See the License for the specific language governing permissions and
|
||
|
// limitations under the License.
|
||
|
|
||
|
package blobloom
|
||
|
|
||
|
import (
|
||
|
"bytes"
|
||
|
"encoding/binary"
|
||
|
"errors"
|
||
|
"fmt"
|
||
|
"io"
|
||
|
"strings"
|
||
|
"sync/atomic"
|
||
|
)
|
||
|
|
||
|
// maxCommentLen is the maximum length, in bytes, of the comment stored in a
// dump header. The header is 64 bytes long and its first 20 bytes hold the
// magic string, version, block count and hash count, leaving 44 bytes.
const maxCommentLen = 44
|
||
|
|
||
|
// Dump writes f to w, with an optional comment string, in the binary format
|
||
|
// that a Loader accepts. It returns the number of bytes written to w.
|
||
|
//
|
||
|
// The comment may contain arbitrary data, within the limits layed out by the
|
||
|
// format description. It can be used to record the hash function to be used
|
||
|
// with a Filter.
|
||
|
func Dump(w io.Writer, f *Filter, comment string) (int64, error) {
|
||
|
return dump(w, f.b, f.k, comment)
|
||
|
}
|
||
|
|
||
|
// DumpSync is like Dump, but for SyncFilters.
|
||
|
//
|
||
|
// If other goroutines are simultaneously modifying f,
|
||
|
// their modifications may not be reflected in the dump.
|
||
|
// Separate synchronization is required to prevent this.
|
||
|
//
|
||
|
// The format produced is the same as Dump's. The fact that
|
||
|
// the argument is a SyncFilter is not encoded in the dump.
|
||
|
func DumpSync(w io.Writer, f *SyncFilter, comment string) (n int64, err error) {
|
||
|
return dump(w, f.b, f.k, comment)
|
||
|
}
|
||
|
|
||
|
func dump(w io.Writer, b []block, nhashes int, comment string) (n int64, err error) {
|
||
|
switch {
|
||
|
case len(b) == 0 || nhashes == 0:
|
||
|
err = errors.New("blobloom: won't dump uninitialized Filter")
|
||
|
case len(comment) > maxCommentLen:
|
||
|
err = fmt.Errorf("blobloom: comment of length %d too long", len(comment))
|
||
|
case strings.IndexByte(comment, 0) != -1:
|
||
|
err = fmt.Errorf("blobloom: comment %q contains zero byte", len(comment))
|
||
|
}
|
||
|
if err != nil {
|
||
|
return 0, err
|
||
|
}
|
||
|
|
||
|
var buf [64]byte
|
||
|
copy(buf[:8], "blobloom")
|
||
|
// As documented in the comment for Loader, we store one less than the
|
||
|
// number of blocks. This way, we can use the otherwise invalid value 0
|
||
|
// and store 2³² blocks instead of at most 2³²-1.
|
||
|
binary.LittleEndian.PutUint32(buf[12:], uint32(len(b)-1))
|
||
|
binary.LittleEndian.PutUint32(buf[16:], uint32(nhashes))
|
||
|
copy(buf[20:], comment)
|
||
|
|
||
|
k, err := w.Write(buf[:])
|
||
|
n = int64(k)
|
||
|
if err != nil {
|
||
|
return n, err
|
||
|
}
|
||
|
|
||
|
for i := range b {
|
||
|
for j := range b[i] {
|
||
|
x := atomic.LoadUint32(&b[i][j])
|
||
|
binary.LittleEndian.PutUint32(buf[4*j:], x)
|
||
|
}
|
||
|
k, err = w.Write(buf[:])
|
||
|
n += int64(k)
|
||
|
if err != nil {
|
||
|
break
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return n, err
|
||
|
}
|
||
|
|
||
|
// A Loader reads a Filter or SyncFilter from an io.Reader.
//
// The input must be in the binary format produced by Dump, which begins
// with a 64-byte header made up of:
//   - the ASCII string "blobloom";
//   - a four-byte version number, which must be zero;
//   - the number of Bloom filter blocks, minus one, as a 32-bit integer;
//   - the number of hashes, as a 32-bit integer;
//   - a comment of at most 44 non-zero bytes, padded to 44 bytes with zeros.
//
// The header is followed by the 512-bit blocks, each stored as sixteen
// 32-bit limbs. All integers are little-endian.
type Loader struct {
	buf [64]byte  // Scratch space for the header or a single block.
	r   io.Reader // Source of the dump.
	err error     // Not referenced by the code visible in this section.

	Comment string // Comment field. Filled in by NewLoader.
	nblocks uint64 // Number of blocks in the dump (stored value plus one).
	nhashes int    // Number of hashes recorded in the dump.
}
|
||
|
|
||
|
// NewLoader parses the format header from r and returns a Loader
|
||
|
// that can be used to load a Filter from it.
|
||
|
func NewLoader(r io.Reader) (*Loader, error) {
|
||
|
l := &Loader{r: r}
|
||
|
|
||
|
err := l.fillbuf()
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
|
||
|
version := binary.LittleEndian.Uint32(l.buf[8:])
|
||
|
// See comment in dump for the +1.
|
||
|
l.nblocks = 1 + uint64(binary.LittleEndian.Uint32(l.buf[12:]))
|
||
|
l.nhashes = int(binary.LittleEndian.Uint32(l.buf[16:]))
|
||
|
comment := l.buf[20:]
|
||
|
|
||
|
switch {
|
||
|
case string(l.buf[:8]) != "blobloom":
|
||
|
err = errors.New("blobloom: not a Bloom filter dump")
|
||
|
case version != 0:
|
||
|
err = errors.New("blobloom: unsupported dump version")
|
||
|
case l.nhashes == 0:
|
||
|
err = errors.New("blobloom: zero hashes in Bloom filter dump")
|
||
|
}
|
||
|
if err == nil {
|
||
|
comment, err = checkComment(comment)
|
||
|
l.Comment = string(comment)
|
||
|
}
|
||
|
|
||
|
if err != nil {
|
||
|
l = nil
|
||
|
}
|
||
|
return l, err
|
||
|
}
|
||
|
|
||
|
// Load sets f to the union of f and the Loader's filter, then returns f.
|
||
|
// If f is nil, a new Filter of the appropriate size is constructed.
|
||
|
//
|
||
|
// If f is not nil and an error occurs while reading from the Loader,
|
||
|
// f may end up in an inconsistent state.
|
||
|
func (l *Loader) Load(f *Filter) (*Filter, error) {
|
||
|
if f == nil {
|
||
|
nbits := BlockBits * l.nblocks
|
||
|
if nbits > MaxBits {
|
||
|
return nil, fmt.Errorf("blobloom: %d blocks is too large", l.nblocks)
|
||
|
}
|
||
|
f = New(nbits, int(l.nhashes))
|
||
|
} else if err := l.checkBitsAndHashes(len(f.b), f.k); err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
|
||
|
for i := range f.b {
|
||
|
if err := l.fillbuf(); err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
|
||
|
for j := range f.b[i] {
|
||
|
f.b[i][j] |= binary.LittleEndian.Uint32(l.buf[4*j:])
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return f, nil
|
||
|
}
|
||
|
|
||
|
// Load sets f to the union of f and the Loader's filter, then returns f.
|
||
|
// If f is nil, a new SyncFilter of the appropriate size is constructed.
|
||
|
// Else, LoadSync may run concurrently with other modifications to f.
|
||
|
//
|
||
|
// If f is not nil and an error occurs while reading from the Loader,
|
||
|
// f may end up in an inconsistent state.
|
||
|
func (l *Loader) LoadSync(f *SyncFilter) (*SyncFilter, error) {
|
||
|
if f == nil {
|
||
|
nbits := BlockBits * l.nblocks
|
||
|
if nbits > MaxBits {
|
||
|
return nil, fmt.Errorf("blobloom: %d blocks is too large", l.nblocks)
|
||
|
}
|
||
|
f = NewSync(nbits, int(l.nhashes))
|
||
|
} else if err := l.checkBitsAndHashes(len(f.b), f.k); err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
|
||
|
for i := range f.b {
|
||
|
if err := l.fillbuf(); err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
|
||
|
for j := range f.b[i] {
|
||
|
p := &f.b[i][j]
|
||
|
x := binary.LittleEndian.Uint32(l.buf[4*j:])
|
||
|
|
||
|
for {
|
||
|
old := atomic.LoadUint32(p)
|
||
|
if atomic.CompareAndSwapUint32(p, old, old|x) {
|
||
|
break
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return f, nil
|
||
|
}
|
||
|
|
||
|
func (l *Loader) checkBitsAndHashes(nblocks, nhashes int) error {
|
||
|
switch {
|
||
|
case nblocks != int(l.nblocks):
|
||
|
return fmt.Errorf("blobloom: Filter has %d blocks, but dump has %d", nblocks, l.nblocks)
|
||
|
case nhashes != l.nhashes:
|
||
|
return fmt.Errorf("blobloom: Filter has %d hashes, but dump has %d", nhashes, l.nhashes)
|
||
|
}
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
func (l *Loader) fillbuf() error {
|
||
|
_, err := io.ReadFull(l.r, l.buf[:])
|
||
|
if err == io.EOF {
|
||
|
err = io.ErrUnexpectedEOF
|
||
|
}
|
||
|
return err
|
||
|
}
|
||
|
|
||
|
// checkComment validates the zero-padded comment field p from a dump
// header and returns the comment with its padding removed.
//
// The comment ends at the first zero byte, if there is one. Every byte
// after that must also be zero; otherwise checkComment returns an error.
// A field without any zero byte is returned unchanged.
func checkComment(p []byte) ([]byte, error) {
	end := bytes.IndexByte(p, 0)
	if end < 0 {
		// No padding: the comment fills the whole field.
		return p, nil
	}

	for _, c := range p[end+1:] {
		if c != 0 {
			return nil, fmt.Errorf("blobloom: comment block %q contains zero byte", p)
		}
	}
	return p[:end], nil
}
|