202 lines
5.5 KiB
Go
202 lines
5.5 KiB
Go
|
// Copyright 2020 the Blobloom authors
|
||
|
//
|
||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
// you may not use this file except in compliance with the License.
|
||
|
// You may obtain a copy of the License at
|
||
|
//
|
||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||
|
//
|
||
|
// Unless required by applicable law or agreed to in writing, software
|
||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
// See the License for the specific language governing permissions and
|
||
|
// limitations under the License.
|
||
|
|
||
|
package blobloom
|
||
|
|
||
|
import "math"
|
||
|
|
||
|
// A Config holds parameters for Optimize or NewOptimized.
|
||
|
type Config struct {
|
||
|
// Trigger the "contains filtered or unexported fields" message for
|
||
|
// forward compatibility and force the caller to use named fields.
|
||
|
_ struct{}
|
||
|
|
||
|
// Capacity is the expected number of distinct keys to be added.
|
||
|
// More keys can always be added, but the false positive rate can be
|
||
|
// expected to drop below FPRate if their number exceeds the Capacity.
|
||
|
Capacity uint64
|
||
|
|
||
|
// Desired lower bound on the false positive rate when the Bloom filter
|
||
|
// has been filled to its capacity. FPRate must be between zero
|
||
|
// (exclusive) and one (inclusive).
|
||
|
FPRate float64
|
||
|
|
||
|
// Maximum size of the Bloom filter in bits. Zero means the global
|
||
|
// MaxBits constant. A value less than BlockBits means BlockBits.
|
||
|
MaxBits uint64
|
||
|
}
|
||
|
|
||
|
// NewOptimized is shorthand for New(Optimize(config)).
|
||
|
func NewOptimized(config Config) *Filter {
|
||
|
return New(Optimize(config))
|
||
|
}
|
||
|
|
||
|
// NewSyncOptimized is shorthand for New(Optimize(config)).
|
||
|
func NewSyncOptimized(config Config) *SyncFilter {
|
||
|
return NewSync(Optimize(config))
|
||
|
}
|
||
|
|
||
|
// Optimize returns numbers of keys and hash functions that achieve the
|
||
|
// desired false positive described by config.
|
||
|
//
|
||
|
// Optimize panics when config.FPRate is invalid.
|
||
|
//
|
||
|
// The estimated number of bits is imprecise for false positives rates below
|
||
|
// ca. 1e-15.
|
||
|
func Optimize(config Config) (nbits uint64, nhashes int) {
|
||
|
n := float64(config.Capacity)
|
||
|
p := config.FPRate
|
||
|
|
||
|
if p <= 0 || p > 1 {
|
||
|
panic("false positive rate for a Bloom filter must be > 0, <= 1")
|
||
|
}
|
||
|
if n == 0 {
|
||
|
// Assume the client wants to add at least one key; log2(0) = -inf.
|
||
|
n = 1
|
||
|
}
|
||
|
|
||
|
// The optimal nbits/n is c = -log2(p) / ln(2) for a vanilla Bloom filter.
|
||
|
c := math.Ceil(-math.Log2(p) / math.Ln2)
|
||
|
if c < float64(len(correctC)) {
|
||
|
c = float64(correctC[int(c)])
|
||
|
} else {
|
||
|
// We can't achieve the desired FPR. Just triple the number of bits.
|
||
|
c *= 3
|
||
|
}
|
||
|
nbits = uint64(c * n)
|
||
|
|
||
|
// Round up to a multiple of BlockBits.
|
||
|
if nbits%BlockBits != 0 {
|
||
|
nbits += BlockBits - nbits%BlockBits
|
||
|
}
|
||
|
|
||
|
var maxbits uint64 = MaxBits
|
||
|
if config.MaxBits != 0 && config.MaxBits < maxbits {
|
||
|
maxbits = config.MaxBits
|
||
|
if maxbits < BlockBits {
|
||
|
maxbits = BlockBits
|
||
|
}
|
||
|
}
|
||
|
if nbits > maxbits {
|
||
|
nbits = maxbits
|
||
|
// Round down to a multiple of BlockBits.
|
||
|
nbits -= nbits % BlockBits
|
||
|
}
|
||
|
|
||
|
// The corresponding optimal number of hash functions is k = c * log(2).
|
||
|
// Try rounding up and down to see which rounding is better.
|
||
|
c = float64(nbits) / n
|
||
|
k := c * math.Ln2
|
||
|
if k < 1 {
|
||
|
nhashes = 1
|
||
|
return nbits, nhashes
|
||
|
}
|
||
|
|
||
|
ceilK, floorK := math.Floor(k), math.Ceil(k)
|
||
|
if ceilK == floorK {
|
||
|
return nbits, int(ceilK)
|
||
|
}
|
||
|
|
||
|
fprCeil, _ := fpRate(c, math.Ceil(k))
|
||
|
fprFloor, _ := fpRate(c, math.Floor(k))
|
||
|
if fprFloor < fprCeil {
|
||
|
k = floorK
|
||
|
} else {
|
||
|
k = ceilK
|
||
|
}
|
||
|
|
||
|
return nbits, int(k)
|
||
|
}
|
||
|
|
||
|
// correctC maps c = m/n for a vanilla Bloom filter to the c' for a
|
||
|
// blocked Bloom filter.
|
||
|
//
|
||
|
// This is Putze et al.'s Table I, extended down to zero.
|
||
|
// For c > 34, the values become huge and are hard to compute.
|
||
|
var correctC = []byte{
|
||
|
1, 1, 2, 4, 5,
|
||
|
6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 20, 21, 23,
|
||
|
25, 26, 28, 30, 32, 35, 38, 40, 44, 48, 51, 58, 64, 74, 90,
|
||
|
}
|
||
|
|
||
|
// FPRate computes an estimate of the false positive rate of a Bloom filter
|
||
|
// after nkeys distinct keys have been added.
|
||
|
func FPRate(nkeys, nbits uint64, nhashes int) float64 {
|
||
|
if nkeys == 0 {
|
||
|
return 0
|
||
|
}
|
||
|
p, _ := fpRate(float64(nbits)/float64(nkeys), float64(nhashes))
|
||
|
return p
|
||
|
}
|
||
|
|
||
|
func fpRate(c, k float64) (p float64, iter int) {
|
||
|
switch {
|
||
|
case c == 0:
|
||
|
panic("0 bits per key is too few")
|
||
|
case k == 0:
|
||
|
panic("0 hashes is too few")
|
||
|
}
|
||
|
|
||
|
// Putze et al.'s Equation (3).
|
||
|
//
|
||
|
// The Poisson distribution has a single spike around its mean
|
||
|
// BlockBits/c that gets slimmer and further away from zero as c tends
|
||
|
// to zero (the Bloom filter gets more filled). We start at the mean,
|
||
|
// then add terms left and right of it until their relative contribution
|
||
|
// drops below ε.
|
||
|
const ε = 1e-9
|
||
|
mean := BlockBits / c
|
||
|
|
||
|
// Ceil to make sure we start at one, not zero.
|
||
|
i := math.Ceil(mean)
|
||
|
p = math.Exp(logPoisson(mean, i) + logFprBlock(BlockBits/i, k))
|
||
|
|
||
|
for j := i - 1; j > 0; j-- {
|
||
|
add := math.Exp(logPoisson(mean, j) + logFprBlock(BlockBits/j, k))
|
||
|
p += add
|
||
|
iter++
|
||
|
if add/p < ε {
|
||
|
break
|
||
|
}
|
||
|
}
|
||
|
|
||
|
for j := i + 1; ; j++ {
|
||
|
add := math.Exp(logPoisson(mean, j) + logFprBlock(BlockBits/j, k))
|
||
|
p += add
|
||
|
iter++
|
||
|
if add/p < ε {
|
||
|
break
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return p, iter
|
||
|
}
|
||
|
|
||
|
// FPRate computes an estimate of f's false positive rate after nkeys distinct
|
||
|
// keys have been added.
|
||
|
func (f *Filter) FPRate(nkeys uint64) float64 {
|
||
|
return FPRate(nkeys, f.NumBits(), f.k)
|
||
|
}
|
||
|
|
||
|
// Log of the FPR of a single block, FPR = (1 - exp(-k/c))^k.
|
||
|
func logFprBlock(c, k float64) float64 {
|
||
|
return k * math.Log1p(-math.Exp(-k/c))
|
||
|
}
|
||
|
|
||
|
// Log of the Poisson distribution's pmf.
|
||
|
func logPoisson(λ, k float64) float64 {
|
||
|
lg, _ := math.Lgamma(k + 1)
|
||
|
return k*math.Log(λ) - λ - lg
|
||
|
}
|