well-goknown/vendor/github.com/greatroar/blobloom/optimize.go

// Copyright 2020 the Blobloom authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package blobloom

import "math"

// A Config holds parameters for Optimize or NewOptimized.
type Config struct {
	// Trigger the "contains filtered or unexported fields" message for
	// forward compatibility and force the caller to use named fields.
	_ struct{}

	// Capacity is the expected number of distinct keys to be added.
	// More keys can always be added, but the false positive rate can be
	// expected to drop below FPRate if their number exceeds the Capacity.
	Capacity uint64

	// Desired lower bound on the false positive rate when the Bloom filter
	// has been filled to its capacity. FPRate must be between zero
	// (exclusive) and one (inclusive).
	FPRate float64

	// Maximum size of the Bloom filter in bits. Zero means the global
	// MaxBits constant. A value less than BlockBits means BlockBits.
	MaxBits uint64
}

// NewOptimized is shorthand for New(Optimize(config)).
func NewOptimized(config Config) *Filter {
	return New(Optimize(config))
}

// NewSyncOptimized is shorthand for New(Optimize(config)).
func NewSyncOptimized(config Config) *SyncFilter {
	return NewSync(Optimize(config))
}

// Optimize returns numbers of keys and hash functions that achieve the
// desired false positive described by config.
//
// Optimize panics when config.FPRate is invalid.
//
// The estimated number of bits is imprecise for false positives rates below
// ca. 1e-15.
func Optimize(config Config) (nbits uint64, nhashes int) {
	n := float64(config.Capacity)
	p := config.FPRate

	if p <= 0 || p > 1 {
		panic("false positive rate for a Bloom filter must be > 0, <= 1")
	}
	if n == 0 {
		// Assume the client wants to add at least one key; log2(0) = -inf.
		n = 1
	}

	// The optimal nbits/n is c = -log2(p) / ln(2) for a vanilla Bloom filter.
	c := math.Ceil(-math.Log2(p) / math.Ln2)
	if c < float64(len(correctC)) {
		c = float64(correctC[int(c)])
	} else {
		// We can't achieve the desired FPR. Just triple the number of bits.
		c *= 3
	}
	nbits = uint64(c * n)

	// Round up to a multiple of BlockBits.
	if nbits%BlockBits != 0 {
		nbits += BlockBits - nbits%BlockBits
	}

	var maxbits uint64 = MaxBits
	if config.MaxBits != 0 && config.MaxBits < maxbits {
		maxbits = config.MaxBits
		if maxbits < BlockBits {
			maxbits = BlockBits
		}
	}
	if nbits > maxbits {
		nbits = maxbits
		// Round down to a multiple of BlockBits.
		nbits -= nbits % BlockBits
	}

	// The corresponding optimal number of hash functions is k = c * log(2).
	// Try rounding up and down to see which rounding is better.
	c = float64(nbits) / n
	k := c * math.Ln2
	if k < 1 {
		nhashes = 1
		return nbits, nhashes
	}

	ceilK, floorK := math.Floor(k), math.Ceil(k)
	if ceilK == floorK {
		return nbits, int(ceilK)
	}

	fprCeil, _ := fpRate(c, math.Ceil(k))
	fprFloor, _ := fpRate(c, math.Floor(k))
	if fprFloor < fprCeil {
		k = floorK
	} else {
		k = ceilK
	}

	return nbits, int(k)
}

// correctC maps c = m/n for a vanilla Bloom filter to the c' for a
// blocked Bloom filter.
//
// This is Putze et al.'s Table I, extended down to zero.
// For c > 34, the values become huge and are hard to compute.
var correctC = []byte{
	1, 1, 2, 4, 5,
	6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 20, 21, 23,
	25, 26, 28, 30, 32, 35, 38, 40, 44, 48, 51, 58, 64, 74, 90,
}

// FPRate computes an estimate of the false positive rate of a Bloom filter
// after nkeys distinct keys have been added.
func FPRate(nkeys, nbits uint64, nhashes int) float64 {
	if nkeys == 0 {
		return 0
	}
	p, _ := fpRate(float64(nbits)/float64(nkeys), float64(nhashes))
	return p
}

func fpRate(c, k float64) (p float64, iter int) {
	switch {
	case c == 0:
		panic("0 bits per key is too few")
	case k == 0:
		panic("0 hashes is too few")
	}

	// Putze et al.'s Equation (3).
	//
	// The Poisson distribution has a single spike around its mean
	// BlockBits/c that gets slimmer and further away from zero as c tends
	// to zero (the Bloom filter gets more filled). We start at the mean,
	// then add terms left and right of it until their relative contribution
	// drops below ε.
	const ε = 1e-9
	mean := BlockBits / c

	// Ceil to make sure we start at one, not zero.
	i := math.Ceil(mean)
	p = math.Exp(logPoisson(mean, i) + logFprBlock(BlockBits/i, k))

	for j := i - 1; j > 0; j-- {
		add := math.Exp(logPoisson(mean, j) + logFprBlock(BlockBits/j, k))
		p += add
		iter++
		if add/p < ε {
			break
		}
	}

	for j := i + 1; ; j++ {
		add := math.Exp(logPoisson(mean, j) + logFprBlock(BlockBits/j, k))
		p += add
		iter++
		if add/p < ε {
			break
		}
	}

	return p, iter
}

// FPRate computes an estimate of f's false positive rate after nkeys distinct
// keys have been added.
func (f *Filter) FPRate(nkeys uint64) float64 {
	return FPRate(nkeys, f.NumBits(), f.k)
}

// Log of the FPR of a single block, FPR = (1 - exp(-k/c))^k.
func logFprBlock(c, k float64) float64 {
	return k * math.Log1p(-math.Exp(-k/c))
}

// Log of the Poisson distribution's pmf.
func logPoisson(λ, k float64) float64 {
	lg, _ := math.Lgamma(k + 1)
	return k*math.Log(λ) - λ - lg
}
Add goreleaser 2024-10-29 00:11:29 +00:00			`// Copyright 2020 the Blobloom authors`
			`//`
			`// Licensed under the Apache License, Version 2.0 (the "License");`
			`// you may not use this file except in compliance with the License.`
			`// You may obtain a copy of the License at`
			`//`
			`// http://www.apache.org/licenses/LICENSE-2.0`
			`//`
			`// Unless required by applicable law or agreed to in writing, software`
			`// distributed under the License is distributed on an "AS IS" BASIS,`
			`// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`// See the License for the specific language governing permissions and`
			`// limitations under the License.`

			`package blobloom`

			`import "math"`

			`// A Config holds parameters for Optimize or NewOptimized.`
			`type Config struct {`
			`// Trigger the "contains filtered or unexported fields" message for`
			`// forward compatibility and force the caller to use named fields.`
			`_ struct{}`

			`// Capacity is the expected number of distinct keys to be added.`
			`// More keys can always be added, but the false positive rate can be`
			`// expected to drop below FPRate if their number exceeds the Capacity.`
			`Capacity uint64`

			`// Desired lower bound on the false positive rate when the Bloom filter`
			`// has been filled to its capacity. FPRate must be between zero`
			`// (exclusive) and one (inclusive).`
			`FPRate float64`

			`// Maximum size of the Bloom filter in bits. Zero means the global`
			`// MaxBits constant. A value less than BlockBits means BlockBits.`
			`MaxBits uint64`
			`}`

			`// NewOptimized is shorthand for New(Optimize(config)).`
			`func NewOptimized(config Config) *Filter {`
			`return New(Optimize(config))`
			`}`

			`// NewSyncOptimized is shorthand for New(Optimize(config)).`
			`func NewSyncOptimized(config Config) *SyncFilter {`
			`return NewSync(Optimize(config))`
			`}`

			`// Optimize returns numbers of keys and hash functions that achieve the`
			`// desired false positive described by config.`
			`//`
			`// Optimize panics when config.FPRate is invalid.`
			`//`
			`// The estimated number of bits is imprecise for false positives rates below`
			`// ca. 1e-15.`
			`func Optimize(config Config) (nbits uint64, nhashes int) {`
			`n := float64(config.Capacity)`
			`p := config.FPRate`

			`if p <= 0 \|\| p > 1 {`
			`panic("false positive rate for a Bloom filter must be > 0, <= 1")`
			`}`
			`if n == 0 {`
			`// Assume the client wants to add at least one key; log2(0) = -inf.`
			`n = 1`
			`}`

			`// The optimal nbits/n is c = -log2(p) / ln(2) for a vanilla Bloom filter.`
			`c := math.Ceil(-math.Log2(p) / math.Ln2)`
			`if c < float64(len(correctC)) {`
			`c = float64(correctC[int(c)])`
			`} else {`
			`// We can't achieve the desired FPR. Just triple the number of bits.`
			`c *= 3`
			`}`
			`nbits = uint64(c * n)`

			`// Round up to a multiple of BlockBits.`
			`if nbits%BlockBits != 0 {`
			`nbits += BlockBits - nbits%BlockBits`
			`}`

			`var maxbits uint64 = MaxBits`
			`if config.MaxBits != 0 && config.MaxBits < maxbits {`
			`maxbits = config.MaxBits`
			`if maxbits < BlockBits {`
			`maxbits = BlockBits`
			`}`
			`}`
			`if nbits > maxbits {`
			`nbits = maxbits`
			`// Round down to a multiple of BlockBits.`
			`nbits -= nbits % BlockBits`
			`}`

			`// The corresponding optimal number of hash functions is k = c * log(2).`
			`// Try rounding up and down to see which rounding is better.`
			`c = float64(nbits) / n`
			`k := c * math.Ln2`
			`if k < 1 {`
			`nhashes = 1`
			`return nbits, nhashes`
			`}`

			`ceilK, floorK := math.Floor(k), math.Ceil(k)`
			`if ceilK == floorK {`
			`return nbits, int(ceilK)`
			`}`

			`fprCeil, _ := fpRate(c, math.Ceil(k))`
			`fprFloor, _ := fpRate(c, math.Floor(k))`
			`if fprFloor < fprCeil {`
			`k = floorK`
			`} else {`
			`k = ceilK`
			`}`

			`return nbits, int(k)`
			`}`

			`// correctC maps c = m/n for a vanilla Bloom filter to the c' for a`
			`// blocked Bloom filter.`
			`//`
			`// This is Putze et al.'s Table I, extended down to zero.`
			`// For c > 34, the values become huge and are hard to compute.`
			`var correctC = []byte{`
			`1, 1, 2, 4, 5,`
			`6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 20, 21, 23,`
			`25, 26, 28, 30, 32, 35, 38, 40, 44, 48, 51, 58, 64, 74, 90,`
			`}`

			`// FPRate computes an estimate of the false positive rate of a Bloom filter`
			`// after nkeys distinct keys have been added.`
			`func FPRate(nkeys, nbits uint64, nhashes int) float64 {`
			`if nkeys == 0 {`
			`return 0`
			`}`
			`p, _ := fpRate(float64(nbits)/float64(nkeys), float64(nhashes))`
			`return p`
			`}`

			`func fpRate(c, k float64) (p float64, iter int) {`
			`switch {`
			`case c == 0:`
			`panic("0 bits per key is too few")`
			`case k == 0:`
			`panic("0 hashes is too few")`
			`}`

			`// Putze et al.'s Equation (3).`
			`//`
			`// The Poisson distribution has a single spike around its mean`
			`// BlockBits/c that gets slimmer and further away from zero as c tends`
			`// to zero (the Bloom filter gets more filled). We start at the mean,`
			`// then add terms left and right of it until their relative contribution`
			`// drops below ε.`
			`const ε = 1e-9`
			`mean := BlockBits / c`

			`// Ceil to make sure we start at one, not zero.`
			`i := math.Ceil(mean)`
			`p = math.Exp(logPoisson(mean, i) + logFprBlock(BlockBits/i, k))`

			`for j := i - 1; j > 0; j-- {`
			`add := math.Exp(logPoisson(mean, j) + logFprBlock(BlockBits/j, k))`
			`p += add`
			`iter++`
			`if add/p < ε {`
			`break`
			`}`
			`}`

			`for j := i + 1; ; j++ {`
			`add := math.Exp(logPoisson(mean, j) + logFprBlock(BlockBits/j, k))`
			`p += add`
			`iter++`
			`if add/p < ε {`
			`break`
			`}`
			`}`

			`return p, iter`
			`}`

			`// FPRate computes an estimate of f's false positive rate after nkeys distinct`
			`// keys have been added.`
			`func (f *Filter) FPRate(nkeys uint64) float64 {`
			`return FPRate(nkeys, f.NumBits(), f.k)`
			`}`

			`// Log of the FPR of a single block, FPR = (1 - exp(-k/c))^k.`
			`func logFprBlock(c, k float64) float64 {`
			`return k * math.Log1p(-math.Exp(-k/c))`
			`}`

			`// Log of the Poisson distribution's pmf.`
			`func logPoisson(λ, k float64) float64 {`
			`lg, _ := math.Lgamma(k + 1)`
			`return k*math.Log(λ) - λ - lg`
			`}`