package bbolt

import (
	"bytes"
	"fmt"
	"unsafe"
)

const (
	// MaxKeySize is the maximum length of a key, in bytes.
	MaxKeySize = 32768

	// MaxValueSize is the maximum length of a value, in bytes.
	MaxValueSize = (1 << 31) - 2
)

const bucketHeaderSize = int(unsafe.Sizeof(bucket{}))

const (
	minFillPercent = 0.1
	maxFillPercent = 1.0
)

// DefaultFillPercent is the percentage that split pages are filled.
// This value can be changed by setting Bucket.FillPercent.
const DefaultFillPercent = 0.5

// Bucket represents a collection of key/value pairs inside the database.
type Bucket struct {
	*bucket
	tx       *Tx                // the associated transaction
	buckets  map[string]*Bucket // subbucket cache
	page     *page              // inline page reference
	rootNode *node              // materialized node for the root page.
	nodes    map[pgid]*node     // node cache

	// Sets the threshold for filling nodes when they split. By default,
	// the bucket will fill to 50% but it can be useful to increase this
	// amount if you know that your write workloads are mostly append-only.
	//
	// This is non-persisted across transactions so it must be set in every Tx.
	FillPercent float64
}

// bucket represents the on-file representation of a bucket.
// This is stored as the "value" of a bucket key. If the bucket is small enough,
// then its root page can be stored inline in the "value", after the bucket
// header. In the case of inline buckets, the "root" will be 0.
type bucket struct {
	root     pgid   // page id of the bucket's root-level page
	sequence uint64 // monotonically incrementing, used by NextSequence()
}

// newBucket returns a new bucket associated with a transaction.
func newBucket(tx *Tx) Bucket {
	var b = Bucket{tx: tx, FillPercent: DefaultFillPercent}
	if tx.writable {
		b.buckets = make(map[string]*Bucket)
		b.nodes = make(map[pgid]*node)
	}
	return b
}

// Tx returns the tx of the bucket.
func (b *Bucket) Tx() *Tx {
	return b.tx
}

// Root returns the root of the bucket.
func (b *Bucket) Root() pgid {
	return b.root
}

// Writable returns whether the bucket is writable.
func (b *Bucket) Writable() bool {
	return b.tx.writable
}

// Cursor creates a cursor associated with the bucket.
// The cursor is only valid as long as the transaction is open.
// Do not use a cursor after the transaction is closed.
func (b *Bucket) Cursor() *Cursor {
	// Update transaction statistics.
	b.tx.stats.CursorCount++

	// Allocate and return a cursor.
	return &Cursor{
		bucket: b,
		stack:  make([]elemRef, 0),
	}
}

// Bucket retrieves a nested bucket by name.
// Returns nil if the bucket does not exist.
// The bucket instance is only valid for the lifetime of the transaction.
func (b *Bucket) Bucket(name []byte) *Bucket {
	if b.buckets != nil {
		if child := b.buckets[string(name)]; child != nil {
			return child
		}
	}

	// Move cursor to key.
	c := b.Cursor()
	k, v, flags := c.seek(name)

	// Return nil if the key doesn't exist or it is not a bucket.
	if !bytes.Equal(name, k) || (flags&bucketLeafFlag) == 0 {
		return nil
	}

	// Otherwise create a bucket and cache it.
	var child = b.openBucket(v)
	if b.buckets != nil {
		b.buckets[string(name)] = child
	}

	return child
}

// Helper method that re-interprets a sub-bucket value
// from a parent into a Bucket
func (b *Bucket) openBucket(value []byte) *Bucket {
	var child = newBucket(b.tx)

	// Unaligned access requires a copy to be made.
	const unalignedMask = unsafe.Alignof(struct {
		bucket
		page
	}{}) - 1
	unaligned := uintptr(unsafe.Pointer(&value[0]))&unalignedMask != 0
	if unaligned {
		value = cloneBytes(value)
	}

	// If this is a writable transaction then we need to copy the bucket entry.
	// Read-only transactions can point directly at the mmap entry.
	if b.tx.writable && !unaligned {
		child.bucket = &bucket{}
		*child.bucket = *(*bucket)(unsafe.Pointer(&value[0]))
	} else {
		child.bucket = (*bucket)(unsafe.Pointer(&value[0]))
	}

	// Save a reference to the inline page if the bucket is inline.
	if child.root == 0 {
		child.page = (*page)(unsafe.Pointer(&value[bucketHeaderSize]))
	}

	return &child
}
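// The sketch below is illustrative and not part of the original file. It
// shows how FillPercent, which resets with every transaction, can be raised
// for append-only workloads; the function and bucket names are assumptions,
// while DB.Update and Tx.CreateBucketIfNotExists are the package's public API.
func exampleFillPercent(db *DB) error {
	return db.Update(func(tx *Tx) error {
		b, err := tx.CreateBucketIfNotExists([]byte("logs")) // assumed name
		if err != nil {
			return err
		}
		// Mostly-append workload: pack split pages ~90% full instead of the
		// default 50%, reducing wasted space in sequentially written pages.
		b.FillPercent = 0.9
		return b.Put([]byte("2024-01-02T15:04:05Z"), []byte("log entry"))
	})
}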
// CreateBucket creates a new bucket at the given key and returns the new bucket.
// Returns an error if the key already exists, if the bucket name is blank, or if the bucket name is too long.
// The bucket instance is only valid for the lifetime of the transaction.
func (b *Bucket) CreateBucket(key []byte) (*Bucket, error) {
	if b.tx.db == nil {
		return nil, ErrTxClosed
	} else if !b.tx.writable {
		return nil, ErrTxNotWritable
	} else if len(key) == 0 {
		return nil, ErrBucketNameRequired
	}

	// Move cursor to correct position.
	c := b.Cursor()
	k, _, flags := c.seek(key)

	// Return an error if there is an existing key.
	if bytes.Equal(key, k) {
		if (flags & bucketLeafFlag) != 0 {
			return nil, ErrBucketExists
		}
		return nil, ErrIncompatibleValue
	}

	// Create empty, inline bucket.
	var bucket = Bucket{
		bucket:      &bucket{},
		rootNode:    &node{isLeaf: true},
		FillPercent: DefaultFillPercent,
	}
	var value = bucket.write()

	// Insert into node.
	key = cloneBytes(key)
	c.node().put(key, key, value, 0, bucketLeafFlag)

	// Since subbuckets are not allowed on inline buckets, we need to
	// dereference the inline page, if it exists. This will cause the bucket
	// to be treated as a regular, non-inline bucket for the rest of the tx.
	b.page = nil

	return b.Bucket(key), nil
}

// CreateBucketIfNotExists creates a new bucket if it doesn't already exist and returns a reference to it.
// Returns an error if the bucket name is blank, or if the bucket name is too long.
// The bucket instance is only valid for the lifetime of the transaction.
func (b *Bucket) CreateBucketIfNotExists(key []byte) (*Bucket, error) {
	child, err := b.CreateBucket(key)
	if err == ErrBucketExists {
		return b.Bucket(key), nil
	} else if err != nil {
		return nil, err
	}
	return child, nil
}

// DeleteBucket deletes a bucket at the given key.
// Returns an error if the bucket does not exist, or if the key represents a non-bucket value.
func (b *Bucket) DeleteBucket(key []byte) error {
	if b.tx.db == nil {
		return ErrTxClosed
	} else if !b.Writable() {
		return ErrTxNotWritable
	}

	// Move cursor to correct position.
	c := b.Cursor()
	k, _, flags := c.seek(key)

	// Return an error if bucket doesn't exist or is not a bucket.
	if !bytes.Equal(key, k) {
		return ErrBucketNotFound
	} else if (flags & bucketLeafFlag) == 0 {
		return ErrIncompatibleValue
	}

	// Recursively delete all child buckets.
	child := b.Bucket(key)
	err := child.ForEach(func(k, v []byte) error {
		if _, _, childFlags := child.Cursor().seek(k); (childFlags & bucketLeafFlag) != 0 {
			if err := child.DeleteBucket(k); err != nil {
				return fmt.Errorf("delete bucket: %s", err)
			}
		}
		return nil
	})
	if err != nil {
		return err
	}

	// Remove cached copy.
	delete(b.buckets, string(key))

	// Release all bucket pages to freelist.
	child.nodes = nil
	child.rootNode = nil
	child.free()

	// Delete the node if we have a matching key.
	c.node().del(key)

	return nil
}
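// Illustrative sketch, not part of the original file: creating a nested
// bucket and deleting it again. CreateBucketIfNotExists makes creation
// idempotent, while DeleteBucket recursively removes child buckets and
// frees their pages. Function and bucket names are assumptions.
func exampleBucketLifecycle(db *DB) error {
	return db.Update(func(tx *Tx) error {
		parent, err := tx.CreateBucketIfNotExists([]byte("parent"))
		if err != nil {
			return err
		}
		if _, err := parent.CreateBucketIfNotExists([]byte("child")); err != nil {
			return err
		}
		// Removes "child" and any buckets nested beneath it.
		return parent.DeleteBucket([]byte("child"))
	})
}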
// Get retrieves the value for a key in the bucket.
// Returns a nil value if the key does not exist or if the key is a nested bucket.
// The returned value is only valid for the life of the transaction.
func (b *Bucket) Get(key []byte) []byte {
	k, v, flags := b.Cursor().seek(key)

	// Return nil if this is a bucket.
	if (flags & bucketLeafFlag) != 0 {
		return nil
	}

	// If our target node isn't the same key as what's passed in then return nil.
	if !bytes.Equal(key, k) {
		return nil
	}
	return v
}

// Put sets the value for a key in the bucket.
// If the key exists then its previous value will be overwritten.
// The supplied value must remain valid for the life of the transaction.
// Returns an error if the bucket was created from a read-only transaction, if the key is blank, if the key is too large, or if the value is too large.
func (b *Bucket) Put(key []byte, value []byte) error {
	if b.tx.db == nil {
		return ErrTxClosed
	} else if !b.Writable() {
		return ErrTxNotWritable
	} else if len(key) == 0 {
		return ErrKeyRequired
	} else if len(key) > MaxKeySize {
		return ErrKeyTooLarge
	} else if int64(len(value)) > MaxValueSize {
		return ErrValueTooLarge
	}

	// Move cursor to correct position.
	c := b.Cursor()
	k, _, flags := c.seek(key)

	// Return an error if there is an existing key with a bucket value.
	if bytes.Equal(key, k) && (flags&bucketLeafFlag) != 0 {
		return ErrIncompatibleValue
	}

	// Insert into node.
	key = cloneBytes(key)
	c.node().put(key, key, value, 0, 0)

	return nil
}

// Delete removes a key from the bucket.
// If the key does not exist then nothing is done and a nil error is returned.
// Returns an error if the bucket was created from a read-only transaction.
func (b *Bucket) Delete(key []byte) error {
	if b.tx.db == nil {
		return ErrTxClosed
	} else if !b.Writable() {
		return ErrTxNotWritable
	}

	// Move cursor to correct position.
	c := b.Cursor()
	k, _, flags := c.seek(key)

	// Return nil if the key doesn't exist.
	if !bytes.Equal(key, k) {
		return nil
	}

	// Return an error if there is an existing bucket value.
	if (flags & bucketLeafFlag) != 0 {
		return ErrIncompatibleValue
	}

	// Delete the node if we have a matching key.
	c.node().del(key)

	return nil
}

// Sequence returns the current integer for the bucket without incrementing it.
func (b *Bucket) Sequence() uint64 { return b.bucket.sequence }

// SetSequence updates the sequence number for the bucket.
func (b *Bucket) SetSequence(v uint64) error {
	if b.tx.db == nil {
		return ErrTxClosed
	} else if !b.Writable() {
		return ErrTxNotWritable
	}

	// Materialize the root node if it hasn't been already so that the
	// bucket will be saved during commit.
	if b.rootNode == nil {
		_ = b.node(b.root, nil)
	}

	// Set the sequence.
	b.bucket.sequence = v
	return nil
}

// NextSequence returns an autoincrementing integer for the bucket.
func (b *Bucket) NextSequence() (uint64, error) {
	if b.tx.db == nil {
		return 0, ErrTxClosed
	} else if !b.Writable() {
		return 0, ErrTxNotWritable
	}

	// Materialize the root node if it hasn't been already so that the
	// bucket will be saved during commit.
	if b.rootNode == nil {
		_ = b.node(b.root, nil)
	}

	// Increment and return the sequence.
	b.bucket.sequence++
	return b.bucket.sequence, nil
}
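// Illustrative sketch, not part of the original file: NextSequence yields a
// per-bucket auto-incrementing counter that is persisted with the bucket,
// which makes it convenient for generating ordered keys. The function and
// bucket names, and the zero-padded key encoding, are assumptions.
func exampleAutoIncrement(db *DB) error {
	return db.Update(func(tx *Tx) error {
		b, err := tx.CreateBucketIfNotExists([]byte("events"))
		if err != nil {
			return err
		}
		id, err := b.NextSequence() // 1 on the first call for a fresh bucket
		if err != nil {
			return err
		}
		// Zero-padded decimal keeps lexicographic key order equal to numeric order.
		key := []byte(fmt.Sprintf("%020d", id))
		return b.Put(key, []byte("payload"))
	})
}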
// ForEach executes a function for each key/value pair in a bucket.
// If the provided function returns an error then the iteration is stopped and
// the error is returned to the caller. The provided function must not modify
// the bucket; this will result in undefined behavior.
func (b *Bucket) ForEach(fn func(k, v []byte) error) error {
	if b.tx.db == nil {
		return ErrTxClosed
	}
	c := b.Cursor()
	for k, v := c.First(); k != nil; k, v = c.Next() {
		if err := fn(k, v); err != nil {
			return err
		}
	}
	return nil
}

// Stats returns stats on a bucket.
func (b *Bucket) Stats() BucketStats {
	var s, subStats BucketStats
	pageSize := b.tx.db.pageSize
	s.BucketN += 1
	if b.root == 0 {
		s.InlineBucketN += 1
	}
	b.forEachPage(func(p *page, depth int) {
		if (p.flags & leafPageFlag) != 0 {
			s.KeyN += int(p.count)

			// used totals the used bytes for the page
			used := pageHeaderSize

			if p.count != 0 {
				// If page has any elements, add all element headers.
				used += leafPageElementSize * uintptr(p.count-1)

				// Add all element key, value sizes.
				// The computation takes advantage of the fact that the position
				// of the last element's key/value equals to the total of the sizes
				// of all previous elements' keys and values.
				// It also includes the last element's header.
				lastElement := p.leafPageElement(p.count - 1)
				used += uintptr(lastElement.pos + lastElement.ksize + lastElement.vsize)
			}

			if b.root == 0 {
				// For inlined bucket just update the inline stats
				s.InlineBucketInuse += int(used)
			} else {
				// For non-inlined bucket update all the leaf stats
				s.LeafPageN++
				s.LeafInuse += int(used)
				s.LeafOverflowN += int(p.overflow)

				// Collect stats from sub-buckets.
				// Do that by iterating over all element headers
				// looking for the ones with the bucketLeafFlag.
				for i := uint16(0); i < p.count; i++ {
					e := p.leafPageElement(i)
					if (e.flags & bucketLeafFlag) != 0 {
						// For any bucket element, open the element value
						// and recursively call Stats on the contained bucket.
						subStats.Add(b.openBucket(e.value()).Stats())
					}
				}
			}
		} else if (p.flags & branchPageFlag) != 0 {
			s.BranchPageN++
			lastElement := p.branchPageElement(p.count - 1)

			// used totals the used bytes for the page
			// Add header and all element headers.
			used := pageHeaderSize + (branchPageElementSize * uintptr(p.count-1))

			// Add size of all keys and values.
			// Again, use the fact that last element's position equals to
			// the total of key, value sizes of all previous elements.
			used += uintptr(lastElement.pos + lastElement.ksize)
			s.BranchInuse += int(used)
			s.BranchOverflowN += int(p.overflow)
		}

		// Keep track of maximum page depth.
		if depth+1 > s.Depth {
			s.Depth = (depth + 1)
		}
	})

	// Alloc stats can be computed from page counts and pageSize.
	s.BranchAlloc = (s.BranchPageN + s.BranchOverflowN) * pageSize
	s.LeafAlloc = (s.LeafPageN + s.LeafOverflowN) * pageSize

	// Add the max depth of sub-buckets to get total nested depth.
	s.Depth += subStats.Depth
	// Add the stats for all sub-buckets
	s.Add(subStats)
	return s
}

// forEachPage iterates over every page in a bucket, including inline pages.
func (b *Bucket) forEachPage(fn func(*page, int)) {
	// If we have an inline page then just use that.
	if b.page != nil {
		fn(b.page, 0)
		return
	}

	// Otherwise traverse the page hierarchy.
	b.tx.forEachPage(b.root, 0, fn)
}

// forEachPageNode iterates over every page (or node) in a bucket.
// This also includes inline pages.
func (b *Bucket) forEachPageNode(fn func(*page, *node, int)) {
	// If we have an inline page or root node then just use that.
	if b.page != nil {
		fn(b.page, nil, 0)
		return
	}
	b._forEachPageNode(b.root, 0, fn)
}
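// Illustrative sketch, not part of the original file: ForEach visits pairs in
// key order within a read transaction; the value passed to the callback is
// nil when the key refers to a nested bucket. Function and bucket names are
// assumptions.
func exampleIterate(db *DB) error {
	return db.View(func(tx *Tx) error {
		b := tx.Bucket([]byte("events"))
		if b == nil {
			return ErrBucketNotFound
		}
		return b.ForEach(func(k, v []byte) error {
			fmt.Printf("%s = %s\n", k, v)
			return nil // returning an error here would stop the iteration
		})
	})
}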
func (b *Bucket) _forEachPageNode(pgid pgid, depth int, fn func(*page, *node, int)) {
	var p, n = b.pageNode(pgid)

	// Execute function.
	fn(p, n, depth)

	// Recursively loop over children.
	if p != nil {
		if (p.flags & branchPageFlag) != 0 {
			for i := 0; i < int(p.count); i++ {
				elem := p.branchPageElement(uint16(i))
				b._forEachPageNode(elem.pgid, depth+1, fn)
			}
		}
	} else {
		if !n.isLeaf {
			for _, inode := range n.inodes {
				b._forEachPageNode(inode.pgid, depth+1, fn)
			}
		}
	}
}

// spill writes all the nodes for this bucket to dirty pages.
func (b *Bucket) spill() error {
	// Spill all child buckets first.
	for name, child := range b.buckets {
		// If the child bucket is small enough and it has no child buckets then
		// write it inline into the parent bucket's page. Otherwise spill it
		// like a normal bucket and make the parent value a pointer to the page.
		var value []byte
		if child.inlineable() {
			child.free()
			value = child.write()
		} else {
			if err := child.spill(); err != nil {
				return err
			}

			// Update the child bucket header in this bucket.
			value = make([]byte, unsafe.Sizeof(bucket{}))
			var bucket = (*bucket)(unsafe.Pointer(&value[0]))
			*bucket = *child.bucket
		}

		// Skip writing the bucket if there are no materialized nodes.
		if child.rootNode == nil {
			continue
		}

		// Update parent node.
		var c = b.Cursor()
		k, _, flags := c.seek([]byte(name))
		if !bytes.Equal([]byte(name), k) {
			panic(fmt.Sprintf("misplaced bucket header: %x -> %x", []byte(name), k))
		}
		if flags&bucketLeafFlag == 0 {
			panic(fmt.Sprintf("unexpected bucket header flag: %x", flags))
		}
		c.node().put([]byte(name), []byte(name), value, 0, bucketLeafFlag)
	}

	// Ignore if there's not a materialized root node.
	if b.rootNode == nil {
		return nil
	}

	// Spill nodes.
	if err := b.rootNode.spill(); err != nil {
		return err
	}
	b.rootNode = b.rootNode.root()

	// Update the root node for this bucket.
	if b.rootNode.pgid >= b.tx.meta.pgid {
		panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", b.rootNode.pgid, b.tx.meta.pgid))
	}
	b.root = b.rootNode.pgid

	return nil
}

// inlineable returns true if a bucket is small enough to be written inline
// and if it contains no subbuckets. Otherwise returns false.
func (b *Bucket) inlineable() bool {
	var n = b.rootNode

	// Bucket must only contain a single leaf node.
	if n == nil || !n.isLeaf {
		return false
	}

	// Bucket is not inlineable if it contains subbuckets or if it goes beyond
	// our threshold for inline bucket size.
	var size = pageHeaderSize
	for _, inode := range n.inodes {
		size += leafPageElementSize + uintptr(len(inode.key)) + uintptr(len(inode.value))

		if inode.flags&bucketLeafFlag != 0 {
			return false
		} else if size > b.maxInlineBucketSize() {
			return false
		}
	}

	return true
}

// Returns the maximum total size of a bucket to make it a candidate for inlining.
func (b *Bucket) maxInlineBucketSize() uintptr {
	return uintptr(b.tx.db.pageSize / 4)
}

// write allocates and writes a bucket to a byte slice.
func (b *Bucket) write() []byte {
	// Allocate the appropriate size.
	var n = b.rootNode
	var value = make([]byte, bucketHeaderSize+n.size())

	// Write a bucket header.
	var bucket = (*bucket)(unsafe.Pointer(&value[0]))
	*bucket = *b.bucket

	// Convert byte slice to a fake page and write the root node.
	var p = (*page)(unsafe.Pointer(&value[bucketHeaderSize]))
	n.write(p)

	return value
}

// rebalance attempts to balance all nodes.
func (b *Bucket) rebalance() {
	for _, n := range b.nodes {
		n.rebalance()
	}
	for _, child := range b.buckets {
		child.rebalance()
	}
}

// node creates a node from a page and associates it with a given parent.
func (b *Bucket) node(pgid pgid, parent *node) *node {
	_assert(b.nodes != nil, "nodes map expected")

	// Retrieve node if it's already been created.
	if n := b.nodes[pgid]; n != nil {
		return n
	}

	// Otherwise create a node and cache it.
	n := &node{bucket: b, parent: parent}
	if parent == nil {
		b.rootNode = n
	} else {
		parent.children = append(parent.children, n)
	}

	// Use the inline page if this is an inline bucket.
	var p = b.page
	if p == nil {
		p = b.tx.page(pgid)
	}

	// Read the page into the node and cache it.
	n.read(p)
	b.nodes[pgid] = n

	// Update statistics.
	b.tx.stats.NodeCount++

	return n
}
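// Illustrative sketch, not part of the original file: with the default 4 KiB
// page size, maxInlineBucketSize is 4096/4 = 1024 bytes, so a bucket whose
// header, element headers, keys, and values stay under 1 KiB (and which has
// no sub-buckets of its own) is stored inline in its parent page. Stats
// exposes this via InlineBucketN; function and bucket names are assumptions.
func exampleInlineCheck(db *DB) error {
	// Write a tiny bucket, then inspect it in a later read transaction.
	if err := db.Update(func(tx *Tx) error {
		b, err := tx.CreateBucketIfNotExists([]byte("tiny"))
		if err != nil {
			return err
		}
		return b.Put([]byte("k"), []byte("v"))
	}); err != nil {
		return err
	}
	return db.View(func(tx *Tx) error {
		stats := tx.Bucket([]byte("tiny")).Stats()
		fmt.Println("inline buckets:", stats.InlineBucketN) // 1 while under the threshold
		return nil
	})
}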
// free recursively frees all pages in the bucket.
func (b *Bucket) free() {
	if b.root == 0 {
		return
	}

	var tx = b.tx
	b.forEachPageNode(func(p *page, n *node, _ int) {
		if p != nil {
			tx.db.freelist.free(tx.meta.txid, p)
		} else {
			n.free()
		}
	})
	b.root = 0
}

// dereference removes all references to the old mmap.
func (b *Bucket) dereference() {
	if b.rootNode != nil {
		b.rootNode.root().dereference()
	}

	for _, child := range b.buckets {
		child.dereference()
	}
}

// pageNode returns the in-memory node, if it exists.
// Otherwise returns the underlying page.
func (b *Bucket) pageNode(id pgid) (*page, *node) {
	// Inline buckets have a fake page embedded in their value so treat them
	// differently. We'll return the rootNode (if available) or the fake page.
	if b.root == 0 {
		if id != 0 {
			panic(fmt.Sprintf("inline bucket non-zero page access(2): %d != 0", id))
		}
		if b.rootNode != nil {
			return nil, b.rootNode
		}
		return b.page, nil
	}

	// Check the node cache for non-inline buckets.
	if b.nodes != nil {
		if n := b.nodes[id]; n != nil {
			return nil, n
		}
	}

	// Finally lookup the page from the transaction if no node is materialized.
	return b.tx.page(id), nil
}

// BucketStats records statistics about resources used by a bucket.
type BucketStats struct {
	// Page count statistics.
	BranchPageN     int // number of logical branch pages
	BranchOverflowN int // number of physical branch overflow pages
	LeafPageN       int // number of logical leaf pages
	LeafOverflowN   int // number of physical leaf overflow pages

	// Tree statistics.
	KeyN  int // number of keys/value pairs
	Depth int // number of levels in B+tree

	// Page size utilization.
	BranchAlloc int // bytes allocated for physical branch pages
	BranchInuse int // bytes actually used for branch data
	LeafAlloc   int // bytes allocated for physical leaf pages
	LeafInuse   int // bytes actually used for leaf data

	// Bucket statistics
	BucketN           int // total number of buckets including the top bucket
	InlineBucketN     int // total number of inlined buckets
	InlineBucketInuse int // bytes used for inlined buckets (also accounted for in LeafInuse)
}

func (s *BucketStats) Add(other BucketStats) {
	s.BranchPageN += other.BranchPageN
	s.BranchOverflowN += other.BranchOverflowN
	s.LeafPageN += other.LeafPageN
	s.LeafOverflowN += other.LeafOverflowN
	s.KeyN += other.KeyN
	if s.Depth < other.Depth {
		s.Depth = other.Depth
	}
	s.BranchAlloc += other.BranchAlloc
	s.BranchInuse += other.BranchInuse
	s.LeafAlloc += other.LeafAlloc
	s.LeafInuse += other.LeafInuse

	s.BucketN += other.BucketN
	s.InlineBucketN += other.InlineBucketN
	s.InlineBucketInuse += other.InlineBucketInuse
}

// cloneBytes returns a copy of a given slice.
func cloneBytes(v []byte) []byte {
	var clone = make([]byte, len(v))
	copy(clone, v)
	return clone
}
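// Illustrative sketch, not part of the original file: because BucketStats.Add
// is exported, callers can aggregate statistics themselves, for example across
// every top-level bucket of a database via Tx.ForEach. The function name is
// an assumption.
func exampleAggregateStats(db *DB) (BucketStats, error) {
	var total BucketStats
	err := db.View(func(tx *Tx) error {
		return tx.ForEach(func(name []byte, b *Bucket) error {
			total.Add(b.Stats())
			return nil
		})
	})
	return total, err
}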