// Licensed to the Apache Software Foundation (ASF) under one// or more contributor license agreements. See the NOTICE file// distributed with this work for additional information// regarding copyright ownership. The ASF licenses this file// to you under the Apache License, Version 2.0 (the// "License"); you may not use this file except in compliance// with the License. You may obtain a copy of the License at//// http://www.apache.org/licenses/LICENSE-2.0//// Unless required by applicable law or agreed to in writing, software// distributed under the License is distributed on an "AS IS" BASIS,// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.// See the License for the specific language governing permissions and// limitations under the License.
// Package hashing provides utilities for and an implementation of a hash// table which is more performant than the default go map implementation// by leveraging xxh3 and some custom hash functions.
package hashingimport ()//go:generate go run ../../arrow/_tools/tmpl/main.go -i -data=types.tmpldata xxh3_memo_table.gen.go.tmpltypeTypeTraitsinterface {BytesRequired(n int) int}typeByteSliceinterface {Bytes() []byte}// MemoTable interface for hash tables and dictionary encoding.//// Values will remember the order they are inserted to generate a valid// dictionary.typeMemoTableinterface {TypeTraits() TypeTraits// Reset drops everything in the table allowing it to be reusedReset()// Size returns the current number of unique values stored in // the table, including whether or not a null value has been // inserted via GetOrInsertNull.Size() int// CopyValues populates out with the values currently in the table, out must // be a slice of the appropriate type for the table type.CopyValues(out any)// CopyValuesSubset is like CopyValues but only copies a subset of values starting // at the indicated index.CopyValuesSubset(start int, out any)// Get returns the index of the table the specified value is, and a boolean indicating // whether or not the value was found in the table. Will panic if val is not the appropriate // type for the underlying table.Get(val interface{}) (int, bool)// GetOrInsert returns the index of the table the specified value is, // and a boolean indicating whether or not the value was found in // the table (if false, the value was inserted). An error is returned // if val is not the appropriate type for the table.GetOrInsert(val interface{}) (idx int, existed bool, err error)// GetOrInsertBytes returns the index of the table the specified value is, // and a boolean indicating whether or not the value was found in // the table (if false, the value was inserted). An error is returned // if val is not the appropriate type for the table. This function is intended to be used by // the BinaryMemoTable to prevent unnecessary allocations of the data when converting from a []byte to interface{}.GetOrInsertBytes(val []byte) (idx int, existed bool, err error)// GetOrInsertNull returns the index of the null value in the table, // inserting one if it hasn't already been inserted. It returns a boolean // indicating if the null value already existed or not in the table.GetOrInsertNull() (idx int, existed bool)// GetNull returns the index of the null value in the table, but does not // insert one if it doesn't already exist. Will return -1 if it doesn't exist // indicated by a false value for the boolean.GetNull() (idx int, exists bool)// WriteOut copies the unique values of the memotable out to the byte slice // provided. Must have allocated enough bytes for all the values.WriteOut(out []byte)// WriteOutSubset is like WriteOut, but only writes a subset of values // starting with the index offset.WriteOutSubset(offset int, out []byte)}typeMemoTypesinterface {int8 | int16 | int32 | int64 |uint8 | uint16 | uint32 | uint64 |float32 | float64 | []byte}typeTypedMemoTable[ MemoTypes] interface {MemoTableExists() boolInsertOrGet(val ) (idx int, found bool, err error)}typeNumericMemoTableinterface {MemoTableWriteOutLE(out []byte)WriteOutSubsetLE(offset int, out []byte)}const ( sentinel uint64 = 0 loadFactor int64 = 2)// KeyNotFound is the constant returned by memo table functions when a key isn't found in the tableconstKeyNotFound = -1typeBinaryBuilderIFaceinterface {Reserve(int)ReserveData(int)Retain()Resize(int)ResizeData(int)Release()DataLen() intValue(int) []byteLen() intAppendNull()AppendString(string)Append([]byte)}// BinaryMemoTable is our hashtable for binary data using the BinaryBuilder// to construct the actual data in an easy to pass around way with minimal copies// while using a hash table to keep track of the indexes into the dictionary that// is created as we go.typeBinaryMemoTablestruct { tbl *HashTable[int32] builder BinaryBuilderIFace nullIdx int}// NewBinaryMemoTable returns a hash table for Binary data, the passed in allocator will// be utilized for the BinaryBuilder, if nil then memory.DefaultAllocator will be used.// initial and valuesize can be used to pre-allocate the table to reduce allocations. With// initial being the initial number of entries to allocate for and valuesize being the starting// amount of space allocated for writing the actual binary data.func (, int, BinaryBuilderIFace) *BinaryMemoTable { .Reserve(int()) := if <= 0 { = * 4 } .ReserveData()return &BinaryMemoTable{tbl: NewHashTable[int32](uint64()), builder: , nullIdx: KeyNotFound}}type unimplementedtraits struct{}func (unimplementedtraits) (int) int { panic("unimplemented") }func (BinaryMemoTable) () TypeTraits {returnunimplementedtraits{}}// Reset dumps all of the data in the table allowing it to be reutilized.func ( *BinaryMemoTable) () { .tbl.Reset(32) .builder.Resize(0) .builder.ResizeData(0) .builder.Reserve(int(32)) .builder.ReserveData(int(32) * 4) .nullIdx = KeyNotFound}// GetNull returns the index of a null that has been inserted into the table or// KeyNotFound. The bool returned will be true if there was a null inserted into// the table, and false otherwise.func ( *BinaryMemoTable) () (int, bool) {returnint(.nullIdx), .nullIdx != KeyNotFound}// Size returns the current size of the memo table including the null value// if one has been inserted.func ( *BinaryMemoTable) () int { := int(.tbl.size)if , := .GetNull(); { ++ }return}// helper function to easily return a byte slice for any given value// regardless of the type if it's a []byte, string, or fulfills the// ByteSlice interface.func (BinaryMemoTable) ( interface{}) []byte {switch v := .(type) {case []byte:returncaseByteSlice:return .Bytes()casestring:returnstrToBytes()default:panic("invalid type for binarymemotable") }}// helper function to get the hash value regardless of the underlying binary typefunc (BinaryMemoTable) ( interface{}) uint64 {switch v := .(type) {casestring:returnhashString(, 0)case []byte:returnHash(, 0)caseByteSlice:returnHash(.Bytes(), 0)default:panic("invalid type for binarymemotable") }}func ( *BinaryMemoTable) ( uint64, []byte) (*entry[int32], bool) {return .tbl.Lookup(, func( int32) bool {returnbytes.Equal(, .builder.Value(int())) })}func ( *BinaryMemoTable) ( []byte) bool { , := .lookup(.getHash(), )return}// Get returns the index of the specified value in the table or KeyNotFound,// and a boolean indicating whether it was found in the table.func ( *BinaryMemoTable) ( interface{}) (int, bool) {if , := .lookup(.getHash(), .valAsByteSlice()); {returnint(.payload.val), }returnKeyNotFound, false}// GetOrInsertBytes returns the index of the given value in the table, if not found// it is inserted into the table. The return value 'found' indicates whether the value// was found in the table (true) or inserted (false) along with any possible error.func ( *BinaryMemoTable) ( []byte) ( int, bool, error) { := Hash(, 0) , := .lookup(, )if { = int(.payload.val) } else { = .Size() .builder.Append() .tbl.Insert(, , int32(), -1) }return}func ( *BinaryMemoTable) ( interface{}) ( int, bool, error) {return .InsertOrGet(.valAsByteSlice())}// GetOrInsert returns the index of the given value in the table, if not found// it is inserted into the table. The return value 'found' indicates whether the value// was found in the table (true) or inserted (false) along with any possible error.func ( *BinaryMemoTable) ( []byte) ( int, bool, error) { := .getHash() , := .lookup(, )if { = int(.payload.val) } else { = .Size() .builder.Append() .tbl.Insert(, , int32(), -1) }return}// GetOrInsertNull retrieves the index of a null in the table or inserts// null into the table, returning the index and a boolean indicating if it was// found in the table (true) or was inserted (false).func ( *BinaryMemoTable) () ( int, bool) { , = .GetNull()if ! { = .Size() .nullIdx = .builder.AppendNull() }return}func ( *BinaryMemoTable) ( int) []byte {return .builder.Value()}// helper function to get the offset into the builder data for a given// index value.func ( *BinaryMemoTable) ( int) uintptr {if .builder.DataLen() == 0 {// only empty strings, short circuitreturn0 } := .builder.Value()forlen() == 0 { ++if >= .builder.Len() {break } = .builder.Value() }iflen() != 0 {returnuintptr(unsafe.Pointer(&[0])) }returnuintptr(.builder.DataLen()) + .(0)}// CopyOffsets copies the list of offsets into the passed in slice, the offsets// being the start and end values of the underlying allocated bytes in the builder// for the individual values of the table. out should be at least sized to Size()+1func ( *BinaryMemoTable) ( []int32) { .CopyOffsetsSubset(0, )}// CopyOffsetsSubset is like CopyOffsets but instead of copying all of the offsets,// it gets a subset of the offsets in the table starting at the index provided by "start".func ( *BinaryMemoTable) ( int, []int32) {if .builder.Len() <= {return } := .findOffset(0) := .findOffset() := .Size()for := ; < ; ++ { := int32(.findOffset() - ) [-] = } [-] = int32(.builder.DataLen() - (int() - int()))}// CopyLargeOffsets copies the list of offsets into the passed in slice, the offsets// being the start and end values of the underlying allocated bytes in the builder// for the individual values of the table. out should be at least sized to Size()+1func ( *BinaryMemoTable) ( []int64) { .CopyLargeOffsetsSubset(0, )}// CopyLargeOffsetsSubset is like CopyOffsets but instead of copying all of the offsets,// it gets a subset of the offsets in the table starting at the index provided by "start".func ( *BinaryMemoTable) ( int, []int64) {if .builder.Len() <= {return } := .findOffset(0) := .findOffset() := .Size()for := ; < ; ++ { := int64(.findOffset() - ) [-] = } [-] = int64(.builder.DataLen() - (int() - int()))}// CopyValues copies the raw binary data bytes out, out should be a []byte// with at least ValuesSize bytes allocated to copy into.func ( *BinaryMemoTable) ( interface{}) { .CopyValuesSubset(0, )}// CopyValuesSubset copies the raw binary data bytes out starting with the value// at the index start, out should be a []byte with at least ValuesSize bytes allocatedfunc ( *BinaryMemoTable) ( int, interface{}) {if .builder.Len() <= {return }var ( = .findOffset(0) = .findOffset(int()) = .builder.DataLen() - int(-) ) := .([]byte)copy(, .builder.Value()[0:])}func ( *BinaryMemoTable) ( []byte) { .CopyValues()}func ( *BinaryMemoTable) ( int, []byte) { .CopyValuesSubset(, )}// CopyFixedWidthValues exists to cope with the fact that the table doesn't keep// track of the fixed width when inserting the null value the databuffer holds a// zero length byte slice for the null value (if found)func ( *BinaryMemoTable) (, int, []byte) {if >= .Size() {return } , := .GetNull()if ! || < {// nothing to skip, proceed as usual .CopyValuesSubset(, )return }var ( = .findOffset() = .findOffset() = - = + uintptr(.ValuesSize()) )if > 0 {copy(, .builder.Value()[0:]) } := - if > 0 {// skip the null fixed size valuecopy([int()+:], .builder.Value( + 1)[0:]) }}// VisitValues exists to run the visitFn on each value currently in the hash table.func ( *BinaryMemoTable) ( int, func([]byte)) {for := int(); < .Size(); ++ { (.builder.Value()) }}// Release is used to tell the underlying builder that it can release the memory allocated// when the reference count reaches 0, this is safe to be called from multiple goroutines// simultaneouslyfunc ( *BinaryMemoTable) () { .builder.Release() }// Retain increases the ref count, it is safe to call it from multiple goroutines// simultaneously.func ( *BinaryMemoTable) () { .builder.Retain() }// ValuesSize returns the current total size of all the raw bytes that have been inserted// into the memotable so far.func ( *BinaryMemoTable) () int { return .builder.DataLen() }
The pages are generated with Goldsv0.8.2. (GOOS=linux GOARCH=amd64)
Golds is a Go 101 project developed by Tapir Liu.
PR and bug reports are welcome and can be submitted to the issue list.
Please follow @zigo_101 (reachable from the left QR code) to get the latest news of Golds.