// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package hashing provides utilities for and an implementation of a hash // table which is more performant than the default go map implementation // by leveraging xxh3 and some custom hash functions.
package hashing import ( ) //go:generate go run ../../arrow/_tools/tmpl/main.go -i -data=types.tmpldata xxh3_memo_table.gen.go.tmpl type TypeTraits interface { BytesRequired(n int) int } type ByteSlice interface { Bytes() []byte } // MemoTable interface for hash tables and dictionary encoding. // // Values will remember the order they are inserted to generate a valid // dictionary. type MemoTable interface { TypeTraits() TypeTraits // Reset drops everything in the table allowing it to be reused Reset() // Size returns the current number of unique values stored in // the table, including whether or not a null value has been // inserted via GetOrInsertNull. Size() int // CopyValues populates out with the values currently in the table, out must // be a slice of the appropriate type for the table type. CopyValues(out any) // CopyValuesSubset is like CopyValues but only copies a subset of values starting // at the indicated index. CopyValuesSubset(start int, out any) // Get returns the index of the table the specified value is, and a boolean indicating // whether or not the value was found in the table. Will panic if val is not the appropriate // type for the underlying table. Get(val interface{}) (int, bool) // GetOrInsert returns the index of the table the specified value is, // and a boolean indicating whether or not the value was found in // the table (if false, the value was inserted). An error is returned // if val is not the appropriate type for the table. GetOrInsert(val interface{}) (idx int, existed bool, err error) // GetOrInsertBytes returns the index of the table the specified value is, // and a boolean indicating whether or not the value was found in // the table (if false, the value was inserted). An error is returned // if val is not the appropriate type for the table. This function is intended to be used by // the BinaryMemoTable to prevent unnecessary allocations of the data when converting from a []byte to interface{}. GetOrInsertBytes(val []byte) (idx int, existed bool, err error) // GetOrInsertNull returns the index of the null value in the table, // inserting one if it hasn't already been inserted. It returns a boolean // indicating if the null value already existed or not in the table. GetOrInsertNull() (idx int, existed bool) // GetNull returns the index of the null value in the table, but does not // insert one if it doesn't already exist. Will return -1 if it doesn't exist // indicated by a false value for the boolean. GetNull() (idx int, exists bool) // WriteOut copies the unique values of the memotable out to the byte slice // provided. Must have allocated enough bytes for all the values. WriteOut(out []byte) // WriteOutSubset is like WriteOut, but only writes a subset of values // starting with the index offset. WriteOutSubset(offset int, out []byte) } type MemoTypes interface { int8 | int16 | int32 | int64 | uint8 | uint16 | uint32 | uint64 | float32 | float64 | []byte } type TypedMemoTable[ MemoTypes] interface { MemoTable Exists() bool InsertOrGet(val ) (idx int, found bool, err error) } type NumericMemoTable interface { MemoTable WriteOutLE(out []byte) WriteOutSubsetLE(offset int, out []byte) } const ( sentinel uint64 = 0 loadFactor int64 = 2 ) // KeyNotFound is the constant returned by memo table functions when a key isn't found in the table const KeyNotFound = -1 type BinaryBuilderIFace interface { Reserve(int) ReserveData(int) Retain() Resize(int) ResizeData(int) Release() DataLen() int Value(int) []byte Len() int AppendNull() AppendString(string) Append([]byte) } // BinaryMemoTable is our hashtable for binary data using the BinaryBuilder // to construct the actual data in an easy to pass around way with minimal copies // while using a hash table to keep track of the indexes into the dictionary that // is created as we go. type BinaryMemoTable struct { tbl *HashTable[int32] builder BinaryBuilderIFace nullIdx int } // NewBinaryMemoTable returns a hash table for Binary data, the passed in allocator will // be utilized for the BinaryBuilder, if nil then memory.DefaultAllocator will be used. // initial and valuesize can be used to pre-allocate the table to reduce allocations. With // initial being the initial number of entries to allocate for and valuesize being the starting // amount of space allocated for writing the actual binary data. func (, int, BinaryBuilderIFace) *BinaryMemoTable { .Reserve(int()) := if <= 0 { = * 4 } .ReserveData() return &BinaryMemoTable{tbl: NewHashTable[int32](uint64()), builder: , nullIdx: KeyNotFound} } type unimplementedtraits struct{} func (unimplementedtraits) (int) int { panic("unimplemented") } func (BinaryMemoTable) () TypeTraits { return unimplementedtraits{} } // Reset dumps all of the data in the table allowing it to be reutilized. func ( *BinaryMemoTable) () { .tbl.Reset(32) .builder.Resize(0) .builder.ResizeData(0) .builder.Reserve(int(32)) .builder.ReserveData(int(32) * 4) .nullIdx = KeyNotFound } // GetNull returns the index of a null that has been inserted into the table or // KeyNotFound. The bool returned will be true if there was a null inserted into // the table, and false otherwise. func ( *BinaryMemoTable) () (int, bool) { return int(.nullIdx), .nullIdx != KeyNotFound } // Size returns the current size of the memo table including the null value // if one has been inserted. func ( *BinaryMemoTable) () int { := int(.tbl.size) if , := .GetNull(); { ++ } return } // helper function to easily return a byte slice for any given value // regardless of the type if it's a []byte, string, or fulfills the // ByteSlice interface. func (BinaryMemoTable) ( interface{}) []byte { switch v := .(type) { case []byte: return case ByteSlice: return .Bytes() case string: return strToBytes() default: panic("invalid type for binarymemotable") } } // helper function to get the hash value regardless of the underlying binary type func (BinaryMemoTable) ( interface{}) uint64 { switch v := .(type) { case string: return hashString(, 0) case []byte: return Hash(, 0) case ByteSlice: return Hash(.Bytes(), 0) default: panic("invalid type for binarymemotable") } } func ( *BinaryMemoTable) ( uint64, []byte) (*entry[int32], bool) { return .tbl.Lookup(, func( int32) bool { return bytes.Equal(, .builder.Value(int())) }) } func ( *BinaryMemoTable) ( []byte) bool { , := .lookup(.getHash(), ) return } // Get returns the index of the specified value in the table or KeyNotFound, // and a boolean indicating whether it was found in the table. func ( *BinaryMemoTable) ( interface{}) (int, bool) { if , := .lookup(.getHash(), .valAsByteSlice()); { return int(.payload.val), } return KeyNotFound, false } // GetOrInsertBytes returns the index of the given value in the table, if not found // it is inserted into the table. The return value 'found' indicates whether the value // was found in the table (true) or inserted (false) along with any possible error. func ( *BinaryMemoTable) ( []byte) ( int, bool, error) { := Hash(, 0) , := .lookup(, ) if { = int(.payload.val) } else { = .Size() .builder.Append() .tbl.Insert(, , int32(), -1) } return } func ( *BinaryMemoTable) ( interface{}) ( int, bool, error) { return .InsertOrGet(.valAsByteSlice()) } // GetOrInsert returns the index of the given value in the table, if not found // it is inserted into the table. The return value 'found' indicates whether the value // was found in the table (true) or inserted (false) along with any possible error. func ( *BinaryMemoTable) ( []byte) ( int, bool, error) { := .getHash() , := .lookup(, ) if { = int(.payload.val) } else { = .Size() .builder.Append() .tbl.Insert(, , int32(), -1) } return } // GetOrInsertNull retrieves the index of a null in the table or inserts // null into the table, returning the index and a boolean indicating if it was // found in the table (true) or was inserted (false). func ( *BinaryMemoTable) () ( int, bool) { , = .GetNull() if ! { = .Size() .nullIdx = .builder.AppendNull() } return } func ( *BinaryMemoTable) ( int) []byte { return .builder.Value() } // helper function to get the offset into the builder data for a given // index value. func ( *BinaryMemoTable) ( int) uintptr { if .builder.DataLen() == 0 { // only empty strings, short circuit return 0 } := .builder.Value() for len() == 0 { ++ if >= .builder.Len() { break } = .builder.Value() } if len() != 0 { return uintptr(unsafe.Pointer(&[0])) } return uintptr(.builder.DataLen()) + .(0) } // CopyOffsets copies the list of offsets into the passed in slice, the offsets // being the start and end values of the underlying allocated bytes in the builder // for the individual values of the table. out should be at least sized to Size()+1 func ( *BinaryMemoTable) ( []int32) { .CopyOffsetsSubset(0, ) } // CopyOffsetsSubset is like CopyOffsets but instead of copying all of the offsets, // it gets a subset of the offsets in the table starting at the index provided by "start". func ( *BinaryMemoTable) ( int, []int32) { if .builder.Len() <= { return } := .findOffset(0) := .findOffset() := .Size() for := ; < ; ++ { := int32(.findOffset() - ) [-] = } [-] = int32(.builder.DataLen() - (int() - int())) } // CopyLargeOffsets copies the list of offsets into the passed in slice, the offsets // being the start and end values of the underlying allocated bytes in the builder // for the individual values of the table. out should be at least sized to Size()+1 func ( *BinaryMemoTable) ( []int64) { .CopyLargeOffsetsSubset(0, ) } // CopyLargeOffsetsSubset is like CopyOffsets but instead of copying all of the offsets, // it gets a subset of the offsets in the table starting at the index provided by "start". func ( *BinaryMemoTable) ( int, []int64) { if .builder.Len() <= { return } := .findOffset(0) := .findOffset() := .Size() for := ; < ; ++ { := int64(.findOffset() - ) [-] = } [-] = int64(.builder.DataLen() - (int() - int())) } // CopyValues copies the raw binary data bytes out, out should be a []byte // with at least ValuesSize bytes allocated to copy into. func ( *BinaryMemoTable) ( interface{}) { .CopyValuesSubset(0, ) } // CopyValuesSubset copies the raw binary data bytes out starting with the value // at the index start, out should be a []byte with at least ValuesSize bytes allocated func ( *BinaryMemoTable) ( int, interface{}) { if .builder.Len() <= { return } var ( = .findOffset(0) = .findOffset(int()) = .builder.DataLen() - int(-) ) := .([]byte) copy(, .builder.Value()[0:]) } func ( *BinaryMemoTable) ( []byte) { .CopyValues() } func ( *BinaryMemoTable) ( int, []byte) { .CopyValuesSubset(, ) } // CopyFixedWidthValues exists to cope with the fact that the table doesn't keep // track of the fixed width when inserting the null value the databuffer holds a // zero length byte slice for the null value (if found) func ( *BinaryMemoTable) (, int, []byte) { if >= .Size() { return } , := .GetNull() if ! || < { // nothing to skip, proceed as usual .CopyValuesSubset(, ) return } var ( = .findOffset() = .findOffset() = - = + uintptr(.ValuesSize()) ) if > 0 { copy(, .builder.Value()[0:]) } := - if > 0 { // skip the null fixed size value copy([int()+:], .builder.Value( + 1)[0:]) } } // VisitValues exists to run the visitFn on each value currently in the hash table. func ( *BinaryMemoTable) ( int, func([]byte)) { for := int(); < .Size(); ++ { (.builder.Value()) } } // Release is used to tell the underlying builder that it can release the memory allocated // when the reference count reaches 0, this is safe to be called from multiple goroutines // simultaneously func ( *BinaryMemoTable) () { .builder.Release() } // Retain increases the ref count, it is safe to call it from multiple goroutines // simultaneously. func ( *BinaryMemoTable) () { .builder.Retain() } // ValuesSize returns the current total size of all the raw bytes that have been inserted // into the memotable so far. func ( *BinaryMemoTable) () int { return .builder.DataLen() }