package builder
import (
"fmt"
"math"
"slices"
"sync/atomic"
"unsafe"
"github.com/apache/arrow-go/v18/arrow"
"github.com/apache/arrow-go/v18/arrow/array"
"github.com/apache/arrow-go/v18/arrow/bitutil"
"github.com/apache/arrow-go/v18/arrow/memory"
"github.com/parquet-go/parquet-go"
)
// ColumnBuilder is the basic contract satisfied by the column builders in
// this package.
type ColumnBuilder interface {
// Retain adds a reference to the builder.
Retain ()
// Release drops a reference to the builder.
Release ()
// Len reports how many entries the builder currently holds.
Len () int
// AppendNull appends a single null entry.
AppendNull ()
// Reserve takes a size hint; its effect is implementation-specific.
Reserve (int )
// NewArray produces an arrow.Array from the builder's contents.
NewArray () arrow .Array
}
// OptimizedBuilder extends ColumnBuilder with bulk and positional operations.
type OptimizedBuilder interface {
ColumnBuilder
// AppendNulls appends the given number of null entries.
AppendNulls (int )
// ResetToLength truncates the builder to the given number of entries.
ResetToLength (int )
// RepeatLastValue appends the given number of copies of the last entry.
RepeatLastValue (int ) error
// IsNull reports whether entry i is null.
IsNull (i int ) bool
// IsValid reports whether entry i is non-null.
IsValid (i int ) bool
// SetNull marks entry i as null.
SetNull (i int )
}
// builderBase holds the state shared by the builders in this file.
type builderBase struct {
dtype arrow .DataType // data type passed to array.NewData by the NewArray methods
refCount int64 // reference counter, modified atomically by Retain/Release
length int // number of entries appended so far
validityBitmap []byte // one bit per entry; a set bit means the entry is non-null
}
// reset empties the base state: the entry count goes back to zero and the
// validity bitmap is truncated to zero length (its capacity is kept).
func (b *builderBase) reset() {
	b.validityBitmap = b.validityBitmap[:0]
	b.length = 0
}
// Retain atomically increments the reference counter by one.
func (b *builderBase) Retain() {
	const delta int64 = 1
	atomic.AddInt64(&b.refCount, delta)
}
// releaseInternal drops the validity bitmap and zeroes the entry count.
func (b *builderBase) releaseInternal() {
	b.length, b.validityBitmap = 0, nil
}
// Release atomically decrements the reference counter and drops the base
// state only once the counter reaches zero, matching the Release methods of
// the concrete builders in this file. Previously the state was dropped on
// every call, even while other references were still outstanding.
func (b *builderBase) Release() {
	if atomic.AddInt64(&b.refCount, -1) == 0 {
		b.releaseInternal()
	}
}
// Len returns the number of entries currently held by the builder.
func (b *builderBase) Len() int {
	n := b.length
	return n
}
// Reserve is a no-op in the base implementation; the argument is ignored.
func (b *builderBase) Reserve(_ int) {
}
// AppendNulls grows the validity bitmap by n entries, clears their bits and
// advances the entry count by n.
func (b *builderBase) AppendNulls(n int) {
	first := b.length
	b.validityBitmap = resizeBitmap(b.validityBitmap, first+n)
	bitutil.SetBitsTo(b.validityBitmap, int64(first), int64(n), false)
	b.length = first + n
}
// SetNull clears the validity bit of entry i, marking it null.
func (b *builderBase) SetNull(i int) {
	bitutil.SetBitTo(b.validityBitmap, i, false)
}
// IsValid reports whether the validity bit of entry n is set.
func (b *builderBase) IsValid(n int) bool {
	return !bitutil.BitIsNotSet(b.validityBitmap, n)
}
// appendValid grows the validity bitmap by n entries, sets their bits and
// advances the entry count by n. It does not touch any value storage.
func (b *builderBase) appendValid(n int) {
	first := b.length
	b.validityBitmap = resizeBitmap(b.validityBitmap, first+n)
	bitutil.SetBitsTo(b.validityBitmap, int64(first), int64(n), true)
	b.length = first + n
}
// IsNull reports whether the validity bit of entry n is cleared.
func (b *builderBase) IsNull(n int) bool {
	return !bitutil.BitIsSet(b.validityBitmap, n)
}
// resizeBitmap returns bitmap resliced to exactly the number of bytes needed
// to hold valuesToRepresent bits. When the existing capacity is too small a
// new slice (sized to the next power of two) is allocated and the current
// contents are copied into it. Bytes re-exposed from existing capacity are
// not cleared here.
func resizeBitmap(bitmap []byte, valuesToRepresent int) []byte {
	need := int(bitutil.BytesForBits(int64(valuesToRepresent)))
	if need <= cap(bitmap) {
		return bitmap[:need]
	}
	grown := make([]byte, bitutil.NextPowerOf2(need))
	copy(grown, bitmap)
	return grown[:need]
}
// Compile-time checks that every optimized builder in this file satisfies
// OptimizedBuilder. OptInt32Builder was previously missing from this list.
var (
	_ OptimizedBuilder = (*OptBinaryBuilder)(nil)
	_ OptimizedBuilder = (*OptInt64Builder)(nil)
	_ OptimizedBuilder = (*OptBooleanBuilder)(nil)
	_ OptimizedBuilder = (*OptInt32Builder)(nil)
	_ OptimizedBuilder = (*OptFloat64Builder)(nil)
)
// OptBinaryBuilder accumulates variable-length byte values.
type OptBinaryBuilder struct {
builderBase
data []byte // concatenated bytes of all appended values
offsets []uint32 // start position in data of each entry
}
// NewOptBinaryBuilder returns an empty OptBinaryBuilder that will produce
// arrays of the given binary data type.
func NewOptBinaryBuilder(dtype arrow.BinaryDataType) *OptBinaryBuilder {
	return &OptBinaryBuilder{
		builderBase: builderBase{dtype: dtype},
	}
}
// Release atomically decrements the reference counter; when it reaches zero
// the data, offsets and base state are dropped.
func (b *OptBinaryBuilder) Release() {
	if atomic.AddInt64(&b.refCount, -1) != 0 {
		return
	}
	b.data, b.offsets = nil, nil
	b.releaseInternal()
}
// AppendNull appends one null entry: an offset pointing at the current end of
// the data plus a cleared validity bit.
func (b *OptBinaryBuilder) AppendNull() {
	end := uint32(len(b.data))
	b.offsets = append(b.offsets, end)
	b.builderBase.AppendNulls(1)
}
// AppendEmptyValue appends one valid, zero-length entry.
func (b *OptBinaryBuilder) AppendEmptyValue() {
	end := uint32(len(b.data))
	b.offsets = append(b.offsets, end)
	b.appendValid(1)
}
// AppendNulls appends n null entries. Each gets an offset pointing at the
// current end of the data; the validity bits are cleared by the base builder.
func (b *OptBinaryBuilder) AppendNulls(n int) {
	end := uint32(len(b.data))
	for remaining := n; remaining > 0; remaining-- {
		b.offsets = append(b.offsets, end)
	}
	b.builderBase.AppendNulls(n)
}
// NewArray builds a binary arrow array from the accumulated entries and
// leaves the builder empty.
//
// A closing offset (the total data length) is appended first so that the
// offsets buffer holds length+1 entries. The null count is the number of
// entries minus the number of set validity bits.
func (b *OptBinaryBuilder) NewArray() arrow.Array {
	b.offsets = append(b.offsets, uint32(len(b.data)))
	// Reinterpret the []uint32 offsets as raw bytes without copying.
	offsetsAsBytes := unsafe.Slice((*byte)(unsafe.Pointer(unsafe.SliceData(b.offsets))), len(b.offsets)*arrow.Uint32SizeBytes)
	data := array.NewData(
		b.dtype,
		b.length,
		[]*memory.Buffer{
			memory.NewBufferBytes(b.validityBitmap),
			memory.NewBufferBytes(offsetsAsBytes),
			memory.NewBufferBytes(b.data),
		},
		nil,
		b.length-bitutil.CountSetBits(b.validityBitmap, 0, b.length),
		0,
	)
	b.reset()
	// The buffers created above are built directly from the builder's slices.
	// Truncating and reusing those slices would let later appends overwrite
	// memory the returned array still reads, so ownership is handed over and
	// the builder starts again from fresh storage.
	b.validityBitmap = nil
	b.offsets = nil
	b.data = nil
	return array.NewBinaryData(data)
}
// ErrMaxSizeReached is returned by OptBinaryBuilder append operations when the
// combined data size would exceed math.MaxInt32 bytes.
var ErrMaxSizeReached = fmt .Errorf ("max size reached" )
// AppendData appends a flat run of bytes together with its offsets. All
// appended entries are marked valid.
//
// The last element of offsets is dropped before use, so offsets must contain
// at least one element; an empty slice is not handled. The remaining offsets
// are shifted by the amount of data already in the builder. Returns
// ErrMaxSizeReached, without modifying the builder, if the combined data
// would exceed math.MaxInt32 bytes.
func (b *OptBinaryBuilder) AppendData(data []byte, offsets []uint32) error {
	if len(b.data)+len(data) > math.MaxInt32 {
		return ErrMaxSizeReached
	}

	offsets = offsets[:len(offsets)-1]
	shift := uint32(len(b.data))
	first := len(b.offsets)

	b.data = append(b.data, data...)
	b.offsets = append(b.offsets, offsets...)
	// Rebase only the offsets that were just appended.
	rebased := b.offsets[first:]
	for i := range rebased {
		rebased[i] += shift
	}

	b.length += len(offsets)
	b.validityBitmap = resizeBitmap(b.validityBitmap, b.length)
	bitutil.SetBitsTo(b.validityBitmap, int64(first), int64(len(offsets)), true)
	return nil
}
// Append adds v as one valid entry. Returns ErrMaxSizeReached, leaving the
// builder untouched, if the data would grow beyond math.MaxInt32 bytes.
func (b *OptBinaryBuilder) Append(v []byte) error {
	if len(b.data)+len(v) > math.MaxInt32 {
		return ErrMaxSizeReached
	}
	start := uint32(len(b.data))
	b.offsets = append(b.offsets, start)
	b.data = append(b.data, v...)

	idx := b.length
	b.length = idx + 1
	b.validityBitmap = resizeBitmap(b.validityBitmap, b.length)
	bitutil.SetBit(b.validityBitmap, idx)
	return nil
}
// AppendParquetValues appends the byte array of every parquet value. An
// entry's validity bit is set unless the value reports IsNull. Returns
// ErrMaxSizeReached, without modifying the builder, if the combined data
// would exceed math.MaxInt32 bytes.
func (b *OptBinaryBuilder) AppendParquetValues(values []parquet.Value) error {
	total := 0
	for i := range values {
		total += len(values[i].ByteArray())
	}
	if len(b.data)+total > math.MaxInt32 {
		return ErrMaxSizeReached
	}

	base := b.length
	b.length = base + len(values)
	b.validityBitmap = resizeBitmap(b.validityBitmap, b.length)
	for i := range values {
		b.offsets = append(b.offsets, uint32(len(b.data)))
		b.data = append(b.data, values[i].ByteArray()...)
		bitutil.SetBitTo(b.validityBitmap, base+i, !values[i].IsNull())
	}
	return nil
}
// RepeatLastValue appends n copies of the last entry. If the last entry is
// null, n nulls are appended instead. Returns ErrMaxSizeReached if the copies
// would push the data beyond math.MaxInt32 bytes.
//
// The builder must already hold at least one entry: the code inspects the
// entry at index length-1.
func (b *OptBinaryBuilder) RepeatLastValue(n int) error {
	if b.IsNull(b.length - 1) {
		b.AppendNulls(n)
		return nil
	}

	lastStart := b.offsets[len(b.offsets)-1]
	lastValue := b.data[lastStart:]
	if len(b.data)+(len(lastValue)*n) > math.MaxInt32 {
		return ErrMaxSizeReached
	}
	for remaining := n; remaining > 0; remaining-- {
		b.offsets = append(b.offsets, uint32(len(b.data)))
		b.data = append(b.data, lastValue...)
	}
	b.appendValid(n)
	return nil
}
// ResetToLength truncates the builder to n entries, cutting the data at the
// start offset of entry n. It does nothing when n equals the current length.
func (b *OptBinaryBuilder) ResetToLength(n int) {
	if n != b.length {
		b.length = n
		cut := b.offsets[n]
		b.data = b.data[:cut]
		b.offsets = b.offsets[:n]
		b.validityBitmap = resizeBitmap(b.validityBitmap, n)
	}
}
// Value returns the bytes of entry i as a sub-slice of the builder's data (no
// copy). The last entry extends to the end of the data; any other entry ends
// where the next one starts.
func (b *OptBinaryBuilder) Value(i int) []byte {
	start := b.offsets[i]
	if i != b.length-1 {
		return b.data[start:b.offsets[i+1]]
	}
	return b.data[start:]
}
// OptInt64Builder accumulates int64 values.
type OptInt64Builder struct {
builderBase
data []int64 // one slot per entry, including null entries
}
// NewOptInt64Builder returns an empty OptInt64Builder that will produce
// arrays of the given data type.
func NewOptInt64Builder(dtype arrow.DataType) *OptInt64Builder {
	return &OptInt64Builder{
		builderBase: builderBase{dtype: dtype},
	}
}
// resizeData sets len(b.data) to neededLength. If the capacity is too small a
// new slice sized to the next power of two is allocated and the existing
// values are copied over.
func (b *OptInt64Builder) resizeData(neededLength int) {
	if neededLength > cap(b.data) {
		grown := make([]int64, bitutil.NextPowerOf2(neededLength))
		copy(grown, b.data)
		b.data = grown
	}
	b.data = b.data[:neededLength]
}
// Release atomically decrements the reference counter; when it reaches zero
// the data and base state are dropped.
func (b *OptInt64Builder) Release() {
	if atomic.AddInt64(&b.refCount, -1) != 0 {
		return
	}
	b.data = nil
	b.releaseInternal()
}
// AppendNull appends a single null entry.
func (b *OptInt64Builder) AppendNull() {
	const one = 1
	b.AppendNulls(one)
}
// AppendEmptyValue appends a valid entry holding zero.
func (b *OptInt64Builder) AppendEmptyValue() {
	var zero int64
	b.Append(zero)
}
// AppendNulls appends n null entries: the data slice is extended by n slots
// and the base builder clears the matching validity bits.
func (b *OptInt64Builder) AppendNulls(n int) {
	total := b.length + n
	b.resizeData(total)
	b.builderBase.AppendNulls(n)
}
// NewArray builds an int64 arrow array from the accumulated entries and
// leaves the builder empty. The null count is the number of entries minus the
// number of set validity bits.
func (b *OptInt64Builder) NewArray() arrow.Array {
	// Reinterpret the []int64 values as raw bytes without copying.
	dataAsBytes := unsafe.Slice((*byte)(unsafe.Pointer(unsafe.SliceData(b.data))), len(b.data)*arrow.Int64SizeBytes)
	data := array.NewData(
		b.dtype,
		b.length,
		[]*memory.Buffer{
			memory.NewBufferBytes(b.validityBitmap),
			memory.NewBufferBytes(dataAsBytes),
		},
		nil,
		b.length-bitutil.CountSetBits(b.validityBitmap, 0, b.length),
		0,
	)
	b.reset()
	// The validity buffer above is built directly from the builder's bitmap.
	// reset only truncates that slice, so reusing it would let later appends
	// overwrite bits the returned array still reads. Drop it so the builder
	// allocates fresh storage.
	b.validityBitmap = nil
	b.data = nil
	return array.NewInt64Data(data)
}
// AppendData appends every value in data and marks all of them valid.
func (b *OptInt64Builder) AppendData(data []int64) {
	first, count := b.length, len(data)
	b.data = append(b.data, data...)
	b.length = first + count
	b.validityBitmap = resizeBitmap(b.validityBitmap, b.length)
	bitutil.SetBitsTo(b.validityBitmap, int64(first), int64(count), true)
}
// Append adds v as one valid entry.
func (b *OptInt64Builder) Append(v int64) {
	idx := b.length
	b.data = append(b.data, v)
	b.length = idx + 1
	b.validityBitmap = resizeBitmap(b.validityBitmap, b.length)
	bitutil.SetBit(b.validityBitmap, idx)
}
// Set overwrites the value stored at index i. The validity bitmap is not
// touched.
func (b *OptInt64Builder) Set(i int, v int64) {
	slot := &b.data[i]
	*slot = v
}
// Add increments the value stored at index i by v.
func (b *OptInt64Builder) Add(i int, v int64) {
	slot := &b.data[i]
	*slot += v
}
// Value returns the value stored at index i, regardless of its validity bit.
func (b *OptInt64Builder) Value(i int) int64 {
	v := b.data[i]
	return v
}
// AppendParquetValues appends the Int64 of every parquet value. An entry's
// validity bit is set unless the value reports IsNull.
func (b *OptInt64Builder) AppendParquetValues(values []parquet.Value) {
	base := b.length
	total := base + len(values)
	b.resizeData(total)
	b.validityBitmap = resizeBitmap(b.validityBitmap, total)
	for j := range values {
		pos := base + j
		b.data[pos] = values[j].Int64()
		bitutil.SetBitTo(b.validityBitmap, pos, !values[j].IsNull())
	}
	b.length = total
}
// RepeatLastValue appends n copies of the last entry, or n nulls if the last
// entry is null. It always returns nil.
//
// The builder must already hold at least one entry: the code inspects the
// entry at index length-1.
func (b *OptInt64Builder) RepeatLastValue(n int) error {
	lastIdx := b.length - 1
	if b.IsNull(lastIdx) {
		b.AppendNulls(n)
		return nil
	}

	last := b.data[lastIdx]
	end := b.length + n
	b.resizeData(end)
	for i := b.length; i < end; i++ {
		b.data[i] = last
	}
	b.appendValid(n)
	return nil
}
// ResetToLength truncates the builder to n entries. It does nothing when n
// equals the current length.
func (b *OptInt64Builder) ResetToLength(n int) {
	if n != b.length {
		b.length = n
		b.data = b.data[:n]
		b.validityBitmap = resizeBitmap(b.validityBitmap, n)
	}
}
// OptBooleanBuilder accumulates boolean values.
type OptBooleanBuilder struct {
builderBase
data []byte // bit-packed values, one bit per entry
}
// NewOptBooleanBuilder returns an empty OptBooleanBuilder that will produce
// arrays of the given data type.
func NewOptBooleanBuilder(dtype arrow.DataType) *OptBooleanBuilder {
	return &OptBooleanBuilder{
		builderBase: builderBase{dtype: dtype},
	}
}
// Release atomically decrements the reference counter; when it reaches zero
// the data and base state are dropped.
func (b *OptBooleanBuilder) Release() {
	if atomic.AddInt64(&b.refCount, -1) != 0 {
		return
	}
	b.data = nil
	b.releaseInternal()
}
// AppendNull appends a single null entry.
func (b *OptBooleanBuilder) AppendNull() {
	const one = 1
	b.AppendNulls(one)
}
// AppendEmptyValue appends a valid entry holding false.
func (b *OptBooleanBuilder) AppendEmptyValue() {
	var zero bool
	b.AppendSingle(zero)
}
// AppendNulls appends n null entries, clearing both the value bit and the
// validity bit of each.
func (b *OptBooleanBuilder) AppendNulls(n int) {
	total := b.length + n
	b.data = resizeBitmap(b.data, total)
	b.validityBitmap = resizeBitmap(b.validityBitmap, total)
	for ; b.length < total; b.length++ {
		bitutil.ClearBit(b.data, b.length)
		bitutil.ClearBit(b.validityBitmap, b.length)
	}
}
// NewArray builds a boolean arrow array from the accumulated entries and
// leaves the builder empty. The null count is the number of entries minus the
// number of set validity bits.
func (b *OptBooleanBuilder) NewArray() arrow.Array {
	data := array.NewData(
		b.dtype,
		b.length,
		[]*memory.Buffer{
			memory.NewBufferBytes(b.validityBitmap),
			memory.NewBufferBytes(b.data),
		},
		nil,
		b.length-bitutil.CountSetBits(b.validityBitmap, 0, b.length),
		0,
	)
	b.reset()
	// The validity buffer above is built directly from the builder's bitmap.
	// reset only truncates that slice, so reusing it would let later appends
	// overwrite bits the returned array still reads. Drop it so the builder
	// allocates fresh storage.
	b.validityBitmap = nil
	b.data = nil
	// Return directly rather than through a local named "array", which would
	// shadow the imported package.
	return array.NewBooleanData(data)
}
// Append copies the first `valid` bits of the bit-packed slice data into the
// builder and marks every one of them valid.
func (b *OptBooleanBuilder) Append(data []byte, valid int) {
	total := b.length + valid
	b.data = resizeBitmap(b.data, total)
	b.validityBitmap = resizeBitmap(b.validityBitmap, total)
	for src := 0; src < valid; src++ {
		bitutil.SetBitTo(b.data, b.length, bitutil.BitIsSet(data, src))
		bitutil.SetBit(b.validityBitmap, b.length)
		b.length++
	}
}
// Set overwrites the value bit at index i. The validity bitmap is not
// touched.
func (b *OptBooleanBuilder) Set(i int, v bool) {
	if v {
		bitutil.SetBit(b.data, i)
		return
	}
	bitutil.ClearBit(b.data, i)
}
// Value returns the value bit at index i, regardless of its validity bit.
func (b *OptBooleanBuilder) Value(i int) bool {
	return !bitutil.BitIsNotSet(b.data, i)
}
// AppendData is not supported on the boolean builder and always panics; use
// Append instead.
func (b *OptBooleanBuilder ) AppendData (_ []byte ) {
panic ("do not use AppendData for opt boolean builder, use Append instead" )
}
// AppendParquetValues appends the Boolean of every parquet value. An entry's
// validity bit is set unless the value reports IsNull, matching the other
// builders in this file. Previously every entry was marked valid, so null
// parquet values showed up as non-null.
func (b *OptBooleanBuilder) AppendParquetValues(values []parquet.Value) {
	n := b.length + len(values)
	b.data = resizeBitmap(b.data, n)
	b.validityBitmap = resizeBitmap(b.validityBitmap, n)
	for _, v := range values {
		// NOTE(review): Boolean is still read for null values, mirroring how
		// the numeric builders read Int64/Int32/Double; confirm it is safe
		// for nulls.
		bitutil.SetBitTo(b.data, b.length, v.Boolean())
		bitutil.SetBitTo(b.validityBitmap, b.length, !v.IsNull())
		b.length++
	}
}
// AppendSingle adds v as one valid entry.
func (b *OptBooleanBuilder) AppendSingle(v bool) {
	idx := b.length
	b.length = idx + 1
	b.data = resizeBitmap(b.data, b.length)
	b.validityBitmap = resizeBitmap(b.validityBitmap, b.length)
	bitutil.SetBitTo(b.data, idx, v)
	bitutil.SetBit(b.validityBitmap, idx)
}
// RepeatLastValue appends n copies of the last entry, or n nulls if the last
// entry is null. It always returns nil.
//
// The builder must already hold at least one entry: the code inspects the
// entry at index length-1.
func (b *OptBooleanBuilder) RepeatLastValue(n int) error {
	lastIdx := b.length - 1
	if b.IsNull(lastIdx) {
		b.AppendNulls(n)
		return nil
	}

	last := bitutil.BitIsSet(b.data, lastIdx)
	b.data = resizeBitmap(b.data, b.length+n)
	bitutil.SetBitsTo(b.data, int64(b.length), int64(n), last)
	b.appendValid(n)
	return nil
}
// ResetToLength truncates the builder to n entries. It does nothing when n
// equals the current length.
func (b *OptBooleanBuilder) ResetToLength(n int) {
	if n != b.length {
		b.length = n
		b.data = resizeBitmap(b.data, n)
		b.validityBitmap = resizeBitmap(b.validityBitmap, n)
	}
}
// OptInt32Builder accumulates int32 values.
type OptInt32Builder struct {
builderBase
data []int32 // one slot per entry, including null entries
}
// NewOptInt32Builder returns an empty OptInt32Builder that will produce
// arrays of the given data type.
func NewOptInt32Builder(dtype arrow.DataType) *OptInt32Builder {
	return &OptInt32Builder{
		builderBase: builderBase{dtype: dtype},
	}
}
// resizeData sets len(b.data) to neededLength. If the capacity is too small a
// new slice sized to the next power of two is allocated and the existing
// values are copied over.
func (b *OptInt32Builder) resizeData(neededLength int) {
	if neededLength > cap(b.data) {
		grown := make([]int32, bitutil.NextPowerOf2(neededLength))
		copy(grown, b.data)
		b.data = grown
	}
	b.data = b.data[:neededLength]
}
// Release atomically decrements the reference counter; when it reaches zero
// the data and base state are dropped.
func (b *OptInt32Builder) Release() {
	if atomic.AddInt64(&b.refCount, -1) != 0 {
		return
	}
	b.data = nil
	b.releaseInternal()
}
// AppendNull appends a single null entry.
func (b *OptInt32Builder) AppendNull() {
	const one = 1
	b.AppendNulls(one)
}
// AppendEmptyValue appends a valid entry holding zero.
func (b *OptInt32Builder) AppendEmptyValue() {
	var zero int32
	b.Append(zero)
}
// AppendNulls appends n null entries: the data slice is extended by n slots
// and the base builder clears the matching validity bits.
func (b *OptInt32Builder) AppendNulls(n int) {
	total := b.length + n
	b.resizeData(total)
	b.builderBase.AppendNulls(n)
}
// NewArray builds an int32 arrow array from the accumulated entries and
// leaves the builder empty. The null count is the number of entries minus the
// number of set validity bits.
func (b *OptInt32Builder) NewArray() arrow.Array {
	// Reinterpret the []int32 values as raw bytes without copying.
	dataAsBytes := unsafe.Slice((*byte)(unsafe.Pointer(unsafe.SliceData(b.data))), len(b.data)*arrow.Int32SizeBytes)
	data := array.NewData(
		b.dtype,
		b.length,
		[]*memory.Buffer{
			memory.NewBufferBytes(b.validityBitmap),
			memory.NewBufferBytes(dataAsBytes),
		},
		nil,
		b.length-bitutil.CountSetBits(b.validityBitmap, 0, b.length),
		0,
	)
	b.reset()
	// The validity buffer above is built directly from the builder's bitmap.
	// reset only truncates that slice, so reusing it would let later appends
	// overwrite bits the returned array still reads. Drop it so the builder
	// allocates fresh storage.
	b.validityBitmap = nil
	b.data = nil
	return array.NewInt32Data(data)
}
// AppendData appends every value in data and marks all of them valid.
func (b *OptInt32Builder) AppendData(data []int32) {
	first, count := b.length, len(data)
	b.data = append(b.data, data...)
	b.length = first + count
	b.validityBitmap = resizeBitmap(b.validityBitmap, b.length)
	bitutil.SetBitsTo(b.validityBitmap, int64(first), int64(count), true)
}
// Append adds v as one valid entry.
func (b *OptInt32Builder) Append(v int32) {
	idx := b.length
	b.data = append(b.data, v)
	b.length = idx + 1
	b.validityBitmap = resizeBitmap(b.validityBitmap, b.length)
	bitutil.SetBit(b.validityBitmap, idx)
}
// Set overwrites the value stored at index i and marks that entry valid.
func (b *OptInt32Builder) Set(i int, v int32) {
	slot := &b.data[i]
	*slot = v
	bitutil.SetBitTo(b.validityBitmap, i, true)
}
// Swap exchanges the values stored at indexes i and j. The validity bitmap is
// not touched.
func (b *OptInt32Builder) Swap(i, j int) {
	vi, vj := b.data[i], b.data[j]
	b.data[i], b.data[j] = vj, vi
}
// Add increments the value stored at index i by v.
func (b *OptInt32Builder) Add(i int, v int32) {
	slot := &b.data[i]
	*slot += v
}
// Value returns the value stored at index i, regardless of its validity bit.
func (b *OptInt32Builder) Value(i int) int32 {
	v := b.data[i]
	return v
}
// AppendParquetValues appends the Int32 of every parquet value. An entry's
// validity bit is set unless the value reports IsNull.
func (b *OptInt32Builder) AppendParquetValues(values []parquet.Value) {
	base := b.length
	total := base + len(values)
	b.resizeData(total)
	b.validityBitmap = resizeBitmap(b.validityBitmap, total)
	for j := range values {
		pos := base + j
		b.data[pos] = values[j].Int32()
		bitutil.SetBitTo(b.validityBitmap, pos, !values[j].IsNull())
	}
	b.length = total
}
// RepeatLastValue appends n copies of the last entry, or n nulls if the last
// entry is null. It always returns nil.
//
// The builder must already hold at least one entry: the code inspects the
// entry at index length-1.
func (b *OptInt32Builder) RepeatLastValue(n int) error {
	lastIdx := b.length - 1
	if b.IsNull(lastIdx) {
		b.AppendNulls(n)
		return nil
	}

	last := b.data[lastIdx]
	end := b.length + n
	b.resizeData(end)
	for i := b.length; i < end; i++ {
		b.data[i] = last
	}
	b.appendValid(n)
	return nil
}
// ResetToLength truncates the builder to n entries. It does nothing when n
// equals the current length.
func (b *OptInt32Builder) ResetToLength(n int) {
	if n != b.length {
		b.length = n
		b.data = b.data[:n]
		b.validityBitmap = resizeBitmap(b.validityBitmap, n)
	}
}
// Reserve sets the builder's length to n, resizing both the data slice and
// the validity bitmap to cover n entries. Validity bits are not written here.
// NOTE(review): presumably callers then fill the slots through Set, which
// marks each written entry valid; confirm against callers.
func (b *OptInt32Builder) Reserve(n int) {
	b.length = n
	grown := slices.Grow(b.data, n)
	b.data = grown[:n]
	b.validityBitmap = resizeBitmap(b.validityBitmap, n)
}
// OptFloat64Builder accumulates float64 values.
type OptFloat64Builder struct {
builderBase
data []float64 // one slot per entry, including null entries
}
// NewOptFloat64Builder returns an empty OptFloat64Builder that will produce
// arrays of the given data type.
func NewOptFloat64Builder(dtype arrow.DataType) *OptFloat64Builder {
	return &OptFloat64Builder{
		builderBase: builderBase{dtype: dtype},
	}
}
// resizeData sets len(b.data) to neededLength. If the capacity is too small a
// new slice sized to the next power of two is allocated and the existing
// values are copied over.
func (b *OptFloat64Builder) resizeData(neededLength int) {
	if neededLength > cap(b.data) {
		grown := make([]float64, bitutil.NextPowerOf2(neededLength))
		copy(grown, b.data)
		b.data = grown
	}
	b.data = b.data[:neededLength]
}
// Release atomically decrements the reference counter; when it reaches zero
// the data and base state are dropped.
func (b *OptFloat64Builder) Release() {
	if atomic.AddInt64(&b.refCount, -1) != 0 {
		return
	}
	b.data = nil
	b.releaseInternal()
}
// AppendNull appends a single null entry.
func (b *OptFloat64Builder) AppendNull() {
	const one = 1
	b.AppendNulls(one)
}
// AppendEmptyValue appends a valid entry holding zero.
func (b *OptFloat64Builder) AppendEmptyValue() {
	var zero float64
	b.Append(zero)
}
// AppendNulls appends n null entries: the data slice is extended by n slots
// and the base builder clears the matching validity bits.
func (b *OptFloat64Builder) AppendNulls(n int) {
	total := b.length + n
	b.resizeData(total)
	b.builderBase.AppendNulls(n)
}
// NewArray builds a float64 arrow array from the accumulated entries and
// leaves the builder empty. The null count is the number of entries minus the
// number of set validity bits.
func (b *OptFloat64Builder) NewArray() arrow.Array {
	// Reinterpret the []float64 values as raw bytes without copying.
	dataAsBytes := unsafe.Slice((*byte)(unsafe.Pointer(unsafe.SliceData(b.data))), len(b.data)*arrow.Float64SizeBytes)
	data := array.NewData(
		b.dtype,
		b.length,
		[]*memory.Buffer{
			memory.NewBufferBytes(b.validityBitmap),
			memory.NewBufferBytes(dataAsBytes),
		},
		nil,
		b.length-bitutil.CountSetBits(b.validityBitmap, 0, b.length),
		0,
	)
	b.reset()
	// The validity buffer above is built directly from the builder's bitmap.
	// reset only truncates that slice, so reusing it would let later appends
	// overwrite bits the returned array still reads. Drop it so the builder
	// allocates fresh storage.
	b.validityBitmap = nil
	b.data = nil
	return array.NewFloat64Data(data)
}
// AppendData appends every value in data and marks all of them valid.
func (b *OptFloat64Builder) AppendData(data []float64) {
	first, count := b.length, len(data)
	b.data = append(b.data, data...)
	b.length = first + count
	b.validityBitmap = resizeBitmap(b.validityBitmap, b.length)
	bitutil.SetBitsTo(b.validityBitmap, int64(first), int64(count), true)
}
// Append adds v as one valid entry.
func (b *OptFloat64Builder) Append(v float64) {
	idx := b.length
	b.data = append(b.data, v)
	b.length = idx + 1
	b.validityBitmap = resizeBitmap(b.validityBitmap, b.length)
	bitutil.SetBit(b.validityBitmap, idx)
}
// Set overwrites the value stored at index i. The validity bitmap is not
// touched.
func (b *OptFloat64Builder) Set(i int, v float64) {
	slot := &b.data[i]
	*slot = v
}
// Add increments the value stored at index i by v.
func (b *OptFloat64Builder) Add(i int, v float64) {
	slot := &b.data[i]
	*slot += v
}
// Value returns the value stored at index i, regardless of its validity bit.
func (b *OptFloat64Builder) Value(i int) float64 {
	v := b.data[i]
	return v
}
// AppendParquetValues appends the Double of every parquet value. An entry's
// validity bit is set unless the value reports IsNull.
func (b *OptFloat64Builder) AppendParquetValues(values []parquet.Value) {
	base := b.length
	total := base + len(values)
	b.resizeData(total)
	b.validityBitmap = resizeBitmap(b.validityBitmap, total)
	for j := range values {
		pos := base + j
		b.data[pos] = values[j].Double()
		bitutil.SetBitTo(b.validityBitmap, pos, !values[j].IsNull())
	}
	b.length = total
}
// RepeatLastValue appends n copies of the last entry, or n nulls if the last
// entry is null. It always returns nil.
//
// The builder must already hold at least one entry: the code inspects the
// entry at index length-1.
func (b *OptFloat64Builder) RepeatLastValue(n int) error {
	lastIdx := b.length - 1
	if b.IsNull(lastIdx) {
		b.AppendNulls(n)
		return nil
	}

	last := b.data[lastIdx]
	end := b.length + n
	b.resizeData(end)
	for i := b.length; i < end; i++ {
		b.data[i] = last
	}
	b.appendValid(n)
	return nil
}
// ResetToLength truncates the builder to n entries. It does nothing when n
// equals the current length.
func (b *OptFloat64Builder) ResetToLength(n int) {
	if n != b.length {
		b.length = n
		b.data = b.data[:n]
		b.validityBitmap = resizeBitmap(b.validityBitmap, n)
	}
}
// The pages are generated with Golds v0.8.2. (GOOS=linux GOARCH=amd64)
// Golds is a Go 101 project developed by Tapir Liu.
// PR and bug reports are welcome and can be submitted to the issue list.
// Please follow @zigo_101 (reachable from the left QR code) to get the latest news of Golds.