// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package schema provides types and functions for manipulating and building parquet
// file schemas.
//
// Some of the utilities provided include building a schema using Struct Tags
// on a struct type, getting Column Paths from a node, and dealing with the
// converted and logical types for Parquet.
//
// Logical types specify ways to interpret the primitive types allowing the
// number of primitive types to be smaller and reuse efficient encodings.
// For instance a "string" is just a ByteArray column with a UTF-8 annotation
// or "String Logical Type".
//
// For more information about Logical and Converted Types, check:
// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md
package schema

// NOTE(review): the import paths were missing from the damaged source (only
// the "format" alias survived). The stdlib packages follow directly from the
// identifiers used below; the third-party module paths are assumed to be the
// arrow-go v18 layout -- confirm against go.mod before merging.
import (
	"fmt"
	"io"
	"iter"
	"slices"
	"strings"

	"github.com/apache/arrow-go/v18/parquet"
	format "github.com/apache/arrow-go/v18/parquet/internal/gen-go/parquet"
	"golang.org/x/xerrors"
)

// Schema is the container for the converted Parquet schema with a computed
// information from the schema analysis needed for file reading
//
// * Column index to Node
//
// * Max repetition / definition levels for each primitive node
//
// The ColumnDescriptor objects produced by this class can be used to assist in
// the reconstruction of fully materialized data structures from the
// repetition-definition level encoding of nested data
type Schema struct {
	// root is the top-level node of the schema tree; Root and
	// HasRepeatedFields assert it to *GroupNode.
	root Node
	// leaves holds one Column per primitive node, in the order buildTree
	// encounters them.
	leaves []*Column
	// nodeToLeaf maps a primitive node to its index in leaves.
	nodeToLeaf map[*PrimitiveNode]int
	// leafToBase maps a leaf index to the top-level field it was found under.
	leafToBase map[int]Node
	// leafToIndex maps a node path to every leaf index registered under it.
	leafToIndex strIntMultimap
}

// FromParquet converts a slice of thrift Schema Elements to the correct node type
//
// The elements are consumed in order: an element reporting zero children
// becomes a primitive node, any other element becomes a group node whose
// children are the elements that follow it. An error is returned when the
// slice is empty, when the root has no children but more elements follow, or
// when the elements run out before every announced child has been read.
func FromParquet(elems []*format.SchemaElement) (Node, error) {
	if len(elems) == 0 {
		return nil, xerrors.New("parquet: empty schema (no root)")
	}

	if elems[0].GetNumChildren() == 0 {
		if len(elems) > 1 {
			return nil, xerrors.New("parquet: schema had multiple nodes but root had no children")
		}
		// parquet file with no columns
		return GroupNodeFromThrift(elems[0], []Node{})
	}

	// We don't check that the root node is repeated since this is not
	// consistently set by implementations
	var (
		pos      = 0
		nextNode func() (Node, error)
	)

	// nextNode is declared before it is assigned so the closure can call
	// itself recursively; pos is the shared cursor into elems.
	nextNode = func() (Node, error) {
		if pos == len(elems) {
			return nil, xerrors.New("parquet: malformed schema: not enough elements")
		}

		elem := elems[pos]
		pos++

		if elem.GetNumChildren() == 0 {
			return PrimitiveNodeFromThrift(elem)
		}

		fields := make([]Node, 0, elem.GetNumChildren())
		for i := 0; i < int(elem.GetNumChildren()); i++ {
			n, err := nextNode()
			if err != nil {
				return nil, err
			}
			fields = append(fields, n)
		}

		return GroupNodeFromThrift(elem, fields)
	}

	return nextNode()
}

// Root returns the group node that is the root of this schema
func (s *Schema) Root() *GroupNode {
	return s.root.(*GroupNode)
}

// NumColumns returns the number of leaf nodes that are the actual primitive
// columns in this schema.
func (s *Schema) NumColumns() int {
	return len(s.leaves)
}

// Equals returns true as long as the leaf columns are equal, doesn't take
// into account the groups and only checks whether the schemas are compatible
// at the physical storage level.
func (s *Schema) Equals(rhs *Schema) bool {
	if s.NumColumns() != rhs.NumColumns() {
		return false
	}

	for idx, c := range s.leaves {
		if !c.Equals(rhs.Column(idx)) {
			return false
		}
	}
	return true
}

// buildTree walks n recursively and registers every primitive node it finds
// as a leaf column. The two level counters are passed by value, so each
// recursion level only sees the increments made by its ancestors. base is
// forwarded unchanged and recorded in leafToBase for each leaf.
func (s *Schema) buildTree(n Node, maxDefLvl, maxRepLvl int16, base Node) {
	switch n.RepetitionType() {
	case parquet.Repetitions.Repeated:
		maxRepLvl++
		// a repeated node also bumps the definition level
		fallthrough
	case parquet.Repetitions.Optional:
		maxDefLvl++
	}

	switch n := n.(type) {
	case *GroupNode:
		for _, f := range n.fields {
			s.buildTree(f, maxDefLvl, maxRepLvl, base)
		}
	case *PrimitiveNode:
		s.nodeToLeaf[n] = len(s.leaves)
		s.leaves = append(s.leaves, NewColumn(n, maxDefLvl, maxRepLvl))
		s.leafToBase[len(s.leaves)-1] = base
		s.leafToIndex.Add(n.Path(), len(s.leaves)-1)
	}
}

// Column returns the (0-indexed) column of the provided index.
func (s *Schema) Column(i int) *Column {
	return s.leaves[i]
}

// Columns returns an iterator over the leaf columns of the schema
func (s *Schema) Columns() iter.Seq2[int, *Column] {
	return slices.All(s.leaves)
}

// ColumnIndexByName looks up the column by its full dot separated
// node path. If there are multiple columns that match, it returns the first one.
//
// Returns -1 if not found.
func (s *Schema) ColumnIndexByName(nodePath string) int {
	if ret, ok := s.leafToIndex[nodePath]; ok {
		return ret[0]
	}
	return -1
}

// ColumnIndexByNode returns the index of the column represented by this node.
//
// Returns -1 if not found.
func (s *Schema) ColumnIndexByNode(n Node) int {
	if ret, ok := s.leafToIndex[n.Path()]; ok {
		// several leaves may share a path, so compare node identity
		for _, idx := range ret {
			if n == s.Column(idx).SchemaNode() {
				return idx
			}
		}
	}
	return -1
}

// ColumnRoot returns the root node of a given column if it is under a
// nested group node, providing that root group node.
func (s *Schema) ColumnRoot(i int) Node {
	return s.leafToBase[i]
}

// HasRepeatedFields returns true if any node in the schema has a repeated field type.
func (s *Schema) HasRepeatedFields() bool {
	return s.root.(*GroupNode).HasRepeatedFields()
}

// UpdateColumnOrders must get a slice that is the same length as the number of leaf columns
// and is used to update the schema metadata Column Orders. len(orders) must equal s.NumColumns()
func (s *Schema) UpdateColumnOrders(orders []parquet.ColumnOrder) error {
	if len(orders) != s.NumColumns() {
		return xerrors.New("parquet: malformed schema: not enough ColumnOrder values")
	}

	visitor := schemaColumnOrderUpdater{orders, 0}
	s.root.Visit(&visitor)
	return nil
}

// String returns the output of PrintSchema for the root node, using an
// indent width of 2.
func (s *Schema) String() string {
	var b strings.Builder
	PrintSchema(s.root, &b, 2)
	return b.String()
}

// NewSchema constructs a new Schema object from a root group node.
//
// Any fields with a field-id of -1 will be given an appropriate field number based on their order.
func NewSchema(root *GroupNode) *Schema {
	s := &Schema{
		root,
		make([]*Column, 0),
		make(map[*PrimitiveNode]int),
		make(map[int]Node),
		make(strIntMultimap),
	}

	// each top-level field is its own base for the leaves found beneath it
	for _, f := range root.fields {
		s.buildTree(f, 0, 0, f)
	}
	return s
}

// schemaColumnOrderUpdater is a visitor that assigns colOrders to the
// primitive nodes it visits, one entry per primitive node in visit order.
//
// NOTE(review): the visitor method names (VisitPre / VisitPost) were missing
// from the damaged source; they are restored on the assumption that they
// match the Visitor interface accepted by Node.Visit -- confirm.
type schemaColumnOrderUpdater struct {
	colOrders []parquet.ColumnOrder
	leafCount int
}

// VisitPre stores the next column order on n when n is a primitive node and
// always returns true.
func (s *schemaColumnOrderUpdater) VisitPre(n Node) bool {
	if n.Type() == Primitive {
		leaf := n.(*PrimitiveNode)
		leaf.ColumnOrder = s.colOrders[s.leafCount]
		s.leafCount++
	}
	return true
}

// VisitPost is a no-op.
func (s *schemaColumnOrderUpdater) VisitPost(Node) {}

// toThriftVisitor is a visitor that collects the thrift representation of
// every node it visits.
type toThriftVisitor struct {
	elements []*format.SchemaElement
}

// VisitPre appends the thrift element for n and always returns true.
func (t *toThriftVisitor) VisitPre(n Node) bool {
	t.elements = append(t.elements, n.toThrift())
	return true
}

// VisitPost is a no-op.
func (t *toThriftVisitor) VisitPost(Node) {}

// ToThrift converts a GroupNode to a slice of SchemaElements which is used
// for thrift serialization.
func ToThrift(grp *GroupNode) []*format.SchemaElement {
	t := &toThriftVisitor{make([]*format.SchemaElement, 0)}
	grp.Visit(t)
	return t.elements
}

// schemaPrinter is a visitor that writes a textual rendering of the nodes it
// visits to w. indent is the current number of leading spaces and indentWidth
// is the amount added or removed around each group.
type schemaPrinter struct {
	w           io.Writer
	indent      int
	indentWidth int
}

// VisitPre writes the opening line for n at the current indent: a
// "<repetition> group field_id=<id> <name> ... {" header for groups, which
// also increases the indent, or a "<repetition> <physical type>
// field_id=<id> <name> ...;" line for primitives. The logical type is printed
// in parentheses when it is set and is neither UnknownLogicalType nor
// NoLogicalType; otherwise the converted type is printed when it is not None.
// It always returns true.
func (s *schemaPrinter) VisitPre(n Node) bool {
	fmt.Fprint(s.w, strings.Repeat(" ", s.indent))
	if n.Type() == Group {
		g := n.(*GroupNode)
		fmt.Fprintf(s.w, "%s group field_id=%d %s", g.RepetitionType(), g.FieldID(), g.Name())

		_, invalid := g.logicalType.(UnknownLogicalType)
		_, none := g.logicalType.(NoLogicalType)

		if g.logicalType != nil && !invalid && !none {
			fmt.Fprintf(s.w, " (%s)", g.logicalType)
		} else if g.convertedType != ConvertedTypes.None {
			fmt.Fprintf(s.w, " (%s)", g.convertedType)
		}

		fmt.Fprintln(s.w, " {")
		s.indent += s.indentWidth
	} else {
		p := n.(*PrimitiveNode)
		fmt.Fprintf(s.w, "%s %s field_id=%d %s", p.RepetitionType(), strings.ToLower(p.PhysicalType().String()), p.FieldID(), p.Name())

		_, invalid := p.logicalType.(UnknownLogicalType)
		_, none := p.logicalType.(NoLogicalType)

		if p.logicalType != nil && !invalid && !none {
			fmt.Fprintf(s.w, " (%s)", p.logicalType)
		} else if p.convertedType == ConvertedTypes.Decimal {
			// decimals additionally print their precision and scale
			fmt.Fprintf(s.w, " (%s(%d,%d))", p.convertedType, p.DecimalMetadata().Precision, p.DecimalMetadata().Scale)
		} else if p.convertedType != ConvertedTypes.None {
			fmt.Fprintf(s.w, " (%s)", p.convertedType)
		}

		fmt.Fprintln(s.w, ";")
	}
	return true
}

// VisitPost closes a group: it removes the indent added by VisitPre and
// writes the closing brace. It does nothing for non-group nodes.
func (s *schemaPrinter) VisitPost(n Node) {
	if n.Type() == Group {
		s.indent -= s.indentWidth
		fmt.Fprint(s.w, strings.Repeat(" ", s.indent))
		fmt.Fprintln(s.w, "}")
	}
}

// PrintSchema writes a string representation of the tree to w using the indent
// width provided.
func PrintSchema(n Node, w io.Writer, indentWidth int) {
	n.Visit(&schemaPrinter{w, 0, indentWidth})
}

// strIntMultimap maps a string key to every int value added under it, in
// insertion order.
type strIntMultimap map[string][]int

// Add records value under key. It returns false when key was not present
// before the call and true when value was appended to an existing entry.
func (s strIntMultimap) Add(key string, value int) bool {
	if _, ok := s[key]; !ok {
		s[key] = []int{value}
		return false
	}
	s[key] = append(s[key], value)
	return true
}