⚠ This page is served via a proxy. Original site: https://github.com
This service does not collect credentials or authentication data.
Skip to content
63 changes: 63 additions & 0 deletions ice/src/main/java/com/altinity/ice/cli/Main.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import com.altinity.ice.cli.internal.cmd.DeleteNamespace;
import com.altinity.ice.cli.internal.cmd.DeleteTable;
import com.altinity.ice.cli.internal.cmd.Describe;
import com.altinity.ice.cli.internal.cmd.DescribeParquet;
import com.altinity.ice.cli.internal.cmd.Insert;
import com.altinity.ice.cli.internal.cmd.InsertWatch;
import com.altinity.ice.cli.internal.cmd.Scan;
Expand Down Expand Up @@ -142,6 +143,68 @@ void describe(
}
}

/**
 * CLI entry point for {@code ice describe-parquet}: prints parquet file metadata
 * (summary, columns, row groups) as YAML, or JSON when {@code --json} is given.
 * Defaults to the summary section when no section flag is set.
 *
 * @throws IOException if the catalog cannot be loaded or the file cannot be read
 */
@CommandLine.Command(name = "describe-parquet", description = "Describe parquet file metadata.")
void describeParquet(
    @CommandLine.Parameters(
            arity = "1",
            paramLabel = "<target>",
            description = "Path to parquet file")
        String target,
    @CommandLine.Option(
            names = {"-a", "--all"},
            description = "Show everything")
        boolean showAll,
    @CommandLine.Option(
            names = {"-s", "--summary"},
            description = "Show size, rows, number of row groups, compressed size, etc.")
        boolean showSummary,
    @CommandLine.Option(
            names = {"--columns"},
            description = "Show columns")
        boolean showColumns,
    @CommandLine.Option(
            names = {"-r", "--row-groups"},
            description = "Show row groups")
        boolean showRowGroups,
    @CommandLine.Option(
            names = {"-d", "--row-group-details"},
            description = "Show column stats within row group")
        boolean showRowGroupDetails,
    @CommandLine.Option(
            names = {"--json"},
            description = "Output JSON instead of YAML")
        boolean json,
    @CommandLine.Option(names = {"--s3-region"}) String s3Region,
    @CommandLine.Option(
            names = {"--s3-no-sign-request"},
            description = "Access S3 files without authentication")
        boolean s3NoSignRequest)
    throws IOException {
  setAWSRegion(s3Region);
  try (RESTCatalog catalog = loadCatalog()) {
    var options = new ArrayList<DescribeParquet.Option>();
    if (showAll) {
      // Option.ALL is expanded by DescribeParquet itself; no need to enumerate sections here.
      options.add(DescribeParquet.Option.ALL);
    } else {
      if (showSummary) {
        options.add(DescribeParquet.Option.SUMMARY);
      }
      if (showColumns) {
        options.add(DescribeParquet.Option.COLUMNS);
      }
      if (showRowGroups) {
        options.add(DescribeParquet.Option.ROW_GROUPS);
      }
      if (showRowGroupDetails) {
        options.add(DescribeParquet.Option.ROW_GROUP_DETAILS);
      }
      // No section requested: fall back to the summary view.
      if (options.isEmpty()) {
        options.add(DescribeParquet.Option.SUMMARY);
      }
    }

    DescribeParquet.run(
        catalog, target, json, s3NoSignRequest, options.toArray(new DescribeParquet.Option[0]));
  }
}

public record IceSortOrder(
@JsonProperty("column") String column,
@JsonProperty("desc") boolean desc,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
/*
* Copyright (c) 2025 Altinity Inc and/or its affiliates. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*/
package com.altinity.ice.cli.internal.cmd;

import com.altinity.ice.cli.internal.iceberg.io.Input;
import com.altinity.ice.cli.internal.iceberg.parquet.Metadata;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.dataformat.yaml.YAMLFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import org.apache.iceberg.io.FileIO;
import org.apache.iceberg.io.InputFile;
import org.apache.iceberg.rest.RESTCatalog;
import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.FileMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.Type;
import software.amazon.awssdk.services.s3.S3Client;
import software.amazon.awssdk.services.s3.internal.crossregion.S3CrossRegionSyncClient;
import software.amazon.awssdk.utils.Lazy;

/**
 * Prints parquet file metadata (summary, schema columns, row groups, and per-row-group
 * column statistics) to stdout as YAML or JSON.
 *
 * <p>Utility class; not instantiable.
 */
public final class DescribeParquet {

  private DescribeParquet() {}

  /** Sections of metadata to include in the output. {@link #ALL} implies every other option. */
  public enum Option {
    ALL,
    SUMMARY,
    COLUMNS,
    ROW_GROUPS,
    ROW_GROUP_DETAILS
  }

  /**
   * Resolves {@code filePath} (possibly via the catalog / S3) and describes it.
   *
   * @param catalog catalog used to resolve table-relative paths
   * @param filePath path to the parquet file
   * @param json emit JSON instead of YAML
   * @param s3NoSignRequest access S3 without credentials
   * @param options sections to include; see {@link Option}
   * @throws IOException if the file cannot be opened or its footer cannot be read
   */
  public static void run(
      RESTCatalog catalog,
      String filePath,
      boolean json,
      boolean s3NoSignRequest,
      Option... options)
      throws IOException {

    Lazy<S3Client> s3ClientLazy =
        new Lazy<>(
            () ->
                new S3CrossRegionSyncClient(
                    com.altinity.ice.cli.internal.s3.S3.newClient(s3NoSignRequest)));
    // FileIO is Closeable; close it once the footer has been read to avoid leaking
    // the underlying client resources.
    // NOTE(review): the lazily-created S3Client itself is not closed here — presumably
    // acceptable for a short-lived CLI process; confirm if this is ever reused long-term.
    try (FileIO io = Input.newIO(filePath, null, s3ClientLazy)) {
      InputFile inputFile = Input.newFile(filePath, catalog, io);
      run(inputFile, json, options);
    }
  }

  /**
   * Reads the parquet footer of {@code inputFile} and prints the requested sections.
   *
   * @param inputFile parquet file to describe
   * @param json emit JSON instead of YAML
   * @param options sections to include; see {@link Option}
   * @throws IOException if the footer cannot be read or serialization fails
   */
  public static void run(InputFile inputFile, boolean json, Option... options) throws IOException {
    ParquetMetadata metadata = Metadata.read(inputFile);

    ParquetInfo info = extractParquetInfo(metadata, options);

    ObjectMapper mapper = json ? new ObjectMapper() : new ObjectMapper(new YAMLFactory());
    mapper.setSerializationInclusion(JsonInclude.Include.NON_NULL);
    String output = mapper.writeValueAsString(info);
    System.out.println(output);
  }

  /** Builds the serializable view of {@code metadata}, including only the requested sections. */
  private static ParquetInfo extractParquetInfo(ParquetMetadata metadata, Option... options) {
    var optionsSet = Set.of(options);
    boolean includeAll = optionsSet.contains(Option.ALL);

    FileMetaData fileMetadata = metadata.getFileMetaData();
    List<BlockMetaData> blocks = metadata.getBlocks();

    // Summary info: totals aggregated across all row groups.
    Summary summary = null;
    if (includeAll || optionsSet.contains(Option.SUMMARY)) {
      long totalRows = blocks.stream().mapToLong(BlockMetaData::getRowCount).sum();
      long compressedSize = blocks.stream().mapToLong(BlockMetaData::getCompressedSize).sum();
      long uncompressedSize = blocks.stream().mapToLong(BlockMetaData::getTotalByteSize).sum();

      summary =
          new Summary(
              totalRows,
              blocks.size(),
              compressedSize,
              uncompressedSize,
              fileMetadata.getCreatedBy(),
              fileMetadata.getSchema().getFieldCount());
    }

    // Column info: top-level schema fields only.
    List<Column> columns = null;
    if (includeAll || optionsSet.contains(Option.COLUMNS)) {
      columns = extractColumns(fileMetadata.getSchema());
    }

    // Row group info; ROW_GROUP_DETAILS implies ROW_GROUPS and adds per-column-chunk stats.
    List<RowGroup> rowGroups = null;
    if (includeAll
        || optionsSet.contains(Option.ROW_GROUPS)
        || optionsSet.contains(Option.ROW_GROUP_DETAILS)) {
      boolean includeDetails = includeAll || optionsSet.contains(Option.ROW_GROUP_DETAILS);
      rowGroups = extractRowGroups(blocks, includeDetails);
    }

    return new ParquetInfo(summary, columns, rowGroups);
  }

  /**
   * Maps the top-level fields of {@code schema} to {@link Column} records. Nested group fields
   * are reported with type {@code "GROUP"} and no logical type.
   */
  private static List<Column> extractColumns(MessageType schema) {
    List<Column> columns = new ArrayList<>();
    for (Type field : schema.getFields()) {
      String logicalType = null;
      if (field.isPrimitive()) {
        var annotation = field.asPrimitiveType().getLogicalTypeAnnotation();
        logicalType = annotation != null ? annotation.toString() : null;
      }
      columns.add(
          new Column(
              field.getName(),
              field.isPrimitive() ? field.asPrimitiveType().getPrimitiveTypeName().name() : "GROUP",
              field.getRepetition().name(),
              logicalType));
    }
    return columns;
  }

  /**
   * Maps each row group to a {@link RowGroup} record; when {@code includeDetails} is set, each
   * column chunk's size/encoding/codec and min/max/null statistics are included as well.
   */
  private static List<RowGroup> extractRowGroups(
      List<BlockMetaData> blocks, boolean includeDetails) {
    List<RowGroup> rowGroups = new ArrayList<>();

    for (int i = 0; i < blocks.size(); i++) {
      BlockMetaData block = blocks.get(i);

      List<ColumnChunk> columnChunks = null;
      if (includeDetails) {
        columnChunks = new ArrayList<>();
        for (ColumnChunkMetaData column : block.getColumns()) {
          Statistics<?> stats = column.getStatistics();

          // Stats may be absent or empty (e.g. written without statistics); omit in that case.
          ColumnStats columnStats = null;
          if (stats != null && !stats.isEmpty()) {
            long nulls = stats.isNumNullsSet() ? stats.getNumNulls() : 0;
            String min = null;
            String max = null;
            if (stats.hasNonNullValue()) {
              Object minVal = stats.genericGetMin();
              Object maxVal = stats.genericGetMax();
              min = minVal != null ? minVal.toString() : null;
              max = maxVal != null ? maxVal.toString() : null;
            }
            columnStats = new ColumnStats(nulls, min, max);
          }

          columnChunks.add(
              new ColumnChunk(
                  column.getPath().toDotString(),
                  column.getPrimitiveType().getName(),
                  column.getEncodings().toString(),
                  column.getCodec().name(),
                  column.getTotalSize(),
                  column.getTotalUncompressedSize(),
                  column.getValueCount(),
                  columnStats));
        }
      }

      rowGroups.add(
          new RowGroup(
              i,
              block.getRowCount(),
              block.getTotalByteSize(),
              block.getCompressedSize(),
              block.getStartingPos(),
              columnChunks));
    }

    return rowGroups;
  }

  /** Root of the serialized output; absent sections are omitted via NON_NULL inclusion. */
  @JsonInclude(JsonInclude.Include.NON_NULL)
  public record ParquetInfo(Summary summary, List<Column> columns, List<RowGroup> rowGroups) {}

  /** File-level totals: row count, row-group count, byte sizes, writer, and column count. */
  @JsonInclude(JsonInclude.Include.NON_NULL)
  public record Summary(
      long rows,
      int rowGroups,
      long compressedSize,
      long uncompressedSize,
      String createdBy,
      int columnCount) {}

  /** One top-level schema field. {@code logicalType} is null for groups or unannotated fields. */
  @JsonInclude(JsonInclude.Include.NON_NULL)
  public record Column(String name, String type, String repetition, String logicalType) {}

  /** One row group; {@code columns} is null unless ROW_GROUP_DETAILS was requested. */
  @JsonInclude(JsonInclude.Include.NON_NULL)
  public record RowGroup(
      int index,
      long rowCount,
      long totalSize,
      long compressedSize,
      long startingPos,
      List<ColumnChunk> columns) {}

  /** One column chunk within a row group. */
  @JsonInclude(JsonInclude.Include.NON_NULL)
  public record ColumnChunk(
      String path,
      String type,
      String encodings,
      String codec,
      long totalSize,
      long uncompressedSize,
      long valueCount,
      ColumnStats stats) {}

  /** Min/max/null-count statistics for a column chunk, when present in the footer. */
  @JsonInclude(JsonInclude.Include.NON_NULL)
  public record ColumnStats(long nulls, String min, String max) {}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
/*
* Copyright (c) 2025 Altinity Inc and/or its affiliates. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*/
package com.altinity.ice.cli.internal.cmd;

import static org.assertj.core.api.Assertions.assertThat;

import com.altinity.ice.test.Resource;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import org.testng.annotations.Test;

public class DescribeParquetTest {

  /**
   * Runs describe on the bundled sample parquet file while capturing stdout, restoring the
   * original stream afterwards, and returns everything that was printed.
   */
  private static String describeSample(boolean json, DescribeParquet.Option option)
      throws IOException {
    var captured = new ByteArrayOutputStream();
    PrintStream saved = System.out;
    System.setOut(new PrintStream(captured));
    try {
      var sampleFile =
          Resource.asInputFile("com/altinity/ice/cli/internal/iceberg/parquet/sample-001.parquet");
      DescribeParquet.run(sampleFile, json, option);
    } finally {
      System.setOut(saved);
    }
    return captured.toString();
  }

  @Test
  public void testDescribeParquetSummary() throws IOException {
    String output = describeSample(false, DescribeParquet.Option.SUMMARY);

    assertThat(output).contains("rows:");
    assertThat(output).contains("rowGroups:");
    assertThat(output).contains("compressedSize:");
    assertThat(output).contains("uncompressedSize:");
  }

  @Test
  public void testDescribeParquetColumns() throws IOException {
    String output = describeSample(false, DescribeParquet.Option.COLUMNS);

    assertThat(output).contains("columns:");
    assertThat(output).contains("name:");
    assertThat(output).contains("type:");
  }

  @Test
  public void testDescribeParquetJson() throws IOException {
    String output = describeSample(true, DescribeParquet.Option.SUMMARY);

    assertThat(output).contains("{");
    assertThat(output).contains("}");
    assertThat(output).contains("\"summary\"");
  }
}