When inspecting a Parquet file, is the set of columns in each block (row group) the same?

Java Code Examples for parquet.hadoop.ParquetFileReader
The following are top voted examples showing how to use
parquet.hadoop.ParquetFileReader. These examples are extracted from open source projects.
You can vote up the examples you like, and your votes will be used in our system to produce
more good examples.
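Regarding the question in the title: in the Parquet format each row group (block) stores one column chunk per leaf column of the file schema, so the set of columns should be the same in every block of a given file, and the footer metadata is enough to verify this for a concrete file. Below is a minimal sketch, not taken from the projects quoted later, that prints the columns of every row group using the same pre-Apache parquet.hadoop API as those examples; the metadata package names and the input path are assumptions.

import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

import parquet.hadoop.ParquetFileReader;
import parquet.hadoop.metadata.BlockMetaData;
import parquet.hadoop.metadata.ColumnChunkMetaData;
import parquet.hadoop.metadata.ParquetMetadata;

public class ShowRowGroupColumns {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Placeholder path; point this at a real file.
    Path path = new Path(args.length > 0 ? args[0] : "/tmp/example.parquet");

    // Only the footer is read; no data pages are scanned.
    ParquetMetadata meta = ParquetFileReader.readFooter(conf, path);

    int group = 0;
    for (BlockMetaData block : meta.getBlocks()) {
      System.out.println("row group " + group++ + " (" + block.getRowCount() + " rows)");
      for (ColumnChunkMetaData column : block.getColumns()) {
        // Path of each column chunk in this row group, e.g. [user, name]
        System.out.println("  " + Arrays.toString(column.getPath().toArray()));
      }
    }
  }
}

If every row group prints the same list, the blocks are consistent; differing lists would point at a corrupted or hand-crafted file.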
/**
 * @param conf the configuration
 * @param file the file to read
 * @param readSupport to materialize records
 * @param filter the filter to use to filter records
 * @throws java.io.IOException
 */
public ParquetReader(Configuration conf, Path file, ReadSupport<T> readSupport, UnboundRecordFilter filter) throws IOException {
  this.readSupport = readSupport;
  this.filter = filter;
  this.conf = conf;

  FileSystem fs = file.getFileSystem(conf);
  List<FileStatus> statuses = Arrays.asList(fs.listStatus(file));
  List<Footer> footers = ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(conf, statuses);
  this.footersIterator = footers.iterator();
  globalMetaData = ParquetFileWriter.getGlobalMetaData(footers);

  // Collect the row-group metadata of every footer found under the path.
  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
  for (Footer footer : footers) {
    blocks.addAll(footer.getParquetMetadata().getBlocks());
  }

  MessageType schema = globalMetaData.getSchema();
  Map<String, Set<String>> extraMetadata = globalMetaData.getKeyValueMetaData();
  readContext = readSupport.init(new InitContext(conf, extraMetadata, schema));
}
public void initialize(MessageType requestedSchema, MessageType fileSchema,
    Map<String, String> extraMetadata, Map<String, String> readSupportMetadata,
    Path file, List<BlockMetaData> blocks, Configuration configuration)
    throws IOException {
  this.requestedSchema = requestedSchema;
  this.fileSchema = fileSchema;
  this.file = file;
  this.columnCount = this.requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
      configuration, extraMetadata, fileSchema,
      new ReadSupport.ReadContext(requestedSchema, readSupportMetadata));

  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  reader = new ParquetFileReader(configuration, file, blocks, columns);
  // Sum the row counts of all row groups to know how many records will be read.
  for (BlockMetaData block : blocks) {
    total += block.getRowCount();
  }
  if (DEBUG) LOG.debug("RecordReader initialized will read a total of " + total + " records.");
}
public void execute(CommandLine options) throws Exception {
  super.execute(options);

  String[] args = options.getArgs();
  String input = args[0];

  Configuration conf = new Configuration();
  ParquetMetadata metaData = ParquetFileReader.readFooter(conf, new Path(input));

  PrettyPrintWriter out = PrettyPrintWriter.stdoutPrettyPrinter()
      .withAutoColumn()
      .withAutoCrop()
      .withWhitespaceHandler(WhiteSpaceHandler.COLLAPSE_WHITESPACE)
      .withColumnPadding(1)
      .build();

  MetadataUtils.showDetails(out, metaData);
  out.flushColumns();
}
public void execute(CommandLine options) throws Exception {
  super.execute(options);

  String[] args = options.getArgs();
  String input = args[0];

  Configuration conf = new Configuration();
  Path inpath = new Path(input);

  ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inpath);
  MessageType schema = metaData.getFileMetaData().getSchema();

  PrettyPrintWriter out = PrettyPrintWriter.stdoutPrettyPrinter()
      .withAutoColumn()
      .withAutoCrop()
      .withWhitespaceHandler(WhiteSpaceHandler.ELIMINATE_NEWLINES)
      .withColumnPadding(1)
      .withMaxBufferedLines(1000000)
      .withFlushOnTab()
      .build();

  boolean showmd = !options.hasOption('m');
  boolean showdt = !options.hasOption('d');

  Set<String> showColumns = null;
  if (options.hasOption('c')) {
    String[] cols = options.getOptionValues('c');
    showColumns = new HashSet<String>(Arrays.asList(cols));
  }

  dump(out, metaData, schema, inpath, showmd, showdt, showColumns);
}
public void execute(CommandLine options) throws Exception {
  super.execute(options);

  String[] args = options.getArgs();
  String input = args[0];

  Configuration conf = new Configuration();
  ParquetMetadata metaData = ParquetFileReader.readFooter(conf, new Path(input));
  MessageType schema = metaData.getFileMetaData().getSchema();

  Main.out.println(schema);
  if (options.hasOption('d')) {
    PrettyPrintWriter out = PrettyPrintWriter.stdoutPrettyPrinter().build();
    MetadataUtils.showDetails(out, metaData);
  }
}
public static void convertParquetToCSV(File parquetFile, File csvOutputFile) throws IOException {
  Preconditions.checkArgument(parquetFile.getName().endsWith(".parquet"),
      "parquet file should have .parquet extension");
  Preconditions.checkArgument(csvOutputFile.getName().endsWith(".csv"),
      "csv file should have .csv extension");
  Preconditions.checkArgument(!csvOutputFile.exists(),
      "Output file " + csvOutputFile.getAbsolutePath() + " already exists");

  LOG.info("Converting " + parquetFile.getName() + " to " + csvOutputFile.getName());

  Path parquetFilePath = new Path(parquetFile.toURI());
  Configuration configuration = new Configuration(true);

  GroupReadSupport readSupport = new GroupReadSupport();
  ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, parquetFilePath);
  MessageType schema = readFooter.getFileMetaData().getSchema();
  readSupport.init(configuration, null, schema);

  BufferedWriter w = new BufferedWriter(new FileWriter(csvOutputFile));
  ParquetReader<Group> reader = new ParquetReader<Group>(parquetFilePath, readSupport);
  try {
    Group g = null;
    while ((g = reader.read()) != null) {
      writeGroup(w, g, schema);
    }
    reader.close();
  } finally {
    Utils.closeQuietly(w);
  }
}
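For reference, a minimal caller of the helper above might look like the following; the enclosing class name ConvertUtils and the file names are assumptions, since the excerpt does not show where convertParquetToCSV is declared.

import java.io.File;
import java.io.IOException;

public class ConvertExample {
  public static void main(String[] args) throws IOException {
    // Hypothetical file names; the helper refuses to run if the CSV already exists.
    File parquetIn = new File("sales.parquet");
    File csvOut = new File("sales.csv");
    ConvertUtils.convertParquetToCSV(parquetIn, csvOut);
  }
}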
@Deprecated
public static void convertParquetToCSVEx(File parquetFile, File csvOutputFile) throws IOException {
  Preconditions.checkArgument(parquetFile.getName().endsWith(".parquet"),
      "parquet file should have .parquet extension");
  Preconditions.checkArgument(csvOutputFile.getName().endsWith(".csv"),
      "csv file should have .csv extension");
  Preconditions.checkArgument(!csvOutputFile.exists(),
      "Output file " + csvOutputFile.getAbsolutePath() + " already exists");

  LOG.info("Converting " + parquetFile.getName() + " to " + csvOutputFile.getName());

  Path parquetFilePath = new Path(parquetFile.toURI());
  Configuration configuration = new Configuration(true);

  // TODO Following can be changed by using ParquetReader instead of ParquetFileReader
  ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, parquetFilePath);
  MessageType schema = readFooter.getFileMetaData().getSchema();
  ParquetFileReader parquetFileReader = new ParquetFileReader(
      configuration, parquetFilePath, readFooter.getBlocks(), schema.getColumns());

  BufferedWriter w = new BufferedWriter(new FileWriter(csvOutputFile));
  PageReadStore pages = null;
  try {
    // Iterate row group by row group and assemble records from the column pages.
    while (null != (pages = parquetFileReader.readNextRowGroup())) {
      final long rows = pages.getRowCount();
      LOG.info("Number of rows: " + rows);

      final MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
      final RecordReader<Group> recordReader = columnIO.getRecordReader(pages, new GroupRecordConverter(schema));
      for (int i = 0; i < rows; i++) {
        final Group g = recordReader.read();
        writeGroup(w, g, schema);
      }
    }
  } finally {
    Utils.closeQuietly(parquetFileReader);
    Utils.closeQuietly(w);
  }
}
Example 14
private PageReadStore fetchRowGroup() throws IOException {
  if (fileReader == null) {
    if (LOG.isInfoEnabled()) {
      LOG.info(MessageFormat.format(
          Messages.getString("LoadMetadata"), //$NON-NLS-1$
          descriptor.getDataModelClass().getSimpleName(),
          path));
    }
    ParquetMetadata footer = ParquetFileReader.readFooter(hadoopConfiguration, path);
    List<BlockMetaData> blocks = filterBlocks(footer.getBlocks());
    if (blocks.isEmpty()) {
      return null;
    }
    long totalRecords = computeTotalRecords(blocks);
    this.averageBytesPerRecord = (double) fragmentSize / totalRecords;
    if (LOG.isInfoEnabled()) {
      LOG.info(MessageFormat.format(
          Messages.getString("LoadContents"), //$NON-NLS-1$
          descriptor.getDataModelClass().getSimpleName(),
          fragmentSize));
    }
    this.fileReader = new ParquetFileReader(
        hadoopConfiguration,
        path,
        blocks,
        footer.getFileMetaData().getSchema().getColumns());
    this.materializer = new DataModelMaterializer(
        descriptor,
        footer.getFileMetaData().getSchema(),
        mappingConfiguration);
    this.columnIo = new ColumnIOFactory().getColumnIO(
        materializer.getMaterializeSchema(),
        footer.getFileMetaData().getSchema());
  }
  return fileReader.readNextRowGroup();
}
Example 15
public static void dump(PrettyPrintWriter out, ParquetMetadata meta, MessageType schema, Path inpath,
    boolean showmd, boolean showdt, Set<String> showColumns) throws IOException {
  Configuration conf = new Configuration();

  List<BlockMetaData> blocks = meta.getBlocks();
  List<ColumnDescriptor> columns = schema.getColumns();
  // If a column filter was given, keep only the matching column descriptors.
  if (showColumns != null) {
    columns = new ArrayList<ColumnDescriptor>();
    for (ColumnDescriptor column : schema.getColumns()) {
      String path = Joiner.on('.').skipNulls().join(column.getPath());
      if (showColumns.contains(path)) {
        columns.add(column);
      }
    }
  }

  ParquetFileReader freader = null;
  if (showmd) {
    try {
      long group = 0;
      for (BlockMetaData block : blocks) {
        if (group != 0) out.println();
        out.format("row group %d%n", group++);
        out.rule('-');

        List<ColumnChunkMetaData> ccmds = block.getColumns();
        if (showColumns != null) {
          ccmds = new ArrayList<ColumnChunkMetaData>();
          for (ColumnChunkMetaData ccmd : block.getColumns()) {
            String path = Joiner.on('.').skipNulls().join(ccmd.getPath().toArray());
            if (showColumns.contains(path)) {
              ccmds.add(ccmd);
            }
          }
        }
        MetadataUtils.showDetails(out, ccmds);

        // Re-open the file scoped to this single row group and dump its pages.
        List<BlockMetaData> rblocks = Collections.singletonList(block);
        freader = new ParquetFileReader(conf, inpath, rblocks, columns);
        PageReadStore store = freader.readNextRowGroup();
        while (store != null) {
          out.incrementTabLevel();
          for (ColumnDescriptor column : columns) {
            out.println();
            dump(out, store, column);
          }
          out.decrementTabLevel();
          store = freader.readNextRowGroup();
        }
        out.flushColumns();
      }
    } finally {
      if (freader != null) {
        freader.close();
      }
    }
  }

  if (showdt) {
    boolean first = true;
    for (ColumnDescriptor column : columns) {
      if (!first || showmd) out.println();
      first = false;

      out.format("%s %s%n", column.getType(), Joiner.on('.').skipNulls().join(column.getPath()));
      out.rule('-');
      try {
        long page = 1;
        long total = blocks.size();
        long offset = 1;
        freader = new ParquetFileReader(conf, inpath, blocks, Collections.singletonList(column));
        PageReadStore store = freader.readNextRowGroup();
        while (store != null) {
          ColumnReadStoreImpl crstore = new ColumnReadStoreImpl(store, new DumpGroupConverter(), schema);
          dump(out, crstore, column, page++, total, offset);

          offset += store.getRowCount();
          store = freader.readNextRowGroup();
        }
        out.flushColumns();
      } finally {
        out.flushColumns();
        if (freader != null) {
          freader.close();
        }
      }
    }
  }
}
Example 16
private ParquetRecordReader<FakeParquetRecord> createParquetRecordReader(
    Configuration configuration,
    Path path,
    long start,
    long length,
    List<HiveColumnHandle> columns,
    boolean useParquetColumnNames,
    TypeManager typeManager,
    boolean predicatePushdownEnabled,
    TupleDomain<HiveColumnHandle> effectivePredicate)
{
  try {
    ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(configuration, path, NO_FILTER);
    List<BlockMetaData> blocks = parquetMetadata.getBlocks();
    FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
    MessageType fileSchema = fileMetaData.getSchema();

    PrestoReadSupport readSupport = new PrestoReadSupport(useParquetColumnNames, columns, fileSchema);

    List<parquet.schema.Type> fields = columns.stream()
        .filter(column -> !column.isPartitionKey())
        .map(column -> getParquetType(column, fileSchema, useParquetColumnNames))
        .filter(Objects::nonNull)
        .collect(toList());
    MessageType requestedSchema = new MessageType(fileSchema.getName(), fields);

    // Keep only the row groups whose first data page falls inside this split.
    List<BlockMetaData> splitGroup = new ArrayList<>();
    for (BlockMetaData block : blocks) {
      long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
      if (firstDataPage >= start && firstDataPage < start + length) {
        splitGroup.add(block);
      }
    }

    if (predicatePushdownEnabled) {
      ParquetPredicate parquetPredicate = buildParquetPredicate(columns, effectivePredicate, fileMetaData.getSchema(), typeManager);
      splitGroup = splitGroup.stream()
          .filter(block -> predicateMatches(parquetPredicate, block, configuration, path, requestedSchema, effectivePredicate))
          .collect(toList());
    }

    long[] offsets = new long[splitGroup.size()];
    for (int i = 0; i < splitGroup.size(); i++) {
      BlockMetaData block = splitGroup.get(i);
      offsets[i] = block.getStartingPos();
    }

    ParquetInputSplit split = new ParquetInputSplit(path, start, start + length, length, null, offsets);

    TaskAttemptContext taskContext = ContextUtil.newTaskAttemptContext(configuration, new TaskAttemptID());
    ParquetRecordReader<FakeParquetRecord> realReader = new PrestoParquetRecordReader(readSupport);
    realReader.initialize(split, taskContext);
    return realReader;
  }
  catch (IOException e) {
    throw Throwables.propagate(e);
  }
  catch (InterruptedException e) {
    Thread.currentThread().interrupt();
    throw Throwables.propagate(e);
  }
}
