From 8091dd62eaff28196dfb9742a4f39182704b1024 Mon Sep 17 00:00:00 2001 From: Sasaki Toru <sasakitoa@nttdata.co.jp> Date: Thu, 11 Dec 2014 22:54:21 -0800 Subject: [PATCH] [SPARK-4742][SQL] The name of Parquet File generated by AppendingParquetOutputFormat should be zero padded When I use Parquet File as an output file using ParquetOutputFormat#getDefaultWorkFile, the file name is not zero padded while RDD#saveAsTextFile does zero padding. Author: Sasaki Toru <sasakitoa@nttdata.co.jp> Closes #3602 from sasakitoa/parquet-zeroPadding and squashes the following commits: 6b0e58f [Sasaki Toru] Merge branch 'master' of git://github.com/apache/spark into parquet-zeroPadding 20dc79d [Sasaki Toru] Fixed the name of Parquet File generated by AppendingParquetOutputFormat --- .../apache/spark/sql/parquet/ParquetTableOperations.scala | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala index 232ef90b01..5a49384ade 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.parquet import java.io.IOException import java.lang.{Long => JLong} import java.text.SimpleDateFormat +import java.text.NumberFormat import java.util.concurrent.{Callable, TimeUnit} import java.util.{ArrayList, Collections, Date, List => JList} @@ -338,9 +339,13 @@ private[parquet] class AppendingParquetOutputFormat(offset: Int) // override to choose output filename so not overwrite existing ones override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { + val numfmt = NumberFormat.getInstance() + numfmt.setMinimumIntegerDigits(5) + numfmt.setGroupingUsed(false) + val taskId: TaskID = getTaskAttemptID(context).getTaskID val partition: 
Int = taskId.getId - val filename = s"part-r-${partition + offset}.parquet" + val filename = "part-r-" + numfmt.format(partition + offset) + ".parquet" val committer: FileOutputCommitter = getOutputCommitter(context).asInstanceOf[FileOutputCommitter] new Path(committer.getWorkPath, filename) -- GitLab