Skip to content
Snippets Groups Projects
Commit 11caf1ce authored by Cheng Lian's avatar Cheng Lian
Browse files

[SPARK-4176] [SQL] [MINOR] Should use unscaled Long to write decimals for...

[SPARK-4176] [SQL] [MINOR] Should use unscaled Long to write decimals for precision <= 18 rather than 8

This PR fixes a minor bug introduced in #7455: when writing decimals, the unscaled Long should be used for better performance whenever the precision is <= 18, not <= 8 (the 8 was most likely a typo). The bug doesn't affect correctness, but it hurts Parquet decimal writing performance.

This PR also replaces similar magic numbers with newly defined constants.

Author: Cheng Lian <lian@databricks.com>

Closes #8031 from liancheng/spark-4176/minor-fix-for-writing-decimals and squashes the following commits:

10d4ea3 [Cheng Lian] Should use unscaled Long to write decimals for precision <= 18 rather than 8
parent ef062c15
No related branches found
No related tags found
No related merge requests found
......@@ -264,7 +264,7 @@ private[parquet] class CatalystRowConverter(
val scale = decimalType.scale
val bytes = value.getBytes
if (precision <= 8) {
if (precision <= CatalystSchemaConverter.MAX_PRECISION_FOR_INT64) {
// Constructs a `Decimal` with an unscaled `Long` value if possible.
var unscaled = 0L
var i = 0
......
......@@ -25,6 +25,7 @@ import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName._
import org.apache.parquet.schema.Type.Repetition._
import org.apache.parquet.schema._
import org.apache.spark.sql.parquet.CatalystSchemaConverter.{MAX_PRECISION_FOR_INT32, MAX_PRECISION_FOR_INT64, maxPrecisionForBytes}
import org.apache.spark.sql.types._
import org.apache.spark.sql.{AnalysisException, SQLConf}
......@@ -155,7 +156,7 @@ private[parquet] class CatalystSchemaConverter(
case INT_16 => ShortType
case INT_32 | null => IntegerType
case DATE => DateType
case DECIMAL => makeDecimalType(maxPrecisionForBytes(4))
case DECIMAL => makeDecimalType(MAX_PRECISION_FOR_INT32)
case TIME_MILLIS => typeNotImplemented()
case _ => illegalType()
}
......@@ -163,7 +164,7 @@ private[parquet] class CatalystSchemaConverter(
case INT64 =>
originalType match {
case INT_64 | null => LongType
case DECIMAL => makeDecimalType(maxPrecisionForBytes(8))
case DECIMAL => makeDecimalType(MAX_PRECISION_FOR_INT64)
case TIMESTAMP_MILLIS => typeNotImplemented()
case _ => illegalType()
}
......@@ -405,7 +406,7 @@ private[parquet] class CatalystSchemaConverter(
// Uses INT32 for 1 <= precision <= 9
case DecimalType.Fixed(precision, scale)
if precision <= maxPrecisionForBytes(4) && followParquetFormatSpec =>
if precision <= MAX_PRECISION_FOR_INT32 && followParquetFormatSpec =>
Types
.primitive(INT32, repetition)
.as(DECIMAL)
......@@ -415,7 +416,7 @@ private[parquet] class CatalystSchemaConverter(
// Uses INT64 for 1 <= precision <= 18
case DecimalType.Fixed(precision, scale)
if precision <= maxPrecisionForBytes(8) && followParquetFormatSpec =>
if precision <= MAX_PRECISION_FOR_INT64 && followParquetFormatSpec =>
Types
.primitive(INT64, repetition)
.as(DECIMAL)
......@@ -534,14 +535,6 @@ private[parquet] class CatalystSchemaConverter(
throw new AnalysisException(s"Unsupported data type $field.dataType")
}
}
/**
 * Max precision of a decimal value stored in `numBytes` bytes.
 *
 * Evaluates `floor(log10(2^(8 * numBytes - 1) - 1))`, i.e. the number of whole base-10
 * digits of the max value stored in `numBytes` bytes.
 *
 * @param numBytes width of the storage in bytes
 * @return the maximum decimal precision for that width
 */
private def maxPrecisionForBytes(numBytes: Int): Int = {
  // Max value stored in `numBytes` bytes, computed in floating point.
  val maxStoredValue = Math.pow(2, 8 * numBytes - 1) - 1
  // Number of whole base-10 digits of that value.
  val wholeDigits = Math.floor(Math.log10(maxStoredValue))
  // Go through `Math.round` (double -> long) before narrowing to Int.
  Math.round(wholeDigits).toInt
}
}
......@@ -584,4 +577,16 @@ private[parquet] object CatalystSchemaConverter {
computeMinBytesForPrecision(precision)
}
}
// Max decimal precision for a 4-byte value (used for INT32-backed decimals).
val MAX_PRECISION_FOR_INT32: Int = maxPrecisionForBytes(4)

// Max decimal precision for an 8-byte value (used for INT64-backed decimals).
val MAX_PRECISION_FOR_INT64: Int = maxPrecisionForBytes(8)
/**
 * Max precision of a decimal value stored in `numBytes` bytes.
 *
 * Evaluates `floor(log10(2^(8 * numBytes - 1) - 1))`, i.e. the number of whole base-10
 * digits of the max value stored in `numBytes` bytes.
 *
 * @param numBytes width of the storage in bytes
 * @return the maximum decimal precision for that width
 */
def maxPrecisionForBytes(numBytes: Int): Int = {
  // Max value stored in `numBytes` bytes, computed in floating point.
  val upperBound = Math.pow(2, 8 * numBytes - 1) - 1
  // Number of whole base-10 digits of that value.
  val base10Digits = Math.floor(Math.log10(upperBound))
  // Go through `Math.round` (double -> long) before narrowing to Int.
  Math.round(base10Digits).toInt
}
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment