Commit db8784fe authored by Wenchen Fan, committed by Andrew Or

[SPARK-17899][SQL] add a debug mode to keep raw table properties in HiveExternalCatalog

## What changes were proposed in this pull request?

Currently `HiveExternalCatalog` filters out Spark SQL's internal table properties, e.g. `spark.sql.sources.provider`, `spark.sql.sources.schema`, etc. This is reasonable for external users, as they don't want to see these internal properties in `DESC TABLE`.

However, as Spark developers, we sometimes do want to see the raw table properties. This PR adds a new internal SQL conf, `spark.sql.debug`, to enable a debug mode that keeps these raw table properties.

This config can also be reused in similar places where we want to retain debug information in the future.
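For illustration, here is a minimal sketch (not part of the patch) of how a developer could flip the flag and inspect the raw entries. It mirrors the new test below and assumes code running inside Spark itself (e.g. a test), since `sessionState` and the typed `ConfigEntry` setter on `SparkConf` are not public API:

```scala
// Sketch only, mirroring the new MetastoreDataSourcesSuite test below.
// Assumes an existing `sparkSession` backed by HiveExternalCatalog and code
// compiled inside Spark (sessionState / SparkConf.set(entry, value) are internal).
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.internal.StaticSQLConf.DEBUG_MODE

sparkSession.sparkContext.conf.set(DEBUG_MODE, true)  // static conf: set before creating the new session
val debugSession = sparkSession.newSession()
debugSession.sql("CREATE TABLE debug_demo(i INT) USING json")

// With debug mode on, internal entries such as spark.sql.sources.provider and
// spark.sql.sources.schema.* are kept instead of being filtered out.
val meta = debugSession.sessionState.catalog.getTableMetadata(TableIdentifier("debug_demo"))
meta.properties.foreach { case (k, v) => println(s"$k = $v") }
```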

## How was this patch tested?

A new test in `MetastoreDataSourcesSuite`.

Author: Wenchen Fan <wenchen@databricks.com>

Closes #15458 from cloud-fan/debug.
@@ -915,4 +915,9 @@ object StaticSQLConf {
     .internal()
     .intConf
     .createWithDefault(4000)
+
+  val DEBUG_MODE = buildConf("spark.sql.debug")
+    .internal()
+    .booleanConf
+    .createWithDefault(false)
 }
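Since this is a static, internal conf, it lives in the `SparkConf` rather than the session's runtime conf and cannot be changed with `SET` (see the `SQLConfSuite` changes below). A minimal sketch of the typed access pattern the patch relies on, assuming code compiled inside the `org.apache.spark` namespace (the `ConfigEntry`-based `set`/`get` overloads on `SparkConf` are `private[spark]`):

```scala
// Sketch only: typed access to the new static conf via its ConfigEntry.
// Assumes compilation inside Spark (e.g. a test), since these SparkConf
// overloads are private[spark].
import org.apache.spark.SparkConf
import org.apache.spark.sql.internal.StaticSQLConf.DEBUG_MODE

val conf = new SparkConf()
assert(!conf.get(DEBUG_MODE))   // defaults to false
conf.set(DEBUG_MODE, true)      // what the new tests do via sparkContext.conf
assert(conf.get(DEBUG_MODE))
```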
@@ -19,6 +19,7 @@ package org.apache.spark.sql.internal
 
 import org.apache.hadoop.fs.Path
 
+import org.apache.spark.SparkContext
 import org.apache.spark.sql._
 import org.apache.spark.sql.execution.WholeStageCodegenExec
 import org.apache.spark.sql.internal.StaticSQLConf._
@@ -254,18 +255,21 @@ class SQLConfSuite extends QueryTest with SharedSQLContext {
     }
   }
 
-  test("global SQL conf comes from SparkConf") {
-    val newSession = SparkSession.builder()
-      .config(SCHEMA_STRING_LENGTH_THRESHOLD.key, "2000")
-      .getOrCreate()
-
-    assert(newSession.conf.get(SCHEMA_STRING_LENGTH_THRESHOLD.key) == "2000")
-    checkAnswer(
-      newSession.sql(s"SET ${SCHEMA_STRING_LENGTH_THRESHOLD.key}"),
-      Row(SCHEMA_STRING_LENGTH_THRESHOLD.key, "2000"))
+  test("static SQL conf comes from SparkConf") {
+    val previousValue = sparkContext.conf.get(SCHEMA_STRING_LENGTH_THRESHOLD)
+    try {
+      sparkContext.conf.set(SCHEMA_STRING_LENGTH_THRESHOLD, 2000)
+      val newSession = new SparkSession(sparkContext)
+      assert(newSession.conf.get(SCHEMA_STRING_LENGTH_THRESHOLD) == 2000)
+      checkAnswer(
+        newSession.sql(s"SET ${SCHEMA_STRING_LENGTH_THRESHOLD.key}"),
+        Row(SCHEMA_STRING_LENGTH_THRESHOLD.key, "2000"))
+    } finally {
+      sparkContext.conf.set(SCHEMA_STRING_LENGTH_THRESHOLD, previousValue)
+    }
   }
 
-  test("cannot set/unset global SQL conf") {
+  test("cannot set/unset static SQL conf") {
     val e1 = intercept[AnalysisException](sql(s"SET ${SCHEMA_STRING_LENGTH_THRESHOLD.key}=10"))
     assert(e1.message.contains("Cannot modify the value of a static config"))
     val e2 = intercept[AnalysisException](spark.conf.unset(SCHEMA_STRING_LENGTH_THRESHOLD.key))
...@@ -37,7 +37,7 @@ import org.apache.spark.sql.execution.command.{ColumnStatStruct, DDLUtils} ...@@ -37,7 +37,7 @@ import org.apache.spark.sql.execution.command.{ColumnStatStruct, DDLUtils}
import org.apache.spark.sql.execution.datasources.CaseInsensitiveMap import org.apache.spark.sql.execution.datasources.CaseInsensitiveMap
import org.apache.spark.sql.hive.client.HiveClient import org.apache.spark.sql.hive.client.HiveClient
import org.apache.spark.sql.internal.HiveSerDe import org.apache.spark.sql.internal.HiveSerDe
import org.apache.spark.sql.internal.StaticSQLConf.SCHEMA_STRING_LENGTH_THRESHOLD import org.apache.spark.sql.internal.StaticSQLConf._
import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.sql.types.{DataType, StructType}
...@@ -461,13 +461,18 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat ...@@ -461,13 +461,18 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
} else { } else {
table.storage table.storage
} }
val tableProps = if (conf.get(DEBUG_MODE)) {
table.properties
} else {
getOriginalTableProperties(table)
}
table.copy( table.copy(
storage = storage, storage = storage,
schema = getSchemaFromTableProperties(table), schema = getSchemaFromTableProperties(table),
provider = Some(provider), provider = Some(provider),
partitionColumnNames = getPartitionColumnsFromTableProperties(table), partitionColumnNames = getPartitionColumnsFromTableProperties(table),
bucketSpec = getBucketSpecFromTableProperties(table), bucketSpec = getBucketSpecFromTableProperties(table),
properties = getOriginalTableProperties(table)) properties = tableProps)
} getOrElse { } getOrElse {
table.copy(provider = Some("hive")) table.copy(provider = Some("hive"))
} }
......
@@ -23,6 +23,7 @@ import scala.collection.mutable.ArrayBuffer
 
 import org.apache.hadoop.fs.Path
 
+import org.apache.spark.SparkContext
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType}
@@ -31,7 +32,7 @@ import org.apache.spark.sql.hive.HiveExternalCatalog._
 import org.apache.spark.sql.hive.client.HiveClient
 import org.apache.spark.sql.hive.test.TestHiveSingleton
 import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.internal.StaticSQLConf.SCHEMA_STRING_LENGTH_THRESHOLD
+import org.apache.spark.sql.internal.StaticSQLConf._
 import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.types._
 import org.apache.spark.util.Utils
@@ -1324,4 +1325,18 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
       hiveClient.dropTable("default", "t", ignoreIfNotExists = true, purge = true)
     }
   }
+
+  test("should keep data source entries in table properties when debug mode is on") {
+    val previousValue = sparkSession.sparkContext.conf.get(DEBUG_MODE)
+    try {
+      sparkSession.sparkContext.conf.set(DEBUG_MODE, true)
+      val newSession = sparkSession.newSession()
+      newSession.sql("CREATE TABLE abc(i int) USING json")
+      val tableMeta = newSession.sessionState.catalog.getTableMetadata(TableIdentifier("abc"))
+      assert(tableMeta.properties(DATASOURCE_SCHEMA_NUMPARTS).toInt == 1)
+      assert(tableMeta.properties(DATASOURCE_PROVIDER) == "json")
+    } finally {
+      sparkSession.sparkContext.conf.set(DEBUG_MODE, previousValue)
+    }
+  }
 }