From 2d799d08081032828cc2c95cbf58a268653c7a05 Mon Sep 17 00:00:00 2001 From: gatorsmile <gatorsmile@gmail.com> Date: Wed, 9 Aug 2017 08:46:25 -0700 Subject: [PATCH] [SPARK-21504][SQL] Add spark version info into table metadata ## What changes were proposed in this pull request? This PR is to add the spark version info in the table metadata. When creating the table, this value is assigned. It can help users find which version of Spark was used to create the table. ## How was this patch tested? N/A Author: gatorsmile <gatorsmile@gmail.com> Closes #18709 from gatorsmile/addVersion. --- .../sql/catalyst/catalog/ExternalCatalog.scala | 4 +++- .../spark/sql/catalyst/catalog/interface.scala | 7 ++++++- .../sql/catalyst/trees/TreeNodeSuite.scala | 4 +++- .../describe-table-after-alter-table.sql.out | 15 ++++++++++----- .../sql-tests/results/describe.sql.out | 18 ++++++++++++------ .../sql-tests/results/show-tables.sql.out | 9 ++++++--- .../apache/spark/sql/SQLQueryTestSuite.scala | 3 ++- .../spark/sql/execution/command/DDLSuite.scala | 1 + .../spark/sql/hive/HiveExternalCatalog.scala | 12 +++++++++++- .../sql/hive/execution/HiveDDLSuite.scala | 1 + 10 files changed, 55 insertions(+), 19 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala index 6000d483db..68644f4d6b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala @@ -106,8 +106,10 @@ abstract class ExternalCatalog final def createTable(tableDefinition: CatalogTable, ignoreIfExists: Boolean): Unit = { val db = tableDefinition.database val name = tableDefinition.identifier.table + val tableDefinitionWithVersion = + tableDefinition.copy(createVersion = org.apache.spark.SPARK_VERSION) postToAll(CreateTablePreEvent(db, name)) - doCreateTable(tableDefinition, ignoreIfExists) + doCreateTable(tableDefinitionWithVersion, ignoreIfExists) postToAll(CreateTableEvent(db, name)) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index 9531456434..f86510624a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -205,6 +205,9 @@ case class BucketSpec( * configured. * @param ignoredProperties is a list of table properties that are used by the underlying table * but ignored by Spark SQL yet. + * @param createVersion records the version of Spark that created this table metadata. The default + * is an empty string. We expect it will be read from the catalog or filled by + * ExternalCatalog.createTable. For temporary views, the value will be empty. */ case class CatalogTable( identifier: TableIdentifier, @@ -217,6 +220,7 @@ case class CatalogTable( owner: String = "", createTime: Long = System.currentTimeMillis, lastAccessTime: Long = -1, + createVersion: String = "", properties: Map[String, String] = Map.empty, stats: Option[CatalogStatistics] = None, viewText: Option[String] = None, @@ -302,8 +306,9 @@ case class CatalogTable( identifier.database.foreach(map.put("Database", _)) map.put("Table", identifier.table) if (owner.nonEmpty) map.put("Owner", owner) - map.put("Created", new Date(createTime).toString) + map.put("Created Time", new Date(createTime).toString) map.put("Last Access", new Date(lastAccessTime).toString) + map.put("Created By", "Spark " + createVersion) map.put("Type", tableType.name) provider.foreach(map.put("Provider", _)) bucketSpec.foreach(map ++= _.toLinkedHashMap) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala index 4fc947a88f..f83c637918 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala @@ -480,7 +480,8 @@ class TreeNodeSuite extends SparkFunSuite { CatalogTableType.MANAGED, CatalogStorageFormat.empty, StructType(StructField("a", IntegerType, true) :: Nil), - createTime = 0L), + createTime = 0L, + createVersion = "2.x"), JObject( "product-class" -> classOf[CatalogTable].getName, @@ -509,6 +510,7 @@ class TreeNodeSuite extends SparkFunSuite { "owner" -> "", "createTime" -> 0, "lastAccessTime" -> -1, + "createVersion" -> "2.x", "tracksPartitionsInCatalog" -> false, "properties" -> JNull, "unsupportedFeatures" -> List.empty[String], diff --git a/sql/core/src/test/resources/sql-tests/results/describe-table-after-alter-table.sql.out b/sql/core/src/test/resources/sql-tests/results/describe-table-after-alter-table.sql.out index 4bf4633491..7873085da5 100644 --- a/sql/core/src/test/resources/sql-tests/results/describe-table-after-alter-table.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/describe-table-after-alter-table.sql.out @@ -23,8 +23,9 @@ d string # Detailed Table Information Database default Table table_with_comment -Created [not included in comparison] +Created Time [not included in comparison] Last Access [not included in comparison] +Created By [not included in comparison] Type MANAGED Provider parquet Comment added @@ -52,8 +53,9 @@ d string # Detailed Table Information Database default Table table_with_comment -Created [not included in comparison] +Created Time [not included in comparison] Last Access [not included in comparison] +Created By [not included in comparison] Type MANAGED Provider parquet Comment modified comment @@ -88,8 +90,9 @@ b int # Detailed Table Information Database default Table table_comment -Created [not included in comparison] +Created Time [not included in comparison] Last Access [not included in comparison] +Created By [not included in comparison] Type MANAGED Provider parquet Location [not included in comparison]sql/core/spark-warehouse/table_comment @@ -114,8 +117,9 @@ b int # Detailed Table Information Database default Table table_comment -Created [not included in comparison] +Created Time [not included in comparison] Last Access [not included in comparison] +Created By [not included in comparison] Type MANAGED Provider parquet Comment added comment @@ -141,8 +145,9 @@ b int # Detailed Table Information Database default Table table_comment -Created [not included in comparison] +Created Time [not included in comparison] Last Access [not included in comparison] +Created By [not included in comparison] Type MANAGED Provider parquet Location [not included in comparison]sql/core/spark-warehouse/table_comment diff --git a/sql/core/src/test/resources/sql-tests/results/describe.sql.out b/sql/core/src/test/resources/sql-tests/results/describe.sql.out index e2b79e8f78..b91f2c09f3 100644 --- a/sql/core/src/test/resources/sql-tests/results/describe.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/describe.sql.out @@ -120,8 +120,9 @@ d string # Detailed Table Information Database default Table t -Created [not included in comparison] +Created Time [not included in comparison] Last Access [not included in comparison] +Created By [not included in comparison] Type MANAGED Provider parquet Num Buckets 2 @@ -151,8 +152,9 @@ d string # Detailed Table Information Database default Table t -Created [not included in comparison] +Created Time [not included in comparison] Last Access [not included in comparison] +Created By [not included in comparison] Type MANAGED Provider parquet Num Buckets 2 @@ -190,8 +192,9 @@ d string # Detailed Table Information Database default Table t -Created [not included in comparison] +Created Time [not included in comparison] Last Access [not included in comparison] +Created By [not included in comparison] Type MANAGED Provider parquet Num Buckets 2 @@ -228,8 +231,9 @@ d string # Detailed Table Information Database default Table t -Created [not included in comparison] +Created Time [not included in comparison] Last Access [not included in comparison] +Created By [not included in comparison] Type MANAGED Provider parquet Num Buckets 2 @@ -458,8 +462,9 @@ d string # Detailed Table Information Database default Table v -Created [not included in comparison] +Created Time [not included in comparison] Last Access [not included in comparison] +Created By [not included in comparison] Type VIEW View Text SELECT * FROM t View Default Database default @@ -480,8 +485,9 @@ d string # Detailed Table Information Database default Table v -Created [not included in comparison] +Created Time [not included in comparison] Last Access [not included in comparison] +Created By [not included in comparison] Type VIEW View Text SELECT * FROM t View Default Database default diff --git a/sql/core/src/test/resources/sql-tests/results/show-tables.sql.out b/sql/core/src/test/resources/sql-tests/results/show-tables.sql.out index 8f2a54f7c2..da729cd757 100644 --- a/sql/core/src/test/resources/sql-tests/results/show-tables.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/show-tables.sql.out @@ -119,8 +119,9 @@ SHOW TABLE EXTENDED LIKE 'show_t*' struct<database:string,tableName:string,isTemporary:boolean,information:string> -- !query 12 output show_t3 true Table: show_t3 -Created [not included in comparison] +Created Time [not included in comparison] Last Access [not included in comparison] +Created By [not included in comparison] Type: VIEW Schema: root |-- e: integer (nullable = true) @@ -128,8 +129,9 @@ Schema: root showdb show_t1 false Database: showdb Table: show_t1 -Created [not included in comparison] +Created Time [not included in comparison] Last Access [not included in comparison] +Created By [not included in comparison] Type: MANAGED Provider: parquet Location [not included in comparison]sql/core/spark-warehouse/showdb.db/show_t1 @@ -144,8 +146,9 @@ Schema: root showdb show_t2 false Database: showdb Table: show_t2 -Created [not included in comparison] +Created Time [not included in comparison] Last Access [not included in comparison] +Created By [not included in comparison] Type: MANAGED Provider: parquet Location [not included in comparison]sql/core/spark-warehouse/showdb.db/show_t2 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala index d9130fdcfa..aa000bddf9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala @@ -228,7 +228,8 @@ class SQLQueryTestSuite extends QueryTest with SharedSQLContext { // Get answer, but also get rid of the #1234 expression ids that show up in explain plans val answer = df.queryExecution.hiveResultString().map(_.replaceAll("#\\d+", "#x") .replaceAll("Location.*/sql/core/", s"Location ${notIncludedMsg}sql/core/") - .replaceAll("Created.*", s"Created $notIncludedMsg") + .replaceAll("Created By.*", s"Created By $notIncludedMsg") + .replaceAll("Created Time.*", s"Created Time $notIncludedMsg") .replaceAll("Last Access.*", s"Last Access $notIncludedMsg")) // If the output is not pre-sorted, sort it. diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index 5c0a6aa724..9332f77343 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -68,6 +68,7 @@ class InMemoryCatalogedDDLSuite extends DDLSuite with SharedSQLContext with Befo provider = Some("parquet"), partitionColumnNames = Seq("a", "b"), createTime = 0L, + createVersion = org.apache.spark.SPARK_VERSION, tracksPartitionsInCatalog = true) } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala index 172317c346..19e5f78982 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala @@ -390,6 +390,9 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat val bucketSpec = table.bucketSpec val properties = new mutable.HashMap[String, String] + + properties.put(CREATED_SPARK_VERSION, table.createVersion) + // Serialized JSON schema string may be too long to be stored into a single metastore table // property. In this case, we split the JSON string and store each part as a separate table // property. @@ -594,7 +597,8 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat // Set the `schema`, `partitionColumnNames` and `bucketSpec` from the old table definition, // to retain the spark specific format if it is. val propsFromOldTable = oldTableDef.properties.filter { case (k, v) => - k.startsWith(DATASOURCE_PREFIX) || k.startsWith(STATISTICS_PREFIX) + k.startsWith(DATASOURCE_PREFIX) || k.startsWith(STATISTICS_PREFIX) || + k.startsWith(CREATED_SPARK_VERSION) } val newTableProps = propsFromOldTable ++ tableDefinition.properties + partitionProviderProp val newDef = tableDefinition.copy( @@ -700,6 +704,9 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat table = restoreDataSourceTable(table, provider) } + // Restore version info + val version: String = table.properties.getOrElse(CREATED_SPARK_VERSION, "2.2 or prior") + // Restore Spark's statistics from information in Metastore. val statsProps = table.properties.filterKeys(_.startsWith(STATISTICS_PREFIX)) @@ -735,6 +742,7 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat // Get the original table properties as defined by the user. table.copy( + createVersion = version, properties = table.properties.filterNot { case (key, _) => key.startsWith(SPARK_SQL_PREFIX) }) } @@ -1208,6 +1216,8 @@ object HiveExternalCatalog { val TABLE_PARTITION_PROVIDER_CATALOG = "catalog" val TABLE_PARTITION_PROVIDER_FILESYSTEM = "filesystem" + val CREATED_SPARK_VERSION = SPARK_SQL_PREFIX + "create.version" + /** * Returns the fully qualified name used in table properties for a particular column stat. * For example, for column "mycol", and "min" stat, this should return diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala index 5b62e37311..0007d25614 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala @@ -90,6 +90,7 @@ class HiveCatalogedDDLSuite extends DDLSuite with TestHiveSingleton with BeforeA provider = if (isDataSource) Some("parquet") else Some("hive"), partitionColumnNames = Seq("a", "b"), createTime = 0L, + createVersion = org.apache.spark.SPARK_VERSION, tracksPartitionsInCatalog = true) } -- GitLab