diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
index 1af2f9afea5eb421498a992925e51d8fb8d22b7c..365b50dee93c4d89e1d07c99e23dc4258c30daf0 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -159,7 +159,11 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
    * @since 1.4.0
    */
   def jdbc(url: String, table: String, properties: Properties): DataFrame = {
-    jdbc(url, table, JDBCRelation.columnPartition(null), properties)
+    // properties should override settings in extraOptions.
+    this.extraOptions = this.extraOptions ++ properties.asScala
+    // explicit url and dbtable should override all
+    this.extraOptions += (JDBCOptions.JDBC_URL -> url, JDBCOptions.JDBC_TABLE_NAME -> table)
+    format("jdbc").load()
   }
 
   /**
@@ -177,7 +181,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
    * @param upperBound the maximum value of `columnName` used to decide partition stride.
    * @param numPartitions the number of partitions. This, along with `lowerBound` (inclusive),
    *                      `upperBound` (exclusive), form partition strides for generated WHERE
-   *                      clause expressions used to split the column `columnName` evenly.
+   *                      clause expressions used to split the column `columnName` evenly. When
+   *                      the input is less than 1, the number is set to 1.
    * @param connectionProperties JDBC database connection arguments, a list of arbitrary string
    *                             tag/value. Normally at least a "user" and "password" property
    *                             should be included. "fetchsize" can be used to control the
@@ -192,9 +197,13 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
       upperBound: Long,
       numPartitions: Int,
       connectionProperties: Properties): DataFrame = {
-    val partitioning = JDBCPartitioningInfo(columnName, lowerBound, upperBound, numPartitions)
-    val parts = JDBCRelation.columnPartition(partitioning)
-    jdbc(url, table, parts, connectionProperties)
+    // columnName, lowerBound, upperBound and numPartitions override settings in extraOptions.
+    this.extraOptions ++= Map(
+      JDBCOptions.JDBC_PARTITION_COLUMN -> columnName,
+      JDBCOptions.JDBC_LOWER_BOUND -> lowerBound.toString,
+      JDBCOptions.JDBC_UPPER_BOUND -> upperBound.toString,
+      JDBCOptions.JDBC_NUM_PARTITIONS -> numPartitions.toString)
+    jdbc(url, table, connectionProperties)
   }
 
   /**
@@ -220,22 +229,14 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
       table: String,
       predicates: Array[String],
       connectionProperties: Properties): DataFrame = {
+    // connectionProperties should override settings in extraOptions.
+    val params = extraOptions.toMap ++ connectionProperties.asScala.toMap
+    val options = new JDBCOptions(url, table, params)
     val parts: Array[Partition] = predicates.zipWithIndex.map { case (part, i) =>
       JDBCPartition(part, i) : Partition
     }
-    jdbc(url, table, parts, connectionProperties)
-  }
-
-  private def jdbc(
-      url: String,
-      table: String,
-      parts: Array[Partition],
-      connectionProperties: Properties): DataFrame = {
-    // connectionProperties should override settings in extraOptions.
-    this.extraOptions = this.extraOptions ++ connectionProperties.asScala
-    // explicit url and dbtable should override all
-    this.extraOptions += ("url" -> url, "dbtable" -> table)
-    format("jdbc").load()
+    val relation = JDBCRelation(parts, options)(sparkSession)
+    sparkSession.baseRelationToDataFrame(relation)
   }
 
   /**
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCOptions.scala
index fe2f4c1d78647933008379497620820165a90baf..56cd17816f7bd6ae86ab590a1e9e3eaba605ca0c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCOptions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCOptions.scala
@@ -76,9 +76,6 @@ class JDBCOptions(
 
   // the number of partitions
   val numPartitions = parameters.get(JDBC_NUM_PARTITIONS).map(_.toInt)
-  require(numPartitions.isEmpty || numPartitions.get > 0,
-    s"Invalid value `${numPartitions.get}` for parameter `$JDBC_NUM_PARTITIONS`. " +
-      "The minimum value is 1.")
 
   // ------------------------------------------------------------
   // Optional parameters only for reading
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRelation.scala
index 672c21c6ac73458174d1349eb59a0a3a8b24ec63..30caa73adc1eae6914157d1f52d3e886c8c33db7 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRelation.scala
@@ -137,7 +137,8 @@ private[sql] case class JDBCRelation(
   }
 
   override def toString: String = {
+    val partitioningInfo = if (parts.nonEmpty) s" [numPartitions=${parts.length}]" else ""
     // credentials should not be included in the plan output, table information is sufficient.
-    s"JDBCRelation(${jdbcOptions.table})"
+    s"JDBCRelation(${jdbcOptions.table})" + partitioningInfo
   }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala
index c2a1ad84b952e5378cf977a9b6985c29b55d162a..ff29a15960c5702965e7b13d2392884135fcc840 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala
@@ -657,7 +657,7 @@ object JdbcUtils extends Logging {
       df: DataFrame,
       url: String,
       table: String,
-      options: JDBCOptions) {
+      options: JDBCOptions): Unit = {
     val dialect = JdbcDialects.get(url)
     val nullTypes: Array[Int] = df.schema.fields.map { field =>
       getJdbcType(field.dataType, dialect).jdbcNullType
@@ -667,13 +667,13 @@ object JdbcUtils extends Logging {
     val getConnection: () => Connection = createConnectionFactory(options)
     val batchSize = options.batchSize
     val isolationLevel = options.isolationLevel
-    val numPartitions = options.numPartitions
-    val repartitionedDF =
-      if (numPartitions.isDefined && numPartitions.get < df.rdd.getNumPartitions) {
-        df.coalesce(numPartitions.get)
-      } else {
-        df
-      }
+    val repartitionedDF = options.numPartitions match {
+      case Some(n) if n <= 0 => throw new IllegalArgumentException(
+        s"Invalid value `$n` for parameter `${JDBCOptions.JDBC_NUM_PARTITIONS}` in table writing " +
+          "via JDBC. The minimum value is 1.")
+      case Some(n) if n < df.rdd.getNumPartitions => df.coalesce(n)
+      case _ => df
+    }
     repartitionedDF.foreachPartition(iterator => savePartition(
       getConnection, table, iterator, rddSchema, nullTypes, batchSize, dialect, isolationLevel)
     )
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
index b16be457ed5c3bd3e78bd41f501f3765d8bf2e4f..218ccf9332cd67f12490677342583bda6f397716 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
@@ -24,12 +24,12 @@ import java.util.{Calendar, GregorianCalendar, Properties}
 import org.h2.jdbc.JdbcSQLException
 import org.scalatest.{BeforeAndAfter, PrivateMethodTester}
 
-import org.apache.spark.{SparkException, SparkFunSuite}
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.{DataFrame, Row}
 import org.apache.spark.sql.execution.DataSourceScanExec
 import org.apache.spark.sql.execution.command.ExplainCommand
 import org.apache.spark.sql.execution.datasources.LogicalRelation
-import org.apache.spark.sql.execution.datasources.jdbc.{JDBCOptions, JDBCRDD, JdbcUtils}
+import org.apache.spark.sql.execution.datasources.jdbc.{JDBCOptions, JDBCRDD, JDBCRelation, JdbcUtils}
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.test.SharedSQLContext
 import org.apache.spark.sql.types._
@@ -209,6 +209,16 @@ class JDBCSuite extends SparkFunSuite
     conn.close()
   }
 
+  // Check whether the tables are fetched in the expected degree of parallelism
+  def checkNumPartitions(df: DataFrame, expectedNumPartitions: Int): Unit = {
+    val jdbcRelations = df.queryExecution.analyzed.collect {
+      case LogicalRelation(r: JDBCRelation, _, _) => r
+    }
+    assert(jdbcRelations.length == 1)
+    assert(jdbcRelations.head.parts.length == expectedNumPartitions,
+      s"Expecting a JDBCRelation with $expectedNumPartitions partitions, but got:`$jdbcRelations`")
+  }
+
   test("SELECT *") {
     assert(sql("SELECT * FROM foobar").collect().size === 3)
   }
@@ -313,13 +323,23 @@ class JDBCSuite extends SparkFunSuite
   }
 
   test("SELECT * partitioned") {
-    assert(sql("SELECT * FROM parts").collect().size == 3)
+    val df = sql("SELECT * FROM parts")
+    checkNumPartitions(df, expectedNumPartitions = 3)
+    assert(df.collect().length == 3)
   }
 
   test("SELECT WHERE (simple predicates) partitioned") {
-    assert(sql("SELECT * FROM parts WHERE THEID < 1").collect().size === 0)
-    assert(sql("SELECT * FROM parts WHERE THEID != 2").collect().size === 2)
-    assert(sql("SELECT THEID FROM parts WHERE THEID = 1").collect().size === 1)
+    val df1 = sql("SELECT * FROM parts WHERE THEID < 1")
+    checkNumPartitions(df1, expectedNumPartitions = 3)
+    assert(df1.collect().length === 0)
+
+    val df2 = sql("SELECT * FROM parts WHERE THEID != 2")
+    checkNumPartitions(df2, expectedNumPartitions = 3)
+    assert(df2.collect().length === 2)
+
+    val df3 = sql("SELECT THEID FROM parts WHERE THEID = 1")
+    checkNumPartitions(df3, expectedNumPartitions = 3)
+    assert(df3.collect().length === 1)
   }
 
   test("SELECT second field partitioned") {
@@ -370,24 +390,27 @@ class JDBCSuite extends SparkFunSuite
   }
 
   test("Partitioning via JDBCPartitioningInfo API") {
-    assert(
-      spark.read.jdbc(urlWithUserAndPass, "TEST.PEOPLE", "THEID", 0, 4, 3, new Properties())
-      .collect().length === 3)
+    val df = spark.read.jdbc(urlWithUserAndPass, "TEST.PEOPLE", "THEID", 0, 4, 3, new Properties())
+    checkNumPartitions(df, expectedNumPartitions = 3)
+    assert(df.collect().length === 3)
   }
 
   test("Partitioning via list-of-where-clauses API") {
     val parts = Array[String]("THEID < 2", "THEID >= 2")
-    assert(spark.read.jdbc(urlWithUserAndPass, "TEST.PEOPLE", parts, new Properties())
-      .collect().length === 3)
+    val df = spark.read.jdbc(urlWithUserAndPass, "TEST.PEOPLE", parts, new Properties())
+    checkNumPartitions(df, expectedNumPartitions = 2)
+    assert(df.collect().length === 3)
   }
 
   test("Partitioning on column that might have null values.") {
-    assert(
-      spark.read.jdbc(urlWithUserAndPass, "TEST.EMP", "theid", 0, 4, 3, new Properties())
-      .collect().length === 4)
-    assert(
-      spark.read.jdbc(urlWithUserAndPass, "TEST.EMP", "THEID", 0, 4, 3, new Properties())
-      .collect().length === 4)
+    val df = spark.read.jdbc(urlWithUserAndPass, "TEST.EMP", "theid", 0, 4, 3, new Properties())
+    checkNumPartitions(df, expectedNumPartitions = 3)
+    assert(df.collect().length === 4)
+
+    val df2 = spark.read.jdbc(urlWithUserAndPass, "TEST.EMP", "THEID", 0, 4, 3, new Properties())
+    checkNumPartitions(df2, expectedNumPartitions = 3)
+    assert(df2.collect().length === 4)
+
     // partitioning on a nullable quoted column
     assert(
       spark.read.jdbc(urlWithUserAndPass, "TEST.EMP", """"Dept"""", 0, 4, 3, new Properties())
@@ -404,6 +427,7 @@ class JDBCSuite extends SparkFunSuite
       numPartitions = 0,
       connectionProperties = new Properties()
     )
+    checkNumPartitions(res, expectedNumPartitions = 1)
     assert(res.count() === 8)
   }
 
@@ -417,6 +441,7 @@ class JDBCSuite extends SparkFunSuite
       numPartitions = 10,
       connectionProperties = new Properties()
    )
+    checkNumPartitions(res, expectedNumPartitions = 4)
     assert(res.count() === 8)
   }
 
@@ -430,6 +455,7 @@ class JDBCSuite extends SparkFunSuite
       numPartitions = 4,
       connectionProperties = new Properties()
     )
+    checkNumPartitions(res, expectedNumPartitions = 1)
     assert(res.count() === 8)
   }
 
@@ -450,7 +476,9 @@ class JDBCSuite extends SparkFunSuite
   }
 
   test("SELECT * on partitioned table with a nullable partition column") {
-    assert(sql("SELECT * FROM nullparts").collect().size == 4)
+    val df = sql("SELECT * FROM nullparts")
+    checkNumPartitions(df, expectedNumPartitions = 3)
+    assert(df.collect().length == 4)
   }
 
   test("H2 integral types") {
@@ -722,7 +750,8 @@ class JDBCSuite extends SparkFunSuite
     }
     // test the JdbcRelation toString output
     df.queryExecution.analyzed.collect {
-      case r: LogicalRelation => assert(r.relation.toString == "JDBCRelation(TEST.PEOPLE)")
+      case r: LogicalRelation =>
+        assert(r.relation.toString == "JDBCRelation(TEST.PEOPLE) [numPartitions=3]")
     }
   }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala
index c834419948c530a588da56ad47e29ce43124bd95..064958bc1a14cbc713ea40b6551102ceef44cd02 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala
@@ -319,9 +319,12 @@ class JDBCWriteSuite extends SharedSQLContext with BeforeAndAfter {
       df.write.format("jdbc")
         .option("dbtable", "TEST.SAVETEST")
        .option("url", url1)
+        .option("user", "testUser")
+        .option("password", "testPass")
         .option(s"${JDBCOptions.JDBC_NUM_PARTITIONS}", "0")
         .save()
     }.getMessage
-    assert(e.contains("Invalid value `0` for parameter `numPartitions`. The minimum value is 1"))
+    assert(e.contains("Invalid value `0` for parameter `numPartitions` in table writing " +
+      "via JDBC. The minimum value is 1."))
  }
 }
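
For reference, a minimal usage sketch (not part of the patch): after this change the `DataFrameReader.jdbc` overloads only populate JDBC data source options, so a partitioned read through the explicit-bounds overload and one through `.option(...)` calls are expected to take the same JDBCOptions-based path. The H2 URL and the TEST.PEOPLE/THEID names below are illustrative placeholders borrowed from the test suites, not part of any public API.

// Sketch only: assumes a local SparkSession and an H2 table TEST.PEOPLE with an integer column THEID.
import java.util.Properties
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").appName("jdbc-partitioning-sketch").getOrCreate()

// Overload with explicit partitioning arguments; now forwarded internally as options.
val viaOverload = spark.read
  .jdbc("jdbc:h2:mem:testdb", "TEST.PEOPLE", "THEID", 0L, 4L, 3, new Properties())

// Equivalent option-based read using the standard JDBC source option names.
val viaOptions = spark.read.format("jdbc")
  .option("url", "jdbc:h2:mem:testdb")
  .option("dbtable", "TEST.PEOPLE")
  .option("partitionColumn", "THEID")
  .option("lowerBound", "0")
  .option("upperBound", "4")
  .option("numPartitions", "3")
  .load()

// Note: on the write path, a numPartitions value below 1 is no longer rejected when the
// options are parsed; it now fails at save time with the "in table writing via JDBC" message.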