I am trying to save dataframe to TFrecord file in spark-shell which need the dependency of spark-tensorflow-connector jar, so i run the
spark-shell --jars xxx/xxx/spark-tensorflow-connector_2.11-1.11.0.jar
then run the code below in spark-shell:
scala> import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
scala> val df = Seq((8, "bat"),(8, "abc"), (1, "xyz"), (2, "aaa")).toDF("number", "word")
df: org.apache.spark.sql.DataFrame = [number: int, word: string]
scala> df.show
+------+----+
|number|word|
+------+----+
| 8| bat|
| 8| abc|
| 1| xyz|
| 2| aaa|
+------+----+
scala> var s = df.write.mode(SaveMode.Overwrite).format("tfrecords").option("recordType", "Example")
s: org.apache.spark.sql.DataFrameWriter[org.apache.spark.sql.Row] = org.apache.spark.sql.DataFrameWriter@da1382f
scala> s.save("tmp/tfrecords")
java.lang.NoClassDefFoundError: scala/Product$class
at org.tensorflow.spark.datasources.tfrecords.TensorflowRelation.<init>(TensorflowRelation.scala:29)
at org.tensorflow.spark.datasources.tfrecords.DefaultSource.createRelation(DefaultSource.scala:78)
at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:46)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:90)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:175)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:171)
at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:122)
at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:121)
at org.apache.spark.sql.DataFrameWriter.$anonfun$runCommand$1(DataFrameWriter.scala:944)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:763)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:944)
at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:396)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:380)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:269)
... 47 elided
Caused by: java.lang.ClassNotFoundException: scala.Product$class
at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
at java.lang.ClassLoader.loadClass(ClassLoader.java:418)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:355)
at java.lang.ClassLoader.loadClass(ClassLoader.java:351)
... 70 more
the Spark version is 3.0.0 with Using Scala version 2.12.10 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_261)
The problem is that you're using Tensorflow connector compiled with Scala 2.11 (notice the _2.11
part in the name of the jar) with Spark 3.0 that is compiled with Scala 2.12.
As of right now, there is no Tensorflow connector compiled for Spark 3.0, so you need to take Spark 2.4.6 that is compiled with Scala 2.11.