I am unable to build a fat jar for my Kafka-SparkStructuredStreaming-MongoDB pipeline.
I have built StructuredStreamingProgram: receives streaming data from Kafka Topics and apply some parsing and then my intention is to save the structured streaming data into a MongoDB collection.
I have followed this article to build my pipeline https://learningfromdata.blog/2017/04/16/real-time-data-ingestion-with-apache-spark-structured-streaming-implementation/
I have created Helpers.scala and MongoDBForeachWriter.scala as suggested in the article for my streaming pipeline and save it under src/main/scala/example
When i do sbt assembly to build a fat jar i face this errors;
"[error] C:\spark_streaming\src\main\scala\example\structuredStreamApp.scala:63: class MongoDBForeachWriter is abstract; cannot be instantiated
[error] val structuredStreamForeachWriter: MongoDBForeachWriter = new MongoDBForeachWriter(mongodb_uri,mdb_name,mdb_collection,CountAccum)"
I need guidance in making this pipeline work.
Any help will be appreciated
package example
import java.util.Calendar
import org.apache.spark.util.LongAccumulator
import org.apache.spark.sql.Row
import org.apache.spark.sql.ForeachWriter
import org.mongodb.scala._
import org.mongodb.scala.bson.collection.mutable.Document
import org.mongodb.scala.bson._
import example.Helpers._
abstract class MongoDBForeachWriter(p_uri: String,
p_dbName: String,
p_collectionName: String,
p_messageCountAccum: LongAccumulator) extends ForeachWriter[Row] {
val mongodbURI = p_uri
val dbName = p_dbName
val collectionName = p_collectionName
val messageCountAccum = p_messageCountAccum
var mongoClient: MongoClient = null
var db: MongoDatabase = null
var collection: MongoCollection[Document] = null
def ensureMongoDBConnection(): Unit = {
if (mongoClient == null) {
mongoClient = MongoClient(mongodbURI)
db = mongoClient.getDatabase(dbName)
collection = db.getCollection(collectionName)
}
}
override def open(partitionId: Long, version: Long): Boolean = {
true
}
override def process(record: Row): Unit = {
val valueStr = new String(record.getAs[Array[Byte]]("value"))
val doc: Document = Document(valueStr)
doc += ("log_time" -> Calendar.getInstance().getTime())
// lazy opening of MongoDB connection
ensureMongoDBConnection()
val result = collection.insertOne(doc).results()
// tracks how many records I have processed
if (messageCountAccum != null)
messageCountAccum.add(1)
}
}
package example
import java.util.concurrent.TimeUnit
import scala.concurrent.Await
import scala.concurrent.duration.Duration
import org.mongodb.scala._
object Helpers {
implicit class DocumentObservable[C](val observable: Observable[Document]) extends ImplicitObservable[Document] {
override val converter: (Document) => String = (doc) => doc.toJson
}
implicit class GenericObservable[C](val observable: Observable[C]) extends ImplicitObservable[C] {
override val converter: (C) => String = (doc) => doc.toString
}
trait ImplicitObservable[C] {
val observable: Observable[C]
val converter: (C) => String
def results(): Seq[C] = Await.result(observable.toFuture(), Duration(10, TimeUnit.SECONDS))
def headResult() = Await.result(observable.head(), Duration(10, TimeUnit.SECONDS))
def printResults(initial: String = ""): Unit = {
if (initial.length > 0) print(initial)
results().foreach(res => println(converter(res)))
}
def printHeadResult(initial: String = ""): Unit = println(s"${initial}${converter(headResult())}")
}
}
package example
import org.apache.spark.sql.functions.{col, _}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.util.LongAccumulator
import example.Helpers._
import java.util.Calendar
object StructuredStreamingProgram {
def main(args: Array[String]): Unit = {
val spark = SparkSession
.builder()
.appName("OSB_Streaming_Model")
.getOrCreate()
import spark.implicits._
val df = spark
.readStream
.format("kafka")
.option("kafka.bootstrap.servers", "10.160.172.45:9092, 10.160.172.46:9092, 10.160.172.100:9092")
.option("subscribe", "TOPIC_WITH_COMP_P2_R2, TOPIC_WITH_COMP_P2_R2.DIT, TOPIC_WITHOUT_COMP_P2_R2.DIT")
.load()
val dfs = df.selectExpr("CAST(value AS STRING)").toDF()
val data =dfs.withColumn("splitted", split($"SERVICE_NAME8", "/"))
.select($"splitted".getItem(4).alias("region"),$"splitted".getItem(5).alias("service"),col("_raw"))
.withColumn("service_type", regexp_extract($"service", """.*(Inbound|Outbound|Outound).*""",1))
.withColumn("region_type", concat(
when(col("region").isNotNull,col("region")).otherwise(lit("null")), lit(" "),
when(col("service").isNotNull,col("service_type")).otherwise(lit("null"))))
val extractedDF = data.filter(
col("region").isNotNull &&
col("service").isNotNull &&
col("_raw").isNotNull &&
col("service_type").isNotNull &&
col("region_type").isNotNull)
.filter("region != ''")
.filter("service != ''")
.filter("_raw != ''")
.filter("service_type != ''")
.filter("region_type != ''")
// sends to MongoDB once every 20 seconds
val mongodb_uri = "mongodb://dstk8sdev06.us.dell.com/?maxPoolSize=1"
val mdb_name = "HANZO_MDB"
val mdb_collection = "Testing_Spark"
val CountAccum: LongAccumulator = spark.sparkContext.longAccumulator("mongostreamcount")
val structuredStreamForeachWriter: MongoDBForeachWriter = new MongoDBForeachWriter(mongodb_uri,mdb_name,mdb_collection,CountAccum)
val query = df.writeStream
.foreach(structuredStreamForeachWriter)
.trigger(Trigger.ProcessingTime("20 seconds"))
.start()
while (!spark.streams.awaitAnyTermination(60000)) {
println(Calendar.getInstance().getTime()+" :: mongoEventsCount = "+CountAccum.value)
}
}
}
with the above by doing corrections i would need to be able to save the structured streaming data into mongodb
You can instantiate object for abstract class. To resolve this issue, implement close function in MongoDBForeachWriter class and make it as as concrete class.
class MongoDBForeachWriter(p_uri: String,
p_dbName: String,
p_collectionName: String,
p_messageCountAccum: LongAccumulator) extends ForeachWriter[Row] {
val mongodbURI = p_uri
val dbName = p_dbName
val collectionName = p_collectionName
val messageCountAccum = p_messageCountAccum
var mongoClient: MongoClient = null
var db: MongoDatabase = null
var collection: MongoCollection[Document] = null
def ensureMongoDBConnection(): Unit = {
if (mongoClient == null) {
mongoClient = MongoClient(mongodbURI)
db = mongoClient.getDatabase(dbName)
collection = db.getCollection(collectionName)
}
}
override def open(partitionId: Long, version: Long): Boolean = {
true
}
override def process(record: Row): Unit = {
val valueStr = new String(record.getAs[Array[Byte]]("value"))
val doc: Document = Document(valueStr)
doc += ("log_time" -> Calendar.getInstance().getTime())
// lazy opening of MongoDB connection
ensureMongoDBConnection()
val result = collection.insertOne(doc)
// tracks how many records I have processed
if (messageCountAccum != null)
messageCountAccum.add(1)
}
override def close(errorOrNull: Throwable): Unit = {
if(mongoClient != null) {
Try {
mongoClient.close()
}
}
}
}
Hope this helps.
Ravi