Search code examples
python apache-spark pyspark nlp johnsnowlabs-spark-nlp

Sparknlp Java Error While Trying to Display Model Results


I'm trying to output the results from a practice NLP model created using Spark-NLP. However, I keep getting the error below. Can anyone help me out here? The .show() method works earlier in the code, when I output the input dataframe. It only fails when I attempt to output any part of the model results.

I'm running the code from Jupyter Notebook on a Windows machine. I have PySpark (spark-3.0.3) with Hadoop 2.7 installed on my machine.

Code Used

import findspark
findspark.init()
findspark.find()
import pyspark

import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

# Practice Spark NLP pipeline: spell-check the tokens, embed them, tag named
# entities and print the resulting entity chunks.

# Session object used below to create the input DataFrame.
spark = sparknlp.start()

# One-row DataFrame with a single 'text' column. The misspelling "godo" is
# presumably deliberate, so the spell checker has something to correct.
data = spark.createDataFrame([['Peter is a godo person living in Germany. Paula is also a good person. She lives in London']]).toDF('text')

data.show(truncate=False)

# Stage 1: raw 'text' column -> 'document' column.
document = DocumentAssembler().setInputCol('text').setOutputCol('document').setCleanupMode('shrink')

# Stage 2: 'document' -> 'sentence'.
sentence = SentenceDetector().setInputCols('document').setOutputCol('sentence')

sentence.setExplodeSentences(True)

# Stage 3: 'sentence' -> 'token'.
tokenizer = Tokenizer().setInputCols('sentence').setOutputCol('token')

# Stage 4: pretrained Norvig-Sweeting spell checker, 'token' -> 'checked'.
# This is the annotator named in the Java stack trace when .show() fails.
checker = NorvigSweetingModel.pretrained().setInputCols(['token']).setOutputCol('checked')

# Stage 5: pretrained word embeddings computed over the spell-checked tokens.
embeddings = WordEmbeddingsModel.pretrained().setInputCols(['sentence','checked']).setOutputCol('embeddings')

# Stage 6: pretrained NER tagger. The class exported by sparknlp.annotator is
# `NerDLModel`; the lowercase spelling `nerDLModel` raises NameError.
ner = NerDLModel.pretrained().setInputCols(['sentence','checked','embeddings']).setOutputCol('ner')

# Stage 7: 'ner' tags -> 'chunk' column.
converter = NerConverter().setInputCols(['sentence','checked','ner']).setOutputCol('chunk')

pipeline = Pipeline().setStages([document,sentence,tokenizer,checker,embeddings,ner,converter])

model = pipeline.fit(data)

result = model.transform(data)

#LINE THAT TRIGGERS ERROR
# The traceback is raised from this .show() call.
result.select('chunk.result').show(truncate=False)

ERROR

---------------------------------------------------------------------------
Py4JJavaError                             Traceback (most recent call last)
<ipython-input-75-4f3ba5a75c4a> in <module>
----> 1 result.select('chunk.result').show(truncate=False)

C:\Spark\python\pyspark\sql\dataframe.py in show(self, n, truncate, vertical)
    440             print(self._jdf.showString(n, 20, vertical))
    441         else:
--> 442             print(self._jdf.showString(n, int(truncate), vertical))
    443 
    444     def __repr__(self):

C:\Spark\python\lib\py4j-0.10.9-src.zip\py4j\java_gateway.py in __call__(self, *args)
   1302 
   1303         answer = self.gateway_client.send_command(command)
-> 1304         return_value = get_return_value(
   1305             answer, self.gateway_client, self.target_id, self.name)
   1306 

C:\Spark\python\pyspark\sql\utils.py in deco(*a, **kw)
    126     def deco(*a, **kw):
    127         try:
--> 128             return f(*a, **kw)
    129         except py4j.protocol.Py4JJavaError as e:
    130             converted = convert_exception(e.java_exception)

C:\Spark\python\lib\py4j-0.10.9-src.zip\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
    324             value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
    325             if answer[1] == REFERENCE_TYPE:
--> 326                 raise Py4JJavaError(
    327                     "An error occurred while calling {0}{1}{2}.\n".
    328                     format(target_id, ".", name), value)

Py4JJavaError: An error occurred while calling o1393.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 6 in stage 39.0 failed 1 times, most recent failure: Lost task 6.0 in stage 39.0 (TID 174, DESKTOP-G6LQ7L8, executor driver): org.apache.spark.SparkException: Failed to execute user defined function(HasSimpleAnnotate$$Lambda$2720/1692472191: (array<array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>>) => array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>)
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
    at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
    at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:729)
    at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
    at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
    at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1209)
    at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1215)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:489)
    at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
    at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.processNext(Unknown Source)
    at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
    at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:729)
    at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:345)
    at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:872)
    at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:872)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
    at org.apache.spark.scheduler.Task.run(Task.scala:127)
    at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:463)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:466)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
    at java.lang.Thread.run(Unknown Source)
Caused by: java.lang.Exception: feature Number of words in the dictionary is not set
    at com.johnsnowlabs.nlp.serialization.Feature.$anonfun$getOrDefault$1(Feature.scala:81)
    at scala.Option.getOrElse(Option.scala:189)
    at com.johnsnowlabs.nlp.serialization.Feature.getOrDefault(Feature.scala:81)
    at com.johnsnowlabs.nlp.HasFeatures.$$(HasFeatures.scala:39)
    at com.johnsnowlabs.nlp.HasFeatures.$$$(HasFeatures.scala:39)
    at com.johnsnowlabs.nlp.AnnotatorModel.$$(AnnotatorModel.scala:14)
    at com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel.allWords$lzycompute(NorvigSweetingModel.scala:125)
    at com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel.allWords(NorvigSweetingModel.scala:124)
    at com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel.getSuggestion(NorvigSweetingModel.scala:189)
    at com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel.getBestSpellingSuggestion(NorvigSweetingModel.scala:170)
    at com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel.checkSpellWord(NorvigSweetingModel.scala:154)
    at com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel.$anonfun$annotate$1(NorvigSweetingModel.scala:137)
    at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:238)
    at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
    at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
    at scala.collection.TraversableLike.map(TraversableLike.scala:238)
    at scala.collection.TraversableLike.map$(TraversableLike.scala:231)
    at scala.collection.AbstractTraversable.map(Traversable.scala:108)
    at com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel.annotate(NorvigSweetingModel.scala:136)
    at com.johnsnowlabs.nlp.HasSimpleAnnotate.$anonfun$dfAnnotate$1(HasSimpleAnnotate.scala:24)
    ... 27 more

Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2059)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2008)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2007)
    at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
    at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2007)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:973)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:973)
    at scala.Option.foreach(Option.scala:407)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:973)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2239)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2188)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2177)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:775)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2114)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2135)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2154)
    at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:472)
    at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:425)
    at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:47)
    at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3627)
    at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2697)
    at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3618)
    at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100)
    at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)
    at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87)
    at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:767)
    at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
    at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3616)
    at org.apache.spark.sql.Dataset.head(Dataset.scala:2697)
    at org.apache.spark.sql.Dataset.take(Dataset.scala:2904)
    at org.apache.spark.sql.Dataset.getRows(Dataset.scala:300)
    at org.apache.spark.sql.Dataset.showString(Dataset.scala:337)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
    at java.lang.reflect.Method.invoke(Unknown Source)
    at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
    at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
    at py4j.Gateway.invoke(Gateway.java:282)
    at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
    at py4j.commands.CallCommand.execute(CallCommand.java:79)
    at py4j.GatewayConnection.run(GatewayConnection.java:238)
    at java.lang.Thread.run(Unknown Source)
Caused by: org.apache.spark.SparkException: Failed to execute user defined function(HasSimpleAnnotate$$Lambda$2720/1692472191: (array<array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>>) => array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>)
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
    at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
    at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:729)
    at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
    at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
    at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1209)
    at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1215)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:489)
    at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
    at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.processNext(Unknown Source)
    at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
    at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:729)
    at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:345)
    at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:872)
    at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:872)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
    at org.apache.spark.scheduler.Task.run(Task.scala:127)
    at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:463)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:466)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
    ... 1 more
Caused by: java.lang.Exception: feature Number of words in the dictionary is not set
    at com.johnsnowlabs.nlp.serialization.Feature.$anonfun$getOrDefault$1(Feature.scala:81)
    at scala.Option.getOrElse(Option.scala:189)
    at com.johnsnowlabs.nlp.serialization.Feature.getOrDefault(Feature.scala:81)
    at com.johnsnowlabs.nlp.HasFeatures.$$(HasFeatures.scala:39)
    at com.johnsnowlabs.nlp.HasFeatures.$$$(HasFeatures.scala:39)
    at com.johnsnowlabs.nlp.AnnotatorModel.$$(AnnotatorModel.scala:14)
    at com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel.allWords$lzycompute(NorvigSweetingModel.scala:125)
    at com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel.allWords(NorvigSweetingModel.scala:124)
    at com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel.getSuggestion(NorvigSweetingModel.scala:189)
    at com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel.getBestSpellingSuggestion(NorvigSweetingModel.scala:170)
    at com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel.checkSpellWord(NorvigSweetingModel.scala:154)
    at com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel.$anonfun$annotate$1(NorvigSweetingModel.scala:137)
    at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:238)
    at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
    at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
    at scala.collection.TraversableLike.map(TraversableLike.scala:238)
    at scala.collection.TraversableLike.map$(TraversableLike.scala:231)
    at scala.collection.AbstractTraversable.map(Traversable.scala:108)
    at com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel.annotate(NorvigSweetingModel.scala:136)
    at com.johnsnowlabs.nlp.HasSimpleAnnotate.$anonfun$dfAnnotate$1(HasSimpleAnnotate.scala:24)
    ... 27 more

Solution

  • I figured out the problem. I had installed sparknlp using conda, but I had also installed it using pip. For some reason, Jupyter would not recognize sparknlp when I installed it using conda alone. However, having both versions installed at the same time caused this error. I uninstalled the conda version and kept only the pip installation. This solved the problem.