I'm trying to output the results from a practice NLP model created using Spark-NLP, but I keep getting the error below. Can anyone help me out? The .show() method works earlier in the code, when I output the input dataframe. It only fails when I attempt to output any part of the model results.
I'm running the code from a Jupyter Notebook on a Windows machine. I have PySpark (spark-3.0.3 with Hadoop 2.7) installed on my machine.
Code Used
import findspark
findspark.init()
findspark.find()
import pyspark
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
# Start a Spark session through the Spark NLP helper.
spark = sparknlp.start()

# One-row sample DataFrame with a single 'text' column.
# NOTE: "godo" is misspelled in the sample -- presumably on purpose, so the
# spell checker stage has something to correct. Do not "fix" it.
data = spark.createDataFrame([['Peter is a godo person living in Germany. Paula is also a good person. She lives in London']]).toDF('text')
data.show(truncate=False)

# Stage 1: raw text -> 'document' annotations.
document = DocumentAssembler().setInputCol('text').setOutputCol('document').setCleanupMode('shrink')

# Stage 2: 'document' -> 'sentence' annotations, with sentence exploding enabled.
sentence = SentenceDetector().setInputCols('document').setOutputCol('sentence')
sentence.setExplodeSentences(True)

# Stage 3: 'sentence' -> 'token' annotations.
tokenizer = Tokenizer().setInputCols('sentence').setOutputCol('token')

# Stage 4: pretrained Norvig spell checker; later stages consume 'checked'
# instead of the raw tokens.
checker = NorvigSweetingModel.pretrained().setInputCols(['token']).setOutputCol('checked')

# Stage 5: pretrained word embeddings computed over the corrected tokens.
embeddings = WordEmbeddingsModel.pretrained().setInputCols(['sentence','checked']).setOutputCol('embeddings')

# Stage 6: pretrained NER tagger.
# The annotator class is `NerDLModel` (capital N); the lower-case
# `nerDLModel` is not defined by `sparknlp.annotator` and raises NameError.
ner = NerDLModel.pretrained().setInputCols(['sentence','checked','embeddings']).setOutputCol('ner')

# Stage 7: convert the NER tags into entity chunks.
converter = NerConverter().setInputCols(['sentence','checked','ner']).setOutputCol('chunk')

# Assemble the stages in dependency order, then fit and apply the pipeline.
pipeline = Pipeline().setStages([document,sentence,tokenizer,checker,embeddings,ner,converter])
model = pipeline.fit(data)
result = model.transform(data)

#LINE THAT TRIGGERS ERROR
# The failure surfaces here, when show() forces evaluation of the 'chunk'
# column; the reported root cause is raised inside NorvigSweetingModel.
result.select('chunk.result').show(truncate=False)
ERROR
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-75-4f3ba5a75c4a> in <module>
----> 1 result.select('chunk.result').show(truncate=False)
C:\Spark\python\pyspark\sql\dataframe.py in show(self, n, truncate, vertical)
440 print(self._jdf.showString(n, 20, vertical))
441 else:
--> 442 print(self._jdf.showString(n, int(truncate), vertical))
443
444 def __repr__(self):
C:\Spark\python\lib\py4j-0.10.9-src.zip\py4j\java_gateway.py in __call__(self, *args)
1302
1303 answer = self.gateway_client.send_command(command)
-> 1304 return_value = get_return_value(
1305 answer, self.gateway_client, self.target_id, self.name)
1306
C:\Spark\python\pyspark\sql\utils.py in deco(*a, **kw)
126 def deco(*a, **kw):
127 try:
--> 128 return f(*a, **kw)
129 except py4j.protocol.Py4JJavaError as e:
130 converted = convert_exception(e.java_exception)
C:\Spark\python\lib\py4j-0.10.9-src.zip\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling o1393.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 6 in stage 39.0 failed 1 times, most recent failure: Lost task 6.0 in stage 39.0 (TID 174, DESKTOP-G6LQ7L8, executor driver): org.apache.spark.SparkException: Failed to execute user defined function(HasSimpleAnnotate$$Lambda$2720/1692472191: (array<array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>>) => array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:729)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1209)
at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1215)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:489)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:729)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:345)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:872)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:872)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:127)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:463)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:466)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
Caused by: java.lang.Exception: feature Number of words in the dictionary is not set
at com.johnsnowlabs.nlp.serialization.Feature.$anonfun$getOrDefault$1(Feature.scala:81)
at scala.Option.getOrElse(Option.scala:189)
at com.johnsnowlabs.nlp.serialization.Feature.getOrDefault(Feature.scala:81)
at com.johnsnowlabs.nlp.HasFeatures.$$(HasFeatures.scala:39)
at com.johnsnowlabs.nlp.HasFeatures.$$$(HasFeatures.scala:39)
at com.johnsnowlabs.nlp.AnnotatorModel.$$(AnnotatorModel.scala:14)
at com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel.allWords$lzycompute(NorvigSweetingModel.scala:125)
at com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel.allWords(NorvigSweetingModel.scala:124)
at com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel.getSuggestion(NorvigSweetingModel.scala:189)
at com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel.getBestSpellingSuggestion(NorvigSweetingModel.scala:170)
at com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel.checkSpellWord(NorvigSweetingModel.scala:154)
at com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel.$anonfun$annotate$1(NorvigSweetingModel.scala:137)
at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:238)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at scala.collection.TraversableLike.map(TraversableLike.scala:238)
at scala.collection.TraversableLike.map$(TraversableLike.scala:231)
at scala.collection.AbstractTraversable.map(Traversable.scala:108)
at com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel.annotate(NorvigSweetingModel.scala:136)
at com.johnsnowlabs.nlp.HasSimpleAnnotate.$anonfun$dfAnnotate$1(HasSimpleAnnotate.scala:24)
... 27 more
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2059)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2008)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2007)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2007)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:973)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:973)
at scala.Option.foreach(Option.scala:407)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:973)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2239)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2188)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2177)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:775)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2114)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2135)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2154)
at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:472)
at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:425)
at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:47)
at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3627)
at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2697)
at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3618)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:767)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3616)
at org.apache.spark.sql.Dataset.head(Dataset.scala:2697)
at org.apache.spark.sql.Dataset.take(Dataset.scala:2904)
at org.apache.spark.sql.Dataset.getRows(Dataset.scala:300)
at org.apache.spark.sql.Dataset.showString(Dataset.scala:337)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
at java.lang.reflect.Method.invoke(Unknown Source)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Unknown Source)
Caused by: org.apache.spark.SparkException: Failed to execute user defined function(HasSimpleAnnotate$$Lambda$2720/1692472191: (array<array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>>) => array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:729)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1209)
at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1215)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:489)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:729)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:345)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:872)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:872)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:127)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:463)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:466)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
... 1 more
Caused by: java.lang.Exception: feature Number of words in the dictionary is not set
at com.johnsnowlabs.nlp.serialization.Feature.$anonfun$getOrDefault$1(Feature.scala:81)
at scala.Option.getOrElse(Option.scala:189)
at com.johnsnowlabs.nlp.serialization.Feature.getOrDefault(Feature.scala:81)
at com.johnsnowlabs.nlp.HasFeatures.$$(HasFeatures.scala:39)
at com.johnsnowlabs.nlp.HasFeatures.$$$(HasFeatures.scala:39)
at com.johnsnowlabs.nlp.AnnotatorModel.$$(AnnotatorModel.scala:14)
at com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel.allWords$lzycompute(NorvigSweetingModel.scala:125)
at com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel.allWords(NorvigSweetingModel.scala:124)
at com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel.getSuggestion(NorvigSweetingModel.scala:189)
at com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel.getBestSpellingSuggestion(NorvigSweetingModel.scala:170)
at com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel.checkSpellWord(NorvigSweetingModel.scala:154)
at com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel.$anonfun$annotate$1(NorvigSweetingModel.scala:137)
at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:238)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at scala.collection.TraversableLike.map(TraversableLike.scala:238)
at scala.collection.TraversableLike.map$(TraversableLike.scala:231)
at scala.collection.AbstractTraversable.map(Traversable.scala:108)
at com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel.annotate(NorvigSweetingModel.scala:136)
at com.johnsnowlabs.nlp.HasSimpleAnnotate.$anonfun$dfAnnotate$1(HasSimpleAnnotate.scala:24)
... 27 more
I figured out the problem. I had installed sparknlp using conda, but I had also installed it using pip. For some reason, Jupyter will not recognize sparknlp when I install it using conda. However, having both versions installed at the same time causes this error. I uninstalled the conda version and left only the pip version installed. This solved the problem.