Can you kindly show me how to start a Spark session on a Google Cloud Vertex AI Workbench JupyterLab notebook?
This works fine in Google Colaboratory, by the way.
What is missing here?
# Install Spark NLP from PyPI
!pip install -q spark-nlp==4.0.1 pyspark==3.3.0
import os
import sys
# https://github.com/jupyter/jupyter/issues/248
os.environ["JAVA_HOME"] = "C:/Program Files/Java/jdk-18.0.1.1"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
import sparknlp
from sparknlp.base import *
from sparknlp.common import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pandas as pd
spark = sparknlp.start()
print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)
spark
UPDATE_2022-07-21:
Hi @Sayan. I am still not able to start a Spark session on the Vertex AI Workbench JupyterLab notebook after running the commands =(
# Install Spark NLP from PyPI
!pip install -q spark-nlp==4.0.1 pyspark==3.3.0
import os
# Included else "JAVA_HOME is not set"
# https://github.com/jupyter/jupyter/issues/248
os.environ["JAVA_HOME"] = "C:/Program Files/Java/jdk-18.0.1.1"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
import sparknlp
spark = sparknlp.start()
print("Spark NLP version: {}".format(sparknlp.version()))
print("Apache Spark version: {}".format(spark.version))
The error:
/opt/conda/lib/python3.7/site-packages/pyspark/bin/spark-class: line 71: C:/Program Files/Java/jdk-18.0.1.1/bin/java: No such file or directory
/opt/conda/lib/python3.7/site-packages/pyspark/bin/spark-class: line 96: CMD: bad array subscript
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
/tmp/ipykernel_5831/489505405.py in <module>
6
7 import sparknlp
----> 8 spark = sparknlp.start()
9
10 print("Spark NLP version: {}".format(sparknlp.version()))
/opt/conda/lib/python3.7/site-packages/sparknlp/__init__.py in start(gpu, m1, memory, cache_folder, log_folder, cluster_tmp_dir, real_time_output, output_level)
242 return SparkRealTimeOutput()
243 else:
--> 244 spark_session = start_without_realtime_output()
245 return spark_session
246
/opt/conda/lib/python3.7/site-packages/sparknlp/__init__.py in start_without_realtime_output()
152 builder.config("spark.jsl.settings.storage.cluster_tmp_dir", cluster_tmp_dir)
153
--> 154 return builder.getOrCreate()
155
156 def start_with_realtime_output():
/opt/conda/lib/python3.7/site-packages/pyspark/sql/session.py in getOrCreate(self)
267 sparkConf.set(key, value)
268 # This SparkContext may be an existing one.
--> 269 sc = SparkContext.getOrCreate(sparkConf)
270 # Do not update `SparkConf` for existing `SparkContext`, as it's shared
271 # by all sessions.
/opt/conda/lib/python3.7/site-packages/pyspark/context.py in getOrCreate(cls, conf)
481 with SparkContext._lock:
482 if SparkContext._active_spark_context is None:
--> 483 SparkContext(conf=conf or SparkConf())
484 assert SparkContext._active_spark_context is not None
485 return SparkContext._active_spark_context
/opt/conda/lib/python3.7/site-packages/pyspark/context.py in __init__(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls, udf_profiler_cls)
193 )
194
--> 195 SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
196 try:
197 self._do_init(
/opt/conda/lib/python3.7/site-packages/pyspark/context.py in _ensure_initialized(cls, instance, gateway, conf)
415 with SparkContext._lock:
416 if not SparkContext._gateway:
--> 417 SparkContext._gateway = gateway or launch_gateway(conf)
418 SparkContext._jvm = SparkContext._gateway.jvm
419
/opt/conda/lib/python3.7/site-packages/pyspark/java_gateway.py in launch_gateway(conf, popen_kwargs)
104
105 if not os.path.isfile(conn_info_file):
--> 106 raise RuntimeError("Java gateway process exited before sending its port number")
107
108 with open(conn_info_file, "rb") as info:
RuntimeError: Java gateway process exited before sending its port number
One possible reason is that Java is not installed. When you create a Python 3 Vertex AI Workbench instance you can have either Debian or Ubuntu as the OS, and neither comes with Java pre-installed. You need to install it manually.
To install it you can use:
sudo apt-get update
sudo apt-get install default-jdk
You can follow this tutorial to install OpenJDK.
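If you prefer to stay inside the notebook, the same install can be run from a cell with shell magics. A sketch, assuming the instance allows passwordless sudo (which user-managed Workbench instances typically do):
# Install OpenJDK from a notebook cell (assumes passwordless sudo)
!sudo apt-get update -q
!sudo apt-get install -y -q default-jdk
!java --version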
All your problems lie with installing the JDK and setting its path in the environment. Once you do this properly, you don't need to set the path in Python either. Your code should look something like this:
# Install Spark NLP from PyPI
!pip install -q spark-nlp==4.0.1 pyspark==3.3.0
# No need to set the environment path
import sparknlp
# All other imports
import pandas as pd
spark = sparknlp.start()
print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)
spark
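If you still want JAVA_HOME set (you added it in your snippet to silence a "JAVA_HOME is not set" message), derive it from the actual Linux install instead of hard-coding a Windows path. A minimal sketch, assuming default-jdk is already installed:
import os
import shutil

# Resolve the real JDK location from the 'java' binary on PATH; on Debian
# this typically lands somewhere like /usr/lib/jvm/java-11-openjdk-amd64
# (the exact path is an assumption; check with: readlink -f "$(which java)")
java_bin = shutil.which("java")
assert java_bin, "java not found on PATH; install default-jdk first"
java_home = os.path.dirname(os.path.dirname(os.path.realpath(java_bin)))

os.environ["JAVA_HOME"] = java_home
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
print("Using JAVA_HOME =", java_home)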
EDIT: I have tried your code and had the same error. All I did was open the terminal inside JupyterLab of the Workbench and install Java there.
Opened JupyterLab from the Workbench notebook instance.
Opened the terminal from File->New->Terminal.
From there, downloaded and installed Java.
You can check whether it has been installed and added to your PATH by running java --version, which will return the current version.
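The same check also works from a notebook cell, which confirms that the kernel's own environment (the one pyspark inherits when it spawns the JVM) can find Java. A small sketch:
import subprocess

# Run 'java --version' with the kernel's environment; pyspark's launcher
# resolves java from this same environment when starting the gateway.
result = subprocess.run(["java", "--version"], capture_output=True, text=True)
print(result.stdout or result.stderr)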