Tags: python, pyspark, google-colaboratory, py4j

py4j.protocol.Py4JNetworkError: Answer from Java side is empty while trying to execute df.show(5)


I'm a newbie in PySpark and I've gotten stuck at one point. I'm analyzing a Twitter data dump stored in Parquet files with PySpark on Google Colab. Reading a Parquet file works fine, but everything breaks as soon as I run df.show(5). I suspect the driver and executor memory settings are the issue, but I'm not sure, and I also don't know what values to raise them to. I'm using Google Colab Pro+. The flow is: I build a SparkSession, call spark.read.parquet, and then df.show(5) raises the error included in full below.
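For reference, a minimal sketch of the setup described above. The Spark install path is taken from the traceback; the use of findspark, the app name, and the Parquet path are placeholder assumptions, not the asker's actual code:

    import findspark
    findspark.init("/content/spark-3.2.0-bin-hadoop3.2")  # Spark install path from the traceback

    from pyspark.sql import SparkSession

    # No driver memory is configured here, so Spark's default applies
    spark = (SparkSession.builder
             .master("local[*]")
             .appName("twitter-analysis")  # placeholder app name
             .getOrCreate())

    # Reading is lazy, so this succeeds
    df = spark.read.parquet("/content/tweets.parquet")  # placeholder path

    # Materializing rows is where the driver JVM dies and the
    # Py4J connection drops, producing the traceback below
    df.show(5)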

Full error output, including all the chained exceptions:

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/content/spark-3.2.0-bin-hadoop3.2/python/lib/py4j-0.10.9.2-src.zip/py4j/clientserver.py", line 480, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/content/spark-3.2.0-bin-hadoop3.2/python/lib/py4j-0.10.9.2-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/content/spark-3.2.0-bin-hadoop3.2/python/lib/py4j-0.10.9.2-src.zip/py4j/clientserver.py", line 504, in send_command
    "Error while sending or receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while sending or receiving
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 2882, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-14-eb589bae8d4b>", line 1, in <module>
    df.show(5)
  File "/content/spark-3.2.0-bin-hadoop3.2/python/pyspark/sql/dataframe.py", line 494, in show
    print(self._jdf.showString(n, 20, vertical))
  File "/content/spark-3.2.0-bin-hadoop3.2/python/lib/py4j-0.10.9.2-src.zip/py4j/java_gateway.py", line 1310, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/content/spark-3.2.0-bin-hadoop3.2/python/pyspark/sql/utils.py", line 111, in deco
    return f(*a, **kw)
  File "/content/spark-3.2.0-bin-hadoop3.2/python/lib/py4j-0.10.9.2-src.zip/py4j/protocol.py", line 328, in get_return_value
    format(target_id, ".", name), value)
py4j.protocol.Py4JJavaError: <unprintable Py4JJavaError object>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 1823, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'Py4JJavaError' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/content/spark-3.2.0-bin-hadoop3.2/python/lib/py4j-0.10.9.2-src.zip/py4j/clientserver.py", line 480, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/content/spark-3.2.0-bin-hadoop3.2/python/lib/py4j-0.10.9.2-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/content/spark-3.2.0-bin-hadoop3.2/python/lib/py4j-0.10.9.2-src.zip/py4j/clientserver.py", line 504, in send_command
    "Error while sending or receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while sending or receiving
---------------------------------------------------------------------------
Py4JJavaError                             Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py in run_code(self, code_obj, result)
   2881                 #rprint('Running code', repr(code_obj)) # dbg
-> 2882                 exec(code_obj, self.user_global_ns, self.user_ns)
   2883             finally:

13 frames
<ipython-input-14-eb589bae8d4b> in <module>()
----> 1 df.show(5)

/content/spark-3.2.0-bin-hadoop3.2/python/pyspark/sql/dataframe.py in show(self, n, truncate, vertical)
    493         if isinstance(truncate, bool) and truncate:
--> 494             print(self._jdf.showString(n, 20, vertical))
    495         else:

/content/spark-3.2.0-bin-hadoop3.2/python/lib/py4j-0.10.9.2-src.zip/py4j/java_gateway.py in __call__(self, *args)
   1309         return_value = get_return_value(
-> 1310             answer, self.gateway_client, self.target_id, self.name)
   1311 

/content/spark-3.2.0-bin-hadoop3.2/python/pyspark/sql/utils.py in deco(*a, **kw)
    110         try:
--> 111             return f(*a, **kw)
    112         except py4j.protocol.Py4JJavaError as e:

/content/spark-3.2.0-bin-hadoop3.2/python/lib/py4j-0.10.9.2-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
    327                     "An error occurred while calling {0}{1}{2}.\n".
--> 328                     format(target_id, ".", name), value)
    329             else:

<class 'str'>: (<class 'ConnectionRefusedError'>, ConnectionRefusedError(111, 'Connection refused'))

During handling of the above exception, another exception occurred:

ConnectionRefusedError                    Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py in run_code(self, code_obj, result)
   2897             if result is not None:
   2898                 result.error_in_exec = sys.exc_info()[1]
-> 2899             self.showtraceback()
   2900         else:
   2901             outflag = 0

/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py in showtraceback(self, exc_tuple, filename, tb_offset, exception_only)
   1826                                             value, tb, tb_offset=tb_offset)
   1827 
-> 1828                     self._showtraceback(etype, value, stb)
   1829                     if self.call_pdb:
   1830                         # drop into debugger

/usr/local/lib/python3.7/dist-packages/google/colab/_shell.py in _showtraceback(self, etype, evalue, stb)
    131         'traceback': stb,
    132         'ename': py3compat.unicode_type(etype.__name__),
--> 133         'evalue': py3compat.safe_unicode(evalue),
    134     }
    135 

/usr/local/lib/python3.7/dist-packages/ipython_genutils/py3compat.py in safe_unicode(e)
     63     """
     64     try:
---> 65         return unicode_type(e)
     66     except UnicodeError:
     67         pass

/content/spark-3.2.0-bin-hadoop3.2/python/lib/py4j-0.10.9.2-src.zip/py4j/protocol.py in __str__(self)
    469     def __str__(self):
    470         gateway_client = self.java_exception._gateway_client
--> 471         answer = gateway_client.send_command(self.exception_cmd)
    472         return_value = get_return_value(answer, gateway_client, None, None)
    473         # Note: technically this should return a bytestring 'str' rather than

/content/spark-3.2.0-bin-hadoop3.2/python/lib/py4j-0.10.9.2-src.zip/py4j/java_gateway.py in send_command(self, command, retry, binary)
   1034          if `binary` is `True`.
   1035         """
-> 1036         connection = self._get_connection()
   1037         try:
   1038             response = connection.send_command(command)

/content/spark-3.2.0-bin-hadoop3.2/python/lib/py4j-0.10.9.2-src.zip/py4j/clientserver.py in _get_connection(self)
    279 
    280         if connection is None or connection.socket is None:
--> 281             connection = self._create_new_connection()
    282         return connection
    283 

/content/spark-3.2.0-bin-hadoop3.2/python/lib/py4j-0.10.9.2-src.zip/py4j/clientserver.py in _create_new_connection(self)
    286             self.java_parameters, self.python_parameters,
    287             self.gateway_property, self)
--> 288         connection.connect_to_java_server()
    289         self.set_thread_connection(connection)
    290         return connection

/content/spark-3.2.0-bin-hadoop3.2/python/lib/py4j-0.10.9.2-src.zip/py4j/clientserver.py in connect_to_java_server(self)
    400                 self.socket = self.ssl_context.wrap_socket(
    401                     self.socket, server_hostname=self.java_address)
--> 402             self.socket.connect((self.java_address, self.java_port))
    403             self.stream = self.socket.makefile("rb")
    404             self.is_connected = True

ConnectionRefusedError: [Errno 111] Connection refused

Solution

  • I found the answer. I configured the driver memory to 12 GB and it worked. I think it wasn't working because df.show() pulls data back to the driver, and the default driver memory (1 GB in Spark unless overridden) couldn't handle it, so the driver JVM died and the Py4J connection was refused.
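    A minimal sketch of that fix, assuming the same local Colab setup as above (app name and Parquet path are still placeholders):

        from pyspark.sql import SparkSession

        spark = (SparkSession.builder
                 .master("local[*]")
                 .appName("twitter-analysis")           # placeholder app name
                 .config("spark.driver.memory", "12g")  # raise driver heap before the JVM starts
                 .getOrCreate())

        df = spark.read.parquet("/content/tweets.parquet")  # placeholder path
        df.show(5)  # succeeds with the larger driver heap

    Note that spark.driver.memory is read only when the driver JVM is launched, so it has to be set on the first SparkSession created in the notebook runtime; if a session with the default memory already exists, restart the runtime before applying the config.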