I can't find a way to set the driver's max result size. Below is my configuration.
# Build a SparkConf from (key, value) string pairs, start a SparkContext with it,
# and wrap that context in a SQLContext.
conf = pyspark.SparkConf().setAll([("spark.driver.extraClassPath", "/usr/local/bin/postgresql-42.2.5.jar")
,("spark.executor.instances", "4")
,("spark.executor.cores", "4")
# NOTE(review): key looks misspelled; the Spark property is "spark.executor.memory" -- confirm.
,("spark.executor.memories", "10g")
,("spark.driver.memory", "15g")
# NOTE(review): "dirver" looks like a typo for "driver"; the reported error still shows
# the 1024.0 MB limit, so this setting apparently did not take effect.
,("spark.dirver.maxResultSize", "0")
,("spark.memory.offHeap.enabled","true")
,("spark.memory.offHeap.size","20g")])
sc = pyspark.SparkContext(conf=conf)
# The result is not assigned; presumably run interactively to inspect the effective settings.
sc.getConf().getAll()
# SQLContext is not imported in this snippet; presumably imported elsewhere -- verify.
sqlContext = SQLContext(sc)
I get this error after joining two large tables and calling collect:
'Py4JJavaError: An error occurred while calling o292.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Total size of serialized results of 101 tasks (1028.8 MB) is bigger than spark.driver.maxResultSize (1024.0 MB)'
I have seen similar problems on Stack Overflow advising to set maxResultSize, but I can't figure out how to do that correctly.
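The following should do the trick. Also note that you have misspelled two keys: "spark.executor.memories" should be 'spark.executor.memory', and "spark.dirver.maxResultSize" should be 'spark.driver.maxResultSize'
(the second typo is why the default 1024 MB limit still applies in your error).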
from pyspark.sql import SparkSession
# Build (or reuse) a SparkSession carrying the driver/executor settings,
# then expose its underlying SparkContext as `sc`.
spark = (
    SparkSession.builder
    .master('yarn')  # depends on the cluster manager of your choice
    .appName('StackOverflow')
    .config('spark.driver.extraClassPath', '/usr/local/bin/postgresql-42.2.5.jar')
    .config('spark.executor.instances', 4)
    .config('spark.executor.cores', 4)
    .config('spark.executor.memory', '10g')
    .config('spark.driver.memory', '15g')
    .config('spark.memory.offHeap.enabled', True)
    .config('spark.memory.offHeap.size', '20g')
    # A bare number is interpreted as bytes, so '4096' would mean 4 KB;
    # give an explicit unit instead ('0' would mean unlimited).
    .config('spark.driver.maxResultSize', '4g')
    # Without getOrCreate() `spark` is only a Builder and has no sparkContext.
    .getOrCreate()
)
sc = spark.sparkContext
Alternatively, try this:
from pyspark import SparkContext
from pyspark import SparkConf
# Build the configuration with chained setters. The chain is wrapped in
# parentheses so each `.set(...)` line continues the expression without
# needing a trailing backslash on every line.
conf = (
    SparkConf()
    .setMaster('yarn')
    .setAppName('StackOverflow')
    .set('spark.driver.extraClassPath', '/usr/local/bin/postgresql-42.2.5.jar')
    .set('spark.executor.instances', 4)
    .set('spark.executor.cores', 4)
    .set('spark.executor.memory', '10g')
    .set('spark.driver.memory', '15g')
    .set('spark.memory.offHeap.enabled', True)
    .set('spark.memory.offHeap.size', '20g')
    # A bare number is interpreted as bytes, so '4096' would mean 4 KB;
    # give an explicit unit instead ('0' would mean unlimited).
    .set('spark.driver.maxResultSize', '4g')
)
spark_context = SparkContext(conf=conf)