From 32045259ece73be65a6e2685cec2534527782122 Mon Sep 17 00:00:00 2001 From: pgandhi Date: Thu, 17 Jan 2019 16:11:52 -0600 Subject: [PATCH] [SPARK-26658] : Call pickle.dump with protocol version 3 for Python 3 to fix the serialization issue with large objects When a pyspark job using python 3 tries to serialize large objects, it throws a pickle error in case of trying to serialize global variable object and overflow error in case of broadcast. Fixed this by updating the dump method in code to use protocol version 3 for python3. --- python/pyspark/broadcast.py | 4 +++- python/pyspark/serializers.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/python/pyspark/broadcast.py b/python/pyspark/broadcast.py index 29358b5740e51..0df51751e333f 100644 --- a/python/pyspark/broadcast.py +++ b/python/pyspark/broadcast.py @@ -29,8 +29,10 @@ if sys.version < '3': import cPickle as pickle + protocol = 2 else: import pickle + protocol = 3 unicode = str __all__ = ['Broadcast'] @@ -110,7 +112,7 @@ def __init__(self, sc=None, value=None, pickle_registry=None, path=None, def dump(self, value, f): try: - pickle.dump(value, f, 2) + pickle.dump(value, f, protocol) except pickle.PickleError: raise except Exception as e: diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py index fd4695210fb7c..d0f875d3b0421 100644 --- a/python/pyspark/serializers.py +++ b/python/pyspark/serializers.py @@ -620,7 +620,7 @@ class CloudPickleSerializer(PickleSerializer): def dumps(self, obj): try: - return cloudpickle.dumps(obj, 2) + return cloudpickle.dumps(obj, protocol) except pickle.PickleError: raise except Exception as e: