PyCharm: PySpark keeps throwing an error whenever the map operator is called

Minimal reproduction:
from pyspark import SparkConf, SparkContext
import os

# Tell the PySpark worker processes which interpreter to use
os.environ['PYSPARK_PYTHON'] = 'D:/myApps/python/python.exe'

conf = SparkConf().setMaster("local[*]").setAppName("test_app_name")
sc = SparkContext(conf=conf)

wyyList = {"name": "刘德华", "age": 18}  # not used below
rdd1 = sc.parallelize([1, 2, 3, 4])

def func(data):
    return data * 2

rdd2 = rdd1.map(func)
print("rdd1rdd1", rdd1.collect())  # this line prints fine
print("rdd2rdd2", rdd2.collect())  # this line raises the error below
sc.stop()
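Only the second collect() ships a user-defined function to the workers, so only rdd2 exercises the function-serialization path. The traceback below is the usual symptom of running PySpark 3.3 or older on Python 3.11, whose bytecode the bundled cloudpickle predates, so the version pairing is worth checking first. A minimal diagnostic sketch, assuming PyCharm runs the same D:/myApps/python interpreter as the script:

import sys
import pyspark

# The cloudpickle copy bundled with PySpark 3.3 and earlier cannot parse
# Python 3.11 bytecode, so pickling any user function (such as func above)
# fails before the job even starts.
print(sys.version_info)      # (3, 11, ...) here would be the red flag
print(pyspark.__version__)   # releases before 3.4.0 do not support Python 3.11

Run as posted, the script prints rdd1's result and then fails on rdd2.collect() with: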
Traceback (most recent call last):
  File "D:\myApps\python\Lib\site-packages\pyspark\serializers.py", line 458, in dumps
    return cloudpickle.dumps(obj, pickle_protocol)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\myApps\python\Lib\site-packages\pyspark\cloudpickle\cloudpickle_fast.py", line 73, in dumps
    cp.dump(obj)
  File "D:\myApps\python\Lib\site-packages\pyspark\cloudpickle\cloudpickle_fast.py", line 602, in dump
    return Pickler.dump(self, obj)
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\myApps\python\Lib\site-packages\pyspark\cloudpickle\cloudpickle_fast.py", line 692, in reducer_override
    return self._function_reduce(obj)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\myApps\python\Lib\site-packages\pyspark\cloudpickle\cloudpickle_fast.py", line 565, in _function_reduce
    return self._dynamic_function_reduce(obj)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\myApps\python\Lib\site-packages\pyspark\cloudpickle\cloudpickle_fast.py", line 546, in _dynamic_function_reduce
    state = _function_getstate(func)
            ^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\myApps\python\Lib\site-packages\pyspark\cloudpickle\cloudpickle_fast.py", line 157, in _function_getstate
    f_globals_ref = _extract_code_globals(func.__code__)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\myApps\python\Lib\site-packages\pyspark\cloudpickle\cloudpickle.py", line 334, in _extract_code_globals
    out_names = {names[oparg]: None for _, oparg in _walk_global_ops(co)}
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\myApps\python\Lib\site-packages\pyspark\cloudpickle\cloudpickle.py", line 334, in <dictcomp>
    out_names = {names[oparg]: None for _, oparg in _walk_global_ops(co)}
                 ~~~~~^^^^^^^
IndexError: tuple index out of range
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
  File "D:\wyyStudyDocument\python\python-learn\pyspark\pyspark_02.py", line 23, in <module>
    print("rdd2rdd2", rdd2.collect())
                      ^^^^^^^^^^^^^^
  File "D:\myApps\python\Lib\site-packages\pyspark\rdd.py", line 1194, in collect
    sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
                                                        ^^^^^^^^^^
  File "D:\myApps\python\Lib\site-packages\pyspark\rdd.py", line 3500, in _jrdd
    wrapped_func = _wrap_function(
                   ^^^^^^^^^^^^^^^
  File "D:\myApps\python\Lib\site-packages\pyspark\rdd.py", line 3359, in _wrap_function
    pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command)
                                                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\myApps\python\Lib\site-packages\pyspark\rdd.py", line 3342, in _prepare_for_python_RDD
    pickled_command = ser.dumps(command)
                      ^^^^^^^^^^^^^^^^^^
  File "D:\myApps\python\Lib\site-packages\pyspark\serializers.py", line 468, in dumps
    raise pickle.PicklingError(msg)
_pickle.PicklingError: Could not serialize object: IndexError: tuple index out of range
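The error is raised while the driver pickles func, before any task runs, which is why rdd1.collect() (no user function involved) succeeds. A hedged fix sketch, assuming the version mismatch above is confirmed: either upgrade PySpark to 3.4 or later (the first line with Python 3.11 support) or run under an older interpreter. Note the PicklingError occurs in the driver process, so swapping interpreters means changing the PyCharm project interpreter itself; the python310 path below is hypothetical.

# Option 1 (simplest): upgrade PySpark so its bundled cloudpickle
# understands Python 3.11 bytecode. In a terminal:
#     pip install --upgrade "pyspark>=3.4"

# Option 2: run everything under Python 3.10 or older. The path below is
# hypothetical; the same interpreter must also be selected as the PyCharm
# project interpreter, because the pickling happens in the driver, not
# only in the workers.
import os
os.environ['PYSPARK_PYTHON'] = 'D:/myApps/python310/python.exe'
os.environ['PYSPARK_DRIVER_PYTHON'] = 'D:/myApps/python310/python.exe'

Either way, re-running the script should then print both results, [1, 2, 3, 4] for rdd1 and [2, 4, 6, 8] for rdd2.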