I am trying to process objects_small_list in parallel using mapPartitions: for each object I compare its process_object_id (which comes from objects_small_list) against obj_id in the folder rows and pull the matching folder rows inside the process_partition function, but I am getting this error:
PicklingError: Could not serialize object: RuntimeError: It appears that you are attempting to reference SparkContext from a broadcast variable, action, or transformation. SparkContext can only be used on the driver, not in code that it run on workers. For more information, see SPARK-5063.
Any help would be appreciated. I am running this in Azure Synapse.
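For context, objects_small_list (used further down with sc.parallelize) is a small list of objects that each carry a process_object_id. The sample below is only a placeholder shape to make the snippet self-contained, not my real data:

# Placeholder sample of objects_small_list (real values differ);
# each entry carries the process_object_id used for matching against obj_id.
objects_small_list = [
    {"process_object_id": 1},
    {"process_object_id": 2},
    {"process_object_id": 3},
    {"process_object_id": 4},
]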
folder_data =[{"folderid": 10, "folder_path": "path", "obj_id": 1, "status":
"Progress"},
{"folderid": 11, "folder_path": "path", "obj_id": 2, "status":
"Progress"},
{"folderid": 12, "folder_path": "path", "obj_id": 3, "status":
"Progress"},
{"folderid": 13, "folder_path": "path", "obj_id": 4, "status": "Progress"}
]
folders_df = spark.createDataFrame(folder_data)
all_folders = folders_df.collect()
folders_broadcast = sc.broadcast(all_folders)
def process_partition(row):
    # rows = list(row)
    results = []
    for row in folders_df.collect():
        ProcessObjectId = row["process_object_id"]
        matching_folders = [
            f for f in folders_broadcast.value if f['obj_id'] == ProcessObjectId
        ]
        # folders_to_process_rows = folders_df.collect()
        results.extend(matching_folders)
    yield results
rdd = sc.parallelize(objects_small_list, numSlices=8)
rdd_result = rdd.mapPartitions(process_partition).collect()
all_folders_filtered = [item for partition in rdd_result for item in partition]
Thanks