In my DataFrame, I need to convert the array-type columns to struct type. I can produce the result I want manually on a sample of the data (by editing it in an editor), as shown below, but I need to do the conversion in PySpark.
Input dataframe schema:
root
|-- id: string (nullable = true)
|-- description: string (nullable = true)
|-- documents: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- id: string (nullable = true)
| | |-- doc_name: string (nullable = true)
| | |-- obligations: struct (nullable = true)
|-- contacts: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- id: string (nullable = true)
| | |-- contact_first_name: string (nullable = true)
| | |-- contact_last_name: string (nullable = true)
Data:
{
"id":"123",
"description": "agreement",
"documents":[
{
"id":"doc_id_1",
"doc_name":"doc_name_1",
"obligations":{}
}
],
"contacts":[
{
"id":"contact_id_1",
"contact_first_name":"John",
"contact_last_name":"Doe"
}
]
}
Schema that I need:
root
|-- id: string (nullable = true)
|-- description: string (nullable = true)
|-- documents: struct (nullable = true)
| |-- id: string (nullable = true)
| |-- doc_name: string (nullable = true)
| |-- obligations: struct (nullable = true)
|-- contacts: struct (nullable = true)
| |-- id: string (nullable = true)
| |-- contact_first_name: string (nullable = true)
| |-- contact_last_name: string (nullable = true)
Data that I need:
{
"id":"123",
"description": "agreement",
"documents":{
"id":"doc_id_1",
"doc_name":"doc_name_1",
"obligations":{}
},
"contacts":{
"id":"contact_id_1",
"contact_first_name":"John",
"contact_last_name":"Doe"
}
}