We need to use explode function to achieve this sort of nested JSON struct and array queries.
scala> val df = spark.read.json("/Users/pavithranrao/Desktop/test.json")
scala> df.printSchema
root
|-- data: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- id: string (nullable = true)
| | |-- parent: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- name: string (nullable = true)
scala> val oneDF = df.select(col("data"), explode(col("data"))).toDF("data", "element").select(col("data"), col("element.parent"))
scala> oneDF.show
"""
+--------------------+--------------------+
| data| parent|
+--------------------+--------------------+
|[[20180218,Wrappe...| [[Market]]|
|[[20180218,Wrappe...|[[Client], [Market]]|
|[[20180218,Wrappe...| [[Client]]|
|[[20180218,Wrappe...| []|
+--------------------+--------------------+
"""
scala> val twoDF = oneDF.select(col("data"), explode(col("parent"))).toDF("data", "names")
scala> twoDF.printSchema
root
|-- data: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- id: string (nullable = true)
| | |-- parent: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- name: string (nullable = true)
|-- names: struct (nullable = true)
| |-- name: string (nullable = true)
scala> twoDF.show
"""
+--------------------+--------+
| data| names|
+--------------------+--------+
|[[20180218,Wrappe...|[Market]|
|[[20180218,Wrappe...|[Client]|
|[[20180218,Wrappe...|[Market]|
|[[20180218,Wrappe...|[Client]|
+--------------------+--------+
"""
scala> import org.apache.spark.sql.functions.length
// Extract names struct that is Empty
scala> twoDF.select(length(col("names.name")) === 0).show
+------------------------+
|(length(names.name) = 0)|
+------------------------+
| false|
| false|
| false|
| false|
+------------------------+
// Extract names strcut that doesn't have Market
scala> twoDF.select(!col("names.name").contains("Market")).show()
+----------------------------------+
|(NOT contains(names.name, Market))|
+----------------------------------+
| false|
| true|
| false|
| true|
+----------------------------------+
// Combining these two
scala> val ansDF = twoDF.select("data").filter(!col("names.name").contains("Market") || length(col("names.name")) === 0)
scala> ansDF.printSchema
// Schema same as input df
root
|-- data: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- id: string (nullable = true)
| | |-- parent: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- name: string (nullable = true)
scala> ansDF.show(false)
+----------------------------------------------------------------------------------------------------------------------------------------------+
|data |
+----------------------------------------------------------------------------------------------------------------------------------------------+
|[[20180218,WrappedArray([Market])], [20180219,WrappedArray([Client], [Market])], [20180220,WrappedArray([Client])], [20180221,WrappedArray()]]|
|[[20180218,WrappedArray([Market])], [20180219,WrappedArray([Client], [Market])], [20180220,WrappedArray([Client])], [20180221,WrappedArray()]]|
+----------------------------------------------------------------------------------------------------------------------------------------------+
The final ansDF has the filtered records that satisfy the condition name does not contain 'Market' or isEmpty.
PS : If I have missed the exact filter scenario, correct from the
filter function in the above code
Hope this helps!