In Spark, how can you efficiently check whether one array is contained in (i.e., is a subset of) another array?
Given the example DataFrame below, what are the options?
+------------+------------+
|look_in |look_for |
+------------+------------+
|[a, b, c] |[a] |
|[a, b, c] |[d] |
|[a, b, c] |[a, b] |
|[a, b, c] |[c, d] |
|[a, b, c] |[a, b, c] |
|[a, b, c] |[a, NULL] |
|[a, b, NULL]|[a, NULL] |
|[a, b, NULL]|[a] |
|[a, b, NULL]|[NULL] |
|[a, b, c] |NULL |
|NULL |[a] |
|NULL |NULL |
|[a, b, c] |[a, a] |
|[a, a, a] |[a] |
|[a, a, a] |[a, a, a] |
|[a, a, a] |[a, a, NULL]|
|[a, a, NULL]|[a, a, a] |
|[a, a, NULL]|[a, a, NULL]|
+------------+------------+
from pyspark.sql import functions as F
# Example rows exercising the edge cases for the subset check:
# plain subsets and non-subsets, NULL elements inside arrays,
# entirely NULL arrays, and arrays with duplicate values.
rows = [
    (['a', 'b', 'c'], ['a']),
    (['a', 'b', 'c'], ['d']),
    (['a', 'b', 'c'], ['a', 'b']),
    (['a', 'b', 'c'], ['c', 'd']),
    (['a', 'b', 'c'], ['a', 'b', 'c']),
    (['a', 'b', 'c'], ['a', None]),
    (['a', 'b', None], ['a', None]),
    (['a', 'b', None], ['a']),
    (['a', 'b', None], [None]),
    (['a', 'b', 'c'], None),
    (None, ['a']),
    (None, None),
    (['a', 'b', 'c'], ['a', 'a']),
    (['a', 'a', 'a'], ['a']),
    (['a', 'a', 'a'], ['a', 'a', 'a']),
    (['a', 'a', 'a'], ['a', 'a', None]),
    (['a', 'a', None], ['a', 'a', 'a']),
    (['a', 'a', None], ['a', 'a', None]),
]

# Column 0 is the array searched in, column 1 the candidate subset.
df = spark.createDataFrame(rows, ['look_in', 'look_for'])