So a coworker once told me that regexp_extract is faster than parsing the JSON, and I'd always believed that... until today, when I ran some timing experiments comparing it to the two other solutions posted here, which use get_json_object and from_json.
The short answer is that all three perform comparably, even when the JSONs are complicated with thousands of extra key-value pairs. The regexp_extract method is actually consistently a bit slower in these tests.
Setup: proving each method works
import json
import time

import pandas as pd
import pyspark.sql.functions as fun
import pyspark.sql.types as t

case_ids = range(1, 6)
data = [
    '{"dummyAcc":"12346","accountRequest":{"schemeCode":"ZEROQ1", "CCZ":"SGD"}}',
    '{"dummyAcc":"12347","accountRequest":{"schemeCode":"ZEROQ2", "CCZ":"SGD"}}',
    '{"dummyAcc":"12348","accountRequest":{"schemeCode":"ZEROQ5", "CCZ":"SGD"}}',
    '{"dummyAcc":"12349","accountRequest":{"schemeCode":"ZEROQ", "CCZ":"SGD"}}',
    '{"dummyAcc":"12350","accountRequest":{"schemeCode":"ZEROQ", "CCZ":"SGD"}}'
]
df = spark.createDataFrame(pd.DataFrame({"caseid": case_ids, "object_value": data}))
##
# fun.from_json
##
schm = t.StructType(
    [
        t.StructField("dummyAcc", t.StringType()),
        t.StructField(
            "accountRequest",
            t.StructType(
                [
                    t.StructField("schemeCode", t.StringType()),
                    t.StructField("CCZ", t.StringType()),
                ]
            ),
        ),
    ]
)
def run_from_json(df):
    return df.withColumn("object_value", fun.from_json("object_value", schm, options={"allowSingleQuotes": "true"}))\
        .select(
            "caseid",
            "object_value.accountRequest.schemeCode",
            "object_value.accountRequest.CCZ",
        )
##
# get_json
##
def run_get_json(df):
    return df.select('caseid',
                     fun.get_json_object('object_value', '$.accountRequest.schemeCode').alias('schemeCode'),
                     fun.get_json_object('object_value', '$.accountRequest.CCZ').alias('CCZ'))
##
# regexp_extract
##
def run_regexp_extract(df):
    # Raw strings keep Python from treating \w as an escape sequence.
    return df.withColumn("schemeCode", fun.regexp_extract(fun.col("object_value"), r'(.)("schemeCode":")(\w+)', 3))\
        .withColumn("CCZ", fun.regexp_extract(fun.col("object_value"), r'(.)("CCZ":")(\w+)', 3))\
        .select("caseid", "schemeCode", "CCZ")
##
# Test them out
##
print("from_json")
run_from_json(df).show(truncate=False)
print("get_json")
run_get_json(df).show(truncate=False)
print("regexp_extract")
run_regexp_extract(df).show(truncate=False)
from_json
+------+----------+---+
|caseid|schemeCode|CCZ|
+------+----------+---+
|1 |ZEROQ1 |SGD|
|2 |ZEROQ2 |SGD|
|3 |ZEROQ5 |SGD|
|4 |ZEROQ |SGD|
|5 |ZEROQ |SGD|
+------+----------+---+
get_json
+------+----------+---+
|caseid|schemeCode|CCZ|
+------+----------+---+
|1 |ZEROQ1 |SGD|
|2 |ZEROQ2 |SGD|
|3 |ZEROQ5 |SGD|
|4 |ZEROQ |SGD|
|5 |ZEROQ |SGD|
+------+----------+---+
regexp_extract
+------+----------+---+
|caseid|schemeCode|CCZ|
+------+----------+---+
|1 |ZEROQ1 |SGD|
|2 |ZEROQ2 |SGD|
|3 |ZEROQ5 |SGD|
|4 |ZEROQ |SGD|
|5 |ZEROQ |SGD|
+------+----------+---+
Timing Part 1 -- Using Short JSONs
I checked the wall clock time of running multiple iterations using the default compact JSONs defined above.
def time_run_method(df, n_it, meth, meth_name):
    t0 = time.time()
    for i in range(n_it):
        meth(df).count()
    td = time.time() - t0
    print(meth_name)
    print("Time to count %d iterations: %s [sec]" % (n_it, "{:,}".format(td)))

for m, n in zip([run_from_json, run_get_json, run_regexp_extract], ["from_json", "get_json", "regexp_extract"]):
    time_run_method(df, 200, m, n)
from_json
Time to count 200 iterations: 15.918861389160156 [sec]
get_json
Time to count 200 iterations: 15.668830871582031 [sec]
regexp_extract
Time to count 200 iterations: 17.539576292037964 [sec]
Timing Part 2 -- Using Long JSONs
I added two thousand key-value pairs to each JSON to see if the extra deserialization overhead would change things. It did not. Perhaps the structure is flat and simple enough that the internal parsers can skip past the extra keys cheaply, or the extra keys just don't add much overhead. I don't know.
cruft = json.dumps({k: v for k, v in enumerate(range(2000))})
data = [
    '{ "cruft": %s, "dummyAcc":"12346","accountRequest":{"schemeCode":"ZEROQ1", "CCZ":"SGD"}}' % cruft,
    '{ "cruft": %s, "dummyAcc":"12347","accountRequest":{"schemeCode":"ZEROQ2", "CCZ":"SGD"}}' % cruft,
    '{ "cruft": %s, "dummyAcc":"12348","accountRequest":{"schemeCode":"ZEROQ5", "CCZ":"SGD"}}' % cruft,
    '{ "cruft": %s, "dummyAcc":"12349","accountRequest":{"schemeCode":"ZEROQ", "CCZ":"SGD"}}' % cruft,
    '{ "cruft": %s, "dummyAcc":"12350","accountRequest":{"schemeCode":"ZEROQ", "CCZ":"SGD"}}' % cruft
]
df2 = spark.createDataFrame(pd.DataFrame({"caseid": case_ids, "object_value": data}))

for m, n in zip([run_from_json, run_get_json, run_regexp_extract], ["from_json", "get_json", "regexp_extract"]):
    time_run_method(df2, 200, m, n)
from_json
Time to count 200 iterations: 16.005220413208008 [sec]
get_json
Time to count 200 iterations: 15.788024187088013 [sec]
regexp_extract
Time to count 200 iterations: 16.81353187561035 [sec]
So based on these tests I'd drop the assumption that regexp_extract is the faster tool. If your data is very complex you may need to write a UDF that parses the strings into dictionaries and searches them.
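For completeness, here is a minimal sketch of what such a UDF could look like, assuming the same df and object_value column as in the setup above. The extract_fields and run_udf names and the error handling are my own illustrative additions, not something I timed.
@fun.udf(t.StructType([
    t.StructField("schemeCode", t.StringType()),
    t.StructField("CCZ", t.StringType()),
]))
def extract_fields(object_value):
    # Parse the JSON string into a dict and walk to the nested keys;
    # return None if the value isn't valid JSON.
    try:
        parsed = json.loads(object_value)
    except (TypeError, ValueError):
        return None
    acct = parsed.get("accountRequest", {})
    return (acct.get("schemeCode"), acct.get("CCZ"))

def run_udf(df):
    return df.withColumn("fields", extract_fields("object_value"))\
        .select("caseid", "fields.schemeCode", "fields.CCZ")

run_udf(df).show(truncate=False)
Whether that beats the built-ins is a separate question; Python UDFs usually add serialization overhead, so I would time it the same way before committing to it.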