I am trying to build a tool that can take any JSON data and convert it into multiple data frames based on data types. I want to tag each data frame with a relation so that we can identify which data belongs to which parent element (key).
For Example :
{
  "name": "Some name",
  "date": "12:23:2022",
  "Students": [
    { "id": "some id", "value": "some val" },
    { "id": "some id2", "value": "some val2" },
    { "id": "some id3", "value": "some val3" }
  ],
  "Error": [
    {
      "id": "some id",
      "code": "some code",
      "emessage": [
        { "err_trac": "Missing syntax", "Err_code": ";" },
        { "err_trac": "invalid syntax", "Err_code": "=" }
      ]
    },
    {
      "id": "some id2",
      "code": "some code 2",
      "emessage": [
        { "err_trac": "Missing syntax", "Err_code": ";" },
        { "err_trac": "invalid syntax", "Err_code": "=" }
      ]
    },
    {
      "id": "some id3",
      "code": "some code3",
      "emessage": [
        { "err_trac": "Missing syntax", "Err_code": ";" },
        { "err_trac": "invalid syntax", "Err_code": "=" }
      ]
    }
  ]
}
I wanted to have data frames such as:
Run
name, date, id (uuid)
Error
id, code, parent_id (id of Run), id (uuid)
Students
id, value, parent_id (id of Run), id (uuid)
emessage
err_trac, Err_code, parent_id (id of Error)
And have relations via UUIDs to identify which key belongs to which parent id. I am trying a flattening approach to solve this problem using Python and pandas, but my solution does not work for nested JSON.
Here is what I am trying.
import json
import pandas as pd
# Module-level accumulator shared by flatten() and solution(): collects
# per-key row lists and the resulting DataFrame as the JSON tree is walked.
op = {}
import uuid
def format_string(string):
    """Normalize a key for use as a column/prefix name: spaces -> underscores."""
    return string.replace(" ", "_")


def get_prefix(prefix, key):
    """Join *prefix* and *key* into one normalized, underscore-separated name.

    Falls back to whichever part is non-empty; the result always has spaces
    replaced so it is safe to use as a DataFrame column name.
    """
    if not key:
        return format_string(prefix)
    if prefix:
        return format_string(prefix + "_" + key)
    # BUG FIX: the bare key was previously returned without normalization,
    # so a top-level key containing spaces slipped through unformatted,
    # inconsistent with the two branches above.
    return format_string(key)
def flatten(prefix, key, value, uid, result=None):
    """Recursively flatten *value* (str/dict/list) into *result* rows.

    Each leaf string becomes a one-entry dict keyed by its dotted/underscored
    path. Nested lists are accumulated into a per-key bucket in the
    module-level ``op`` dict so they can later become their own data frame.

    NOTE(review): ``op["result"]`` is overwritten every time a list is
    flattened — only the last DataFrame survives; consider ``op[key] = df``.
    """
    if result is None:
        # BUG FIX: the original used a mutable default argument (result=[]),
        # which is shared across calls, so rows from one top-level key leaked
        # into every subsequent call.
        result = []
    if isinstance(value, str):
        result.append({get_prefix(prefix, key): value})
    if isinstance(value, dict):
        for item in value:
            flatten(get_prefix(prefix, key), item, value[item], uid, result)
    if isinstance(value, list):
        # BUG FIX: the original passed op[key] as the accumulator for nested
        # lists, which raised KeyError because the bucket was never created.
        bucket = op.setdefault(key, []) if prefix else result
        for i, element in enumerate(value):
            flatten(
                get_prefix(prefix, key + "[{}]".format(i)),
                "",
                element,
                uid,
                bucket,
            )
        res = {k: v for d in result for k, v in d.items()}
        df = pd.DataFrame.from_dict(res, orient="index")
        df["uuid"] = uid
        op["result"] = df
    return result
def solution() -> dict:
    """Load the sample JSON and flatten each top-level key into ``op``.

    Returns the module-level ``op`` dict populated by flatten().
    (The original annotated the return as ``str``, but it returns a dict.)
    """
    # BUG FIX: use a context manager so the file handle is always closed;
    # the original leaked it, and its `if f:` check was always true because
    # open() raises on failure rather than returning a falsy value.
    with open("example-input/sample.json", "r") as f:
        data = json.load(f)
    for key, value in data.items():
        # Each top-level key gets its own UUID so child rows can be related
        # back to their parent record.
        flatten("", key, value, uuid.uuid4())
    return op
if __name__ == "__main__":
    # Guard the entry point so importing this module has no side effects.
    print(solution())
Update
The reason I want to create multiple data frames is to put this data into a data lake and later access it via Athena in AWS. Once I get the data frames, I can move them into SQL tables.