How to identify and explode a nested json file as columns of a dataframe?

Question

I am rephrasing my question to make it clearer. My data looks like this:

{
    "Research": {
        "@xmlns": "http://www.xml.org/2013/2/XML",
        "@language": "eng",
        "@createDateTime": "2022-03-25T10:12:39Z",
        "@researchID": "abcd",
        "Product": {
            "@productID": "abcd",
            "StatusInfo": {
                "@currentStatusIndicator": "Yes",
                "@statusDateTime": "2022-03-25T12:18:41Z",
                "@statusType": "Published"
            },
            "Source": {
                "Organization": {
                    "@primaryIndicator": "Yes",
                    "@type": "SellSideFirm",
                    "OrganizationID": [
                        {
                            "@idType": "L1",
                            "#text": "D827C98E315F"
                        },
                        {
                            "@idType": "TR",
                            "#text": "3202"
                        },
                        {
                            "@idType": "TR",
                            "#text": "SZA"
                        }
                    ],
                    "OrganizationName": {
                        "@nameType": "Legal",
                        "#text": "Citi"
                    },
                    "PersonGroup": {
                        "PersonGroupMember": {
                            "@primaryIndicator": "Yes",
                            "@sequence": "1",
                            "Person": {
                                "@personID": "tr56",
                                "FamilyName": "Wang",
                                "GivenName": "Bond",
                                "DisplayName": "Bond Wang",
                                "Biography": "Bond Wang is a",
                                "BiographyFormatted": "Bond Wang",
                                "PhotoResourceIdRef": "AS44556"
                            }
                        }
                    }
                }
            },
            "Content": {
                "Title": "Premier",
                "Abstract": "None",
                "Synopsis": "Premier’s solid 1H22 result .",
                "Resource": [
                    {
                        "@language": "eng",
                        "@primaryIndicator": "Yes",
                        "@resourceID": "9553",
                        "Length": {
                            "@lengthUnit": "Pages",
                            "#text": "17"
                        },
                        "MIMEType": "text/html",
                        "URL": "https://www.DFKJG.com/rendition/eppublic"
                    },
                    {
                        "@language": "eng",
                        "@primaryIndicator": "No",
                        "@resourceID": "4809",
                        "Length": {
                            "@lengthUnit": "Pages",
                            "#text": "17"
                        },
                        "MIMEType": "ABS/pdf",
                        "Name": "asdf.pdf",
                        "Comments": "fr5.pdf"
                    },
                    {
                        "@language": "eng",
                        "@primaryIndicator": "No",
                        "@resourceID": "6d13a965723e",
                        "Length": {
                            "@lengthUnit": "Pages",
                            "#text": "17"
                        },
                        "MIMEType": "text/html",
                        "URL": "https://www.dfgdfg.com/"
                    },
                    {
                        "@primaryIndicator": "No",
                        "@resourceID": "709c7bdb1c99",
                        "MIMEType": "tyy/image",
                        "URL": "https://ir.ght.com"
                    },
                    {
                        "@primaryIndicator": "No",
                        "@resourceID": "gfjhgj",
                        "MIMEType": "gtty/image",
                        "URL": "https://ir.gtty.com"
                    }
                ]
            },
            "Context": {
                "@external": "Yes",
                "IssuerDetails": {
                    "Issuer": {
                        "@issuerType": "Corporate",
                        "@primaryIndicator": "Yes",
                        "SecurityDetails": {
                            "Security": {
                                "@estimateAction": "Revision",
                                "@primaryIndicator": "Yes",
                                "@targetPriceAction": "Increase",
                                "SecurityID": [
                                    {
                                        "@idType": "RIC",
                                        "@idValue": "PMV.AX",
                                        "@publisherDefinedValue": "RIC"
                                    },
                                    {
                                        "@idType": "Bloomberg",
                                        "@idValue": "PMV@AU"
                                    },
                                    {
                                        "@idType": "SEDOL",
                                        "@idValue": "6699781"
                                    }
                                ],
                                "SecurityName": "Premier Investments Ltd",
                                "AssetClass": {
                                    "@assetClass": "Equity"
                                },
                                "AssetType": {
                                    "@assetType": "Stock"
                                },
                                "SecurityType": {
                                    "@securityType": "Common"
                                },
                                "Rating": {
                                    "@rating": "NeutralSentiment",
                                    "@ratingType": "Rating",
                                    "@aspect": "Investment",
                                    "@ratingDateTime": "2020-07-31T08:24:37Z",
                                    "RatingEntity": {
                                        "@ratingEntity": "PublisherDefined",
                                        "PublisherDefinedValue": "Citi"
                                    }
                                }
                            }
                        },
                        "IssuerID": {
                            "@idType": "PublisherDefined",
                            "@idValue": "PMV.AX",
                            "@publisherDefinedValue": "TICKER"
                        },
                        "IssuerName": {
                            "@nameType": "Legal",
                            "NameValue": "Premier Investments Ltd"
                        }
                    }
                },
                "ProductDetails": {
                    "@periodicalIndicator": "No",
                    "@publicationDateTime": "2022-03-25T12:18:41Z",
                    "ProductCategory": {
                        "@productCategory": "Report"
                    },
                    "ProductFocus": {
                        "@focus": "Issuer",
                        "@primaryIndicator": "Yes"
                    },
                    "EntitlementGroup": {
                        "Entitlement": [
                            {
                                "@includeExcludeIndicator": "Include",
                                "@primaryIndicator": "No",
                                "AudienceTypeEntitlement": {
                                    "@audienceType": "PublisherDefined",
                                    "@entitlementContext": "TR",
                                    "#text": "20012"
                                }
                            },
                            {
                                "@includeExcludeIndicator": "Include",
                                "@primaryIndicator": "No",
                                "AudienceTypeEntitlement": {
                                    "@audienceType": "PublisherDefined",
                                    "@entitlementContext": "TR",
                                    "#text": "2001"
                                }
                            }
                        ]
                    }
                },
                "ProductClassifications": {
                    "Discipline": {
                        "@disciplineType": "Investment",
                        "@researchApproach": "Fundamental"
                    },
                    "Subject": {
                        "@publisherDefinedValue": "TREPS",
                        "@subjectValue": "PublisherDefined"
                    },
                    "Country": {
                        "@code": "AU",
                        "@primaryIndicator": "Yes"
                    },
                    "Region": {
                        "@primaryIndicator": "Yes",
                        "@emergingIndicator": "No",
                        "@regionType": "Australasia"
                    },
                    "AssetClass": {
                        "@assetClass": "Equity"
                    },
                    "AssetType": {
                        "@assetType": "Stock"
                    },
                    "SectorIndustry": [
                        {
                            "@classificationType": "GICS",
                            "@code": "25201040",
                            "@focusLevel": "Yes",
                            "@level": "4",
                            "@primaryIndicator": "Yes",
                            "Name": "Household Appliances"
                        },
                        {
                            "@classificationType": "GICS",
                            "@code": "25504020",
                            "@focusLevel": "Yes",
                            "@level": "4",
                            "@primaryIndicator": "Yes",
                            "Name": "Computer & Electronics Retail"
                        },
                        {
                            "@classificationType": "GICS",
                            "@code": "25504040",
                            "@focusLevel": "Yes",
                            "@level": "4",
                            "@primaryIndicator": "Yes",
                            "Name": "Specialty Stores"
                        },
                        {
                            "@classificationType": "GICS",
                            "@code": "25504030",
                            "@focusLevel": "Yes",
                            "@level": "4",
                            "@primaryIndicator": "Yes",
                            "Name": "Home Improvement Retail"
                        },
                        {
                            "@classificationType": "GICS",
                            "@code": "25201050",
                            "@focusLevel": "Yes",
                            "@level": "4",
                            "@primaryIndicator": "Yes",
                            "Name": "Housewares & Specialties"
                        }
                    ]
                }
            }
        }
    }
}

I want to explode all of its elements into a data frame. The number of columns that have a list-like structure can also change. Basically, we will not know in advance whether the next input will have fewer or more columns to be exploded.

This is what I have tried so far, but it does not seem to give me the correct result. Also, I have hardcoded the column names, whereas the code should identify the list columns itself and then explode them.

import xmltodict as xmltodict
from pprint import pprint
import pandas as pd
import json
from tabulate import tabulate

# Parse the XML payload into nested mappings. The result is deliberately NOT
# called "dict": that name would shadow the built-in type for the rest of the
# script.
parsed = xmltodict.parse("""xml data""")

# Round-trip through JSON text; presumably this is meant to turn whatever
# mapping type the parser returns into plain dicts/lists -- TODO confirm.
json_str = json.dumps(parsed)
resp = json.loads(json_str)
print(resp)

# Flatten nested dictionaries into dotted column names; lists stay as cells.
df = pd.json_normalize(resp)

# Dotted paths of the columns that are known to hold lists in the sample data.
cols = [
    'Research.Product.Source.Organization.OrganizationID',
    'Research.Product.Content.Resource',
    'Research.Product.Context.IssuerDetails.Issuer.SecurityDetails.Security.SecurityID',
    'Research.Product.Context.ProductDetails.EntitlementGroup.Entitlement',
    'Research.Product.Context.ProductClassifications.SectorIndustry',
]
    
def expplode_columns(df, cols):
    """Return a copy of ``df`` in which every column of ``cols`` is exploded.

    The columns are exploded one after another, so the row count grows with
    each list column processed. The index of the result is renumbered after
    every step. The input frame itself is left untouched.

    Args:
        df: source DataFrame.
        cols: iterable of column labels whose list-like cells are exploded.

    Returns:
        A new DataFrame with one row per combination of exploded items.
    """
    result = df.copy()
    for column_name in cols:
        result = result.explode(column_name, ignore_index=True)
    return result


# Explode the hard-coded list columns, then print the result as a text table.
df2 = expplode_columns(df, cols)
print(tabulate(df2, headers="keys", tablefmt="psql"))
# Optional: write the exploded frame to CSV instead of (or besides) printing.
# df2.to_csv('dataframe.csv', header=True, index=False)

I guess you're going to want something recursive. Basically, keep exploding a column until the values are str, not list. I have never done something like that with pandas, though, and I'm really not sure how it would look, to be honest. — rayad
– rayad, Commented Jun 16, 2022 at 21:13

Laurent · Accepted Answer · 2022-09-18 07:46:20Z

2

As suggested in the comments, you can define a helper function in pure Python to recursively flatten the nested values of your data.

So, with the json file you provided, here is one way to do it:

def flatten(data, new_data=None):
    """Recursively collect the leaf key/value pairs of a nested dictionary.

    Nested dictionaries are descended into; every other value (string,
    number, boolean, ``None`` or list) is copied as-is under its own key.
    Parent keys are not kept, so a key that occurs at several places in the
    nesting ends up only once in the result: the value met last wins.

    Args:
        data: nested dictionary.
        new_data: dictionary receiving the flattened pairs. It is filled in
            place. Defaults to a fresh empty dictionary.

    Returns:
        Flattened dictionary (the same object as ``new_data`` when given).

    """
    if new_data is None:
        new_data = {}
    for key, value in data.items():
        if isinstance(value, dict):
            flatten(value, new_data)
        else:
            # Keep every non-dict value, so floats and nulls are not lost.
            new_data[key] = value
    return new_data

And then:

import json

import pandas as pd

# Decode the file explicitly as UTF-8 so that non-ASCII characters in the
# data (e.g. a typographic apostrophe) do not depend on the platform's
# default encoding.
with open("file.json", encoding="utf-8") as f:
    content = json.load(f)

# Build the frame from the flattened dictionary and transpose it, so that
# the flattened keys become the column labels.
df = pd.DataFrame.from_dict(flatten(content, {}), orient="index").T

From here, you can deal with the columns that contain lists of dictionaries with identical keys but different values, by exploding them and repeating the other values, like this:

# Columns whose cell in row 0 holds a list (e.g. a list of dictionaries).
cols_with_lists = [col for col in df.columns if isinstance(df.loc[0, col], list)]

for col in cols_with_lists:
    # One single-row frame per list item, stacked so that item i sits on row i.
    temp_df = pd.concat(
        [pd.DataFrame(item, index=[i]) for i, item in enumerate(df.loc[0, col])],
        axis=0,
    )
    # Swap the list column for the expanded item columns, then forward-fill so
    # the values of the other columns are repeated on the newly added rows.
    # ``.ffill()`` replaces the deprecated ``fillna(method="ffill")`` spelling.
    df = pd.concat([df.drop(columns=[col]), temp_df], axis=1).ffill()

So that, finally, the json file is entirely flattened:

print(df)
# Output
                          @xmlns @language  ... @primaryIndicator                           Name
0  http://www.xml.org/2013/2/XML       eng  ...               Yes           Household Appliances
1  http://www.xml.org/2013/2/XML       eng  ...               Yes  Computer & Electronics Retail
2  http://www.xml.org/2013/2/XML       eng  ...               Yes               Specialty Stores
3  http://www.xml.org/2013/2/XML       eng  ...               Yes        Home Improvement Retail
4  http://www.xml.org/2013/2/XML       eng  ...               Yes       Housewares & Specialties

[5 rows x 73 columns]

edited Sep 18, 2022 at 7:46

answered Jun 19, 2022 at 14:13

Laurent

13.7k7 gold badges30 silver badges49 bronze badges

Sign up to request clarification or add additional context in comments.

5 Comments

Atharv Thakur Over a year ago

This is what i am also getting but the issue is ,I want to explode all columns like OrganizationID, Resource without mentioning the name in explode

Laurent Over a year ago

This is totally doable, but how do you want to deal with duplicated keys? Overwrite associated values? Or explode them in rows and repeat values of non-list columns on each row?

Atharv Thakur Over a year ago

We would like to repeat the values of the non-list columns for now. Your code is very clean and understandable.

Pritam Dodeja Over a year ago

Is the parameter new_data required in flatten? What purpose does it serve?

Laurent Over a year ago

new_data is an empty dictionary which serves the purpose to receive the nested key/value pairs in the json file. It can be removed from the function parameters and defined inside the function itself, if this is more clear for you, see my updated answer; Cheers.

Emma · Accepted Answer · 2022-06-24 19:55:46Z

2

It's a little hacky, but you can extract the columns that have a list type in them. Then use reduce to recursively explode and normalize all columns until there are no more lists/objects.

I haven't tested it thoroughly, but something like this should work.

from functools import reduce

def full_explode_normalize(df):
    """Explode all list columns of ``df`` and flatten dictionaries inside them.

    A column is treated as a list column when its value in the first row is
    a list. Each such column is exploded into rows; if the exploded items are
    dictionaries they are expanded into ``"<column>.<key>"`` columns. The
    process is repeated until the first row contains no list any more, so
    lists nested inside those dictionaries are handled as well.

    Args:
        df: DataFrame whose cells may hold lists / lists of dictionaries.

    Returns:
        The fully exploded DataFrame, or ``df`` itself when there is nothing
        to explode (including when ``df`` has no rows).
    """
    # An empty frame has no first row to inspect and nothing to explode.
    if df.empty:
        return df

    # Extract list columns
    explode_cols = [x for x in df.columns if isinstance(df[x].iloc[0], list)]
    if not explode_cols:
        return df

    # Explode and normalize the list
    df = reduce(_explode, explode_cols, df)

    # Normalizing dictionaries can surface new list columns; go again until
    # none is left.
    return full_explode_normalize(df)


def _explode(df, col):
    """Explode column ``col`` and, if it holds dictionaries, flatten them.

    Args:
        df: DataFrame containing ``col``.
        col: label of the list column to explode.

    Returns:
        A DataFrame with a fresh 0..n-1 index. Lists of scalars stay in
        ``col``; lists of dictionaries are replaced by ``"<col>.<key>"``
        columns.
    """
    # A clean 0..n-1 index lets the concat below line up row by row.
    df = df.explode(col, ignore_index=True)

    first_value = df[col].iloc[0]
    if isinstance(first_value, list):
        # List of lists: keep exploding until the cells are no longer lists.
        df = _explode(df, col)
    elif isinstance(first_value, dict):
        # Cells that are not dictionaries (e.g. NaN left by an empty list)
        # contribute an empty record, i.e. a row of missing values.
        records = [v if isinstance(v, dict) else {} for v in df[col]]
        df_child = pd.json_normalize(records)
        # To prevent column name collision, add the parent column name as prefix.
        df_child.columns = [f'{col}.{x}' for x in df_child.columns]
        df = pd.concat([df.drop(columns=[col]), df_child], axis=1)
    # Anything else (plain scalars) is already fully exploded: keep the column.

    return df

edited Jun 24, 2022 at 19:55

answered Jun 24, 2022 at 19:20

Emma

9,5781 gold badge22 silver badges38 bronze badges

1 Comment

Yaakov Bressler Over a year ago

This is a good solution.

Collectives™ on Stack Overflow

How to identify and explode a nested json file as columns of a dataframe?

2 Answers 2

5 Comments

1 Comment

Your Answer

Linked

Hot Network Questions

Collectives™ on Stack Overflow

2 Answers 2

5 Comments

1 Comment

Your Answer

Sign up or log in

Post as a guest

Linked

Related