I am rephrasing my question to make it clearer. My data looks like this:
{
"Research": {
"@xmlns": "http://www.xml.org/2013/2/XML",
"@language": "eng",
"@createDateTime": "2022-03-25T10:12:39Z",
"@researchID": "abcd",
"Product": {
"@productID": "abcd",
"StatusInfo": {
"@currentStatusIndicator": "Yes",
"@statusDateTime": "2022-03-25T12:18:41Z",
"@statusType": "Published"
},
"Source": {
"Organization": {
"@primaryIndicator": "Yes",
"@type": "SellSideFirm",
"OrganizationID": [
{
"@idType": "L1",
"#text": "D827C98E315F"
},
{
"@idType": "TR",
"#text": "3202"
},
{
"@idType": "TR",
"#text": "SZA"
}
],
"OrganizationName": {
"@nameType": "Legal",
"#text": "Citi"
},
"PersonGroup": {
"PersonGroupMember": {
"@primaryIndicator": "Yes",
"@sequence": "1",
"Person": {
"@personID": "tr56",
"FamilyName": "Wang",
"GivenName": "Bond",
"DisplayName": "Bond Wang",
"Biography": "Bond Wang is a",
"BiographyFormatted": "Bond Wang",
"PhotoResourceIdRef": "AS44556"
}
}
}
}
},
"Content": {
"Title": "Premier",
"Abstract": "None",
"Synopsis": "Premier’s solid 1H22 result .",
"Resource": [
{
"@language": "eng",
"@primaryIndicator": "Yes",
"@resourceID": "9553",
"Length": {
"@lengthUnit": "Pages",
"#text": "17"
},
"MIMEType": "text/html",
"URL": "https://www.DFKJG.com/rendition/eppublic"
},
{
"@language": "eng",
"@primaryIndicator": "No",
"@resourceID": "4809",
"Length": {
"@lengthUnit": "Pages",
"#text": "17"
},
"MIMEType": "ABS/pdf",
"Name": "asdf.pdf",
"Comments": "fr5.pdf"
},
{
"@language": "eng",
"@primaryIndicator": "No",
"@resourceID": "6d13a965723e",
"Length": {
"@lengthUnit": "Pages",
"#text": "17"
},
"MIMEType": "text/html",
"URL": "https://www.dfgdfg.com/"
},
{
"@primaryIndicator": "No",
"@resourceID": "709c7bdb1c99",
"MIMEType": "tyy/image",
"URL": "https://ir.ght.com"
},
{
"@primaryIndicator": "No",
"@resourceID": "gfjhgj",
"MIMEType": "gtty/image",
"URL": "https://ir.gtty.com"
}
]
},
"Context": {
"@external": "Yes",
"IssuerDetails": {
"Issuer": {
"@issuerType": "Corporate",
"@primaryIndicator": "Yes",
"SecurityDetails": {
"Security": {
"@estimateAction": "Revision",
"@primaryIndicator": "Yes",
"@targetPriceAction": "Increase",
"SecurityID": [
{
"@idType": "RIC",
"@idValue": "PMV.AX",
"@publisherDefinedValue": "RIC"
},
{
"@idType": "Bloomberg",
"@idValue": "PMV@AU"
},
{
"@idType": "SEDOL",
"@idValue": "6699781"
}
],
"SecurityName": "Premier Investments Ltd",
"AssetClass": {
"@assetClass": "Equity"
},
"AssetType": {
"@assetType": "Stock"
},
"SecurityType": {
"@securityType": "Common"
},
"Rating": {
"@rating": "NeutralSentiment",
"@ratingType": "Rating",
"@aspect": "Investment",
"@ratingDateTime": "2020-07-31T08:24:37Z",
"RatingEntity": {
"@ratingEntity": "PublisherDefined",
"PublisherDefinedValue": "Citi"
}
}
}
},
"IssuerID": {
"@idType": "PublisherDefined",
"@idValue": "PMV.AX",
"@publisherDefinedValue": "TICKER"
},
"IssuerName": {
"@nameType": "Legal",
"NameValue": "Premier Investments Ltd"
}
}
},
"ProductDetails": {
"@periodicalIndicator": "No",
"@publicationDateTime": "2022-03-25T12:18:41Z",
"ProductCategory": {
"@productCategory": "Report"
},
"ProductFocus": {
"@focus": "Issuer",
"@primaryIndicator": "Yes"
},
"EntitlementGroup": {
"Entitlement": [
{
"@includeExcludeIndicator": "Include",
"@primaryIndicator": "No",
"AudienceTypeEntitlement": {
"@audienceType": "PublisherDefined",
"@entitlementContext": "TR",
"#text": "20012"
}
},
{
"@includeExcludeIndicator": "Include",
"@primaryIndicator": "No",
"AudienceTypeEntitlement": {
"@audienceType": "PublisherDefined",
"@entitlementContext": "TR",
"#text": "2001"
}
}
]
}
},
"ProductClassifications": {
"Discipline": {
"@disciplineType": "Investment",
"@researchApproach": "Fundamental"
},
"Subject": {
"@publisherDefinedValue": "TREPS",
"@subjectValue": "PublisherDefined"
},
"Country": {
"@code": "AU",
"@primaryIndicator": "Yes"
},
"Region": {
"@primaryIndicator": "Yes",
"@emergingIndicator": "No",
"@regionType": "Australasia"
},
"AssetClass": {
"@assetClass": "Equity"
},
"AssetType": {
"@assetType": "Stock"
},
"SectorIndustry": [
{
"@classificationType": "GICS",
"@code": "25201040",
"@focusLevel": "Yes",
"@level": "4",
"@primaryIndicator": "Yes",
"Name": "Household Appliances"
},
{
"@classificationType": "GICS",
"@code": "25504020",
"@focusLevel": "Yes",
"@level": "4",
"@primaryIndicator": "Yes",
"Name": "Computer & Electronics Retail"
},
{
"@classificationType": "GICS",
"@code": "25504040",
"@focusLevel": "Yes",
"@level": "4",
"@primaryIndicator": "Yes",
"Name": "Specialty Stores"
},
{
"@classificationType": "GICS",
"@code": "25504030",
"@focusLevel": "Yes",
"@level": "4",
"@primaryIndicator": "Yes",
"Name": "Home Improvement Retail"
},
{
"@classificationType": "GICS",
"@code": "25201050",
"@focusLevel": "Yes",
"@level": "4",
"@primaryIndicator": "Yes",
"Name": "Housewares & Specialties"
}
]
}
}
}
}
}
I want to explode all of its elements into a data frame. The number of columns that have a list-like structure can also change. Basically, we will not know in advance whether the next input will have fewer or more columns to be exploded.
This is what I have tried so far, but it does not give me the correct result. Also, I have hard-coded the column names, but the code should identify the list-like columns itself and then explode them.
import xmltodict as xmltodict
from pprint import pprint
import pandas as pd
import json
from tabulate import tabulate
# Parse the XML payload. The result is bound to `parsed` rather than `dict`
# so the built-in `dict` type is not shadowed for the rest of the module.
parsed = xmltodict.parse("""xml data""")
# Round-trip through JSON so the data used below consists only of the plain
# dict/list/str values that json.loads produces.
json_str = json.dumps(parsed)
resp = json.loads(json_str)
print(resp)
# Flatten nested dicts into a single row with dot-separated column names;
# list values are left as-is inside their cells.
df = pd.json_normalize(resp)
# Dotted paths of the columns whose cells hold lists in the sample document.
cols=['Research.Product.Source.Organization.OrganizationID','Research.Product.Content.Resource','Research.Product.Context.IssuerDetails.Issuer.SecurityDetails.Security.SecurityID','Research.Product.Context.ProductDetails.EntitlementGroup.Entitlement','Research.Product.Context.ProductClassifications.SectorIndustry']
def expplode_columns(df, cols=None):
    """Explode list-valued columns of ``df`` into one row per list element.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    cols : list of column labels, a single ``str`` label, or None
        Columns to explode, in order. When ``None`` (the default), every
        column that contains at least one ``list`` value is detected and
        exploded, so the caller does not have to hard-code column names.

    Returns
    -------
    pandas.DataFrame
        A frame with a fresh 0..n-1 index in which each requested column
        has been exploded.

    Notes
    -----
    Columns are exploded one after another, so the row count is multiplied
    by the list length of every exploded column (all combinations of the
    list elements appear). Elements that are dicts remain dict objects in
    the exploded cells; they are not flattened into further columns.
    """
    if cols is None:
        # Auto-detect: a column qualifies as soon as any of its cells is a list.
        cols = [
            name
            for name, values in df.items()
            if any(isinstance(v, list) for v in values)
        ]
    elif isinstance(cols, str):
        # A bare string would otherwise be iterated character by character.
        cols = [cols]

    df_e = df.copy()
    for c in cols:
        df_e = df_e.explode(c, ignore_index=True)
    return df_e
# Explode the selected columns, then print the result as a psql-style table
# with the column labels as headers.
df2 = expplode_columns(df, cols)
rendered = tabulate(df2, headers="keys", tablefmt="psql")
print(rendered)
# df2.to_csv('dataframe.csv', header=True, index=False)