1

how to read this structure in python?

<?xml version="1.0" encoding="ISO-8859-1"?>

-<Workbook xmlns:html="http://www.w3.org/TR/REC-html40" xmlns:x="urn:schemas-microsoft-com:office:excel" xmlns:msxsl="urn:schemas-microsoft-com:xslt" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:ss="urn:schemas-microsoft-com:office:spreadsheet" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="urn:schemas-microsoft-com:office:spreadsheet">


-<Styles>


-<Style ss:ID="VIEW">

<Font ss:Bold="1"/>

</Style>


-<Style ss:ID="HEADER">

<Font ss:Bold="1" ss:Color="#FFFFFF"/>

<Interior ss:Color="#666699" ss:Pattern="Solid"/>

</Style>


-<Style ss:ID="DOUBLE_0">

<NumberFormat ss:Format="0"/>

</Style>


-<Style ss:ID="DOUBLE_2">

<NumberFormat ss:Format="0.00"/>

</Style>


-<Style ss:ID="DOUBLE_3">

<NumberFormat ss:Format="0.000"/>

</Style>


-<Style ss:ID="DOUBLE_4">

<NumberFormat ss:Format="0.0000"/>

</Style>


-<Style ss:ID="PERCENT_FIXED_2">

<NumberFormat ss:Format="0.00%"/>

</Style>


-<Style ss:ID="PERCENT_FIXED_3">

<NumberFormat ss:Format="0.000%"/>

</Style>


-<Style ss:ID="PERCENT_FIXED_4">

<NumberFormat ss:Format="0.0000%"/>

</Style>


-<Style ss:ID="PERCENT_FIXED_5">

<NumberFormat ss:Format="0.00000%"/>

</Style>


-<Style ss:ID="DATE">

<NumberFormat ss:Format="yyyy\-mm\-dd;@"/>

</Style>

<ss:Style ss:ID="STRING"/>

</Styles>


-<Worksheet ss:Name="MSFRE">


-<Table x:FullRows="1" x:FullColumns="1">


-<Row>


-<Cell ss:StyleID="VIEW" ss:Index="1">

<Data ss:Type="String">Geo</Data>

</Cell>

</Row>


-<Row>

<Cell ss:StyleID="HEADER"/>

<Cell ss:StyleID="HEADER"/>


-<Cell ss:StyleID="HEADER" ss:Index="3" ss:MergeAcross="5">

<Data ss:Type="String"/>

</Cell>

</Row>


-<Row>

<Cell ss:StyleID="HEADER"/>

<Cell ss:StyleID="HEADER"/>


-<Cell ss:StyleID="HEADER" ss:Index="3">

<Data ss:Type="String">Holding Date</Data>

</Cell>


-<Cell ss:StyleID="HEADER" ss:Index="4">

<Data ss:Type="String">Fund code</Data>

</Cell>


-<Cell ss:StyleID="HEADER" ss:Index="5">

<Data ss:Type="String">Fund name</Data>

</Cell>


-<Cell ss:StyleID="HEADER" ss:Index="6">

<Data ss:Type="String">PTF</Data>

</Cell>


-<Cell ss:StyleID="HEADER" ss:Index="7">

<Data ss:Type="String">REF</Data>

</Cell>


-<Cell ss:StyleID="HEADER" ss:Index="8">

<Data ss:Type="String">PTF-REF</Data>

</Cell>

</Row>


-<Row>


-<Cell ss:Index="1">

<Data ss:Type="String">Geo</Data>

</Cell>


-<Cell ss:StyleID="DATE" ss:Index="3">

<Data ss:Type="DateTime">2020-09-10T00:00:00</Data>

</Cell>


-<Cell ss:Index="4">

<Data ss:Type="String">PF39594</Data>

</Cell>


-<Cell ss:Index="5">

<Data ss:Type="String">MSFRE</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="6">

<Data ss:Type="Number">0.18545250736645816</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="8">

<Data ss:Type="Number">0.18545250736645816</Data>

</Cell>

</Row>


-<Row>


-<Cell ss:Index="2">

<Data ss:Type="String">Asia ex-Japan</Data>

</Cell>


-<Cell ss:StyleID="DATE" ss:Index="3">

<Data ss:Type="DateTime">2020-09-10T00:00:00</Data>

</Cell>


-<Cell ss:Index="4">

<Data ss:Type="String">PF39594</Data>

</Cell>


-<Cell ss:Index="5">

<Data ss:Type="String">MSFRE</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="6">

<Data ss:Type="Number">9.356235001537855E-4</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="8">

<Data ss:Type="Number">9.356235001537855E-4</Data>

</Cell>

</Row>


-<Row>


-<Cell ss:Index="2">

<Data ss:Type="String">EMU</Data>

</Cell>


-<Cell ss:StyleID="DATE" ss:Index="3">

<Data ss:Type="DateTime">2020-09-10T00:00:00</Data>

</Cell>


-<Cell ss:Index="4">

<Data ss:Type="String">PF39594</Data>

</Cell>


-<Cell ss:Index="5">

<Data ss:Type="String">MSFRE</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="6">

<Data ss:Type="Number">0.10654090959320628</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="8">

<Data ss:Type="Number">0.10654090959320628</Data>

</Cell>

</Row>


-<Row>


-<Cell ss:Index="2">

<Data ss:Type="String">Emerging Countries</Data>

</Cell>


-<Cell ss:StyleID="DATE" ss:Index="3">

<Data ss:Type="DateTime">2020-09-10T00:00:00</Data>

</Cell>


-<Cell ss:Index="4">

<Data ss:Type="String">PF39594</Data>

</Cell>


-<Cell ss:Index="5">

<Data ss:Type="String">MSFRE</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="6">

<Data ss:Type="Number">0.00294017805163712</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="8">

<Data ss:Type="Number">0.00294017805163712</Data>

</Cell>

</Row>


-<Row>


-<Cell ss:Index="2">

<Data ss:Type="String">Europe ex-EMU</Data>

</Cell>


-<Cell ss:StyleID="DATE" ss:Index="3">

<Data ss:Type="DateTime">2020-09-10T00:00:00</Data>

</Cell>


-<Cell ss:Index="4">

<Data ss:Type="String">PF39594</Data>

</Cell>


-<Cell ss:Index="5">

<Data ss:Type="String">MSFRE</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="6">

<Data ss:Type="Number">0.02354783768818136</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="8">

<Data ss:Type="Number">0.02354783768818136</Data>

</Cell>

</Row>


-<Row>


-<Cell ss:Index="2">

<Data ss:Type="String">Japan</Data>

</Cell>


-<Cell ss:StyleID="DATE" ss:Index="3">

<Data ss:Type="DateTime">2020-09-10T00:00:00</Data>

</Cell>


-<Cell ss:Index="4">

<Data ss:Type="String">PF39594</Data>

</Cell>


-<Cell ss:Index="5">

<Data ss:Type="String">MSFRE</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="6">

<Data ss:Type="Number">0.005898729959204227</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="8">

<Data ss:Type="Number">0.005898729959204227</Data>

</Cell>

</Row>


-<Row>


-<Cell ss:Index="2">

<Data ss:Type="String">North America</Data>

</Cell>


-<Cell ss:StyleID="DATE" ss:Index="3">

<Data ss:Type="DateTime">2020-09-10T00:00:00</Data>

</Cell>


-<Cell ss:Index="4">

<Data ss:Type="String">PF39594</Data>

</Cell>


-<Cell ss:Index="5">

<Data ss:Type="String">MSFRE</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="6">

<Data ss:Type="Number">0.044037874185699856</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="8">

<Data ss:Type="Number">0.044037874185699856</Data>

</Cell>

</Row>


</Table>

</Worksheet>

</Workbook>

if I use this code the cell index collapse and can't recreate the table:

import pandas as pd
import xml.etree.cElementTree as ET

#dict_bd_area = 
ns = {"doc": "urn:schemas-microsoft-com:office:spreadsheet"}

tree = ET.parse("filepath")
root = tree.getroot()

def getvalueofnode(node):
    """ return node text or None """
    return node.text if node is not None else None


def main():
    """ main """
    parsed_xml = tree

    data = []
    for i, node in enumerate(root.findall('.//doc:Row', ns)):
        if i > 3:
            data.append({'Geo': getvalueofnode(node.find('doc:Cell[1]/doc:Data', ns)),
                         'Holding - Date': getvalueofnode(node.find('doc:Cell[2]/doc:Data', ns)),
                         'Fund Code': getvalueofnode(node.find('doc:Cell[3]/doc:Data', ns)),
                         'Name': getvalueofnode(node.find('doc:Cell[4]/doc:Data', ns)), 
                         'PTF': getvalueofnode(node.find('doc:Cell[5]/doc:Data', ns)),
                         'REF': getvalueofnode(node.find('doc:Cell[6]/doc:Data', ns)),
                         'PTF-REF':getvalueofnode(node.find('doc:Cell[7]/doc:Data', ns)),
                         'SPS PTF': getvalueofnode(node.find('doc:Cell[8]/doc:Data', ns)),
                         'SPS REF': getvalueofnode(node.find('doc:Cell[9]/doc:Data', ns))
                        })

return(pd.DataFrame(data))

How to handle the cell index if is empty or null as in this case? (in this case cell index 7 doesn't always exists). The Win32 library cannot help me in this case, I tried but it didn't have performance required to handle thousands of file

UPDATE: 2

What if I want to parse the parentcolDimension? This is the beginning of the XML file:

<loadingDate>2021-08-10T00:00:00+02:00</loadingDate>
<exportDate>2021-08-11T18:43:22.513+02:00</exportDate>
<exportParameters key="occurrenceKey" value="Family_occurrence"/>
<exportParameters key="occurrenceName" value="Family occurrence"/>
<exportParameters key="ConfigurationPath" value="/usr/users/reporting/configurations"/>
<exportParameters key="scenarioKey" value="mult"/>
<exportParameters key="perspective" value="Mulnt"/>
<dataViews viewHashcode="-75742857" showRootLine="true">
<viewKey>Flat</viewKey>
<viewName>Flat</viewName>
<viewType>tableSectionViewType</viewType>
<sectionInfos>
<sectionInfo sectionKey="MultiView" showSectionLine="false" startLine="0"/>
</sectionInfos>
<viewId>3</viewId>
<headers>
<header id="1" key="GF 350006|" desc="<html><font color=#00A0E3><b>Weight% (PTF) groupName="GF 350006" Key="WEIGHT#PTF" dataType="Number" dataFormat="PERCENT_FIXED_2">
<parentColDimension label="GF 350006"/>
</header>
<header id="2" key="GF TOP PRIVATE 350007 groupName="GF 350007"Key="WEIGHT#PTF" dataType="Number" dataFormat="PERCENT_FIXED_2">
<parentColDimension label="GF 350007"/>

Update 3: How could I retrieve how many times this cell <Cell ss:StyleID="s63"/> (the ones with no data) occurs, for example, in the second row?

<?xml version="1.0"?>
<?mso-application progid="Excel.Sheet"?>
<Workbook xmlns="urn:schemas-microsoft-com:office:spreadsheet"
 xmlns:o="urn:schemas-microsoft-com:office:office"
 xmlns:x="urn:schemas-microsoft-com:office:excel"
 xmlns:ss="urn:schemas-microsoft-com:office:spreadsheet"
 xmlns:html="http://www.w3.org/TR/REC-html40"
 xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 xmlns:msxsl="urn:schemas-microsoft-com:xslt">
 <DocumentProperties xmlns="urn:schemas-microsoft-com:office:office">
  <Version>16.00</Version>
 </DocumentProperties>
 <OfficeDocumentSettings xmlns="urn:schemas-microsoft-com:office:office">
  <AllowPNG/>
 </OfficeDocumentSettings>
 <ExcelWorkbook xmlns="urn:schemas-microsoft-com:office:excel">
  <WindowHeight>6760</WindowHeight>
  <WindowWidth>15530</WindowWidth>
  <WindowTopX>0</WindowTopX>
  <WindowTopY>0</WindowTopY>
  <ProtectStructure>False</ProtectStructure>
  <ProtectWindows>False</ProtectWindows>
 </ExcelWorkbook>
 <Styles>
  <Style ss:ID="Default" ss:Name="Normal">
   <Alignment ss:Vertical="Bottom"/>
   <Borders/>
   <Font ss:FontName="Calibri" x:Family="Swiss" ss:Size="11" ss:Color="#000000"/>
   <Interior/>
   <NumberFormat/>
   <Protection/>
  </Style>
  <Style ss:ID="s62">
   <Alignment ss:Vertical="Bottom"/>
   <Borders/>
   <Font ss:FontName="Arial" ss:Bold="1"/>
   <Interior/>
   <NumberFormat/>
   <Protection/>
  </Style>
  <Style ss:ID="s63">
   <Alignment ss:Vertical="Bottom"/>
   <Borders/>
   <Font ss:FontName="Arial" ss:Color="#FFFFFF" ss:Bold="1"/>
   <Interior ss:Color="#666699" ss:Pattern="Solid"/>
   <NumberFormat/>
   <Protection/>
  </Style>
  <Style ss:ID="s65">
   <Alignment ss:Vertical="Bottom"/>
   <Borders/>
   <Font ss:FontName="Calibri" x:Family="Swiss" ss:Size="11" ss:Color="#000000"/>
   <Interior/>
   <NumberFormat ss:Format="Percent"/>
   <Protection/>
  </Style>
 </Styles>
 <Worksheet ss:Name="- Family split occurrence-3- ">
  <Table ss:ExpandedColumnCount="262" ss:ExpandedRowCount="2460" x:FullColumns="1"
   x:FullRows="1" ss:DefaultRowHeight="14.5">
   <Row>
    <Cell ss:StyleID="s62"><Data ss:Type="String">Transparency Equity</Data></Cell>
   </Row>
   <Row>
    <Cell ss:StyleID="s63"/>
    <Cell ss:StyleID="s63"/>
    <Cell ss:StyleID="s63"/>
    <Cell ss:StyleID="s63"/>
    <Cell ss:StyleID="s63"/>
    <Cell ss:StyleID="s63"/>
    <Cell ss:MergeAcross="4" ss:StyleID="s63"><Data ss:Type="String">Date 0</Data></Cell>
    <Cell ss:MergeAcross="4" ss:StyleID="s63"><Data ss:Type="String">Date 1</Data></Cell>

1 Answer 1

1

Try to use "Index" attribute instead of cell element index:

# add "ss" namespace declaration to the namespaces map
ns = {"doc": "urn:schemas-microsoft-com:office:spreadsheet", "ss": "urn:schemas-microsoft-com:office:spreadsheet"}

# in function call reference element "Cell" having an attribute "Index" with value "7"
getvalueofnode(node.find('doc:Cell[@ss:Index="7"]/doc:Data', ns))

Same approach could be used for other cells as well.

This code will try to find cell element with the given index attribute. If not found function getvalueofnode() will return None.

To get parentColDimension following code could be used:

for parentColDimension in root.findall('.//doc:header/doc:parentColDimension', ns):
  print(parentColDimension.get('label'))
Sign up to request clarification or add additional context in comments.

4 Comments

What if I want to parse the headers?
Updated answer with the code snippet for geting parentColDimension label
Yes, I figured it out! But do you know where I could get more information to parse these kind of .xml?
Any help on the 3rd part?

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.