1

I have an Excel sheet that I would like to read into a pandas MultiIndex dataframe. The complication is that the Excel sheet contains duplicate header values. When reading the file, pandas appends a `.x` suffix to the second-level headers instead of the first-level ones. Is there a way to have pandas rename the top-level header instead of the second-level header?

Example excel file: enter image description here

Read Script:

from pathlib import Path
import pandas as pd


def main():
    """Load 'Sheet1' of pandasExample.xlsx with a two-row header and print it."""
    workbook = Path('.') / 'pandasExample.xlsx'
    # skiprows=[0] drops the first sheet row; the next two rows become the
    # two levels of the column MultiIndex.
    frame = pd.read_excel(
        workbook,
        sheet_name='Sheet1',
        header=[0, 1],
        skiprows=[0],
    )
    print(frame)


# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

The output:

  Rectangle        Ellipse    Rectangle
      Width Height       a  b   Width.1 Height.1 Width.2 Height.2
0        10     20       1  2        20       30      40       50

Desired output:

  Rectangle        Ellipse    Rectangle.1        Rectangle.2       
      Width Height       a  b      Width Height      Width Height
0        10     20       1  2         20     30         40     50

2 Answers 2

1

Here's a different answer that produces the exact desired output listed in the question.

from pathlib import Path
import pandas as pd
from typing import List


def rename_headers(headers: List[str]) -> List[str]:
    """Make repeated header names unique by numbering the repeats.

    Anything after the first '.' in a header is discarded before counting,
    so a name that already carries a numeric suffix (e.g. 'Rectangle.1') is
    treated as another occurrence of 'Rectangle'. The first occurrence keeps
    the bare name; the n-th repeat becomes '<name>.<n>'.

    Note that a header which legitimately contains a '.' is truncated at the
    first dot as well.

    Args:
        headers: Header names in column order.

    Returns:
        A new list of the same length with unique names, in the same order.
    """
    occurrences = {}
    new_headers = []
    for header in headers:
        # Strip any existing '.N' suffix so all duplicates share one counter.
        header_prefix = header.split('.')[0]
        seen = occurrences.get(header_prefix, 0)
        if seen > 0:
            new_headers.append(f'{header_prefix}.{seen}')
        else:
            new_headers.append(header_prefix)
        # The original wrote to an undefined name here, raising NameError.
        occurrences[header_prefix] = seen + 1
    return new_headers

def main():
    """Read pandasExample.xlsx and print it with renamed two-level columns.

    The sheet is read twice: once with a single header row to obtain the
    top-level names, and once with a two-row header to obtain the data.
    The top-level names are passed through ``rename_headers`` and the
    second-level names have everything after the first '.' removed; the two
    lists are then combined into a new column MultiIndex.
    """
    xl_file = Path('.') / 'pandasExample.xlsx'

    # Read first level headers
    # (single header row, first sheet row skipped, at most one data row).
    header_df = pd.read_excel(xl_file, sheet_name='Sheet1', header=[
        0], skiprows=[0], nrows=1)
    # Drop column names starting with 'Unnamed' -- presumably the
    # placeholders pandas generates for blank header cells; TODO confirm
    # against the actual workbook.
    headers = list(filter(lambda x: not x.startswith(
        'Unnamed'), list(header_df.columns)))

    # Generate the desired headers
    new_headers = rename_headers(headers)

    # Read in the full dataframe
    df = pd.read_excel(xl_file, sheet_name='Sheet1', header=[
        0, 1], skiprows=[0])

    # Create a dictionary that identifies the parameters for each unique header
    # (key: top-level name; value: its distinct second-level names with any
    # '.N' suffix stripped).
    unique_headers = pd.unique(pd.Index(df.columns.get_level_values(0)))
    parameters = {}
    for header in unique_headers:
        parameters[header] = pd.unique(
            [column.split('.')[0] for column in df[header].columns])


    # NOTE: despite the variable name, this is the result of stack() on the
    # first data row.
    unstack_df = df.head(1).stack()
    # Keep order of the original index after stack
    index = df.head(1).unstack().index.get_level_values(1)
    unstack_df = unstack_df.reindex(zip([0] * len(index), index))
    # reset_index() exposes the index levels as columns; 'level_1' is read
    # below to recover the second-level names.
    unstack_df = unstack_df.reset_index()

    # Create the new level 0 and level 1 headers
    level_0 = []
    for header in new_headers:
        # Repeat each renamed top-level header once per second-level name
        # recorded for its base (un-suffixed) name.
        level_0 += [header] * len(parameters[header.split('.')[0]])
    level_1 = [parameter.split('.')[0] for parameter in unstack_df['level_1']]

    # Rename level 0 and level 1 columns for the dataframe
    df.columns = pd.MultiIndex.from_tuples(zip(level_0, level_1))
    print(df)


# Run the example only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

Output:

  Rectangle        Ellipse    Rectangle.1        Rectangle.2       
      Width Height       a  b       Width Height       Width Height
0        10     20       1  2          20     30          40     50
Sign up to request clarification or add additional context in comments.

Comments

1

Unstack the dataframe, then reassign level_0 to unique labels. I did it manually, but you can do it programmatically by adding a suffix to every two columns. Set the multi-index, then stack the results. There are three values in the tuple: level 0, level 1, and 0.

 # Read the sheet with skiprows=2 and nrows=10 (default single header row).
 df=pd.read_excel('dup_header.xls',skiprows=2,nrows=10)
 # NOTE: despite the variable name, this holds the result of stack().
 unstack_df=df.stack()
 # reset_index() exposes the index levels as columns; 'level_0' and
 # 'level_1' are used below.
 unstack_df=unstack_df.reset_index()
 # Hand-written unique top-level labels, one per row of unstack_df.
 # Assumes exactly eight rows in this order -- TODO confirm for other sheets.
 unstack_df['level_0']=['Rectangle1','Rectangle1','Ellipse','Ellipse','Rectangle2','Rectangle2','Rectangle3','Rectangle3']
 unstack_df=unstack_df.set_index(['level_0','level_1'])
 # Stack the remaining column(s) into the index.
 stack_series=unstack_df.stack()

 # Convert the result to a one-column frame and name that column 'value'.
 df=stack_series.to_frame()
 df.columns=['value']
 #print(df.index)
 #print(df.values)
 print(df)

Output:

                             value
  level_0     level_1    
  Rectangle1  Width     0    10
              Height    0    20
  Ellipse     a         0     1
              b         0     2
  Rectangle2  width     0    20
              height    0    30
  Rectangle3  width.1   0    40
              height.1  0    50

3 Comments

This does not work with multi-index headers like the one in my example. It raises `ValueError: cannot specify names when specifying a multi-index header`.
Skip over header level 0 and replace it with unique labels, then set the multi-index on the dataframe, stack the values, and convert the result to a dataframe; see above.
That works I guess. Will have to write some extra code to figure out the header values.

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.