Pandas read_excel with duplicate header values

Question

I have an Excel sheet that I would like to read into a pandas MultiIndex DataFrame. The complication is that the sheet contains duplicate header values. When reading the file, pandas appends a .x suffix (e.g. .1, .2) to the second-level headers instead of the first-level ones. Is there a way to have pandas rename the top-level header instead of the second-level header?

Example excel file:

Read Script:

from pathlib import Path
import pandas as pd


def main():
    """Load 'Sheet1' of pandasExample.xlsx with a two-row header and print it."""
    workbook = Path('.') / 'pandasExample.xlsx'
    # Row 0 of the sheet is skipped; the next two rows form the column
    # MultiIndex (header levels 0 and 1).
    frame = pd.read_excel(
        workbook,
        sheet_name='Sheet1',
        header=[0, 1],
        skiprows=[0],
    )
    print(frame)


# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

The output:

  Rectangle        Ellipse    Rectangle
      Width Height       a  b   Width.1 Height.1 Width.2 Height.2
0        10     20       1  2        20       30      40       50

Desired output:

  Rectangle        Ellipse    Rectangle.1        Rectangle.2       
      Width Height       a  b      Width Height      Width Height
0        10     20       1  2         20     30         40     50

philimat · Accepted Answer · 2021-05-13 22:27:56Z

Here's a different answer that produces the exact desired output listed in the question.

from pathlib import Path
import pandas as pd
from typing import List


def rename_headers(headers: List[str]) -> List[str]:
    """Make repeated headers unique by numbering every repeat.

    Each header is first reduced to its base name, i.e. the text before the
    first ``.`` (so an existing suffix such as ``.1`` is discarded).  The
    first occurrence of a base name is returned unchanged; the n-th repeat
    (n >= 1) is returned as ``<base>.<n>``.

    Args:
        headers: Header labels in their original left-to-right order.

    Returns:
        A new list of the same length with the renamed headers, in the same
        order as ``headers``.
    """
    # Maps base name -> how many times it has been seen so far.
    seen_counts = {}
    new_headers = []
    for header in headers:
        header_prefix = header.split('.')[0]
        occurrence = seen_counts.get(header_prefix, 0)
        if occurrence > 0:
            new_headers.append(f'{header_prefix}.{occurrence}')
        else:
            new_headers.append(header_prefix)
        # Update the same dict that is read above; the original wrote to an
        # undefined name here, which raised NameError on the first header.
        seen_counts[header_prefix] = occurrence + 1
    return new_headers

def main():
    """Read pandasExample.xlsx and print it with de-duplicated top-level headers.

    The sheet is read twice: once with a single header row to recover the
    top-level header labels, and once with a two-row header to get the data.
    The column MultiIndex is then rebuilt so that repeated top-level headers
    carry the numeric suffix produced by rename_headers(), and the text after
    the first '.' is stripped from the second-level labels.
    """
    xl_file = Path('.') / 'pandasExample.xlsx'

    # Read first level headers
    header_df = pd.read_excel(xl_file, sheet_name='Sheet1', header=[
        0], skiprows=[0], nrows=1)
    # Drop column labels starting with 'Unnamed' -- presumably the placeholder
    # names pandas gives to blank header cells; confirm against the sheet.
    # NOTE(review): x.startswith assumes every column label is a string.
    headers = list(filter(lambda x: not x.startswith(
        'Unnamed'), list(header_df.columns)))

    # Generate the desired headers
    new_headers = rename_headers(headers)

    # Read in the full dataframe
    df = pd.read_excel(xl_file, sheet_name='Sheet1', header=[
        0, 1], skiprows=[0])

    # Create a dictionary that identifies the parameters for each unique header
    # (key: level-0 label; value: the unique level-1 labels under it, with
    # anything from the first '.' onwards removed).
    unique_headers = pd.unique(pd.Index(df.columns.get_level_values(0)))
    parameters = {}
    for header in unique_headers:
        parameters[header] = pd.unique(
            [column.split('.')[0] for column in df[header].columns])


    # Stack the first data row so the level-1 column labels move into the index.
    unstack_df = df.head(1).stack()
    # Keep order of the original index after stack
    index = df.head(1).unstack().index.get_level_values(1)
    # The (0, label) pairs assume the first data row has index label 0.
    unstack_df = unstack_df.reindex(zip([0] * len(index), index))
    # reset_index() is relied on to expose the labels as a 'level_1' column.
    unstack_df = unstack_df.reset_index()

    # Create the new level 0 and level 1 headers
    level_0 = []
    for header in new_headers:
        # Repeat each renamed header once per parameter of its base header.
        level_0 += [header] * len(parameters[header.split('.')[0]])
    level_1 = [parameter.split('.')[0] for parameter in unstack_df['level_1']]

    # Rename level 0 and level 1 columns for the dataframe
    df.columns = pd.MultiIndex.from_tuples(zip(level_0, level_1))
    print(df)


# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

Output:

  Rectangle        Ellipse    Rectangle.1        Rectangle.2       
      Width Height       a  b       Width Height       Width Height
0        10     20       1  2          20     30          40     50

ListenSoftware Louise Ai Agent · Accepted Answer · 2021-04-29 20:36:39Z

1

Unstack the dataframe, then reassign level_0 to unique labels. I did it manually, but you can do it programmatically by adding a suffix to every two columns. Set the multi-index, then stack the results. There are three values in each index tuple: level 0, level 1, and 0.

 # Skip the first two rows of the sheet and read at most 10 data rows.
 df=pd.read_excel('dup_header.xls',skiprows=2,nrows=10)
 # Stack the columns into the index, then turn the index back into columns;
 # the code below relies on these being named 'level_0' and 'level_1'.
 unstack_df=df.stack()
 unstack_df=unstack_df.reset_index()
 # Overwrite 'level_0' with hand-written unique group labels.
 # NOTE(review): the list is hard-coded to 8 entries, so this assumes the
 # stacked frame has exactly 8 rows -- confirm against the sheet.
 unstack_df['level_0']=['Rectangle1','Rectangle1','Ellipse','Ellipse','Rectangle2','Rectangle2','Rectangle3','Rectangle3']
 # Index by (group label, column label) and stack the remaining column.
 unstack_df=unstack_df.set_index(['level_0','level_1'])
 stack_series=unstack_df.stack()

 # Convert the stacked result to a one-column frame named 'value'.
 df=stack_series.to_frame()
 df.columns=['value']
 #print(df.index)
 #print(df.values)
 print(df)

Output:

                             value
  level_0     level_1    
  Rectangle1  Width     0    10
              Height    0    20
  Ellipse     a         0     1
              b         0     2
  Rectangle2  width     0    20
              height    0    30
  Rectangle3  width.1   0    40
              height.1  0    50

edited Apr 29, 2021 at 20:36

answered Apr 28, 2021 at 15:57

ListenSoftware Louise Ai Agent

4,3432 gold badges31 silver badges39 bronze badges

3 Comments

Draino Over a year ago

This does not work with multi-index headers like my example shows. ValueError: cannot specify names when specifying a multi-index header

ListenSoftware Louise Ai Agent Over a year ago

Skip over header level 0 and replace it with unique labels, then set the multi-index on the dataframe, stack the values, and convert the result to a dataframe; see above.

Draino Over a year ago

That works I guess. Will have to write some extra code to figure out the header values.

Collectives™ on Stack Overflow

Pandas read_excel with duplicate header values

2 Answers 2

Comments

3 Comments

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

2 Answers 2

Comments

3 Comments

Your Answer

Sign up or log in

Post as a guest

Related