I have a loop that counts the rows in each sheet of an .xls workbook. When I open the workbook itself, the row counts do not match what Python returns.
The mismatch is caused by the first sheet, whose header is in row 3. How can I alter my code so that ONLY the first sheet is read starting at row 3, ignoring the first two lines? The rest of my sheets ALWAYS start at the top row and contain no header. I would like to count the length of my first sheet without the header included.
However, when I open the workbook in Excel and count the rows of each sheet, I get:
65522 (the header starts in row 3; I am expecting a count of 65520)
65520
65520
65520
65520
65520
65520
65520
65520
65520
65520
25427
My full code:
from io import BytesIO
from pathlib import Path
from zipfile import ZipFile
import os
import pandas as pd
from os import walk
def process_files(
    files: list,
    *,
    header_sheet: str = "Sheet1",
    skip_rows: int = 3,
) -> pd.DataFrame:
    """Count the rows of the Excel workbook stored inside each zip archive.

    Every archive must contain exactly one member whose suffix starts with
    ``.xls`` (``.xls``, ``.xlsx``, ``.xlsb``, ...). All sheets of that
    workbook are read without header inference, and their row counts are
    summed per archive. The first ``skip_rows`` rows of the sheet named
    ``header_sheet`` are excluded from the count.

    Args:
        files: Paths of the zip archives to process.
        header_sheet: Name of the sheet whose leading rows are not counted.
        skip_rows: Number of leading rows dropped from ``header_sheet``.
            The default of 3 drops rows 1-3, i.e. two leading lines plus a
            header in row 3. Pass 2 if only the two lines above the header
            should be ignored.

    Returns:
        A frame with one row per archive and the columns ``file_name`` and
        ``row_counts``.

    Raises:
        AssertionError: If an archive does not contain exactly one Excel
            file.
    """
    file_mapping = {}
    for file in files:
        # The context manager guarantees the archive handle is closed, even
        # when the single-workbook check below fails.
        with ZipFile(file) as archive:
            # Member names ending in `.xls`, `.xlsx`, `.xlsb`, ...
            excel_files_in_archive = [
                f for f in archive.namelist() if Path(f).suffix[:4] == ".xls"
            ]
            # Exactly one workbook is expected (otherwise, loop or choose
            # one somehow).
            assert len(excel_files_in_archive) == 1, (
                f"expected exactly one Excel file in {file!r}, "
                f"found {len(excel_files_in_archive)}"
            )
            workbook_bytes = archive.read(excel_files_in_archive[0])

        # sheet_name=None returns a {sheet name: DataFrame} mapping;
        # header=None keeps every row as data so nothing is silently
        # consumed as a header.
        data_mapping = pd.read_excel(
            BytesIO(workbook_bytes),
            sheet_name=None,
            header=None,
        )

        total_rows = 0
        for sheet, sheet_frame in data_mapping.items():
            if sheet == header_sheet:
                df = sheet_frame[skip_rows:]
            else:
                df = sheet_frame
            total_rows += len(df)
            # Report the length that was actually counted, not the length
            # of the unsliced sheet.
            print(len(df))
        file_mapping[file] = total_rows

    frame = pd.DataFrame([file_mapping]).transpose().reset_index()
    frame.columns = ["file_name", "row_counts"]
    return frame
# Root folder that is searched recursively for zip archives.
dir_path = r'D:\test\2022 - 10'

# Collect the full path of every `.zip` file found below dir_path.
zip_files = []
for current_dir, _subdirs, names in os.walk(dir_path):
    zip_files.extend(
        os.path.join(current_dir, name)
        for name in names
        if name.endswith('.zip')
    )

# One row per archive with its summed row count.
df = process_files(zip_files)
Does anyone have an idea what I'm doing wrong?