Files
oopt-gnpy/gnpy/tools/xls_utils.py
EstherLerouzic 0bc1fb3bf8 fix: Use openpyxl for xlsx reading and move to latest xlrd version
Create a set of excel utils to be used for .xls and .xlsx files, for
reading workbook, reading sheets, ... optimize openpyxl access to
sheet to save computation time.

Use this opportunity to refactor service sheet without namedtuple
and simplify code

Signed-off-by: EstherLerouzic <esther.lerouzic@orange.com>
Change-Id: Ibaf3aac40b3f6ca4829d8ea8cd506523d318103a
2025-09-26 11:17:45 +02:00

325 lines
11 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# SPDX-License-Identifier: BSD-3-Clause
# gnpy.tools.worker_utils: Utilities for reading and writing XLS, XLSX
# Copyright (C) 2025 Telecom Infra Project and GNPy contributors
# see AUTHORS.rst for a list of contributors
"""
gnpy.tools.xls_utils
====================
This module contains utilities for reading and writing XLS, XLSX
"""
from pathlib import Path
from typing import Generator, Tuple, List, Union, Optional, Iterator, Callable
from openpyxl import load_workbook, Workbook
from openpyxl.worksheet.worksheet import Worksheet
from openpyxl.cell.cell import Cell as OpenpyxlCell
from openpyxl.cell.read_only import ReadOnlyCell as OpenpyxlReadOnlyCell
from openpyxl.utils.exceptions import InvalidFileException
from xlrd import Book, open_workbook, XL_CELL_EMPTY
from xlrd.sheet import Sheet as XlrdSheet, Cell as XlrdCell
from xlrd.biffh import XLRDError
SheetType = Union[Worksheet, XlrdSheet]
WorkbookType = Union[Workbook, Book]
CellType = Union[OpenpyxlCell, OpenpyxlReadOnlyCell, XlrdCell]
XLS_EXCEPTIONS = (InvalidFileException, KeyError, XLRDError)
def generic_open_workbook(file_path: Union[str, Path]) -> Tuple[WorkbookType, bool]:
"""Open an Excel file supporting both XLS or XLSX.
:param file_path: Path of excel file
:type file_path: Union[str, Path]
:return: Tuple (workbook, is_xlsx) where is_xlsx inidcate if the file is XLSX or not
:rtype: Tuple[WorkbookType, bool]
"""
file_path = Path(file_path) if isinstance(file_path, str) else file_path
if file_path.suffix.lower() in ['.xlsx', '.xlsm']:
return load_workbook(file_path, read_only=True, data_only=True), True
return open_workbook(file_path), False
def get_sheet(workbook: WorkbookType,
sheet_name: str,
is_xlsx: bool) -> SheetType:
"""Get the Excel Sheet by name
:param workbook: Opened Excel workbook
:type workbook: WorkbookType
:param sheet_name: Sheet name
:type sheet_name: SheetType
:param is_xlsx: True if this is an XLSX workbook, False if XLS
:type is_xlsx: bool
:return: Excel sheet
:rtype: SheetType
"""
if is_xlsx:
return workbook[sheet_name]
return workbook.sheet_by_name(sheet_name)
def get_cell_value(sheet: SheetType, row: int, col: int, is_xlsx: bool) -> Optional[Union[str, int, float]]:
"""Get the cell value
:param sheet: Excel sheet
:type sheet: SheetType
:param row: Line index (0-based)
:type row: int
:param col: Column index (0-based)
:type: int
:param is_xlsx: True if this is an XLSX workbook, False if XLS
:type is_xlsx: bool
:return: cell value
:rtype: Optional[Union[str, int, float]]
"""
if is_xlsx:
# openpyxl uses a 1-based index
cell = sheet.cell(row=row + 1, column=col + 1)
return cell.value
# xlrd uses a 0-base index
return sheet.cell(row, col).value
def get_row(sheet: SheetType, row_index: int, is_xlsx: bool, get_rows=None) -> List[CellType]:
"""Get row in a workbook sheet.
:param sheet: Excel sheet
:type sheet: SheetType
:param row_index: Line index (0-based)
:type row_index: int
:param is_xlsx: True si c'est un fichier XLSX, False si XLS
:param is_xlsx: True if this is an XLSX workbook, False if XLS
:type is_xlsx: bool
:param get_rows: Optional function that returns preloaded rows (from fast_get_sheet_rows)
:type get_rows: Optional[Callable]
:return: List row cells
:rtype: List[CellType]
"""
if is_xlsx:
if get_rows is not None:
# use fast access with aclosure function
rows = get_rows()
else:
rows = list(sheet.rows)
return rows[row_index] if row_index < len(rows) else []
return sheet.row(row_index)
def fast_get_sheet_rows(sheet: Worksheet) -> Callable:
"""Preloads all rows from an Excel sheet for fast access.
This function loads the sheet data only once and returns a function
that provides access to this preloaded data without having to query
the Excel sheet on each access, which significantly improves performance,
particularly with openpyxl.
:param sheet: Excel worksheet (openpyxl.worksheet.worksheet.Worksheet object)
:type sheet: Worksheet
:return: Function that returns the preloaded rows
:rtype: Callable[[], List[Tuple[Cell, ...]]]
Usage example:
> get_rows = fast_get_sheet_rows(sheet)
> rows = get_rows() # Access to preloaded data
> first_row = rows[0] # First row
"""
# Load all sheet rows into memory only once
# This operation can be expensive, but it's performed only once
# load the rows only once.
preloaded_data = list(sheet.rows)
def get_rows():
"""Inner function (clodure function) that returns the preloaded data.
This function doesn't reload the data on each call,
it simply returns the reference to the already loaded data.
:return: List of preloaded rows
:rtype: List[Tuple[Cell, ...]]
"""
return preloaded_data
return get_rows
def get_row_slice(sheet: SheetType, row_index: int, start_col: int, end_col: int, is_xlsx: bool,
get_rows: Callable = None) -> Union[Tuple[CellType], List[CellType]]:
"""Get a row slice.
:param sheet: Excel sheet
:type sheet: SheetType
:param row_index: Line index (0-based)
:type row_index: int
:param start_col: Index of start column (0-based)
:type start_col: int
:param end_col: Index of end column (0-based)
:type end_col: int
:param is_xlsx: True if this is an XLSX workbook, False if XLS
:type is_xlsx: bool
:param get_rows: Optional function that returns preloaded rows (from fast_get_sheet_rows)
:type get_rows: Optional[Callable]
:return: List of cells in the selected slice
:rtype: List[CellType]
"""
if is_xlsx:
if get_rows is not None:
rows = get_rows()
else:
rows = list(sheet.rows)
return rows[row_index][start_col:end_col] if row_index < len(rows) else []
return sheet.row_slice(row_index, start_col, end_col)
def convert_empty(cell_value: Optional[Union[str, int, float]]) -> Optional[Union[str, int, float]]:
"""Convert empty string into None
:param cell_value: Cell value
:type cell_value: Optional[Union[str, int, float]]
>>> convert_empty('')
>>> convert_empty('data')
'data'
>>> convert_empty(123)
123
"""
if cell_value == '':
return None
return cell_value
def get_num_rows(sheet: SheetType, is_xlsx: bool, get_rows: Callable = None) -> int:
"""Get the number of lines of an Excel sheet. Note that openpyxl in read_only mode can return "ghost" rows
at the end (ReadOnlyCell cells with no actual value but formatting information even for empty rows).
:param sheet: Excel sheet
:type sheet: SheetType
:param is_xlsx: True if this is an XLSX workbook, False if XLS
:type is_xlsx: bool
:param get_rows: Optional function that returns preloaded rows (from fast_get_sheet_rows)
:type get_rows: Optional[Callable]
:return: Number of lines
:rtype: int
"""
if is_xlsx:
if get_rows is not None:
return len(list(get_rows()))
else:
return len(list(sheet.rows))
return sheet.nrows
def is_type_cell_empty(cell, is_xlsx: bool) -> bool:
"""Check is a cell is empty.
:param sheet: Excel sheet
:type sheet: SheetType
:param row: Line index (0-based)
:type row: int
:param col: Column index (0-based)
:type: int
:param is_xlsx: True if this is an XLSX workbook, False if XLS
:type is_xlsx: bool
:return: True if cell is empty, else returns False
:rtype: bool
"""
if is_xlsx:
return cell.value in [None, '']
return cell.ctype == XL_CELL_EMPTY
def get_sheet_name(sheet: SheetType, is_xlsx: bool) -> str:
"""Get the name of the current sheet
:param sheet: Excel sheet
:type sheet: SheetType
:param is_xlsx: True if this is an XLSX workbook, False if XLS
:type is_xlsx: bool
:return: Name of the sheet
:rtype: str
"""
if is_xlsx:
return sheet.title
return sheet.name
def all_rows(sh: Worksheet, is_xlsx: bool, start: int = 0, get_rows: Callable = None) -> Generator[list, None, None]:
"""Returns all rows of the xls(x) sheet starting from start row.
:param sh: sheet: Excel sheet
:type sheet: SheetType
:param start: The starting row index (0-based).
:type start: int
:param get_rows: Optional function that returns preloaded rows (from fast_get_sheet_rows)
:type get_rows: Optional[Callable]
:return: A generator yielding all rows from the specified starting index.
:rtype: Generator[list, None, None]
"""
return (get_row(sh, x, is_xlsx, get_rows) for x in range(start, get_num_rows(sh, is_xlsx, get_rows)))
def correct_cell_int_to_str(v: Optional[Union[str, int, float]]) -> Optional[Union[str, int, float]]:
"""Ensure that int values in "id" cells are read as strings containing the int and
do not use the automatic float conversion from xlrd or openpyxl
:param v: cell value to convert
:type v: Optional[Union[str, int, float]]
:return: corrected cell value
:rtype: Optional[Union[str, int, float]]
>>> correct_cell_int_to_str(123)
'123'
>>> correct_cell_int_to_str(123.0)
'123'
>>> correct_cell_int_to_str('abc')
'abc'
>>> correct_cell_int_to_str(None)
"""
if not isinstance(v, str) and v is not None:
value = str(int(v))
if value.endswith('.0'):
value = value[:-2]
else:
value = v
return value
def get_all_sheets(workbook: WorkbookType, is_xlsx: bool) -> Iterator[SheetType]:
"""Get all sheets from an Excel workbook.
:param workbook: Opened Excel workbook
:type workbook: WorkbookType
:param is_xlsx: True if this is an XLSX workbook, False if XLS
:type is_xlsx: bool
:return: Iterator of all sheets in the workbook
:rtype: Iterator[SheetType]
"""
if is_xlsx:
for sheet in workbook.worksheets:
yield sheet
else:
for i in range(workbook.nsheets):
yield workbook.sheet_by_index(i)
def get_sheet_names(workbook: WorkbookType, is_xlsx: bool) -> List[str]:
"""Get all sheet names from an Excel workbook.
:param workbook: Opened Excel workbook
:type workbook: WorkbookType
:param is_xlsx: True if this is an XLSX workbook, False if XLS
:type is_xlsx: bool
:return: List of sheet names
:rtype: List[str]
"""
if is_xlsx:
return workbook.sheetnames
return workbook.sheet_names()