oopt-gnpy/gnpy/tools/xls_utils.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# SPDX-License-Identifier: BSD-3-Clause
# gnpy.tools.worker_utils: Utilities for reading and writing XLS, XLSX
# Copyright (C) 2025 Telecom Infra Project and GNPy contributors
# see AUTHORS.rst for a list of contributors

"""
gnpy.tools.xls_utils
====================

This module contains utilities for reading and writing XLS, XLSX

"""
from pathlib import Path
from typing import Generator, Tuple, List, Union, Optional, Iterator, Callable
from openpyxl import load_workbook, Workbook
from openpyxl.worksheet.worksheet import Worksheet
from openpyxl.cell.cell import Cell as OpenpyxlCell
from openpyxl.cell.read_only import ReadOnlyCell as OpenpyxlReadOnlyCell
from openpyxl.utils.exceptions import InvalidFileException
from xlrd import Book, open_workbook, XL_CELL_EMPTY
from xlrd.sheet import Sheet as XlrdSheet, Cell as XlrdCell
from xlrd.biffh import XLRDError

SheetType = Union[Worksheet, XlrdSheet]
WorkbookType = Union[Workbook, Book]
CellType = Union[OpenpyxlCell, OpenpyxlReadOnlyCell, XlrdCell]
XLS_EXCEPTIONS = (InvalidFileException, KeyError, XLRDError)


def generic_open_workbook(file_path: Union[str, Path]) -> Tuple[WorkbookType, bool]:
    """Open an Excel file supporting both XLS or XLSX.

    :param file_path: Path of excel file
    :type file_path: Union[str, Path]
    :return: Tuple (workbook, is_xlsx) where is_xlsx inidcate if the file is XLSX or not
    :rtype: Tuple[WorkbookType, bool]
    """
    file_path = Path(file_path) if isinstance(file_path, str) else file_path

    if file_path.suffix.lower() in ['.xlsx', '.xlsm']:
        return load_workbook(file_path, read_only=True, data_only=True), True
    return open_workbook(file_path), False


def get_sheet(workbook: WorkbookType,
              sheet_name: str,
              is_xlsx: bool) -> SheetType:
    """Get the Excel Sheet by name

    :param workbook: Opened Excel workbook
    :type workbook: WorkbookType
    :param sheet_name: Sheet name
    :type sheet_name: SheetType
    :param is_xlsx: True if this is an XLSX workbook, False if XLS
    :type is_xlsx: bool
    :return: Excel sheet
    :rtype: SheetType
    """
    if is_xlsx:
        return workbook[sheet_name]
    return workbook.sheet_by_name(sheet_name)


def get_cell_value(sheet: SheetType, row: int, col: int, is_xlsx: bool) -> Optional[Union[str, int, float]]:
    """Get the cell value

    :param sheet: Excel sheet
    :type sheet: SheetType
    :param row: Line index (0-based)
    :type row: int
    :param col: Column index (0-based)
    :type: int
    :param is_xlsx: True if this is an XLSX workbook, False if XLS
    :type is_xlsx: bool
    :return: cell value
    :rtype: Optional[Union[str, int, float]]
    """
    if is_xlsx:
        # openpyxl uses a 1-based index
        cell = sheet.cell(row=row + 1, column=col + 1)
        return cell.value
    # xlrd uses a 0-base index
    return sheet.cell(row, col).value


def get_row(sheet: SheetType, row_index: int, is_xlsx: bool, get_rows=None) -> List[CellType]:
    """Get row in a workbook sheet.

    :param sheet: Excel sheet
    :type sheet: SheetType
    :param row_index: Line index (0-based)
    :type row_index: int
    :param is_xlsx: True si c'est un fichier XLSX, False si XLS
    :param is_xlsx: True if this is an XLSX workbook, False if XLS
    :type is_xlsx: bool
    :param get_rows: Optional function that returns preloaded rows (from fast_get_sheet_rows)
    :type get_rows: Optional[Callable]
    :return: List row cells
    :rtype: List[CellType]
    """
    if is_xlsx:
        if get_rows is not None:
            # use fast access with aclosure function
            rows = get_rows()
        else:
            rows = list(sheet.rows)
        return rows[row_index] if row_index < len(rows) else []
    return sheet.row(row_index)


def fast_get_sheet_rows(sheet: Worksheet) -> Callable:
    """Preloads all rows from an Excel sheet for fast access.

    This function loads the sheet data only once and returns a function
    that provides access to this preloaded data without having to query
    the Excel sheet on each access, which significantly improves performance,
    particularly with openpyxl.

    :param sheet: Excel worksheet (openpyxl.worksheet.worksheet.Worksheet object)
    :type sheet: Worksheet
    :return: Function that returns the preloaded rows
    :rtype: Callable[[], List[Tuple[Cell, ...]]]

    Usage example:
        > get_rows = fast_get_sheet_rows(sheet)
        > rows = get_rows()  # Access to preloaded data
        > first_row = rows[0]  # First row
    """
    # Load all sheet rows into memory only once
    # This operation can be expensive, but it's performed only once
    # load the rows only once.
    preloaded_data = list(sheet.rows)

    def get_rows():
        """Inner function (clodure function) that returns the preloaded data.

        This function doesn't reload the data on each call,
        it simply returns the reference to the already loaded data.

        :return: List of preloaded rows
        :rtype: List[Tuple[Cell, ...]]
        """
        return preloaded_data
    return get_rows


def get_row_slice(sheet: SheetType, row_index: int, start_col: int, end_col: int, is_xlsx: bool,
                  get_rows: Callable = None) -> Union[Tuple[CellType], List[CellType]]:
    """Get a row slice.

    :param sheet: Excel sheet
    :type sheet: SheetType
    :param row_index: Line index (0-based)
    :type row_index: int
    :param start_col: Index of start column (0-based)
    :type start_col: int
    :param end_col: Index of end column (0-based)
    :type end_col: int
    :param is_xlsx: True if this is an XLSX workbook, False if XLS
    :type is_xlsx: bool
    :param get_rows: Optional function that returns preloaded rows (from fast_get_sheet_rows)
    :type get_rows: Optional[Callable]
    :return: List of cells in the selected slice
    :rtype: List[CellType]
    """
    if is_xlsx:
        if get_rows is not None:
            rows = get_rows()
        else:
            rows = list(sheet.rows)
        return rows[row_index][start_col:end_col] if row_index < len(rows) else []
    return sheet.row_slice(row_index, start_col, end_col)


def convert_empty(cell_value: Optional[Union[str, int, float]]) -> Optional[Union[str, int, float]]:
    """Convert empty string into None

    :param cell_value: Cell value
    :type cell_value: Optional[Union[str, int, float]]

    >>> convert_empty('')

    >>> convert_empty('data')
    'data'

    >>> convert_empty(123)
    123
    """
    if cell_value == '':
        return None
    return cell_value


def get_num_rows(sheet: SheetType, is_xlsx: bool, get_rows: Callable = None) -> int:
    """Get the number of lines of an Excel sheet. Note that openpyxl in read_only mode can return "ghost" rows
    at the end (ReadOnlyCell cells with no actual value but formatting information even for empty rows).

    :param sheet: Excel sheet
    :type sheet: SheetType
    :param is_xlsx: True if this is an XLSX workbook, False if XLS
    :type is_xlsx: bool
    :param get_rows: Optional function that returns preloaded rows (from fast_get_sheet_rows)
    :type get_rows: Optional[Callable]
    :return: Number of lines
    :rtype: int
    """
    if is_xlsx:
        if get_rows is not None:
            return len(list(get_rows()))
        else:
            return len(list(sheet.rows))
    return sheet.nrows


def is_type_cell_empty(cell, is_xlsx: bool) -> bool:
    """Check is a cell is empty.

    :param sheet: Excel sheet
    :type sheet: SheetType
    :param row: Line index (0-based)
    :type row: int
    :param col: Column index (0-based)
    :type: int
    :param is_xlsx: True if this is an XLSX workbook, False if XLS
    :type is_xlsx: bool
    :return: True if cell is empty, else returns False
    :rtype: bool
    """
    if is_xlsx:
        return cell.value in [None, '']
    return cell.ctype == XL_CELL_EMPTY


def get_sheet_name(sheet: SheetType, is_xlsx: bool) -> str:
    """Get the name of the current sheet

    :param sheet: Excel sheet
    :type sheet: SheetType
    :param is_xlsx: True if this is an XLSX workbook, False if XLS
    :type is_xlsx: bool
    :return: Name of the sheet
    :rtype: str
    """
    if is_xlsx:
        return sheet.title
    return sheet.name


def all_rows(sh: Worksheet, is_xlsx: bool, start: int = 0, get_rows: Callable = None) -> Generator[list, None, None]:
    """Returns all rows of the xls(x) sheet starting from start row.

    :param sh: sheet: Excel sheet
    :type sheet: SheetType
    :param start: The starting row index (0-based).
    :type start: int
    :param get_rows: Optional function that returns preloaded rows (from fast_get_sheet_rows)
    :type get_rows: Optional[Callable]
    :return: A generator yielding all rows from the specified starting index.
    :rtype: Generator[list, None, None]
    """
    return (get_row(sh, x, is_xlsx, get_rows) for x in range(start, get_num_rows(sh, is_xlsx, get_rows)))


def correct_cell_int_to_str(v: Optional[Union[str, int, float]]) -> Optional[Union[str, int, float]]:
    """Ensure that int values in "id" cells are read as strings containing the int and
    do not use the automatic float conversion from xlrd or openpyxl

    :param v: cell value to convert
    :type v: Optional[Union[str, int, float]]
    :return: corrected cell value
    :rtype: Optional[Union[str, int, float]]

    >>> correct_cell_int_to_str(123)
    '123'
    >>> correct_cell_int_to_str(123.0)
    '123'
    >>> correct_cell_int_to_str('abc')
    'abc'
    >>> correct_cell_int_to_str(None)

    """
    if not isinstance(v, str) and v is not None:
        value = str(int(v))
        if value.endswith('.0'):
            value = value[:-2]
    else:
        value = v
    return value


def get_all_sheets(workbook: WorkbookType, is_xlsx: bool) -> Iterator[SheetType]:
    """Get all sheets from an Excel workbook.

    :param workbook: Opened Excel workbook
    :type workbook: WorkbookType
    :param is_xlsx: True if this is an XLSX workbook, False if XLS
    :type is_xlsx: bool
    :return: Iterator of all sheets in the workbook
    :rtype: Iterator[SheetType]
    """
    if is_xlsx:
        for sheet in workbook.worksheets:
            yield sheet
    else:
        for i in range(workbook.nsheets):
            yield workbook.sheet_by_index(i)


def get_sheet_names(workbook: WorkbookType, is_xlsx: bool) -> List[str]:
    """Get all sheet names from an Excel workbook.

    :param workbook: Opened Excel workbook
    :type workbook: WorkbookType
    :param is_xlsx: True if this is an XLSX workbook, False if XLS
    :type is_xlsx: bool
    :return: List of sheet names
    :rtype: List[str]
    """
    if is_xlsx:
        return workbook.sheetnames
    return workbook.sheet_names()