From 02332facc0830e44b84da53289a6c7f7e515ea58 Mon Sep 17 00:00:00 2001
From: BB0813
Date: Sun, 21 Jun 2026 19:32:23 +0800
Subject: [PATCH] fix: preserve currency formatting in XLSX conversion (#53)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When Excel cells use currency number formats (e.g. $#,##0.00 or €#,##0.00),
pandas only reads the raw numeric value and discards the currency symbol.
This caused output like '5' instead of '$5'.

After reading the DataFrame with pandas, we now use openpyxl to inspect each
cell's number_format. Numeric cells with currency formats get their values
replaced with formatted strings so the currency symbol is preserved in the
Markdown output. Only columns that actually contain a currency cell are
converted to object dtype; all other columns keep their numeric dtype.

Supports:
- Standard formats: $#,##0.00, €#,##0.00, £#,##0.00, etc.
- Accounting formats: _($* #,##0.00_)
- Locale-tagged formats: [$€-407] #,##0.00 (the "$" in [$-409] or [$EUR ]
  is tag syntax and is not reported as a dollar sign)
- Decimal precision matching the format string (e.g. $5.00, €199)
- Negative values keep their own symbol: -$50.50, -€50.50
- 14 common currency symbols

Fixes microsoft/markitdown#53
---
 .../markitdown/converters/_xlsx_converter.py  | 128 ++++++++++++++++++
 1 file changed, 128 insertions(+)

diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
index 4186ec773..a7d114102 100644
--- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
@@ -1,3 +1,4 @@
+import re
 import sys
 from typing import BinaryIO, Any
 from ._html_converter import HtmlConverter
@@ -5,6 +6,30 @@
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
 from .._stream_info import StreamInfo
 
+# Common currency symbols found in Excel number formats
+_CURRENCY_SYMBOLS = ["$", "€", "£", "¥", "₹", "₩", "₽", "₺", "₫", "₱", "฿", "₡", "₦", "₴"]
+
+# Excel's locale-tagged currency syntax: ``[$€-407]`` is "€ in locale 0x407",
+# ``[$-409]`` is a locale with no currency. Group 1 is the (possibly empty)
+# symbol text; the leading ``$`` is only a marker, not a dollar sign.
+_LOCALE_CURRENCY_TAG_RE = re.compile(r"\[\$([^\]-]*)(?:-[^\]]*)?\]")
+
+# Regex to extract the decimal precision from a number format string
+# Matches patterns like #,##0.00 -> 2 decimal places, #,##0 -> 0 decimals
+_DECIMAL_PRECISION_RE = re.compile(r"\.(\#*0+)")
+
+
+def _get_decimal_precision(number_format: str) -> int:
+    """Return the number of decimal places implied by an Excel number format.
+
+    Only the first decimal section found in the format string is used.
+    ``$#,##0.00`` → 2, ``€#,##0`` → 0, ``#,##0.000`` → 3
+    """
+    m = _DECIMAL_PRECISION_RE.search(number_format)
+    if m:
+        return len(m.group(1))  # digit placeholders ('#'/'0') after the '.'
+    return 0
+
 # Try loading optional (but in this case, required) dependencies
 # Save reporting of any exceptions for later
 _xlsx_dependency_exc_info = None
@@ -33,6 +58,105 @@ ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"]
 
 
+def _extract_currency_symbol(number_format: str) -> str | None:
+    """Extract a currency symbol from an Excel number format string.
+
+    Handles standard formats like ``$#,##0.00``, accounting formats like
+    ``_($* #,##0.00_)`` and locale-tagged formats like ``[$€-407] #,##0.00``.
+    Returns the symbol character when found, otherwise ``None``.
+    """
+    if not number_format:
+        return None
+
+    # A locale tag names the currency explicitly, so it takes priority
+    for tag in _LOCALE_CURRENCY_TAG_RE.finditer(number_format):
+        if tag.group(1) in _CURRENCY_SYMBOLS:
+            return tag.group(1)
+
+    # Strip the tags before looking for literal symbols: the ``$`` in
+    # ``[$-409]`` or ``[$EUR ]`` is tag syntax, not a dollar sign
+    untagged = _LOCALE_CURRENCY_TAG_RE.sub("", number_format)
+    for sym in _CURRENCY_SYMBOLS:
+        if sym in untagged:
+            return sym
+
+    return None
+
+
+def _format_currency_value(value: int | float, symbol: str, precision: int) -> str:
+    """Format a numeric value as a currency string.
+
+    Negative values are rendered as ``-<symbol><amount>`` (e.g. ``-€50.50``).
+    Uses thousands separators and the specified number of decimal places.
+    """
+    sign = "-" if value < 0 else ""
+    return f"{sign}{symbol}{abs(value):,.{precision}f}"
+
+
+def _apply_currency_formats(file_stream: BinaryIO, sheets: dict) -> dict:
+    """Return *sheets* with currency-formatted cells replaced by display strings.
+
+    Uses openpyxl to inspect each cell's ``number_format``. When a numeric
+    cell carries a currency format, its value in the DataFrame is replaced
+    with a formatted string (e.g. ``$1,199.00``) so that the currency symbol
+    is preserved in the Markdown output (fixes microsoft/markitdown#53).
+
+    The DataFrames in *sheets* are modified in place, and *file_stream* is
+    rewound to the start before openpyxl reads it.
+    """
+    import openpyxl
+
+    # Reset stream so openpyxl can read from the beginning
+    file_stream.seek(0)
+    wb = openpyxl.load_workbook(file_stream, data_only=True, read_only=True)
+
+    try:
+        for sheet_name, df in sheets.items():
+            if sheet_name not in wb.sheetnames:
+                continue
+            ws = wb[sheet_name]
+            # Positions of columns already switched to object dtype
+            object_cols = set()
+            for row in ws.iter_rows(
+                min_row=1, max_row=ws.max_row, max_col=ws.max_column
+            ):
+                for cell in row:
+                    cell_fmt = getattr(cell, "number_format", None) or ""
+                    currency = _extract_currency_symbol(cell_fmt)
+                    if currency is None:
+                        continue
+                    value = cell.value
+                    # bool is an int subclass; TRUE/FALSE cells are not amounts
+                    if isinstance(value, bool) or not isinstance(value, (int, float)):
+                        continue
+                    cell_row = cell.row
+                    cell_col = cell.column
+                    if cell_row is None or cell_col is None:
+                        continue
+                    # openpyxl coordinates are 1-based, and pandas consumed sheet
+                    # row 1 as the header (``read_excel`` default ``header=0``).
+                    # NOTE(review): assumes data starts at A1 so both libraries
+                    # agree on row/column positions - confirm for padded sheets.
+                    df_row = cell_row - 2
+                    df_col = cell_col - 1
+                    if not (0 <= df_row < len(df) and 0 <= df_col < len(df.columns)):
+                        continue
+                    # Convert only the touched column to object dtype so it can
+                    # hold strings; untouched columns keep their numeric dtype.
+                    if df_col not in object_cols:
+                        col_name = df.columns[df_col]
+                        df[col_name] = df[col_name].astype(object)
+                        object_cols.add(df_col)
+                    df.iat[df_row, df_col] = _format_currency_value(
+                        value, currency, _get_decimal_precision(cell_fmt)
+                    )
+    finally:
+        # Close the workbook even if processing a sheet raises
+        wb.close()
+
+    return sheets
+
+
 class XlsxConverter(DocumentConverter):
     """
     Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
     """
@@ -81,6 +205,10 @@ def convert(
             )
 
         sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
+        # Preserve currency formats that pandas strips (fixes #53);
+        # the helper rewinds file_stream itself before re-reading it.
+        sheets = _apply_currency_formats(file_stream, sheets)
+
         md_content = ""
         for s in sheets:
             md_content += f"## {s}\n"