Skip to content
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
8020297
First test xlsx handler
mattiagiupponi Jan 20, 2026
3dd170c
extending xlsx_handler to cover the error handling by using calamine
Gpetrak Feb 2, 2026
790c469
improving error handling and adding the dep of XLSX handler
Gpetrak Feb 2, 2026
8dc5f73
Merge branch 'master' into xlsx_handler
Gpetrak Feb 3, 2026
d39e05b
simplifying the ogr-based method in order to not handle the WKT geomet…
Gpetrak Feb 3, 2026
ecc8f30
implementing a data validation check
Gpetrak Feb 3, 2026
5557898
make the XLSX handler configurable
Gpetrak Feb 3, 2026
3459516
adding xls extension
Gpetrak Feb 3, 2026
4084871
extend the handler for xls files
Gpetrak Feb 3, 2026
e105ca7
adding tests
Gpetrak Feb 4, 2026
1bbf2ee
adding a security-check test
Gpetrak Feb 5, 2026
a63ffea
fixing flake issues
Gpetrak Feb 5, 2026
e298866
black re-format
Gpetrak Feb 5, 2026
4fa8944
use InvalidInputFileException in more simple exceptions
Gpetrak Feb 5, 2026
af1d9ee
avoid exposing a row exception to the user
Gpetrak Feb 5, 2026
083addf
simplifying is_valid method
Gpetrak Feb 5, 2026
cbede0b
fixing security in the create_ogr2ogr_command method as gemini review…
Gpetrak Feb 5, 2026
bdbd200
fixing flake issues
Gpetrak Feb 5, 2026
1937fd9
wrapping the xlsx enable check in a separate method
Gpetrak Feb 10, 2026
38a4102
moving the XLSX conf variable in the settings and define it as False
Gpetrak Feb 11, 2026
db358f4
fixing flake issues
Gpetrak Feb 12, 2026
3c55cdb
Merge branch 'master' into ISSUE_13936
Gpetrak Feb 17, 2026
7f6c030
remove the shlex-based modification
Gpetrak Feb 17, 2026
eb113b3
Merge branch 'master' into ISSUE_13936
Gpetrak Mar 20, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .env.sample
Original file line number Diff line number Diff line change
Expand Up @@ -245,4 +245,7 @@ RESTART_POLICY_WINDOW=120s

DEFAULT_MAX_PARALLEL_UPLOADS_PER_USER=5

# FORCE_READ_ONLY_MODE=False Override the read-only value saved in the configuration
# FORCE_READ_ONLY_MODE=False Override the read-only value saved in the configuration

# Enable or disable the XLSX / XLS upload
XLSX_UPLOAD_ENABLED=True
5 changes: 4 additions & 1 deletion .env_dev
Original file line number Diff line number Diff line change
Expand Up @@ -207,4 +207,7 @@ RESTART_POLICY_WINDOW=120s

DEFAULT_MAX_PARALLEL_UPLOADS_PER_USER=5
UPSERT_CHUNK_SIZE= 100
UPSERT_LIMIT_ERROR_LOG=100
UPSERT_LIMIT_ERROR_LOG=100

# Enable or disable the XLSX / XLS upload
XLSX_UPLOAD_ENABLED=True
3 changes: 3 additions & 0 deletions .env_local
Original file line number Diff line number Diff line change
Expand Up @@ -209,3 +209,6 @@ RESTART_POLICY_MAX_ATTEMPTS="3"
RESTART_POLICY_WINDOW=120s

DEFAULT_MAX_PARALLEL_UPLOADS_PER_USER=5

# Enable or disable the XLSX / XLS upload
XLSX_UPLOAD_ENABLED=True
3 changes: 3 additions & 0 deletions .env_test
Original file line number Diff line number Diff line change
Expand Up @@ -224,3 +224,6 @@ MICROSOFT_TENANT_ID=
AZURE_CLIENT_ID=
AZURE_SECRET_KEY=
AZURE_KEY=

# Enable or disable the XLSX / XLS upload
XLSX_UPLOAD_ENABLED=True
Empty file.
361 changes: 361 additions & 0 deletions geonode/upload/handlers/xlsx/handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,361 @@
#########################################################################
#
# Copyright (C) 2024 OSGeo
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
#########################################################################
import logging
import os
from distutils.util import strtobool
from pathlib import Path
import csv
from datetime import datetime
import math
from celery import group
from python_calamine import CalamineWorkbook
from osgeo import ogr

from dynamic_models.models import ModelSchema

from geonode.upload.handlers.common.vector import BaseVectorFileHandler
from geonode.upload.handlers.csv.handler import CSVFileHandler
from geonode.upload.celery_tasks import create_dynamic_structure
from geonode.upload.handlers.utils import GEOM_TYPE_MAPPING
from geonode.upload.api.exceptions import (
UploadParallelismLimitException,
InvalidInputFileException,
)

logger = logging.getLogger("importer")


class XLSXFileHandler(CSVFileHandler):
    """Upload handler for Excel spreadsheets (.xlsx / .xls).

    Converts the first sheet to CSV during pre-processing and then delegates
    to the CSV pipeline inherited from :class:`CSVFileHandler`.
    """

    # Feature flag, read once at import time. distutils.util.strtobool is
    # deprecated (PEP 632) and removed in Python 3.12, so parse the common
    # truthy spellings directly; anything unrecognized counts as disabled.
    XLSX_UPLOAD_ENABLED = os.getenv("XLSX_UPLOAD_ENABLED", "False").strip().lower() in (
        "y",
        "yes",
        "t",
        "true",
        "on",
        "1",
    )

    # Accepted latitude/longitude column names, shared with the CSV handler.
    lat_names = CSVFileHandler.possible_lat_column
    lon_names = CSVFileHandler.possible_long_column

@property
def supported_file_extension_config(self):
    """Return the UI upload-format configuration for Excel files.

    Returns None when the XLSX feature flag is off, so the front-end does
    not advertise Excel as an uploadable format.
    """
    # Hide the Excel entry entirely when the feature is disabled.
    if not XLSXFileHandler.XLSX_UPLOAD_ENABLED:
        return None

    formats = [
        {
            "label": "Excel (OpenXML)",
            "required_ext": ["xlsx"],
            "optional_ext": ["sld", "xml"],
        },
        {
            "label": "Excel (Binary/Legacy)",
            "required_ext": ["xls"],
            "optional_ext": ["sld", "xml"],
        },
    ]
    return {
        "id": "excel",  # generic id: does not imply a single extension
        "formats": formats,
        "actions": list(self.TASKS.keys()),
        "type": "vector",
    }

@staticmethod
def can_handle(_data) -> bool:
    """Return True when this handler can process the given upload payload.

    Requires the XLSX feature flag to be on and the base file to carry an
    Excel extension; final say is delegated to the base vector handler.
    """
    # Back-end availability check.
    if not XLSXFileHandler.XLSX_UPLOAD_ENABLED:
        return False

    base = _data.get("base_file")
    if not base:
        return False

    # base may be a plain path string or a file-like object with .name
    filename = base if isinstance(base, str) else base.name
    if not filename.lower().endswith((".xlsx", ".xls")):
        return False

    return BaseVectorFileHandler.can_handle(_data)

@staticmethod
def is_valid(files, user, **kwargs):
    """Validate the converted CSV produced from the uploaded spreadsheet.

    Checks, in order: base vector validity, the per-user parallel-upload
    limit, that OGR can open the converted file and expose a layer, and
    that the header schema contains both a latitude and a longitude column.

    Raises:
        InvalidInputFileException: on any structural/schema problem.
        UploadParallelismLimitException: when the upload limit is exceeded.

    Returns:
        True when every check passes.
    """
    from geonode.upload.utils import UploadLimitValidator

    BaseVectorFileHandler.is_valid(files, user)

    upload_validator = UploadLimitValidator(user)
    upload_validator.validate_parallelism_limit_per_user()
    actual_upload = upload_validator._get_parallel_uploads_count()
    max_upload = upload_validator._get_max_parallel_uploads()

    datasource = ogr.GetDriverByName("CSV").Open(files.get("base_file"))
    if not datasource:
        raise InvalidInputFileException("The converted XLSX data is invalid; no layers found.")

    try:
        # The XLSX handler always expects exactly one layer (the first sheet).
        layer = datasource.GetLayer(0)
        if not layer:
            raise InvalidInputFileException("No data found in the converted CSV.")

        if 1 + actual_upload > max_upload:
            raise UploadParallelismLimitException(
                detail=f"Upload limit exceeded. Max allowed parallel uploads: {max_upload}"
            )

        schema_keys = [x.name.lower() for x in layer.schema]

        has_lat = any(x in XLSXFileHandler.lat_names for x in schema_keys)
        has_long = any(x in XLSXFileHandler.lon_names for x in schema_keys)

        if has_lat and not has_long:
            raise InvalidInputFileException(
                f"Longitude is missing. Supported names: {', '.join(XLSXFileHandler.lon_names)}"
            )

        if not has_lat and has_long:
            raise InvalidInputFileException(
                f"Latitude is missing. Supported names: {', '.join(XLSXFileHandler.lat_names)}"
            )

        # At this point either both columns are present or both are missing.
        if not (has_lat and has_long):
            raise InvalidInputFileException(
                "XLSX uploads require both a Latitude and a Longitude column in the first row. "
                f"Accepted Lat: {', '.join(XLSXFileHandler.lat_names)}. "
                f"Accepted Lon: {', '.join(XLSXFileHandler.lon_names)}."
            )
    finally:
        # Drop the last reference so GDAL releases the file handle promptly
        # (OGR datasources are closed on dereference in Python bindings).
        datasource = None

    return True

@staticmethod
def create_ogr2ogr_command(files, original_name, ovverwrite_layer, alternate, **kwargs):
    """Build the ogr2ogr import command for spreadsheet-derived CSV data.

    Only X/Y (Point) coordinate columns are mapped; WKT/geometry columns
    are deliberately ignored, per this handler's requirements.
    """
    base_command = BaseVectorFileHandler.create_ogr2ogr_command(files, original_name, ovverwrite_layer, alternate)

    # Candidate coordinate column names handed to the GDAL CSV driver.
    x_candidates = ",".join(XLSXFileHandler.lon_names)
    y_candidates = ",".join(XLSXFileHandler.lat_names)
    geometry_column = BaseVectorFileHandler().default_geometry_column_name

    return (
        f"{base_command} -oo KEEP_GEOM_COLUMNS=NO "
        f"-lco GEOMETRY_NAME={geometry_column} "
        f' -oo "X_POSSIBLE_NAMES={x_candidates}" '
        f' -oo "Y_POSSIBLE_NAMES={y_candidates}"'
    )
Comment thread
Gpetrak marked this conversation as resolved.

def create_dynamic_model_fields(
    self,
    layer: str,
    dynamic_model_schema: ModelSchema = None,
    overwrite: bool = None,
    execution_id: str = None,
    layer_name: str = None,
    return_celery_group: bool = True,
):
    """Translate the OGR layer schema into dynamic-model field definitions.

    Returns the raw schema list when ``return_celery_group`` is False,
    otherwise the schema plus a celery group that creates the dynamic
    structure in chunks.
    """
    # Map each OGR field to a Django-compatible field description.
    layer_schema = [
        {"name": field.name.lower(), "class_name": self._get_type(field), "null": True}
        for field in layer.schema
    ]

    # Append the geometry column, promoted to the Multi variant of Point.
    geometry_class = GEOM_TYPE_MAPPING.get(self.promote_to_multi("Point"))
    # OGR reports e.g. 'Point' or 'Point 25D' / '3D Point' for Z geometries.
    type_label = ogr.GeometryTypeToName(layer.GetGeomType()).lower()
    layer_schema.append(
        {
            "name": layer.GetGeometryColumn() or self.default_geometry_column_name,
            "class_name": geometry_class,
            "dim": 3 if type_label.startswith("3d") or "z" in type_label else 2,
        }
    )

    if not return_celery_group:
        return layer_schema

    # Fan out the schema creation in chunks of 30 fields per celery task.
    chunks = [layer_schema[i : i + 30] for i in range(0, len(layer_schema), 30)]
    celery_group = group(
        create_dynamic_structure.s(execution_id, chunk, dynamic_model_schema.id, overwrite, layer_name)
        for chunk in chunks
    )

    return dynamic_model_schema, celery_group

def pre_processing(self, files, execution_id, **kwargs):
    """Convert the uploaded XLSX/XLS file to CSV before the CSV pipeline runs.

    Reads only the first sheet (via calamine), strictly validates the header
    row, streams the data rows to a sibling ``.csv`` file, and updates the
    execution request so downstream tasks operate on the converted file.

    Raises:
        InvalidInputFileException: when the file is missing, empty, or
            cannot be parsed/validated.
    """
    from geonode.upload.orchestrator import orchestrator

    # calling the super function (CSVFileHandler logic)
    _data, execution_id = super().pre_processing(files, execution_id, **kwargs)

    # convert the XLSX file into a CSV
    xlsx_file = _data.get("files", {}).get("base_file", "")
    if not xlsx_file:
        raise InvalidInputFileException(detail="File not found")

    output_file = str(Path(xlsx_file).with_suffix(".csv"))

    try:
        workbook = CalamineWorkbook.from_path(xlsx_file)

        # Sheet validation (only the first sheet is imported).
        sheet_name = self._validate_sheets(workbook)
        sheet = workbook.get_sheet_by_name(sheet_name)

        rows_gen = iter(sheet.to_python())
        try:
            # Strictly take the first row as headers. No skipping allowed.
            headers = next(rows_gen)
        except StopIteration:
            raise InvalidInputFileException(detail="The file is empty.")

        # Restrictive file-structure validation.
        self._validate_headers(headers)

        # Conversion with row cleanup; rows_gen continues after the headers.
        self._convert_to_csv(headers, rows_gen, output_file)

    except InvalidInputFileException:
        # Already a safe, user-facing message — propagate untouched instead
        # of re-wrapping (re-wrapping would hide the specific detail).
        raise
    except Exception as e:
        logger.exception("XLSX Pre-processing failed")
        # Do not leak internal error details to the client (CodeQL:
        # information exposure through an exception); the stack trace is
        # already recorded server-side above.
        raise InvalidInputFileException(
            detail="Failed to securely parse the Excel file. Please verify the file format and contents."
        ) from e

    # update the file path in the payload
    _data["files"]["base_file"] = output_file

    if not isinstance(_data.get("temporary_files"), dict):
        _data["temporary_files"] = {}

    _data["temporary_files"]["base_file"] = output_file

    # updating the execution id params
    orchestrator.update_execution_request_obj(
        orchestrator.get_execution_object(execution_id), {"input_params": _data}
    )
    return _data, execution_id

def _validate_sheets(self, workbook):
    """Return the first sheet name, warning when extra sheets are ignored.

    Raises:
        InvalidInputFileException: when the workbook contains no sheets.
    """
    sheets = workbook.sheet_names
    if not sheets:
        # Consistent with the other user-facing validation errors here.
        raise InvalidInputFileException(detail="No sheets found in workbook.")
    if len(sheets) > 1:
        # Only the first sheet is imported; make that visible in the logs.
        logger.warning("Multiple sheets found. Ignoring: %s", sheets[1:])
    return sheets[0]

def _validate_headers(self, headers):
    """Strictly validate Row 1 as the header row.

    Rules:
    - must not be empty,
    - must contain geometry 'fingerprints' (Lat/Lon column names),
    - every column must have a unique, non-empty name.

    Raises:
        InvalidInputFileException: on any violation.

    Returns:
        True when the header row passes all checks.
    """
    # Existence check
    if not headers or self._detect_empty_rows(headers):
        raise InvalidInputFileException(detail="No data or headers found in the selected sheet.")

    # Normalization
    clean_headers = [str(h).strip().lower() if h is not None else "" for h in headers]

    # Geometry fingerprint check
    has_lat = any(h in self.lat_names for h in clean_headers)
    has_lon = any(h in self.lon_names for h in clean_headers)

    if not (has_lat and has_lon):
        raise InvalidInputFileException(
            detail="The headers do not contain valid geometry headers. "
            "GeoNode requires Latitude and Longitude labels in the first row."
        )

    # Integrity check (no empty names)
    if any(h == "" for h in clean_headers):
        raise InvalidInputFileException(detail="One or more columns in the first row are missing a header name.")

    # Uniqueness check — single O(n) pass (the count()-based comprehension
    # was O(n^2)); sorting keeps the error message deterministic.
    if len(clean_headers) != len(set(clean_headers)):
        seen, duplicates = set(), set()
        for h in clean_headers:
            if h in seen:
                duplicates.add(h)
            seen.add(h)
        raise InvalidInputFileException(detail=f"Duplicate headers found in Row 1: {', '.join(sorted(duplicates))}")

    return True

def _data_sense_check(self, x, y):
"""
High-speed coordinate validation for large datasets
"""
try:
# Catch Excel Date objects immediately (Calamine returns these as datetime)
if isinstance(x, datetime) or isinstance(y, datetime):
return False

f_x = float(x)
f_y = float(y)

# Finiteness check (Catches NaN, Inf, and None)
# This is extremely fast in Python
if not (math.isfinite(f_x) and math.isfinite(f_y)):
return False

# Magnitude check
# Limits to +/- 40 million (covers all CRS including Web Mercator)
# but blocks 'serial date numbers' or corrupted scientific notation
if not (-40000000 < f_x < 40000000 and -40000000 < f_y < 40000000):
return False

return True
except (ValueError, TypeError):
return False

def _detect_empty_rows(self, row):
return not row or all(cell is None or str(cell).strip() == "" for cell in row)

def _convert_to_csv(self, headers, rows_gen, output_path):
    """Stream rows to CSV, skipping blank rows and validating coordinates.

    Raises:
        InvalidInputFileException: when a row carries an unusable (or
            missing) latitude/longitude value.
    """
    # Define clean_headers once here to find the coordinate indices.
    clean_headers = [str(h).strip().lower() for h in headers]

    # Indices of the first recognised Lat and Lon columns (the header row
    # was validated beforehand, so both are present).
    lat_idx = next(i for i, h in enumerate(clean_headers) if h in self.lat_names)
    lon_idx = next(i for i, h in enumerate(clean_headers) if h in self.lon_names)

    # Local binding of the check function for loop speed
    check_func = self._data_sense_check

    # Shortest row length that can still hold both coordinate cells.
    min_len = max(lat_idx, lon_idx) + 1

    with open(output_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(headers)

        for row_num, row in enumerate(rows_gen, start=2):
            # Skip row if it contains no data
            if self._detect_empty_rows(row):
                continue

            # A ragged row too short to hold both coordinates would raise a
            # raw IndexError below — report it as a coordinate error instead.
            if len(row) < min_len or not check_func(row[lon_idx], row[lat_idx]):
                raise InvalidInputFileException(
                    detail=f"Coordinate error at row {row_num}. "
                    "Check for dates or non-numeric values in Lat/Lon."
                )

            writer.writerow(row)
Loading
Loading