diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..9049978 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,65 @@ +name: Python package + +on: + push: + branches: + - 'master' + pull_request: + + +jobs: + lint-coverage: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.13"] + steps: + - uses: actions/checkout@v6 + + - name: Setup uv + uses: astral-sh/setup-uv@v7 + + - name: Lint + run: uv run --python ${{ matrix.python-version }} --extra dev ruff check --output-format=github + + - name: Generate coverage report + run: | + uv run --python ${{ matrix.python-version }} --extra dev \ + pytest tests/ \ + --cov=. \ + --cov-branch \ + --cov-report=term-missing \ + --cov-report=html:cov_html \ + --cov-report=markdown-append:$GITHUB_STEP_SUMMARY \ + --verbose + + - uses: actions/upload-artifact@v6 + with: + name: html-coverage-artifact + path: ./cov_html/ + + test: + needs: lint-coverage + runs-on: ${{ matrix.os }} + concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os }}-${{ matrix.python-version }} + cancel-in-progress: true + strategy: + fail-fast: true + matrix: + python-version: ["3.9", "3.10", "3.11", "3.12", "3.13", "3.14"] + os: [ubuntu-latest, windows-latest, macos-latest] + exclude: + - os: macos-latest + python-version: "3.9" + - os: macos-latest + python-version: "3.10" + + steps: + - uses: actions/checkout@v6 + + - name: Setup uv + uses: astral-sh/setup-uv@v7 + + - name: Install and test + run: uv run --python ${{ matrix.python-version }} --extra dev pytest diff --git a/.gitignore b/.gitignore index f2cbc5c..293af28 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,9 @@ nosetests.xml # Python build directory api/python/build + +.venv/ +venv/ +pyenv/ +.idea/ +build/ \ No newline at end of file diff --git a/CHANGES.txt b/CHANGES.txt index f01f6a7..667204f 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -6,4 +6,5 @@ v0.3.3, 2017-03-07 -- Python 3 compatible v0.3.4, 2018-04-05 -- Fix incorrect version being saved v0.3.4.1, 2018-10-19 -- python3 compatible next for generator v0.3.5, 2019-12-21 -- add validator and clean-up repo structure +v0.4.0, 2025-12-21 -- Replaces PyTables with H5Py, cleans repo structure, adds CI diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 8376c3c..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,3 +0,0 @@ -include *.txt -recursive-include docs *.txt - diff --git a/README.md b/README.md index 73af057..cc477c3 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # OMX Python API Documentation -The Python OMX API borrows heavily from PyTables. An OMX file extends the equivalent PyTables File object, so anything you can do in PyTables you can do with OMX as well. This API attempts to be very Pythonic, including dictionary-style lookup of matrix names. +The Python OMX API is built on top of h5py. An OMX file extends the equivalent h5py File object, so anything you can do in h5py you can do with OMX as well. This API attempts to be very Pythonic, including dictionary-style lookup of matrix names. * [Pre-requisites](#pre-requisites) * [Installation](#installation) @@ -12,9 +12,9 @@ The Python OMX API borrows heavily from PyTables. An OMX file extends the equiva # Pre-requisites -Python 2.6+, PyTables 3.1+, and NumPy. Python 3 is now supported too. +Python 3.9+, h5py 2.10+, and NumPy. 
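+
+If you want to sanity-check your environment first, a quick version check works (a minimal sketch; `h5py.version.version` and `numpy.__version__` are standard attributes of those packages):
+
+```python
+import sys
+
+import h5py
+import numpy as np
+
+assert sys.version_info >= (3, 9)
+print('h5py:', h5py.version.version)  # needs >= 2.10
+print('numpy:', np.__version__)
+```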
-On Windows, the easiest way to get these is from [Anaconda](https://www.continuum.io/downloads#windows) or from Chris Gohlke's python binaries [page](http://www.lfd.uci.edu/~gohlke/pythonlibs/). On Linux, your distribution already has these available.
+Binaries for all these dependencies are readily available from PyPI and can be installed via pip.
 
 # Installation
 
@@ -22,7 +22,13 @@ The easiest way to get OMX on Python is to use pip. Get the latest package (call
 
 `pip install openmatrix`
 
-This command will fetch openmatrix from the PyPi repository and download/install it for you. The package name "omx" was already taken on pip for a lame xml library that no one uses. Thus our little project goes by "openmatrix" on pip instead of "omx". This means your import statements should look like,
+Using uv is also possible and much faster:
+
+    `pip install uv`
+    `uv pip install openmatrix`
+
+
+Either command will fetch openmatrix from the PyPI repository and download/install it for you. The package name "omx" was already taken on pip for a lame xml library that no one uses. Thus our little project goes by "openmatrix" on pip instead of "omx". This means your import statements should look like,
 
 `import openmatrix as omx`
 
 and NOT:
 
 `import omx`
 
 # Quick-Start Sample Code
 
 ```python
-from __future__ import print_function
 import openmatrix as omx
 import numpy as np
 
 # Create some data
-ones = np.ones((100,100))
-twos = 2.0*ones
+ones = np.ones((100, 100))
+twos = 2.0 * ones
 
 # Create an OMX file (will overwrite existing file!)
 print('Creating myfile.omx')
-myfile = omx.open_file('myfile.omx','w') # use 'a' to append/edit an existing file
+myfile = omx.open_file('myfile.omx', 'w')  # use 'a' to append/edit an existing file
 
 # Write to the file.
 myfile['m1'] = ones
 myfile['m2'] = twos
-myfile['m3'] = ones + twos # numpy array math is fast
+myfile['m3'] = ones + twos  # numpy array math is fast
 myfile.close()
 
 # Open an OMX file for reading only
 print('Reading myfile.omx')
 myfile = omx.open_file('myfile.omx')
 
-print ('Shape:', myfile.shape()) # (100,100)
-print ('Number of tables:', len(myfile)) # 3
-print ('Table names:', myfile.list_matrices()) # ['m1','m2',',m3']
+print('Shape:', myfile.shape())  # (100,100)
+print('Number of tables:', len(myfile))  # 3
+print('Table names:', myfile.list_matrices())  # ['m1','m2','m3']
 
 # Work with data. Pass a string to select matrix by name:
 # -------------------------------------------------------
 m1 = myfile['m1']
 m2 = myfile['m2']
 m3 = myfile['m3']
 
 # halves = m1 * 0.5 # CRASH! Don't modify an OMX object directly.
 # # Create a new numpy array, and then edit it.
 halves = np.array(m1) * 0.5
 
 first_row = m2[0]
 first_row[:] = 0.5 * first_row[:]
 
 my_very_special_zone_value = m2[10][25]
 
 # FANCY: Use attributes to find matrices
 # --------------------------------------
-myfile.close() # was opened read-only, so let's reopen.
-myfile = omx.open_file('myfile.omx','a') # append mode: read/write existing file
+myfile.close()  # was opened read-only, so let's reopen.
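+# ('a' creates the file if it doesn't exist; 'r+' also opens read/write but
+# requires the file to already exist. See open_file below for all modes.)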
+myfile = omx.open_file('myfile.omx', 'a')  # append mode: read/write existing file
 
 myfile['m1'].attrs.timeperiod = 'am'
 myfile['m1'].attrs.mode = 'hwy'
@@ -87,13 +92,13 @@ myfile['m2'].attrs.timeperiod = 'md'
 
 myfile['m3'].attrs.timeperiod = 'am'
 myfile['m3'].attrs.mode = 'trn'
 
-print('attributes:', myfile.list_all_attributes()) # ['mode','timeperiod']
+print('attributes:', myfile.list_all_attributes())  # ['mode','timeperiod']
 
 
 # Use a DICT to select matrices via attributes:
-all_am_trips = myfile[ {'timeperiod':'am'} ] # [m1,m3]
-all_hwy_trips = myfile[ {'mode':'hwy'} ] # [m1]
-all_am_trn_trips = myfile[ {'mode':'trn','timeperiod':'am'} ] # [m3]
+all_am_trips = myfile[{'timeperiod': 'am'}]  # [m1,m3]
+all_hwy_trips = myfile[{'mode': 'hwy'}]  # [m1]
+all_am_trn_trips = myfile[{'mode': 'trn', 'timeperiod': 'am'}]  # [m3]
 
 print('sum of some tables:', np.sum(all_am_trips))
 
@@ -102,23 +107,23 @@ print('sum of some tables:', np.sum(all_am_trips))
 # (any mapping would work, such as a mapping with large gaps between zone
 # numbers. For this simple case we'll just assume TAZ numbers are 1-100.)
 
-taz_equivs = np.arange(1,101) # 1-100 inclusive
+taz_equivs = np.arange(1, 101)  # 1-100 inclusive
 
 myfile.create_mapping('taz', taz_equivs)
-print('mappings:', myfile.list_mappings()) # ['taz']
+print('mappings:', myfile.list_mappings())  # ['taz']
 
-tazs = myfile.mapping('taz') # Returns a dict: {1:0, 2:1, 3:2, ..., 100:99}
+tazs = myfile.mapping('taz')  # Returns a dict: {1:0, 2:1, 3:2, ..., 100:99}
 m3 = myfile['m3']
-print('cell value:', m3[tazs[100]][tazs[100]]) # 3.0 (taz (100,100) is cell [99][99])
+print('cell value:', m3[tazs[100]][tazs[100]])  # 3.0 (taz (100,100) is cell [99][99])
 
 myfile.close()
 ```
 
 # Testing
 
-Testing is done with [nose](https://nose.readthedocs.io/en/latest/). Run the tests via:
+Testing is done with [pytest](https://docs.pytest.org/). Run the tests via:
 
 ```
-openmatrix\test> nosetests -v
+pytest
 ```
 
 # OMX File Validator
@@ -132,58 +137,53 @@ omx-validate my_file.omx
 
 ### File Objects
 
-OMX File objects extend Pytables.File, so all Pytables functions work normally. We've also added some useful stuff to make things even easier.
+OMX File objects extend h5py.File, so most h5py functions work normally; a few methods are specialized for matrix data rather than behaving like their generic h5py counterparts. We've also added some useful stuff to make things even easier.
 
 ### Writing Data
 
-Writing data to an OMX file is simple: You must provide a name, and you must provide either an existing numpy (or python) array, or a shape and an "atom". You can optionally provide a descriptive title, a list of tags, and other implementation minutiae.
+Writing data to an OMX file is simple: you must provide a name, and either an existing numpy (or python) array, or a shape and a dtype. You can optionally provide a descriptive title, a dictionary of attributes, and other implementation minutiae.
 
-The easiest way to do all that is to use python dictionary nomenclature: 
+The easiest way to do all that is to use python dictionary nomenclature:
 
 ```python
 myfile['matrixname'] = mynumpyobject
 ```
 
-will call createMatrix() for you and populate it with the specified array.
+will call `create_matrix()` for you and populate it with the specified array.
 
 ### Accessing Data
 
-You can access matrix objects by name, using dictionary lookup e.g. `myfile['hwydist']` or using PyTables path notation, e.g. `myfile.root.hwydist`
+You can access matrix objects by name, using dictionary lookup, e.g. `myfile['hwydist']`.
 
 ### Matrix objects
 
-OMX matrices extend numpy arrays. An OMX matrix object extends a Pytables/HDF5 "node" which means all HDF5 methods and properties behave normally. Generally these datasets are going to be numpy CArray objects of arbitrary shape.
+OMX matrices are h5py Dataset objects, so most h5py Dataset methods and properties behave normally.
 
 You can access a matrix object by name using:
 
 * dictionary syntax, e.g. `myfile['hwydist']`
-* or by Pytables path syntax, e.g. `myfile.root.hwydist`
 
 Once you have a matrix object, you can perform normal numpy math on it or you can access rows and columns pythonically:
 
 ```python
 myfile['biketime'][0][0] = 0.60 * myfile['bikedist'][0][0]
-total_trips = numpy.sum(myfile.root.trips)`
+total_trips = np.sum(myfile['trips'])
 ```
 
 ### Properties
 
-Every Matrix has its own dictionary of key/value pair attributes (properties) which can be accessed using the standard Pytables .attrs field. Add as many attributes as you like; attributes can be string, ints, floats, and lists:
+Every Matrix has its own dictionary of key/value pair attributes (properties) which can be accessed using the standard h5py .attrs field. Add as many attributes as you like; attributes can be strings, ints, floats, and lists:
 
 ```python
-print mymatrix.attrs
-print mymatrix.attrs.myfield
-print mymatrix.attrs['myfield']
+print(mymatrix.attrs)
+print(mymatrix.attrs['myfield'])
+print(mymatrix.attrs.items())
 ```
 
-### Tags
-
-If you create tags for your objects, you can also look up matrices by those tags. You can assign tags to any matrix using the 'tags' property attribute. Tags are a list of strings, e.g. ['skims','am','hwy']. To retrieve the list of matrices that matches a given set of tags, pass in a tuple of tags when using dictionary-style lookups:
+Files can be queried for all matrices that match a set of attributes by indexing with a dictionary of attribute names and values. The result is always a list, which can be empty:
 
 ```python
-list_all_hwy_skims = mybigfile[ ('skims','hwy') ]
+matching = myfile[{"myfield": 123}]
 ```
 
-This will always return a list (which can be empty). A matrix will only be included in the returned list if ALL tags specified match exactly. Tags are case-sensitive.
-
 ### Mappings
 
 A mapping allows rows and columns to be accessed using an integer value other than a zero-based offset. For instance zone numbers often start at "1" not "0", and there can be significant gaps between zone numbers; they're rarely fully sequential. An OMX file can contain multiple mappings.
@@ -197,176 +197,234 @@ A mapping allows rows and columns to be accessed using an integer value other th
 
 ## Global Properties
 
 ### `__version__`
-OMX module version string. Currently '0.3.5' as of this writing. This is the Python API version.
+OMX module version string. Currently '0.4.0' as of this writing. This is the Python API version.
 
 ### `__omx_version__`
 OMX file format version. Currently '0.2'. This is the OMX file format specification that omx-python adheres to.
 
-### `open_file`(filename, mode='r', title='', root_uep='/', filters=Filters(complevel=1, complib='zlib', shuffle=True, bitshuffle=False, fletcher32=False, least_significant_digit=None), shape=None, **kwargs)
+### `open_file(filename: Union[str, PathLike], mode: Literal["r", "w", "a", "r+", "w-", "x"] = "r", title: str = "", filters: Optional[Union[dict[str, Any], Any]] = None, shape: Optional[tuple[int, int]] = None, **kwargs,) -> File`
 Open or create a new OMX file. New files will be created with default
-    zlib compression enabled.
-    
+    gzip compression enabled if filters is None.
+ Parameters ---------- - filename : string + filename : string or PathLike Name or path and name of file mode : string - 'r' for read-only; - 'w' to write (erases existing file); + 'r' for read-only; + 'w' to write (erases existing file); 'a' to read/write an existing file (will create it if doesn't exist). - Ignored in read-only mode. + 'r+' is also supported (read/write, must exist). + 'w- or x' create file, fail if exists. title : string Short description of this file, used when creating the file. Default is ''. Ignored in read-only mode. - filters : tables.Filters - HDF5 default filter options for compression, shuffling, etc. Default for - OMX standard file format is: zlib compression level 1, and shuffle=True. - Only specify this if you want something other than the recommended standard - HDF5 zip compression. - 'None' will create enormous uncompressed files. - Only 'zlib' compression is guaranteed to be available on all HDF5 implementations. - See HDF5 docs for more detail. - shape: numpy.array - Shape of matrices in this file. Default is None. Specify a valid shape - (e.g. (1200,1200)) to enforce shape-checking for all added objects. - If shape is not specified, the first added matrix will not be shape-checked + filters : dict or object + HDF5 default filter options. + Default for OMX standard file format is: gzip compression level 1, and shuffle=True. + shape: array-like + Shape of matrices in this file. Default is None. Specify a valid shape + (e.g. (1000,1200)) to enforce shape-checking for all added objects. + If shape is not specified, the first added matrix will not be shape-checked and all subsequently added matrices must match the shape of the first matrix. All tables in an OMX file must have the same shape. - + Returns ------- f : openmatrix.File The file object for reading and writing. ## File Objects + OMX File class, which contains all the methods for adding, removing, manipulating matrices + and mappings in an OMX file. -### `create_mapping`(self, title, entries, overwrite=False) - Create an equivalency index, which maps a raw data dimension to - another integer value. Once created, mappings can be referenced by - offset or by key. - - Parameters: - ----------- - title : string - Name of this mapping - entries : list - List of n equivalencies for the mapping. n must match one data - dimension of the matrix. - overwrite : boolean - True to allow overwriting an existing mapping, False will raise - a LookupError if the mapping already exists. Default is False. - - Returns: - -------- - mapping : tables.array - Returns the created mapping. - - Raises: - LookupError : if the mapping exists and overwrite=False - -### `create_matrix`(self, name, atom=None, shape=None, title='', filters=None, chunkshape=None, byteorder=None, createparents=False, obj=None, attrs=None) +### `version(self) -> Optional[str]` + """ + Return the OMX file format of this OMX file, embedded in the OMX_VERSION file attribute. + Returns None if the OMX_VERSION attribute is not set. + """ + +### `create_matrix(self, name: str, shape: Optional[tuple[int, int]] = None, title: str = "", filters: Union[dict, Any] = None, chunks: Union[bool, tuple[int, int]] = True, obj: Optional[npt.NDArray[Union[np.integer, np.floating]]] = None, dtype: Optional[np.dtype] = None, attrs: Optional[dict] = None,) -> h5py.Dataset` Create an OMX Matrix (CArray) at the root level. User must pass in either an existing numpy matrix, or a shape and an atom type. - + Parameters ---------- name : string The name of this matrix. 
Stored in HDF5 as the leaf name. - title : string - Short description of this matrix. Default is ''. - obj : numpy.CArray - Existing numpy array from which to create this OMX matrix. If obj is passed in, - then shape and atom can be left blank. If obj is not passed in, then a shape and - atom must be specified instead. Default is None. shape : numpy.array Optional shape of the matrix. Shape is an int32 numpy array of format (rows,columns). - If shape is not specified, an existing numpy CArray must be passed in instead, + If shape is not specified, an existing numpy CArray must be passed in instead, as the 'obj' parameter. Default is None. - atom : atom_type - Optional atom type of the data. Can be int32, float32, etc. Default is None. - If None specified, then obj parameter must be passed in instead. + title : string + Short description of this matrix. Default is ''. filters : tables.Filters - Set of HDF5 filters (compression, etc) used for creating the matrix. + Set of HDF5 filters (compression, etc) used for creating the matrix. Default is None. See HDF5 documentation for details. Note: while the default here - is None, the default set of filters set at the OMX parent file level is + is None, the default set of filters set at the OMX parent file level is zlib compression level 1. Those settings usually trickle down to the table level. + chunks: bool or tuple[int, int] + Enable HDF5 array chunking. A value of True enables HDF5 to guess the best chunk size. Chunk size may impact + I/O performance. + obj : numpy.NDArray + Existing numpy array from which to create this OMX matrix. If obj is passed in, + then shape and atom can be left blank. If obj is not passed in, then a shape and + atom must be specified instead. Default is None. + dtype: numpy.dtype + Underlying data to use for storage. Defaults to the datatype of obj. attrs : dict Dictionary of attribute names and values to be attached to this matrix. Default is None. - + Returns ------- - matrix : tables.carray + matrix : h5py.Dataset HDF5 CArray matrix - -### `delete_mapping`(self, title) - Remove a mapping. - - Raises: + +### `shape(self) -> Optional[tuple[int, int]]` + Get the one and only shape of all matrices in this File + + Returns ------- - LookupError : if the specified mapping does not exist. - -### `list_all_attributes`(self) + shape : tuple + Tuple of (rows,columns) for this matrix and file or None if a shape is not present and could not be + inferred. + +### `list_matrices(self) -> list[str]` + List the matrix names in this File + + Returns + ------- + matrices : list + List of all matrix names stored in this OMX file. + +### `list_all_attributes(self) -> list[str]` Return set of all attributes used for any Matrix in this File - + Returns ------- all_attributes : set The combined set of all attribute names that exist on any matrix in this file. - -### `list_mappings`(self) + +### `data(self) -> h5py.Group` + Return the '/data' group. + +### `lookup(self) -> h5py.Group` + Return the '/lookup' group. + +### `list_mappings(self) -> list[str]` List all mappings in this file - + Returns: -------- mappings : list - List of the names of all mappings in the OMX file. Mappings + List of the names of all mappings in the OMX file. Mappings are stored internally in the 'lookup' subset of the HDF5 file structure. Returns empty list if there are no mappings. - -### `list_matrices`(self) - List the matrix names in this File - - Returns + +### `delete_mapping(self, title) -> None` + Remove a mapping. 
+ + Raises: ------- - matrices : list - List of all matrix names stored in this OMX file. - -### `map_entries`(self, title) - Return a list of entries for the specified mapping. - Throws LookupError if the specified mapping does not exist. - -### `mapping`(self, title) + LookupError : if the specified mapping does not exist. + +### `delete_matrix(self, name) -> None` + Remove a matrix. + + Raises: + ------- + LookupError : if the specified matrix does not exist. + +### `mapping(self, title) -> dict[Any, int]` Return dict containing key:value pairs for specified mapping. Keys represent the map item and value represents the array offset. - + Parameters: ----------- title : string Name of the mapping to be returned - + Returns: -------- mapping : dict - Dictionary where each key is the map item, and the value + Dictionary where each key is the map item, and the value represents the array offset. - + Raises: ------- LookupError : if the specified mapping does not exist. - -### `shape`(self) - Get the one and only shape of all matrices in this File - - Returns + +### `map_entries(self, title) -> list[Any]` + Return a list of entries for the specified mapping. + + Parameters: + ----------- + title : string + Name of the mapping to be returned + + Returns: + -------- + mappings : list + List of entries for the specified mapping. + + Raises: ------- - shape : tuple - Tuple of (rows,columns) for this matrix and file. - -### `version`(self) - Return the OMX file format of this OMX file, embedded in the OMX_VERSION file attribute. - Returns None if the OMX_VERSION attribute is not set. + LookupError : if the specified mapping does not exist. + +### `create_mapping(self, title, entries, overwrite=False)` + Create an equivalency index, which maps a raw data dimension to + another integer value. Once created, mappings can be referenced by + offset or by key. + + Parameters: + ----------- + title : string + Name of this mapping + entries : list + List of n equivalencies for the mapping. n must match one data + dimension of the matrix. + overwrite : boolean + True to allow overwriting an existing mapping, False will raise + a LookupError if the mapping already exists. Default is False. + + Returns: + -------- + mapping : tables.array + Returns the created mapping. + + Raises: + LookupError : if the mapping exists and overwrite=False + +### `__getitem__(self, key)` + Return a matrix by name, a list of matrices by attributes, or a HDF5 group for given absolute path. + + +### `__len__(self)` + Return the length of the '/data' group. + +### `__setitem__(self, key, dataset)` + Create a matrix with a given name. + + If a h5py.Dataset is provide that dataset is copied directly. + +### `items(self)` + Return the key value pairs of the '/data' group. + +### `keys(self)` + Return the keys of the '/data' group. + +### `values(self)` + Return the values of the '/data' group. + +### `__delitem__(self, key)` + Delete a matrix by name, or a HDF5 group for given absolute path. + +### `__iter__(self)` + Iterate over the matrices in this container. +### `__contains__(self, item)` + Test if a name is with the '/data' group. ## Exceptions * LookupError diff --git a/example/python-omx-sample.py b/example/python-omx-sample.py index c3b8055..fee743e 100644 --- a/example/python-omx-sample.py +++ b/example/python-omx-sample.py @@ -3,36 +3,36 @@ import numpy as np # Create some data -ones = np.ones((100,100)) -twos = 2.0*ones +ones = np.ones((100, 100)) +twos = 2.0 * ones # Create an OMX file (will overwrite existing file!) 
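+# (open_file also accepts a pathlib.Path, and new files are written with
+# gzip level-1 compression and shuffle enabled by default.)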
-print('Creating myfile.omx') -myfile = omx.open_file('myfile.omx','w') # use 'a' to append/edit an existing file +print("Creating myfile.omx") +myfile = omx.open_file("myfile.omx", "w") # use 'a' to append/edit an existing file # Write to the file. -myfile['m1'] = ones -myfile['m2'] = twos -myfile['m3'] = ones + twos # numpy array math is fast +myfile["m1"] = ones +myfile["m2"] = twos +myfile["m3"] = ones + twos # numpy array math is fast myfile.close() # Open an OMX file for reading only -print('Reading myfile.omx') -myfile = omx.open_file('myfile.omx') +print("Reading myfile.omx") +myfile = omx.open_file("myfile.omx") -print ('Shape:', myfile.shape()) # (100,100) -print ('Number of tables:', len(myfile)) # 3 -print ('Table names:', myfile.list_matrices()) # ['m1','m2',',m3'] +print("Shape:", myfile.shape()) # (100,100) +print("Number of tables:", len(myfile)) # 3 +print("Table names:", myfile.list_matrices()) # ['m1','m2',',m3'] # Work with data. Pass a string to select matrix by name: # ------------------------------------------------------- -m1 = myfile['m1'] -m2 = myfile['m2'] -m3 = myfile['m3'] +m1 = myfile["m1"] +m2 = myfile["m2"] +m3 = myfile["m3"] # halves = m1 * 0.5 # CRASH! Don't modify an OMX object directly. # # Create a new numpy array, and then edit it. @@ -46,26 +46,26 @@ # FANCY: Use attributes to find matrices # -------------------------------------- -myfile.close() # was opened read-only, so let's reopen. -myfile = omx.open_file('myfile.omx','a') # append mode: read/write existing file +myfile.close() # was opened read-only, so let's reopen. +myfile = omx.open_file("myfile.omx", "a") # append mode: read/write existing file -myfile['m1'].attrs.timeperiod = 'am' -myfile['m1'].attrs.mode = 'hwy' +myfile["m1"].attrs.timeperiod = "am" +myfile["m1"].attrs.mode = "hwy" -myfile['m2'].attrs.timeperiod = 'md' +myfile["m2"].attrs.timeperiod = "md" -myfile['m3'].attrs.timeperiod = 'am' -myfile['m3'].attrs.mode = 'trn' +myfile["m3"].attrs.timeperiod = "am" +myfile["m3"].attrs.mode = "trn" -print('attributes:', myfile.list_all_attributes()) # ['mode','timeperiod'] +print("attributes:", myfile.list_all_attributes()) # ['mode','timeperiod'] # Use a DICT to select matrices via attributes: -all_am_trips = myfile[ {'timeperiod':'am'} ] # [m1,m3] -all_hwy_trips = myfile[ {'mode':'hwy'} ] # [m1] -all_am_trn_trips = myfile[ {'mode':'trn','timeperiod':'am'} ] # [m3] +all_am_trips = myfile[{"timeperiod": "am"}] # [m1,m3] +all_hwy_trips = myfile[{"mode": "hwy"}] # [m1] +all_am_trn_trips = myfile[{"mode": "trn", "timeperiod": "am"}] # [m3] -print('sum of some tables:', np.sum(all_am_trips)) +print("sum of some tables:", np.sum(all_am_trips)) # SUPER FANCY: Create a mapping to use TAZ numbers instead of matrix offsets @@ -73,16 +73,15 @@ # (any mapping would work, such as a mapping with large gaps between zone # numbers. For this simple case we'll just assume TAZ numbers are 1-100.) 
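+# (A mapping with gaps, e.g. np.arange(101, 301, 2) for 100 non-sequential
+# TAZ numbers, would work exactly the same way.)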
-taz_equivs = np.arange(1,101) # 1-100 inclusive +taz_equivs = np.arange(1, 101) # 1-100 inclusive -myfile.create_mapping('taz', taz_equivs) -print('mappings:', myfile.list_mappings()) # ['taz'] +myfile.create_mapping("taz", taz_equivs) +print("mappings:", myfile.list_mappings()) # ['taz'] -tazs = myfile.mapping('taz') # Returns a dict: {1:0, 2:1, 3:2, ..., 100:99} +tazs = myfile.mapping("taz") # Returns a dict: {1:0, 2:1, 3:2, ..., 100:99} -m3 = myfile['m3'] +m3 = myfile["m3"] -print('cell value:', m3[tazs[100]][tazs[100]]) # 3.0 (taz (100,100) is cell [99][99]) +print("cell value:", m3[tazs[100]][tazs[100]]) # 3.0 (taz (100,100) is cell [99][99]) myfile.close() - diff --git a/openmatrix/File.py b/openmatrix/File.py deleted file mode 100644 index c7eb328..0000000 --- a/openmatrix/File.py +++ /dev/null @@ -1,367 +0,0 @@ -import numpy as np -import tables # requires pytables >= 3.1 - -from .Exceptions import * - - -class File(tables.File): - """ - OMX File class, which contains all the methods for adding, removing, manipulating tables - and mappings in an OMX file. - """ - - def __init__(self, f,m,t,r,f1, **kwargs): - tables.File.__init__(self,f,m,t,r,f1,**kwargs) - self._shape = None - - def version(self): - """ - Return the OMX file format of this OMX file, embedded in the OMX_VERSION file attribute. - Returns None if the OMX_VERSION attribute is not set. - """ - if 'OMX_VERSION' in self.root._v_attrs: - return self.root._v_attrs['OMX_VERSION'] - else: - return None - - - def create_matrix(self, name, atom=None, shape=None, title='', filters=None, - chunkshape=None, byteorder=None, createparents=False, obj=None, - attrs=None): - """ - Create an OMX Matrix (CArray) at the root level. User must pass in either - an existing numpy matrix, or a shape and an atom type. - - Parameters - ---------- - name : string - The name of this matrix. Stored in HDF5 as the leaf name. - title : string - Short description of this matrix. Default is ''. - obj : numpy.CArray - Existing numpy array from which to create this OMX matrix. If obj is passed in, - then shape and atom can be left blank. If obj is not passed in, then a shape and - atom must be specified instead. Default is None. - shape : numpy.array - Optional shape of the matrix. Shape is an int32 numpy array of format (rows,columns). - If shape is not specified, an existing numpy CArray must be passed in instead, - as the 'obj' parameter. Default is None. - atom : atom_type - Optional atom type of the data. Can be int32, float32, etc. Default is None. - If None specified, then obj parameter must be passed in instead. - filters : tables.Filters - Set of HDF5 filters (compression, etc) used for creating the matrix. - Default is None. See HDF5 documentation for details. Note: while the default here - is None, the default set of filters set at the OMX parent file level is - zlib compression level 1. Those settings usually trickle down to the table level. - attrs : dict - Dictionary of attribute names and values to be attached to this matrix. - Default is None. 
- - Returns - ------- - matrix : tables.carray - HDF5 CArray matrix - """ - - # If object was passed in, make sure its shape is correct - if self.shape() is not None and obj is not None and obj.shape != self.shape(): - raise ShapeError('%s has shape %s but this file requires shape %s' % - (name, obj.shape, self.shape())) - - matrix = self.create_carray(self.root.data, name, atom, shape, title, filters, - chunkshape, byteorder, createparents, obj) - - # Store shape if we don't have one yet - if self._shape is None: - storeshape = np.array([matrix.shape[0],matrix.shape[1]], dtype='int32') - self.root._v_attrs['SHAPE'] = storeshape - self._shape = matrix.shape - - # attributes - if attrs: - for key in attrs: - matrix.attrs[key] = attrs[key] - - return matrix - - def shape(self): - """ - Get the one and only shape of all matrices in this File - - Returns - ------- - shape : tuple - Tuple of (rows,columns) for this matrix and file. - """ - - # If we already have the shape, just return it - if self._shape: - return self._shape - - # If shape is already set in root node attributes, grab it - if 'SHAPE' in self.root._v_attrs: - # Shape is stored as a numpy.array: - arrayshape = self.root._v_attrs['SHAPE'] - # which must be converted to a tuple: - realshape = (arrayshape[0],arrayshape[1]) - self._shape = realshape - return self._shape - - # Inspect the first CArray object to determine its shape - if len(self) > 0: - # jwd: generator has no next funtion in python 3 - # next() function supported in both in python 2.6+ and python 3 - self._shape = next(self.iter_nodes(self.root.data,'CArray')).shape - - # Store it if we can - if self._iswritable(): - storeshape = np.array( - [self._shape[0],self._shape[1]], - dtype='int32') - self.root._v_attrs['SHAPE'] = storeshape - - return self._shape - - else: - return None - - - def list_matrices(self): - """ - List the matrix names in this File - - Returns - ------- - matrices : list - List of all matrix names stored in this OMX file. - """ - return [node.name for node in self.list_nodes(self.root.data,'CArray')] - - - def list_all_attributes(self): - """ - Return set of all attributes used for any Matrix in this File - - Returns - ------- - all_attributes : set - The combined set of all attribute names that exist on any matrix in this file. - """ - all_tags = set() - for m in self.iter_nodes(self.root.data, 'CArray'): - all_tags.update(m.attrs._v_attrnamesuser) - return sorted(all_tags) - - - # MAPPINGS ----------------------------------------------- - def list_mappings(self): - """ - List all mappings in this file - - Returns: - -------- - mappings : list - List of the names of all mappings in the OMX file. Mappings - are stored internally in the 'lookup' subset of the HDF5 file - structure. Returns empty list if there are no mappings. - """ - try: - return [m.name for m in self.list_nodes(self.root.lookup)] - except: - return [] - - - def delete_mapping(self, title): - """ - Remove a mapping. - - Raises: - ------- - LookupError : if the specified mapping does not exist. - """ - - try: - self.remove_node(self.root.lookup, title) - except: - raise LookupError('No such mapping: '+title) - - - def mapping(self, title): - """ - Return dict containing key:value pairs for specified mapping. Keys - represent the map item and value represents the array offset. 
- - Parameters: - ----------- - title : string - Name of the mapping to be returned - - Returns: - -------- - mapping : dict - Dictionary where each key is the map item, and the value - represents the array offset. - - Raises: - ------- - LookupError : if the specified mapping does not exist. - """ - - try: - # fetch entries - entries = [] - entries.extend(self.get_node(self.root.lookup, title)[:]) - - # build reverse key-lookup - keymap = {} - for i in range(len(entries)): - keymap[entries[i]] = i - - return keymap - - except: - raise LookupError('No such mapping: '+title) - - def map_entries(self, title): - """Return a list of entries for the specified mapping. - Throws LookupError if the specified mapping does not exist. - """ - try: - # fetch entries - entries = [] - entries.extend(self.get_node(self.root.lookup, title)[:]) - - return entries - - except: - raise LookupError('No such mapping: '+title) - - - def create_mapping(self, title, entries, overwrite=False): - """ - Create an equivalency index, which maps a raw data dimension to - another integer value. Once created, mappings can be referenced by - offset or by key. - - Parameters: - ----------- - title : string - Name of this mapping - entries : list - List of n equivalencies for the mapping. n must match one data - dimension of the matrix. - overwrite : boolean - True to allow overwriting an existing mapping, False will raise - a LookupError if the mapping already exists. Default is False. - - Returns: - -------- - mapping : tables.array - Returns the created mapping. - - Raises: - LookupError : if the mapping exists and overwrite=False - """ - - # Enforce shape-checking - if self.shape(): - if not len(entries) in self._shape: - raise ShapeError('Mapping must match one data dimension') - - # Handle case where mapping already exists: - if title in self.list_mappings(): - if overwrite: - self.delete_mapping(title) - else: - raise LookupError(title+' mapping already exists.') - - # Create lookup group under root if it doesn't already exist. - if 'lookup' not in self.root: - self.create_group(self.root, 'lookup') - - # Write the mapping! - mymap = self.create_array(self.root.lookup, title, atom=tables.UInt32Atom(), - shape=(len(entries),) ) - mymap[:] = entries - - return mymap - - - # The following functions implement Python list/dictionary lookups. ---- - def __getitem__(self,key): - """Return a matrix by name, or a list of matrices by attributes""" - - if isinstance(key, str): - return self.get_node(self.root.data, key) - - if 'keys' not in dir(key): - raise LookupError('Key %s not found' % key) - - # Loop through key/value pairs - mats = self.list_nodes(self.root.data, 'CArray') - for a in key.keys(): - mats = self._getMatricesByAttribute(a, key[a], mats) - - return mats - - - def _getMatricesByAttribute(self, key, value, matrices=None): - - answer = [] - - if matrices is None: - matrices = self.list_nodes(self.root.data,'CArray') - - for m in matrices: - if m.attrs is None: - continue - - # Only test if key is present in matrix attributes - if key in m.attrs._v_attrnames and m.attrs[key] == value: - answer.append(m) - - return answer - - - def __len__(self): - return len(self.list_nodes(self.root.data, 'CArray')) - - - def __setitem__(self, key, dataset): - # We need to determine atom and shape from the object that's been passed in. - # This assumes 'dataset' is a numpy object. 
- atom = tables.Atom.from_dtype(dataset.dtype) - shape = dataset.shape - - #checks to see if it is already a tables instance, and if so, just copies it - if dataset.__class__.__name__ == 'CArray': - return dataset.copy(self.root.data, key) - else: - return self.create_matrix(key, atom, shape, obj=dataset) - - - def __delitem__(self, key): - self.remove_node(self.root.data, key) - - - def __iter__(self): - """Iterate over the keys in this container""" - return self.iter_nodes(self.root.data, 'CArray') - - - def __contains__(self, item): - return item in self.root.data._v_children - - # BACKWARD COMPATIBILITY: - # PyTables switched from camelCaseMethods to camel_case_methods - # We follow suit, and keep old methods for backward compat: - createMapping = create_mapping - createMatrix = create_matrix - deleteMapping = delete_mapping - listMatrices = list_matrices - listAllAttributes = list_all_attributes - listMappings = list_mappings - mapentries = map_entries - mapEntries = map_entries - diff --git a/openmatrix/__init__.py b/openmatrix/__init__.py deleted file mode 100644 index cb894c0..0000000 --- a/openmatrix/__init__.py +++ /dev/null @@ -1,79 +0,0 @@ -from __future__ import print_function - -import tables -import numpy as np - -from .File import * -from .Exceptions import * - -# GLOBAL VARIABLES ----------- -__version__ = '0.3.3' -__omx_version__ = b'0.2' - -# GLOBAL FUNCTIONS ----------- -def open_file(filename, mode='r', title='', root_uep='/', - filters=tables.Filters(complevel=1, shuffle=True, fletcher32=False, complib='zlib'), - shape=None, **kwargs): - """ - Open or create a new OMX file. New files will be created with default - zlib compression enabled. - - Parameters - ---------- - filename : string - Name or path and name of file - mode : string - 'r' for read-only; - 'w' to write (erases existing file); - 'a' to read/write an existing file (will create it if doesn't exist). - Ignored in read-only mode. - title : string - Short description of this file, used when creating the file. Default is ''. - Ignored in read-only mode. - filters : tables.Filters - HDF5 default filter options for compression, shuffling, etc. Default for - OMX standard file format is: zlib compression level 1, and shuffle=True. - Only specify this if you want something other than the recommended standard - HDF5 zip compression. - 'None' will create enormous uncompressed files. - Only 'zlib' compression is guaranteed to be available on all HDF5 implementations. - See HDF5 docs for more detail. - shape: array-like - Shape of matrices in this file. Default is None. Specify a valid shape - (e.g. (1000,1200)) to enforce shape-checking for all added objects. - If shape is not specified, the first added matrix will not be shape-checked - and all subsequently added matrices must match the shape of the first matrix. - All tables in an OMX file must have the same shape. - - Returns - ------- - f : openmatrix.File - The file object for reading and writing. 
- """ - f = File(filename, mode, title, root_uep, filters, **kwargs); - - # add omx structure if file is writable - if mode != 'r': - # version number - if 'OMX_VERSION' not in f.root._v_attrs: - f.root._v_attrs['OMX_VERSION'] = __omx_version__ - if 'OMX_CREATED_WITH' not in f.root._v_attrs: - f.root._v_attrs['OMX_CREATED_WITH'] = 'python omx ' + __version__ - - # shape - if shape: - storeshape = np.array([shape[0],shape[1]], dtype='int32') - f.root._v_attrs['SHAPE'] = storeshape - - # /data and /lookup folders - if 'data' not in f.root: - f.create_group(f.root,"data") - if 'lookup' not in f.root: - f.create_group(f.root,"lookup") - - return f - - -if __name__ == "__main__": - print('OMX!') - diff --git a/openmatrix/test/test_file.py b/openmatrix/test/test_file.py deleted file mode 100644 index 63c07f2..0000000 --- a/openmatrix/test/test_file.py +++ /dev/null @@ -1,200 +0,0 @@ -import os -import tempfile - -import numpy as np -import numpy.testing as npt -import tables - -import openmatrix as omx - -import nose.tools as nt - -TEST_FILE = None - - -def ones5x5(): - return np.ones((5, 5)) - - -def add_m1_node(f): - f.create_matrix('m1', obj=ones5x5()) - - -def setup_func(): - global TEST_FILE - - if TEST_FILE is not None and os.path.isfile(TEST_FILE): - os.remove(TEST_FILE) - - with tempfile.NamedTemporaryFile(suffix='.omx') as tmp: - TEST_FILE = tmp.name - - -def teardown_func(): - if TEST_FILE is not None and os.path.isfile(TEST_FILE): - os.remove(TEST_FILE) - - -@nt.with_setup(setup_func, teardown_func) -def test_create_file(): - with omx.open_file(TEST_FILE, 'w'): - pass - assert os.path.isfile(TEST_FILE) - - -@nt.with_setup(setup_func, teardown_func) -def test_open_readonly_hdf5_file(): - with tables.open_file(TEST_FILE, 'w'): - pass - - assert os.path.isfile(TEST_FILE) - - with omx.open_file(TEST_FILE, 'r'): - pass - - -@nt.with_setup(setup_func, teardown_func) -def test_set_get_del(): - with omx.open_file(TEST_FILE, 'w') as f: - add_m1_node(f) - npt.assert_array_equal(f['m1'], ones5x5()) - nt.assert_equal(f.shape(), (5, 5)) - del f['m1'] - nt.assert_not_in('m1', f) - - -@nt.with_setup(setup_func, teardown_func) -def test_add_numpy_matrix_using_brackets(): - with omx.open_file(TEST_FILE, 'w') as f: - f['m1'] = ones5x5() - npt.assert_array_equal(f['m1'], ones5x5()) - nt.assert_equal(f.shape(), (5, 5)) - - # test check for shape matching - with nt.assert_raises(omx.Exceptions.ShapeError): - f.create_matrix('m2', obj=np.ones((8, 8))) - - -@nt.with_setup(setup_func, teardown_func) -def test_add_numpy_matrix_using_create_matrix(): - with omx.open_file(TEST_FILE, 'w') as f: - f.create_matrix('m1', obj=ones5x5()) - npt.assert_array_equal(f['m1'], ones5x5()) - nt.assert_equal(f.shape(), (5, 5)) - - -@nt.with_setup(setup_func, teardown_func) -@nt.raises(tables.FileModeError) -def test_add_matrix_to_readonly_file(): - with omx.open_file(TEST_FILE, 'w') as f: - f['m2'] = np.ones((5, 5)) - - with omx.open_file(TEST_FILE, 'r') as f: - f.create_matrix('m1', obj=np.ones((5, 5))) - - -@nt.with_setup(setup_func, teardown_func) -@nt.raises(tables.NodeError) -def test_add_matrix_with_same_name(): - with omx.open_file(TEST_FILE, 'w') as f: - add_m1_node(f) - # now add m1 again: - add_m1_node(f) - - -@nt.with_setup(setup_func, teardown_func) -def test_get_length_of_file(): - with omx.open_file(TEST_FILE, 'w') as f: - f['m1'] = np.ones((5, 5)) - f['m2'] = np.ones((5, 5)) - f['m3'] = np.ones((5, 5)) - f['m4'] = np.ones((5, 5)) - f['m5'] = np.ones((5, 5)) - nt.assert_equal(len(f), 5) - 
nt.assert_equal(len(f.list_matrices()), 5) - - -@nt.with_setup(setup_func, teardown_func) -def test_len_list_iter(): - names = ['m{}'.format(x) for x in range(5)] - with omx.open_file(TEST_FILE, 'w') as f: - for m in names: - f[m] = ones5x5() - - for mat in f: - npt.assert_array_equal(mat, ones5x5()) - - nt.assert_equal(len(f), len(names)) - nt.assert_equal(f.list_matrices(), names) - - -@nt.with_setup(setup_func, teardown_func) -def test_contains(): - with omx.open_file(TEST_FILE, 'w') as f: - add_m1_node(f) - nt.assert_in('m1', f) - # keep this here to be sure we're actually running - # File.__contains__ - assert 'm1' in f - - -@nt.with_setup(setup_func, teardown_func) -def test_list_all_attrs(): - with omx.open_file(TEST_FILE, 'w') as f: - add_m1_node(f) - f['m2'] = ones5x5() - - nt.assert_equal(f.list_all_attributes(), []) - - f['m1'].attrs['a1'] = 'a1' - f['m1'].attrs['a2'] = 'a2' - f['m2'].attrs['a2'] = 'a2' - f['m2'].attrs['a3'] = 'a3' - - nt.assert_equal(f.list_all_attributes(), ['a1', 'a2', 'a3']) - - -@nt.with_setup(setup_func, teardown_func) -def test_matrices_by_attr(): - with omx.open_file(TEST_FILE, 'w') as f: - f['m1'] = ones5x5() - f['m2'] = ones5x5() - f['m3'] = ones5x5() - - for m in f: - m.attrs['a1'] = 'a1' - m.attrs['a2'] = 'a2' - f['m3'].attrs['a2'] = 'a22' - f['m3'].attrs['a3'] = 'a3' - - gmba = f._getMatricesByAttribute - - nt.assert_equal(gmba('zz', 'zz'), []) - nt.assert_equal(gmba('a1', 'a1'), [f['m1'], f['m2'], f['m3']]) - nt.assert_equal(gmba('a2', 'a2'), [f['m1'], f['m2']]) - nt.assert_equal(gmba('a2', 'a22'), [f['m3']]) - nt.assert_equal(gmba('a3', 'a3'), [f['m3']]) - - -@nt.with_setup(setup_func, teardown_func) -def test_set_with_carray(): - with omx.open_file(TEST_FILE, 'w') as f: - f['m1'] = ones5x5() - f['m2'] = f['m1'] - npt.assert_array_equal(f['m2'], f['m1']) - -@nt.with_setup(setup_func, teardown_func) -def test_mappings(): - with omx.open_file(TEST_FILE, 'w') as f: - taz_equivs = np.arange(1,4) - f.create_mapping('taz', taz_equivs) - - tazs = f.mapping('taz') - nt.assert_equal(tazs, {1:0, 2:1, 3:2}) - nt.assert_raises(LookupError, f.mapping, 'missing') - nt.assert_raises(TypeError, f.mapping) - - entries = f.map_entries('taz') - nt.assert_equal(entries, [1, 2, 3]) - nt.assert_raises(LookupError, f.map_entries, 'missing') - nt.assert_raises(TypeError, f.map_entries) diff --git a/openmatrix/validator.py b/openmatrix/validator.py deleted file mode 100644 index e105556..0000000 --- a/openmatrix/validator.py +++ /dev/null @@ -1,213 +0,0 @@ -import os -from . 
import open_file as _open_file - -def pass_or_fail(ok): - return("Pass" if ok else "Fail") - -def open_file(filename): - mat_file = _open_file(filename, "r") - print("File contents:", filename) - print(mat_file) - return(mat_file) - -def check1(mat_file, required=True, checknum=1): - try: - print('\nCheck 1: Has OMX_VERSION attribute set to 0.2') - ok = mat_file.root._v_attrs['OMX_VERSION'] == b'0.2' - print(" File version is 0.2:", pass_or_fail(ok)) - return(ok, required, checknum) - except Exception as err: - return (False, required, checknum, str(err)) - -def check2(mat_file, required=True, checknum=2): - try: - print('\nCheck 2: Has SHAPE array attribute set to two item integer array') - ok = len(mat_file.root._v_attrs['SHAPE']) == 2 - print(" Length is 2:", pass_or_fail(ok)) - ok_2 = int(mat_file.root._v_attrs['SHAPE'][0]) == mat_file.root._v_attrs['SHAPE'][0] - print(" First item is integer:", pass_or_fail(ok_2)) - ok_3 = int(mat_file.root._v_attrs['SHAPE'][1]) == mat_file.root._v_attrs['SHAPE'][1] - print(" Second item is integer:", pass_or_fail(ok_3)) - print(' Shape:', mat_file.shape()) - return(ok * ok_2 * ok_3, required, checknum) - except Exception as err: - return (False, required, checknum, str(err)) - -def check3(mat_file, required=True, checknum=3): - try: - print('\nCheck 3: Has data group for matrices') - ok = 'data' in map(lambda x: x._v_name, mat_file.list_nodes("/")) - print(" Group:", pass_or_fail(ok)) - print(' Number of Matrices:', len(mat_file)) - print(' Matrix names:', mat_file.list_matrices()) - return(ok, required, checknum) - except Exception as err: - return (False, required, checknum, str(err)) - -def check4(mat_file, required=True, checknum=4): - try: - print("\nCheck 4: Matrix shape matches file shape") - ok = True - for matrix in mat_file.list_matrices(): - ok_2 = (mat_file[matrix].shape == mat_file.root._v_attrs['SHAPE']).all() - print(" Matrix shape: ", matrix, ":", mat_file[matrix].shape, ":", pass_or_fail(ok_2)) - ok = ok * ok_2 - return(ok, required, checknum) - except Exception as err: - return (False, required, checknum, str(err)) - -def check5(mat_file, required=True, checknum=5): - try: - print('\nCheck 5: Uses common data types (float or int) for matrices') - ok = True - for matrix in mat_file.list_matrices(): - ok_2 = (mat_file[matrix].dtype == float) or (mat_file[matrix].dtype == int) - print(" Matrix: ", matrix, ":", mat_file[matrix].dtype, ":", pass_or_fail(ok_2)) - ok = ok * ok_2 - return(ok, required, checknum) - except Exception as err: - return (False, required, checknum, str(err)) - -def check6(mat_file, required=True, checknum=6): - try: - print('\nCheck 6: Matrices chunked for faster I/O') - ok = True - for matrix in mat_file.list_matrices(): - ok_2 = True if mat_file[matrix].chunkshape is not None else False - print(" Matrix chunkshape: ", matrix, ":", mat_file[matrix].chunkshape, ":", pass_or_fail(ok_2)) - ok = ok * ok_2 - return(ok, required, checknum) - except Exception as err: - return (False, required, checknum, str(err)) - -def check7(mat_file, required=False, checknum=7): - try: - print('\nCheck 7: Uses zlib compression if compression used') - ok = True - for matrix in mat_file.list_matrices(): - ok_2 = True if mat_file[matrix].filters.complib is not None else False - if ok_2: - ok_3 = mat_file[matrix].filters.complib == 'zlib' - ok_2 = ok_2 * ok_3 - print(" Matrix compression library and level: ", matrix, ":", mat_file[matrix].filters.complib, ":", mat_file[matrix].filters.complevel, ":", pass_or_fail(ok_2)) - ok = ok * ok_2 
- return(ok, required, checknum) - except Exception as err: - return (False, required, checknum, str(err)) - -def check8(mat_file, required=False, checknum=8): - try: - print("\nCheck 8: Has NA attribute if desired (but not required)") - ok = True - for matrix in mat_file.list_matrices(): - ok_2 = mat_file[matrix].attrs.__contains__("NA") - print(" Matrix NA attribute: ", matrix, ":", pass_or_fail(ok_2)) - ok = ok * ok_2 - return(ok, required, checknum) - except Exception as err: - return (False, required, checknum, str(err)) - -def check9(mat_file, required=False, checknum=9): - try: - print('\nCheck 9: Has lookup group for labels/indexes if desired (but not required)') - ok = 'lookup' in map(lambda x: x._v_name, mat_file.list_nodes("/")) - print(" Group:", pass_or_fail(ok)) - if ok: - print(' Number of Lookups:', len(mat_file.list_mappings())) - print(' Lookups names:', mat_file.list_mappings()) - return(ok, required, checknum) - except Exception as err: - return (False, required, checknum, str(err)) - -def check10(mat_file, required=False, checknum=10): - try: - print("\nCheck 10: Lookup shapes are 1-d and match file shape") - ok = False - if 'lookup' in map(lambda x: x._v_name, mat_file.list_nodes("/")): - ok = True - for lookup in mat_file.list_mappings(): - this_shape = mat_file.get_node(mat_file.root.lookup, lookup).shape - ok_2 = len(this_shape)==1 and this_shape[0] in mat_file.root._v_attrs['SHAPE'] - print(" Lookup: ", lookup, ":", this_shape, ":", pass_or_fail(ok_2)) - ok = ok * ok_2 - return(ok, required, checknum) - except Exception as err: - return (False, required, checknum, str(err)) - -def check11(mat_file, required=False, checknum=11): - try: - print('\nCheck 11: Uses common data types (int or str) for lookups') - is_int = lambda x: x == int - ok = False - if 'lookup' in map(lambda x: x._v_name, mat_file.list_nodes("/")): - ok = True - for lookup in mat_file.list_mappings(): - try: - ok_2 = all(map(lambda x: x == int(x), mat_file.mapping(lookup).keys())) - except ValueError: - ok_2 = None - if not ok_2: - ok_2 = all(map(lambda x: x == str(x), mat_file.mapping(lookup).keys())) - if not ok_2: - ok_2 = all(map(lambda x: x == bytes(x), mat_file.mapping(lookup).keys())) - this_dtype = mat_file.get_node(mat_file.root.lookup, lookup).dtype - print(" Lookup: ", lookup, ":",this_dtype,":", pass_or_fail(ok_2)) - ok = ok * ok_2 - return(ok, required, checknum) - except Exception as err: - return (False, required, checknum, str(err)) - -def check12(mat_file, required=False, checknum=12): - try: - print("\nCheck 12: Has Lookup DIM attribute of 0 (row) or 1 (column) if desired (but not required)") - print(" Not supported at this time by the Python openmatrix package") - ok = False - if 'lookup' in map(lambda x: x._v_name, mat_file.list_nodes("/")): - ok = False - return(ok, required, checknum) - except Exception as err: - return (False, required, checknum, str(err)) - - -def run_checks(filename): - if not os.path.exists(filename): - raise FileNotFoundError(filename) - try: - mat_file = open_file(filename) - except: - print("Unable to open", filename, "using HDF5") - else: - try: - results = [] - results.append(check1(mat_file)) - results.append(check2(mat_file)) - results.append(check3(mat_file)) - results.append(check4(mat_file)) - results.append(check5(mat_file)) - results.append(check6(mat_file)) - results.append(check7(mat_file)) - results.append(check8(mat_file)) - results.append(check9(mat_file)) - results.append(check10(mat_file)) - results.append(check11(mat_file)) - 
results.append(check12(mat_file)) - print("\nOverall result ") - overall_ok = True - for result in results: - if len(result) == 4: - print(" ERROR", result[3]) - else: - print(" Check", result[2], ":", "Required" if result[1] else "Not required", ":", pass_or_fail(result[0])) - if result[1]: - overall_ok = overall_ok * result[0] - print(" Overall : ", pass_or_fail(overall_ok)) - finally: - mat_file.close() - - -def command_line(): - import argparse - parser = argparse.ArgumentParser() - parser.add_argument('filename', nargs=1, type=str, action="store", help='Open Matrix file to validate') - args = parser.parse_args() - run_checks(args.filename[0]) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..a88679d --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,60 @@ +[build-system] +requires = ["setuptools>=61.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "openmatrix" +version = "0.4.0" +description = "OMX, the open matrix data format" +readme = "README.txt" +requires-python = ">=3.9" +authors = [ + { name = "Billy Charlton", email = "billy@okbecause.com" }, + { name = "Ben Stabler", email = "benstabler@yahoo.com" }, +] + +license = "Apache-2.0" +license-files = ["LICENSE.TXT"] + +classifiers = [ + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", +] +keywords = ["openmatrix", "omx"] +dependencies = [ + "h5py >= 2.10.0", + "numpy >= 1.5.0", +] + +[project.optional-dependencies] +dev = [ + "pytest", + "pytest-cov", + "ruff == 0.14.10" +] + +[project.urls] +"Homepage" = "https://github.com/osPlanning/omx" + +[project.scripts] +omx-validate = "openmatrix.validator:command_line" + +[tool.setuptools] +package-dir = {"" = "src"} +packages = ["openmatrix"] + + +[tool.ruff] +line-length = 120 + +[tool.ruff.lint] +select = ["E", "F", "W"] +ignore = [] + +[tool.pytest.ini_options] +pythonpath = ["src"] +testpaths = ["tests"] diff --git a/setup.py b/setup.py deleted file mode 100644 index de2419b..0000000 --- a/setup.py +++ /dev/null @@ -1,30 +0,0 @@ -from setuptools import setup, find_packages - -# To push to PyPi/pip, use -# python setup.py sdist bdist_wheel upload - -setup( - name='OpenMatrix', - keywords='openmatrix omx', - version='0.3.5.0', - author='Billy Charlton, Ben Stabler', - author_email='billy@okbecause.com, benstabler@yahoo.com', - packages=find_packages(), - url='https://github.com/osPlanning/omx', - license='Apache', - description='OMX, the open matrix data format', - long_description=open('README.txt').read(), - install_requires=[ - "tables >= 3.1.0", - "numpy >= 1.5.0", - ], - classifiers=[ - 'License :: OSI Approved :: Apache Software License' - ], - entry_points={ - 'console_scripts': [ - 'omx-validate = openmatrix.validator:command_line', - ], - } - -) diff --git a/src/openmatrix/__init__.py b/src/openmatrix/__init__.py new file mode 100644 index 0000000..240b61c --- /dev/null +++ b/src/openmatrix/__init__.py @@ -0,0 +1,58 @@ +from os import PathLike +from typing import Union, Literal, Optional, Any + +from .exceptions import ShapeError as ShapeError +from .file import File as File, __version__ as __version__, __omx_version__ as __omx_version__ + + +# GLOBAL FUNCTIONS ----------- +def open_file( + filename: Union[str, PathLike], + mode: Literal["r", "w", "a", "r+", "w-", "x"] = "r", + title: str = "", + filters: 
Optional[Union[dict[str, Any], Any]] = None, + shape: Optional[tuple[int, int]] = None, + **kwargs, +) -> File: + """ + Open or create a new OMX file. New files will be created with default + gzip compression enabled if filters is None. + + Parameters + ---------- + filename : string or PathLike + Name or path and name of file + mode : string + 'r' for read-only; + 'w' to write (erases existing file); + 'a' to read/write an existing file (will create it if doesn't exist). + 'r+' is also supported (read/write, must exist). + 'w- or x' create file, fail if exists. + title : string + Short description of this file, used when creating the file. Default is ''. + Ignored in read-only mode. + filters : dict or object + HDF5 default filter options. + Default for OMX standard file format is: gzip compression level 1, and shuffle=True. + shape: array-like + Shape of matrices in this file. Default is None. Specify a valid shape + (e.g. (1000,1200)) to enforce shape-checking for all added objects. + If shape is not specified, the first added matrix will not be shape-checked + and all subsequently added matrices must match the shape of the first matrix. + All tables in an OMX file must have the same shape. + + Returns + ------- + f : openmatrix.File + The file object for reading and writing. + """ + + # Default filters if None and mode is writing + if filters is None and mode != "r": + filters = {"complib": "gzip", "complevel": 1, "shuffle": True} + + return File(filename, mode, title=title, filters=filters, shape=shape, **kwargs) + + +if __name__ == "__main__": # pragma: no cover + print("OMX!") diff --git a/openmatrix/Exceptions.py b/src/openmatrix/exceptions.py similarity index 97% rename from openmatrix/Exceptions.py rename to src/openmatrix/exceptions.py index 4688e2e..6f82cdb 100644 --- a/openmatrix/Exceptions.py +++ b/src/openmatrix/exceptions.py @@ -1,3 +1,2 @@ class ShapeError(Exception): pass - diff --git a/src/openmatrix/file.py b/src/openmatrix/file.py new file mode 100644 index 0000000..5d9038a --- /dev/null +++ b/src/openmatrix/file.py @@ -0,0 +1,490 @@ +from os import PathLike +from typing import Optional, Union, Any, Literal + +import h5py +import numpy as np +import numpy.typing as npt + +from .exceptions import ShapeError + + +__version__ = "0.4.0" +__omx_version__ = b"0.2" + + +class File(h5py.File): + """ + OMX File class, which contains all the methods for adding, removing, manipulating matrices + and mappings in an OMX file. + """ + + def __init__( + self, + name: Union[str, PathLike], + mode: Literal["r", "w", "a", "r+", "w-", "x"], + title: str = "", + filters: Optional[Union[dict[str, Any], Any]] = None, + shape: Optional[tuple[int, int]] = None, + **kwargs, + ): + super().__init__(name, mode, **kwargs) + self._shape = None + self.default_filters = filters + + # add omx structure if file is writable + if mode != "r": + # version number + if "OMX_VERSION" not in self.attrs: + self.attrs["OMX_VERSION"] = __omx_version__ + if "OMX_CREATED_WITH" not in self.attrs: + self.attrs["OMX_CREATED_WITH"] = "python omx " + __version__ + + # shape + if shape: + storeshape = np.array([shape[0], shape[1]], dtype=np.int32) + self.attrs["SHAPE"] = storeshape + + # /data and /lookup folders + if "data" not in self["/"]: + self.create_group("data") + if "lookup" not in self["/"]: + self.create_group("lookup") + + def version(self) -> Optional[str]: + """ + Return the OMX file format of this OMX file, embedded in the OMX_VERSION file attribute. 
+        Returns None if the OMX_VERSION attribute is not set.
+        """
+        if "OMX_VERSION" in self.attrs:
+            return self.attrs["OMX_VERSION"]
+        else:
+            return None
+
+    def create_matrix(
+        self,
+        name: str,
+        shape: Optional[tuple[int, int]] = None,
+        title: str = "",
+        filters: Optional[Union[dict, Any]] = None,
+        chunks: Union[bool, tuple[int, int]] = True,
+        obj: Optional[npt.NDArray[Union[np.integer, np.floating]]] = None,
+        dtype: Optional[np.dtype] = None,
+        attrs: Optional[dict] = None,
+    ) -> h5py.Dataset:
+        """
+        Create an OMX matrix (an HDF5 dataset in the '/data' group). User must pass in either
+        an existing numpy array, or a shape and a dtype.
+
+        Parameters
+        ----------
+        name : string
+            The name of this matrix. Stored in HDF5 as the leaf name.
+        shape : tuple
+            Optional shape of the matrix as (rows, columns).
+            If shape is not specified, an existing numpy array must be passed in instead,
+            as the 'obj' parameter. Default is None.
+        title : string
+            Short description of this matrix. Default is ''.
+        filters : dict or object
+            Set of HDF5 filters (compression, etc) used for creating the matrix, given either
+            as a dict of options or as a tables.Filters-like object. Default is None.
+            Note: while the default here is None, the default set of filters set at the OMX
+            parent file level is gzip compression level 1, and those settings usually trickle
+            down to the matrix level.
+        chunks : bool or tuple[int, int]
+            Enable HDF5 array chunking. A value of True enables HDF5 to guess the best chunk size. Chunk size may impact
+            I/O performance.
+        obj : numpy.NDArray
+            Existing numpy array from which to create this OMX matrix. If obj is passed in,
+            then shape and dtype can be left blank. If obj is not passed in, then a shape and
+            dtype must be specified instead. Default is None.
+        dtype : numpy.dtype
+            Underlying data type to use for storage. Defaults to the datatype of obj.
+        attrs : dict
+            Dictionary of attribute names and values to be attached to this matrix.
+            Default is None.
+
+        Returns
+        -------
+        matrix : h5py.Dataset
+            The newly created HDF5 dataset.
+        """
+
+        # If object was passed in, make sure its shape is correct
+        if self.shape() is not None and obj is not None and obj.shape != self.shape():
+            raise ShapeError(f"{name} has shape {obj.shape} but this file requires shape {self.shape()}")
+
+        # Determine dshape and dtype
+        dshape = shape
+        data = obj
+        if obj is not None:
+            dshape = obj.shape
+            dtype = obj.dtype
+
+        if dshape is None or dtype is None:
+            raise ValueError("Shape and dtype must be specified if obj is None")
+
+        # Handle compression
+        compression = compression_opts = None
+        shuffle = fletcher32 = False
+
+        # filters may be a dict, a tables.Filters-like object, or None;
+        # parse the basic options, falling back to the file-level defaults.
+        filters = filters or self.default_filters
+
+        if filters:
+            # Handle dict
+            if isinstance(filters, dict):
+                compression = filters.get("complib")
+                compression_opts = filters.get("complevel")
+                shuffle = filters.get("shuffle")
+                fletcher32 = filters.get("fletcher32")
+
+            # Handle object with attributes (like tables.Filters)
+            elif hasattr(filters, "complib"):
+                compression = filters.complib if filters.complib else compression
+                compression_opts = filters.complevel if hasattr(filters, "complevel") else compression_opts
+                shuffle = filters.shuffle if hasattr(filters, "shuffle") else shuffle
+                fletcher32 = filters.fletcher32 if hasattr(filters, "fletcher32") else fletcher32
+            else:
+                raise TypeError("unknown filters object")
+
+        compression = "gzip" if compression == "zlib" else compression
+
+        # create_dataset arguments
+        kwargs = {}
+        if compression:
+            kwargs["compression"] = compression
+        if compression_opts is not None:
+            kwargs["compression_opts"] = compression_opts
+        if shuffle:
+            kwargs["shuffle"] = shuffle
+        if fletcher32:
+            kwargs["fletcher32"] = fletcher32
+        if chunks:
+            kwargs["chunks"] = chunks
+
+        matrix = super().__getitem__("data").create_dataset(name, shape=dshape, dtype=dtype, data=data, **kwargs)
+
+        if title:
+            matrix.attrs["TITLE"] = title
+
+        # Store shape if we don't have one yet
+        if self._shape is None:
+            storeshape = np.array([matrix.shape[0], matrix.shape[1]], dtype="int32")
+            self.attrs["SHAPE"] = storeshape
+            self._shape = matrix.shape
+
+        # attributes
+        if attrs:
+            for key in attrs:
+                matrix.attrs[key] = attrs[key]
+
+        return matrix
+
+    def shape(self) -> Optional[tuple[int, int]]:
+        """
+        Get the one and only shape of all matrices in this File
+
+        Returns
+        -------
+        shape : tuple
+            Tuple of (rows, columns) shared by all matrices in this file, or None if a shape is not present and
+            could not be inferred.
+        """
+
+        # If we already have the shape, just return it
+        if self._shape:
+            return self._shape
+
+        # If shape is already set in root node attributes, grab it
+        if "SHAPE" in self.attrs:
+            # Shape is stored as a numpy.array:
+            arrayshape = self.attrs["SHAPE"]
+            # which must be converted to a tuple:
+            self._shape = (arrayshape[0], arrayshape[1])
+            return self._shape
+
+        # Inspect the first Dataset object to determine its shape
+        data_group = self.data
+        if len(data_group) > 0:
+            # Get first key
+            first_key = list(data_group.keys())[0]
+            self._shape = data_group[first_key].shape
+
+            # Store it if we can
+            if self.mode != "r":
+                storeshape = np.array([self._shape[0], self._shape[1]], dtype="int32")
+                self.attrs["SHAPE"] = storeshape
+                self.flush()
+
+            return self._shape
+        return None
+
+    def list_matrices(self) -> list[str]:
+        """
+        List the matrix names in this File
+
+        Returns
+        -------
+        matrices : list
+            List of all matrix names stored in this OMX file.
+        """
+
+        # Previous versions of OMX returned only the CArrays; since it is possible to create
+        # other array types, we return them all here.
+        return list(self.data.keys())
+
+    def list_all_attributes(self) -> list[str]:
+        """
+        Return a sorted list of all attribute names used for any Matrix in this File
+
+        Returns
+        -------
+        all_attributes : list
+            Sorted list of all attribute names that exist on any matrix in this file.
+        """
+        return sorted(set(k for m in self.data.values() for k in m.attrs.keys()))
+
+    # MAPPINGS -----------------------------------------------
+    @property
+    def data(self) -> h5py.Group:
+        """Return the '/data' group."""
+        return super().__getitem__("data")
+
+    @property
+    def lookup(self) -> h5py.Group:
+        """Return the '/lookup' group."""
+        return super().__getitem__("lookup")
+
+    def list_mappings(self) -> list[str]:
+        """
+        List all mappings in this file
+
+        Returns:
+        --------
+        mappings : list
+            List of the names of all mappings in the OMX file. Mappings
+            are stored internally in the 'lookup' subset of the HDF5 file
+            structure. Returns an empty list if there are no mappings.
+        """
+        return list(self.lookup.keys())
+
+    def delete_mapping(self, title) -> None:
+        """
+        Remove a mapping.
+
+        Raises:
+        -------
+        LookupError : if the specified mapping does not exist.
+        """
+        try:
+            del self.lookup[title]
+            self.flush()
+        except KeyError:
+            raise LookupError(f"No such mapping: {title}")
+
+    def delete_matrix(self, name) -> None:
+        """
+        Remove a matrix.
+
+        Raises:
+        -------
+        LookupError : if the specified matrix does not exist.
+        """
+        try:
+            del self.data[name]
+            self.flush()
+        except KeyError:
+            raise LookupError(f"No such matrix: {name}")
+
+    def mapping(self, title) -> dict[Any, int]:
+        """
+        Return dict containing key:value pairs for the specified mapping. Keys
+        represent the map items and values represent the array offsets.
+
+        Parameters:
+        -----------
+        title : string
+            Name of the mapping to be returned
+
+        Returns:
+        --------
+        mapping : dict
+            Dictionary where each key is the map item, and the value
+            represents the array offset.
+
+        Raises:
+        -------
+        LookupError : if the specified mapping does not exist.
+        """
+        entries = self.lookup[title][:]
+        # build reverse key-lookup
+        return {k: i for i, k in enumerate(entries)}
+
+    def map_entries(self, title) -> list[Any]:
+        """
+        Return a list of entries for the specified mapping.
+
+        Parameters:
+        -----------
+        title : string
+            Name of the mapping to be returned
+
+        Returns:
+        --------
+        mappings : list
+            List of entries for the specified mapping.
+
+        Raises:
+        -------
+        LookupError : if the specified mapping does not exist.
+        """
+        return self.lookup[title][:].tolist()
+
+    def create_mapping(self, title, entries, overwrite=False):
+        """
+        Create an equivalency index, which maps a raw data dimension to
+        another integer value. Once created, mappings can be referenced by
+        offset or by key.
+
+        Parameters:
+        -----------
+        title : string
+            Name of this mapping
+        entries : list
+            List of n equivalencies for the mapping. n must match one data
+            dimension of the matrix.
+        overwrite : boolean
+            True to allow overwriting an existing mapping, False will raise
+            a LookupError if the mapping already exists. Default is False.
+
+        Returns:
+        --------
+        mapping : h5py.Dataset
+            Returns the created mapping.
+
+        Raises:
+        -------
+        LookupError : if the mapping exists and overwrite=False
+        """
+
+        # Enforce shape-checking
+        if shape := self.shape():
+            if len(entries) not in shape:
+                raise ShapeError("Mapping must match one data dimension")
+
+        existing = self.list_mappings()
+        if title in existing:
+            if overwrite:
+                self.delete_mapping(title)
+            else:
+                raise LookupError(f"{title} mapping already exists.")
+
+        # Write the mapping into the lookup group
+        return self.lookup.create_dataset(title, data=entries)
+
+    # The following functions implement Python list/dictionary lookups. ----
+    def __getitem__(self, key):
+        """
+        Return a matrix by name, a list of matrices by attributes, or an HDF5 group for a given absolute path.
+        """
+
+        if isinstance(key, str):
+            # It's not uncommon to want a way out of the omx object, so we provide a special access method via an
+            # absolute path. Everything else is assumed to be in data
+            if key.startswith("/"):
+                return super().__getitem__(key)
+            else:
+                try:
+                    return self.data[key]
+                except KeyError:
+                    raise LookupError(f"Key {key} not found")
+
+        if not hasattr(key, "keys"):  # Pseudo isinstance(key, dict) check
+            raise LookupError(f"Key {key} not found")
+
+        # Loop through key/value pairs (attribute lookup)
+        mats = list(self.values())
+        for a in key.keys():
+            mats = self._getMatricesByAttribute(a, key[a], mats)
+
+        # Because 'mats' starts out as all children of data, an empty dict query (e.g. f[{}]) returns all of them.
+        return mats
+
+    def _getMatricesByAttribute(self, key, value, matrices=None):
+        """Return a matrix by name, or a list of matrices by attributes"""
+        answer = []
+
+        if matrices is None:
+            matrices = list(self.values())
+
+        for m in matrices:
+            # Only test if key is present in matrix attributes
+            if key in m.attrs and m.attrs[key] == value:
+                answer.append(m)
+
+        return answer
+
+    def __len__(self):
+        """Return the length of the '/data' group."""
+        return len(self.data)
+
+    def __setitem__(self, key, dataset):
+        """
+        Create a matrix with a given name.
+
+        If an h5py.Dataset is provided, that dataset is copied directly.
+        """
+        # If it's already an h5py dataset, copy it into the data group directly.
+        if isinstance(dataset, h5py.Dataset):
+            return self.data.copy(dataset, key)
+
+        # Otherwise 'dataset' is assumed to be a numpy array; dtype and shape are
+        # determined from the object itself. Replace any existing matrix first.
+        try:
+            del self[key]
+        except KeyError:
+            pass
+
+        return self.create_matrix(key, obj=dataset)
+
+    # Our set and get item methods break these methods from h5py. These could be useful, so we restore them by
+    # forwarding the call to the data group instead of the file object.
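+    # For example, list(f.keys()) returns only the matrix names stored under '/data',
+    # whereas h5py's plain File.keys() would report the 'data' and 'lookup' groups themselves.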
+    def items(self):
+        """Return the key value pairs of the '/data' group."""
+        return self.data.items()
+
+    def keys(self):
+        """Return the keys of the '/data' group."""
+        return self.data.keys()
+
+    def values(self):
+        """Return the values of the '/data' group."""
+        return self.data.values()
+
+    def __delitem__(self, key):
+        """
+        Delete a matrix by name, or an HDF5 group for a given absolute path.
+        """
+        if key.startswith("/"):
+            super().__delitem__(key)
+        else:
+            del self.data[key]
+
+    def __iter__(self):
+        """Iterate over the matrices in this container."""
+        return iter(self.values())
+
+    def __contains__(self, item):
+        """Test if a name is within the '/data' group."""
+        return item in self.data
+
+    # BACKWARD COMPATIBILITY:
+    createMapping = create_mapping
+    createMatrix = create_matrix
+    deleteMapping = delete_mapping
+    listMatrices = list_matrices
+    listAllAttributes = list_all_attributes
+    listMappings = list_mappings
+    mapentries = map_entries
+    mapEntries = map_entries
diff --git a/src/openmatrix/validator.py b/src/openmatrix/validator.py
new file mode 100644
index 0000000..f744f27
--- /dev/null
+++ b/src/openmatrix/validator.py
@@ -0,0 +1,263 @@
+import os
+
+from . import open_file as _open_file
+
+
+def pass_or_fail(ok):
+    return "Pass" if ok else "Fail"
+
+
+def open_file(filename):
+    mat_file = _open_file(filename, "r")
+    print("File contents:", filename)
+    print(mat_file)
+    return mat_file
+
+
+def check1(mat_file, required=True, checknum=1):
+    """Check 1: Has OMX_VERSION attribute set to 0.2"""
+    try:
+        print("\nCheck 1: Has OMX_VERSION attribute set to 0.2")
+        version = mat_file.attrs.get("OMX_VERSION")
+        # h5py may return bytes or string depending on version
+        ok = version in (b"0.2", "0.2")
+        print(" File version is 0.2:", pass_or_fail(ok))
+        return (ok, required, checknum)
+    except Exception as err:
+        return (False, required, checknum, str(err))
+
+
+def check2(mat_file, required=True, checknum=2):
+    """Check 2: Has SHAPE array attribute set to two item integer array"""
+    try:
+        print("\nCheck 2: Has SHAPE array attribute set to two item integer array")
+        shape_attr = mat_file.attrs.get("SHAPE")
+        ok = shape_attr is not None and len(shape_attr) == 2
+        print(" Length is 2:", pass_or_fail(ok))
+        ok_2 = int(shape_attr[0]) == shape_attr[0] if ok else False
+        print(" First item is integer:", pass_or_fail(ok_2))
+        ok_3 = int(shape_attr[1]) == shape_attr[1] if ok else False
+        print(" Second item is integer:", pass_or_fail(ok_3))
+        print(" Shape:", mat_file.shape())
+        return (ok and ok_2 and ok_3, required, checknum)
+    except Exception as err:
+        return (False, required, checknum, str(err))
+
+
+def check3(mat_file, required=True, checknum=3):
+    """Check 3: Has data group for matrices"""
+    try:
+        print("\nCheck 3: Has data group for matrices")
+        ok = "data" in mat_file["/"]
+        print(" Group:", pass_or_fail(ok))
+        print(" Number of Matrices:", len(mat_file))
+        print(" Matrix names:", mat_file.list_matrices())
+        return (ok, required, checknum)
+    except Exception as err:
+        return (False, required, checknum, str(err))
+
+
+def check4(mat_file, required=True, checknum=4):
+    """Check 4: Matrix shape matches file shape"""
+    try:
+        print("\nCheck 4: Matrix shape matches file shape")
+        ok = True
+        shape_attr = mat_file.attrs.get("SHAPE")
+        file_shape = tuple(shape_attr) if shape_attr is not None else None
+        for matrix in mat_file.list_matrices():
+            matrix_shape = mat_file[matrix].shape
+            ok_2 = matrix_shape == file_shape
+            print(" Matrix shape: ", matrix, ":", matrix_shape, ":", 
pass_or_fail(ok_2)) + ok = ok and ok_2 + return (ok, required, checknum) + except Exception as err: + return (False, required, checknum, str(err)) + + +def check5(mat_file, required=True, checknum=5): + """Check 5: Uses common data types (float or int) for matrices""" + try: + print("\nCheck 5: Uses common data types (float or int) for matrices") + ok = True + for matrix in mat_file.list_matrices(): + dtype = mat_file[matrix].dtype + ok_2 = dtype.kind in ("f", "i", "u") # float, signed int, unsigned int + print(" Matrix: ", matrix, ":", dtype, ":", pass_or_fail(ok_2)) + ok = ok and ok_2 + return (ok, required, checknum) + except Exception as err: + return (False, required, checknum, str(err)) + + +def check6(mat_file, required=True, checknum=6): + """Check 6: Matrices chunked for faster I/O""" + try: + print("\nCheck 6: Matrices chunked for faster I/O") + ok = True + for matrix in mat_file.list_matrices(): + chunks = mat_file[matrix].chunks + ok_2 = chunks is not None + print(" Matrix chunks: ", matrix, ":", chunks, ":", pass_or_fail(ok_2)) + ok = ok and ok_2 + return (ok, required, checknum) + except Exception as err: + return (False, required, checknum, str(err)) + + +def check7(mat_file, required=False, checknum=7): + """Check 7: Uses zlib/gzip compression if compression used""" + try: + print("\nCheck 7: Uses zlib/gzip compression if compression used") + ok = True + for matrix in mat_file.list_matrices(): + compression = mat_file[matrix].compression + compression_opts = mat_file[matrix].compression_opts + if compression is not None: + # h5py uses 'gzip' for zlib compression + ok_2 = compression == "gzip" + print( + " Matrix compression library and level: ", + matrix, + ":", + compression, + ":", + compression_opts, + ":", + pass_or_fail(ok_2), + ) + ok = ok and ok_2 + else: + print(" Matrix compression: ", matrix, ": None") + return (ok, required, checknum) + except Exception as err: + return (False, required, checknum, str(err)) + + +def check8(mat_file, required=False, checknum=8): + """Check 8: Has NA attribute if desired (but not required)""" + try: + print("\nCheck 8: Has NA attribute if desired (but not required)") + ok = True + for matrix in mat_file.list_matrices(): + ok_2 = "NA" in mat_file[matrix].attrs + print(" Matrix NA attribute: ", matrix, ":", pass_or_fail(ok_2)) + ok = ok and ok_2 + return (ok, required, checknum) + except Exception as err: + return (False, required, checknum, str(err)) + + +def check9(mat_file, required=False, checknum=9): + """Check 9: Has lookup group for labels/indexes if desired (but not required)""" + try: + print("\nCheck 9: Has lookup group for labels/indexes if desired (but not required)") + ok = "lookup" in mat_file["/"] + print(" Group:", pass_or_fail(ok)) + if ok: + print(" Number of Lookups:", len(mat_file.list_mappings())) + print(" Lookups names:", mat_file.list_mappings()) + return (ok, required, checknum) + except Exception as err: + return (False, required, checknum, str(err)) + + +def check10(mat_file, required=False, checknum=10): + """Check 10: Lookup shapes are 1-d and match file shape""" + try: + print("\nCheck 10: Lookup shapes are 1-d and match file shape") + ok = False + if "lookup" in mat_file["/"]: + ok = True + shape_attr = mat_file.attrs.get("SHAPE") + file_shape = tuple(shape_attr) if shape_attr is not None else () + lookup_group = mat_file.lookup + for lookup_name in mat_file.list_mappings(): + this_shape = lookup_group[lookup_name].shape + ok_2 = len(this_shape) == 1 and this_shape[0] in file_shape + print(" Lookup: ", 
lookup_name, ":", this_shape, ":", pass_or_fail(ok_2)) + ok = ok and ok_2 + return (ok, required, checknum) + except Exception as err: + return (False, required, checknum, str(err)) + + +def check11(mat_file, required=False, checknum=11): + """Check 11: Uses common data types (int or str) for lookups""" + try: + print("\nCheck 11: Uses common data types (int or str) for lookups") + ok = False + if "lookup" in mat_file["/"]: + ok = True + lookup_group = mat_file.lookup + for lookup_name in mat_file.list_mappings(): + dtype = lookup_group[lookup_name].dtype + # Check if integer or string type + ok_2 = dtype.kind in ("i", "u", "S", "U", "O") + print(" Lookup: ", lookup_name, ":", dtype, ":", pass_or_fail(ok_2)) + ok = ok and ok_2 + return (ok, required, checknum) + except Exception as err: + return (False, required, checknum, str(err)) + + +def check12(mat_file, required=False, checknum=12): + """Check 12: Has Lookup DIM attribute of 0 (row) or 1 (column) if desired (but not required)""" + try: + print("\nCheck 12: Has Lookup DIM attribute of 0 (row) or 1 (column) if desired (but not required)") + print(" Not supported at this time by the Python openmatrix package") + ok = "lookup" in mat_file["/"] + return (ok, required, checknum) + except Exception as err: + return (False, required, checknum, str(err)) + + +def run_checks(filename): + if not os.path.exists(filename): + raise FileNotFoundError(filename) + try: + mat_file = open_file(filename) + except Exception: + print("Unable to open", filename, "using HDF5") + else: + try: + results = [] + results.append(check1(mat_file)) + results.append(check2(mat_file)) + results.append(check3(mat_file)) + results.append(check4(mat_file)) + results.append(check5(mat_file)) + results.append(check6(mat_file)) + results.append(check7(mat_file)) + results.append(check8(mat_file)) + results.append(check9(mat_file)) + results.append(check10(mat_file)) + results.append(check11(mat_file)) + results.append(check12(mat_file)) + print("\nOverall result ") + overall_ok = True + for result in results: + if len(result) == 4: + print(" ERROR", result[3]) + else: + print( + " Check", + result[2], + ":", + "Required" if result[1] else "Not required", + ":", + pass_or_fail(result[0]), + ) + if result[1]: + overall_ok = overall_ok and result[0] + print(" Overall : ", pass_or_fail(overall_ok)) + finally: + mat_file.close() + + +def command_line(): + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("filename", nargs=1, type=str, action="store", help="Open Matrix file to validate") + args = parser.parse_args() + run_checks(args.filename[0]) diff --git a/tests/test_file.py b/tests/test_file.py new file mode 100644 index 0000000..e995c17 --- /dev/null +++ b/tests/test_file.py @@ -0,0 +1,648 @@ +import uuid +import pytest +import numpy as np +import numpy.testing as npt +import h5py +import openmatrix as omx + + +@pytest.fixture +def omx_file(tmp_path): + """Provide a unique temporary OMX file path for each test.""" + return tmp_path / f"test{uuid.uuid4().hex}.omx" + + +def ones5x5(): + return np.ones((5, 5)) + + +def add_m1_node(f): + f.create_matrix("m1", obj=ones5x5()) + + +def test_create_file(omx_file): + with omx.open_file(omx_file, "w"): + pass + assert omx_file.exists() + + +def test_open_readonly_hdf5_file(omx_file): + with h5py.File(omx_file, "w"): + pass + + assert omx_file.exists() + + with omx.open_file(omx_file, "r"): + pass + + +def test_set_get_del(omx_file): + with omx.open_file(omx_file, "w") as f: + add_m1_node(f) + 
npt.assert_array_equal(f["m1"], ones5x5()) + assert f.shape() == (5, 5) + del f["m1"] + assert "m1" not in f + + +def test_add_numpy_matrix_using_brackets(omx_file): + with omx.open_file(omx_file, "w") as f: + f["m1"] = ones5x5() + npt.assert_array_equal(f["m1"], ones5x5()) + assert f.shape() == (5, 5) + + # test check for shape matching + with pytest.raises(omx.exceptions.ShapeError): + f.create_matrix("m2", obj=np.ones((8, 8))) + + +def test_add_numpy_matrix_using_create_matrix(omx_file): + with omx.open_file(omx_file, "w") as f: + f.create_matrix("m1", obj=ones5x5()) + npt.assert_array_equal(f["m1"], ones5x5()) + assert f.shape() == (5, 5) + + +def test_add_matrix_to_readonly_file(omx_file): + with omx.open_file(omx_file, "w") as f: + f["m2"] = np.ones((5, 5)) + + with omx.open_file(omx_file, "r") as f: + with pytest.raises(ValueError): + f.create_matrix("m1", obj=np.ones((5, 5))) + + +def test_add_matrix_with_same_name(omx_file): + with omx.open_file(omx_file, "w") as f: + add_m1_node(f) + # now add m1 again: + with pytest.raises((ValueError, RuntimeError)): + add_m1_node(f) + + +def test_get_length_of_file(omx_file): + with omx.open_file(omx_file, "w") as f: + f["m1"] = np.ones((5, 5)) + f["m2"] = np.ones((5, 5)) + f["m3"] = np.ones((5, 5)) + f["m4"] = np.ones((5, 5)) + f["m5"] = np.ones((5, 5)) + assert len(f) == 5 + assert len(f.list_matrices()) == 5 + + +def test_len_list_iter(omx_file): + names = ["m{}".format(x) for x in range(5)] + with omx.open_file(omx_file, "w") as f: + for m in names: + f[m] = ones5x5() + + for mat in f: + npt.assert_array_equal(mat, ones5x5()) + + assert len(f) == len(names) + assert f.list_matrices() == names + + +def test_contains(omx_file): + with omx.open_file(omx_file, "w") as f: + add_m1_node(f) + assert "m1" in f + + +def test_contains_groups_and_datasets(omx_file): + with omx.open_file(omx_file, "w") as f: + # groups auto-created in writable mode + assert "data" in f["/"] and "data" not in f + assert "lookup" in f["/"] and "lookup" not in f + + f.create_mapping("zones", entries=np.array([1, 2, 3])) + f.create_matrix("m1", obj=np.ones((5, 5))) + + assert "m1" in f and "m1" in f.data # dataset inside data + assert "zones" in f.lookup # dataset inside lookup group + assert "missing" not in f + + +def test_list_all_attrs(omx_file): + with omx.open_file(omx_file, "w") as f: + add_m1_node(f) + f["m2"] = ones5x5() + + assert f.list_all_attributes() == [] + + f["m1"].attrs["a1"] = "a1" + f["m1"].attrs["a2"] = "a2" + f["m2"].attrs["a2"] = "a2" + f["m2"].attrs["a3"] = "a3" + + assert f.list_all_attributes() == ["a1", "a2", "a3"] + + +def test_matrices_by_attr(omx_file): + with omx.open_file(omx_file, "w") as f: + f["m1"] = ones5x5() + f["m2"] = ones5x5() + f["m3"] = ones5x5() + + for m in f: + m.attrs["a1"] = "a1" + m.attrs["a2"] = "a2" + f["m3"].attrs["a2"] = "a22" + f["m3"].attrs["a3"] = "a3" + + gmba = f._getMatricesByAttribute + + assert gmba("zz", "zz") == [] + + r1 = gmba("a1", "a1") + assert len(r1) == 3 + names = sorted([m.name.split("/")[-1] for m in r1]) + assert names == ["m1", "m2", "m3"] + + r2 = gmba("a2", "a2") + assert len(r2) == 2 + names2 = sorted([m.name.split("/")[-1] for m in r2]) + assert names2 == ["m1", "m2"] + + r3 = gmba("a2", "a22") + assert len(r3) == 1 + assert r3[0].name.split("/")[-1] == "m3" + + r4 = gmba("a3", "a3") + assert len(r4) == 1 + assert r4[0].name.split("/")[-1] == "m3" + + +def test_set_with_carray(omx_file): + with omx.open_file(omx_file, "w") as f: + f["m1"] = ones5x5() + f["m2"] = f["m1"] + 
npt.assert_array_equal(f["m2"], f["m1"]) + + +def test_mappings(omx_file): + with omx.open_file(omx_file, "w") as f: + taz_equivs = np.arange(1, 4) + f.create_mapping("taz", taz_equivs) + + tazs = f.mapping("taz") + assert tazs == {1: 0, 2: 1, 3: 2} + with pytest.raises(LookupError): + f.mapping("missing") + + entries = f.map_entries("taz") + assert entries == [1, 2, 3] + with pytest.raises(LookupError): + f.map_entries("missing") + + +def test_open_existing_with_append_mode(omx_file): + data = np.arange(9, dtype=float).reshape(3, 3) + # Create file and add a matrix + with omx.open_file(omx_file, "w") as f: + f.create_matrix("my_matrix", obj=data) + + # Re-open in append mode and ensure dataset is accessible + with omx.open_file(omx_file, "a") as f: + assert "my_matrix" in f + npt.assert_array_equal(f["my_matrix"], data) + + +def test_lookup_property_behavior(tmp_path): + omx_file = tmp_path / f"test{uuid.uuid4().hex}.omx" + # When writable, lookup is auto-created and returns an h5py Group + with omx.open_file(omx_file, "w") as f: + lookup_group = f.lookup + assert isinstance(lookup_group, h5py.Group) + assert "lookup" in f["/"] and "lookup" not in f + lookup_group.create_dataset("zones", data=np.arange(3)) + + # With an existing lookup group, read mode should expose it + with omx.open_file(omx_file, "r") as f: + assert "lookup" in f["/"] and "lookup" not in f + assert "zones" in f.lookup + + # If a read-only file has no lookup group, accessing lookup should raise + omx_file_no_lookup = tmp_path / f"test{uuid.uuid4().hex}.omx" + with h5py.File(omx_file_no_lookup, "w"): + pass + with omx.open_file(omx_file_no_lookup, "r") as f: + with pytest.raises(KeyError, match="object 'lookup' doesn't exist"): + _ = f.lookup + + +def test_version_attribute(omx_file): + """Test version() method returns OMX_VERSION or None.""" + # When file has OMX_VERSION (set by open_file in write mode) + with omx.open_file(omx_file, "w") as f: + ver = f.version() + # version can be bytes or string depending on h5py version + assert ver in (b"0.2", "0.2") + + # When file has no OMX_VERSION attribute + omx_file_no_ver = omx_file.parent / f"test{uuid.uuid4().hex}.omx" + with h5py.File(omx_file_no_ver, "w"): + pass + with omx.open_file(omx_file_no_ver, "r") as f: + assert f.version() is None + + +def test_create_matrix_without_obj_raises(omx_file): + """Test create_matrix raises ValueError when obj=None and shape/dtype not specified.""" + with omx.open_file(omx_file, "w") as f: + with pytest.raises(ValueError, match="Shape and dtype must be specified"): + f.create_matrix("test") + + +def test_create_matrix_with_shape_and_dtype(omx_file): + """Test create_matrix with explicit shape and dtype (no obj).""" + with omx.open_file(omx_file, "w") as f: + mat = f.create_matrix("test", shape=(3, 3), dtype=np.float64) + assert mat.shape == (3, 3) + assert mat.dtype == np.float64 + + +def test_create_matrix_with_title(omx_file): + """Test create_matrix with title sets TITLE attribute.""" + with omx.open_file(omx_file, "w") as f: + f.create_matrix("m1", obj=ones5x5(), title="My Matrix Title") + assert f["m1"].attrs["TITLE"] == "My Matrix Title" + + +def test_create_matrix_with_attrs(omx_file): + """Test create_matrix with custom attrs dict.""" + with omx.open_file(omx_file, "w") as f: + f.create_matrix("m1", obj=ones5x5(), attrs={"custom1": "value1", "custom2": 42}) + assert f["m1"].attrs["custom1"] == "value1" + assert f["m1"].attrs["custom2"] == 42 + + +def test_create_matrix_with_dict_filters(omx_file): + """Test create_matrix with 
dict-style filters including zlib->gzip conversion.""" + with omx.open_file(omx_file, "w") as f: + filters = {"complib": "zlib", "complevel": 4, "shuffle": True} + f.create_matrix("m1", obj=ones5x5(), filters=filters) + assert f["m1"].compression == "gzip" + assert f["m1"].compression_opts == 4 + assert f["m1"].shuffle is True + + +def test_create_matrix_with_object_filters(omx_file): + """Test create_matrix with object-style filters (like tables.Filters).""" + + class MockFilters: + complib = "zlib" + complevel = 2 + shuffle = False + fletcher32 = True + + with omx.open_file(omx_file, "w") as f: + f.create_matrix("m1", obj=ones5x5(), filters=MockFilters()) + assert f["m1"].compression == "gzip" # Translated from zlib + assert f["m1"].compression_opts == MockFilters.complevel + assert f["m1"].shuffle == MockFilters.shuffle + assert f["m1"].fletcher32 == MockFilters.fletcher32 + + +def test_shape_inferred_from_first_matrix(tmp_path): + """Test shape() infers shape from first matrix when SHAPE attr is missing.""" + omx_file = tmp_path / f"test{uuid.uuid4().hex}.omx" + # Create file directly with h5py to skip OMX structure + with h5py.File(omx_file, "w") as f: + data_group = f.create_group("data") + data_group.create_dataset("m1", data=np.ones((7, 7))) + + with omx.open_file(omx_file, "r") as f: + shape = f.shape() + assert shape == (7, 7) + + +def test_list_matrices_empty_file(tmp_path): + """Test list_matrices returns empty list when no data group exists.""" + omx_file = tmp_path / f"test{uuid.uuid4().hex}.omx" + with omx.open_file(omx_file, "w"): + pass + with omx.open_file(omx_file, "r") as f: + assert f.list_matrices() == [] + + +def test_delete_mapping(omx_file): + """Test delete_mapping removes a mapping.""" + with omx.open_file(omx_file, "w") as f: + f.create_mapping("taz", np.arange(1, 6)) + assert "taz" in f.list_mappings() + f.delete_mapping("taz") + assert "taz" not in f.list_mappings() + + +@pytest.mark.parametrize( + "setup,title", + [ + ("no_lookup", "missing"), # No lookup group exists + ("with_mapping", "nonexistent"), # Lookup exists but title doesn't + ], +) +def test_delete_mapping_errors(tmp_path, setup, title): + """Test delete_mapping raises LookupError for missing lookup or title.""" + omx_file = tmp_path / f"test{uuid.uuid4().hex}.omx" + if setup == "no_lookup": + with h5py.File(omx_file, "w"): + pass + with omx.File(omx_file, "r+") as f: + with pytest.raises(LookupError, match="No such mapping"): + f.delete_mapping(title) + else: + with omx.open_file(omx_file, "w") as f: + f.create_mapping("taz", np.arange(1, 6)) + with pytest.raises(LookupError, match="No such mapping"): + f.delete_mapping(title) + + +@pytest.mark.parametrize("matrix_exists", [True, False]) +def test_delete_matrix_behavior(omx_file, matrix_exists): + """Test delete_matrix success and error cases.""" + with omx.open_file(omx_file, "w") as f: + if matrix_exists: + f.create_matrix("m1", obj=ones5x5()) + assert "m1" in f + f.delete_matrix("m1") + assert "m1" not in f + else: + with pytest.raises(LookupError, match="No such matrix"): + f.delete_matrix("nonexistent") + + +@pytest.mark.parametrize("method", ["mapping", "map_entries"]) +def test_mapping_methods_missing_lookup(tmp_path, method): + """Test mapping() and map_entries() raise when lookup group doesn't exist.""" + omx_file = tmp_path / f"test{uuid.uuid4().hex}.omx" + with h5py.File(omx_file, "w"): + pass + with omx.open_file(omx_file, "r") as f: + with pytest.raises(KeyError, match="object 'lookup' doesn't exist"): + getattr(f, method)("missing") 
+ + +def test_create_mapping_shape_mismatch(omx_file): + """Test create_mapping raises ShapeError when entries don't match shape.""" + with omx.open_file(omx_file, "w") as f: + f.create_matrix("m1", obj=ones5x5()) # Sets shape to (5,5) + with pytest.raises(omx.exceptions.ShapeError, match="Mapping must match one data dimension"): + f.create_mapping("bad", np.arange(1, 10)) # Length 9 doesn't match 5 + + +@pytest.mark.parametrize( + "overwrite,should_raise", + [ + (True, False), # overwrite=True replaces existing + (False, True), # overwrite=False raises + ], +) +def test_create_mapping_overwrite_behavior(omx_file, overwrite, should_raise): + """Test create_mapping overwrite parameter behavior.""" + with omx.open_file(omx_file, "w") as f: + f.create_mapping("taz", np.arange(1, 4)) + if should_raise: + with pytest.raises(LookupError, match="mapping already exists"): + f.create_mapping("taz", np.arange(10, 13), overwrite=overwrite) + else: + f.create_mapping("taz", np.arange(10, 13), overwrite=overwrite) + assert f.map_entries("taz") == [10, 11, 12] + + +def test_getitem_direct_group_access(omx_file): + """Test __getitem__ with 'data' and 'lookup' keys.""" + with omx.open_file(omx_file, "w") as f: + data_group = f.data + assert isinstance(data_group, h5py.Group) + lookup_group = f.lookup + assert isinstance(lookup_group, h5py.Group) + + +def test_getitem_path_access(omx_file): + """Test __getitem__ with absolute path.""" + with omx.open_file(omx_file, "w") as f: + f.create_matrix("m1", obj=ones5x5()) + mat = f["/data/m1"] + npt.assert_array_equal(mat, ones5x5()) + + +def test_getitem_dict_attribute_lookup(omx_file): + """Test __getitem__ with dict for attribute-based lookup.""" + with omx.open_file(omx_file, "w") as f: + f["m1"] = ones5x5() + f["m2"] = ones5x5() + f["m1"].attrs["purpose"] = "work" + f["m2"].attrs["purpose"] = "home" + + result = f[{"purpose": "work"}] + assert len(result) == 1 + assert result[0].name.split("/")[-1] == "m1" + + +@pytest.mark.parametrize( + "key,desc", + [ + ("nonexistent", "missing string key"), + (12345, "invalid key type without keys() method"), + ], +) +def test_getitem_errors(omx_file, key, desc): + """Test __getitem__ raises LookupError for invalid keys.""" + with omx.open_file(omx_file, "w") as f: + with pytest.raises(LookupError, match="Key .* not found"): + _ = f[key] + + +def test_getMatricesByAttribute_no_matrices_arg(omx_file): + """Test _getMatricesByAttribute when matrices=None.""" + with omx.open_file(omx_file, "w") as f: + f["m1"] = ones5x5() + f["m1"].attrs["tag"] = "yes" + result = f._getMatricesByAttribute("tag", "yes") # matrices=None by default + assert len(result) == 1 + + +@pytest.mark.parametrize( + "operation,expected", + [ + ("len", 0), + ("iter", []), + ("contains", False), + ], +) +def test_empty_file_no_data_group(tmp_path, operation, expected): + """Test file operations when no data group exists.""" + omx_file = tmp_path / f"test{uuid.uuid4().hex}.omx" + with omx.open_file(omx_file, "w"): + pass + with omx.open_file(omx_file, "r") as f: + if operation == "len": + assert len(f) == expected + elif operation == "iter": + assert list(f) == expected + elif operation == "contains": + assert ("anything" in f) == expected + else: # pragma: no cover + pass + + +def test_setitem_overwrites_existing(omx_file): + """Test __setitem__ overwrites existing matrix.""" + with omx.open_file(omx_file, "w") as f: + f["m1"] = ones5x5() + f["m1"] = np.zeros((5, 5)) + npt.assert_array_equal(f["m1"], np.zeros((5, 5))) + + 
+@pytest.mark.parametrize("key_exists", [True, False])
+def test_delitem_behavior(omx_file, key_exists):
+    """Test __delitem__ for existing and non-existent keys."""
+    with omx.open_file(omx_file, "w") as f:
+        if key_exists:
+            f["custom"] = ones5x5()
+            assert "custom" in f
+            del f["custom"]
+            assert "custom" not in f
+        else:
+            with pytest.raises(KeyError, match="Couldn't delete link"):
+                del f["nonexistent"]
+
+
+def test_open_file_with_shape(omx_file):
+    """Test open_file with shape parameter sets SHAPE attribute."""
+    with omx.open_file(omx_file, "w", shape=(100, 200)) as f:
+        shape = f.attrs["SHAPE"]
+        assert tuple(shape) == (100, 200)
+
+
+def test_create_matrix_with_non_zlib_compression(omx_file):
+    """Test create_matrix with non-zlib compression (e.g., gzip directly)."""
+    with omx.open_file(omx_file, "w") as f:
+        filters = {"complib": "gzip", "complevel": 5, "shuffle": False}
+        f.create_matrix("m1", obj=ones5x5(), filters=filters)
+        assert f["m1"].compression == "gzip"
+        assert f["m1"].compression_opts == 5
+
+
+def test_shape_inferred_from_first_matrix_append_mode(tmp_path):
+    """Test shape() infers and stores shape when mode is not read-only."""
+    omx_file = tmp_path / f"test{uuid.uuid4().hex}.omx"
+    # Create file directly with h5py without SHAPE attribute
+    with h5py.File(omx_file, "w") as f:
+        data_group = f.create_group("data")
+        data_group.create_dataset("m1", data=np.ones((7, 7)))
+
+    # Open in append mode - shape should be inferred and stored
+    with omx.open_file(omx_file, "a") as f:
+        shape = f.shape()
+        assert shape == (7, 7)
+        # Verify SHAPE was stored in attrs
+        assert "SHAPE" in f.attrs
+        assert tuple(f.attrs["SHAPE"]) == (7, 7)
+
+
+def test_getMatricesByAttribute_no_data_group(tmp_path):
+    """Test _getMatricesByAttribute when no data group exists and matrices=None."""
+    omx_file = tmp_path / f"test{uuid.uuid4().hex}.omx"
+    with omx.open_file(omx_file, "w"):
+        pass
+
+    with omx.open_file(omx_file, "r") as f:
+        result = f._getMatricesByAttribute("key", "value")
+        assert result == []
+
+
+def test_setitem_with_h5py_dataset(omx_file):
+    """Test __setitem__ when passing an h5py.Dataset (copy scenario)."""
+    with omx.open_file(omx_file, "w") as f:
+        f["m1"] = ones5x5()
+        # Get the dataset and assign it to a new key
+        dataset = f["m1"]
+        f["m2"] = dataset
+        # Both should have the same data
+        npt.assert_array_equal(f["m1"], f["m2"])
+
+
+def test_map_entries_returns_list(tmp_path):
+    """Test that map_entries returns the mapping as a plain Python list."""
+    omx_file = tmp_path / f"test{uuid.uuid4().hex}.omx"
+    with omx.open_file(omx_file, "w") as f:
+        # Create mapping normally
+        f.create_mapping("taz", np.arange(1, 4))
+        # map_entries should hand back plain Python values
+        entries = f.map_entries("taz")
+        assert entries == [1, 2, 3]
+
+
+def test_backward_compatibility_aliases(omx_file):
+    """Test backward compatibility method aliases."""
+    with omx.open_file(omx_file, "w") as f:
+        # Test createMatrix alias
+        f.createMatrix("m1", obj=ones5x5())
+        assert "m1" in f
+
+        # Test listMatrices alias
+        assert f.listMatrices() == ["m1"]
+
+        # Test listAllAttributes alias
+        assert f.listAllAttributes() == []
+
+        # Test createMapping alias
+        f.createMapping("taz", np.arange(1, 6))
+        assert "taz" in f.listMappings()
+
+        # Test mapEntries alias
+        assert f.mapEntries("taz") == [1, 2, 3, 4, 5]
+
+        # Test deleteMapping alias
+        f.deleteMapping("taz")
+        assert "taz" not in f.listMappings()
+
+
+def test_keys_values_items(omx_file):
+    with omx.open_file(omx_file, "w") as f:
+        f["m1"] = ones5x5()
+        f["m2"] = ones5x5()
+
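+        # keys(), values(), and items() all forward to '/data', so the three views stay aligned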
+        assert all((k, v) == t for k, v, t in zip(f.keys(), f.values(), f.items()))
+
+
+def test_access_with_path(omx_file):
+    with omx.open_file(omx_file, "w") as f:
+        f["m1"] = ones5x5()
+        npt.assert_array_equal(f["/data/m1"], ones5x5())
+
+        del f["/data/m1"]
+
+        assert "m1" not in f
+
+
+def test_tables_like_filters(omx_file):
+    class MockFilters:
+        complib = "zlib"
+        complevel = 3
+        shuffle = False
+        fletcher32 = True
+
+    with omx.open_file(omx_file, "w", filters=MockFilters()) as f:
+        f["m1"] = ones5x5()
+        m1 = f["m1"]
+
+        assert m1.compression == "gzip"  # zlib was translated to gzip
+        assert m1.compression_opts == MockFilters.complevel
+        assert m1.shuffle == MockFilters.shuffle
+        assert m1.fletcher32 == MockFilters.fletcher32
+
+
+def test_bad_filters(omx_file):
+    with pytest.raises(TypeError, match="unknown filters object"):
+        with omx.open_file(omx_file, "w", filters="gzip") as f:
+            f["m1"] = ones5x5()
+
+
+@pytest.mark.parametrize("chunks", [True, False])
+def test_chunks(omx_file, chunks):
+    with omx.open_file(omx_file, "w") as f:
+        f.create_matrix("m1", obj=ones5x5(), chunks=chunks)
diff --git a/tests/test_validator.py b/tests/test_validator.py
new file mode 100644
index 0000000..5d2d3c0
--- /dev/null
+++ b/tests/test_validator.py
@@ -0,0 +1,482 @@
+import uuid
+
+import h5py
+import numpy as np
+import openmatrix as omx
+import pytest
+from openmatrix import validator
+
+
+@pytest.fixture
+def omx_file(tmp_path):
+    """Provide a unique temporary OMX file path for each test."""
+    return tmp_path / f"test{uuid.uuid4().hex}.omx"
+
+
+@pytest.fixture
+def valid_omx_file(omx_file):
+    """Create a valid OMX file that passes all required checks."""
+    with omx.open_file(omx_file, "w") as f:
+        f.create_matrix("m1", obj=np.ones((5, 5), dtype=np.float64))
+        f.create_mapping("zones", np.arange(1, 6))
+    return omx_file
+
+
+class TestPassOrFail:
+    def test_pass(self):
+        assert validator.pass_or_fail(True) == "Pass"
+
+    def test_fail(self):
+        assert validator.pass_or_fail(False) == "Fail"
+
+
+class TestOpenFile:
+    def test_open_file(self, valid_omx_file, capsys):
+        mat_file = validator.open_file(valid_omx_file)
+        captured = capsys.readouterr()
+        assert "File contents:" in captured.out
+        assert mat_file is not None
+        mat_file.close()
+
+
+class TestCheck1:
+    """Check 1: Has OMX_VERSION attribute set to 0.2"""
+
+    def test_valid_version(self, valid_omx_file, capsys):
+        with omx.open_file(valid_omx_file, "r") as f:
+            result = validator.check1(f)
+            assert result[0]  # ok (truthiness also works for numpy bools)
+            assert result[1]  # required
+            assert result[2] == 1  # checknum
+
+    def test_missing_version(self, tmp_path, capsys):
+        omx_file = tmp_path / f"test{uuid.uuid4().hex}.omx"
+        with h5py.File(omx_file, "w") as f:
+            f.create_group("data")
+        with omx.open_file(omx_file, "r") as f:
+            result = validator.check1(f)
+            assert result[0] is False
+
+    def test_wrong_version(self, tmp_path, capsys):
+        omx_file = tmp_path / f"test{uuid.uuid4().hex}.omx"
+        with h5py.File(omx_file, "w") as f:
+            f.attrs["OMX_VERSION"] = b"0.1"
+        with omx.open_file(omx_file, "r") as f:
+            result = validator.check1(f)
+            assert result[0] is False
+
+
+class TestCheck2:
+    """Check 2: Has SHAPE array attribute set to two item integer array"""
+
+    def test_valid_shape(self, valid_omx_file, capsys):
+        with omx.open_file(valid_omx_file, "r") as f:
+            result = validator.check2(f)
+            assert result[0]  # truthiness also works for numpy bools
+            assert result[2] == 2
+
+    def test_missing_shape(self, tmp_path, capsys):
+        omx_file = tmp_path / 
f"test{uuid.uuid4().hex}.omx" + with h5py.File(omx_file, "w") as f: + f.attrs["OMX_VERSION"] = b"0.2" + with omx.open_file(omx_file, "r") as f: + result = validator.check2(f) + assert result[0] is False + + +class TestCheck3: + """Check 3: Has data group for matrices""" + + def test_valid_data_group(self, valid_omx_file, capsys): + with omx.open_file(valid_omx_file, "r") as f: + result = validator.check3(f) + assert result[0] is True + assert result[2] == 3 + + def test_missing_data_group(self, tmp_path, capsys): + omx_file = tmp_path / f"test{uuid.uuid4().hex}.omx" + with h5py.File(omx_file, "w") as f: + f.attrs["OMX_VERSION"] = b"0.2" + with omx.open_file(omx_file, "r") as f: + result = validator.check3(f) + assert result[0] is False + + +class TestCheck4: + """Check 4: Matrix shape matches file shape""" + + def test_matching_shapes(self, valid_omx_file, capsys): + with omx.open_file(valid_omx_file, "r") as f: + result = validator.check4(f) + assert result[0] is True + assert result[2] == 4 + + def test_no_matrices(self, tmp_path, capsys): + omx_file = tmp_path / f"test{uuid.uuid4().hex}.omx" + with omx.open_file(omx_file, "w"): + pass # Empty file with data group but no matrices + with omx.open_file(omx_file, "r") as f: + result = validator.check4(f) + assert result[0] is True # Vacuously true - no matrices to check + + +class TestCheck5: + """Check 5: Uses common data types (float or int) for matrices""" + + def test_valid_float_dtype(self, valid_omx_file, capsys): + with omx.open_file(valid_omx_file, "r") as f: + result = validator.check5(f) + assert result[0] is True + assert result[2] == 5 + + def test_valid_int_dtype(self, omx_file, capsys): + with omx.open_file(omx_file, "w") as f: + f.create_matrix("m1", obj=np.ones((5, 5), dtype=np.int32)) + with omx.open_file(omx_file, "r") as f: + result = validator.check5(f) + assert result[0] is True + + +class TestCheck6: + """Check 6: Matrices chunked for faster I/O""" + + def test_chunked_matrix(self, valid_omx_file, capsys): + with omx.open_file(valid_omx_file, "r") as f: + result = validator.check6(f) + assert result[0] is True + assert result[2] == 6 + + def test_unchunked_matrix(self, tmp_path, capsys): + omx_file = tmp_path / f"test{uuid.uuid4().hex}.omx" + with h5py.File(omx_file, "w") as f: + f.attrs["OMX_VERSION"] = b"0.2" + f.attrs["SHAPE"] = np.array([5, 5], dtype=np.int32) + data = f.create_group("data") + data.create_dataset("m1", data=np.ones((5, 5)), chunks=None) + with omx.open_file(omx_file, "r") as f: + result = validator.check6(f) + assert result[0] is False + + +class TestCheck7: + """Check 7: Uses zlib/gzip compression if compression used""" + + def test_gzip_compression(self, valid_omx_file, capsys): + with omx.open_file(valid_omx_file, "r") as f: + result = validator.check7(f) + assert result[0] is True + assert result[2] == 7 + + def test_no_compression(self, tmp_path, capsys): + omx_file = tmp_path / f"test{uuid.uuid4().hex}.omx" + with h5py.File(omx_file, "w") as f: + f.attrs["OMX_VERSION"] = b"0.2" + f.attrs["SHAPE"] = np.array([5, 5], dtype=np.int32) + data = f.create_group("data") + data.create_dataset("m1", data=np.ones((5, 5))) + with omx.open_file(omx_file, "r") as f: + result = validator.check7(f) + # No compression is ok (just prints "None") + assert result[0] is True + + def test_non_gzip_compression(self, tmp_path, capsys): + omx_file = tmp_path / f"test{uuid.uuid4().hex}.omx" + with h5py.File(omx_file, "w") as f: + f.attrs["OMX_VERSION"] = b"0.2" + f.attrs["SHAPE"] = np.array([5, 5], dtype=np.int32) + 
data = f.create_group("data") + data.create_dataset("m1", data=np.ones((5, 5)), compression="lzf") + with omx.open_file(omx_file, "r") as f: + result = validator.check7(f) + assert result[0] is False + + +class TestCheck8: + """Check 8: Has NA attribute if desired (but not required)""" + + def test_no_na_attribute(self, valid_omx_file, capsys): + with omx.open_file(valid_omx_file, "r") as f: + result = validator.check8(f) + assert result[0] is False # NA not set + assert result[1] is False # Not required + assert result[2] == 8 + + def test_with_na_attribute(self, omx_file, capsys): + with omx.open_file(omx_file, "w") as f: + f.create_matrix("m1", obj=np.ones((5, 5)), attrs={"NA": -999}) + with omx.open_file(omx_file, "r") as f: + result = validator.check8(f) + assert result[0] is True + + +class TestCheck9: + """Check 9: Has lookup group for labels/indexes if desired""" + + def test_has_lookup_group(self, valid_omx_file, capsys): + with omx.open_file(valid_omx_file, "r") as f: + result = validator.check9(f) + assert result[0] is True + assert result[1] is False # Not required + assert result[2] == 9 + + def test_missing_lookup_group(self, tmp_path, capsys): + omx_file = tmp_path / f"test{uuid.uuid4().hex}.omx" + with h5py.File(omx_file, "w") as f: + f.attrs["OMX_VERSION"] = b"0.2" + f.create_group("data") + with omx.open_file(omx_file, "r") as f: + result = validator.check9(f) + assert result[0] is False + + +class TestCheck10: + """Check 10: Lookup shapes are 1-d and match file shape""" + + def test_valid_lookup_shape(self, valid_omx_file, capsys): + with omx.open_file(valid_omx_file, "r") as f: + result = validator.check10(f) + assert result[0] is True + assert result[2] == 10 + + def test_no_lookup_group(self, tmp_path, capsys): + omx_file = tmp_path / f"test{uuid.uuid4().hex}.omx" + with h5py.File(omx_file, "w") as f: + f.attrs["OMX_VERSION"] = b"0.2" + f.create_group("data") + with omx.open_file(omx_file, "r") as f: + result = validator.check10(f) + assert result[0] is False + + def test_invalid_lookup_shape(self, tmp_path, capsys): + omx_file = tmp_path / f"test{uuid.uuid4().hex}.omx" + with h5py.File(omx_file, "w") as f: + f.attrs["OMX_VERSION"] = b"0.2" + f.attrs["SHAPE"] = np.array([5, 5], dtype=np.int32) + f.create_group("data") + lookup = f.create_group("lookup") + lookup.create_dataset("zones", data=np.arange(10)) # Wrong size + with omx.open_file(omx_file, "r") as f: + result = validator.check10(f) + assert result[0] is False + + +class TestCheck11: + """Check 11: Uses common data types (int or str) for lookups""" + + def test_valid_int_lookup(self, valid_omx_file, capsys): + with omx.open_file(valid_omx_file, "r") as f: + result = validator.check11(f) + assert result[0] is True + assert result[2] == 11 + + def test_no_lookup_group(self, tmp_path, capsys): + omx_file = tmp_path / f"test{uuid.uuid4().hex}.omx" + with h5py.File(omx_file, "w") as f: + f.attrs["OMX_VERSION"] = b"0.2" + f.create_group("data") + with omx.open_file(omx_file, "r") as f: + result = validator.check11(f) + assert result[0] is False + + def test_string_lookup(self, omx_file, capsys): + with omx.open_file(omx_file, "w") as f: + f.create_matrix("m1", obj=np.ones((3, 3))) + # Use bytes for h5py compatibility (Unicode strings not directly supported) + f.create_mapping("zones", np.array([b"A", b"B", b"C"])) + with omx.open_file(omx_file, "r") as f: + result = validator.check11(f) + assert result[0] + + +class TestCheck12: + """Check 12: Has Lookup DIM attribute (not supported)""" + + @pytest.mark.skip("Not 
supported by omx-python at this time") # pragma: no cover + def test_dim_not_supported(self, valid_omx_file, capsys): + with omx.open_file(valid_omx_file, "r") as f: + result = validator.check12(f) + assert result[0] is False # Not supported + assert result[1] is False # Not required + assert result[2] == 12 + captured = capsys.readouterr() + assert "Not supported" in captured.out + + +class TestRunChecks: + def test_run_checks_valid_file(self, valid_omx_file, capsys): + validator.run_checks(valid_omx_file) + captured = capsys.readouterr() + assert "Overall" in captured.out + + def test_run_checks_file_not_found(self, tmp_path): + nonexistent = tmp_path / "nonexistent.omx" + with pytest.raises(FileNotFoundError): + validator.run_checks(nonexistent) + + def test_run_checks_invalid_hdf5(self, tmp_path, capsys): + invalid_file = tmp_path / "invalid.omx" + invalid_file.write_text("not an hdf5 file") + validator.run_checks(invalid_file) + captured = capsys.readouterr() + assert "Unable to open" in captured.out + + def test_run_checks_with_error_result(self, tmp_path, capsys, monkeypatch): + """Test run_checks prints ERROR when a check returns 4-tuple.""" + omx_file = tmp_path / f"test{uuid.uuid4().hex}.omx" + with omx.open_file(omx_file, "w") as f: + f.create_matrix("m1", obj=np.ones((5, 5))) + + # Monkeypatch check1 to return an error tuple + original_check1 = validator.check1 # noqa: F841 + + def mock_check1(mat_file, required=True, checknum=1): + return (False, True, 1, "Simulated check error") + + monkeypatch.setattr(validator, "check1", mock_check1) + validator.run_checks(omx_file) + captured = capsys.readouterr() + assert "ERROR" in captured.out + assert "Simulated check error" in captured.out + + +class TestCommandLine: + def test_command_line(self, valid_omx_file, monkeypatch, capsys): + monkeypatch.setattr("sys.argv", ["validator", str(valid_omx_file)]) + validator.command_line() + captured = capsys.readouterr() + assert "Overall" in captured.out + + +class TestExceptionHandling: + """Test exception handling in validator checks.""" + + def test_check1_exception(self, capsys): + """Test check1 handles exceptions gracefully.""" + + class BadFile: + @property + def attrs(self): + raise RuntimeError("Simulated error") + + result = validator.check1(BadFile()) + assert len(result) == 4 # (ok, required, checknum, error_msg) + assert result[0] is False + assert "Simulated error" in result[3] + + def test_check2_exception(self, capsys): + """Test check2 handles exceptions gracefully.""" + + class BadFile: + @property + def attrs(self): + raise RuntimeError("Simulated error") + + result = validator.check2(BadFile()) + assert len(result) == 4 + assert result[0] is False + + def test_check3_exception(self, capsys): + """Test check3 handles exceptions gracefully.""" + + class BadFile: + pass + + result = validator.check3(BadFile()) + assert len(result) == 4 + assert result[0] is False + + def test_check4_exception(self, capsys): + """Test check4 handles exceptions gracefully.""" + + class BadFile: + @property + def attrs(self): + raise RuntimeError("Simulated error") + + result = validator.check4(BadFile()) + assert len(result) == 4 + assert result[0] is False + + def test_check5_exception(self, capsys): + """Test check5 handles exceptions gracefully.""" + + class BadFile: + def list_matrices(self): + raise RuntimeError("Simulated error") + + result = validator.check5(BadFile()) + assert len(result) == 4 + assert result[0] is False + + def test_check6_exception(self, capsys): + """Test check6 
handles exceptions gracefully.""" + + class BadFile: + def list_matrices(self): + raise RuntimeError("Simulated error") + + result = validator.check6(BadFile()) + assert len(result) == 4 + assert result[0] is False + + def test_check7_exception(self, capsys): + """Test check7 handles exceptions gracefully.""" + + class BadFile: + def list_matrices(self): + raise RuntimeError("Simulated error") + + result = validator.check7(BadFile()) + assert len(result) == 4 + assert result[0] is False + + def test_check8_exception(self, capsys): + """Test check8 handles exceptions gracefully.""" + + class BadFile: + def list_matrices(self): + raise RuntimeError("Simulated error") + + result = validator.check8(BadFile()) + assert len(result) == 4 + assert result[0] is False + + def test_check9_exception(self, capsys): + """Test check9 handles exceptions gracefully.""" + + class BadFile: + pass + + result = validator.check9(BadFile()) + assert len(result) == 4 + assert result[0] is False + + def test_check10_exception(self, capsys): + """Test check10 handles exceptions gracefully.""" + + class BadFile: + pass + + result = validator.check10(BadFile()) + assert len(result) == 4 + assert result[0] is False + + def test_check11_exception(self, capsys): + """Test check11 handles exceptions gracefully.""" + + class BadFile: + pass + + result = validator.check11(BadFile()) + assert len(result) == 4 + assert result[0] is False + + def test_check12_exception(self, capsys): + """Test check12 handles exceptions gracefully.""" + + class BadFile: + pass + + result = validator.check12(BadFile()) + assert len(result) == 4 + assert result[0] is False