# Source code for nbodykit.source.catalog.array

from nbodykit.base.catalog import CatalogSource
from nbodykit import CurrentMPIComm
import numpy

class ArrayCatalog(CatalogSource):
    """
    A CatalogSource initialized from a dictionary or structured ndarray.

    Parameters
    ----------
    data : :obj:`dict` or :class:`numpy.ndarray`
        a dictionary or structured ndarray; items are interpreted
        as the columns of the catalog; the length of any item is used
        as the size of the catalog.
    comm : MPI Communicator, optional
        the MPI communicator instance; default (``None``) sets to the
        current communicator
    use_cache : bool, optional
        whether to cache data read from disk; default is ``False``
    **kwargs :
        additional keywords to store as meta-data in :attr:`attrs`

    Raises
    ------
    ValueError
        if the column data types differ between ranks, or if the columns
        passed on this rank do not all have the same length
    """
    @CurrentMPIComm.enable
    def __init__(self, data, comm=None, use_cache=False, **kwargs):

        self.comm    = comm
        self._source = data

        # objects exposing ``dtype`` are treated as structured arrays and
        # their field names are the columns; anything else is treated as a
        # mapping from column name to array
        if hasattr(data, 'dtype'):
            keys = sorted(data.dtype.names)
        else:
            keys = sorted(data.keys())

        # one field per column (in sorted name order); the trailing
        # dimensions of each column become the sub-array shape of the field
        dtype = numpy.dtype([(key, (data[key].dtype, data[key].shape[1:])) for key in keys])

        # verify data types are the same; the dtypes are shared with every
        # rank (rather than gathered to the root only) so that all ranks
        # raise together instead of only the root aborting
        dtypes = self.comm.allgather(dtype)
        if any(dt != dtypes[0] for dt in dtypes):
            raise ValueError("mismatch between dtypes across ranks in Array")

        # the first column (in sorted order) defines the size on this rank
        self._size = len(self._source[keys[0]])

        # every other column must have that same length
        for key in keys:
            if len(self._source[key]) != self._size:
                raise ValueError("column `%s` and column `%s` has different size" % (keys[0], key))

        self._dtype = dtype
        # update the meta-data
        self.attrs.update(kwargs)

        CatalogSource.__init__(self, comm=comm, use_cache=use_cache)

    @property
    def hardcolumns(self):
        """
        The names of the columns in the underlying data array/dict,
        followed by the hard columns reported by :class:`CatalogSource`.
        """
        # invoke the base-class property getter explicitly on this instance
        defaults = CatalogSource.hardcolumns.fget(self)
        return list(self._dtype.names) + defaults

    def get_hardcolumn(self, col):
        """
        Return a column from the underlying data array/dict.

        Columns are returned as dask arrays.

        Parameters
        ----------
        col : str
            the name of the column to return; names that are not columns
            of the underlying data are delegated to :class:`CatalogSource`
        """
        if col in self._dtype.names:
            return self.make_column(self._source[col])
        else:
            return CatalogSource.get_hardcolumn(self, col)