# Source code for nbodykit.source.catalog.file

from nbodykit.base.catalog import CatalogSource
from nbodykit.io.stack import FileStack
from nbodykit import CurrentMPIComm
from nbodykit import io
from nbodykit.extern import docrep

from six import string_types
import textwrap
import os

# names exported by a star-import of this module
__all__ = [
    'FileCatalogFactory',
    'FileCatalogBase',
    'CSVCatalog',
    'BinaryCatalog',
    'BigFileCatalog',
    'HDFCatalog',
    'TPMBinaryCatalog',
    'Gadget1Catalog',
    'FITSCatalog',
]

class FileCatalogBase(CatalogSource):
    """
    Base class to create a source of particles from a
    single file, or multiple files, on disk.

    Files of a specific type should be subclasses of this class.

    Parameters
    ----------
    filetype : subclass of :class:`~nbodykit.io.base.FileType`
        the file-like class used to load the data from file; should be a
        subclass of :class:`nbodykit.io.base.FileType`
    path : string or list
        the file path(s) to load; if a string, it is expanded as a
        glob pattern
    args : tuple, optional
        the arguments to pass to the ``filetype`` class when constructing
        each file object
    kwargs : dict, optional
        the keyword arguments to pass to the ``filetype`` class when
        constructing each file object; default (``None``) passes no
        keyword arguments
    comm : MPI Communicator, optional
        the MPI communicator instance; default (``None``) sets to the
        current communicator
    """
    @CurrentMPIComm.enable
    def __init__(self, filetype, path, args=(), kwargs=None, comm=None):

        # use a fresh dict per call rather than a shared mutable default
        if kwargs is None:
            kwargs = {}

        self.comm = comm
        self.filetype = filetype

        # only the root rank constructs the FileStack; every other rank
        # receives it through the broadcast
        if self.comm.rank == 0:
            self._source = FileStack(filetype, path, *args, **kwargs)
        else:
            self._source = None
        self._source = self.comm.bcast(self._source)

        # split the full file into one contiguous slice per rank
        lstart = self.comm.rank * self._source.size // self.comm.size
        lend = (self.comm.rank + 1) * self._source.size // self.comm.size
        self._size = lend - lstart

        # the catalog initially spans the whole file
        self.start = 0
        self.end = self._source.size

        self._lstart = lstart # offset in the file for this rank
        self._lend = lend     # offset in the file for this rank

        # update the meta-data
        self.attrs.update(self._source.attrs)

        if self.comm.rank == 0:
            self.logger.info("Extra arguments to FileType: %s %s" % (str(args), str(kwargs)))

        # NOTE(review): the base initializer runs last, after the size and
        # offsets are set -- presumably it relies on them; confirm
        CatalogSource.__init__(self, comm=comm)

    def query_range(self, start, end):
        """
        Seek to a range in the file catalog.

        Parameters
        ----------
        start : int
            start of the file relative to the physical file
        end : int
            end of the file relative to the physical file

        Returns
        -------
        A new catalog that only accesses the given region of the file.

        Raises
        ------
        ValueError
            if the original catalog (self) contains any assigned columns
            not directly obtained from the file, since the operation in
            that case is not well defined
        """
        if len(CatalogSource.hardcolumns.fget(self)) > 0:
            raise ValueError("cannot seek if columns have been attached to the FileCatalog")

        other = self.copy()

        # split the requested range into one contiguous slice per rank
        # NOTE(review): the per-rank offsets are shifted by ``self.start``
        # whereas ``other.start`` stores the un-shifted ``start``; confirm
        # the intended semantics when query_range calls are chained
        other._lstart = self.start + start + self.comm.rank * (end - start) // self.comm.size
        other._lend = self.start + start + (self.comm.rank + 1) * (end - start) // self.comm.size
        other._size = other._lend - other._lstart
        other.start = start
        other.end = end

        # re-run the base initializer on the copy with its new size
        CatalogSource.__init__(other, comm=self.comm)
        return other

    def __repr__(self):
        name = self.__class__.__name__
        args = (name, self.size, repr(self._source))

        return "%s(size=%d, %s)" % args

    @property
    def hardcolumns(self):
        """
        The union of the columns in the file and any transformed columns.
        """
        defaults = CatalogSource.hardcolumns.fget(self)
        return list(self._source.dtype.names) + defaults

    def get_hardcolumn(self, col):
        """
        Return a column from the underlying file source.

        Columns are returned as dask arrays.
        """
        if col in self._source.dtype.names:
            # restrict the column to the slice of the file owned by this rank
            return self._source.get_dask(col)[self._lstart:self._lend]
        else:
            # not a file column: defer to the base class
            return CatalogSource.get_hardcolumn(self, col)


def _make_docstring(filetype, examples):
    """
    Internal helper that builds the doc string for a built-in
    CatalogSource class that reads data from disk through one of
    the :mod:`nbodykit.io` classes.

    Parameters
    ----------
    filetype : class
        the IO class; its module, name and doc string are used
    examples : str or None
        if not ``None``, a cross-reference label inserted into an
        "Examples" section appended to the doc string

    Returns
    -------
    str
        the doc string text, as returned by the docrep processor
    """
    header = """
A CatalogSource that uses :class:`~{qualname}` to read data from disk.

Multiple files can be read at once by supplying a list of file
names or a glob asterisk pattern as the ``path`` argument. See
:ref:`reading-multiple-files` for examples.

Parameters
----------
%(test.parameters)s
comm : MPI Communicator, optional
    the MPI communicator instance; default (``None``) sets to the
    current communicator
attrs : dict, optional
    dictionary of meta-data to store in :attr:`attrs`
""".format(qualname='%s.%s' %(filetype.__module__, filetype.__name__))

    parts = [header]
    if examples is not None:
        parts.append("""
Examples
--------
Please see :ref:`the documentation <%s>` for examples.
""" %examples)
    text = ''.join(parts)

    # register the 'Parameters' section of the IO library class under the
    # ``test`` key; the ``%(test.parameters)s`` placeholder above is left
    # for the processor to fill in
    processor = docrep.DocstringProcessor()
    io_doc = processor.dedents(filetype.__doc__)
    processor.get_sections(io_doc, 'test', ['Parameters'])
    return processor.dedents(text)

def FileCatalogFactory(name, filetype, examples=None):
    """
    Factory method to create a :class:`~nbodykit.base.catalog.CatalogSource`
    that uses a subclass of :mod:`nbodykit.io.base.FileType` to read
    data from disk.

    Parameters
    ----------
    name : str
        the name of the catalog class to create
    filetype : subclass of :class:`nbodykit.io.base.FileType`
        the subclass of the FileType that reads a specific type of data
    examples : str, optional
        if given, a documentation cross-reference link where examples can be
        found

    Returns
    -------
    subclass of :class:`FileCatalogBase` :
        the ``CatalogSource`` object that reads data using ``filetype``
    """
    def __init__(self, path, *args, **kwargs):
        # ``comm`` and ``attrs`` are consumed here; all remaining positional
        # and keyword arguments are forwarded to ``filetype``
        comm = kwargs.pop('comm', None)
        attrs = kwargs.pop('attrs', None)
        FileCatalogBase.__init__(self, filetype=filetype, path=path, args=args, kwargs=kwargs, comm=comm)

        # an explicit ``attrs=None`` is treated the same as omitting it
        if attrs is not None:
            self.attrs.update(attrs)

    # make the doc string for this class
    __doc__ = _make_docstring(filetype, examples)

    # make the new class object and return it
    newclass = type(name, (FileCatalogBase,), {"__init__": __init__, "__doc__": __doc__})
    return newclass


class FileCatalog(FileCatalogBase):
    """
    A source of particles read from a single file, or multiple files,
    on disk, using a ``filetype`` class supplied at construction time.

    Parameters
    ----------
    filetype : subclass of :class:`~nbodykit.io.base.FileType`
        the file-like class used to load the data from file; should be a
        subclass of :class:`nbodykit.io.base.FileType`
    path : string or list
        the file path(s) to load; if a string, it is expanded as a
        glob pattern
    *args :
        the arguments to pass to the ``filetype`` class when constructing
        each file object
    comm : MPI Communicator, optional
        the MPI communicator instance; default (``None``) sets to the
        current communicator
    attrs : dict, optional
        attributes to set on the catalog; ``None`` is treated as empty
    **kwargs :
        any other keyword arguments are passed to the ``filetype`` class
        when constructing each file object
    """
    def __init__(self, filetype, path, *args, **kwargs):
        # ``comm`` and ``attrs`` are consumed here; all remaining positional
        # and keyword arguments are forwarded to ``filetype``
        comm = kwargs.pop('comm', None)
        attrs = kwargs.pop('attrs', None)
        FileCatalogBase.__init__(self, filetype=filetype, path=path, args=args, kwargs=kwargs, comm=comm)

        # an explicit ``attrs=None`` is treated the same as omitting it
        if attrs is not None:
            self.attrs.update(attrs)

# the built-in catalog classes, one per file type in ``io``
CSVCatalog = FileCatalogFactory(name="CSVCatalog", filetype=io.CSVFile, examples='csv-data')
BinaryCatalog = FileCatalogFactory(name="BinaryCatalog", filetype=io.BinaryFile, examples='binary-data')
BigFileCatalog = FileCatalogFactory(name="BigFileCatalog", filetype=io.BigFile, examples='bigfile-data')
HDFCatalog = FileCatalogFactory(name="HDFCatalog", filetype=io.HDFFile, examples='hdf-data')
# no examples cross-reference is supplied for the next class
TPMBinaryCatalog = FileCatalogFactory(name="TPMBinaryCatalog", filetype=io.TPMBinaryFile)
FITSCatalog = FileCatalogFactory(name="FITSCatalog", filetype=io.FITSFile, examples='fits-data')
# likewise no examples cross-reference here
Gadget1Catalog = FileCatalogFactory(name="Gadget1Catalog", filetype=io.Gadget1File)