Source code for nbodykit.source.catalog.file

from nbodykit.base.catalog import CatalogSource
from nbodykit.io.stack import FileStack
from nbodykit import CurrentMPIComm
from nbodykit import io
from nbodykit.extern import docrep

from six import string_types
import textwrap
import os

__all__ = ['FileCatalogFactory', 'FileCatalogBase',
           'CSVCatalog', 'BinaryCatalog', 'BigFileCatalog',
           'HDFCatalog', 'TPMBinaryCatalog', 'Gadget1Catalog', 'FITSCatalog']

class FileCatalogBase(CatalogSource):
    """
    Base class to create a source of particles from a single file,
    or multiple files, on disk.

    Files of a specific type should be subclasses of this class.

    Parameters
    ----------
    filetype : subclass of :class:`~nbodykit.io.base.FileType`
        the file-like class used to load the data from file; should be a
        subclass of :class:`nbodykit.io.base.FileType`
    args : tuple, optional
        the arguments to pass to the ``filetype`` class when constructing
        each file object
    kwargs : dict, optional
        the keyword arguments to pass to the ``filetype`` class when
        constructing each file object; default (``None``) passes no
        keyword arguments
    comm : MPI Communicator, optional
        the MPI communicator instance; default (``None``) sets to the
        current communicator
    use_cache : bool, optional
        whether to cache data read from disk; default is ``False``
    """
    @CurrentMPIComm.enable
    def __init__(self, filetype, args=(), kwargs=None, comm=None, use_cache=False):

        # ``None`` is the sentinel for "no keyword arguments"; this avoids
        # sharing one mutable default dict between calls
        if kwargs is None:
            kwargs = {}

        self.comm = comm
        self.filetype = filetype

        # only rank 0 constructs the FileStack; all ranks then take
        # whatever the broadcast returns
        if self.comm.rank == 0:
            self._source = FileStack(filetype, *args, **kwargs)
        else:
            self._source = None
        self._source = self.comm.bcast(self._source)

        # the local size is the length of this rank's share of the source
        start, end = self._local_range()
        self._size = end - start

        # update the meta-data
        self.attrs.update(self._source.attrs)

        if self.comm.rank == 0:
            self.logger.info("Extra arguments to FileType: %s" % str(args))

        # NOTE(review): the base initializer is deliberately left last, after
        # ``_size`` and ``attrs`` are populated -- the original ordering is
        # preserved; confirm against CatalogSource before moving it
        CatalogSource.__init__(self, comm=comm, use_cache=use_cache)

    def _local_range(self):
        """
        Return the half-open ``(start, end)`` index range of the source
        that belongs to this rank.

        The source is split into ``comm.size`` contiguous pieces using
        integer division, so piece lengths differ by at most one. Both
        the catalog size and the column slices are derived from this
        single computation so that they always agree.
        """
        total = self._source.size
        start = self.comm.rank * total // self.comm.size
        end = (self.comm.rank + 1) * total // self.comm.size
        return start, end

    def __repr__(self):
        path = self._source.path
        name = self.__class__.__name__
        if isinstance(path, string_types):
            # a single path: show only the file name, not the directory
            args = (name, self.size, os.path.basename(path))
            return "%s(size=%d, file='%s')" % args
        else:
            # otherwise report how many files the source holds
            args = (name, self.size, self._source.nfiles)
            return "%s(size=%d, nfiles=%d)" % args

    @property
    def hardcolumns(self):
        """
        The union of the columns in the file and any transformed columns.

        The file's column names come first, followed by the hard columns
        reported by :class:`CatalogSource`.
        """
        defaults = CatalogSource.hardcolumns.fget(self)
        return list(self._source.dtype.names) + defaults

    def get_hardcolumn(self, col):
        """
        Return a column from the underlying file source.

        Columns are returned as dask arrays, sliced to this rank's share
        of the source. Names that are not columns of the file are
        delegated to :class:`CatalogSource`.
        """
        if col in self._source.dtype.names:
            start, end = self._local_range()
            return self._source.get_dask(col)[start:end]
        else:
            return CatalogSource.get_hardcolumn(self, col)
def _make_docstring(filetype, examples):
    """
    Build the class doc string for a built-in CatalogSource that reads
    data from disk through a :mod:`nbodykit.io` class.

    Parameters
    ----------
    filetype : class
        the IO class; its module and name are used for the
        cross-reference, and its own doc string supplies the
        ``Parameters`` section that fills the ``%(test.parameters)s``
        placeholder
    examples : str or None
        a documentation cross-reference target; when not ``None``, an
        ``Examples`` section pointing at it is appended

    Returns
    -------
    str :
        the assembled doc string, as returned by the docrep processor
    """
    qualname = '{}.{}'.format(filetype.__module__, filetype.__name__)

    template = """
A CatalogSource that uses :class:`~{qualname}` to read data from disk.

Multiple files can be read at once by supplying a list of file
names or a glob asterisk pattern as the ``path`` argument. See
:ref:`reading-multiple-files` for examples.

Parameters
----------
%(test.parameters)s
comm : MPI Communicator, optional
    the MPI communicator instance; default (``None``) sets to the
    current communicator
use_cache : bool, optional
    whether to cache data read from disk; default is ``False``
attrs : dict, optional
    dictionary of meta-data to store in :attr:`attrs`
""".format(qualname=qualname)

    if examples is not None:
        examples_section = """
Examples
--------
Please see :ref:`the documentation <%s>` for examples.
"""
        template += examples_section % examples

    # get the Parameters from the IO library class, registered under
    # the base name 'test' to match the placeholder in the template
    processor = docrep.DocstringProcessor()
    io_doc = processor.dedents(filetype.__doc__)
    processor.get_sections(io_doc, 'test', ['Parameters'])

    return processor.dedents(template)
def FileCatalogFactory(name, filetype, examples=None):
    """
    Factory method to create a :class:`~nbodykit.base.catalog.CatalogSource`
    that uses a subclass of :mod:`nbodykit.io.base.FileType` to read
    data from disk.

    Parameters
    ----------
    name : str
        the name of the catalog class to create
    filetype : subclass of :class:`nbodykit.io.base.FileType`
        the subclass of the FileType that reads a specific type of data
    examples : str, optional
        if given, a documentation cross-reference link where examples can be
        found

    Returns
    -------
    subclass of :class:`FileCatalogBase` :
        the ``CatalogSource`` object that reads data using ``filetype``
    """
    def __init__(self, *args, **kwargs):
        # peel off the catalog-level options; whatever remains in
        # ``kwargs`` is handed to ``filetype`` untouched
        comm = kwargs.pop('comm', None)
        use_cache = kwargs.pop('use_cache', False)
        attrs = kwargs.pop('attrs', None)

        # forward ``comm`` and ``use_cache`` explicitly -- previously they
        # were popped and then dropped, so user-supplied values were
        # silently ignored
        FileCatalogBase.__init__(self, filetype=filetype, args=args,
                                 kwargs=kwargs, comm=comm,
                                 use_cache=use_cache)

        # user meta-data is applied last, so it takes precedence over
        # anything set while the base class was initialized
        if attrs is not None:
            self.attrs.update(attrs)

    # make the doc string for this class
    __doc__ = _make_docstring(filetype, examples)

    # make the new class object and return it
    namespace = {"__init__": __init__, "__doc__": __doc__}
    return type(name, (FileCatalogBase,), namespace)
# The built-in catalog classes: one per IO class, each created through
# FileCatalogFactory. ``examples`` names the documentation cross-reference
# target; classes without one rely on the factory's default of ``None``.
CSVCatalog = FileCatalogFactory(
    name="CSVCatalog", filetype=io.CSVFile, examples='csv-data')
BinaryCatalog = FileCatalogFactory(
    name="BinaryCatalog", filetype=io.BinaryFile, examples='binary-data')
BigFileCatalog = FileCatalogFactory(
    name="BigFileCatalog", filetype=io.BigFile, examples='bigfile-data')
HDFCatalog = FileCatalogFactory(
    name="HDFCatalog", filetype=io.HDFFile, examples='hdf-data')
TPMBinaryCatalog = FileCatalogFactory(
    name="TPMBinaryCatalog", filetype=io.TPMBinaryFile)
FITSCatalog = FileCatalogFactory(
    name="FITSCatalog", filetype=io.FITSFile, examples='fits-data')
Gadget1Catalog = FileCatalogFactory(
    name="Gadget1Catalog", filetype=io.Gadget1File)