Source code for nbodykit.source.catalog.file

from nbodykit.base.catalog import CatalogSource
from nbodykit.io.stack import FileStack
from nbodykit import CurrentMPIComm
from nbodykit import io
from nbodykit.extern import docrep

from six import string_types
import textwrap
import os

# Names exported by ``from nbodykit.source.catalog.file import *``: the
# class factory, the shared base class, and the catalog classes that are
# created with the factory at the bottom of this module.
# NOTE(review): ``FileCatalog`` is defined in this module but is not listed
# here -- confirm whether leaving it out of the public API is intentional.
__all__ = ['FileCatalogFactory', 'FileCatalogBase',
           'CSVCatalog', 'BinaryCatalog', 'BigFileCatalog',
           'HDFCatalog', 'TPMBinaryCatalog', 'Gadget1Catalog', 'FITSCatalog']

class FileCatalogBase(CatalogSource):
    """
    Base class to create a source of particles from a single file,
    or multiple files, on disk.

    Files of a specific type should be subclasses of this class.

    Parameters
    ----------
    filetype : subclass of :class:`~nbodykit.io.base.FileType`
        the file-like class used to load the data from file; should be a
        subclass of :class:`nbodykit.io.base.FileType`
    path : string or list
        the file(s) to load. If string, it is expanded as a glob pattern.
    args : tuple, optional
        the arguments to pass to the ``filetype`` class when constructing
        each file object
    kwargs : dict, optional
        the keyword arguments to pass to the ``filetype`` class when
        constructing each file object; the default (``None``) passes
        no keyword arguments
    comm : MPI Communicator, optional
        the MPI communicator instance; default (``None``) sets to the
        current communicator
    """
    @CurrentMPIComm.enable
    def __init__(self, filetype, path, args=(), kwargs=None, comm=None):

        # use a fresh dict per call rather than a shared mutable default
        if kwargs is None:
            kwargs = {}

        self.comm = comm
        self.filetype = filetype

        # build the FileStack on the root rank only, then bcast it
        if self.comm.rank == 0:
            self._source = FileStack(filetype, path, *args, **kwargs)
        else:
            self._source = None
        self._source = self.comm.bcast(self._source)

        # compute the size; start with full file.
        # integer arithmetic splits the rows into contiguous per-rank slices
        lstart = self.comm.rank * self._source.size // self.comm.size
        lend = (self.comm.rank + 1) * self._source.size // self.comm.size
        self._size = lend - lstart

        # the range of the file covered by the catalog as a whole
        self.start = 0
        self.end = self._source.size

        self._lstart = lstart  # offset in the file for this rank
        self._lend = lend      # offset in the file for this rank

        # update the meta-data
        self.attrs.update(self._source.attrs)

        if self.comm.rank == 0:
            self.logger.info("Extra arguments to FileType: %s %s" % (str(args), str(kwargs)))

        CatalogSource.__init__(self, comm=comm)

    def query_range(self, start, end):
        """
        Seek to a range in the file catalog.

        Parameters
        ----------
        start : int
            start of the file relative to the physical file
        end : int
            end of the file relative to the physical file

        Returns
        -------
        A new catalog that only accesses the given region of the file.

        Raises
        ------
        ValueError
            if the original catalog (self) contains any assigned columns
            not directly obtained from the file, since the operation in
            that case is not well defined.
        """
        # only the base-class hard columns are checked here, i.e. those
        # that do not come from the file itself
        if len(CatalogSource.hardcolumns.fget(self)) > 0:
            raise ValueError("cannot seek if columns have been attached to the FileCatalog")

        span = end - start
        rank = self.comm.rank
        nranks = self.comm.size

        other = self.copy()

        # NOTE(review): the offsets below are shifted by ``self.start`` while
        # ``other.start`` stores the unshifted ``start``; the two agree only
        # when ``self.start == 0`` -- confirm the intended behavior for
        # chained calls.
        other._lstart = self.start + start + rank * span // nranks
        other._lend = self.start + start + (rank + 1) * span // nranks
        other._size = other._lend - other._lstart
        other.start = start
        other.end = end

        # re-run the base-class initialization on the copy
        CatalogSource.__init__(other, comm=self.comm)
        return other

    def __repr__(self):
        name = self.__class__.__name__
        args = (name, self.size, repr(self._source))
        return "%s(size=%d, %s)" % args

    @property
    def hardcolumns(self):
        """
        The union of the columns in the file and any transformed columns.
        """
        defaults = CatalogSource.hardcolumns.fget(self)
        return list(self._source.dtype.names) + defaults

    def get_hardcolumn(self, col):
        """
        Return a column from the underlying file source.

        Columns are returned as dask arrays.
        """
        if col in self._source.dtype.names:
            # restrict the column to the slice owned by this rank
            return self._source.get_dask(col)[self._lstart:self._lend]
        else:
            return CatalogSource.get_hardcolumn(self, col)
def _make_docstring(filetype, examples):
    """
    Build the class docstring for a catalog that reads data from disk
    through one of the :mod:`nbodykit.io` classes.

    The ``Parameters`` section of ``filetype.__doc__`` is spliced into a
    fixed template; if ``examples`` is not ``None``, an ``Examples``
    section linking to that cross-reference is appended.
    """
    qualname = '%s.%s' % (filetype.__module__, filetype.__name__)

    parts = ["""
A CatalogSource that uses :class:`~{qualname}` to read data from disk.

Multiple files can be read at once by supplying a list of file
names or a glob asterisk pattern as the ``path`` argument. See
:ref:`reading-multiple-files` for examples.

Parameters
----------
%(test.parameters)s
comm : MPI Communicator, optional
    the MPI communicator instance; default (``None``) sets to the
    current communicator
attrs : dict, optional
    dictionary of meta-data to store in :attr:`attrs`
""".format(qualname=qualname)]

    if examples is not None:
        parts.append("""
Examples
--------
Please see :ref:`the documentation <%s>` for examples.
""" % examples)

    template = "".join(parts)

    # register the Parameters section of the IO class under the 'test' key,
    # which is what ``%(test.parameters)s`` in the template refers to
    processor = docrep.DocstringProcessor()
    io_doc = processor.dedents(filetype.__doc__)
    processor.get_sections(io_doc, 'test', ['Parameters'])
    return processor.dedents(template)
def FileCatalogFactory(name, filetype, examples=None):
    """
    Factory method to create a :class:`~nbodykit.base.catalog.CatalogSource`
    that uses a subclass of :mod:`nbodykit.io.base.FileType` to read
    data from disk.

    Parameters
    ----------
    name : str
        the name of the catalog class to create
    filetype : subclass of :class:`nbodykit.io.base.FileType`
        the subclass of the FileType that reads a specific type of data
    examples : str, optional
        if given, a documentation cross-reference link where examples can be
        found

    Returns
    -------
    subclass of :class:`FileCatalogBase` :
        the ``CatalogSource`` object that reads data using ``filetype``
    """
    def __init__(self, path, *args, **kwargs):
        # ``attrs`` and ``comm`` are consumed here; every other positional
        # and keyword argument is forwarded to ``filetype``
        attrs = kwargs.pop('attrs', {})
        comm = kwargs.pop('comm', None)
        FileCatalogBase.__init__(self, filetype=filetype, path=path,
                                 args=args, kwargs=kwargs, comm=comm)
        self.attrs.update(attrs)

    # attributes of the generated class: its constructor and the doc string
    # generated for this file type
    namespace = {
        "__init__": __init__,
        "__doc__": _make_docstring(filetype, examples),
    }

    # make the new class object and return it
    return type(name, (FileCatalogBase,), namespace)
class FileCatalog(FileCatalogBase):
    """
    A source of particles read from a single file, or multiple files,
    on disk, using a ``filetype`` class supplied by the caller.

    Parameters
    ----------
    filetype : subclass of :class:`~nbodykit.io.base.FileType`
        the file-like class used to load the data from file; should be a
        subclass of :class:`nbodykit.io.base.FileType`
    path : string or list
        the file(s) to load. If string, it is expanded as a glob pattern.
    *args :
        the arguments to pass to the ``filetype`` class when constructing
        each file object
    attrs : dict, optional
        attributes to set to the Catalog; keyword only
    comm : MPI Communicator, optional
        the MPI communicator instance; default (``None``) sets to the
        current communicator; keyword only
    **kwargs :
        the remaining keyword arguments are passed to the ``filetype``
        class when constructing each file object
    """
    def __init__(self, filetype, path, *args, **kwargs):
        # strip the catalog-level options before forwarding the rest
        attrs = kwargs.pop('attrs', {})
        comm = kwargs.pop('comm', None)

        FileCatalogBase.__init__(
            self,
            filetype=filetype,
            path=path,
            args=args,
            kwargs=kwargs,
            comm=comm,
        )
        self.attrs.update(attrs)


# built-in catalogs, one per file type in nbodykit.io
CSVCatalog = FileCatalogFactory(
    "CSVCatalog", io.CSVFile, examples='csv-data')
BinaryCatalog = FileCatalogFactory(
    "BinaryCatalog", io.BinaryFile, examples='binary-data')
BigFileCatalog = FileCatalogFactory(
    "BigFileCatalog", io.BigFile, examples='bigfile-data')
HDFCatalog = FileCatalogFactory(
    "HDFCatalog", io.HDFFile, examples='hdf-data')
TPMBinaryCatalog = FileCatalogFactory(
    "TPMBinaryCatalog", io.TPMBinaryFile)
FITSCatalog = FileCatalogFactory(
    "FITSCatalog", io.FITSFile, examples='fits-data')
Gadget1Catalog = FileCatalogFactory(
    "Gadget1Catalog", io.Gadget1File, examples=None)