Source code for nbodykit.io.bigfile
from __future__ import absolute_import
# the future import is important: without it, in python 2.7 ``import bigfile``
# would try to import this module itself, due to the unfortunate name conflict!
import numpy
from .base import FileType
from six import string_types
import json
from nbodykit.utils import JSONDecoder
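# sentinel class used as the default value of the ``header`` argument of
# :class:`BigFile` below, meaning "auto-detect the header block"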
class Automatic: pass
class BigFile(FileType):
"""
A file object to handle the reading of columns of data from
a :mod:`bigfile` file.
:mod:`bigfile` is a reproducible, massively parallel IO library for
large, hierarchical datasets, and it is the default format of the
`FastPM <https://github.com/rainwoodman/fastpm>`_ and the
`MP-Gadget <https://github.com/bluetides-project/MP-Gadget>`_
simulations.
See also: https://github.com/rainwoodman/bigfile
Parameters
----------
path : str
the name of the directory holding the bigfile data
exclude : list of str, optional
the data sets to exlude from loading within bigfile; default
is the header
header : str, optional
the path to the header; default is to use a column 'Header'.
It is relative to the file, not the dataset.
dataset : str
load a specific dataset from the bigfile; default is to starting
from the root.
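
    Examples
    --------
    A minimal sketch of typical usage; the path, dataset, and column
    names below are hypothetical::

        >>> f = BigFile('output/fastpm_1.0000/', dataset='1/')
        >>> f.attrs['BoxSize']  # header attributes are copied to ``attrs``
        >>> data = f.read(['Position', 'Velocity'], 0, f.size)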
"""
    def __init__(self, path, exclude=None, header=Automatic, dataset='./'):

        if not dataset.endswith('/'): dataset = dataset + '/'

        import bigfile
        self.dataset = dataset
        self.path = path

        # store the attributes
        self.attrs = {}

        # open the file and read the meta-data
        with bigfile.BigFile(filename=path) as ff:
            columns = ff[self.dataset].blocks
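            # auto-detect the header block: try the common names, falling
            # back to the root of the dataset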
            if header is Automatic:
                for header in ['Header', 'header', './']:
                    if header in columns: break

            if exclude is None:
                exclude = [header]

            columns = list(set(columns) - set(exclude))
            ds = bigfile.BigData(ff[self.dataset], columns)

            # set the data type and size
            self.dtype = ds.dtype
            self.size = ds.size

            header = ff[header]
            attrs = header.attrs

            # copy over the attrs
            for k in attrs.keys():
                # load a JSON representation if the string starts with 'json://'
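                # e.g. (hypothetical) an attribute stored as 'json://{"Omega0": 0.3}'
                # decodes to the dict {'Omega0': 0.3} via nbodykit's JSONDecoder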
                if isinstance(attrs[k], string_types) and attrs[k].startswith('json://'):
                    self.attrs[k] = json.loads(attrs[k][7:], cls=JSONDecoder)
                # copy over an array
                else:
                    self.attrs[k] = numpy.array(attrs[k], copy=True)
    def read(self, columns, start, stop, step=1):
"""
Read the specified column(s) over the given range,
as a dictionary
'start' and 'stop' should be between 0 and :attr:`size`,
which is the total size of the binary file (in particles)
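
        Examples
        --------
        A minimal sketch, assuming ``f`` is a :class:`BigFile` instance;
        the column name is hypothetical::

            >>> pos = f.read(['Position'], 0, f.size)['Position']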
"""
        import bigfile
        if isinstance(columns, string_types): columns = [columns]

        with bigfile.BigFile(filename=self.path)[self.dataset] as f:
            ds = bigfile.BigData(f, columns)
            return ds[start:stop][::step]