Source code for nbodykit.io.bigfile

from __future__ import absolute_import
# the future import is important; otherwise, in Python 2.7 we would try to
# import this module itself, due to the unfortunate name conflict!

import numpy

from .base import FileType
from six import string_types
import json
from nbodykit.utils import JSONDecoder
from fnmatch import fnmatch


# sentinel used as the default for ``header`` to trigger automatic header detection
class Automatic: pass

class BigFile(FileType):
    """
    A file object to handle the reading of columns of data from
    a :mod:`bigfile` file.

    :mod:`bigfile` is a reproducible, massively parallel IO library for large,
    hierarchical datasets, and it is the default format of the
    `FastPM <https://github.com/rainwoodman/fastpm>`_ and the
    `MP-Gadget <https://github.com/bluetides-project/MP-Gadget>`_ simulations.

    See also: https://github.com/rainwoodman/bigfile

    Parameters
    ----------
    path : str
        the name of the directory holding the bigfile data
    exclude : list of str, optional
        the data sets to exclude from loading within bigfile; default
        is the header. If any list is given, the name of the header column
        must be given too if it is not part of the data set.
        The names are shell glob patterns.
    header : str, or list, optional
        the path to the header; default is to use a column 'Header'.
        It is relative to the file, not the dataset.
        If a list is provided, the attributes are updated from the first
        entry to the last.
    dataset : str
        finding columns from a specific dataset in the bigfile;
        the default is to start looking for columns from the root.
    """
    def __init__(self, path, exclude=None, header=Automatic, dataset='./'):

        if not dataset.endswith('/'): dataset = dataset + '/'

        import bigfile
        self.dataset = dataset
        self.path = path

        # store the attributes
        self.attrs = {}

        # the file path
        with bigfile.File(filename=path) as ff:
            columns = [block for block in ff[self.dataset].blocks]
            headers = self._find_headers(header, dataset, ff)
            if exclude is None:
                # by default exclude header only.
                exclude = headers

            if not isinstance(exclude, (list, tuple)):
                exclude = [exclude]

            columns = [column for column in set(columns)
                       if not any(fnmatch(column, e) for e in exclude)]

            ds = bigfile.Dataset(ff[self.dataset], columns)

            headers = [ff[header] for header in headers]
            all_attrs = [header.attrs for header in headers]
            for attrs in all_attrs:
                # copy over the attrs
                for k in attrs.keys():

                    # load a JSON representation if str starts with json://
                    if isinstance(attrs[k], string_types) and attrs[k].startswith('json://'):
                        self.attrs[k] = json.loads(attrs[k][7:], cls=JSONDecoder)
                    # copy over an array
                    else:
                        self.attrs[k] = numpy.array(attrs[k], copy=True)

            # set the data type and size
            FileType.__init__(self, dtype=ds.dtype, size=ds.size)

    def _find_headers(self, header, dataset, ff):
        """ Find header from the file block by default. """
        if header is Automatic:
            header = ['Header', 'header', '.']

        if not isinstance(header, (tuple, list)):
            header = [header]

        r = []
        for h in header:
            if h in ff.blocks:
                if h not in r:
                    r.append(h)

        # append the dataset itself
        r.append(dataset.strip('/') + '/.')

        # shall not make the assertion here because the header can be nested deep,
        # and then not shown in ff.blocks; try/except may work better.
        #if not header in ff.blocks:
        #    raise KeyError("header block `%s` is not defined in the bigfile. Candidates can be `%s`"
        #            % (header, str(ff.blocks))

        return r
    def read(self, columns, start, stop, step=1):
        """
        Read the specified column(s) over the given range,
        as a dictionary

        'start' and 'stop' should be between 0 and :attr:`size`,
        which is the total size of the binary file (in particles)
        """
        import bigfile
        with bigfile.File(filename=self.path)[self.dataset] as f:
            ds = bigfile.Dataset(f, columns)
            return ds[start:stop][::step]
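
# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module): a minimal example of how
# this class might be used, assuming a hypothetical bigfile snapshot at
# ``snapshot_path`` whose dataset ``1/`` holds ``Position`` and ``Velocity``
# columns and a ``Header`` block, as a FastPM or MP-Gadget output typically
# would.
# ---------------------------------------------------------------------------
if __name__ == '__main__':

    snapshot_path = 'output/fastpm_1.0000'  # hypothetical path to a bigfile directory

    # open the file, reading columns from the '1/' dataset and attributes
    # from the 'Header' block
    f = BigFile(snapshot_path, dataset='1/', header='Header')

    print(f.size)   # total number of particles
    print(f.dtype)  # numpy dtype of the available columns
    print(f.attrs)  # attributes copied from the header block(s)

    # read positions and velocities for the first 1000 particles;
    # the result behaves like a numpy structured array
    data = f.read(['Position', 'Velocity'], 0, 1000)
    print(data['Position'].shape)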