Source code for nbodykit.binned_statistic

import numpy

def bin_ndarray(ndarray, new_shape, weights=None, operation=numpy.mean):
    """
    Bins an ndarray in all axes based on the target shape, by summing or
    averaging.

    Parameters
    ----------
    ndarray : array_like
        the input array to re-bin
    new_shape : tuple
        the tuple holding the desired new shape
    weights : array_like, optional
        weights to multiply the input array by, before running the
        re-binning operation
    operation : callable, optional
        the operation used to combine values in each bin; default is
        :func:`numpy.mean`

    Notes
    -----
    *   Dimensions in `new_shape` must be an integral factor smaller than
        the old shape
    *   Number of output dimensions must match number of input dimensions
    *   See https://gist.github.com/derricw/95eab740e1b08b78c03f

    Examples
    --------
    >>> m = numpy.arange(0,100,1).reshape((10,10))
    >>> n = bin_ndarray(m, new_shape=(5,5), operation=numpy.sum)
    >>> print(n)
    [[ 22  30  38  46  54]
     [102 110 118 126 134]
     [182 190 198 206 214]
     [262 270 278 286 294]
     [342 350 358 366 374]]
    """
    if ndarray.shape == new_shape:
        raise ValueError("why are we re-binning if the new shape equals the old shape?")
    if ndarray.ndim != len(new_shape):
        raise ValueError("Shape mismatch: {} -> {}".format(ndarray.shape, new_shape))

    # numpy.any replaces the deprecated numpy.sometrue alias
    if numpy.any(numpy.mod(ndarray.shape, new_shape)):
        args = (str(new_shape), str(ndarray.shape))
        msg = "desired shape of %s must be an integer factor smaller than the old shape %s" % args
        raise ValueError(msg)

    # reshape so that each output bin gets its own trailing axis
    compression_pairs = [(d, c//d) for d, c in zip(new_shape, ndarray.shape)]
    flattened = [l for p in compression_pairs for l in p]
    ndarray = ndarray.reshape(flattened)
    if weights is not None:
        weights = weights.reshape(flattened)

    # collapse the trailing axes one at a time, normalizing by the
    # summed weights when weights are given
    for i in range(len(new_shape)):
        if weights is not None:
            ndarray = operation(ndarray*weights, axis=-1*(i+1))
            weights = numpy.sum(weights, axis=-1*(i+1))
            ndarray /= weights
        else:
            ndarray = operation(ndarray, axis=-1*(i+1))

    return ndarray
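# A small usage sketch of the weighted branch above (not part of the original
# module; the array values are illustrative). With weights, each output bin
# holds sum(values*weights)/sum(weights), i.e. a weighted average over the
# collapsed bins, so `operation` should be a sum:
#
#     >>> values  = numpy.arange(4.0).reshape((2, 2))
#     >>> weights = numpy.array([[1., 3.], [1., 1.]])
#     >>> bin_ndarray(values, (1, 1), weights=weights, operation=numpy.sum)
#     array([[1.33333333]])
#
# which equals the weighted mean (0*1 + 1*3 + 2*1 + 3*1) / 6.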
class BinnedStatistic(object):
    """
    Lightweight class to hold statistics binned at fixed coordinates.

    For example, this class could hold a grid of (r, mu) or (k, mu) bins
    for a correlation function or power spectrum measurement.

    It is modeled after the syntax of :class:`xarray.Dataset`, and is designed
    to hold correlation function or power spectrum results (in 1D or 2D)

    Parameters
    ----------
    dims : list, (Ndim,)
        A list of strings specifying names for the coordinate dimensions.
        The dimension names stored in :attr:`dims` have the suffix 'cen'
        added, to indicate that the coordinate grid is defined at the
        bin centers
    edges : list, (Ndim,)
        A list specifying the bin edges for each dimension
    data : array_like
        a structured array holding the data variables, where the named
        fields are interpreted as the variable names. The variable names
        are stored in :attr:`variables`
    fields_to_sum : list, optional
        the names of fields that will be summed when re-indexing, instead
        of averaging
    coords : list, optional
        a list of arrays giving the coordinate grid centers for each
        dimension; if not provided, the bin centers are computed from
        ``edges``
    **kwargs :
        Any additional keywords are saved as metadata in the :attr:`attrs`
        dictionary attribute

    Examples
    --------
    The following example shows how to read a power spectrum measurement
    from a JSON file, as output by nbodykit, assuming the JSON file holds
    a dictionary with a 'power' entry holding the relevant data

    >>> filename = 'test_data.json'
    >>> pk = BinnedStatistic.from_json(filename, key='power', dims=['k'])

    In older versions of nbodykit, results were written as plaintext ASCII
    files. Although now deprecated, this type of file can be read using:

    >>> filename = 'test_data.dat'
    >>> dset = BinnedStatistic.from_plaintext(['k'], filename)

    Data variables can be accessed in a dict-like fashion:

    >>> power = pkmu['power'] # returns power data variable

    Array-like indexing of a :class:`BinnedStatistic` returns a new
    :class:`BinnedStatistic` holding the sliced data:

    >>> pkmu
    <BinnedStatistic: dims: (k: 200, mu: 5), variables: ('mu', 'k', 'power')>
    >>> pkmu[:,0] # select first mu column
    <BinnedStatistic: dims: (k: 200), variables: ('mu', 'k', 'power')>

    Additional data variables can be added to the :class:`BinnedStatistic` via:

    >>> modes = numpy.ones((200, 5))
    >>> pkmu['modes'] = modes

    Coordinate-based indexing is possible through :func:`sel`:

    >>> pkmu
    <BinnedStatistic: dims: (k: 200, mu: 5), variables: ('mu', 'k', 'power')>
    >>> pkmu.sel(k=slice(0.1, 0.4), mu=0.5)
    <BinnedStatistic: dims: (k: 30), variables: ('mu', 'k', 'power')>

    :func:`squeeze` will explicitly squeeze the specified dimension
    (of length one) such that the resulting instance has one less dimension:

    >>> pkmu
    <BinnedStatistic: dims: (k: 200, mu: 1), variables: ('mu', 'k', 'power')>
    >>> pkmu.squeeze(dim='mu') # can also just call pkmu.squeeze()
    <BinnedStatistic: dims: (k: 200), variables: ('mu', 'k', 'power')>

    :func:`average` returns a new :class:`BinnedStatistic` holding the
    data averaged over one dimension

    :func:`reindex` will re-bin the coordinate arrays along the specified
    dimension
    """
    def __init__(self, dims, edges, data, fields_to_sum=[], coords=None, **kwargs):

        # number of dimensions must match
        if len(dims) != len(edges):
            raise ValueError("size mismatch between specified `dims` and `edges`")

        # input data must be structured array
        if not isinstance(data, numpy.ndarray) or data.dtype.names is None:
            raise TypeError("'data' should be a structured numpy array")

        shape = tuple(len(e)-1 for e in edges)
        if data.shape != shape:
            args = (shape, data.shape)
            raise ValueError("`edges` imply data shape of %s, but data has shape %s" %args)

        self.dims = list(dims)
        self.edges = dict(zip(self.dims, edges))

        # coordinates are the bin centers
        self.coords = {}
        for i, dim in enumerate(self.dims):
            if coords is not None and coords[i] is not None:
                self.coords[dim] = numpy.copy(coords[i])
            else:
                self.coords[dim] = 0.5 * (edges[i][1:] + edges[i][:-1])

        # store variables as a structured array
        self.data = data.copy()

        # define a mask such that a coordinate grid element will be masked
        # if any of the variables at that coordinate are (NaN, inf)
        self.mask = numpy.zeros(self.shape, dtype=bool)
        for name in data.dtype.names:
            self.mask = numpy.logical_or(self.mask, ~numpy.isfinite(self.data[name]))

        # fields that we wish to sum, instead of averaging
        self._fields_to_sum = fields_to_sum

        # save and track metadata
        self.attrs = {}
        for k in kwargs:
            self.attrs[k] = kwargs[k]

    @classmethod
    def from_state(kls, state):
        obj = kls(dims=state['dims'], edges=state['edges'],
                  coords=state['coords'], data=state['data'])
        obj.attrs.update(state['attrs'])
        return obj

    def __getstate__(self):
        return dict(dims=self.dims,
                    edges=[self.edges[dim] for dim in self.dims],
                    coords=[self.coords[dim] for dim in self.dims],
                    data=self.data,
                    attrs=self.attrs)

    @property
    def shape(self):
        """
        The shape of the coordinate grid
        """
        return tuple(len(self.coords[d]) for d in self.dims)

    @property
    def variables(self):
        """
        Alias to return the names of the variables stored in `data`
        """
        return list(self.data.dtype.names)
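    # A minimal construction sketch (not in the original source): build a 1D
    # statistic from a structured array whose named fields become the data
    # variables; the data shape must equal the number of bins, and any extra
    # keywords (here the hypothetical `box_size`) land in the attrs dict:
    #
    #     >>> edges = numpy.linspace(0., 1., 11)              # 10 bins
    #     >>> data = numpy.zeros(10, dtype=[('power', 'f8')])
    #     >>> dset = BinnedStatistic(['k'], [edges], data, box_size=1380.)
    #     >>> dset.shape, dset.attrs['box_size']
    #     ((10,), 1380.0)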
    @classmethod
    def __construct_direct__(cls, data, mask, **kwargs):
        """
        Shortcut around __init__ for internal use to construct and return
        a new class instance. The returned object should be identical to
        that returned by __init__.

        Notes
        -----
        *   Useful for returning new instances with sliced data/mask
        *   The keyword arguments required to create a full, unbroken
            instance are `dims`, `coords`, `edges`, and `attrs`

        Parameters
        ----------
        data : array_like
            the structured array holding the data variables
        mask : array_like
            the boolean array masking non-finite grid elements
        """
        obj = object.__new__(cls)
        for k in kwargs:
            setattr(obj, k, kwargs[k])

        for k, d in zip(['data', 'mask'], [data, mask]):
            setattr(obj, k, d)
            if obj.shape != d.shape:
                try:
                    setattr(obj, k, d.reshape(obj.shape))
                except:
                    raise ValueError("shape mismatch between data and coordinates")

        return obj
    def __copy_attrs__(self):
        """
        Return a copy of all necessary attributes associated with
        the `BinnedStatistic`. This dictionary + `data` and `mask` are
        all that's required to reconstruct a new class
        """
        kw = {}
        kw['dims'] = list(self.dims)
        kw['edges'] = self.edges.copy()
        kw['coords'] = self.coords.copy()
        kw['attrs'] = self.attrs.copy()
        kw['_fields_to_sum'] = list(self._fields_to_sum)
        return kw
    def __finalize__(self, data, mask, indices):
        """
        Finalize and return a new instance from a slice of the
        current object (returns a copy)
        """
        edges, coords = self.__slice_edges__(indices)
        kw = {'dims':list(self.dims), 'edges':edges, 'coords':coords,
              'attrs':self.attrs.copy(), '_fields_to_sum':self._fields_to_sum}
        return self.__class__.__construct_direct__(data, mask, **kw)
    def __slice_edges__(self, indices):
        """
        Internal function to slice the `edges` attribute with the
        specified indices, which specify the included coordinate bins
        """
        edges = {}
        coords = {}
        for i, dim in enumerate(self.dims):
            if len(indices[i]) > 0:
                idx = list(indices[i]) + [indices[i][-1]+1]
            else:
                idx = [0]
            edges[dim] = self.edges[dim][idx]
            coords[dim] = 0.5 * (edges[dim][1:] + edges[dim][:-1])

        return edges, coords
    def __str__(self):
        name = self.__class__.__name__
        dims = "(" + ", ".join(['%s: %d' %(k, self.shape[i]) for i, k in enumerate(self.dims)]) + ")"

        if len(self.variables) < 5:
            return "<%s: dims: %s, variables: %s>" %(name, dims, str(tuple(self.variables)))
        else:
            return "<%s: dims: %s, variables: %d total>" %(name, dims, len(self.variables))

    def __repr__(self):
        return self.__str__()

    def __iter__(self):
        return iter(self.variables)

    def __contains__(self, key):
        return key in self.variables
    def __setitem__(self, key, data):
        """
        Add a new variable with the name `key` to the class using `data`
        """
        if numpy.shape(data) != self.data.shape:
            raise ValueError("data to be added must have shape %s" %str(self.data.shape))

        # add the new (key, type) to the dtype, or if the key is present, overwrite
        dtype = list(self.data.dtype.descr) # make copy
        names = list(self.data.dtype.names) # make copy
        if key in names:
            i = names.index(key)
            dtype.pop(i); names.pop(i)
        dtype += [(key, data.dtype.type)]
        dtype = numpy.dtype(dtype)
        new = numpy.zeros(self.data.shape, dtype=dtype)

        # add old variables
        for col in names:
            new[col] = self.data[col]

        # the new data to add
        new[key] = data

        # the new mask
        mask = numpy.logical_or(self.mask, ~numpy.isfinite(new[key]))

        # save
        self.data = new
        self.mask = mask
    def __getitem__(self, key):
        """
        Index- or string-based indexing

        Notes
        -----
        *   If a single string is passed, the key is interpreted as a
            `variable` or `coordinate`, and the corresponding array is
            returned
        *   If a list of strings is passed, then a new `BinnedStatistic`
            holding only the `variable` names in `key` is returned
        *   Integer-based indexing or slices similar to numpy indexing
            will slice `data`, returning a new `BinnedStatistic` holding
            the newly sliced data and coordinate grid
        *   Scalar indexes (i.e., integers) used to index a certain
            dimension will "squeeze" that dimension, removing it from
            the coordinate grid
        """
        # if single string passed, return a coordinate or variable
        if isinstance(key, str):
            if key in self.variables:
                return self.data[key]
            else:
                raise KeyError("`%s` is not a valid variable name" %key)

        # indices to slice the data with
        indices = [list(range(0, self.shape[i])) for i in range(len(self.dims))]

        # check for list/tuple of variable names
        # if so, return a BinnedStatistic with slice of columns
        if isinstance(key, (list, tuple)) and all(isinstance(x, str) for x in key):
            if all(k in self.variables for k in key):
                return self.__finalize__(self.data[list(key)], self.mask.copy(), indices)
            else:
                invalid = ', '.join("'%s'" %k for k in key if k not in self.variables)
                raise KeyError("cannot slice variables -- invalid names: (%s)" %invalid)

        key_ = key

        # if key is a single integer or single slice, make it a list
        make_iterable = isinstance(key, slice) or isinstance(key, int)
        if make_iterable or isinstance(key, list) and all(isinstance(x, int) for x in key):
            key_ = [key]

        squeezed_dims = []
        for i, subkey in enumerate(key_):
            if i >= len(self.dims):
                raise IndexError("too many indices for BinnedStatistic; note that ndim = %d" %len(self.dims))
            if isinstance(subkey, int):
                indices[i] = [subkey]
                squeezed_dims.append(self.dims[i])
            elif isinstance(subkey, list):
                indices[i] = subkey
            elif isinstance(subkey, slice):
                indices[i] = list(range(*subkey.indices(self.shape[i])))

        # can't squeeze all dimensions!!
        if len(squeezed_dims) == len(self.dims):
            raise IndexError("cannot return object with all remaining dimensions squeezed")

        # fail nicely if we fail at all
        try:
            toret = self.__finalize__(self.data[key], self.mask[key], indices)
            for dim in squeezed_dims:
                toret = toret.squeeze(dim)
            return toret
        except ValueError:
            raise IndexError("this type of slicing not implemented")
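    # Sketch of the list-of-variables branch above (assumes the `pkmu`
    # object from the class docstring): passing a list of names returns a
    # new BinnedStatistic holding only those variables, on the same grid:
    #
    #     >>> pkmu[['k', 'power']]
    #     <BinnedStatistic: dims: (k: 200, mu: 5), variables: ('k', 'power')>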
    def _get_index(self, dim, val, method=None):
        """
        Internal function to compute the bin index of the nearest
        coordinate value to the input value
        """
        index = self.coords[dim]
        if method == 'nearest':
            i = (numpy.abs(index-val)).argmin()
        else:
            try:
                i = list(index).index(val)
            except Exception as e:
                args = (dim, str(e))
                msg = "error converting '%s' index; try setting `method = 'nearest'`: %s"
                raise IndexError(msg %args)

        return i

    #--------------------------------------------------------------------------
    # user-called functions
    #--------------------------------------------------------------------------
    def to_json(self, filename):
        """
        Write a BinnedStatistic to a JSON file.

        .. note::
            This uses :class:`nbodykit.utils.JSONEncoder` to write the
            JSON file

        Parameters
        ----------
        filename : str
            the name of the file to write
        """
        import json
        from nbodykit.utils import JSONEncoder
        state = self.__getstate__()
        with open(filename, 'w') as ff:
            json.dump(state, ff, cls=JSONEncoder)
    @classmethod
    def from_json(cls, filename, key='data', dims=None, edges=None, **kwargs):
        """
        Initialize a BinnedStatistic from a JSON file.

        The JSON file should contain a dictionary, where the data to load is
        stored as the ``key`` entry, with an ``edges`` entry specifying the
        bin edges, and optionally, an ``attrs`` entry giving a dict of
        meta-data

        .. note::
            This uses :class:`nbodykit.utils.JSONDecoder` to load the
            JSON file

        Parameters
        ----------
        filename : str
            the name of the file to load
        key : str, optional
            the name of the key in the JSON file holding the data to load
        dims : list, optional
            list of names specifying the dimensions, i.e., ``['k']`` or
            ``['k', 'mu']``; must be supplied if not given in the JSON file

        Returns
        -------
        dset : BinnedStatistic
            the BinnedStatistic holding the data from file
        """
        import json
        from nbodykit.utils import JSONDecoder

        # parse the file
        with open(filename, 'r') as ff:
            state = json.load(ff, cls=JSONDecoder)

        # the data
        if key not in state:
            args = (key, tuple(state.keys()))
            raise ValueError("no data entry found in JSON format for '%s' key; valid keys are %s" %args)
        data = state[key]

        # the dimensions
        dims = state.pop('dims', dims)
        if dims is None:
            raise ValueError("no `dims` found in JSON file; please specify as keyword argument")

        # the edges
        edges = state.pop('edges', edges)
        if edges is None:
            raise ValueError("no `edges` found in JSON file; please specify as keyword argument")

        # the coords
        coords = state.pop('coords', None)

        # meta-data
        attrs = state.pop('attrs', {})
        attrs.update(kwargs)

        return cls(dims, edges, data, coords=coords, **attrs)
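    # Round-trip sketch (assumes a BinnedStatistic `dset` already exists):
    # `to_json` writes the full state (dims, edges, coords, data, attrs)
    # at the top level of the file, so `from_json` can restore it with the
    # default key and no extra arguments:
    #
    #     >>> dset.to_json('saved.json')
    #     >>> restored = BinnedStatistic.from_json('saved.json')
    #     >>> restored.dims == dset.dims
    #     True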
    @classmethod
    def from_plaintext(cls, dims, filename, **kwargs):
        """
        Initialize a BinnedStatistic from a plaintext file

        .. note::
            Deprecated in nbodykit 0.2.x; storage of BinnedStatistic
            objects as plaintext ASCII files is no longer supported.
            See :func:`BinnedStatistic.from_json`

        Parameters
        ----------
        dims : list
            list of names specifying the dimensions, i.e., ``['k']`` or
            ``['k', 'mu']``
        filename : str
            the name of the file to load

        Returns
        -------
        dset : BinnedStatistic
            the BinnedStatistic holding the data from file
        """
        import warnings
        msg = "storage of BinnedStatistic objects as ASCII plaintext files is deprecated; see BinnedStatistic.from_json"
        warnings.warn(msg, FutureWarning, stacklevel=2)

        # make sure dims is a list/tuple
        if not isinstance(dims, (tuple, list)):
            raise TypeError("`dims` should be a list or tuple of strings")

        # read from file
        try:
            if len(dims) == 1:
                data, meta = _Read1DPlainText(filename)
            elif len(dims) == 2:
                data, meta = _Read2DPlainText(filename)
        except Exception as e:
            msg = "unable to read plaintext file, perhaps the dimension of the file does "
            msg += "not match the passed `dims`;\nexception: %s" %str(e)
            raise ValueError(msg)

        # get the bin edges
        edges = meta.pop('edges', None)
        if edges is None:
            raise ValueError("plaintext file does not include `edges`; cannot be loaded into a BinnedStatistic")
        if len(dims) == 1:
            edges = [edges]

        meta.update(kwargs)
        return cls(dims, edges, data, **meta)
    def copy(self, cls=None):
        """
        Return a copy of the BinnedStatistic, optionally changing the type
        to ``cls``; ``cls`` must be a subclass of BinnedStatistic.
        """
        attrs = self.__copy_attrs__()
        if cls is None:
            cls = self.__class__
        if not issubclass(cls, BinnedStatistic):
            raise TypeError("the `cls` argument must be a subclass of BinnedStatistic")
        return cls.__construct_direct__(self.data.copy(), self.mask.copy(), **attrs)
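    # Copy sketch (assumes the `pkmu` object from the class docstring): the
    # copy holds its own data array, so modifying it leaves the original
    # untouched:
    #
    #     >>> pk2 = pkmu.copy()
    #     >>> pk2.data is pkmu.data
    #     False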
    def rename_variable(self, old_name, new_name):
        """
        Rename a variable in :attr:`data` from ``old_name`` to ``new_name``.

        Note that this procedure is performed in-place (it does not return
        a new BinnedStatistic)

        Parameters
        ----------
        old_name : str
            the name of the old variable to rename
        new_name : str
            the desired new variable name

        Raises
        ------
        ValueError
            If `old_name` is not present in :attr:`variables`
        """
        import copy
        if old_name not in self.variables:
            raise ValueError("`%s` is not an existing variable name" %old_name)
        new_dtype = copy.deepcopy(self.data.dtype)

        names = list(new_dtype.names)
        names[names.index(old_name)] = new_name
        new_dtype.names = names
        self.data.dtype = new_dtype
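    # In-place rename sketch (assumes the `pkmu` object from the class
    # docstring):
    #
    #     >>> pkmu.rename_variable('power', 'P')
    #     >>> 'P' in pkmu
    #     True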
    def sel(self, method=None, **indexers):
        """
        Return a new BinnedStatistic indexed by coordinate values along the
        specified dimension(s).

        Notes
        -----
        Scalar values used to index a specific dimension will result in
        that dimension being squeezed. To keep a dimension of unit length,
        use a list to index (see examples below).

        Parameters
        ----------
        method : {None, 'nearest'}
            The method to use for inexact matches; if set to `None`, require
            an exact coordinate match, otherwise match the nearest coordinate
        **indexers :
            the pairs of dimension name and coordinate value used to index
            the BinnedStatistic

        Returns
        -------
        sliced : BinnedStatistic
            a new BinnedStatistic holding the sliced data and coordinate grid

        Examples
        --------
        >>> pkmu
        <BinnedStatistic: dims: (k: 200, mu: 5), variables: ('mu', 'k', 'power')>

        >>> pkmu.sel(k=0.4)
        <BinnedStatistic: dims: (mu: 5), variables: ('mu', 'k', 'power')>

        >>> pkmu.sel(k=[0.4])
        <BinnedStatistic: dims: (k: 1, mu: 5), variables: ('mu', 'k', 'power')>

        >>> pkmu.sel(k=slice(0.1, 0.4), mu=0.5)
        <BinnedStatistic: dims: (k: 30), variables: ('mu', 'k', 'power')>
        """
        indices = {}
        squeezed_dims = []
        for dim, key in indexers.items():
            if isinstance(key, list):
                indices[dim] = [self._get_index(dim, k, method=method) for k in key]
            elif isinstance(key, slice):
                new_slice = []
                for name in ['start', 'stop']:
                    new_slice.append(self._get_index(dim, getattr(key, name), method=method))
                i = self.dims.index(dim)
                indices[dim] = list(range(*slice(*new_slice).indices(self.shape[i])))
            elif not numpy.isscalar(key):
                raise IndexError("please index using a list, slice, or scalar value")
            else:
                indices[dim] = [self._get_index(dim, key, method=method)]
                squeezed_dims.append(dim)

        # can't squeeze all dimensions!!
        if len(squeezed_dims) == len(self.dims):
            raise IndexError("cannot return object with all remaining dimensions squeezed")

        toret = self.take(**indices)
        for dim in squeezed_dims:
            toret = toret.squeeze(dim)

        return toret
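    # The docstring examples above assume exact coordinate matches; for
    # inexact values, pass method='nearest' to match the closest bin center
    # (a usage sketch, assuming the `pkmu` object from the class docstring):
    #
    #     >>> pkmu.sel(k=0.103, method='nearest')
    #     <BinnedStatistic: dims: (mu: 5), variables: ('mu', 'k', 'power')>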
    def take(self, *masks, **indices):
        """
        Take a subset of a BinnedStatistic from a given list of indices.

        This is more powerful but more verbose than :func:`sel`. Also, the
        result is never squeezed, even if only a single item along a
        dimension is used.

        Parameters
        ----------
        masks : array_like (boolean)
            a list of masks that are of the same shape as the data.
        indices : dict (string : array_like)
            mapping from axes (by name, dim) to items to select
            (list/array_like). Each item is a valid selector for
            numpy's fancy indexing.

        Returns
        -------
        BinnedStatistic
            a new object, where only items selected by all axes are kept

        Examples
        --------
        >>> pkmu
        <BinnedStatistic: dims: (k: 200, mu: 5), variables: ('mu', 'k', 'power')>

        # similar to pkmu.sel(k > 0.4), select by the bin centers
        >>> pkmu.take(k=pkmu.coords['k'] > 0.4)

        # also similar to pkmu.sel(k > 0.4), select by the bin averages
        >>> pkmu.take(pkmu['k'] > 0.4)

        # impossible with sel
        >>> pkmu.take(pkmu['modes'] > 0)
        """
        indices_dict = {}
        indices_dict.update(indices) # rename for a cleaner API

        # flatten the masks; will keep items that are true everywhere
        mask = numpy.ones(self.shape, dtype='?')
        for m in masks:
            mask = mask & m

        indices = [numpy.ones(self.shape[i], dtype='?') for i in range(len(self.dims))]

        # update indices with masks
        for i, dim in enumerate(self.dims):
            axis = list(range(len(self.dims)))
            axis.remove(i)
            axis = tuple(axis)
            mask1 = mask.all(axis=axis)
            indices[i] &= mask1

        # update indices with indices_dict
        for dim, index in indices_dict.items():
            i = self.dims.index(dim)
            if isinstance(index, numpy.ndarray) and index.dtype == numpy.dtype('?'):
                # a boolean mask
                assert index.ndim == 1
                indices[i] &= index
            else:
                mask1 = numpy.zeros(self.shape[i], dtype='?')
                mask1.put(index, True)
                indices[i] &= mask1

        # convert the boolean masks to integer indices
        for i in range(len(indices)):
            indices[i] = indices[i].nonzero()[0]

        data = self.data.copy()
        mask = self.mask.copy()
        for i, idx in enumerate(indices):
            data = numpy.take(data, idx, axis=i)
            mask = numpy.take(mask, idx, axis=i)

        toret = self.__finalize__(data, mask, indices)
        return toret
    def squeeze(self, dim=None):
        """
        Squeeze the BinnedStatistic along the specified dimension, which
        removes that dimension from the BinnedStatistic. The behavior is
        similar to that of :func:`numpy.squeeze`.

        Parameters
        ----------
        dim : str, optional
            The name of the dimension to squeeze. If no dimension is
            provided, then the one dimension with unit length will be
            squeezed

        Returns
        -------
        squeezed : BinnedStatistic
            a new BinnedStatistic instance, squeezed along one dimension

        Raises
        ------
        ValueError
            If the specified dimension does not have length one, or if no
            dimension is specified and multiple dimensions have length one

        Examples
        --------
        >>> pkmu
        <BinnedStatistic: dims: (k: 200, mu: 1), variables: ('mu', 'k', 'power')>

        >>> pkmu.squeeze() # squeeze the mu dimension
        <BinnedStatistic: dims: (k: 200), variables: ('mu', 'k', 'power')>
        """
        # infer the right dimension to squeeze
        if dim is None:
            dim = [k for k in self.dims if len(self.coords[k]) == 1]
            if not len(dim):
                raise ValueError("no available dimensions with length one to squeeze")
            if len(dim) > 1:
                raise ValueError("multiple dimensions available to squeeze -- please specify")
            dim = dim[0]
        else:
            if dim not in self.dims:
                raise ValueError("`%s` is not a valid dimension name" %dim)
            if len(self.coords[dim]) != 1:
                raise ValueError("the `%s` dimension must have length one to squeeze" %dim)

        # remove the dimension from the grid
        i = self.dims.index(dim)
        toret = self.copy()
        toret.dims.pop(i); toret.edges.pop(dim); toret.coords.pop(dim)
        if not len(toret.dims):
            raise ValueError("cannot squeeze the only remaining axis")

        # construct new object with squeezed data/mask
        attrs = toret.__copy_attrs__()
        d = toret.data.squeeze(axis=i)
        m = toret.mask.squeeze(axis=i)
        return self.__construct_direct__(d, m, **attrs)
    def average(self, dim, **kwargs):
        """
        Compute the average of each variable over the specified dimension.

        Parameters
        ----------
        dim : str
            The name of the dimension to average over
        **kwargs :
            Additional keywords to pass to :func:`BinnedStatistic.reindex`.
            See the documentation for :func:`BinnedStatistic.reindex` for
            valid keywords.

        Returns
        -------
        averaged : BinnedStatistic
            A new BinnedStatistic, with data averaged along one dimension,
            which reduces the number of dimensions by one
        """
        spacing = (self.edges[dim][-1] - self.edges[dim][0])
        toret = self.reindex(dim, spacing, **kwargs)
        return toret.sel(**{dim:toret.coords[dim][0]})
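    # Usage sketch (assumes the `pkmu` object from the class docstring, with
    # a 'modes' variable added as shown there): average over `mu`, weighting
    # each bin by the number of modes, which squeezes the `mu` dimension:
    #
    #     >>> pkavg = pkmu.average('mu', weights='modes')
    #     >>> pkavg.dims
    #     ['k']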
    def reindex(self, dim, spacing, weights=None, force=True,
                return_spacing=False, fields_to_sum=[]):
        """
        Reindex the dimension ``dim`` by averaging over multiple coordinate
        bins, optionally weighting by ``weights``. Returns a new
        BinnedStatistic holding the re-binned data.

        Notes
        -----
        *   We can only re-bin by an integral factor of the current bin
            size, in order to avoid inaccuracies when re-binning to
            overlapping bins
        *   Variables specified in `fields_to_sum` will be summed when
            re-indexing, instead of averaging

        Parameters
        ----------
        dim : str
            The name of the dimension to average over
        spacing : float
            The desired spacing for the re-binned data. If `force = True`,
            the spacing used will be the closest value to this value, such
            that the new bins are N times larger, where N is an integer
        weights : array_like or str, optional (`None`)
            An array to weight the data by before re-binning, or if a
            string is provided, the name of a data column to use as weights
        force : bool, optional
            If `True`, force the spacing to be a value such that the new
            bins are N times larger, where N is an integer; otherwise,
            raise an exception. Default is `True`
        return_spacing : bool, optional
            If `True`, return the new spacing as the second return value.
            Default is `False`.
        fields_to_sum : list
            the names of fields that will be summed when re-indexing,
            instead of averaging

        Returns
        -------
        rebinned : BinnedStatistic
            A new BinnedStatistic instance, which holds the re-binned
            coordinate grid and data variables
        spacing : float, optional
            If `return_spacing` is `True`, the new coordinate spacing
            will be returned
        """
        i = self.dims.index(dim)

        # combine without mutating the (shared) default list argument
        fields_to_sum = list(fields_to_sum) + list(self._fields_to_sum)

        # determine the new binning; compare all spacings to the first one
        # (the old check compared the array to itself, so it never raised)
        old_spacings = numpy.diff(self.coords[dim])
        if not numpy.allclose(old_spacings, old_spacings[0]):
            raise ValueError("`reindex` requires even bin spacings")
        old_spacing = old_spacings[0]

        factor = numpy.round(spacing/old_spacing).astype('int')
        if not factor:
            raise ValueError("new spacing must be larger than the original spacing of %.2e" %old_spacing)
        if factor == 1:
            raise ValueError("closest binning size to input spacing is the same as current binning")
        if not numpy.allclose(old_spacing*factor, spacing) and not force:
            raise ValueError("if `force = False`, new bin spacing must be an integral multiple of the original")

        # make a copy of the data
        data = self.data.copy()

        # get the weights
        if isinstance(weights, str):
            if weights not in self.variables:
                raise ValueError("cannot weight by `%s`; no such column" %weights)
            weights = self.data[weights]

        edges = self.edges[dim]
        new_shape = list(self.shape)

        # check if we need to discard bins from the end
        leftover = self.shape[i] % factor
        if leftover and not force:
            args = (leftover, old_spacing*factor)
            raise ValueError("cannot re-bin because there are %d extra bins, using spacing = %.2e" %args)
        if leftover:
            sl = [slice(None, None)]*len(self.dims)
            sl[i] = slice(None, -leftover)
            data = data[tuple(sl)]
            if weights is not None:
                weights = weights[tuple(sl)]
            edges = edges[:-leftover]
            new_shape[i] = new_shape[i] - leftover

        # the new edges
        new_shape[i] //= factor
        new_edges = numpy.linspace(edges[0], edges[-1], new_shape[i]+1)

        # the re-binned data
        new_data = numpy.empty(new_shape, dtype=self.data.dtype)
        for name in self.variables:
            operation = numpy.nanmean
            weights_ = weights
            if weights is not None or name in fields_to_sum:
                operation = numpy.nansum
                if name in fields_to_sum:
                    weights_ = None
            new_data[name] = bin_ndarray(data[name], new_shape, weights=weights_, operation=operation)

        # the new mask
        new_mask = numpy.zeros_like(new_data, dtype=bool)
        for name in self.variables:
            new_mask = numpy.logical_or(new_mask, ~numpy.isfinite(new_data[name]))

        # construct the new object
        kw = self.__copy_attrs__()
        kw['edges'][dim] = new_edges
        kw['coords'][dim] = 0.5*(new_edges[1:] + new_edges[:-1])
        toret = self.__construct_direct__(new_data, new_mask, **kw)

        return (toret, spacing) if return_spacing else toret
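# Reindex usage sketch (not part of the original module; assumes the `pkmu`
# object from the class docstring, with 200 evenly spaced k-bins and a
# 'modes' variable). Re-binning to twice the spacing halves the bin count,
# weighting the averages by the number of modes per bin:
#
#     >>> dk = numpy.diff(pkmu.coords['k'])[0]
#     >>> rebinned = pkmu.reindex('k', 2*dk, weights='modes')
#     >>> rebinned.shape
#     (100, 5)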
#------------------------------------------------------------------------------
# Deprecated plaintext read/write functions
#------------------------------------------------------------------------------
def _Read2DPlainText(filename):
    """
    Reads the plain text storage of a 2D measurement

    Returns
    -------
    data : dict
        dictionary holding the `edges` data, as well as the data columns
        for the P(k,mu) measurement
    metadata : dict
        any additional metadata to store as part of the P(k,mu) measurement
    """
    d = {}
    metadata = {}

    with open(filename, 'r') as ff:

        # the number of k and mu bins is on the first line
        Nk, Nmu = [int(l) for l in ff.readline().split()]
        N = Nk*Nmu

        # the names of the data columns are on the second line
        columns = ff.readline().split()

        lines = ff.readlines()
        data = numpy.array([float(l) for line in lines[:N] for l in line.split()])
        data = data.reshape((Nk, Nmu, -1)) # reshape properly to (Nk, Nmu)

        # make a dict, making complex arrays from real/imag parts
        i = 0
        while i < len(columns):
            name = columns[i]
            nextname = columns[i+1] if i < len(columns)-1 else ''
            if name.endswith('.real') and nextname.endswith('.imag'):
                name = name.split('.real')[0]
                d[name] = data[...,i] + 1j*data[...,i+1]
                i += 2
            else:
                d[name] = data[...,i]
                i += 1

        # store variables as a structured array
        dtypes = numpy.dtype([(name, d[name].dtype) for name in d])
        data = numpy.empty(data.shape[:2], dtype=dtypes)
        for name in d:
            data[name] = d[name]

        # read the edges for the k and mu bins
        edges = []
        l1 = int(lines[N].split()[-1]); N = N+1
        edges.append(numpy.array([float(line) for line in lines[N:N+l1]]))
        l2 = int(lines[N+l1].split()[-1]); N = N+l1+1
        edges.append(numpy.array([float(line) for line in lines[N:N+l2]]))
        metadata['edges'] = edges

        # read any metadata
        if len(lines) > N+l2:
            N_meta = int(lines[N+l2].split()[-1])
            N = N + l2 + 1
            meta = lines[N:N+N_meta]
            for line in meta:
                fields = line.split()
                cast = fields[-1]
                if cast in __builtins__:
                    metadata[fields[0]] = __builtins__[cast](fields[1])
                elif hasattr(numpy, cast):
                    metadata[fields[0]] = getattr(numpy, cast)(fields[1])
                else:
                    raise TypeError("metadata must have a builtin or numpy type")

    return data, metadata

def _Read1DPlainText(filename):
    """
    Reads the plain text storage of a 1D measurement

    Notes
    -----
    *   If `edges` is present in the file, they will be returned as part
        of the metadata, with the key `edges`
    *   If the first line of the file specifies column names, they will
        be returned as part of the metadata with the `columns` key

    Returns
    -------
    data : array_like
        the 1D data stacked vertically, such that each column represents
        a separate data variable
    metadata : dict
        any additional metadata to store as part of the P(k) measurement
    """
    # the data list
    data = []

    # extract the metadata
    metadata = {}
    make_float = lambda x: float(x[1:])
    with open(filename, 'r') as ff:

        currline = 0
        lines = ff.readlines()

        # try to read the column names from the first line
        if lines[0][0] == '#':
            try:
                metadata['columns'] = lines[0][1:].split()
            except:
                pass

        while True:

            # break if we are at the EOF
            if currline == len(lines):
                break
            line = lines[currline]

            if not line:
                currline += 1
                continue

            if line[0] != '#':
                # add to the data
                data.append([float(l) for l in line.split()])
            else:
                line = line[1:]

                # read the edges
                if 'edges' in line:
                    fields = line.split()
                    N = int(fields[-1]) # number of edges
                    metadata['edges'] = numpy.array([make_float(l) for l in lines[currline+1:currline+1+N]])
                    currline += 1+N
                    continue

                # read and cast the metadata properly
                if 'metadata' in line:
                    fields = line.split()
                    N = int(fields[-1]) # number of individual metadata lines
                    for i in range(N):
                        fields = lines[currline+1+i][1:].split()
                        cast = fields[-1]
                        if cast in __builtins__:
                            metadata[fields[0]] = __builtins__[cast](fields[1])
                        elif hasattr(numpy, cast):
                            metadata[fields[0]] = getattr(numpy, cast)(fields[1])
                        else:
                            raise TypeError("metadata must have a builtin or numpy type")
                    currline += 1+N
                    continue

            currline += 1

    data = numpy.asarray(data)

    # get the names of the columns, using defaults if not present in the file
    columns = metadata.pop('columns', ['col_%d' %i for i in range(data.shape[1])])

    # make a dict, making complex arrays from real/imag parts
    i = 0
    d = {}
    while i < len(columns):
        name = columns[i]
        nextname = columns[i+1] if i < len(columns)-1 else ''
        if name.endswith('.real') and nextname.endswith('.imag'):
            name = name.split('.real')[0]
            d[name] = data[...,i] + 1j*data[...,i+1]
            i += 2
        else:
            d[name] = data[...,i]
            i += 1

    # store variables as a structured array
    dtypes = numpy.dtype([(name, d[name].dtype) for name in d])
    data = numpy.empty(len(data), dtype=dtypes)
    for name in d:
        data[name] = d[name]

    return data, metadata