import builtins
import numpy
def bin_ndarray(ndarray, new_shape, weights=None, operation=numpy.mean):
"""
Bins an ndarray in all axes based on the target shape, by summing
or averaging.
Parameters
----------
ndarray : array_like
the input array to re-bin
new_shape : tuple
the tuple holding the desired new shape
weights : array_like, optional
weights to multiply the input array by, before running the re-binning
operation,

    Notes
    -----
    * Each dimension in `new_shape` must be an integral factor smaller than
      the corresponding dimension of the old shape
    * The number of output dimensions must match the number of input dimensions.
    * See https://gist.github.com/derricw/95eab740e1b08b78c03f

    Examples
    --------
--------
>>> m = numpy.arange(0,100,1).reshape((10,10))
>>> n = bin_ndarray(m, new_shape=(5,5), operation=numpy.sum)
>>> print(n)
[[ 22 30 38 46 54]
[102 110 118 126 134]
[182 190 198 206 214]
[262 270 278 286 294]
[342 350 358 366 374]]
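
    Weighted averaging is also possible by passing ``weights`` together with
    ``operation=numpy.sum``; the summed values are then normalized by the
    summed weights. A minimal sketch, using uniform (hypothetical) weights,
    which simply recovers the plain average:

    >>> w = numpy.ones(m.shape)
    >>> n = bin_ndarray(m, new_shape=(5,5), weights=w, operation=numpy.sum)
    >>> print(n[0, 0])
    5.5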
"""
    if ndarray.shape == new_shape:
        raise ValueError("no re-binning necessary: the new shape equals the old shape")
if ndarray.ndim != len(new_shape):
raise ValueError("Shape mismatch: {} -> {}".format(ndarray.shape, new_shape))
    if numpy.any(numpy.mod(ndarray.shape, new_shape)):
        args = (str(new_shape), str(ndarray.shape))
        msg = "desired shape of %s must be an integer factor smaller than the old shape %s" %args
        raise ValueError(msg)
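    # pair each output dimension with its reduction factor, then reshape so
    # that, e.g., (10, 10) -> (5, 2, 5, 2); the trailing "factor" axes are
    # reduced one at a time in the loop below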
compression_pairs = [(d, c//d) for d, c in zip(new_shape, ndarray.shape)]
flattened = [l for p in compression_pairs for l in p]
ndarray = ndarray.reshape(flattened)
if weights is not None: weights = weights.reshape(flattened)
for i in range(len(new_shape)):
if weights is not None:
ndarray = operation(ndarray*weights, axis=-1*(i+1))
weights = numpy.sum(weights, axis=-1*(i+1))
ndarray /= weights
else:
ndarray = operation(ndarray, axis=-1*(i+1))
return ndarray
class BinnedStatistic(object):
"""
Lightweight class to hold statistics binned at fixed coordinates.
For example, this class could hold a grid of (r, mu) or (k, mu) bins
for a correlation function or power spectrum measurement.
It is modeled after the syntax of :class:`xarray.Dataset`, and is designed
to hold correlation function or power spectrum results (in 1D or 2D)
Parameters
----------
dims : list, (Ndim,)
A list of strings specifying names for the coordinate dimensions.
The dimension names stored in :attr:`dims` have the suffix 'cen'
added, to indicate that the coordinate grid is defined at the bin
centers
edges : list, (Ndim,)
A list specifying the bin edges for each dimension
data : array_like
a structured array holding the data variables, where the named
fields interpreted as the variable names. The variable names are
stored in :attr:`variables`
fields_to_sum : list, optional
the name of fields that will be summed when reindexing, instead
of averaging
**kwargs :
Any additional keywords are saved as metadata in the :attr:`attrs`
dictionary attribute

    Examples
    --------
    The following example shows how to read a power spectrum
    measurement from a JSON file, as output by nbodykit, assuming
    the JSON file holds a dictionary with a 'power' entry holding the
    relevant data:

    >>> filename = 'test_data.json'
    >>> pk = BinnedStatistic.from_json(filename, key='power', dims=['k'])
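
    A BinnedStatistic can also be constructed directly from bin edges and a
    structured data array (a minimal sketch with made-up values):

    >>> edges = [numpy.linspace(0.01, 1.0, 51)]
    >>> data = numpy.zeros(50, dtype=[('power', 'f8')])
    >>> ds = BinnedStatistic(['k'], edges, data)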

    In older versions of nbodykit, results were written as plaintext ASCII
    files. Although now deprecated, files of this type can still be read using:

    >>> filename = 'test_data.dat'
    >>> dset = BinnedStatistic.from_plaintext(['k'], filename)

    Data variables can be accessed in a dict-like fashion (here ``pkmu``
    denotes a 2D power spectrum measurement, read in the same manner):

    >>> power = pkmu['power'] # returns power data variable

    Array-like indexing of a :class:`BinnedStatistic` returns a new :class:`BinnedStatistic`
    holding the sliced data:

    >>> pkmu
    <BinnedStatistic: dims: (k: 200, mu: 5), variables: ('mu', 'k', 'power')>
    >>> pkmu[:,0] # select first mu column
    <BinnedStatistic: dims: (k: 200), variables: ('mu', 'k', 'power')>

    Additional data variables can be added to the :class:`BinnedStatistic` via:

    >>> modes = numpy.ones((200, 5))
    >>> pkmu['modes'] = modes

    Coordinate-based indexing is possible through :func:`sel`:

    >>> pkmu
    <BinnedStatistic: dims: (k: 200, mu: 5), variables: ('mu', 'k', 'power')>
    >>> pkmu.sel(k=slice(0.1, 0.4), mu=0.5)
    <BinnedStatistic: dims: (k: 30), variables: ('mu', 'k', 'power')>

    :func:`squeeze` will explicitly squeeze the specified dimension
    (of length one) such that the resulting instance has one less dimension:

    >>> pkmu
    <BinnedStatistic: dims: (k: 200, mu: 1), variables: ('mu', 'k', 'power')>
    >>> pkmu.squeeze(dim='mu') # can also just call pkmu.squeeze()
    <BinnedStatistic: dims: (k: 200), variables: ('mu', 'k', 'power')>

    :func:`average` returns a new :class:`BinnedStatistic` holding the
    data averaged over one dimension.

    :func:`reindex` will re-bin the coordinate arrays along the specified
    dimension.
    """
def __init__(self, dims, edges, data, fields_to_sum=[], coords=None, **kwargs):
# number of dimensions must match
if len(dims) != len(edges):
raise ValueError("size mismatch between specified `dims` and `edges`")
# input data must be structured array
if not isinstance(data, numpy.ndarray) or data.dtype.names is None:
raise TypeError("'data' should be a structured numpy array")
shape = tuple(len(e)-1 for e in edges)
if data.shape != shape:
args = (shape, data.shape)
raise ValueError("`edges` imply data shape of %s, but data has shape %s" %args)
self.dims = list(dims)
self.edges = dict(zip(self.dims, edges))
# coordinates are the bin centers
self.coords = {}
for i, dim in enumerate(self.dims):
if coords is not None and coords[i] is not None:
self.coords[dim] = numpy.copy(coords[i])
else:
self.coords[dim] = 0.5 * (edges[i][1:] + edges[i][:-1])
# store variables as a structured array
self.data = data.copy()
# define a mask such that a coordinate grid element will be masked
# if any of the variables at that coordinate are (NaN, inf)
self.mask = numpy.zeros(self.shape, dtype=bool)
for name in data.dtype.names:
self.mask = numpy.logical_or(self.mask, ~numpy.isfinite(self.data[name]))
# fields that we wish to sum, instead of averaging
        self._fields_to_sum = list(fields_to_sum)
# save and track metadata
self.attrs = {}
for k in kwargs: self.attrs[k] = kwargs[k]
@classmethod
def from_state(kls, state):
obj = kls(dims=state['dims'], edges=state['edges'], coords=state['coords'], data=state['data'])
obj.attrs.update(state['attrs'])
return obj
def __getstate__(self):
return dict(
dims=self.dims,
edges=[self.edges[dim] for dim in self.dims],
coords=[self.coords[dim] for dim in self.dims],
data=self.data,
attrs=self.attrs)
@property
def shape(self):
"""
The shape of the coordinate grid
"""
return tuple(len(self.coords[d]) for d in self.dims)
@property
def variables(self):
"""
Alias to return the names of the variables stored in `data`
"""
return list(self.data.dtype.names)
    @classmethod
    def __construct_direct__(cls, data, mask, **kwargs):
"""
Shortcut around __init__ for internal use to construct and
return a new class instance. The returned object should be
identical to that returned by __init__.
Notes
-----
* Useful for returning new instances with sliced data/mask
* The keyword arguments required to create a full, unbroken
instance are `dims`, `coords`, `edges`, and `attrs`
Parameters
----------
data :
"""
obj = object.__new__(cls)
for k in kwargs: setattr(obj, k, kwargs[k])
for k, d in zip(['data', 'mask'], [data, mask]):
setattr(obj, k, d)
if obj.shape != d.shape:
try:
setattr(obj, k, d.reshape(obj.shape))
                except ValueError:
                    raise ValueError("shape mismatch between data and coordinates")
return obj
    def __copy_attrs__(self):
"""
Return a copy of all necessary attributes associated with
the `BinnedStatistic`. This dictionary + `data` and `mask` are all
that's required to reconstruct a new class
"""
kw = {}
kw['dims'] = list(self.dims)
kw['edges'] = self.edges.copy()
kw['coords'] = self.coords.copy()
kw['attrs'] = self.attrs.copy()
kw['_fields_to_sum'] = list(self._fields_to_sum)
return kw
    def __finalize__(self, data, mask, indices):
"""
Finalize and return a new instance from a slice of the
current object (returns a copy)
"""
edges, coords = self.__slice_edges__(indices)
kw = {'dims':list(self.dims), 'edges':edges, 'coords':coords,
'attrs':self.attrs.copy(), '_fields_to_sum':self._fields_to_sum}
return self.__class__.__construct_direct__(data, mask, **kw)
    def __slice_edges__(self, indices):
"""
Internal function to slice the `edges` attribute with the
specified indices, which specify the included coordinate bins
"""
edges = {}
coords = {}
for i, dim in enumerate(self.dims):
if len(indices[i]) > 0:
idx = list(indices[i]) + [indices[i][-1]+1]
else:
idx = [0]
edges[dim] = self.edges[dim][idx]
coords[dim] = 0.5 * (edges[dim][1:] + edges[dim][:-1])
return edges, coords
def __str__(self):
name = self.__class__.__name__
dims = "(" + ", ".join(['%s: %d' %(k, self.shape[i]) for i, k in enumerate(self.dims)]) + ")"
if len(self.variables) < 5:
return "<%s: dims: %s, variables: %s>" %(name, dims, str(tuple(self.variables)))
else:
return "<%s: dims: %s, variables: %d total>" %(name, dims, len(self.variables))
def __repr__(self):
return self.__str__()
def __iter__(self):
return iter(self.variables)
def __contains__(self, key):
return key in self.variables
    def __setitem__(self, key, data):
"""
Add a new variable with the name `key` to the class using `data`
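
        Examples
        --------
        A sketch, assuming ``pkmu`` is an existing instance:

        >>> pkmu['modes'] = numpy.ones(pkmu.shape)  # hypothetical array matching the data shape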
"""
        data = numpy.asarray(data)
        if data.shape != self.data.shape:
            raise ValueError("data to be added must have shape %s" %str(self.data.shape))
# add the new (key, type) to the dtype, or if the key is present, overwrite
dtype = list(self.data.dtype.descr) # make copy
names = list(self.data.dtype.names) # make copy
if key in names:
i = names.index(key)
dtype.pop(i); names.pop(i)
dtype += [(key, data.dtype.type)]
# add old variables
dtype = numpy.dtype(dtype)
new = numpy.zeros(self.data.shape, dtype=dtype)
for col in names:
new[col] = self.data[col]
# the new data to add
new[key] = data
mask = numpy.logical_or(self.mask, ~numpy.isfinite(new[key]))
# save
self.data = new
self.mask = mask
    def __getitem__(self, key):
"""
Index- or string- based indexing
Notes
-----
* If a single string is passed, the key is intrepreted
as a `variable` or `coordinate`, and the corresponding
array is returned
* If a list of strings is passed, then a new `BinnedStatistic`
holding only the `variable` names in `key` is returned
* Integer-based indexing or slices similar to numpy
indexing will slice `data`, returning a new
`BinnedStatistic` holding the newly sliced data and coordinate grid
* Scalar indexes (i.e., integers) used to index a certain
dimension will "squeeze" that dimension, removing it
from the coordinate grid
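
        Examples
        --------
        A few hypothetical sketches, assuming ``pkmu`` is a 2D measurement:

        >>> pkmu[['k', 'power']]   # new BinnedStatistic with a subset of variables
        >>> pkmu[:, 0]             # slice the grid; squeezes the second dimension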
"""
# if single string passed, return a coordinate or variable
if isinstance(key, str):
if key in self.variables:
return self.data[key]
else:
raise KeyError("`%s` is not a valid variable name" %key)
# indices to slice the data with
indices = [list(range(0, self.shape[i])) for i in range(len(self.dims))]
# check for list/tuple of variable names
# if so, return a BinnedStatistic with slice of columns
if isinstance(key, (list, tuple)) and all(isinstance(x, str) for x in key):
if all(k in self.variables for k in key):
return self.__finalize__(self.data[list(key)], self.mask.copy(), indices)
else:
invalid = ', '.join("'%s'" %k for k in key if k not in self.variables)
raise KeyError("cannot slice variables -- invalid names: (%s)" %invalid)
key_ = key
# if key is a single integer or single slice, make it a list
        make_iterable = isinstance(key, slice) or isinstance(key, int)
        if make_iterable or (isinstance(key, list) and all(isinstance(x, int) for x in key)):
key_ = [key]
squeezed_dims = []
for i, subkey in enumerate(key_):
if i >= len(self.dims):
raise IndexError("too many indices for BinnedStatistic; note that ndim = %d" %len(self.dims))
if isinstance(subkey, int):
indices[i] = [subkey]
squeezed_dims.append(self.dims[i])
elif isinstance(subkey, list):
indices[i] = subkey
elif isinstance(subkey, slice):
indices[i] = list(range(*subkey.indices(self.shape[i])))
# can't squeeze all dimensions!!
if len(squeezed_dims) == len(self.dims):
raise IndexError("cannot return object with all remaining dimensions squeezed")
# fail nicely if we fail at all
try:
toret = self.__finalize__(self.data[key], self.mask[key], indices)
for dim in squeezed_dims:
toret = toret.squeeze(dim)
return toret
except ValueError:
raise IndexError("this type of slicing not implemented")
def _get_index(self, dim, val, method=None):
"""
Internal function to compute the bin index of the nearest
coordinate value to the input value
"""
index = self.coords[dim]
if method == 'nearest':
i = (numpy.abs(index-val)).argmin()
else:
try:
i = list(index).index(val)
except Exception as e:
args = (dim, str(e))
msg = "error converting '%s' index; try setting `method = 'nearest'`: %s"
raise IndexError(msg %args)
return i
#--------------------------------------------------------------------------
# user-called functions
#--------------------------------------------------------------------------
    def to_json(self, filename):
"""
Write a BinnedStatistic from a JSON file.
.. note::
This uses :class:`nbodykit.utils.JSONEncoder` to write the
JSON file
Parameters
----------
filename : str
the name of the file to write
"""
import json
from nbodykit.utils import JSONEncoder
state = self.__getstate__()
with open(filename, 'w') as ff:
json.dump(state, ff, cls=JSONEncoder)
    @classmethod
    def from_json(cls, filename, key='data', dims=None, edges=None, **kwargs):
"""
Initialize a BinnedStatistic from a JSON file.
The JSON file should contain a dictionary, where the data to load is stored
as the ``key`` entry, with an ``edges`` entry specifying bin edges, and
optionally, a ``attrs`` entry giving a dict of meta-data
.. note::
This uses :class:`nbodykit.utils.JSONDecoder` to load the
JSON file
Parameters
----------
filename : str
the name of the file to load
key : str, optional
the name of the key in the JSON file holding the data to load
dims : list, optional
list of names specifying the dimensions, i.e., ``['k']`` or ``['k', 'mu']``;
must be supplied if not given in the JSON file
Returns
-------
dset : BinnedStatistic
the BinnedStatistic holding the data from file
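
        Examples
        --------
        A JSON file written by :func:`to_json` holds entries of roughly this
        form (a hypothetical sketch)::

            {"data": ..., "dims": ["k"], "edges": [...], "coords": [...], "attrs": {...}}

        and can be loaded via:

        >>> dset = BinnedStatistic.from_json('test_data.json')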
"""
import json
from nbodykit.utils import JSONDecoder
# parse
with open(filename, 'r') as ff:
state = json.load(ff, cls=JSONDecoder)
# the data
        if key not in state:
            args = (key, tuple(state.keys()))
            raise ValueError("no '%s' entry found in the JSON file; valid keys are %s" %args)
data = state[key]
# the dimensions
dims = state.pop('dims', dims)
if dims is None:
raise ValueError("no `dims` found in JSON file; please specify as keyword argument")
# the edges
edges = state.pop('edges', edges)
if edges is None:
raise ValueError("no `edges` found in JSON file; please specify as keyword argument")
# the coords
coords = state.pop('coords', None)
# meta-data
attrs = state.pop('attrs', {})
attrs.update(kwargs)
return cls(dims, edges, data, coords=coords, **attrs)
    @classmethod
    def from_plaintext(cls, dims, filename, **kwargs):
"""
Initialize a BinnedStatistic from a plaintext file
.. note:: Deprecated in nbodykit 0.2.x
Storage of BinnedStatistic objects as plaintext ASCII files is no longer supported;
See :func:`BinnedStatistic.from_json`
Parameters
----------
dims : list
list of names specifying the dimensions, i.e., ``['k']`` or ``['k', 'mu']``
filename : str
the name of the file to load
Returns
-------
dset : BinnedStatistic
the BinnedStatistic holding the data from file
"""
import warnings
msg = "storage of BinnedStatistic objects as ASCII plaintext files is deprecated; see BinnedStatistic.from_json"
warnings.warn(msg, FutureWarning, stacklevel=2)
# make sure dims is a list/tuple
if not isinstance(dims, (tuple, list)):
raise TypeError("`dims` should be a list or tuple of strings")
# read from file
        try:
            if len(dims) == 1:
                data, meta = _Read1DPlainText(filename)
            elif len(dims) == 2:
                data, meta = _Read2DPlainText(filename)
            else:
                raise ValueError("`dims` must have length 1 or 2")
except Exception as e:
msg = "unable to read plaintext file, perhaps the dimension of the file does "
msg += "not match the passed `dims`;\nexception: %s" %str(e)
raise ValueError(msg)
# get the bin edges
edges = meta.pop('edges', None)
if edges is None:
raise ValueError("plaintext file does not include `edges`; cannot be loaded into a BinnedStatistic")
if len(dims) == 1:
edges = [edges]
meta.update(kwargs)
return cls(dims, edges, data, **meta)
    def copy(self, cls=None):
"""
Returns a copy of the BinnedStatistic, optionally change the type
to cls. cls must be a subclass of BinnedStatistic.
"""
attrs = self.__copy_attrs__()
if cls is None:
cls = self.__class__
if not issubclass(cls, BinnedStatistic):
raise TypeError("The cls argument must be a subclass of BinnedStatistic")
return cls.__construct_direct__(self.data.copy(), self.mask.copy(), **attrs)
    def rename_variable(self, old_name, new_name):
"""
Rename a variable in :attr:`data` from ``old_name`` to ``new_name``.
Note that this procedure is performed in-place (does not
return a new BinnedStatistic)
Parameters
----------
old_name : str
the name of the old varibale to rename
new_name : str
the desired new variable name
Raises
------
ValueError
If `old_name` is not present in :attr:`variables`
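
        Examples
        --------
        A sketch, renaming a hypothetical 'power' variable in-place:

        >>> pkmu.rename_variable('power', 'P')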
"""
import copy
if old_name not in self.variables:
raise ValueError("`%s` is not an existing variable name" %old_name)
new_dtype = copy.deepcopy(self.data.dtype)
names = list(new_dtype.names)
names[names.index(old_name)] = new_name
new_dtype.names = names
self.data.dtype = new_dtype
    def sel(self, method=None, **indexers):
"""
Return a new BinnedStatistic indexed by coordinate values along the
specified dimension(s).
Notes
-----
Scalar values used to index a specific dimension will result in
that dimension being squeezed. To keep a dimension of unit length,
use a list to index (see examples below).
Parameters
----------
method : {None, 'nearest'}
The method to use for inexact matches; if set to `None`, require
an exact coordinate match, otherwise match the nearest coordinate
**indexers :
the pairs of dimension name and coordinate value used to index
the BinnedStatistic
Returns
-------
sliced : BinnedStatistic
a new BinnedStatistic holding the sliced data and coordinate grid
Examples
--------
>>> pkmu
<BinnedStatistic: dims: (k: 200, mu: 5), variables: ('mu', 'k', 'power')>
>>> pkmu.sel(k=0.4)
<BinnedStatistic: dims: (mu: 5), variables: ('mu', 'k', 'power')>
>>> pkmu.sel(k=[0.4])
<BinnedStatistic: dims: (k: 1, mu: 5), variables: ('mu', 'k', 'power')>
>>> pkmu.sel(k=slice(0.1, 0.4), mu=0.5)
<BinnedStatistic: dims: (k: 30), variables: ('mu', 'k', 'power')>
"""
indices = {}
squeezed_dims = []
for dim, key in indexers.items():
if isinstance(key, list):
indices[dim] = [self._get_index(dim, k, method=method) for k in key]
elif isinstance(key, slice):
new_slice = []
for name in ['start', 'stop']:
new_slice.append(self._get_index(dim, getattr(key, name), method=method))
i = self.dims.index(dim)
indices[dim] = list(range(*slice(*new_slice).indices(self.shape[i])))
elif not numpy.isscalar(key):
raise IndexError("please index using a list, slice, or scalar value")
else:
indices[dim] = [self._get_index(dim, key, method=method)]
squeezed_dims.append(dim)
# can't squeeze all dimensions!!
if len(squeezed_dims) == len(self.dims):
raise IndexError("cannot return object with all remaining dimensions squeezed")
toret = self.take(**indices)
for dim in squeezed_dims:
toret = toret.squeeze(dim)
return toret
    def take(self, *masks, **indices):
"""
Take a subset of a BinnedStatistic from given list of indices.
This is more powerful but more verbose than `sel`. Also the
result is never squeezed, even if only a single item along the direction
is used.
Parameters
----------
masks : array_like (boolean)
a list of masks that are of the same shape as the data.
indices: dict (string : array_like)
mapping from axes (by name, dim) to items to select (list/array_like).
Each item is a valid selector for numpy's fancy indexing.
Returns
-------
new BinnedStatistic, where only items selected by all axes are kept.
Examples
--------
>>> pkmu
<BinnedStatistic: dims: (k: 200, mu: 5), variables: ('mu', 'k', 'power')>
# similar to pkmu.sel(k > 0.4), select the bin centers
>>> pkmu.take(k=pkmu.coords['k'] > 0.4)
<BinnedStatistic: dims: (mu: 5), variables: ('mu', 'k', 'power')>
# also similar to pkmu.sel(k > 0.4), select the bin averages
>>> pkmu.take(pkmu['k'] > 0.4)
<BinnedStatistic: dims: (k: 30), variables: ('mu', 'k', 'power')>
# impossible with sel.
>>> pkmu.take(pkmu['modes'] > 0)
"""
indices_dict = {}
indices_dict.update(indices) # rename for a cleaner API.
# flatten the masks, will keep items that are true everywhere
mask = numpy.ones(self.shape, dtype='?')
for m in masks: mask = mask & m
indices = [numpy.ones(self.shape[i], dtype='?') for i in range(len(self.dims))]
# update indices with masks
for i, dim in enumerate(self.dims):
axis = list(range(len(self.dims)))
axis.remove(i)
axis = tuple(axis)
mask1 = mask.all(axis=axis)
indices[i] &= mask1
# update indices with indices_dict
for dim, index in indices_dict.items():
i = self.dims.index(dim)
if isinstance(index, numpy.ndarray) and index.dtype == numpy.dtype('?'):
# boolean mask?
assert index.ndim == 1
indices[i] &= index
else:
mask1 = numpy.zeros(self.shape[i], dtype='?')
mask1.put(index, True)
indices[i] &= mask1
# convert to indices
for i in range(len(indices)):
indices[i] = indices[i].nonzero()[0]
data = self.data.copy()
mask = self.mask.copy()
for i, idx in enumerate(indices):
data = numpy.take(data, idx, axis=i)
mask = numpy.take(mask, idx, axis=i)
toret = self.__finalize__(data, mask, indices)
return toret
    def squeeze(self, dim=None):
"""
Squeeze the BinnedStatistic along the specified dimension, which
removes that dimension from the BinnedStatistic.
The behavior is similar to that of :func:`numpy.squeeze`.
Parameters
----------
dim : str, optional
The name of the dimension to squeeze. If no dimension
is provided, then the one dimension with unit length will
be squeezed
Returns
-------
squeezed : BinnedStatistic
a new BinnedStatistic instance, squeezed along one dimension
Raises
------
ValueError
If the specified dimension does not have length one, or
no dimension is specified and multiple dimensions have
length one
Examples
--------
>>> pkmu
<BinnedStatistic: dims: (k: 200, mu: 1), variables: ('mu', 'k', 'power')>
>>> pkmu.squeeze() # squeeze the mu dimension
<BinnedStatistic: dims: (k: 200), variables: ('mu', 'k', 'power')>
"""
# infer the right dimension to squeeze
if dim is None:
dim = [k for k in self.dims if len(self.coords[k]) == 1]
if not len(dim):
raise ValueError("no available dimensions with length one to squeeze")
if len(dim) > 1:
raise ValueError("multiple dimensions available to squeeze -- please specify")
dim = dim[0]
else:
if dim not in self.dims:
raise ValueError("`%s` is not a valid dimension name" %dim)
if len(self.coords[dim]) != 1:
raise ValueError("the `%s` dimension must have length one to squeeze" %dim)
# remove the dimension from the grid
i = self.dims.index(dim)
toret = self.copy()
toret.dims.pop(i); toret.edges.pop(dim); toret.coords.pop(dim)
if not len(toret.dims):
raise ValueError("cannot squeeze the only remaining axis")
# construct new object with squeezed data/mask
attrs = toret.__copy_attrs__()
d = toret.data.squeeze(axis=i)
m = toret.mask.squeeze(axis=i)
return self.__construct_direct__(d, m, **attrs)
    def average(self, dim, **kwargs):
"""
Compute the average of each variable over the specified dimension.
Parameters
----------
dim : str
The name of the dimension to average over
**kwargs :
Additional keywords to pass to :func:`BinnedStatistic.reindex`. See the
documentation for :func:`BinnedStatistic.reindex` for valid keywords.
Returns
-------
averaged : BinnedStatistic
A new BinnedStatistic, with data averaged along one dimension,
which reduces the number of dimension by one
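
        Examples
        --------
        A hypothetical sketch, averaging a 2D measurement over ``mu``:

        >>> pkmu
        <BinnedStatistic: dims: (k: 200, mu: 5), variables: ('mu', 'k', 'power')>
        >>> pkmu.average('mu')
        <BinnedStatistic: dims: (k: 200), variables: ('mu', 'k', 'power')>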
"""
spacing = (self.edges[dim][-1] - self.edges[dim][0])
toret = self.reindex(dim, spacing, **kwargs)
return toret.sel(**{dim:toret.coords[dim][0]})
    def reindex(self,
dim,
spacing,
weights=None,
force=True,
return_spacing=False,
fields_to_sum=[]):
"""
Reindex the dimension ``dim`` by averaging over multiple coordinate bins,
optionally weighting by ``weights``.
Returns a new BinnedStatistic holding the re-binned data.
Notes
-----
* We can only re-bin to an integral factor of the current
dimension size in order to inaccuracies when re-binning to
overlapping bins
* Variables specified in `fields_to_sum` will
be summed when re-indexing, instead of averaging

        Parameters
        ----------
        dim : str
            The name of the dimension to average over
        spacing : float
            The desired spacing for the re-binned data. If `force = True`,
            the spacing used will be the closest value to this value, such
            that the new bins are N times larger, where N is an integer
        weights : array_like or str, optional (`None`)
            An array to weight the data by before re-binning, or if
            a string is provided, the name of a data column to use as weights
        force : bool, optional
            If `True`, force the spacing to be a value such
            that the new bins are N times larger, where N is an integer;
            otherwise, raise an exception. Default is `True`
        return_spacing : bool, optional
            If `True`, return the new spacing as the second return value.
            Default is `False`.
        fields_to_sum : list
            the names of fields that will be summed when reindexing, instead
            of being averaged

        Returns
        -------
        rebinned : BinnedStatistic
            A new BinnedStatistic instance, which holds the rebinned coordinate
            grid and data variables
        spacing : float, optional
            If `return_spacing` is `True`, the new coordinate spacing
            will be returned
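
        Examples
        --------
        A hypothetical sketch, assuming the original ``k`` spacing is 0.005,
        so a requested spacing of 0.01 doubles the bin size:

        >>> pkmu
        <BinnedStatistic: dims: (k: 200, mu: 5), variables: ('mu', 'k', 'power')>
        >>> pkmu.reindex('k', 0.01)
        <BinnedStatistic: dims: (k: 100, mu: 5), variables: ('mu', 'k', 'power')>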
"""
        i = self.dims.index(dim)
        fields_to_sum = list(fields_to_sum) + list(self._fields_to_sum)
        # determine the new binning
        old_spacings = numpy.diff(self.coords[dim])
        if not numpy.allclose(old_spacings, old_spacings[0]):
            raise ValueError("`reindex` requires even bin spacings")
        old_spacing = old_spacings[0]
        factor = numpy.round(spacing/old_spacing).astype('int')
        if not factor:
            raise ValueError("new spacing must be larger than original spacing of %.2e" %old_spacing)
        if factor == 1:
            raise ValueError("closest binning size to input spacing is the same as current binning")
if not numpy.allclose(old_spacing*factor, spacing) and not force:
raise ValueError("if `force = False`, new bin spacing must be an integral factor smaller than original")
# make a copy of the data
data = self.data.copy()
# get the weights
if isinstance(weights, str):
if weights not in self.variables:
raise ValueError("cannot weight by `%s`; no such column" %weights)
weights = self.data[weights]
edges = self.edges[dim]
new_shape = list(self.shape)
# check if we need to discard bins from the end
leftover = self.shape[i] % factor
        if leftover and not force:
            args = (leftover, old_spacing*factor)
            raise ValueError("cannot re-bin because there are %d extra bins, using spacing = %.2e" %args)
if leftover:
sl = [slice(None, None)]*len(self.dims)
sl[i] = slice(None, -leftover)
data = data[tuple(sl)]
            if weights is not None: weights = weights[tuple(sl)]
edges = edges[:-leftover]
new_shape[i] = new_shape[i] - leftover
# new edges
        new_shape[i] = new_shape[i] // factor
new_edges = numpy.linspace(edges[0], edges[-1], new_shape[i]+1)
# the re-binned data
new_data = numpy.empty(new_shape, dtype=self.data.dtype)
for name in self.variables:
operation = numpy.nanmean
weights_ = weights
if weights is not None or name in fields_to_sum:
operation = numpy.nansum
if name in fields_to_sum: weights_ = None
new_data[name] = bin_ndarray(data[name], new_shape, weights=weights_, operation=operation)
# the new mask
new_mask = numpy.zeros_like(new_data, dtype=bool)
for name in self.variables:
new_mask = numpy.logical_or(new_mask, ~numpy.isfinite(new_data[name]))
# construct new object
kw = self.__copy_attrs__()
kw['edges'][dim] = new_edges
kw['coords'][dim] = 0.5*(new_edges[1:] + new_edges[:-1])
toret = self.__construct_direct__(new_data, new_mask, **kw)
return (toret, spacing) if return_spacing else toret
#------------------------------------------------------------------------------
# Deprecated Plaintext read/write functions
#------------------------------------------------------------------------------
def _Read2DPlainText(filename):
"""
Reads the plain text storage of a 2D measurement
Returns
-------
data : dict
dictionary holding the `edges` data, as well as the
data columns for the P(k,mu) measurement
metadata : dict
any additional metadata to store as part of the
P(k,mu) measurement
"""
d = {}
metadata = {}
with open(filename, 'r') as ff:
        # read the number of k and mu bins from the first line
Nk, Nmu = [int(l) for l in ff.readline().split()]
N = Nk*Nmu
# names of data columns on second line
columns = ff.readline().split()
lines = ff.readlines()
data = numpy.array([float(l) for line in lines[:N] for l in line.split()])
data = data.reshape((Nk, Nmu, -1)) #reshape properly to (Nk, Nmu)
# make a dict, making complex arrays from real/imag parts
i = 0
while i < len(columns):
name = columns[i]
nextname = columns[i+1] if i < len(columns)-1 else ''
if name.endswith('.real') and nextname.endswith('.imag'):
name = name.split('.real')[0]
d[name] = data[...,i] + 1j*data[...,i+1]
i += 2
else:
d[name] = data[...,i]
i += 1
# store variables as a structured array
dtypes = numpy.dtype([(name, d[name].dtype) for name in d])
data = numpy.empty(data.shape[:2], dtype=dtypes)
for name in d:
data[name] = d[name]
# read the edges for k and mu bins
edges = []
l1 = int(lines[N].split()[-1]); N = N+1
edges.append(numpy.array([float(line) for line in lines[N:N+l1]]))
l2 = int(lines[N+l1].split()[-1]); N = N+l1+1
edges.append(numpy.array([float(line) for line in lines[N:N+l2]]))
metadata['edges'] = edges
# read any metadata
if len(lines) > N+l2:
N_meta = int(lines[N+l2].split()[-1])
N = N + l2 + 1
meta = lines[N:N+N_meta]
for line in meta:
fields = line.split()
                cast = fields[-1]
                if hasattr(builtins, cast):
                    metadata[fields[0]] = getattr(builtins, cast)(fields[1])
                elif hasattr(numpy, cast):
                    metadata[fields[0]] = getattr(numpy, cast)(fields[1])
                else:
                    raise TypeError("metadata must have a builtin or numpy type")
return data, metadata
def _Read1DPlainText(filename):
"""
Reads the plain text storage of a 1D measurement
Notes
-----
* If `edges` is present in the file, they will be returned
as part of the metadata, with the key `edges`
* If the first line of the file specifies column names,
they will be returned as part of the metadata with the
`columns` key
Returns
-------
data : array_like
the 1D data stacked vertically, such that each columns
represents a separate data variable
metadata : dict
any additional metadata to store as part of the
P(k) measurement
"""
# data list
data = []
# extract the metadata
metadata = {}
make_float = lambda x: float(x[1:])
with open(filename, 'r') as ff:
currline = 0
lines = ff.readlines()
# try to read columns
        if lines[0][0] == '#':
            metadata['columns'] = lines[0][1:].split()
while True:
# break if we are at the EOF
if currline == len(lines): break
line = lines[currline]
            if not line.strip():
currline += 1
continue
if line[0] != '#':
data.append([float(l) for l in line.split()])
else:
line = line[1:]
# read edges
if 'edges' in line:
fields = line.split()
N = int(fields[-1]) # number of edges
metadata['edges'] = numpy.array([make_float(l) for l in lines[currline+1:currline+1+N]])
currline += 1+N
continue
# read metadata
if 'metadata' in line:
# read and cast the metadata properly
fields = line.split()
N = int(fields[-1]) # number of individual metadata lines
for i in range(N):
fields = lines[currline+1+i][1:].split()
cast = fields[-1]
                        if hasattr(builtins, cast):
                            metadata[fields[0]] = getattr(builtins, cast)(fields[1])
                        elif hasattr(numpy, cast):
                            metadata[fields[0]] = getattr(numpy, cast)(fields[1])
                        else:
                            raise TypeError("metadata must have a builtin or numpy type")
currline += 1+N
continue
# add to the data
currline += 1
data = numpy.asarray(data)
# get names of columns, using default if not in file
columns = metadata.pop('columns', ['col_%d' %i for i in range(data.shape[1])])
# make a dict, making complex arrays from real/imag parts
i = 0
d = {}
while i < len(columns):
name = columns[i]
nextname = columns[i+1] if i < len(columns)-1 else ''
if name.endswith('.real') and nextname.endswith('.imag'):
name = name.split('.real')[0]
d[name] = data[...,i] + 1j*data[...,i+1]
i += 2
else:
d[name] = data[...,i]
i += 1
# store variables as a structured array
dtypes = numpy.dtype([(name, d[name].dtype) for name in d])
data = numpy.empty(len(data), dtype=dtypes)
for name in d:
data[name] = d[name]
return data, metadata