Source code for nbodykit.algorithms.zhist

import numpy
import logging
from mpi4py import MPI

from nbodykit import CurrentMPIComm
from nbodykit.transform import ConstantArray
from scipy.interpolate import InterpolatedUnivariateSpline

[docs]class RedshiftHistogram(object): """ Compute the mean number density as a function of redshift :math:`n(z)` from an input CatalogSource of particles. Results are computed when the object is inititalized. See the documenation of :func:`~RedshiftHistogram.run` for the attributes storing the results. .. note:: The units of the number density are :math:`(\mathrm{Mpc}/h)^{-3}` Parameters ---------- source : CatalogSource the source of particles holding the redshift column to histogram fsky : float the sky area fraction, which is used in the volume calculation when normalizing :math:`n(z)` cosmo : :class:`nbodykit.cosmology.core.Cosmology` the cosmological parameters, which are used to compute the volume from redshift shells when normalizing :math:`n(z)` bins : int or sequence of scalars, optional If `bins` is an int, it defines the number of equal-width bins in the given range. If `bins` is a sequence, it defines the bin edges, including the rightmost edge, allowing for non-uniform bin widths. If not provided, Scott's rule is used to estimate the optimal bin width from the input data (default) redshift : str, optional the name of the column specifying the redshift data weight : str, optional the name of the column specifying weights to use when histogramming the data """ logger = logging.getLogger('RedshiftHistogram') def __init__(self, source, fsky, cosmo, bins=None, redshift='Redshift', weight=None): # input columns need to be there for col in [redshift, weight]: if col is not None and col not in source: raise ValueError("'%s' column missing from input source in RedshiftHistogram" %col) self.comm = source.comm # using Scott's rule for binning if bins is None: h, bins = scotts_bin_width(source.compute(source[redshift]), self.comm) if self.comm.rank == 0: self.logger.info("using Scott's rule to determine optimal binning; h = %.2e, N_bins = %d" %(h, len(bins)-1)) # equally spaced bins from min to max val elif numpy.isscalar(bins): if self.comm.rank == 0: self.logger.info("computing %d equally spaced bins" %bins) z = source.compute(source[redshift]) maxval = self.comm.allreduce(z.max(), op=MPI.MAX) minval = self.comm.allreduce(z.min(), op=MPI.MIN) bins = linspace(minval, maxval, bins + 1, endpoint=True) self.source = source self.cosmo = cosmo self.attrs = {} self.attrs['edges'] = bins self.attrs['fsky'] = fsky self.attrs['redshift'] = redshift self.attrs['weight'] = weight self.attrs['cosmo'] = dict(cosmo) # and run self.run()
[docs] def run(self): """ Run the algorithm, which computes the histogram. This function does not return anything, but adds the following attributes to the class: - :attr:`bin_edges` - :attr:`bin_centers` - :attr:`dV` - :attr:`nbar` .. note:: All ranks store the same result attributes. Attributes ---------- bin_edges : array_like the edges of the redshift bins bin_centers : array_like the center values of each redshift bin dV : array_like the volume of each redshift shell in units of :math:`(\mathrm{Mpc}/h)^3` nbar : array_like the values of the redshift histogram, normalized to number density (in units of :math:`(\mathrm{Mpc}/h)^{-3}`) """ edges = self.attrs['edges'] # get the columns redshift = self.source[self.attrs['redshift']] if self.attrs['weight'] is not None: weight = self.source[self.attrs['weight']] if self.comm.rank == 0: self.logger.info("computing histogram using weights from '%s' column" %self.attrs['weight']) else: weight = ConstantArray(1.0, self.source.size) # compute the numpy arrays from dask redshift, weight = self.source.compute(redshift, weight) # do the bin count, using the specified weight values dig = numpy.searchsorted(edges, redshift, "right") N = numpy.bincount(dig, weights=weight, minlength=len(edges)+1)[1:-1] # now sum across all ranks N = self.comm.allreduce(N) # compute the volume if self.comm.rank == 0: self.logger.info("using cosmology %s to compute volume in units of (Mpc/h)^3" %str(self.cosmo)) self.logger.info("sky fraction used in volume calculation: %.4f" %self.attrs['fsky']) R_hi = self.cosmo.comoving_distance(edges[1:]) # in Mpc/h R_lo = self.cosmo.comoving_distance(edges[:-1]) # in Mpc/h dV = (4./3.)*numpy.pi*(R_hi**3 - R_lo**3) * self.attrs['fsky'] # store the results self.bin_edges = edges self.bin_centers = 0.5*(edges[:-1] + edges[1:]) self.dV = dV self.nbar = 1.*N/dV
[docs] def interpolate(self, z, ext='zeros'): """ Interpoalte dndz as a function of redshift. The interpolation acts as a band pass filter, removing small scale fluctuations in the estimator. Parameters ---------- z : array_like redshift ext : 'extrapolate', 'zeros', 'raise', 'const' how to deal with values out of bound. Returns ------- n : n(z) """ nofz = InterpolatedUnivariateSpline(self.bin_centers, self.nbar, ext=ext) return nofz(z)
def __getstate__(self): state = dict( bin_edges=self.bin_edges, bin_centers=self.bin_centers, dV=self.dV, nbar=self.nbar, attrs=self.attrs) return state def __setstate__(self, state): self.__dict__.update(state)
[docs] def save(self, output): """ Save the RedshiftHistogram result to disk. The format is JSON. """ import json from nbodykit.utils import JSONEncoder # only the master rank writes if self.comm.rank == 0: self.logger.info('histogram done; saving result to %s' %output) with open(output, 'w') as ff: json.dump(self.__getstate__(), ff, cls=JSONEncoder)
[docs] @classmethod @CurrentMPIComm.enable def load(cls, output, comm=None): """ Load a saved RedshiftHistogram result. The result has been saved to disk with :func:`RedshiftHistogram.save`. """ import json from nbodykit.utils import JSONDecoder if comm.rank == 0: with open(output, 'r') as ff: state = json.load(ff, cls=JSONDecoder) else: state = None state = comm.bcast(state) self = object.__new__(cls) self.__setstate__(state) self.comm = comm return self
[docs]def scotts_bin_width(data, comm): r""" Return the optimal histogram bin width using Scott's rule, defined as: .. math:: h = \sigma \sqrt[3]{\frac{24 * \sqrt{\pi}}{n}} .. note:: This is a collective operation Parameters ---------- data : array_like the array that we are histograming comm : the MPI communicator Returns ------- dx : float the bin spacing edges : array_like the array holding the bin edges """ # compute the mean csum = comm.allreduce(data.sum()) csize = comm.allreduce(data.size) cmean = csum / csize # std dev rsum = comm.allreduce((abs(data - cmean)**2).sum()) sigma = (rsum / csize)**0.5 dx = sigma * (24. * numpy.sqrt(numpy.pi) / csize) ** (1. / 3) maxval = comm.allreduce(data.max(), op=MPI.MAX) minval = comm.allreduce(data.min(), op=MPI.MIN) Nbins = numpy.ceil((maxval - minval) * 1. / dx) Nbins = max(1, Nbins) edges = minval + dx * numpy.arange(Nbins + 1) return dx, edges