Source code for pycbc.inference.sampler.refine

""" Sampler that uses kde refinement of an existing posterior estimate.
"""

import logging
import numpy
import numpy.random

from scipy.special import logsumexp
from scipy.stats import gaussian_kde
from scipy.stats import entropy as sentropy

from pycbc.inference import models
from pycbc.pool import choose_pool
from pycbc.inference.io import loadfile

from .base import setup_output, initial_dist_from_config
from .dummy import DummySampler


def call_model(params):
    """Evaluate the global model instance at the given parameter values."""
    models._global_instance.update(**params)
    return (
        models._global_instance.logposterior,
        models._global_instance.loglikelihood,
    )


def resample_equal(samples, logwt, seed=0):
    """Resample a weighted set of samples into an equally weighted set.

    Uses systematic resampling: samples are repeated or dropped in
    proportion to their normalized weights, then shuffled.
    """
    weights = numpy.exp(logwt - logsumexp(logwt))
    N = len(weights)
    positions = (numpy.random.random() + numpy.arange(N)) / N
    idx = numpy.zeros(N, dtype=int)
    cumulative_sum = numpy.cumsum(weights)
    cumulative_sum /= cumulative_sum[-1]
    i, j = 0, 0
    while i < N:
        if positions[i] < cumulative_sum[j]:
            idx[i] = j
            i += 1
        else:
            j += 1
    try:
        rng = numpy.random.default_rng(seed)
    except AttributeError:
        # numpy pre-1.17 uses RandomState
        # Py27: delete this after we drop python 2.7 support
        rng = numpy.random.RandomState(seed)
    rng.shuffle(idx)
    return {p: samples[p][idx] for p in samples}
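
# Illustrative example (not part of this module): converting a weighted set
# of samples to an equally weighted set with ``resample_equal``. The
# parameter name and values below are placeholders.
#
#     samples = {'x': numpy.array([0.1, 0.5, 0.9, 1.3])}
#     logwt = numpy.array([-1.0, -0.5, -2.0, -0.1])
#     equal = resample_equal(samples, logwt, seed=42)
#     # ``equal['x']`` has the same length as the input; entries are repeated
#     # or dropped in proportion to exp(logwt), then shuffled.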


class RefineSampler(DummySampler):
    """Sampler for kde-drawn refinement of an existing posterior estimate

    Parameters
    ----------
    model : Model
        An instance of a model from ``pycbc.inference.models``.
    num_samples : int
        The number of samples to draw from the kde at the conclusion.
    iterative_kde_samples : int
        The number of samples to add to the kde during each iteration.
    min_refinement_steps : int
        The minimum number of iterations to take.
    max_refinement_steps : int
        The maximum number of refinement steps to take.
    offbase_fraction : float
        Convergence threshold on the fraction of samples whose log
        posterior falls more than 5 below the maximum.
    entropy : float
        The target entropy between iterative kdes.
    dlogz : float
        The target evidence difference between iterative kde updates.
    kde : kde
        The initial kde to use.
    update_groups : str
        Space-separated names of keyword options, each of which gives a
        space-separated list of parameters (or "all") to refine as a group.
    """

    name = "refine"

    def __init__(
        self,
        model,
        *args,
        nprocesses=1,
        use_mpi=False,
        num_samples=int(1e5),
        iterative_kde_samples=int(1e3),
        min_refinement_steps=5,
        max_refinement_steps=40,
        offbase_fraction=0.7,
        entropy=0.01,
        dlogz=0.01,
        kde=None,
        update_groups=None,
        max_kde_samples=int(5e4),
        **kwargs
    ):
        super().__init__(model, *args)
        self.model = model
        self.kde = kde
        self.vparam = model.variable_params
        models._global_instance = model
        self.pool = choose_pool(mpi=use_mpi, processes=nprocesses)

        self._samples = {}

        self.num_samples = int(num_samples)
        self.iterative_kde_samples = int(iterative_kde_samples)
        self.max_kde_samples = int(max_kde_samples)
        self.min_refinement_steps = int(min_refinement_steps)
        self.max_refinement_steps = int(max_refinement_steps)
        self.offbase_fraction = float(offbase_fraction)
        self.entropy = float(entropy)
        self.dlogz_target = float(dlogz)

        self.param_groups = []
        if update_groups is None:
            self.param_groups.append(self.vparam)
        else:
            for gname in update_groups.split():
                gvalue = kwargs[gname]
                if gvalue == "all":
                    self.param_groups.append(self.vparam)
                else:
                    self.param_groups.append(gvalue.split())
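
    # Illustrative note (not part of this module): ``update_groups`` names
    # keyword options that each hold a space-separated parameter list (or
    # "all"). The parameter names below are placeholders.
    #
    #     RefineSampler(model, update_groups="intrinsic extrinsic",
    #                   intrinsic="mass1 mass2", extrinsic="distance")
    #
    # would refine the two groups of parameters one after the other.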

    def draw_samples(self, size, update_params=None):
        """Draw new samples within the model priors"""
        logging.info("getting from kde")
        params = {}
        ksamples = self.kde.resample(size=size)

        j = 0
        for i, k in enumerate(self.vparam):
            if update_params is not None and k not in update_params:
                params[k] = numpy.ones(size) * self.fixed_samples[i]
            else:
                params[k] = ksamples[j, :]
                j += 1

        logging.info("checking prior")
        keep = self.model.prior_distribution.contains(params)
        logging.info("done checking")
        r = numpy.array([params[k][keep] for k in self.vparam])
        return r

    @staticmethod
    def compare_kde(kde1, kde2, size=int(1e4)):
        """Calculate information difference between two kde distributions"""
        s = kde1.resample(size=size)
        return sentropy(kde1.pdf(s), kde2.pdf(s))
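
    # Illustrative example (not part of this module): ``compare_kde`` returns
    # an entropy-based estimate of the divergence between two kdes, which
    # approaches zero as the distributions agree. The data below is synthetic.
    #
    #     d1 = gaussian_kde(numpy.random.normal(size=2000))
    #     d2 = gaussian_kde(numpy.random.normal(size=2000))
    #     RefineSampler.compare_kde(d1, d2)   # small value, near zero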

    def converged(self, step, kde_new, factor, logp):
        """Check that kde is converged by comparing to previous iteration"""
        logging.info("checking convergence")
        if not hasattr(self, "old_logz"):
            self.old_logz = numpy.inf

        entropy_diff = -1
        if self.entropy < 1:
            entropy_diff = self.compare_kde(self.kde, kde_new)

        # Compare how the logz changes when adding new samples;
        # this is guaranteed to decrease as old samples are included
        logz = logsumexp(factor) - numpy.log(len(factor))
        dlogz = logz - self.old_logz
        self.old_logz = logz

        # Check that evidence estimates from two random half-size
        # subsets agree
        choice2 = numpy.random.choice(factor, len(factor) // 2)
        logz2 = logsumexp(choice2) - numpy.log(len(choice2))
        choice3 = numpy.random.choice(factor, len(factor) // 2)
        logz3 = logsumexp(choice3) - numpy.log(len(choice3))
        dlogz2 = logz3 - logz2

        # If the kde matches the posterior, the weights should be uniform;
        # check the fraction that deviate significantly from the peak
        frac_offbase = (logp < logp.max() - 5.0).sum() / len(logp)

        logging.info(
            "%s: dlogz_iter=%.4f, dlogz_half=%.4f, "
            "entropy=%.4f, offbase fraction=%.4f",
            step, dlogz, dlogz2, entropy_diff, frac_offbase,
        )

        if (
            entropy_diff < self.entropy
            and step >= self.min_refinement_steps
            and max(abs(dlogz), abs(dlogz2)) < self.dlogz_target
            and frac_offbase < self.offbase_fraction
        ):
            return True
        else:
            return False

    @classmethod
    def from_config(
        cls, cp, model, output_file=None, nprocesses=1, use_mpi=False
    ):
        """Initialize the sampler from a config file."""
        kwargs = {k: cp.get("sampler", k) for k in cp.options("sampler")}
        obj = cls(model, nprocesses=nprocesses, use_mpi=use_mpi, **kwargs)
        obj.set_start_from_config(cp)
        setup_output(obj, output_file, check_nsamples=False, validate=False)
        return obj
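
    # Illustrative example (not part of this module): a ``[sampler]`` section
    # that ``from_config`` could consume. Option names mirror the keyword
    # arguments of ``__init__``; the values and file name are placeholders.
    #
    #     [sampler]
    #     name = refine
    #     iterative_kde_samples = 2000
    #     min_refinement_steps = 5
    #     max_refinement_steps = 40
    #     start-file = previous_posterior.hdf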

    def set_start_from_config(self, cp):
        """Sets the initial state of the sampler from config file"""
        num_samples = self.iterative_kde_samples
        # Parameters not provided by a start file are drawn from the
        # initial distribution or prior below
        fsamples = {}
        if cp.has_option("sampler", "start-file"):
            start_file = cp.get("sampler", "start-file")
            logging.info("Using file %s for initial positions", start_file)
            f = loadfile(start_file, "r")
            fsamples = f.read_samples(f["samples"].keys())
            num_samples = len(fsamples)

        init_prior = initial_dist_from_config(
            cp, self.model.variable_params, self.model.static_params
        )
        if init_prior is not None:
            samples = init_prior.rvs(size=num_samples)
        else:
            p = self.model.prior_distribution
            samples = p.rvs(size=num_samples)

        ksamples = []
        for v in self.vparam:
            if v in fsamples:
                ksamples.append(fsamples[v])
            else:
                ksamples.append(samples[v])

        self.kde = gaussian_kde(numpy.array(ksamples))

    def run_samples(self, ksamples, update_params=None, iteration=False):
        """Calculate the likelihoods and weights for a set of samples"""
        # Calculate likelihood for each sample
        logging.info("calculating likelihoods...")
        args = []
        for i in range(len(ksamples[0])):
            param = {k: ksamples[j][i] for j, k in enumerate(self.vparam)}
            args.append(param)

        result = self.pool.map(call_model, args)
        logging.info("..done with likelihood")

        logp = numpy.array([r[0] for r in result])
        logl = numpy.array([r[1] for r in result])

        if update_params is not None:
            ksamples = numpy.array(
                [
                    ksamples[i, :]
                    for i, k in enumerate(self.vparam)
                    if k in update_params
                ]
            )

        # Weights for iteration
        if iteration:
            logw = logp - numpy.log(self.kde.pdf(ksamples))
            logw = logw - logsumexp(logw)

            # To avoid single samples dominating the weighted kde, clip
            # logw above the ~80th percentile and below the ~20th percentile
            sort = logw.argsort()
            cap = logw[sort[-len(sort) // 5]]
            low = logw[sort[len(sort) // 5]]

            logw[logw > cap] = cap
            logw[logw < low] = low
        else:
            # Weights for monte-carlo selection
            logw = logp - numpy.log(self.kde.pdf(ksamples))
            logw = logw - logsumexp(logw)

        k = logp != -numpy.inf
        ksamples = ksamples[:, k]
        logp, logl, logw = logp[k], logl[k], logw[k]
        return ksamples, logp, logl, logw
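
    # Illustrative note (not part of this module): the weights above are
    # standard importance weights, log w_i = log p(x_i) - log q(x_i), with q
    # the current kde proposal. For example,
    #
    #     logp = numpy.array([-1.0, -2.0])
    #     logq = numpy.array([-1.5, -1.5])
    #     logw = logp - logq              # array([ 0.5, -0.5])
    #     logw = logw - logsumexp(logw)   # normalize: sum(exp(logw)) == 1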

    def run(self):
        """Iteratively sample from the kde and update it based on the
        likelihood values"""
        self.group_kde = self.kde
        for param_group in self.param_groups:
            total_samples = None
            total_logp = None
            total_logw = None
            total_logl = None

            gsample = self.group_kde.resample(int(1e5))
            gsample = [
                gsample[i, :]
                for i, k in enumerate(self.vparam)
                if k in param_group
            ]
            self.kde = gaussian_kde(numpy.array(gsample))
            self.fixed_samples = self.group_kde.resample(1)

            logging.info("updating: %s", param_group)
            for r in range(self.max_refinement_steps):
                ksamples = self.draw_samples(
                    self.iterative_kde_samples, update_params=param_group
                )
                ksamples, logp, logl, logw = self.run_samples(
                    ksamples, update_params=param_group, iteration=True
                )

                if total_samples is not None:
                    total_samples = numpy.concatenate(
                        [total_samples, ksamples], axis=1
                    )
                    total_logp = numpy.concatenate([total_logp, logp])
                    total_logw = numpy.concatenate([total_logw, logw])
                    total_logl = numpy.concatenate([total_logl, logl])
                else:
                    total_samples = ksamples
                    total_logp = logp
                    total_logw = logw
                    total_logl = logl

                logging.info("setting up next kde iteration..")
                ntotal_logw = total_logw - logsumexp(total_logw)
                kde_new = gaussian_kde(
                    total_samples, weights=numpy.exp(ntotal_logw)
                )

                if self.converged(r, kde_new, total_logl + total_logw, logp):
                    break

                self.kde = kde_new

            full_samples = []
            gsample = self.group_kde.resample(len(total_samples[0]))
            i = 0
            for j, k in enumerate(self.vparam):
                if k in param_group:
                    full_samples.append(total_samples[i])
                    i += 1
                else:
                    full_samples.append(gsample[j])

            self.group_kde = gaussian_kde(numpy.array(full_samples))

        logging.info("Drawing final samples")
        ksamples = self.draw_samples(self.num_samples)
        logging.info("Calculating final likelihoods")
        ksamples, logp, logl, logw = self.run_samples(ksamples)
        self._samples = {k: ksamples[j, :] for j, k in enumerate(self.vparam)}
        self._samples["loglikelihood"] = logl
        logging.info("Reweighting to equal samples")
        self._samples = resample_equal(self._samples, logw)
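

# Illustrative end-to-end usage (not part of this module): refining an
# existing posterior estimate in Python rather than through a config file.
# ``model`` stands for any initialized pycbc.inference model instance and
# ``start`` for an (nparams, nsamples) array of posterior draws whose rows
# are ordered as ``model.variable_params``; both are assumptions here.
#
#     from scipy.stats import gaussian_kde
#     kde = gaussian_kde(start)
#     sampler = RefineSampler(model, kde=kde, nprocesses=4)
#     sampler.run()
#     posterior = sampler._samples   # dict of equally weighted samples,
#                                    # keyed by parameter name plus
#                                    # 'loglikelihood'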