Source code for bayesreg

from __future__ import print_function
from __future__ import division

import numpy as np
from scipy import optimize, linalg
from scipy.linalg import LinAlgError


[docs]class BLR: """Bayesian linear regression Estimation and prediction of Bayesian linear regression models Basic usage:: B = BLR() hyp = B.estimate(hyp0, X, y) ys,s2 = B.predict(hyp, X, y, Xs) where the variables are :param hyp: vector of hyperparmaters. :param X: N x D data array :param y: 1D Array of targets (length N) :param Xs: Nte x D array of test cases :param hyp0: starting estimates for hyperparameter optimisation :returns: * ys - predictive mean * s2 - predictive variance The hyperparameters are:: hyp = ( log(beta), log(alpha) ) # hyp is a list or numpy array The implementation and notation mostly follows Bishop (2006). The hyperparameter beta is the noise precision and alpha is the precision over lengthscale parameters. This can be either a scalar variable (a common lengthscale for all input variables), or a vector of length D (a different lengthscale for each input variable, derived using an automatic relevance determination formulation). These are estimated using conjugate gradient optimisation of the marginal likelihood. Reference: Bishop (2006) Pattern Recognition and Machine Learning, Springer Written by A. Marquand """ def __init__(self, **kwargs): # parse arguments n_iter = kwargs.get('n_iter', 100) tol = kwargs.get('tol', 1e-3) verbose = kwargs.get('verbose', False) var_groups = kwargs.get('var_groups', None) var_covariates = kwargs.get('var_covariates', None) warp = kwargs.get('warp', None) warp_reparam = kwargs.get('warp_reparam', False) if var_groups is not None and var_covariates is not None: raise ValueError( "var_covariates and var_groups cannot both be used") # basic parameters self.hyp = np.nan self.nlZ = np.nan self.tol = tol # not used at present self.n_iter = n_iter self.verbose = verbose self.var_groups = var_groups if var_covariates is not None: self.hetero_var = True else: self.hetero_var = False if self.var_groups is not None: self.var_ids = set(self.var_groups) self.var_ids = sorted(list(self.var_ids)) # set up warped likelihood if verbose: print('warp:', warp, 'warp_reparam:', warp_reparam) if warp is None: self.warp = None self.n_warp_param = 0 else: self.warp = warp self.n_warp_param = warp.get_n_params() self.warp_reparam = warp_reparam self.gamma = None def _parse_hyps(self, hyp, X, Xv=None): """ Parse hyperparameters into noise precision, lengthscale precision and lengthscale parameters. :param hyp: hyperparameter vector :param X: covariates :param Xv: covariates for heteroskedastic noise """ N = X.shape[0] # noise precision if Xv is not None: if len(Xv.shape) == 1: Dv = 1 Xv = Xv[:, np.newaxis] else: Dv = Xv.shape[1] w_d = np.asarray(hyp[0:Dv]) beta = np.exp(Xv.dot(w_d)) n_lik_param = len(w_d) elif self.var_groups is not None: beta = np.exp(hyp[0:len(self.var_ids)]) n_lik_param = len(beta) else: beta = np.asarray([np.exp(hyp[0])]) n_lik_param = len(beta) # parameters for warping the likelihood function if self.warp is not None: gamma = hyp[n_lik_param:(n_lik_param + self.n_warp_param)] n_lik_param += self.n_warp_param else: gamma = None # precision for the coefficients if isinstance(beta, list) or type(beta) is np.ndarray: alpha = np.exp(hyp[n_lik_param:]) else: alpha = np.exp(hyp[1:]) # reparameterise the warp (WarpSinArcsinh only) if self.warp is not None and self.warp_reparam: delta = np.exp(gamma[1]) beta = beta/(delta**2) # Create precision matrix from noise precision if Xv is not None: self.lambda_n_vec = beta elif self.var_groups is not None: beta_all = np.ones(N) for v in range(len(self.var_ids)): beta_all[self.var_groups == self.var_ids[v]] = beta[v] self.lambda_n_vec = beta_all else: self.lambda_n_vec = np.ones(N)*beta return beta, alpha, gamma
[docs] def post(self, hyp, X, y, Xv=None): """ Generic function to compute posterior distribution. This function will save the posterior mean and precision matrix as self.m and self.A and will also update internal parameters (e.g. N, D and the prior covariance (Sigma_a) and precision (Lambda_a). :param hyp: hyperparameter vector :param X: covariates :param y: responses :param Xv: covariates for heteroskedastic noise """ N = X.shape[0] if len(X.shape) == 1: D = 1 else: D = X.shape[1] if (hyp == self.hyp).all() and hasattr(self, 'N'): print("hyperparameters have not changed, exiting") return beta, alpha, gamma = self._parse_hyps(hyp, X, Xv) if self.verbose: print("estimating posterior ... | hyp=", hyp) # prior variance if len(alpha) == 1 or len(alpha) == D: self.Sigma_a = np.diag(np.ones(D))/alpha self.Lambda_a = np.diag(np.ones(D))*alpha else: raise ValueError("hyperparameter vector has invalid length") # compute posterior precision and mean # this is equivalent to the following operation but makes much more # efficient use of memory by avoiding the need to store Lambda_n # # self.A = X.T.dot(self.Lambda_n).dot(X) + self.Lambda_a # self.m = linalg.solve(self.A, X.T, # check_finite=False).dot(self.Lambda_n).dot(y) XtLambda_n = X.T*self.lambda_n_vec self.A = XtLambda_n.dot(X) + self.Lambda_a invAXt = linalg.solve(self.A, X.T, check_finite=False) self.m = (invAXt*self.lambda_n_vec).dot(y) # save stuff self.N = N self.D = D self.hyp = hyp
[docs] def loglik(self, hyp, X, y, Xv=None): """ Function to compute compute log (marginal) likelihood """ # hyperparameters (alpha not needed) beta, alpha, gamma = self._parse_hyps(hyp, X, Xv) # warp the likelihood? if self.warp is not None: if self.verbose: print('warping input...') y_unwarped = y y = self.warp.f(y, gamma) # load posterior and prior covariance if (hyp != self.hyp).any() or not (hasattr(self, 'A')): try: self.post(hyp, X, y, Xv) except ValueError: print("Warning: Estimation of posterior distribution failed") nlZ = 1/np.finfo(float).eps return nlZ try: # compute the log determinants in a numerically stable way logdetA = 2*sum(np.log(np.diag(np.linalg.cholesky(self.A)))) except (ValueError, LinAlgError): print("Warning: Estimation of posterior distribution failed") nlZ = 1/np.finfo(float).eps return nlZ logdetSigma_a = sum(np.log(np.diag(self.Sigma_a))) # diagonal logdetSigma_n = sum(np.log(1/self.lambda_n_vec)) # compute negative marginal log likelihood X_y_t_sLambda_n = (y-X.dot(self.m))*np.sqrt(self.lambda_n_vec) nlZ = -0.5 * (-self.N*np.log(2*np.pi) - logdetSigma_n - logdetSigma_a - X_y_t_sLambda_n.T.dot(X_y_t_sLambda_n) - self.m.T.dot(self.Lambda_a).dot(self.m) - logdetA ) if self.warp is not None: # add in the Jacobian nlZ = nlZ - sum(np.log(self.warp.df(y_unwarped, gamma))) # make sure the output is finite to stop the minimizer getting upset if not np.isfinite(nlZ): nlZ = 1/np.finfo(float).eps if self.verbose: print("nlZ= ", nlZ, " | hyp=", hyp) self.nlZ = nlZ return nlZ
[docs] def penalized_loglik(self, hyp, X, y, Xv=None, l=0.1, norm='L1'): """ Function to compute the penalized log (marginal) likelihood :param hyp: hyperparameter vector :param X: covariates :param y: responses :param Xv: covariates for heteroskedastic noise :param l: regularisation penalty :param norm: type of regulariser (L1 or L2) """ if norm.lower() == 'l1': L = self.loglik(hyp, X, y, Xv) + l * sum(abs(hyp)) elif norm.lower() == 'l2': L = self.loglik(hyp, X, y, Xv) + l * sum(np.sqrt(hyp**2)) else: print("Requested penalty not recognized, choose between 'L1' or 'L2'.") return L
[docs] def dloglik(self, hyp, X, y, Xv=None): """ Function to compute derivatives """ # hyperparameters beta, alpha, gamma = self._parse_hyps(hyp, X, Xv) if self.warp is not None: raise ValueError('optimization with derivatives is not yet ' + 'supported for warped liklihood') # load posterior and prior covariance if (hyp != self.hyp).any() or not (hasattr(self, 'A')): try: self.post(hyp, X, y, Xv) except ValueError: print("Warning: Estimation of posterior distribution failed") dnlZ = np.sign(self.dnlZ) / np.finfo(float).eps return dnlZ # precompute re-used quantities to maximise speed # todo: revise implementation to use Cholesky throughout # that would remove the need to explicitly compute the inverse S = np.linalg.inv(self.A) # posterior covariance SX = S.dot(X.T) XLn = X.T*self.lambda_n_vec # = X.T.dot(self.Lambda_n) XLny = XLn.dot(y) SXLny = S.dot(XLny) XLnXm = XLn.dot(X).dot(self.m) # initialise derivatives dnlZ = np.zeros(hyp.shape) dnl2 = np.zeros(hyp.shape) # noise precision parameter(s) for i in range(0, len(beta)): # first compute derivative of Lambda_n with respect to beta dL_n_vec = np.zeros(self.N) if self.var_groups is None: dL_n_vec = np.ones(self.N) else: dL_n_vec[np.where(self.var_groups == self.var_ids[i])[0]] = 1 dLambda_n = np.diag(dL_n_vec) # compute quantities used multiple times XdLnX = X.T.dot(dLambda_n).dot(X) dA = XdLnX # derivative of posterior parameters with respect to beta b = -S.dot(dA).dot(SXLny) + SX.dot(dLambda_n).dot(y) # compute np.trace(self.Sigma_n.dot(dLambda_n)) efficiently trSigma_ndLambda_n = sum((1/self.lambda_n_vec)*np.diag(dLambda_n)) # compute y.T.dot(Lambda_n) efficiently ytLn = (y*self.lambda_n_vec).T # compute derivatives dnlZ[i] = - (0.5 * trSigma_ndLambda_n - 0.5 * y.dot(dLambda_n).dot(y) + y.dot(dLambda_n).dot(X).dot(self.m) + ytLn.dot(X).dot(b) - 0.5 * self.m.T.dot(XdLnX).dot(self.m) - b.T.dot(XLnXm) - b.T.dot(self.Lambda_a).dot(self.m) - 0.5 * np.trace(S.dot(dA)) ) * beta[i] # scaling parameter(s) for i in range(0, len(alpha)): # first compute derivatives with respect to alpha if len(alpha) == self.D: # are we using ARD? dLambda_a = np.zeros((self.D, self.D)) dLambda_a[i, i] = 1 else: dLambda_a = np.eye(self.D) F = dLambda_a c = -S.dot(F).dot(SXLny) # compute np.trace(self.Sigma_a.dot(dLambda_a)) efficiently trSigma_adLambda_a = sum(np.diag(self.Sigma_a)*np.diag(dLambda_a)) dnlZ[i+len(beta)] = -(0.5 * trSigma_adLambda_a + XLny.T.dot(c) - c.T.dot(XLnXm) - c.T.dot(self.Lambda_a).dot(self.m) - 0.5 * self.m.T.dot(F).dot(self.m) - 0.5*np.trace(linalg.solve(self.A, F)) ) * alpha[i] # make sure the gradient is finite to stop the minimizer getting upset if not all(np.isfinite(dnlZ)): bad = np.where(np.logical_not(np.isfinite(dnlZ))) for b in bad: dnlZ[b] = np.sign(self.dnlZ[b]) / np.finfo(float).eps if self.verbose: print("dnlZ= ", dnlZ, " | hyp=", hyp) self.dnlZ = dnlZ return dnlZ
# model estimation (optimization)
[docs] def estimate(self, hyp0, X, y, **kwargs): """ Function to estimate the model :param hyp: hyperparameter vector :param X: covariates :param y: responses :param optimizer: optimisation algorithm ('cg','powell','nelder-mead','l0bfgs-b') """ optimizer = kwargs.get('optimizer', 'cg') # covariates for heteroskedastic noise Xv = kwargs.get('var_covariates', None) # options for l-bfgs-b l = float(kwargs.get('l', 0.1)) epsilon = float(kwargs.get('epsilon', 0.1)) norm = kwargs.get('norm', 'l2') if optimizer.lower() == 'cg': # conjugate gradients out = optimize.fmin_cg(self.loglik, hyp0, self.dloglik, (X, y, Xv), disp=True, gtol=self.tol, maxiter=self.n_iter, full_output=1) elif optimizer.lower() == 'powell': # Powell's method out = optimize.fmin_powell(self.loglik, hyp0, (X, y, Xv), full_output=1) elif optimizer.lower() == 'nelder-mead': out = optimize.fmin(self.loglik, hyp0, (X, y, Xv), full_output=1) elif optimizer.lower() == 'l-bfgs-b': all_hyp_i = [hyp0] def store(X): hyp = X all_hyp_i.append(hyp) try: out = optimize.fmin_l_bfgs_b(self.penalized_loglik, x0=hyp0, args=(X, y, Xv, l, norm), approx_grad=True, epsilon=epsilon, callback=store) # If the matrix becomes singular restart at last found hyp except np.linalg.LinAlgError: print( f'Restarting estimation at hyp = {all_hyp_i[-1]}, due to *** numpy.linalg.LinAlgError: Matrix is singular.') out = optimize.fmin_l_bfgs_b(self.penalized_loglik, x0=all_hyp_i[-1], args=(X, y, Xv, l, norm), approx_grad=True, epsilon=epsilon) else: raise ValueError("unknown optimizer") self.hyp = out[0] self.nlZ = out[1] self.optimizer = optimizer return self.hyp
[docs] def predict(self, hyp, X, y, Xs, var_groups_test=None, var_covariates_test=None, **kwargs): """ Function to make predictions from the model :param hyp: hyperparameter vector :param X: covariates for training data :param y: responses for training data :param Xs: covariates for test data :param var_covariates_test: test covariates for heteroskedastic noise This always returns Gaussian predictions, i.e. :returns: * ys - predictive mean * s2 - predictive variance """ Xvs = var_covariates_test if Xvs is not None and len(Xvs.shape) == 1: Xvs = Xvs[:, np.newaxis] if X is None or y is None: # set dummy hyperparameters beta, alpha, gamma = self._parse_hyps( hyp, np.zeros((self.N, self.D)), Xvs) else: # set hyperparameters beta, alpha, gamma = self._parse_hyps(hyp, X, Xvs) # do we need to re-estimate the posterior? if (hyp != self.hyp).any() or not (hasattr(self, 'A')): raise ValueError('posterior not properly estimated') N_test = Xs.shape[0] ys = Xs.dot(self.m) if self.var_groups is not None: if len(var_groups_test) != N_test: raise ValueError('Invalid variance groups for test') # separate variance groups s2n = np.ones(N_test) for v in range(len(self.var_ids)): s2n[var_groups_test == self.var_ids[v]] = 1/beta[v] else: s2n = 1/beta # compute xs.dot(S).dot(xs.T) avoiding computing off-diagonal entries s2 = s2n + np.sum(Xs*linalg.solve(self.A, Xs.T).T, axis=1) return ys, s2
[docs] def predict_and_adjust(self, hyp, X, y, Xs=None, ys=None, var_groups_test=None, var_groups_adapt=None, **kwargs): """ Function to transfer the model to a new site. This is done by first making predictions on the adaptation data given by X, adjusting by the residuals with respect to y. :param hyp: hyperparameter vector :param X: covariates for adaptation (i.e. calibration) data :param y: responses for adaptation data :param Xs: covariate data (for which predictions should be adjusted) :param ys: true response variables (to be adjusted) :param var_groups_test: variance groups (e.g. sites) for test data :param var_groups_adapt: variance groups for adaptation data There are two possible ways of using this function, depending on whether ys or Xs is specified If ys is specified, this is applied directly to the data, which is assumed to be in the input space (i.e. not warped). In this case the adjusted true data points are returned in the same space Alternatively, Xs is specified, then the predictions are made and adjusted. In this case the predictive variance are returned in the warped (i.e. Gaussian) space. This function needs to know which sites are associated with which data points, which provided by var_groups_xxx, which is a list or array of scalar ids . """ if ys is None: if Xs is None: raise ValueError('Either ys or Xs must be specified') else: N = Xs.shape[0] else: if len(ys.shape) < 1: raise ValueError('ys is specified but has insufficent length') N = ys.shape[0] if var_groups_test is None: var_groups_test = np.ones(N) var_groups_adapt = np.ones(X.shape[0]) ys_out = np.zeros(N) s2_out = np.zeros(N) for g in np.unique(var_groups_test): idx_s = var_groups_test == g idx_a = var_groups_adapt == g if sum(idx_a) < 2: raise ValueError( 'Insufficient adaptation data to estimate variance') # Get predictions from old model on new data X ys_ref, s2_ref = self.predict(hyp, None, None, X[idx_a, :]) # Subtract the predictions from true data to get the residuals if self.warp is None: residuals = ys_ref-y[idx_a] else: # Calculate the residuals in warped space y_ref_ws = self.warp.f( y[idx_a], hyp[1:self.warp.get_n_params()+1]) residuals = ys_ref - y_ref_ws residuals_mu = np.mean(residuals) residuals_sd = np.std(residuals) # Adjust the mean with the mean of the residuals if ys is None: # make and adjust predictions ys_out[idx_s], s2_out[idx_s] = self.predict( hyp, None, None, Xs[idx_s, :]) ys_out[idx_s] = ys_out[idx_s] - residuals_mu # Set the deviation to the devations of the residuals s2_out[idx_s] = np.ones(len(s2_out[idx_s]))*residuals_sd**2 else: # adjust the data if self.warp is not None: y_ws = self.warp.f( ys[idx_s], hyp[1:self.warp.get_n_params()+1]) ys_out[idx_s] = y_ws + residuals_mu ys_out[idx_s] = self.warp.invf( ys_out[idx_s], hyp[1:self.warp.get_n_params()+1]) else: ys = ys - residuals_mu s2_out = None return ys_out, s2_out