from __future__ import print_function
from __future__ import division
import os
import sys
import numpy as np
from scipy import optimize
from numpy.linalg import solve, LinAlgError
from numpy.linalg import cholesky as chol
from six import with_metaclass
from abc import ABCMeta, abstractmethod
try:  # Run as a package if installed
    from pcntoolkit.util.utils import squared_dist
except ImportError:
    # fall back to a standalone import: put the parent directory on the path
    path = os.path.abspath(os.path.dirname(__file__))
    path = os.path.dirname(path)  # parent directory
    if path not in sys.path:
        sys.path.append(path)
    del path

    from util.utils import squared_dist
# --------------------
# Covariance functions
# --------------------
class CovBase(with_metaclass(ABCMeta)):
    """ Base class for covariance functions.

        All covariance functions must define the following methods::

            CovFunction.get_n_params()
            CovFunction.cov()
            CovFunction.dcov()

        where cov() returns the full covariance matrix (or the cross-covariance
        if a second set of inputs is given) and dcov() returns its derivative
        with respect to a given hyperparameter.
    """

    def __init__(self, x=None):
        self.n_params = np.nan

    def get_n_params(self):
        """ Report the number of parameters required """

        assert not np.isnan(self.n_params), \
            "Covariance function not initialised"

        return self.n_params

    @abstractmethod
    def cov(self, theta, x, z=None):
        """ Return the full covariance (or cross-covariance if z is given) """

    @abstractmethod
    def dcov(self, theta, x, i):
        """ Return the derivative of the covariance function with respect to
            the i-th hyperparameter """
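
# A minimal sketch (not part of the original toolkit) of a covariance function
# implementing the CovBase interface. The class name CovConst and its single
# hyperparameter log(sc) are illustrative assumptions only.
class CovConst(CovBase):
    """ Constant covariance function (illustrative example).
        The hyperparameters are::

            theta = ( log(sc) )

        where sc2 = sc**2 is a constant covariance shared by every pair of inputs
    """

    def __init__(self, x=None):
        self.n_params = 1

    def cov(self, theta, x, z=None):
        sc2 = np.exp(2*theta[0])
        if z is None:
            z = x
        # every pair of inputs shares the same covariance sc2
        return sc2 * np.ones((x.shape[0], z.shape[0]))

    def dcov(self, theta, x, i):
        if i == 0:  # derivative with respect to log(sc)
            return 2 * self.cov(theta, x)
        else:
            raise ValueError("Invalid covariance function parameter")
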
class CovLin(CovBase):
    """ Linear covariance function (no hyperparameters)
    """

    def __init__(self, x=None):
        self.n_params = 0
        self.first_call = False

    def cov(self, theta, x, z=None):
        # warn once if hyperparameters are passed in unnecessarily
        if not self.first_call and theta is not None and len(theta) > 0:
            self.first_call = True
            print("CovLin: ignoring unnecessary hyperparameter ...")

        if z is None:
            z = x

        K = x.dot(z.T)
        return K

    def dcov(self, theta, x, i):
        raise ValueError("Invalid covariance function parameter")
class CovSqExp(CovBase):
    """ Ordinary squared exponential covariance function.
        The hyperparameters are::

            theta = ( log(ell), log(sf) )

        where ell is a lengthscale parameter and sf2 = sf**2 is the signal
        variance
    """

    def __init__(self, x=None):
        self.n_params = 2

    def cov(self, theta, x, z=None):
        self.ell = np.exp(theta[0])
        self.sf2 = np.exp(2*theta[1])

        if z is None:
            z = x

        R = squared_dist(x/self.ell, z/self.ell)
        K = self.sf2 * np.exp(-R/2)
        return K

    def dcov(self, theta, x, i):
        self.ell = np.exp(theta[0])
        self.sf2 = np.exp(2*theta[1])

        R = squared_dist(x/self.ell, x/self.ell)

        if i == 0:    # return derivative of lengthscale parameter
            dK = self.sf2 * np.exp(-R/2) * R
            return dK
        elif i == 1:  # return derivative of signal variance parameter
            dK = 2*self.sf2 * np.exp(-R/2)
            return dK
        else:
            raise ValueError("Invalid covariance function parameter")
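
# Illustrative usage sketch with assumed random data (not part of the original
# module): evaluating a squared exponential kernel with theta = (log(ell), log(sf)).
def _demo_cov_sqexp():
    x = np.random.randn(5, 2)            # 5 points in 2 dimensions
    theta = np.log([1.0, 0.5])           # ell = 1.0, sf = 0.5
    covfunc = CovSqExp(x)
    K = covfunc.cov(theta, x)            # 5 x 5 covariance matrix
    dK = covfunc.dcov(theta, x, i=0)     # derivative w.r.t. log(ell)
    return K, dK
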
class CovSqExpARD(CovBase):
    """ Squared exponential covariance function with ARD
        The hyperparameters are::

            theta = ( log(ell_1), ..., log(ell_D), log(sf) )

        where ell_i are lengthscale parameters and sf2 = sf**2 is the signal
        variance
    """

    def __init__(self, x=None):
        if x is None:
            raise ValueError("N x D data matrix must be supplied as input")
        if len(x.shape) == 1:
            self.D = 1
        else:
            self.D = x.shape[1]
        self.n_params = self.D + 1

    def cov(self, theta, x, z=None):
        self.ell = np.exp(theta[0:self.D])
        self.sf2 = np.exp(2*theta[self.D])

        if z is None:
            z = x

        R = squared_dist(x.dot(np.diag(1./self.ell)),
                         z.dot(np.diag(1./self.ell)))
        K = self.sf2*np.exp(-R/2)
        return K

    def dcov(self, theta, x, i):
        K = self.cov(theta, x)
        if i < self.D:      # return derivative of lengthscale parameter
            dK = K * squared_dist(x[:, i]/self.ell[i], x[:, i]/self.ell[i])
            return dK
        elif i == self.D:   # return derivative of signal variance parameter
            dK = 2*K
            return dK
        else:
            raise ValueError("Invalid covariance function parameter")
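
# Illustrative usage sketch with assumed random data (not part of the original
# module): with ARD there is one lengthscale per input dimension, so for D = 3
# columns theta has length D + 1 = 4.
def _demo_cov_sqexp_ard():
    x = np.random.randn(10, 3)
    theta = np.log([1.0, 2.0, 0.5, 1.0])   # (log ell_1, log ell_2, log ell_3, log sf)
    covfunc = CovSqExpARD(x)                # D is inferred from x
    K = covfunc.cov(theta, x)               # 10 x 10 covariance matrix
    dK = covfunc.dcov(theta, x, i=1)        # derivative w.r.t. log(ell_2)
    return K, dK
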
class CovSum(CovBase):
    """ Sum of covariance functions. The component covariance functions are
        passed in as a list of class names and initialised automatically.
        For example::

            C = CovSum(x, ('CovLin', 'CovSqExpARD'))
            K = C.cov(theta, x)

        The hyperparameter vector theta is the concatenation of the
        hyperparameters of each component covariance function, in the order
        given. For the example above this is::

            theta = ( log(ell_1), ..., log(ell_D), log(sf) )

        where ell_i are lengthscale parameters and sf2 = sf**2 is the signal
        variance
    """

    def __init__(self, x=None, covfuncnames=None):
        if x is None:
            raise ValueError("N x D data matrix must be supplied as input")
        if covfuncnames is None:
            raise ValueError("A list of covariance functions is required")
        self.covfuncs = []
        self.n_params = 0
        for cname in covfuncnames:
            covfunc = eval(cname + '(x)')
            self.n_params += covfunc.get_n_params()
            self.covfuncs.append(covfunc)

        if len(x.shape) == 1:
            self.N = len(x)
            self.D = 1
        else:
            self.N, self.D = x.shape

    def cov(self, theta, x, z=None):
        theta_offset = 0
        for ci, covfunc in enumerate(self.covfuncs):
            try:
                n_params_c = covfunc.get_n_params()
                theta_c = [theta[c] for c in
                           range(theta_offset, theta_offset + n_params_c)]
                theta_offset += n_params_c
            except Exception as e:
                print(e)

            if ci == 0:
                K = covfunc.cov(theta_c, x, z)
            else:
                K += covfunc.cov(theta_c, x, z)
        return K

    def dcov(self, theta, x, i):
        theta_offset = 0
        for covfunc in self.covfuncs:
            n_params_c = covfunc.get_n_params()
            theta_c = [theta[c] for c in
                       range(theta_offset, theta_offset + n_params_c)]
            theta_offset += n_params_c

            if theta_c:  # does this component have any hyperparameters?
                if 'dK' not in locals():
                    dK = covfunc.dcov(theta_c, x, i)
                else:
                    dK += covfunc.dcov(theta_c, x, i)
        return dK
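
# Illustrative usage sketch with assumed random data (not part of the original
# module): summing a linear kernel (no hyperparameters) and an ARD kernel, so
# theta simply concatenates the parameters of each component (here D + 1 values).
def _demo_cov_sum():
    x = np.random.randn(8, 2)
    covfunc = CovSum(x, ('CovLin', 'CovSqExpARD'))
    theta = np.zeros(covfunc.get_n_params())   # log(ell_1), log(ell_2), log(sf)
    K = covfunc.cov(theta, x)                  # 8 x 8 covariance matrix
    return K
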
# -----------------------
# Gaussian process models
# -----------------------
class GPR:
    """Gaussian process regression

    Estimation and prediction of Gaussian process regression models

    Basic usage::

        G = GPR()
        hyp = G.estimate(hyp0, cov, X, y)
        ys, ys2 = G.predict(hyp, X, y, Xs)

    where the variables are

    :param hyp: vector of hyperparameters
    :param cov: covariance function
    :param X: N x D data array
    :param y: 1D Array of targets (length N)
    :param Xs: Nte x D array of test cases
    :param hyp0: starting estimates for hyperparameter optimisation

    :returns: * ys - predictive mean
              * ys2 - predictive variance

    The hyperparameters are::

        hyp = ( log(sn), (cov function params) )  # hyp is a list or array

    The implementation and notation follows Rasmussen and Williams (2006).
    As in the gpml toolbox, these parameters are estimated using conjugate
    gradient optimisation of the marginal likelihood. Note that there is no
    explicit mean function, thus the gpr routines are limited to modelling
    zero-mean processes. A worked example is sketched at the end of this
    module.

    Reference:
    C. Rasmussen and C. Williams (2006) Gaussian Processes for Machine Learning

    Written by A. Marquand
    """

    def __init__(self, hyp=None, covfunc=None, X=None, y=None, n_iter=100,
                 tol=1e-3, verbose=False, warp=None):

        self.hyp = np.nan
        self.nlZ = np.nan
        self.tol = tol          # not used at present
        self.n_iter = n_iter
        self.verbose = verbose

        # set up warped likelihood
        if warp is None:
            self.warp = None
            self.n_warp_param = 0
        else:
            self.warp = warp
            self.n_warp_param = warp.get_n_params()

        self.gamma = None

    def _updatepost(self, hyp, covfunc):

        hypeq = np.asarray(hyp == self.hyp)
        if hypeq.all() and hasattr(self, 'alpha') and \
           (hasattr(self, 'covfunc') and covfunc == self.covfunc):
            return False
        else:
            return True
    def post(self, hyp, covfunc, X, y):
        """ Generic function to compute posterior distribution.
        """

        if len(hyp.shape) > 1:  # force 1d hyperparameter array
            hyp = hyp.flatten()

        if len(X.shape) == 1:
            X = X[:, np.newaxis]
        self.N, self.D = X.shape

        # hyperparameters
        sn2 = np.exp(2*hyp[0])       # noise variance
        if self.warp is not None:    # parameters for warping the likelihood
            n_lik_param = self.n_warp_param + 1
        else:
            n_lik_param = 1
        theta = hyp[n_lik_param:]    # (generic) covariance hyperparameters

        if self.verbose:
            print("estimating posterior ... | hyp=", hyp)

        # posterior: L = chol(K + sn2*I) and alpha = (K + sn2*I) \ y
        self.K = covfunc.cov(theta, X)
        self.L = chol(self.K + sn2*np.eye(self.N))
        self.alpha = solve(self.L.T, solve(self.L, y))
        self.hyp = hyp
        self.covfunc = covfunc
    def loglik(self, hyp, covfunc, X, y):
        """ Function to compute the log (marginal) likelihood
        """

        # load or recompute posterior
        if self.verbose:
            print("computing likelihood ... | hyp=", hyp)

        # parameters for warping the likelihood function
        if self.warp is not None:
            gamma = hyp[1:(self.n_warp_param+1)]
            y_unwarped = y
            y = self.warp.f(y, gamma)

        if len(hyp.shape) > 1:  # force 1d hyperparameter array
            hyp = hyp.flatten()

        if self._updatepost(hyp, covfunc):
            try:
                self.post(hyp, covfunc, X, y)
            except (ValueError, LinAlgError):
                print("Warning: Estimation of posterior distribution failed")
                self.nlZ = 1/np.finfo(float).eps
                return self.nlZ

        # negative log marginal likelihood:
        # 0.5*y'*alpha + sum(log(diag(L))) + 0.5*N*log(2*pi)
        self.nlZ = 0.5*y.T.dot(self.alpha) + sum(np.log(np.diag(self.L))) + \
            0.5*self.N*np.log(2*np.pi)

        if self.warp is not None:
            # add in the Jacobian of the warping function
            self.nlZ = self.nlZ - sum(np.log(self.warp.df(y_unwarped, gamma)))

        # make sure the output is finite to stop the minimizer getting upset
        if not np.isfinite(self.nlZ):
            self.nlZ = 1/np.finfo(float).eps

        if self.verbose:
            print("nlZ= ", self.nlZ, " | hyp=", hyp)

        return self.nlZ
    def dloglik(self, hyp, covfunc, X, y):
        """ Function to compute derivatives
        """

        if len(hyp.shape) > 1:  # force 1d hyperparameter array
            hyp = hyp.flatten()

        if self.warp is not None:
            raise ValueError('optimization with derivatives is not yet ' +
                             'supported for warped likelihood')

        # hyperparameters
        sn2 = np.exp(2*hyp[0])  # noise variance
        theta = hyp[1:]         # (generic) covariance hyperparameters

        # load posterior and prior covariance
        if self._updatepost(hyp, covfunc):
            try:
                self.post(hyp, covfunc, X, y)
            except (ValueError, LinAlgError):
                print("Warning: Estimation of posterior distribution failed")
                dnlZ = np.sign(self.dnlZ) / np.finfo(float).eps
                return dnlZ

        # compute Q = alpha*alpha' - inv(K)
        Q = np.outer(self.alpha, self.alpha) - \
            solve(self.L.T, solve(self.L, np.eye(self.N)))

        # initialise derivatives
        self.dnlZ = np.zeros(len(hyp))

        # noise variance
        self.dnlZ[0] = -sn2*np.trace(Q)

        # covariance parameter(s)
        for par in range(0, len(theta)):
            # compute -0.5*trace(Q.dot(dK/d[theta_i])) efficiently
            dK = covfunc.dcov(theta, X, i=par)
            self.dnlZ[par+1] = -0.5*np.sum(np.sum(Q*dK.T))

        # make sure the gradient is finite to stop the minimizer getting upset
        if not all(np.isfinite(self.dnlZ)):
            bad = np.where(np.logical_not(np.isfinite(self.dnlZ)))
            for b in bad:
                self.dnlZ[b] = np.sign(self.dnlZ[b]) / np.finfo(float).eps

        if self.verbose:
            print("dnlZ= ", self.dnlZ, " | hyp=", hyp)

        return self.dnlZ
    # model estimation (optimization)
    def estimate(self, hyp0, covfunc, X, y, optimizer='cg'):
        """ Function to estimate the model
        """
        if len(X.shape) == 1:
            X = X[:, np.newaxis]

        self.hyp0 = hyp0

        if optimizer.lower() == 'cg':  # conjugate gradients
            out = optimize.fmin_cg(self.loglik, hyp0, self.dloglik,
                                   (covfunc, X, y), disp=True, gtol=self.tol,
                                   maxiter=self.n_iter, full_output=1)
        elif optimizer.lower() == 'powell':  # Powell's method
            out = optimize.fmin_powell(self.loglik, hyp0, (covfunc, X, y),
                                       full_output=1)
        else:
            raise ValueError("unknown optimizer")

        # Always return a 1d array. The optimizer sometimes changes dimensions
        if len(out[0].shape) > 1:
            self.hyp = out[0].flatten()
        else:
            self.hyp = out[0]
        self.nlZ = out[1]
        self.optimizer = optimizer

        return self.hyp
    def predict(self, hyp, X, y, Xs):
        """ Function to make predictions from the model
        """
        if len(hyp.shape) > 1:  # force 1d hyperparameter array
            hyp = hyp.flatten()

        # ensure X and Xs are multi-dimensional arrays
        if len(Xs.shape) == 1:
            Xs = Xs[:, np.newaxis]
        if len(X.shape) == 1:
            X = X[:, np.newaxis]

        # parameters for warping the likelihood function
        if self.warp is not None:
            gamma = hyp[1:(self.n_warp_param+1)]
            y = self.warp.f(y, gamma)

        # reestimate posterior (avoids numerical problems with optimizer)
        self.post(hyp, self.covfunc, X, y)

        # hyperparameters
        sn2 = np.exp(2*hyp[0])  # noise variance

        # (generic) covariance hyperparameters
        theta = hyp[(self.n_warp_param + 1):]

        Ks = self.covfunc.cov(theta, Xs, X)
        kss = self.covfunc.cov(theta, Xs)

        # predictive mean
        ymu = Ks.dot(self.alpha)

        # predictive variance (for a noisy test input)
        v = solve(self.L, Ks.T)
        ys2 = kss - v.T.dot(v) + sn2

        return ymu, ys2
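
# Illustrative end-to-end sketch with assumed toy data (not part of the
# original module): fit a zero-mean GP with a squared exponential kernel and
# predict at new inputs, as per the "Basic usage" section of the GPR docstring.
# The starting values hyp0 = (log(sn), log(ell), log(sf)) are rough guesses
# chosen purely for this example.
if __name__ == "__main__":
    np.random.seed(0)
    X = np.linspace(-3, 3, 50)[:, np.newaxis]      # N x D training inputs
    y = np.sin(X).ravel() + 0.1*np.random.randn(50)
    Xs = np.linspace(-3, 3, 101)[:, np.newaxis]    # Nte x D test inputs

    covfunc = CovSqExp(X)
    hyp0 = np.zeros(1 + covfunc.get_n_params())    # (log(sn), log(ell), log(sf))

    G = GPR()
    hyp = G.estimate(hyp0, covfunc, X, y)          # conjugate gradient optimisation
    ymu, ys2 = G.predict(hyp, X, y, Xs)            # predictive mean and variance
    print("estimated hyperparameters:", hyp)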