Source code for dpcluster.algorithms

import math
import numpy as np
import scipy.linalg
import scipy.special
from caching import cached

[docs]class VDP(object):
    """ Variational Dirichlet Process clustering algorithm following `"Variational Inference for Dirichlet Process Mixtures" by Blei et al. (2006) <http://ba.stat.cmu.edu/journal/2006/vol01/issue01/blei.pdf>`_.
 
    :param distr: likelihood-prior distribution pair governing clusters. For now the only option is using a instance of :class:`dpcluster.distributions.GaussianNIW`.
    :param w: non-negative prior weight. The prior has as much influence as w data points.
    :param k: maximum number of clusters.
    :param tol: convergence tolerance.
    """
    def __init__(self,distr, w = .1, k=50,
                tol = 1e-5,
                max_iters = 10000):
       
        self.max_iters = max_iters
        self.tol = tol
        self.distr = distr
        self.w = w
        self.k = k
        d = self.distr.sufficient_stats_dim()

        self.prior = self.distr.prior_param
        self.s = np.array([0.0,0])
        
[docs]    def batch_learn(self,x,verbose=False, sort = True):
        """ Learn cluster from data. This is a batch algorithm that required all data be loaded in memory. 
       
        :arg x: sufficient statistics of the data to be clustered. Can be obtained from raw data by calling :func:`dpcluster.distributions.ConjugatePair.sufficient_stats()`
        :keyword verbose: print progress report 
        :keyword sort: algorithm optimization. Sort clusters at every step.

        Basic usage example::

            >>> distr = GaussianNIW(data.shape[2])
            >>> x = distr.sufficient_stats(data)
            >>> vdp = VDP(distr)
            >>> vdp.batch_learn(x)
            >>> print vdp.cluster_parameters()
        """
        n = x.shape[0] 
        k = self.k
        
        wx = x[:,-1]
        wt = wx.sum()

        lbd = self.prior + x.sum(0) / wx.sum() * self.w
        ex_alpha = 1.0
        
        phi = np.random.random(size=n*k).reshape((n,k))

        for t in range(self.max_iters):

            phi /= phi.sum(1)[:,np.newaxis]
            # m step
            tau = (lbd[np.newaxis,:] + np.einsum('ni,nj->ij', phi, x))
            psz = np.einsum('ni,n->i',phi,wx)

            # stick breaking process
            if sort:
                ind = np.argsort(-psz) 
                tau = tau[ind,:]
                psz = psz[ind]
            
            if t > 0:
                old = al

            al = 1.0 + psz

            if t > 0:
                diff = np.sum(np.abs(al - old))

            bt = ex_alpha + np.concatenate([
                    (np.cumsum(psz[:0:-1])[::-1])
                    ,[0]
                ])

            tmp = scipy.special.psi(al + bt)
            exlv  = (scipy.special.psi(al) - tmp)
            exlvc = (scipy.special.psi(bt) - tmp)

            elt = (exlv + np.concatenate([[0],np.cumsum(exlvc)[:-1]]))

            w = self.s + np.array([-1 + k, -np.sum(exlvc[:-1])])
            ex_alpha = w[0]/w[1]
            
            # end stick breaking process
            # end m step


            # e_step
            grad = self.distr.prior.log_partition(tau,(False,True,False))[1]
            np.einsum('ki,ni->nk',grad,x,out=phi)

            phi /= wx[:,np.newaxis]
            phi += elt
            phi -= phi.max(1)[:,np.newaxis]
            np.exp(phi,phi)
            
            
            if t>0:
                if verbose:
                    print str(diff)
                if diff < wt*self.tol:
                    break

        self.al = al
        self.bt = bt
        self.tau = tau
        self.lbd = lbd
        self.glp = grad
        self.elt = elt

        return

[docs]    def cluster_sizes(self):
        """:return: Data weight assigned to each cluster.
        """
        return (self.al -1)
        
        
[docs]    def cluster_parameters(self):
        """:return: Cluster parameters.
        """
        return self.tau
[docs]    def ll(self,x, ret_ll_gr_hs = (True,False,False)):
        """
        Compute the log likelihoods (ll) of data with respect to the trained model.

        :arg x: sufficient statistics of the data.
        :arg ret_ll_gr_hs: what to return: likelihood, gradient, hessian. Derivatives taken with respect to data, not sufficient statistics. 
        """

        rt = ret_ll_gr_hs
        llk,grk,hsk = self.distr.posterior_ll(x,self.tau,
             (True,rt[1],rt[2]), True)

        ll = None
        gr = None
        hs = None

        let = self.resp_cache(self.al,self.bt)

        llk = llk+let 
        np.exp(llk,llk)

        se = llk.sum(1)
        
        if rt[0]:
            ll = np.log(se)

        if rt[1] or rt[2]:
            p = llk/se[:,np.newaxis]
            gr = np.einsum('nk,nki->ni',p,grk)
        
        if rt[2]:
            hs1  = - gr[:,:,np.newaxis] * gr[:,np.newaxis,:]
            hs2 = np.einsum('nk,nkij->nij',p, hsk)
            # TODO: einsum wrong
            hs3 = np.einsum('nk,nki,nkj->nij',p, grk, grk)

            hs = hs1 + hs2 + hs3
        
        return (ll,gr,hs)

    @cached
    def resp_cache(self,al,bt):
        tmp = np.log(al + bt)
        exlv  = np.log(al) - tmp
        exlvc = np.log(bt) - tmp
        let = exlv + np.concatenate([[0],np.cumsum(exlvc)[:-1]])
        return let


    @cached
    def pseudo_resp_cache(self,al,bt):
        tmp = scipy.special.psi(al + bt)
        exlv  = (scipy.special.psi(al) - tmp)
        exlvc = (scipy.special.psi(bt) - tmp)

        elt = (exlv + np.concatenate([[0],np.cumsum(exlvc)[:-1]]))


        return elt


    @cached
    def resp(self,x, ret_ll_gr_hs = (True,False,False)):
        """
        Cluster responsabilities.

        :arg x: sufficient statistics of data. 
        """
        
        cll,cgr,chs = ret_ll_gr_hs
        p = None
        gp = None
        hp = None

        llk,grk,hsk = self.distr.posterior_ll(x,self.tau,(True,cgr,chs),True)

        if cll or cgr: 
            llk = llk + self.resp_cache(self.al,self.bt)   
            llk -= llk.max(1)[:,np.newaxis] 
            np.exp(llk,llk)
            se = llk.sum(1)
            p = llk/se[:,np.newaxis]
        
        if cgr:
            mn = np.einsum('nkj,nk->nj',grk,p)
            gp = (grk - mn[:,np.newaxis,:] )*p[:,:,np.newaxis]

        return (p,gp,hp)


    @cached
    def pseudo_resp(self,x, ret_ll_gr_hs = (True,False,False)):
        
        cll,cgr,chs = ret_ll_gr_hs
        p = None
        gp = None
        hp = None

        grad = self.distr.prior.log_partition(self.tau,(False,True,False))[1]
        llk = np.einsum('ki,ni->nk',grad,self.distr.sufficient_stats(x))

        llk += self.pseudo_resp_cache(self.al,self.bt)   
        llk -= llk.max(1)[:,np.newaxis] 
        np.exp(llk,llk)
        se = llk.sum(1)
        p = llk/se[:,np.newaxis]

        return (p,None,None)


[docs]    def conditional_ll(self,x,cond):
        """
        Conditional log likelihood.
        
        :arg x: sufficient statistics of data.
        :arg cond: slice representing variables to condition on
        """

        ll , gr, hs    = self.ll(x,(True,True,True), usual_x=True)
        ll_ , gr_, hs_ = self.marginal(cond).ll(x,(True,True,True),usual_x=True)
        
        ll -= ll_
        gr[:,slc] -= gr_
        #line below will fail
        #hs -= hs_

        return (ll,gr,None)

[docs]    def plot_clusters(self,**kwargs):
        """
        Asks each cluster to plot itself. For Gaussian multidimensional clusters pass ``slc=np.array([i,j])`` as an argument to project clusters on the plane defined by the i'th and j'th coordinate.
        """
        sz = self.cluster_sizes()
        self.distr.plot(self.tau, sz, **kwargs)

    @cached
    def marginal(self,slc):
        
        distr, tau = self.distr.marginal(self.tau,slc)
        rv = type(self)(distr)
        rv.tau = tau
        rv.al = self.al
        rv.bt = self.bt
        
        return rv

    @cached
    def conditional_expectation(self,x,iy,ix,ret_ll_gr_hs = (True,False,False)):
        ps, psg, trash = self.marginal(ix).resp(x,ret_ll_gr_hs)

        ex, exg, trash = self.distr.conditional_expectation(x,self.tau,iy,ix,
                        ret_ll_gr_hs)
        
        ef = np.einsum('nki,nk->ni',ex,ps)
        efg = np.einsum('nka,nki->nia',psg,ex)+np.einsum('nk,nkia->nia',ps,exg)
        #efg = np.einsum('nka,nki->nia',psg,ex)+np.einsum('nk,nkia->nia',ps,exg)

        
        return ef,efg,None
        
[docs]    def conditional_variance(self,x,iy,ix,ret_ll_gr_hs = (True,False,False)):
        ps, psg, trash = self.marginal(ix).resp(x,ret_ll_gr_hs)

        ex, exg, trash = self.distr.conditional_expectation(x,self.tau,iy,ix,
                        ret_ll_gr_hs)

        vr, vrg, trash = self.distr.conditional_variance(x,self.tau,iy,ix,
                        ret_ll_gr_hs)
        
        ef, efg, trash = self.conditional_expectation(x,iy,ix,ret_ll_gr_hs) 
        
        de = ex - ef[:,np.newaxis,:]
        vt = de[:,:,:,np.newaxis]*de[:,:,np.newaxis,:]
        vs = vt+vr
        vf = np.einsum('nk,nkij->nij',ps,vs)
        
        deg = exg - efg[:,np.newaxis,:,:]

        vsg  =  de[:,:,np.newaxis,:,np.newaxis] * deg[:,:,:,np.newaxis,:]
        vsg +=  de[:,:,:,np.newaxis,np.newaxis] * deg[:,:,np.newaxis,:,:]
        vsg += vrg
        
        vfg  = np.einsum('nk,nkija->nija',ps,vsg) 
        vfg += np.einsum('nka,nkij->nija',psg,vs)

        return vf, vfg, None

[docs]    def var_cond_exp(self,x,iy,ix,ret_ll_gr_hs = (True,False,False),
            full_var=False):
        ps, psg, trash = self.marginal(ix).resp(x,ret_ll_gr_hs)

        vr, vrg, trash = self.distr.conditional_variance(x,self.tau,iy,ix,
                        ret_ll_gr_hs,full_var)
        ps2 = ps*ps
        
        vf = np.einsum('nk,nkij->nij',ps2,vr)
        
        vfg = None

        if ret_ll_gr_hs[1]:
            vfg  = np.einsum('nk,nkija->nija',ps2,vrg) 
            vfg += 2*np.einsum('nka,nkij->nija',psg*ps[:,:,np.newaxis],vr)

        return vf, vfg, None



[docs]class OnlineVDP:
    """Experimental online clustering algorithm.

    :param distr: likelihood-prior distribution pair governing clusters. For now the only option is using a instance of :class:`dpcluster.distributions.GaussianNIW`.
    :param w: non-negative prior weight. The prior has as much influence as w data points.
    :param k: maximum number of clusters.
    :param tol: convergence tolerance.
    :param max_items: maximum queue length.
    
    """
    def __init__(self, distr, w=.1, k = 25, tol=1e-3, max_items = 100):
        self.distr = distr
        self.w = w
        self.wm = w
        self.k = k
        self.tol = tol
        self.max_n = max_items

        self.xs = []
        self.vdps = []
        self.dim = self.distr.sufficient_stats_dim()
        
[docs]    def put(self,r,s=0):
        """
        Append data.

        :arg r: sufficient statistics of data to be appended. 

        Basic usage example::

            >>> distr = GaussianNIW(data.shape[2])
            >>> x = distr.sufficient_stats(data)
            >>> vdp = OnlineVDP(distr)
            >>> vdp.put(x)
            >>> print vdp.get_model().cluster_parameters()
        """

        if s<len(self.xs):
            ar = np.vstack((self.xs[s],r))
        else:
            ar = r
            self.xs.append(None)
        
        mn = self.max_n*(s+1)

        pcs = ar.shape[0] / mn
        self.xs[s] = ar[pcs*mn:,:]

        if s<len(self.vdps):
            proc = self.vdps[s]
        else:
            if s>0:
                w = self.wm
            else:
                w = self.w
            
            proc = VDP(self.distr, w, self.k*(s+1), self.tol)
            self.vdps.append(proc)

        if pcs==0:
            return        

        for x in np.split(ar[:pcs*self.max_n,:],pcs):
            proc.batch_learn(x,verbose=False)
            xc = proc.tau - proc.lbd[np.newaxis,:]
            xc = xc[xc[:,-1]>1e-5]
            self.put(xc,s+1)

[docs]    def get_model(self):
        """
        Get current model.
        
        :return: instance of :class:`dpcluster.algorithms.VDP`
        """

        np.random.seed(1)
        proc = VDP(self.distr, self.wm, self.k*(len(self.xs)), self.tol)
        proc.batch_learn(np.vstack(self.xs[::-1]))
        return proc
        
        

[docs]class Predictor:
    def __init__(self,model,ix,iy):
        self.model = model
        self.ix = ix
        self.iy = iy

    @cached
    def distr_fit(self,x,lgh):
        ix = self.ix
        iy = self.iy

        ps,psg,trash =self.model.marginal(ix).resp(x,lgh)
        tau = np.einsum('nk,ki->ni',ps,self.model.tau) 
        
        if lgh[1]:
            taug =np.einsum('nka,ki->nia',psg,self.model.tau) 
        else:
            taug = None

        return tau,taug

    @cached
    def precomp(self,x,lgh):
        ix = self.ix
        iy = self.iy
        tau,taug = self.distr_fit(x,lgh)
        (mu,Psi,n,nu),(mug,Psig,ng,nug),trs = self.model.distr.prior.nat2usual(tau,lgh)

        A,B,D = Psi[:,iy,:][:,:,iy], Psi[:,iy,:][:,:,ix], Psi[:,ix,:][:,:,ix]

        Di = np.array(map(np.linalg.inv,D))
        P = np.einsum('njk,nkl->njl',B,Di)

        Li = A-np.einsum('nik,nlk->nil',P,B)
        V1 = Li
        
        V2 = Di
        
        ls = mu,P,V1,V2,n,nu
        gr = None

        
        if lgh[1]:
            mug = np.einsum('nia,naj->nij',mug,taug)
            Psig = np.einsum('nija,nab->nijb',Psig,taug)
            Bg = Psig[:,iy,:,:][:,:,ix,:]
            Dg = Psig[:,ix,:,:][:,:,ix,:]
            
            
            Pg = np.einsum('njka,nkl->njla',Bg,Di)
            Pg -= np.einsum('njk,nkl,nlma,nmq->njqa',B,Di,Dg,Di)
            
            gr = (mug,Pg,None,None,None,None)
        
        return ls,gr,None
    @cached
    def predict(self,x,lgh): 

        ix = self.ix
        iy = self.iy

        vl,gr,trs =  self.precomp(x,lgh)
        mu,p,V1,V2,ni,nu =  vl
        mug,pg,t1,t2,t3,t4 =  gr
            
        df = x-mu[:,ix]
        yp = (mu[:,iy] + np.einsum('nij,nj->ni',p,df))
        
        ypg = None
        
        if lgh[1]:
            dfg  = -mug[:,ix,:]
            dfg += np.eye(dfg.shape[1])[np.newaxis,:,:]
            
            ypg = (mug[:,iy,:] 
                + np.einsum('nij,nja->nia',p,dfg) 
                + np.einsum('nija,nj->nia',pg,df)) 

        return yp,ypg,None

        
[docs]    def predict_old(self,z,lgh=(True,True,False),full_var=False):
        
        ix = self.ix
        iy = self.iy

        x = z[:,ix]
        y = z[:,iy]
        dx = len(ix)
        dy = len(iy)
        
        mu,exg,V1,V2,ni,nu =  self.precomp(x,lgh)[0]

        yp, ypg, trs = self.predict(x,lgh)
        
        xi = y - yp

        P = np.repeat(np.eye(dy,dy)[np.newaxis,:,:],exg.shape[0],0)
        P = np.dstack((P,-ypg))

        df = x-mu[:,ix]
        cf = np.einsum('nj,nj->n',np.einsum('ni,nij->nj',df, V2),df )

        if full_var:
            V = V1*((1.0+ 1.0/n + cf)/(nu+1.0))[:,np.newaxis,np.newaxis]        
        else:
            V = V1*((1.0/n + cf)/(nu+1.0))[:,np.newaxis,np.newaxis]        

        vi = np.array(map(np.linalg.inv,V))
        
        pr = np.einsum('nij,nj->ni',vi,xi)
        ll = np.einsum('nj,nj->n',pr,xi)

        return ll,xi,P,2*vi

        
[docs]class PredictorKL:
    def __init__(self,model,ix,iy):
        self.model = model
        self.ix = ix
        self.iy = iy

    @cached
    def predict(self,x,lgh): 

        ix = self.ix
        iy = self.iy
        
        dstr,tau,taug,tsh = self.model.distr.conditional(x,
                self.model.tau,iy,ix,lgh)

        ps,psg,trash = self.model.marginal(ix).resp(x,lgh)
        
        taun = np.einsum('nk,nki->ni',ps,tau)

        (mu,Psi,n,nu),(mug,Psig,ng,nug),trs = dstr.prior.nat2usual(taun,lgh)

        yp = mu
        if lgh[1]:
            taung = (np.einsum('nkj,nki->nij',psg,tau) 
                + np.einsum('nk,nkij->nij',ps,taug))

            ypg = np.einsum('njk,nij->nik',taung,mug)
        
        v = Psi/(n * (nu - len(iy) + 1.0) )[:,np.newaxis,np.newaxis] 

        return yp,ypg,v

        
[docs]    def predict_old(self,z,lgh=(True,True,False),full_var=False):
        
        ix = self.ix
        iy = self.iy

        x = z[:,ix]
        y = z[:,iy]
        dx = len(ix)
        dy = len(iy)
        
        mu,exg,V1,V2,ni,nu =  self.precomp(x,lgh)[0]

        yp, ypg, trs = self.predict(x,lgh)
        
        xi = y - yp

        P = np.repeat(np.eye(dy,dy)[np.newaxis,:,:],exg.shape[0],0)
        P = np.dstack((P,-ypg))

        df = x-mu[:,ix]
        cf = np.einsum('nj,nj->n',np.einsum('ni,nij->nj',df, V2),df )

        if full_var:
            V = V1*((1.0+ 1.0/n + cf)/(nu+1.0))[:,np.newaxis,np.newaxis]        
        else:
            V = V1*((1.0/n + cf)/(nu+1.0))[:,np.newaxis,np.newaxis]        

        vi = np.array(map(np.linalg.inv,V))
        
        pr = np.einsum('nij,nj->ni',vi,xi)
        ll = np.einsum('nj,nj->n',pr,xi)

        return ll,xi,P,2*vi