import math
import numpy as np
import scipy.linalg
import scipy.special
from caching import cached
[docs]class VDP(object):
""" Variational Dirichlet Process clustering algorithm following `"Variational Inference for Dirichlet Process Mixtures" by Blei et al. (2006) <http://ba.stat.cmu.edu/journal/2006/vol01/issue01/blei.pdf>`_.
:param distr: likelihood-prior distribution pair governing clusters. For now the only option is using a instance of :class:`dpcluster.distributions.GaussianNIW`.
:param w: non-negative prior weight. The prior has as much influence as w data points.
:param k: maximum number of clusters.
:param tol: convergence tolerance.
"""
def __init__(self,distr, w = .1, k=50,
tol = 1e-5,
max_iters = 10000):
self.max_iters = max_iters
self.tol = tol
self.distr = distr
self.w = w
self.k = k
d = self.distr.sufficient_stats_dim()
self.prior = self.distr.prior_param
self.s = np.array([0.0,0])
[docs] def batch_learn(self,x,verbose=False, sort = True):
""" Learn cluster from data. This is a batch algorithm that required all data be loaded in memory.
:arg x: sufficient statistics of the data to be clustered. Can be obtained from raw data by calling :func:`dpcluster.distributions.ConjugatePair.sufficient_stats()`
:keyword verbose: print progress report
:keyword sort: algorithm optimization. Sort clusters at every step.
Basic usage example::
>>> distr = GaussianNIW(data.shape[2])
>>> x = distr.sufficient_stats(data)
>>> vdp = VDP(distr)
>>> vdp.batch_learn(x)
>>> print vdp.cluster_parameters()
"""
n = x.shape[0]
k = self.k
wx = x[:,-1]
wt = wx.sum()
lbd = self.prior + x.sum(0) / wx.sum() * self.w
ex_alpha = 1.0
phi = np.random.random(size=n*k).reshape((n,k))
for t in range(self.max_iters):
phi /= phi.sum(1)[:,np.newaxis]
# m step
tau = (lbd[np.newaxis,:] + np.einsum('ni,nj->ij', phi, x))
psz = np.einsum('ni,n->i',phi,wx)
# stick breaking process
if sort:
ind = np.argsort(-psz)
tau = tau[ind,:]
psz = psz[ind]
if t > 0:
old = al
al = 1.0 + psz
if t > 0:
diff = np.sum(np.abs(al - old))
bt = ex_alpha + np.concatenate([
(np.cumsum(psz[:0:-1])[::-1])
,[0]
])
tmp = scipy.special.psi(al + bt)
exlv = (scipy.special.psi(al) - tmp)
exlvc = (scipy.special.psi(bt) - tmp)
elt = (exlv + np.concatenate([[0],np.cumsum(exlvc)[:-1]]))
w = self.s + np.array([-1 + k, -np.sum(exlvc[:-1])])
ex_alpha = w[0]/w[1]
# end stick breaking process
# end m step
# e_step
grad = self.distr.prior.log_partition(tau,(False,True,False))[1]
np.einsum('ki,ni->nk',grad,x,out=phi)
phi /= wx[:,np.newaxis]
phi += elt
phi -= phi.max(1)[:,np.newaxis]
np.exp(phi,phi)
if t>0:
if verbose:
print str(diff)
if diff < wt*self.tol:
break
self.al = al
self.bt = bt
self.tau = tau
self.lbd = lbd
self.glp = grad
self.elt = elt
return
[docs] def cluster_sizes(self):
""":return: Data weight assigned to each cluster.
"""
return (self.al -1)
[docs] def cluster_parameters(self):
""":return: Cluster parameters.
"""
return self.tau
[docs] def ll(self,x, ret_ll_gr_hs = (True,False,False)):
"""
Compute the log likelihoods (ll) of data with respect to the trained model.
:arg x: sufficient statistics of the data.
:arg ret_ll_gr_hs: what to return: likelihood, gradient, hessian. Derivatives taken with respect to data, not sufficient statistics.
"""
rt = ret_ll_gr_hs
llk,grk,hsk = self.distr.posterior_ll(x,self.tau,
(True,rt[1],rt[2]), True)
ll = None
gr = None
hs = None
let = self.resp_cache(self.al,self.bt)
llk = llk+let
np.exp(llk,llk)
se = llk.sum(1)
if rt[0]:
ll = np.log(se)
if rt[1] or rt[2]:
p = llk/se[:,np.newaxis]
gr = np.einsum('nk,nki->ni',p,grk)
if rt[2]:
hs1 = - gr[:,:,np.newaxis] * gr[:,np.newaxis,:]
hs2 = np.einsum('nk,nkij->nij',p, hsk)
# TODO: einsum wrong
hs3 = np.einsum('nk,nki,nkj->nij',p, grk, grk)
hs = hs1 + hs2 + hs3
return (ll,gr,hs)
@cached
def resp_cache(self,al,bt):
tmp = np.log(al + bt)
exlv = np.log(al) - tmp
exlvc = np.log(bt) - tmp
let = exlv + np.concatenate([[0],np.cumsum(exlvc)[:-1]])
return let
@cached
def pseudo_resp_cache(self,al,bt):
tmp = scipy.special.psi(al + bt)
exlv = (scipy.special.psi(al) - tmp)
exlvc = (scipy.special.psi(bt) - tmp)
elt = (exlv + np.concatenate([[0],np.cumsum(exlvc)[:-1]]))
return elt
@cached
def resp(self,x, ret_ll_gr_hs = (True,False,False)):
"""
Cluster responsabilities.
:arg x: sufficient statistics of data.
"""
cll,cgr,chs = ret_ll_gr_hs
p = None
gp = None
hp = None
llk,grk,hsk = self.distr.posterior_ll(x,self.tau,(True,cgr,chs),True)
if cll or cgr:
llk = llk + self.resp_cache(self.al,self.bt)
llk -= llk.max(1)[:,np.newaxis]
np.exp(llk,llk)
se = llk.sum(1)
p = llk/se[:,np.newaxis]
if cgr:
mn = np.einsum('nkj,nk->nj',grk,p)
gp = (grk - mn[:,np.newaxis,:] )*p[:,:,np.newaxis]
return (p,gp,hp)
@cached
def pseudo_resp(self,x, ret_ll_gr_hs = (True,False,False)):
cll,cgr,chs = ret_ll_gr_hs
p = None
gp = None
hp = None
grad = self.distr.prior.log_partition(self.tau,(False,True,False))[1]
llk = np.einsum('ki,ni->nk',grad,self.distr.sufficient_stats(x))
llk += self.pseudo_resp_cache(self.al,self.bt)
llk -= llk.max(1)[:,np.newaxis]
np.exp(llk,llk)
se = llk.sum(1)
p = llk/se[:,np.newaxis]
return (p,None,None)
[docs] def conditional_ll(self,x,cond):
"""
Conditional log likelihood.
:arg x: sufficient statistics of data.
:arg cond: slice representing variables to condition on
"""
ll , gr, hs = self.ll(x,(True,True,True), usual_x=True)
ll_ , gr_, hs_ = self.marginal(cond).ll(x,(True,True,True),usual_x=True)
ll -= ll_
gr[:,slc] -= gr_
#line below will fail
#hs -= hs_
return (ll,gr,None)
[docs] def plot_clusters(self,**kwargs):
"""
Asks each cluster to plot itself. For Gaussian multidimensional clusters pass ``slc=np.array([i,j])`` as an argument to project clusters on the plane defined by the i'th and j'th coordinate.
"""
sz = self.cluster_sizes()
self.distr.plot(self.tau, sz, **kwargs)
@cached
def marginal(self,slc):
distr, tau = self.distr.marginal(self.tau,slc)
rv = type(self)(distr)
rv.tau = tau
rv.al = self.al
rv.bt = self.bt
return rv
@cached
def conditional_expectation(self,x,iy,ix,ret_ll_gr_hs = (True,False,False)):
ps, psg, trash = self.marginal(ix).resp(x,ret_ll_gr_hs)
ex, exg, trash = self.distr.conditional_expectation(x,self.tau,iy,ix,
ret_ll_gr_hs)
ef = np.einsum('nki,nk->ni',ex,ps)
efg = np.einsum('nka,nki->nia',psg,ex)+np.einsum('nk,nkia->nia',ps,exg)
#efg = np.einsum('nka,nki->nia',psg,ex)+np.einsum('nk,nkia->nia',ps,exg)
return ef,efg,None
[docs] def conditional_variance(self,x,iy,ix,ret_ll_gr_hs = (True,False,False)):
ps, psg, trash = self.marginal(ix).resp(x,ret_ll_gr_hs)
ex, exg, trash = self.distr.conditional_expectation(x,self.tau,iy,ix,
ret_ll_gr_hs)
vr, vrg, trash = self.distr.conditional_variance(x,self.tau,iy,ix,
ret_ll_gr_hs)
ef, efg, trash = self.conditional_expectation(x,iy,ix,ret_ll_gr_hs)
de = ex - ef[:,np.newaxis,:]
vt = de[:,:,:,np.newaxis]*de[:,:,np.newaxis,:]
vs = vt+vr
vf = np.einsum('nk,nkij->nij',ps,vs)
deg = exg - efg[:,np.newaxis,:,:]
vsg = de[:,:,np.newaxis,:,np.newaxis] * deg[:,:,:,np.newaxis,:]
vsg += de[:,:,:,np.newaxis,np.newaxis] * deg[:,:,np.newaxis,:,:]
vsg += vrg
vfg = np.einsum('nk,nkija->nija',ps,vsg)
vfg += np.einsum('nka,nkij->nija',psg,vs)
return vf, vfg, None
[docs] def var_cond_exp(self,x,iy,ix,ret_ll_gr_hs = (True,False,False),
full_var=False):
ps, psg, trash = self.marginal(ix).resp(x,ret_ll_gr_hs)
vr, vrg, trash = self.distr.conditional_variance(x,self.tau,iy,ix,
ret_ll_gr_hs,full_var)
ps2 = ps*ps
vf = np.einsum('nk,nkij->nij',ps2,vr)
vfg = None
if ret_ll_gr_hs[1]:
vfg = np.einsum('nk,nkija->nija',ps2,vrg)
vfg += 2*np.einsum('nka,nkij->nija',psg*ps[:,:,np.newaxis],vr)
return vf, vfg, None
[docs]class OnlineVDP:
"""Experimental online clustering algorithm.
:param distr: likelihood-prior distribution pair governing clusters. For now the only option is using a instance of :class:`dpcluster.distributions.GaussianNIW`.
:param w: non-negative prior weight. The prior has as much influence as w data points.
:param k: maximum number of clusters.
:param tol: convergence tolerance.
:param max_items: maximum queue length.
"""
def __init__(self, distr, w=.1, k = 25, tol=1e-3, max_items = 100):
self.distr = distr
self.w = w
self.wm = w
self.k = k
self.tol = tol
self.max_n = max_items
self.xs = []
self.vdps = []
self.dim = self.distr.sufficient_stats_dim()
[docs] def put(self,r,s=0):
"""
Append data.
:arg r: sufficient statistics of data to be appended.
Basic usage example::
>>> distr = GaussianNIW(data.shape[2])
>>> x = distr.sufficient_stats(data)
>>> vdp = OnlineVDP(distr)
>>> vdp.put(x)
>>> print vdp.get_model().cluster_parameters()
"""
if s<len(self.xs):
ar = np.vstack((self.xs[s],r))
else:
ar = r
self.xs.append(None)
mn = self.max_n*(s+1)
pcs = ar.shape[0] / mn
self.xs[s] = ar[pcs*mn:,:]
if s<len(self.vdps):
proc = self.vdps[s]
else:
if s>0:
w = self.wm
else:
w = self.w
proc = VDP(self.distr, w, self.k*(s+1), self.tol)
self.vdps.append(proc)
if pcs==0:
return
for x in np.split(ar[:pcs*self.max_n,:],pcs):
proc.batch_learn(x,verbose=False)
xc = proc.tau - proc.lbd[np.newaxis,:]
xc = xc[xc[:,-1]>1e-5]
self.put(xc,s+1)
[docs] def get_model(self):
"""
Get current model.
:return: instance of :class:`dpcluster.algorithms.VDP`
"""
np.random.seed(1)
proc = VDP(self.distr, self.wm, self.k*(len(self.xs)), self.tol)
proc.batch_learn(np.vstack(self.xs[::-1]))
return proc
[docs]class Predictor:
def __init__(self,model,ix,iy):
self.model = model
self.ix = ix
self.iy = iy
@cached
def distr_fit(self,x,lgh):
ix = self.ix
iy = self.iy
ps,psg,trash =self.model.marginal(ix).resp(x,lgh)
tau = np.einsum('nk,ki->ni',ps,self.model.tau)
if lgh[1]:
taug =np.einsum('nka,ki->nia',psg,self.model.tau)
else:
taug = None
return tau,taug
@cached
def precomp(self,x,lgh):
ix = self.ix
iy = self.iy
tau,taug = self.distr_fit(x,lgh)
(mu,Psi,n,nu),(mug,Psig,ng,nug),trs = self.model.distr.prior.nat2usual(tau,lgh)
A,B,D = Psi[:,iy,:][:,:,iy], Psi[:,iy,:][:,:,ix], Psi[:,ix,:][:,:,ix]
Di = np.array(map(np.linalg.inv,D))
P = np.einsum('njk,nkl->njl',B,Di)
Li = A-np.einsum('nik,nlk->nil',P,B)
V1 = Li
V2 = Di
ls = mu,P,V1,V2,n,nu
gr = None
if lgh[1]:
mug = np.einsum('nia,naj->nij',mug,taug)
Psig = np.einsum('nija,nab->nijb',Psig,taug)
Bg = Psig[:,iy,:,:][:,:,ix,:]
Dg = Psig[:,ix,:,:][:,:,ix,:]
Pg = np.einsum('njka,nkl->njla',Bg,Di)
Pg -= np.einsum('njk,nkl,nlma,nmq->njqa',B,Di,Dg,Di)
gr = (mug,Pg,None,None,None,None)
return ls,gr,None
@cached
def predict(self,x,lgh):
ix = self.ix
iy = self.iy
vl,gr,trs = self.precomp(x,lgh)
mu,p,V1,V2,ni,nu = vl
mug,pg,t1,t2,t3,t4 = gr
df = x-mu[:,ix]
yp = (mu[:,iy] + np.einsum('nij,nj->ni',p,df))
ypg = None
if lgh[1]:
dfg = -mug[:,ix,:]
dfg += np.eye(dfg.shape[1])[np.newaxis,:,:]
ypg = (mug[:,iy,:]
+ np.einsum('nij,nja->nia',p,dfg)
+ np.einsum('nija,nj->nia',pg,df))
return yp,ypg,None
[docs] def predict_old(self,z,lgh=(True,True,False),full_var=False):
ix = self.ix
iy = self.iy
x = z[:,ix]
y = z[:,iy]
dx = len(ix)
dy = len(iy)
mu,exg,V1,V2,ni,nu = self.precomp(x,lgh)[0]
yp, ypg, trs = self.predict(x,lgh)
xi = y - yp
P = np.repeat(np.eye(dy,dy)[np.newaxis,:,:],exg.shape[0],0)
P = np.dstack((P,-ypg))
df = x-mu[:,ix]
cf = np.einsum('nj,nj->n',np.einsum('ni,nij->nj',df, V2),df )
if full_var:
V = V1*((1.0+ 1.0/n + cf)/(nu+1.0))[:,np.newaxis,np.newaxis]
else:
V = V1*((1.0/n + cf)/(nu+1.0))[:,np.newaxis,np.newaxis]
vi = np.array(map(np.linalg.inv,V))
pr = np.einsum('nij,nj->ni',vi,xi)
ll = np.einsum('nj,nj->n',pr,xi)
return ll,xi,P,2*vi
[docs]class PredictorKL:
def __init__(self,model,ix,iy):
self.model = model
self.ix = ix
self.iy = iy
@cached
def predict(self,x,lgh):
ix = self.ix
iy = self.iy
dstr,tau,taug,tsh = self.model.distr.conditional(x,
self.model.tau,iy,ix,lgh)
ps,psg,trash = self.model.marginal(ix).resp(x,lgh)
taun = np.einsum('nk,nki->ni',ps,tau)
(mu,Psi,n,nu),(mug,Psig,ng,nug),trs = dstr.prior.nat2usual(taun,lgh)
yp = mu
if lgh[1]:
taung = (np.einsum('nkj,nki->nij',psg,tau)
+ np.einsum('nk,nkij->nij',ps,taug))
ypg = np.einsum('njk,nij->nik',taung,mug)
v = Psi/(n * (nu - len(iy) + 1.0) )[:,np.newaxis,np.newaxis]
return yp,ypg,v
[docs] def predict_old(self,z,lgh=(True,True,False),full_var=False):
ix = self.ix
iy = self.iy
x = z[:,ix]
y = z[:,iy]
dx = len(ix)
dy = len(iy)
mu,exg,V1,V2,ni,nu = self.precomp(x,lgh)[0]
yp, ypg, trs = self.predict(x,lgh)
xi = y - yp
P = np.repeat(np.eye(dy,dy)[np.newaxis,:,:],exg.shape[0],0)
P = np.dstack((P,-ypg))
df = x-mu[:,ix]
cf = np.einsum('nj,nj->n',np.einsum('ni,nij->nj',df, V2),df )
if full_var:
V = V1*((1.0+ 1.0/n + cf)/(nu+1.0))[:,np.newaxis,np.newaxis]
else:
V = V1*((1.0/n + cf)/(nu+1.0))[:,np.newaxis,np.newaxis]
vi = np.array(map(np.linalg.inv,V))
pr = np.einsum('nij,nj->ni',vi,xi)
ll = np.einsum('nj,nj->n',pr,xi)
return ll,xi,P,2*vi