cf

Sleeping

App Files Files Community

Yeefei committed on Apr 17, 2024

Commit

5ebd65a

verified ·

1 Parent(s): 9aa68f2

Upload 2 files

Browse files

Files changed (2) hide show

app.py +1 -1
vae.py +196 -155

app.py CHANGED Viewed

@@ -727,4 +727,4 @@ with gr.Blocks(theme=gr.themes.Default()) as demo:
 if __name__ == "__main__":
     demo.queue()
-    demo.launch(share=True)

 if __name__ == "__main__":
     demo.queue()
+    demo.launch()

vae.py CHANGED Viewed

@@ -1,14 +1,20 @@
 import numpy as np
 import torch
-import torch.nn as nn
-import torch.nn.functional as F
 import torch.distributions as dist
 EPS = -9  # minimum logscale
 @torch.jit.script
-def gaussian_kl(q_loc, q_logscale, p_loc, p_logscale):
     return (
         -0.5
         + p_logscale
@@ -20,27 +26,27 @@ def gaussian_kl(q_loc, q_logscale, p_loc, p_logscale):
 @torch.jit.script
-def sample_gaussian(loc, logscale):
     return loc + logscale.exp() * torch.randn_like(loc)
 class Block(nn.Module):
     def __init__(
         self,
-        in_width,
-        bottleneck,
-        out_width,
-        kernel_size=3,
-        residual=True,
-        down_rate=None,
-        version=None,
     ):
         super().__init__()
         self.d = down_rate
         self.residual = residual
         padding = 0 if kernel_size == 1 else 1
-        if version == "light":  # for ukbb
             activation = nn.ReLU()
             self.conv = nn.Sequential(
                 activation,
@@ -64,7 +70,7 @@ class Block(nn.Module):
         if self.residual and (self.d or in_width > out_width):
             self.width_proj = nn.Conv2d(in_width, out_width, 1, 1)
-    def forward(self, x):
         out = self.conv(x)
         if self.residual:
             if x.shape[1] != out.shape[1]:
@@ -79,7 +85,7 @@ class Block(nn.Module):
 class Encoder(nn.Module):
-    def __init__(self, args):
         super().__init__()
         # parse architecture
         stages = []
@@ -91,23 +97,17 @@ class Encoder(nn.Module):
             if i == 0:  # define network stem
                 if n_blocks == 0 and "d" not in stage:
                     print("Using stride=2 conv encoder stem.")
-                    self.stem = nn.Conv2d(
-                        args.input_channels,
-                        args.widths[1],
-                        kernel_size=7,
-                        stride=2,
-                        padding=3,
-                    )
                     continue
                 else:
-                    self.stem = nn.Conv2d(
-                        args.input_channels,
-                        args.widths[0],
-                        kernel_size=7,
-                        stride=1,
-                        padding=3,
-                    )
             stages += [(args.widths[i], None) for _ in range(n_blocks)]
             if "d" in stage:  # downsampling block
                 stages += [(args.widths[i + 1], int(stage[stage.index("d") + 1]))]
@@ -118,12 +118,11 @@ class Encoder(nn.Module):
             blocks.append(
                 Block(prev_width, bottleneck, width, down_rate=d, version=args.vr)
             )
-        # scale weights of last conv layer in each block
         for b in blocks:
             b.conv[-1].weight.data *= np.sqrt(1 / len(blocks))
         self.blocks = nn.ModuleList(blocks)
-    def forward(self, x):
         x = self.stem(x)
         acts = {}
         for block in self.blocks:
@@ -136,24 +135,18 @@ class Encoder(nn.Module):
 class DecoderBlock(nn.Module):
-    def __init__(self, args, in_width, out_width, resolution):
         super().__init__()
         bottleneck = int(in_width / args.bottleneck)
         self.res = resolution
         self.stochastic = self.res <= args.z_max_res
         self.z_dim = args.z_dim
         self.cond_prior = args.cond_prior
         k = 3 if self.res > 2 else 1
-        if self.cond_prior:  # conditional prior
-            p_in_width = in_width + args.context_dim
-        else:  # exogenous prior
-            p_in_width = in_width
-            # self.z_feat_proj = nn.Conv2d(self.z_dim + in_width, out_width, 1)
-        self.z_feat_proj = nn.Conv2d(self.z_dim + in_width, out_width, 1)
         self.prior = Block(
-            p_in_width,
             bottleneck,
             2 * self.z_dim + in_width,
             kernel_size=k,
@@ -170,11 +163,21 @@ class DecoderBlock(nn.Module):
                 version=args.vr,
             )
         self.z_proj = nn.Conv2d(self.z_dim + args.context_dim, in_width, 1)
         self.conv = Block(
             in_width, bottleneck, out_width, kernel_size=k, version=args.vr
         )
-    def forward_prior(self, z, pa=None, t=None):
         if self.cond_prior:
             z = torch.cat([z, pa], dim=1)
         z = self.prior(z)
@@ -185,8 +188,18 @@ class DecoderBlock(nn.Module):
             p_logscale = p_logscale + torch.tensor(t).to(z.device).log()
         return p_loc, p_logscale, p_features
-    def forward_posterior(self, z, pa, x, t=None):
         h = torch.cat([z, pa, x], dim=1)
         q_loc, q_logscale = self.posterior(h).chunk(2, dim=1)
         if t is not None:
             q_logscale = q_logscale + torch.tensor(t).to(z.device).log()
@@ -194,7 +207,7 @@ class DecoderBlock(nn.Module):
 class Decoder(nn.Module):
-    def __init__(self, args):
         super().__init__()
         # parse architecture
         stages = []
@@ -218,73 +231,58 @@ class Decoder(nn.Module):
                 )
         self.bias = nn.ParameterList(bias)
         self.cond_prior = args.cond_prior
-        self.is_drop_cond = True if "mnist" in args.hps else False  # hacky
-    def _scale_weights(self):
-        scale = np.sqrt(1 / len(self.blocks))
-        for b in self.blocks:
-            b.z_proj.weight.data *= scale
-            b.conv.conv[-1].weight.data *= scale
-            b.prior.conv[-1].weight.data *= 0.0
-    def forward(self, parents, x=None, t=None, abduct=False, latents=[]):
         # learnt params for each resolution r
         bias = {r.shape[2]: r for r in self.bias}
-        h = bias[1].repeat(parents.shape[0], 1, 1, 1)  # h_init
-        z = h  # for exogenous prior
-        # for conditioning dropout, stochastic path (p1), deterministic path (p2)
-        p1, p2 = self.drop_cond() if (self.training and self.cond_prior) else (1, 1)
         stats = []
         for i, block in enumerate(self.blocks):
             res = block.res  # current block resolution, e.g. 64x64
             pa = parents[..., :res, :res].clone()  # select parents @ res
-            if (
-                self.is_drop_cond
-            ):  # for morphomnist w/ conditioning dropout. Hacky, clean up later
-                pa_drop1 = pa.clone()
-                pa_drop1[:, 2:, ...] = pa_drop1[:, 2:, ...] * p1
-                pa_drop2 = pa.clone()
-                pa_drop2[:, 2:, ...] = pa_drop2[:, 2:, ...] * p2
-            else:  # for ukbb
-                pa_drop1 = pa_drop2 = pa
             if h.size(-1) < res:  # upsample previous layer output
                 b = bias[res] if res in bias.keys() else 0  # broadcasting
                 h = b + F.interpolate(h, scale_factor=res / h.shape[-1])
-            if block.cond_prior:  # conditional prior: p(z_i | z_<i, pa_x)
-                # w/ posterior correction
-                # p_loc, p_logscale, p_feat = block.forward_prior(h, pa_drop1, t=t)
-                if z.size(-1) < res:  # w/o posterior correction
-                    z = b + F.interpolate(z, scale_factor=res / z.shape[-1])
-                p_loc, p_logscale, p_feat = block.forward_prior(z, pa_drop1, t=t)
-            else:  # exogenous prior: p(z_i | z_<i)
-                if z.size(-1) < res:
-                    z = b + F.interpolate(z, scale_factor=res / z.shape[-1])
-                p_loc, p_logscale, p_feat = block.forward_prior(z, t=t)
-            # computation tree:
-            #                     decoder block
-            #                  /                 \
-            #     deterministic                   stochastic
-            #          |                         /          \
-            #   forward z = p_loc         given x            not given x
-            #                           /                  /            \
-            #                     abduct          forward z or z*     z ~ prior
-            #                    /      \                                |
-            # (prior:   conditional    exogenous)            get p(z|pa*) if abduct
-            #              get z*         get z
-            #
             if block.stochastic:
-                if x is not None:  # z_i ~ q(z_i | z_<i, pa_x, x)
-                    q_loc, q_logscale = block.forward_posterior(h, pa, x[res], t=t)
                     z = sample_gaussian(q_loc, q_logscale)
                     stat = dict(kl=gaussian_kl(q_loc, q_logscale, p_loc, p_logscale))
-                    # abduct exogenous noise
-                    if abduct:
                         if block.cond_prior:  # z* if conditional prior
                             stat.update(
                                 dict(
@@ -292,57 +290,52 @@ class Decoder(nn.Module):
                                 )
                             )
                         else:  # z if exogenous prior
-                            # stat.update(dict(z=z.detach()))
-                            stat.update(dict(z=z))  # if cf training
                     stats.append(stat)
                 else:
-                    if latents[i] is None:
                         z = sample_gaussian(p_loc, p_logscale)
                         if abduct and block.cond_prior:  # for abducting z*
                             stats.append(
                                 dict(z={"p_loc": p_loc, "p_logscale": p_logscale})
                             )
-                    else:
-                        try:  # forward fixed latents z or z*
-                            z = latents[i]
-                        except:  # sample prior
-                            z = sample_gaussian(p_loc, p_logscale)
-                            if abduct and block.cond_prior:  # for abducting z*
-                                stats.append(
-                                    dict(z={"p_loc": p_loc, "p_logscale": p_logscale})
-                                )
-            else:
-                z = p_loc  # deterministic path
             h = h + p_feat  # merge prior features
-            h = self.forward_merge(block, h, z, pa_drop2)
-            # if not block.cond_prior:
-            if (i + 1) < len(self.blocks):
-                # z independent of pa_x for next layer prior
-                z = block.z_feat_proj(torch.cat([z, p_feat], dim=1))
         return h, stats
-    def forward_merge(self, block, h, z, pa):
-        # h_i = h_<i + f(z_i, pa_x)
-        h = h + block.z_proj(torch.cat([z, pa], dim=1))
-        return block.conv(h)
-    def drop_cond(self):
         opt = dist.Categorical(1 / 3 * torch.ones(3)).sample()
         if opt == 0:  # drop stochastic path
-            p1, p2 = 0, 1
         elif opt == 1:  # drop deterministic path
-            p1, p2 = 1, 0
         elif opt == 2:  # keep both
-            p1, p2 = 1, 1
-        return p1, p2
 class DGaussNet(nn.Module):
-    def __init__(self, args):
         super(DGaussNet, self).__init__()
         self.x_loc = nn.Conv2d(
             args.widths[0], args.input_channels, kernel_size=1, stride=1
@@ -371,36 +364,48 @@ class DGaussNet(nn.Module):
             else:
                 NotImplementedError(f"{args.x_like} not implemented.")
-    def forward(self, h, x=None, t=None):
         loc, logscale = self.x_loc(h), self.x_logscale(h).clamp(min=EPS)
         # for RGB inputs
-        # if hasattr(self, 'channel_coeffs'):
-        #     coeff = torch.tanh(self.channel_coeffs(h))
-        #     if x is None:  # inference
-        #         # loc = loc + logscale.exp() * torch.randn_like(loc)  # random sampling
-        #         f = lambda x: torch.clamp(x, min=-1, max=1)
-        #         loc_red = f(loc[:,0,...])
-        #         loc_green = f(loc[:,1,...] + coeff[:,0,...] * loc_red)
-        #         loc_blue = f(loc[:,2,...] + coeff[:,1,...] * loc_red + coeff[:,2,...] * loc_green)
-        #     else:  # training
-        #         loc_red = loc[:,0,...]
-        #         loc_green = loc[:,1,...] + coeff[:,0,...] * x[:,0,...]
-        #         loc_blue = loc[:,2,...] + coeff[:,1,...] * x[:,0,...] + coeff[:,2,...] * x[:,1,...]
-        #     loc = torch.cat([loc_red.unsqueeze(1),
-        #         loc_green.unsqueeze(1), loc_blue.unsqueeze(1)], dim=1)
         if t is not None:
             logscale = logscale + torch.tensor(t).to(h.device).log()
         return loc, logscale
-    def approx_cdf(self, x):
         return 0.5 * (
             1.0 + torch.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * torch.pow(x, 3)))
         )
-    def nll(self, h, x):
         loc, logscale = self.forward(h, x)
         centered_x = x - loc
         inv_stdv = torch.exp(-logscale)
@@ -420,7 +425,9 @@ class DGaussNet(nn.Module):
         )
         return -1.0 * log_probs.mean(dim=(1, 2, 3))
-    def sample(self, h, return_loc=True, t=None):
         if return_loc:
             x, logscale = self.forward(h)
         else:
@@ -431,7 +438,7 @@ class DGaussNet(nn.Module):
 class HVAE(nn.Module):
-    def __init__(self, args):
         super().__init__()
         args.vr = "light" if "ukbb" in args.hps else None  # hacky
         self.encoder = Encoder(args)
@@ -442,10 +449,30 @@ class HVAE(nn.Module):
             NotImplementedError(f"{args.x_like} not implemented.")
         self.cond_prior = args.cond_prior
         self.free_bits = args.kl_free_bits
-    def forward(self, x, parents, beta=1):
         acts = self.encoder(x)
         h, stats = self.decoder(parents=parents, x=acts)
         nll_pp = self.likelihood.nll(h, x)
         if self.free_bits > 0:
             free_bits = torch.tensor(self.free_bits).type_as(nll_pp)
@@ -456,17 +483,28 @@ class HVAE(nn.Module):
                 ).sum()
         else:
             kl_pp = torch.zeros_like(nll_pp)
-            for i, stat in enumerate(stats):
                 kl_pp += stat["kl"].sum(dim=(1, 2, 3))
         kl_pp = kl_pp / np.prod(x.shape[1:])  # per pixel
-        elbo = nll_pp.mean() + beta * kl_pp.mean()  # negative elbo (free energy)
-        return dict(elbo=elbo, nll=nll_pp.mean(), kl=kl_pp.mean())
-    def sample(self, parents, return_loc=True, t=None):
         h, _ = self.decoder(parents=parents, t=t)
         return self.likelihood.sample(h, return_loc, t=t)
-    def abduct(self, x, parents, cf_parents=None, alpha=0.5, t=None):
         acts = self.encoder(x)
         _, q_stats = self.decoder(
             x=acts, parents=parents, abduct=True, t=t
@@ -493,8 +531,9 @@ class HVAE(nn.Module):
                 # Option1: mixture distribution: r(z_i | z_{<i}, x, pa, pa*)
                 #   = a*q(z_i | z_{<i}, x, pa) + (1-a)*p(z_i | z_{<i}, pa*)
                 r_loc = alpha * q_loc + (1 - alpha) * p_loc
-                # assumes independence
-                r_var = alpha * q_scale.pow(2) + (1 - alpha) * p_var
                 # r_var = a*(q_loc.pow(2) + q_var) + (1-a)*(p_loc.pow(2) + p_var) - r_loc.pow(2)
                 # # Option 2: precision weighted distribution
@@ -512,6 +551,8 @@ class HVAE(nn.Module):
         else:
             return q_stats  # zs
-    def forward_latents(self, latents, parents, t=None):
         h, _ = self.decoder(latents=latents, parents=parents, t=t)
         return self.likelihood.sample(h, t=t)

+from typing import Dict, List, Optional, Tuple
 import numpy as np
 import torch
 import torch.distributions as dist
+import torch.nn.functional as F
+from torch import Tensor, nn
+from hps import Hparams
 EPS = -9  # minimum logscale
 @torch.jit.script
+def gaussian_kl(
+    q_loc: Tensor, q_logscale: Tensor, p_loc: Tensor, p_logscale: Tensor
+) -> Tensor:
     return (
         -0.5
         + p_logscale
 @torch.jit.script
+def sample_gaussian(loc: Tensor, logscale: Tensor) -> Tensor:
     return loc + logscale.exp() * torch.randn_like(loc)
 class Block(nn.Module):
     def __init__(
         self,
+        in_width: int,
+        bottleneck: int,
+        out_width: int,
+        kernel_size: int = 3,
+        residual: bool = True,
+        down_rate: Optional[int] = None,
+        version: Optional[str] = None,
     ):
         super().__init__()
         self.d = down_rate
         self.residual = residual
         padding = 0 if kernel_size == 1 else 1
+        if version == "light":  # uses less VRAM
             activation = nn.ReLU()
             self.conv = nn.Sequential(
                 activation,
         if self.residual and (self.d or in_width > out_width):
             self.width_proj = nn.Conv2d(in_width, out_width, 1, 1)
+    def forward(self, x: Tensor) -> Tensor:
         out = self.conv(x)
         if self.residual:
             if x.shape[1] != out.shape[1]:
 class Encoder(nn.Module):
+    def __init__(self, args: Hparams):
         super().__init__()
         # parse architecture
         stages = []
             if i == 0:  # define network stem
                 if n_blocks == 0 and "d" not in stage:
                     print("Using stride=2 conv encoder stem.")
+                    stem_width, stem_stride = args.widths[1], 2
                     continue
                 else:
+                    stem_width, stem_stride = args.widths[0], 1
+                self.stem = nn.Conv2d(
+                    args.input_channels,
+                    stem_width,
+                    kernel_size=7,
+                    stride=stem_stride,
+                    padding=3,
+                )
             stages += [(args.widths[i], None) for _ in range(n_blocks)]
             if "d" in stage:  # downsampling block
                 stages += [(args.widths[i + 1], int(stage[stage.index("d") + 1]))]
             blocks.append(
                 Block(prev_width, bottleneck, width, down_rate=d, version=args.vr)
             )
         for b in blocks:
             b.conv[-1].weight.data *= np.sqrt(1 / len(blocks))
         self.blocks = nn.ModuleList(blocks)
+    def forward(self, x: Tensor) -> Dict[int, Tensor]:
         x = self.stem(x)
         acts = {}
         for block in self.blocks:
 class DecoderBlock(nn.Module):
+    def __init__(self, args: Hparams, in_width: int, out_width: int, resolution: int):
         super().__init__()
         bottleneck = int(in_width / args.bottleneck)
         self.res = resolution
         self.stochastic = self.res <= args.z_max_res
         self.z_dim = args.z_dim
         self.cond_prior = args.cond_prior
+        self.q_correction = args.q_correction
         k = 3 if self.res > 2 else 1
         self.prior = Block(
+            (in_width + args.context_dim if self.cond_prior else in_width),
             bottleneck,
             2 * self.z_dim + in_width,
             kernel_size=k,
                 version=args.vr,
             )
         self.z_proj = nn.Conv2d(self.z_dim + args.context_dim, in_width, 1)
+        if not self.q_correction:  # for no posterior correction
+            self.z_feat_proj = nn.Conv2d(self.z_dim + in_width, out_width, 1)
         self.conv = Block(
             in_width, bottleneck, out_width, kernel_size=k, version=args.vr
         )
+    def forward_prior(
+        self, z: Tensor, pa: Optional[Tensor] = None, t: Optional[float] = None
+    ) -> Tuple[Tensor, Tensor, Tensor]:
+        #print('Prior')
+        #print('z')
+        #print(z.shape)
+        #print('pa')
+        #print(pa.shape)
         if self.cond_prior:
             z = torch.cat([z, pa], dim=1)
         z = self.prior(z)
             p_logscale = p_logscale + torch.tensor(t).to(z.device).log()
         return p_loc, p_logscale, p_features
+    def forward_posterior(
+        self, z: Tensor, x: Tensor, pa: Tensor, t: Optional[float] = None
+    ) -> Tuple[Tensor, Tensor]:
+        #print('Posterior')
+        #print('z')
+        #print(z.shape)
+        #print('x')
+        #print(x.shape)
+        #print('pa')
+        #print(pa.shape)
         h = torch.cat([z, pa, x], dim=1)
+        #print('h shape: ', h.shape)
         q_loc, q_logscale = self.posterior(h).chunk(2, dim=1)
         if t is not None:
             q_logscale = q_logscale + torch.tensor(t).to(z.device).log()
 class Decoder(nn.Module):
+    def __init__(self, args: Hparams):
         super().__init__()
         # parse architecture
         stages = []
                 )
         self.bias = nn.ParameterList(bias)
         self.cond_prior = args.cond_prior
+        self.is_drop_cond = True if "morphomnist" in args.hps else False  # hacky
+    def forward(
+        self,
+        parents: Tensor,
+        x: Optional[Dict[int, Tensor]] = None,
+        t: Optional[float] = None,
+        abduct: bool = False,
+        latents: List[Tensor] = [],
+    ) -> Tuple[Tensor, List[Dict[str, Tensor]]]:
         # learnt params for each resolution r
         bias = {r.shape[2]: r for r in self.bias}
+        h = z = bias[1].repeat(parents.shape[0], 1, 1, 1)  # initial state
+        # conditioning dropout: stochastic path (p_sto), deterministic path (p_det)
+        p_sto, p_det = (
+            self.drop_cond() if (self.training and self.cond_prior) else (1, 1)
+        )
         stats = []
         for i, block in enumerate(self.blocks):
             res = block.res  # current block resolution, e.g. 64x64
             pa = parents[..., :res, :res].clone()  # select parents @ res
+            # for morphomnist w/ conditioning dropout of y only, clean up later
+            if self.is_drop_cond:
+                pa_sto, pa_det = pa.clone(), pa.clone()
+                pa_sto[:, 2:, ...] = pa_sto[:, 2:, ...] * p_sto
+                pa_det[:, 2:, ...] = pa_det[:, 2:, ...] * p_det
+            else:  # disabled otherwise
+                pa_sto = pa_det = pa
             if h.size(-1) < res:  # upsample previous layer output
                 b = bias[res] if res in bias.keys() else 0  # broadcasting
                 h = b + F.interpolate(h, scale_factor=res / h.shape[-1])
+            if block.q_correction:
+                p_input = h  # current prior depends on previous posterior
+            else:  # current prior depends on previous prior only, upsample previous prior latent z
+                p_input = (
+                    b + F.interpolate(z, scale_factor=res / z.shape[-1])
+                    if z.size(-1) < res
+                    else z
+                )
+            p_loc, p_logscale, p_feat = block.forward_prior(p_input, pa_sto, t=t)
             if block.stochastic:
+                if x is not None:  # z_i ~ q(z_i | z_<i, x, pa_x)
+                    # print(res)
+                    q_loc, q_logscale = block.forward_posterior(h, x[res], pa, t=t)
                     z = sample_gaussian(q_loc, q_logscale)
                     stat = dict(kl=gaussian_kl(q_loc, q_logscale, p_loc, p_logscale))
+                    if abduct:  # abduct exogenous noise
                         if block.cond_prior:  # z* if conditional prior
                             stat.update(
                                 dict(
                                 )
                             )
                         else:  # z if exogenous prior
+                            stat.update(dict(z=z))  # .detach() z if not cf training
                     stats.append(stat)
                 else:
+                    try:  # forward abducted latents
+                        z = latents[i]
+                        z = sample_gaussian(p_loc, p_logscale) if z is None else z
+                    except:  # sample prior
                         z = sample_gaussian(p_loc, p_logscale)
                         if abduct and block.cond_prior:  # for abducting z*
                             stats.append(
                                 dict(z={"p_loc": p_loc, "p_logscale": p_logscale})
                             )
+            else:  # deterministic block
+                z = p_loc
             h = h + p_feat  # merge prior features
+            # h_i = h_<i + f(z_i, pa_x)
+            h = h + block.z_proj(torch.cat([z, pa], dim=1))
+            h = block.conv(h)
+            if not block.q_correction:
+                if (i + 1) < len(self.blocks):
+                    # z independent of pa_x for next layer prior
+                    z = block.z_feat_proj(torch.cat([z, p_feat], dim=1))
         return h, stats
+    def _scale_weights(self):
+        scale = np.sqrt(1 / len(self.blocks))
+        for b in self.blocks:
+            b.z_proj.weight.data *= scale
+            b.conv.conv[-1].weight.data *= scale
+            b.prior.conv[-1].weight.data *= 0.0
+    @torch.no_grad()
+    def drop_cond(self) -> Tuple[int, int]:
         opt = dist.Categorical(1 / 3 * torch.ones(3)).sample()
         if opt == 0:  # drop stochastic path
+            p_sto, p_det = 0, 1
         elif opt == 1:  # drop deterministic path
+            p_sto, p_det = 1, 0
         elif opt == 2:  # keep both
+            p_sto, p_det = 1, 1
+        return p_sto, p_det
 class DGaussNet(nn.Module):
+    def __init__(self, args: Hparams):
         super(DGaussNet, self).__init__()
         self.x_loc = nn.Conv2d(
             args.widths[0], args.input_channels, kernel_size=1, stride=1
             else:
                 NotImplementedError(f"{args.x_like} not implemented.")
+    def forward(
+        self, h: Tensor, x: Optional[Tensor] = None, t: Optional[float] = None
+    ) -> Tuple[Tensor, Tensor]:
         loc, logscale = self.x_loc(h), self.x_logscale(h).clamp(min=EPS)
         # for RGB inputs
+        if hasattr(self, "channel_coeffs"):
+            coeff = torch.tanh(self.channel_coeffs(h))
+            if x is None:  # inference
+                # loc = loc + logscale.exp() * torch.randn_like(loc)  # random sampling
+                f = lambda x: torch.clamp(x, min=-1, max=1)
+                loc_red = f(loc[:, 0, ...])
+                loc_green = f(loc[:, 1, ...] + coeff[:, 0, ...] * loc_red)
+                loc_blue = f(
+                    loc[:, 2, ...]
+                    + coeff[:, 1, ...] * loc_red
+                    + coeff[:, 2, ...] * loc_green
+                )
+            else:  # training
+                loc_red = loc[:, 0, ...]
+                loc_green = loc[:, 1, ...] + coeff[:, 0, ...] * x[:, 0, ...]
+                loc_blue = (
+                    loc[:, 2, ...]
+                    + coeff[:, 1, ...] * x[:, 0, ...]
+                    + coeff[:, 2, ...] * x[:, 1, ...]
+                )
+            loc = torch.cat(
+                [loc_red.unsqueeze(1), loc_green.unsqueeze(1), loc_blue.unsqueeze(1)],
+                dim=1,
+            )
         if t is not None:
             logscale = logscale + torch.tensor(t).to(h.device).log()
         return loc, logscale
+    def approx_cdf(self, x: Tensor) -> Tensor:
         return 0.5 * (
             1.0 + torch.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * torch.pow(x, 3)))
         )
+    def nll(self, h: Tensor, x: Tensor) -> Tensor:
         loc, logscale = self.forward(h, x)
         centered_x = x - loc
         inv_stdv = torch.exp(-logscale)
         )
         return -1.0 * log_probs.mean(dim=(1, 2, 3))
+    def sample(
+        self, h: Tensor, return_loc: bool = True, t: Optional[float] = None
+    ) -> Tuple[Tensor, Tensor]:
         if return_loc:
             x, logscale = self.forward(h)
         else:
 class HVAE(nn.Module):
+    def __init__(self, args: Hparams):
         super().__init__()
         args.vr = "light" if "ukbb" in args.hps else None  # hacky
         self.encoder = Encoder(args)
             NotImplementedError(f"{args.x_like} not implemented.")
         self.cond_prior = args.cond_prior
         self.free_bits = args.kl_free_bits
+        self.register_buffer("log2", torch.tensor(2.0).log())
+    def forward(self, x: Tensor, parents: Tensor, beta: int = 1) -> Dict[str, Tensor]:
+        #print(f'Encoder Input:')
+        #print(type(x))
+        #print(x.shape)
         acts = self.encoder(x)
+        #print(type(acts))
+        #for key, i in acts.items():
+            #print(f'Encoder output key: {key}')
+            #print(type(i))
+            #print(i.shape)
+        #print('Parents')
+        #print(parents.shape)
         h, stats = self.decoder(parents=parents, x=acts)
+        #print('Decoder output shape: ', h.shape)
+        #print('Stats: ')
+        #for stat in stats:
+            #for key, i in stat.items():
+                #print(f'Key: {key}')
+                #print(type(i))
+                #print(i.shape)
         nll_pp = self.likelihood.nll(h, x)
         if self.free_bits > 0:
             free_bits = torch.tensor(self.free_bits).type_as(nll_pp)
                 ).sum()
         else:
             kl_pp = torch.zeros_like(nll_pp)
+            for _, stat in enumerate(stats):
                 kl_pp += stat["kl"].sum(dim=(1, 2, 3))
         kl_pp = kl_pp / np.prod(x.shape[1:])  # per pixel
+        kl_pp = kl_pp.mean()  # / self.log2
+        nll_pp = nll_pp.mean()  # / self.log2
+        nelbo = nll_pp + beta * kl_pp  # negative elbo (free energy)
+        return dict(elbo=nelbo, nll=nll_pp, kl=kl_pp)
+    def sample(
+        self, parents: Tensor, return_loc: bool = True, t: Optional[float] = None
+    ) -> Tuple[Tensor, Tensor]:
         h, _ = self.decoder(parents=parents, t=t)
         return self.likelihood.sample(h, return_loc, t=t)
+    def abduct(
+        self,
+        x: Tensor,
+        parents: Tensor,
+        cf_parents: Optional[Tensor] = None,
+        alpha: float = 0.5,
+        t: Optional[float] = None,
+    ) -> List[Tensor]:
         acts = self.encoder(x)
         _, q_stats = self.decoder(
             x=acts, parents=parents, abduct=True, t=t
                 # Option1: mixture distribution: r(z_i | z_{<i}, x, pa, pa*)
                 #   = a*q(z_i | z_{<i}, x, pa) + (1-a)*p(z_i | z_{<i}, pa*)
                 r_loc = alpha * q_loc + (1 - alpha) * p_loc
+                r_var = (
+                    alpha**2 * q_scale.pow(2) + (1 - alpha)**2 * p_var
+                )  # assumes independence
                 # r_var = a*(q_loc.pow(2) + q_var) + (1-a)*(p_loc.pow(2) + p_var) - r_loc.pow(2)
                 # # Option 2: precision weighted distribution
         else:
             return q_stats  # zs
+    def forward_latents(
+        self, latents: List[Tensor], parents: Tensor, t: Optional[float] = None
+    ) -> Tuple[Tensor, Tensor]:
         h, _ = self.decoder(latents=latents, parents=parents, t=t)
         return self.likelihood.sample(h, t=t)