Merge pull request #2 from hubertsiuzdak/v1.0

hubertsiuzdak · web-flow · commit ccfc8115fd30 · 2024-02-27T20:51:34.000+01:00
v1.0
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
-# [WIP] SNAC 🍿
+# SNAC 🍿
 
-Multi-**S**cale **N**eural **A**udio **C**odec (SNAC) compressess 44.1 kHz audio into discrete codes at a low bitrate.
+Multi-**S**cale **N**eural **A**udio **C**odec (SNAC) compresses audio into discrete codes at a low bitrate.
 
 ## Overview
 
@@ -14,6 +14,15 @@ consistent structure of an audio track for ~3 minutes.
 
 ![snac.png](img%2Fsnac.png)
 
+## Pretrained models
+
+| Model                                                                       | Bitrate  | Sample Rate | 
+|-----------------------------------------------------------------------------|----------|-------------|
+| [hubertsiuzdak/snac_32khz](https://huggingface.co/hubertsiuzdak/snac_32khz) | 1.9 kbps | 32 kHz      | 
+| [hubertsiuzdak/snac_44khz](https://huggingface.co/hubertsiuzdak/snac_44khz) | 2.6 kbps | 44 kHz      |
+
+These models were trained mostly on music. 
+
 ## Usage
 
 Install it using:
@@ -22,18 +31,14 @@ Install it using:
 pip install snac
 ```
 
-A pretrained model that compresses audio into discrete codes at a 2.2 kbps bitrate is available
-at [Hugging Face](https://huggingface.co/hubertsiuzdak/snac). It uses 4 RVQ levels with token rates of 12.5, 25, 50, and
-100 Hz.
-
 To encode (and reconstruct) audio with SNAC in Python, use the following code:
 
 ```python
 import torch
 from snac import SNAC
 
-model = SNAC.from_pretrained("hubertsiuzdak/snac").eval().cuda()
-audio = torch.randn(1, 1, 44100).cuda()  # B, 1, T
+model = SNAC.from_pretrained("hubertsiuzdak/snac_32khz").eval().cuda()
+audio = torch.randn(1, 1, 32000).cuda()  # B, 1, T
 
 with torch.inference_mode():
     audio_hat, _, codes, _, _ = model(audio)
@@ -44,7 +49,7 @@ resolution.
 
 ```
 >>> [code.shape[1] for code in codes]
-[13, 26, 52, 104]
+[12, 24, 48, 96]
 ```
 
 ## Acknowledgements
diff --git a/snac/__init__.py b/snac/__init__.py
@@ -1,3 +1,3 @@
 from .snac import SNAC
 
-__version__ = "0.1.0"
+__version__ = "1.0.0"
diff --git a/snac/attention.py b/snac/attention.py
@@ -0,0 +1,75 @@
+import torch
+from einops import rearrange
+from torch import nn
+
+
+class LocalMHA(nn.Module):
+    def __init__(self, dim=1024, window_size=32, dim_head=64, use_rotary_pos_emb=True):
+        super().__init__()
+        self.norm = nn.LayerNorm(dim)
+        self.heads = dim // dim_head
+        self.window_size = window_size
+        self.to_qkv = nn.Linear(dim, dim * 3, bias=False)
+        if use_rotary_pos_emb:
+            self.rel_pos = SinusoidalEmbeddings(dim_head, scale_base=window_size // 2)
+        else:
+            self.rel_pos = None
+        self.to_out = nn.Linear(dim, dim, bias=False)
+
+    def forward(self, x):
+        B, C, T = x.shape
+        residual = x
+        x = self.norm(x.transpose(1, 2))
+        windows = T // self.window_size
+        q, k, v = self.to_qkv(x).chunk(3, dim=-1)
+        q, k, v = map(lambda t: rearrange(t, "b (w n) (h d) -> b h w n d", w=windows, h=self.heads), (q, k, v))
+        if self.rel_pos is not None:
+            pos_emb, scale = self.rel_pos(k)
+            q, k = apply_rotary_pos_emb(q, k, pos_emb, scale)
+        out = torch.nn.functional.scaled_dot_product_attention(q, k, v)
+        out = rearrange(out, "b h w n d -> b (w n) (h d)")
+        out = self.to_out(out)
+        return out.transpose(1, 2) + residual
+
+
+class SinusoidalEmbeddings(nn.Module):
+    def __init__(self, dim, scale_base=None, use_xpos=False):
+        super().__init__()
+        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
+        self.register_buffer("inv_freq", inv_freq)
+        # xpos related
+        self.use_xpos = use_xpos
+        self.scale_base = scale_base
+        assert not (use_xpos and scale_base is None), "scale base must be defined if using xpos"
+        scale = (torch.arange(0, dim, 2) + 0.4 * dim) / (1.4 * dim)
+        self.register_buffer("scale", scale, persistent=False)
+
+    def forward(self, x):
+        seq_len, device = x.shape[-2], x.device
+        t = torch.arange(seq_len, device=x.device).type_as(self.inv_freq)
+        freqs = torch.einsum("i , j -> i j", t, self.inv_freq)
+        freqs = torch.cat((freqs, freqs), dim=-1)
+        if not self.use_xpos:
+            return freqs, torch.ones(1, device=device)
+        power = (t - (seq_len // 2)) / self.scale_base
+        scale = self.scale ** rearrange(power, "n -> n 1")
+        scale = torch.cat((scale, scale), dim=-1)
+
+        return freqs, scale
+
+
+def rotate_half(x):
+    x = rearrange(x, "b ... (r d) -> b ... r d", r=2)
+    x1, x2 = x.unbind(dim=-2)
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, freqs, scale=1):
+    q_len = q.shape[-2]
+    q_freqs = freqs[..., -q_len:, :]
+    inv_scale = scale**-1
+    if scale.ndim == 2:
+        scale = scale[-q_len:, :]
+    q = (q * q_freqs.cos() * scale) + (rotate_half(q) * q_freqs.sin() * scale)
+    k = (k * freqs.cos() * inv_scale) + (rotate_half(k) * freqs.sin() * inv_scale)
+    return q, k
diff --git a/snac/layers.py b/snac/layers.py
@@ -4,23 +4,27 @@
 import torch.nn as nn
 from torch.nn.utils.parametrizations import weight_norm
 
+from .attention import LocalMHA
+
 
 class Encoder(nn.Module):
     def __init__(
         self,
         d_model=64,
         strides=[3, 3, 7, 7],
         depthwise=False,
+        attn_window_size=32,
     ):
         super().__init__()
         layers = [WNConv1d(1, d_model, kernel_size=7, padding=3)]
         for stride in strides:
             d_model *= 2
             groups = d_model // 2 if depthwise else 1
             layers += [EncoderBlock(output_dim=d_model, stride=stride, groups=groups)]
+        if attn_window_size is not None:
+            layers += [LocalMHA(dim=d_model, window_size=attn_window_size)]
         groups = d_model if depthwise else 1
         layers += [
-            Snake1d(d_model),
             WNConv1d(d_model, d_model, kernel_size=7, padding=3, groups=groups),
         ]
         self.block = nn.Sequential(*layers)
@@ -37,18 +41,21 @@ def __init__(
         rates,
         noise=False,
         depthwise=False,
+        attn_window_size=32,
         d_out=1,
     ):
         super().__init__()
         if depthwise:
             layers = [
+                WNConv1d(input_channel, input_channel, kernel_size=7, padding=3, groups=input_channel),
                 WNConv1d(input_channel, channels, kernel_size=1),
-                Snake1d(channels),
-                WNConv1d(channels, channels, kernel_size=7, padding=3, groups=channels),
             ]
         else:
             layers = [WNConv1d(input_channel, channels, kernel_size=7, padding=3)]
 
+        if attn_window_size is not None:
+            layers += [LocalMHA(dim=channels, window_size=attn_window_size)]
+
         for i, stride in enumerate(rates):
             input_dim = channels // 2**i
             output_dim = channels // 2 ** (i + 1)
@@ -111,13 +118,14 @@ def forward(self, x):
 class NoiseBlock(nn.Module):
     def __init__(self, dim):
         super().__init__()
-        self.scale = nn.Parameter(torch.zeros(dim, 1))
+        self.linear = WNConv1d(dim, dim, kernel_size=1, bias=False)
 
     def forward(self, x):
         B, C, T = x.shape
         noise = torch.randn((B, 1, T), device=x.device, dtype=x.dtype)
-        noise_scaled = noise * self.scale
-        x = x + noise_scaled
+        h = self.linear(x)
+        n = noise * h
+        x = x + n
         return x
 
 
diff --git a/snac/snac.py b/snac/snac.py
@@ -12,18 +12,21 @@
 class SNAC(nn.Module):
     def __init__(
         self,
+        sampling_rate=44100,
         encoder_dim=64,
         encoder_rates=[3, 3, 7, 7],
         latent_dim=None,
         decoder_dim=1536,
         decoder_rates=[7, 7, 3, 3],
+        attn_window_size=32,
         codebook_size=4096,
         codebook_dim=8,
         vq_strides=[8, 4, 2, 1],
         noise=True,
         depthwise=True,
     ):
         super().__init__()
+        self.sampling_rate = sampling_rate
         self.encoder_dim = encoder_dim
         self.encoder_rates = encoder_rates
         self.decoder_dim = decoder_dim
@@ -37,6 +40,7 @@ def __init__(
         self.codebook_size = codebook_size
         self.codebook_dim = codebook_dim
         self.vq_strides = vq_strides
+        self.attn_window_size = attn_window_size
         self.quantizer = ResidualVectorQuantize(
             input_dim=latent_dim,
             codebook_size=codebook_size,
@@ -53,7 +57,7 @@ def __init__(
 
     def preprocess(self, audio_data):
         length = audio_data.shape[-1]
-        pad_to = self.hop_length * self.vq_strides[0]
+        pad_to = self.hop_length * self.attn_window_size
         right_pad = math.ceil(length / pad_to) * pad_to - length
         audio_data = nn.functional.pad(audio_data, (0, right_pad))
         return audio_data
@@ -76,6 +80,7 @@ def from_config(cls, config_path):
     @classmethod
     def from_pretrained(cls, repo_id, **kwargs):
         from huggingface_hub import hf_hub_download
+
         config_path = hf_hub_download(repo_id=repo_id, filename="config.json", **kwargs)
         model_path = hf_hub_download(repo_id=repo_id, filename="pytorch_model.bin", **kwargs)
         model = cls.from_config(config_path)

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,3 @@`
`1`	`1`	`from .snac import SNAC`
`2`	`2`
`3`		`-__version__ = "0.1.0"`
	`3`	`+__version__ = "1.0.0"`