Add BFU encode cache and use it in ATRAC3 bit allocation

dcherednik · dcherednik · commit bd270db9f2ef · 2026-06-25T23:48:15.000+02:00
Introduce a generic TEncCache/TUnit library (src/atrac/atrac_enc_cache.*)
that memoizes per-unit quantization results keyed by a user-supplied
function. Units are owned via unique_ptr in a flat vector pre-sized to a
known key bound; the codec supplies both the unit factory (which also
performs the computation, avoiding an extra virtual call on the hot path)
and the key packing.

Wire it into the ATRAC3 allocator: during the lambda binary search the
same (bfu, wordlen) block is quantized repeatedly. TAt3SpecUnit now caches
the mantissas plus the CLC/VLC spectrum costs and energy error, so each
(bfu, wordlen) is computed once per channel. The cache is reset in
TAlloc::Dump (once per channel) so no stale spectrum is reused.

Output is bit-exact with the previous encoder; ~36% faster encode.
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -120,6 +120,7 @@ set(SOURCE_ATRACDENC_IMPL
     atrac/at1/atrac1_bitalloc.cpp
     atrac/at1/atrac1_dequantiser.cpp
     atrac/atrac_scale.cpp
+    atrac/atrac_enc_cache.cpp
     atrac/atrac_psy_common.cpp
     oma.cpp
     rm.cpp
diff --git a/src/atrac/at3/atrac3_bitstream.cpp b/src/atrac/at3/atrac3_bitstream.cpp
@@ -19,6 +19,7 @@
 #include "atrac3_bitstream.h"
 #include "qmf/qmf.h"
 #include <atrac/atrac_psy_common.h>
+#include <atrac/atrac_enc_cache.h>
 #include <bitstream/bitstream.h>
 #include <util.h>
 #include <env.h>
@@ -147,40 +148,82 @@ uint32_t VLCEnc(const uint32_t selector, const int mantissas[TAtrac3Data::MaxSpe
     return bitsUsed;
 }
 
+// Cached per-BFU quantization result reused across the bit-allocation binary
+// search. For a fixed (channel, bfu, wordlen) within one frame the quantized
+// mantissas and their CLC/VLC costs are deterministic, so we compute them once.
+class TAt3SpecUnit : public TUnit {
+public:
+    // TEncCache::TProvideUnit factory: build the unit and quantize `values`; the caller owns the result.
+    static TUnit* Provide(size_t /*ch*/, size_t bfu, size_t wordlen, const float* values, void*) {
+        // Held by a unique_ptr until release() so a throwing resize() cannot leak the unit.
+        std::unique_ptr<TAt3SpecUnit> u(new TAt3SpecUnit());
+        const uint32_t first = TAtrac3Data::BlockSizeTab[bfu];
+        const uint32_t last = TAtrac3Data::BlockSizeTab[bfu + 1];
+        const uint32_t blockSize = last - first;
+        const uint32_t wl = static_cast<uint32_t>(wordlen); // one explicit narrowing for the uint32_t users below
+        const float mul = TAtrac3Data::MaxQuant[std::min<uint32_t>(wl, 7)];
+        u->Wordlen = wl;
+        u->Multiplier = mul;
+        u->Mantisas.resize(blockSize);
+        // `ea` (extended/adaptive rounding) depends only on bfu, so it is constant for a given cache key.
+        u->EnergyErr = QuantMantisas(values, 0, blockSize, mul, bfu > LOSY_NAQ_START, u->Mantisas.data());
+        u->ClcBits = CLCEnc(wl, u->Mantisas.data(), blockSize, nullptr);
+        u->VlcBits = VLCEnc(wl, u->Mantisas.data(), blockSize, nullptr);
+        return u.release();
+    }
+
+    float EnergyErr = 0.0f; // value returned by QuantMantisas() for this block
+    uint32_t ClcBits = 0; // CLC spectrum cost (no per-block header bits)
+    uint32_t VlcBits = 0; // VLC spectrum cost (no per-block header bits)
+};
+
+// atrac3 has only MS stereo and BFUs carry no channel identity, so the cache
+// (reset per channel) is keyed purely on <bfu, wordlen>; `ch` is unused.
+static size_t MakeAt3SpecKey(size_t /*ch*/, size_t bfu, size_t wordlen) {
+    ASSERT(bfu < 32);
+    ASSERT(wordlen < 8);
+    return (bfu << 3) | wordlen;
+}
+// Upper bound on MakeAt3SpecKey(): bfu < 32, wordlen < 8.
+static constexpr size_t kAt3SpecCacheKeys = 1u << 8;
+
 std::pair<uint8_t, uint32_t> CalcSpecsBitsConsumption(const TAtrac3BitStreamWriter::TSingleChannelElement& sce,
                                                       const vector<uint32_t>& precisionPerEachBlocks,
                                                       int* mantisas,
-                                                      vector<float>& energyErr)
+                                                      vector<float>& energyErr,
+                                                      TEncCache& cache)
 {
     const vector<TScaledBlock>& scaledBlocks = sce.ScaledBlocks;
     const uint32_t numBlocks = precisionPerEachBlocks.size();
     uint32_t bitsUsed = numBlocks * 3;
 
-    auto lambda = [numBlocks, mantisas, &precisionPerEachBlocks, &scaledBlocks, &energyErr](bool clcMode, bool calcMant) {
-        uint32_t bits = 0;
-        for (uint32_t i = 0; i < numBlocks; ++i) {
-            if (precisionPerEachBlocks[i] == 0) {
-                continue;
-            }
-            bits += 6; // sfi
-            const uint32_t first = TAtrac3Data::BlockSizeTab[i];
-            const uint32_t last = TAtrac3Data::BlockSizeTab[i + 1];
-            const uint32_t blockSize = last - first;
-            const float mul = TAtrac3Data::MaxQuant[std::min(precisionPerEachBlocks[i], (uint32_t)7)];
-            if (calcMant) {
-                const float* values = scaledBlocks[i].Values.data();
-                energyErr[i] = QuantMantisas(values, first, last, mul, i > LOSY_NAQ_START, mantisas);
-            }
-            bits += clcMode ? CLCEnc(precisionPerEachBlocks[i], mantisas + first, blockSize, nullptr)
-                            : VLCEnc(precisionPerEachBlocks[i], mantisas + first, blockSize, nullptr);
+    // The 6 sfi header bits of a coded block cost the same in either coding mode,
+    // so they go straight into bitsUsed. Only the spectrum cost is mode-dependent:
+    // sum the cached CLC and VLC costs separately and pick the cheaper mode at the end.
+    uint32_t clcSpecBits = 0;
+    uint32_t vlcSpecBits = 0;
+    for (uint32_t i = 0; i < numBlocks; ++i) {
+        if (precisionPerEachBlocks[i] == 0) {
+            continue;
         }
-        return bits;
-    };
+        bitsUsed += 6; // sfi
+        const uint32_t first = TAtrac3Data::BlockSizeTab[i];
+        const uint32_t last = TAtrac3Data::BlockSizeTab[i + 1];
+        const uint32_t blockSize = last - first;
+
+        auto* unit = static_cast<TAt3SpecUnit*>(
+            cache.GetOrCompute(0, i, precisionPerEachBlocks[i], scaledBlocks[i].Values.data())); // ch fixed to 0: the ATRAC3 key ignores it
 
-    const uint32_t clcBits = lambda(true, true);
-    const uint32_t vlcBits = lambda(false, false);
-    const bool mode = clcBits <= vlcBits;
-    return std::make_pair(mode, bitsUsed + (mode ? clcBits : vlcBits));
+        // Copy the cached block-local mantissas into the frame-global array on every
+        // call (hit or miss) so the array later given to EncodeSpecs() matches this allocation.
+        std::copy_n(unit->GetMantisas().data(), blockSize, mantisas + first);
+        energyErr[i] = unit->EnergyErr;
+        clcSpecBits += unit->ClcBits;
+        vlcSpecBits += unit->VlcBits;
+    }
+
+    const bool mode = clcSpecBits <= vlcSpecBits; // true selects the CLC cost; ties go to CLC
+    return std::make_pair(mode, bitsUsed + (mode ? clcSpecBits : vlcSpecBits));
 }
 
 static inline bool CheckBfus(uint16_t* numBfu, const vector<uint32_t>& precisionPerEachBlocks)
@@ -593,7 +636,8 @@ class TAlloc final : public IBitStreamPartEncoder {
         ctx->EnergyErr.assign(ctx->NumBfu, 0.0f);
         std::pair<uint8_t, uint32_t> consumption;
         do {
-            consumption = CalcSpecsBitsConsumption(*ctx->Sce, tmpAlloc, ctx->Mantissas.data(), ctx->EnergyErr);
+            consumption = CalcSpecsBitsConsumption(*ctx->Sce, tmpAlloc, ctx->Mantissas.data(),
+                                                   ctx->EnergyErr, SpecCache);
         } while (ConsiderEnergyErr(ctx->EnergyErr, tmpAlloc));
 
         uint32_t totalBits = consumption.second + EncodeTonalComponents(*ctx->Sce, tmpAlloc, nullptr);
@@ -615,11 +659,13 @@ class TAlloc final : public IBitStreamPartEncoder {
     }
 
     void Dump(NBitStream::TBitStream& bs) override {
-        if (!Ctx) {
-            return;
+        if (Ctx) {
+            EncodeSpecs(*Ctx->Sce, &bs, Ctx->PrecisionPerBlock, Ctx->CodingMode, Ctx->Mantissas.data());
+            Ctx = nullptr;
         }
-        EncodeSpecs(*Ctx->Sce, &bs, Ctx->PrecisionPerBlock, Ctx->CodingMode, Ctx->Mantissas.data());
-        Ctx = nullptr;
+        // Cached quantization results are only valid for the channel/frame just finished;
+        // drop them unconditionally (even when there was no Ctx) before this part is reused.
+        SpecCache.Reset();
     }
 
     void Reset() noexcept override {
@@ -632,6 +678,7 @@ class TAlloc final : public IBitStreamPartEncoder {
 
 private:
     TEncodeCtx* Ctx = nullptr;
+    TEncCache SpecCache{kAt3SpecCacheKeys, &TAt3SpecUnit::Provide, &MakeAt3SpecKey};
 };
 
 std::vector<IBitStreamPartEncoder::TPtr> CreateEncParts()
diff --git a/src/atrac/atrac_enc_cache.cpp b/src/atrac/atrac_enc_cache.cpp
@@ -0,0 +1,51 @@
+/*
+ * This file is part of AtracDEnc.
+ *
+ * AtracDEnc is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * AtracDEnc is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with AtracDEnc; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "atrac_enc_cache.h"
+
+namespace NAtracDEnc {
+
+TEncCache::TEncCache(size_t numKeys, TProvideUnit provideUnit, TMakeKey makeKey, void* opaque)
+    : UnitBuffers(numKeys) // numKeys empty (null) slots, indexed directly by MakeKey()
+    , ProvideUnit(provideUnit) // factory invoked by GetOrCompute() on a miss
+    , MakeKey(makeKey) // maps (ch, bfu, wordlen) to a slot index
+    , Opaque(opaque) // handed back unchanged to ProvideUnit on every miss
+{
+}
+
+// Cached unit for MakeKey(ch, bfu, wordlen); built via ProvideUnit on a miss.
+TUnit* TEncCache::GetOrCompute(size_t ch, size_t bfu, size_t wordlen, const float* values)
+{
+    // at() instead of operator[]: the key comes from a user-supplied function, so a
+    // key >= UnitBuffers.size() throws std::out_of_range rather than corrupting memory.
+    std::unique_ptr<TUnit>& slot = UnitBuffers.at(MakeKey(ch, bfu, wordlen));
+    if (!slot) {
+        slot.reset(ProvideUnit(ch, bfu, wordlen, values, Opaque));
+    }
+    return slot.get();
+}
+
+// Empties every slot; UnitBuffers itself keeps the size it was constructed with.
+void TEncCache::Reset()
+{
+    for (auto& cached : UnitBuffers) {
+        cached = nullptr; // destroys the owned unit, if any
+    }
+}
+
+} // namespace NAtracDEnc
diff --git a/src/atrac/atrac_enc_cache.h b/src/atrac/atrac_enc_cache.h
@@ -0,0 +1,95 @@
+#pragma once
+
+/*
+ * This file is part of AtracDEnc.
+ *
+ * AtracDEnc is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * AtracDEnc is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with AtracDEnc; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+namespace NAtracDEnc {
+
+// Codec-agnostic base for a single cached encoding unit (BFU / quant unit).
+//
+// A codec subclasses TUnit and quantizes the scaled spectrum into Mantisas,
+// filling the bookkeeping fields needed to later write the unit into the
+// stream. The actual computation lives in the user-supplied ProvideUnit
+// factory (see TEncCache) so the cached hot path carries no extra virtual
+// dispatch. The result is produced once per key between TEncCache::Reset()
+// calls and then reused across the bit-allocation search.
+class TUnit {
+public:
+    virtual ~TUnit() = default; // units are destroyed through the TUnit pointers held by TEncCache
+
+    const std::vector<int>& GetMantisas() const { return Mantisas; } // quantized values of this unit
+    uint32_t GetWordlen() const { return Wordlen; }
+    float GetMultiplier() const { return Multiplier; }
+    uint16_t GetConsumedBits() const { return ConsumedBits; }
+
+protected:
+    // Set by the codec subclass; info needed to write the unit into the stream after encoding.
+    std::vector<int> Mantisas;
+    uint32_t Wordlen = 0;
+    float Multiplier = 1.0f;
+    uint16_t ConsumedBits = 0; // Bits consumed by the quantized spectrum; stays 0 unless the subclass sets it
+};
+
+// Caches per-unit encoding results during the bit-allocation search.
+//
+// Within a single frame the scaled spectrum of a given (ch, bfu, wordlen)
+// is fixed, so its quantized mantissas and bit consumption are deterministic.
+// The binary search requests the same combinations repeatedly; this cache
+// computes each one once.
+//
+// The key space is small and dense, so units are stored in a vector that is
+// directly indexed by a user-supplied key function (nullptr slot == not yet
+// computed) rather than in a std::map.
+class TEncCache {
+public:
+    // Build the right TUnit subclass for this key and quantize `values` into
+    // it. Invoked only on a cache miss; the cache takes ownership of the returned
+    // pointer. `opaque` is the constructor's user context (e.g. scale tables).
+    using TProvideUnit = TUnit* (*)(size_t ch, size_t bfu, size_t wordlen,
+                                    const float* values, void* opaque);
+
+    // Pack (ch, bfu, wordlen) into a dense vector index. Codec specific.
+    using TMakeKey = size_t (*)(size_t ch, size_t bfu, size_t wordlen);
+
+    // `numKeys` is the exclusive upper bound on MakeKey() values (every key must
+    // be < numKeys); the backing vector is sized once to it.
+    TEncCache(size_t numKeys, TProvideUnit provideUnit, TMakeKey makeKey, void* opaque = nullptr);
+
+    TEncCache(const TEncCache&) = delete;
+    TEncCache& operator=(const TEncCache&) = delete;
+
+    // Return the cached unit for (ch, bfu, wordlen), building it via ProvideUnit on the
+    // first request. `values` is read only on a miss. The cache keeps ownership of the result.
+    TUnit* GetOrCompute(size_t ch, size_t bfu, size_t wordlen, const float* values);
+
+    // Drop all cached units (pointers returned earlier become invalid). Call at frame boundaries.
+    void Reset();
+
+private:
+    std::vector<std::unique_ptr<TUnit>> UnitBuffers; // direct-indexed by MakeKey(); null slot == not computed yet
+    TProvideUnit ProvideUnit;
+    TMakeKey MakeKey;
+    void* Opaque;
+};
+
+} // namespace NAtracDEnc