Skip to content

Commit bd270db

Browse files
committed
Add BFU encode cache and use it in ATRAC3 bit allocation
Introduce a generic TEncCache/TUnit library (src/atrac/atrac_enc_cache.*) that memoizes per-unit quantization results keyed by a user-supplied function. Units are owned via unique_ptr in a flat vector pre-sized to a known key bound; the codec supplies both the unit factory (which also performs the computation, avoiding an extra virtual call on the hot path) and the key packing. Wire it into the ATRAC3 allocator: during the lambda binary search the same (bfu, wordlen) block is quantized repeatedly. TAt3SpecUnit now caches the mantissas plus the CLC/VLC spectrum costs and energy error, so each (bfu, wordlen) is computed once per channel. The cache is reset in TAlloc::Dump (once per channel) so no stale spectrum is reused. Output is bit-exact with the previous encoder; ~36% faster encode.
1 parent 01234b0 commit bd270db

4 files changed

Lines changed: 223 additions & 29 deletions

File tree

src/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ set(SOURCE_ATRACDENC_IMPL
120120
atrac/at1/atrac1_bitalloc.cpp
121121
atrac/at1/atrac1_dequantiser.cpp
122122
atrac/atrac_scale.cpp
123+
atrac/atrac_enc_cache.cpp
123124
atrac/atrac_psy_common.cpp
124125
oma.cpp
125126
rm.cpp

src/atrac/at3/atrac3_bitstream.cpp

Lines changed: 76 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "atrac3_bitstream.h"
2020
#include "qmf/qmf.h"
2121
#include <atrac/atrac_psy_common.h>
22+
#include <atrac/atrac_enc_cache.h>
2223
#include <bitstream/bitstream.h>
2324
#include <util.h>
2425
#include <env.h>
@@ -147,40 +148,82 @@ uint32_t VLCEnc(const uint32_t selector, const int mantissas[TAtrac3Data::MaxSpe
147148
return bitsUsed;
148149
}
149150

151+
// Cached per-BFU quantization result reused across the bit-allocation binary
152+
// search. For a fixed (channel, bfu, wordlen) within one frame the quantized
153+
// mantissas and their CLC/VLC costs are deterministic, so we compute them once.
154+
class TAt3SpecUnit : public TUnit {
155+
public:
156+
// TEncCache::TProvideUnit factory: build the unit and quantize `values`.
157+
static TUnit* Provide(size_t /*ch*/, size_t bfu, size_t wordlen, const float* values, void*) {
158+
auto* u = new TAt3SpecUnit();
159+
const uint32_t first = TAtrac3Data::BlockSizeTab[bfu];
160+
const uint32_t last = TAtrac3Data::BlockSizeTab[bfu + 1];
161+
const uint32_t blockSize = last - first;
162+
const float mul = TAtrac3Data::MaxQuant[std::min((uint32_t)wordlen, (uint32_t)7)];
163+
164+
u->Wordlen = wordlen;
165+
u->Multiplier = mul;
166+
u->Mantisas.resize(blockSize);
167+
// `ea` (extended/adaptive rounding) depends only on bfu, so it is
168+
// constant for a given cache key.
169+
u->EnergyErr = QuantMantisas(values, 0, blockSize, mul, bfu > LOSY_NAQ_START, u->Mantisas.data());
170+
u->ClcBits = CLCEnc(wordlen, u->Mantisas.data(), blockSize, nullptr);
171+
u->VlcBits = VLCEnc(wordlen, u->Mantisas.data(), blockSize, nullptr);
172+
return u;
173+
}
174+
175+
float EnergyErr = 0.0f;
176+
uint32_t ClcBits = 0; // CLC spectrum cost (no per-block header bits)
177+
uint32_t VlcBits = 0; // VLC spectrum cost (no per-block header bits)
178+
};
179+
180+
// atrac3 has only MS stereo and BFUs carry no channel identity, so the cache
181+
// (reset per channel) is keyed purely on <bfu, wordlen>; `ch` is unused.
182+
static size_t MakeAt3SpecKey(size_t /*ch*/, size_t bfu, size_t wordlen) {
183+
ASSERT(bfu < 32);
184+
ASSERT(wordlen < 8);
185+
return (bfu << 3) | wordlen;
186+
}
187+
// Upper bound on MakeAt3SpecKey(): bfu < 32, wordlen < 8.
188+
static constexpr size_t kAt3SpecCacheKeys = 1u << 8;
189+
150190
// Computes how many bits the spectrum of one channel needs for the given
// per-BFU word lengths and selects the cheaper coding mode.
//
// sce                    - channel element; only its ScaledBlocks are read.
// precisionPerEachBlocks - word length per BFU; 0 means the BFU is not coded.
// mantisas               - frame-global mantissa array, filled per coded BFU.
// energyErr              - per-BFU energy error, updated for coded BFUs only.
// cache                  - per-<bfu, wordlen> quantization cache.
//
// Returns <mode, bits>: mode is true when CLC is not more expensive than VLC,
// bits is the header cost plus the spectrum cost of the selected mode.
std::pair<uint8_t, uint32_t> CalcSpecsBitsConsumption(const TAtrac3BitStreamWriter::TSingleChannelElement& sce,
                                                      const vector<uint32_t>& precisionPerEachBlocks,
                                                      int* mantisas,
                                                      vector<float>& energyErr,
                                                      TEncCache& cache)
{
    const vector<TScaledBlock>& blocks = sce.ScaledBlocks;
    const uint32_t numBlocks = precisionPerEachBlocks.size();

    // Header bits do not depend on the coding mode: 3 bits for every BFU plus
    // 6 more (sfi) for each coded one. Only the spectrum cost differs, so
    // both candidates are summed from the cached units and compared once.
    uint32_t headerBits = numBlocks * 3;
    uint32_t clcTotal = 0;
    uint32_t vlcTotal = 0;

    for (uint32_t bfu = 0; bfu < numBlocks; ++bfu) {
        const uint32_t wordlen = precisionPerEachBlocks[bfu];
        if (wordlen != 0) {
            headerBits += 6; // sfi

            const uint32_t offset = TAtrac3Data::BlockSizeTab[bfu];
            const uint32_t count = TAtrac3Data::BlockSizeTab[bfu + 1] - offset;

            TUnit* const cached = cache.GetOrCompute(0, bfu, wordlen, blocks[bfu].Values.data());
            const TAt3SpecUnit& unit = *static_cast<TAt3SpecUnit*>(cached);

            // The unit keeps block-local mantissas; place them at the BFU's
            // offset in the frame-global array for the eventual
            // EncodeSpecs() dump.
            const int* const src = unit.GetMantisas().data();
            std::copy(src, src + count, mantisas + offset);

            energyErr[bfu] = unit.EnergyErr;
            clcTotal += unit.ClcBits;
            vlcTotal += unit.VlcBits;
        }
    }

    const bool useClc = clcTotal <= vlcTotal;
    const uint32_t specBits = useClc ? clcTotal : vlcTotal;
    return std::make_pair(useClc, headerBits + specBits);
}
185228

186229
static inline bool CheckBfus(uint16_t* numBfu, const vector<uint32_t>& precisionPerEachBlocks)
@@ -593,7 +636,8 @@ class TAlloc final : public IBitStreamPartEncoder {
593636
ctx->EnergyErr.assign(ctx->NumBfu, 0.0f);
594637
std::pair<uint8_t, uint32_t> consumption;
595638
do {
596-
consumption = CalcSpecsBitsConsumption(*ctx->Sce, tmpAlloc, ctx->Mantissas.data(), ctx->EnergyErr);
639+
consumption = CalcSpecsBitsConsumption(*ctx->Sce, tmpAlloc, ctx->Mantissas.data(),
640+
ctx->EnergyErr, SpecCache);
597641
} while (ConsiderEnergyErr(ctx->EnergyErr, tmpAlloc));
598642

599643
uint32_t totalBits = consumption.second + EncodeTonalComponents(*ctx->Sce, tmpAlloc, nullptr);
@@ -615,11 +659,13 @@ class TAlloc final : public IBitStreamPartEncoder {
615659
}
616660

617661
// Writes the spectrum selected for the pending context (if any) to `bs`,
// clears that context, and then empties the quantization cache.
void Dump(NBitStream::TBitStream& bs) override {
    TEncodeCtx* const pending = Ctx;
    if (pending != nullptr) {
        EncodeSpecs(*pending->Sce, &bs, pending->PrecisionPerBlock, pending->CodingMode,
                    pending->Mantissas.data());
        Ctx = nullptr;
    }

    // Cached quantization results belong to the channel/frame that was just
    // finished. Drop them unconditionally so the next channel that reuses
    // this part never sees a stale spectrum.
    SpecCache.Reset();
}
624670

625671
void Reset() noexcept override {
@@ -632,6 +678,7 @@ class TAlloc final : public IBitStreamPartEncoder {
632678

633679
private:
634680
TEncodeCtx* Ctx = nullptr;
681+
TEncCache SpecCache{kAt3SpecCacheKeys, &TAt3SpecUnit::Provide, &MakeAt3SpecKey};
635682
};
636683

637684
std::vector<IBitStreamPartEncoder::TPtr> CreateEncParts()

src/atrac/atrac_enc_cache.cpp

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
/*
2+
* This file is part of AtracDEnc.
3+
*
4+
* AtracDEnc is free software; you can redistribute it and/or
5+
* modify it under the terms of the GNU Lesser General Public
6+
* License as published by the Free Software Foundation; either
7+
* version 2.1 of the License, or (at your option) any later version.
8+
*
9+
* AtracDEnc is distributed in the hope that it will be useful,
10+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
11+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12+
* Lesser General Public License for more details.
13+
*
14+
* You should have received a copy of the GNU Lesser General Public
15+
* License along with AtracDEnc; if not, write to the Free Software
16+
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17+
*/
18+
19+
#include "atrac_enc_cache.h"
20+
21+
namespace NAtracDEnc {
22+
23+
TEncCache::TEncCache(size_t numKeys, TProvideUnit provideUnit, TMakeKey makeKey, void* opaque)
24+
: UnitBuffers(numKeys)
25+
, ProvideUnit(provideUnit)
26+
, MakeKey(makeKey)
27+
, Opaque(opaque)
28+
{
29+
}
30+
31+
TUnit* TEncCache::GetOrCompute(size_t ch, size_t bfu, size_t wordlen, const float* values)
32+
{
33+
const size_t key = MakeKey(ch, bfu, wordlen);
34+
35+
std::unique_ptr<TUnit>& slot = UnitBuffers[key];
36+
if (!slot) {
37+
slot.reset(ProvideUnit(ch, bfu, wordlen, values, Opaque));
38+
}
39+
40+
return slot.get();
41+
}
42+
43+
void TEncCache::Reset()
44+
{
45+
// Keep the vector sized; just drop the cached units for the next frame.
46+
for (std::unique_ptr<TUnit>& slot : UnitBuffers) {
47+
slot.reset();
48+
}
49+
}
50+
51+
} // namespace NAtracDEnc

src/atrac/atrac_enc_cache.h

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
#pragma once
2+
3+
/*
4+
* This file is part of AtracDEnc.
5+
*
6+
* AtracDEnc is free software; you can redistribute it and/or
7+
* modify it under the terms of the GNU Lesser General Public
8+
* License as published by the Free Software Foundation; either
9+
* version 2.1 of the License, or (at your option) any later version.
10+
*
11+
* AtracDEnc is distributed in the hope that it will be useful,
12+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
13+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14+
* Lesser General Public License for more details.
15+
*
16+
* You should have received a copy of the GNU Lesser General Public
17+
* License along with AtracDEnc; if not, write to the Free Software
18+
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19+
*/
20+
21+
#include <cstddef>
22+
#include <cstdint>
23+
#include <memory>
24+
#include <vector>
25+
26+
namespace NAtracDEnc {

// One cached encoding unit (BFU / quant unit), independent of the codec.
//
// A codec derives from TUnit, quantizes the scaled spectrum into Mantisas and
// fills in the bookkeeping fields needed to write the unit into the stream
// later. The computation is done by the ProvideUnit factory handed to
// TEncCache rather than by a virtual method, so the cached hot path carries
// no extra virtual dispatch. For a given key the result is produced once per
// cache lifetime and then reused across the bit-allocation search.
class TUnit {
public:
    virtual ~TUnit() = default;

    const std::vector<int>& GetMantisas() const
    {
        return Mantisas;
    }

    uint32_t GetWordlen() const
    {
        return Wordlen;
    }

    float GetMultiplier() const
    {
        return Multiplier;
    }

    uint16_t GetConsumedBits() const
    {
        return ConsumedBits;
    }

protected:
    // State required to write the unit into the stream after encoding.
    std::vector<int> Mantisas;
    uint32_t Wordlen = 0;
    float Multiplier = 1.0f;
    uint16_t ConsumedBits = 0; // Number of bits consumed by the quantized spectrum
};

// Memoizes per-unit encoding results for the bit-allocation search.
//
// Within a single frame the scaled spectrum of a given (ch, bfu, wordlen) is
// fixed, so its quantized mantissas and bit consumption are deterministic.
// The binary search asks for the same combinations over and over; the cache
// makes sure each one is computed only once.
//
// Because the key space is small and dense, units live in a vector indexed
// directly by a user-supplied key function instead of a std::map. An empty
// (nullptr) slot means "not computed yet".
class TEncCache {
public:
    // Factory called only on a cache miss: creates the appropriate TUnit
    // subclass for the key and quantizes `values` into it. The cache takes
    // ownership of the returned pointer. `opaque` is the user context given
    // to the constructor (e.g. scale tables / per-frame data).
    using TProvideUnit = TUnit* (*)(size_t ch, size_t bfu, size_t wordlen,
                                    const float* values, void* opaque);

    // Codec-specific packing of (ch, bfu, wordlen) into a dense vector index.
    using TMakeKey = size_t (*)(size_t ch, size_t bfu, size_t wordlen);

    // `numKeys` is the upper bound on the values MakeKey() may return; the
    // backing vector is sized to it exactly once. ProvideUnit and MakeKey
    // must agree on this bound.
    TEncCache(size_t numKeys, TProvideUnit provideUnit, TMakeKey makeKey, void* opaque = nullptr);

    TEncCache(const TEncCache&) = delete;
    TEncCache& operator=(const TEncCache&) = delete;

    // Looks up the unit for (ch, bfu, wordlen). The first request for a key
    // creates and computes it through ProvideUnit; later requests return the
    // stored unit.
    TUnit* GetOrCompute(size_t ch, size_t bfu, size_t wordlen, const float* values);

    // Discards every cached unit. Call at frame boundaries.
    void Reset();

private:
    std::vector<std::unique_ptr<TUnit>> UnitBuffers; // direct-indexed by MakeKey()
    TProvideUnit ProvideUnit;
    TMakeKey MakeKey;
    void* Opaque;
};

} // namespace NAtracDEnc

0 commit comments

Comments
 (0)