HKUSTGZ-MICS-LYU
diff --git a/MiCoCodeGen.py b/MiCoCodeGen.py
Lines changed: 6 additions & 6 deletions
diff --git a/MiCoEval.py b/MiCoEval.py
Lines changed: 7 additions & 4 deletions
diff --git a/MiCoModel.py b/MiCoModel.py
Lines changed: 19 additions & 6 deletions
diff --git a/MiCoQLayers.py b/MiCoQLayers.py
Lines changed: 42 additions & 15 deletions
diff --git a/examples/attention_quant_test.py b/examples/attention_quant_test.py
Lines changed: 2 additions & 2 deletions
diff --git a/examples/mpq_gen.py b/examples/mpq_gen.py
Lines changed: 9 additions & 0 deletions
@@ -54,7 +54,7 @@ class MiCoCodeGen(torch.fx.Interpreter):
 extern size_t model_weight_end[];
 
 // Profiler Timer
-extern long QMATMUL_TIMER, QUANT_TIMER, IM2COL_TIMER;
+extern unsigned long long QMATMUL_TIMER, QUANT_TIMER, IM2COL_TIMER;
 
 typedef struct {{
 {model_struct}
@@ -352,20 +352,20 @@ def _format_benchmark_forward(self, indent: str):
         groups = self.get_benchmark_call_groups()
         total_occurrences = sum(group["count"] for group in groups)
         lines = [
-            f"{indent}long benchmark_total_time = 0;",
+            f"{indent}unsigned long long benchmark_total_time = 0;",
             f"{indent}printf(\"Benchmark Mode: %d unique kernels, %d total occurrences\\n\", {len(groups)}, {total_occurrences});",
         ]
         for idx, group in enumerate(groups):
             escaped_name = group["function_name"].replace("\\", "\\\\").replace('"', '\\"')
             lines += [
-                f"{indent}long benchmark_kernel_time_{idx} = MiCo_time();",
+                f"{indent}unsigned long long benchmark_kernel_time_{idx} = MiCo_time();",
                 f"{indent}{group['call']}",
                 f"{indent}benchmark_kernel_time_{idx} = MiCo_time() - benchmark_kernel_time_{idx};",
-                f"{indent}long benchmark_kernel_estimate_{idx} = benchmark_kernel_time_{idx} * {group['count']};",
+                f"{indent}unsigned long long benchmark_kernel_estimate_{idx} = benchmark_kernel_time_{idx} * {group['count']};",
                 f"{indent}benchmark_total_time += benchmark_kernel_estimate_{idx};",
-                f"{indent}printf(\"Benchmark Kernel {idx}: {escaped_name} occurrences={group['count']} time=%ld estimated=%ld\\n\", benchmark_kernel_time_{idx}, benchmark_kernel_estimate_{idx});",
+                f"{indent}printf(\"Benchmark Kernel {idx}: {escaped_name} occurrences={group['count']} time=%llu estimated=%llu\\n\", benchmark_kernel_time_{idx}, benchmark_kernel_estimate_{idx});",
             ]
-        lines.append(f"{indent}printf(\"Estimated Execution Time: %ld\\n\", benchmark_total_time);")
+        lines.append(f"{indent}printf(\"Estimated Execution Time: %llu\\n\", benchmark_total_time);")
         return lines
 
     def _extract_input_names(self, n: torch.fx.node.Node) -> List[str]:
 
@@ -42,7 +42,7 @@ def __init__(self, model: MiCoModel | str,
                  lr=0.0001, model_name = "", 
                  objective='ptq_acc',
                  constraint='bops',
-                 linear_group_size = 1,
+                 linear_group_size = 0,
                  output_json='output/json/mico_eval.json') -> None:
 
         self.model = model
@@ -68,8 +68,10 @@ def __init__(self, model: MiCoModel | str,
 
         self.set_eval(objective)
         self.set_constraint(constraint)
-        
-        self.fp_acc = self.eval_fp()
+
+        self.fp_res = self.eval_fp()  # single FP test pass; the two metrics below reuse this dict
+        self.fp_acc = self.fp_res['TestAcc']
+        self.fp_loss = self.fp_res['TestLoss']
         # Initial Conversion and Test
         res = self.eval_f([8]*self.n_layers*2)
         self.baseline_acc = res
@@ -96,6 +98,7 @@ def __init__(self, model: MiCoModel | str,
         print("Total Params: ", np.sum(self.layer_params))
         print("INT8 Model Accuracy: ", res)
         print("FP Model Accuracy: ", self.fp_acc)
+        print("FP Model Loss:", self.fp_loss)
         return
 
     def get_layer_info(self):
@@ -325,7 +328,7 @@ def set_mico_target(self, mico_type: str):
 
     def eval_fp(self):
         self.model.unset_qscheme()
-        return self.model.test(self.test_loader)['TestAcc']
+        return self.model.test(self.test_loader)  # whole result of model.test(); callers index 'TestAcc' / 'TestLoss'
 
     def eval_ptq_loss(self, scheme: list):
         wq = scheme[:self.n_layers]
 
@@ -37,7 +37,10 @@ def get_attn_qlayers(self):
     def n_attn_layers(self):
         return len(self.get_attn_qlayers())
 
-    def set_qscheme(self, qscheme, qat=False, device=device, group_size = 1, use_bias = True, use_norm = False):
+    def set_qscheme(self, qscheme, qat=False, device=None, group_size = 1, use_bias = True, use_norm = False):
+        if device is None:
+            device = self.current_device()
+        self.to(device)
         replace_quantize_layers(self, qscheme[0], qscheme[1], 
                                 quant_aware=qat, group_size=group_size,
                                 device=device, use_bias=use_bias, use_norm=use_norm)
@@ -55,22 +58,31 @@ def set_attn_qscheme(self, attn_qscheme, qat=False, **kwargs):
             set_to_qforward(self)
         return
 
-    def set_qscheme_torchao(self, qscheme,device=device):
+    def set_qscheme_torchao(self, qscheme, device=None):  # device=None: resolved from the model at call time
+        if device is None:
+            device = self.current_device()
+        self.to(device)  # runs before the layers are replaced on the same device
         unset_qforward(self)
         replace_quantize_layers_torchao(self, qscheme[0], qscheme[1], device=device)
         return
 
     def torchao_autoquant(self, example_input: torch.Tensor):
         raise NotImplementedError("autoquant was removed in torchao 0.17.0. Use set_qscheme_torchao() instead.")
 
+    def current_device(self):  # device of the first parameter/buffer, else the module-level `device`
+        for tensor in list(self.parameters()) + list(self.buffers()):
+            return tensor.device  # returns on the first tensor; parameters are checked before buffers
+        return device  # reached only when there are no parameters and no buffers
+
     def test(self, test_loader):
         self.eval()
+        run_device = self.current_device()
         criterion = torch.nn.CrossEntropyLoss()
         test_loss = []
         test_total, test_correct = 0, 0
         with torch.no_grad():
             for i, (images, labels) in enumerate(test_loader):
-                x, y = images.to(device), labels.to(device)
+                x, y = images.to(run_device), labels.to(run_device)
                 output = self.forward(x)
                 _, predicted = torch.max(output.data, 1)
                 loss = criterion(output, y)
@@ -86,6 +98,7 @@ def train_loop(self, n_epoch, train_loader, test_loader, verbose = False,
                    warmup_epochs = 0, warmup_lr = 1e-6):
         optimizer = torch.optim.Adam(self.parameters(), lr=lr)
         criterion = torch.nn.CrossEntropyLoss()
+        run_device = self.current_device()
         warmup_epochs = max(0, int(warmup_epochs))
         warmup_epochs = min(warmup_epochs, max(0, n_epoch - 1))
         use_warmup = scheduler == "cosine" and warmup_epochs > 0
@@ -118,14 +131,14 @@ def train_loop(self, n_epoch, train_loader, test_loader, verbose = False,
 
             train_loss = []
             train_total, train_correct = 0, 0
-            loss = torch.tensor(np.inf)
+            loss = torch.tensor(np.inf, device=run_device)
             # Training
             self.train()
             loop = tqdm(enumerate(train_loader), total=len(train_loader), 
                         disable=not verbose)
 
             for i, (images, labels) in loop:
-                x, y = images.to(device), labels.to(device)
+                x, y = images.to(run_device), labels.to(run_device)
                 optimizer.zero_grad()
                 output = self(x)
                 _, predicted = torch.max(output.data, 1)
@@ -147,7 +160,7 @@ def train_loop(self, n_epoch, train_loader, test_loader, verbose = False,
             test_total, test_correct = 0, 0
             with torch.no_grad():
                 for i, (images, labels) in enumerate(test_loader):
-                    x, y = images.to(device), labels.to(device)
+                    x, y = images.to(run_device), labels.to(run_device)
                     output = self.forward(x)
                     _, predicted = torch.max(output.data, 1)
                     loss = criterion(output, y)
 
@@ -41,6 +41,7 @@ def activation_nquant_2d(x: torch.Tensor, qbit = 8):
     if qbit == 1:
         x_absmean = torch.mean(x.abs(), dim=(-2,-1), keepdim=True)
         y = x.sign() * x_absmean
+        y = torch.where(y == 0.0, -x_absmean, y)
     elif qbit < 2: # Ternary quantization
         x_absmean = torch.mean(x.abs(), dim=(-2,-1), keepdim=True)
         scale = 1.0 / x_absmean.clamp_(min=1e-5)
@@ -140,7 +141,7 @@ def weight_quantnb(w: torch.Tensor, qbit = 8, mode = "max"):
 def weight_quantnb_group(w: torch.Tensor, qbit: int = 8, mode: str = "max",
                          dim: int = -1, group_size: int = 32, return_expanded: bool = True):
     """
-    Group-wise symmetric quantization for qbit >= 2.
+    Group-wise weight quantization for qbit >= 1.
     - dim: dimension to group over
     - group_size: number of contiguous elements per group along 'dim'
     - mode: "max" (per-group max) for qbit > 2, otherwise "mean"
@@ -149,7 +150,7 @@ def weight_quantnb_group(w: torch.Tensor, qbit: int = 8, mode: str = "max",
       u: integer-quantized weights (same shape as w)
       inv_scale: inverse scale tensor (broadcastable to w if return_expanded=True)
     """
-    assert qbit > 1, "qbit should be larger than 1"
+    assert qbit >= 1, "qbit should be larger than or equal to 1"
     assert isinstance(group_size, int) and group_size > 0, "group_size must be a positive int"
 
     # Normalize dim to positive index
@@ -166,16 +167,25 @@ def weight_quantnb_group(w: torch.Tensor, qbit: int = 8, mode: str = "max",
     x = w.reshape(new_shape)
 
     reduce_dim = dim + 1  # the 'group_size' axis
-    if (mode == "max") and (qbit > 2):
+    if qbit == 1:
+        denom = x.abs().mean(dim=reduce_dim, keepdim=True)
+        scale = 1.0 / denom.clamp(min=1e-5)
+        u_group = x.sign()
+    elif 1 < qbit < 2:
+        denom = x.abs().mean(dim=reduce_dim, keepdim=True)
+        scale = 1.0 / denom.clamp(min=1e-5)
+        u_group = (x * scale).round().clamp_(-1, 1)
+    elif (mode == "max") and (qbit > 2):
         denom = x.abs().amax(dim=reduce_dim, keepdim=True)
+        scale = (2**(qbit - 1) - 1) / denom.clamp(min=1e-5)
+        u_group = (x * scale).round().clamp_(-(2**(qbit - 1)), 2**(qbit - 1) - 1)
     elif (mode == "mean") or (qbit <= 2):
         denom = x.abs().mean(dim=reduce_dim, keepdim=True)
+        scale = (2**(qbit - 1) - 1) / denom.clamp(min=1e-5)
+        u_group = (x * scale).round().clamp_(-(2**(qbit - 1)), 2**(qbit - 1) - 1)
     else:
         raise ValueError("Invalid mode")
 
-    scale = (2**(qbit - 1) - 1) / denom.clamp(min=1e-5)
-
-    u_group = (x * scale).round().clamp_(-(2**(qbit - 1)), 2**(qbit - 1) - 1)
     u = u_group.reshape_as(w)
 
     inv_scale_group = 1.0 / scale
@@ -264,6 +274,19 @@ def weight_quant(self, w: torch.Tensor):
 
     def save_qweight(self):
         self.qw, self.qw_scale = self._weight_quant_impl(self.weight.data)
+
+    def qweight(self):  # returns qw * qw_scale, with both tensors on self.weight's device
+        if self.qw is None or self.qw_scale is None:
+            self.save_qweight()  # fills self.qw and self.qw_scale from the current weight
+        if self.qw.device != self.weight.device:
+            self.qw = self.qw.to(self.weight.device)  # NOTE(review): presumably qw is not moved with the module - confirm
+        if self.qw_scale.device != self.weight.device:
+            self.qw_scale = self.qw_scale.to(self.weight.device)
+        return self.qw * self.qw_scale
+
+    def ste_weight_quant(self):  # straight-through estimator applied to self.weight
+        w = self.weight
+        return w + (self.weight_quant(w) - w).detach()  # value is weight_quant(w); the detached term carries no gradient
 
     def export_qweight(self):
         return {
@@ -315,14 +338,16 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             x_norm = SimpleRMSNorm(self.in_features)(x) if self.use_norm else x
             x_norm = HadamardTransform()(x) if self.haramard else x_norm
             x_quant = x_norm + (self.act_quant(x_norm) - x_norm).detach()
-            w_quant = w + (self.weight_quant(w) - w).detach()
+            w_quant = self.ste_weight_quant()
             y = F.linear(x_quant, w_quant, bias=self.bias)
             return y
         elif self.qforward is True:
             # Forward with Post Training Quantization (PTQ)
             # Only for inference
-            qx = self.act_quant(x)
-            y = F.linear(qx, self.qw * self.qw_scale, bias=self.bias)
+            x_norm = SimpleRMSNorm(self.in_features)(x) if self.use_norm else x
+            x_norm = HadamardTransform()(x) if self.haramard else x_norm
+            qx = self.act_quant(x_norm)
+            y = F.linear(qx, self.qweight(), bias=self.bias)
             return y
         else:
             return F.linear(x, w, bias=self.bias)
@@ -387,15 +412,16 @@ def forward(self, x: torch.Tensor):
             # Using Straight-Through-Estimator (STE) 
             x_norm = self.rmsnorm(x) if self.use_norm else x
             x_quant = x_norm + (activation_nquant_2d(x_norm, self.act_q) - x_norm).detach()
-            w_quant = w + (self.weight_quant(w) - w).detach()
+            w_quant = self.ste_weight_quant()
             y = F.conv2d(x_quant, w_quant, self.bias, self.stride, self.padding, 
                         self.dilation, self.groups)
             return y
         elif self.qforward:
             # Forward with Post Training Quantization (PTQ)
             # Only for inference
-            qx = activation_nquant_2d(x, self.act_q)
-            y = F.conv2d(qx, self.qw * self.qw_scale, self.bias, self.stride, self.padding, 
+            x_norm = self.rmsnorm(x) if self.use_norm else x
+            qx = activation_nquant_2d(x_norm, self.act_q)
+            y = F.conv2d(qx, self.qweight(), self.bias, self.stride, self.padding, 
                         self.dilation, self.groups)
             return y
         else:
@@ -449,12 +475,13 @@ def forward(self, x: torch.Tensor):
         if self.qat:
             x_norm = self.rmsnorm(x) if self.use_norm else x
             x_quant = x_norm + (activation_nquant(x_norm, self.act_q) - x_norm).detach()
-            w_quant = w + (self.weight_quant(w) - w).detach()
+            w_quant = self.ste_weight_quant()
             return F.conv1d(x_quant, w_quant, self.bias, self.stride,
                             self.padding, self.dilation, self.groups)
         elif self.qforward is True:
-            qx = activation_nquant(x, self.act_q)
-            return F.conv1d(qx, self.qw * self.qw_scale, self.bias,
+            x_norm = self.rmsnorm(x) if self.use_norm else x
+            qx = activation_nquant(x_norm, self.act_q)
+            return F.conv1d(qx, self.qweight(), self.bias,
                             self.stride, self.padding, self.dilation, self.groups)
         else:
             return F.conv1d(x, w, self.bias, self.stride,
 
@@ -24,8 +24,8 @@ def parse_args():
         description="Evaluate one model with multiple attention quantization modes."
     )
     parser.add_argument("model_name", type=str)
-    parser.add_argument("--weight-q", type=int, default=8)
-    parser.add_argument("--act-q", type=int, default=8)
+    parser.add_argument("--weight-q", type=float, default=8)
+    parser.add_argument("--act-q", type=float, default=8)
     parser.add_argument(
         "--quant",
         nargs="+",
 
@@ -87,6 +87,12 @@ def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("model_name", type=str, nargs="?")
     parser.add_argument("--batch-size", type=int, default=1)
+    parser.add_argument(
+        "--num-workers",
+        type=int,
+        default=None,
+        help="Override model_zoo DataLoader worker count. Use 0 in restricted sandboxes.",
+    )
     parser.add_argument("--list-models", action="store_true")
 
     parser.add_argument("--ckpt", type=str, default=None)
@@ -140,6 +146,9 @@ def main():
     if args.fuse and args.fuse_seq:
         raise ValueError("Please use only one of --fuse or --fuse-seq.")
 
+    if args.num_workers is not None:
+        model_zoo.NUM_WORKERS = args.num_workers
+
     model, _, test_loader = model_zoo.from_zoo(
         args.model_name, shuffle=False, batch_size=args.batch_size
     )