Improve TPU compatibility

ikergarcia1996 · ikergarcia1996 · commit 4515317448bf · 2022-03-27T23:09:22.000+02:00
diff --git a/model.py b/model.py
@@ -142,7 +142,7 @@ def forward(
         :param torch.tensor target: Target values [batch_size]
         :return: Loss [1] if reduction is "mean" else [9]
         """
-        return self.CrossEntropyLoss(predicted.view(-1, 9), target.view(-1).long())
+        return self.CrossEntropyLoss(predicted, target)
 
 
 class CrossEntropyLossImageReorder(torch.nn.Module):
@@ -994,6 +994,7 @@ def __init__(
         learning_rate: float = 1e-5,
         weight_decay: float = 1e-3,
         label_smoothing: float = 0.0,
+        accelerator: str = None,
     ):
         """
         INIT
@@ -1052,6 +1053,7 @@ def __init__(
         self.learning_rate = learning_rate
         self.weight_decay = weight_decay
         self.label_smoothing = label_smoothing
+        self.accelerator = accelerator
 
         if self.encoder_type == "transformer":
             self.model = TEDD1104Transformer(
@@ -1190,21 +1192,31 @@ def training_step(self, batch, batch_idx):
         preds = self.model(x)
         loss = self.criterion(preds, y)
         self.total_batches += 1
-        self.running_loss += loss.item()
-        self.log("Train/loss", loss, sync_dist=True)
-        self.log(
-            "Train/running_loss", self.running_loss / self.total_batches, sync_dist=True
-        )
+        if self.accelerator != "tpu":
+            self.running_loss += loss.item()
+            self.log("Train/loss", loss, sync_dist=True)
+            self.log(
+                "Train/running_loss",
+                self.running_loss / self.total_batches,
+                sync_dist=True,
+            )
+        else:
+            if self.total_batches % 200 == 0:
+                self.log("Train/loss", loss, sync_dist=True)
 
-        return {"preds": preds.detach(), "y": y, "loss": loss}
+        return (
+            {"preds": preds.detach(), "y": y, "loss": loss}
+            if self.accelerator != "tpu"
+            else {"loss": loss}
+        )
 
     def training_step_end(self, outputs):
         """
         Training step end.
 
         :param outputs: outputs of the training step
         """
-        if self.control_mode == "keyboard":
+        if self.accelerator != "tpu" and self.control_mode == "keyboard":
             self.train_accuracy(outputs["preds"], outputs["y"])
             self.log(
                 "Train/acc_k@1_macro",
@@ -1338,6 +1350,7 @@ def __init__(
         learning_rate: float = 1e-5,
         weight_decay: float = 1e-3,
         encoder_type: str = "transformer",
+        accelerator: str = None,
     ):
 
         """
@@ -1376,6 +1389,7 @@ def __init__(
         self.learning_rate = learning_rate
         self.weight_decay = weight_decay
         self.encoder_type = encoder_type
+        self.accelerator = accelerator
 
         self.model = TEDD1104TransformerForImageReordering(
             cnn_model_name=self.cnn_model_name,
@@ -1427,24 +1441,37 @@ def training_step(self, batch, batch_idx):
         preds = self.model(x)
         loss = self.criterion(preds, y)
         self.total_batches += 1
-        self.running_loss += loss.item()
-        self.log("Train/loss", loss, sync_dist=True)
-        self.log(
-            "Train/running_loss", self.running_loss / self.total_batches, sync_dist=True
+
+        if self.accelerator != "tpu":
+            self.running_loss += loss.item()
+            self.log("Train/loss", loss, sync_dist=True)
+            self.log(
+                "Train/running_loss",
+                self.running_loss / self.total_batches,
+                sync_dist=True,
+            )
+        else:
+            if self.total_batches % 200 == 0:
+                self.log("Train/loss", loss, sync_dist=True)
+
+        return (
+            {"preds": torch.argmax(preds.detach(), dim=-1), "y": y, "loss": loss}
+            if self.accelerator != "tpu"
+            else {"loss": loss}
         )
-        return {"preds": torch.argmax(preds.detach(), dim=-1), "y": y, "loss": loss}
 
     def training_step_end(self, outputs):
         """
         Training step end.
 
         :param outputs: outputs of the training step
         """
-        self.train_accuracy(outputs["preds"], outputs["y"])
-        self.log(
-            "Train/acc",
-            self.train_accuracy,
-        )
+        if self.accelerator != "tpu":
+            self.train_accuracy(outputs["preds"], outputs["y"])
+            self.log(
+                "Train/acc",
+                self.train_accuracy,
+            )
 
     def validation_step(self, batch, batch_idx):
         """
diff --git a/train.py b/train.py
@@ -86,10 +86,12 @@ def train(
     )
     checkpoint_callback.CHECKPOINT_NAME_LAST = "{epoch}-last"
 
+    model.accelerator = accelerator
+
     trainer = pl.Trainer(
         devices=devices,
         accelerator=accelerator,
-        precision=precision,
+        precision=precision if precision == "bf16" else int(precision),
         strategy=strategy,
         val_check_interval=val_check_interval,
         accumulate_grad_batches=accumulation_steps,
@@ -212,6 +214,7 @@ def train_new_model(
             weight_decay=weight_decay,
             weights=variable_weights,
             label_smoothing=label_smoothing,
+            accelerator=accelerator,
         )
 
     else:
@@ -334,10 +337,12 @@ def continue_training(
     )
     checkpoint_callback.CHECKPOINT_NAME_LAST = "{epoch}-last"
 
+    model.accelerator = accelerator
+
     trainer = pl.Trainer(
         devices=devices,
         accelerator=accelerator,
-        precision=precision,
+        precision=precision if precision == "bf16" else int(precision),
         strategy=strategy,
         val_check_interval=val_check_interval,
         accumulate_grad_batches=accumulation_steps,
diff --git a/train_reorder.py b/train_reorder.py
@@ -84,7 +84,7 @@ def train(
     trainer = pl.Trainer(
         devices=devices,
         accelerator=accelerator,
-        precision=precision,
+        precision=precision if precision == "bf16" else int(precision),
         strategy=strategy,
         val_check_interval=val_check_interval,
         accumulate_grad_batches=accumulation_steps,
@@ -181,6 +181,7 @@ def train_new_model(
         sequence_size=sequence_size,
         learning_rate=learning_rate,
         weight_decay=weight_decay,
+        accelerator=accelerator,
     )
 
     train(
@@ -290,10 +291,12 @@ def continue_training(
     )
     checkpoint_callback.CHECKPOINT_NAME_LAST = "{epoch}-last"
 
+    model.accelerator = accelerator
+
     trainer = pl.Trainer(
         devices=devices,
         accelerator=accelerator,
-        precision=precision,
+        precision=precision if precision == "bf16" else int(precision),
         strategy=strategy,
         val_check_interval=val_check_interval,
         accumulate_grad_batches=accumulation_steps,
diff --git a/training_scripts/GPU/TEDD_1140_large.sh b/training_scripts/GPU/TEDD_1140_large.sh
@@ -7,12 +7,12 @@ python3 train.py --train_new \
   --accumulation_steps 4 \
   --max_epochs 40 \
   --cnn_model_name efficientnet_b7 \
-  --num_layers_encoder 6 \
+  --num_layers_encoder 4 \
   --embedded_size 512 \
   --learning_rate 1e-5 \
   --mask_prob 0.2 \
   --dropout_cnn_out 0.3 \
-  --dropout_encoder 0.15 \
+  --dropout_encoder 0.1 \
   --dropout_encoder_features 0.3 \
   --control_mode keyboard \
   --val_check_interval 0.5 \
diff --git a/training_scripts/TPU/TEDD_1140_base.sh b/training_scripts/TPU/TEDD_1140_base.sh
@@ -21,6 +21,7 @@ python3 train.py --train_new \
   --val_check_interval 0.5 \
   --hide_map_prob 0.4 \
   --devices 8 \
-  --accelerator tpu
+  --accelerator tpu \
+  --report_to tensorboard
 
 
diff --git a/training_scripts/TPU/TEDD_1140_large.sh b/training_scripts/TPU/TEDD_1140_large.sh
@@ -10,17 +10,18 @@ python3 train.py --train_new \
   --accumulation_steps 1 \
   --max_epochs 40 \
   --cnn_model_name efficientnet_b7 \
-  --num_layers_encoder 6 \
+  --num_layers_encoder 4 \
   --embedded_size 512 \
   --learning_rate 1e-5 \
   --mask_prob 0.2 \
   --dropout_cnn_out 0.3 \
-  --dropout_encoder 0.15 \
+  --dropout_encoder 0.1 \
   --dropout_encoder_features 0.3 \
   --control_mode keyboard \
   --val_check_interval 0.5 \
   --hide_map_prob 0.4 \
   --devices 8 \
-  --accelerator tpu
+  --accelerator tpu \
+  --report_to wandb