feat: add fp16 inference support (torch/onnx) (#871)

OrangeSodahub · web-flow · commit 1fe3a5a01123 · 2022-12-08T16:27:01.000+08:00
* feat: add fp16 inference in clip_torch * Revert "feat: add fp16 inference in clip_torch" This reverts commit 326e265. * feat: add fp16 inference in clip_torch * fix: device * fix: str to torch.dtype * fix: layernorm * feat: add fp16 inference in clip_trt * feat: add fp16 inference in clip_onnx * fix: housekeeping * fix: ci * fix: ci * fix: ci * fix: ci and get test path * fix: dtype amp and gpu test dependency * fix: layernorm * fix: cast dtype in visiontransformer * fix: clip_onnx * fix: clip_onnx * fix: convert onnx to fp16 * fix: dtype in preproc images * fix: dtype in preproc images * fix: typo * fix: dtype in clip_torch and fp16 in trt * fix: remove plain text in trt_test * fix: test * fix: typo * fix: stash * Revert "fix: stash" This reverts commit f72fd99. * fix: for test * fix: onnx * fix: for test * fix: for test * fix: trt * fix: convert onnx to fp16 before convert trt * fix: discard changes in trt * fix: optimize fp16 test * fix: move __cast_dtype__ * Revert "fix: move __cast_dtype__" This reverts commit edf4629. * fix: ci
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -113,12 +113,11 @@ jobs:
           pip install --no-cache-dir "server/[onnx]"
           pip install --no-cache-dir "server/[transformers]"
           pip install --no-cache-dir "server/[search]"
-          pip install --no-cache-dir "server/[transformers]"
       - name: Test
         id: test
         run: |
           pytest --suppress-no-test-exit-code --cov=clip_client --cov=clip_server --cov-report=xml \
-            -v -s -m "not gpu" ${{ matrix.test-path }}
+            -v -s ${{ matrix.test-path }}
           echo "::set-output name=codecov_flag::cas"
         timeout-minutes: 30
       - name: Check codecov file
@@ -158,6 +157,7 @@ jobs:
           python -m pip install wheel pytest pytest-cov nvidia-pyindex
           pip install -e "client/[test]"
           pip install -e "server/[tensorrt]"
+          pip install -e "server/[onnx]"
           {
             pip install -e "server/[flash-attn]"
           } || {
@@ -168,6 +168,8 @@ jobs:
         run: |
           pytest --suppress-no-test-exit-code --cov=clip_client --cov=clip_server --cov-report=xml \
             -v -s -m "gpu" ./tests/test_tensorrt.py
+          pytest --suppress-no-test-exit-code --cov=clip_client --cov=clip_server --cov-report=xml \
+            -v -s -m "gpu" ./tests/test_simple.py
           echo "::set-output name=codecov_flag::cas"
         timeout-minutes: 30
         env:
diff --git a/server/clip_server/executors/clip_onnx.py b/server/clip_server/executors/clip_onnx.py
@@ -27,6 +27,7 @@ def __init__(
         minibatch_size: int = 32,
         access_paths: str = '@r',
         model_path: Optional[str] = None,
+        dtype: Optional[str] = None,
         **kwargs,
     ):
         """
@@ -41,8 +42,17 @@ def __init__(
         :param model_path: The path to the model to be used. If not specified, the model will be downloaded or loaded
             from the local cache. Visit https://clip-as-service.jina.ai/user-guides/server/#use-custom-model-for-onnx
             to learn how to finetune custom models.
+        :param dtype: Inference data type. If not set, defaults to 'fp32' when the device is 'cpu' and to 'fp16' otherwise.
         """
         super().__init__(**kwargs)
+        import torch
+
+        if not device:
+            device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        self._device = device
+        if not dtype:
+            dtype = 'fp32' if self._device in ('cpu', torch.device('cpu')) else 'fp16'
+        self._dtype = dtype
 
         self._minibatch_size = minibatch_size
         self._access_paths = access_paths
@@ -55,18 +65,11 @@ def __init__(
         self._num_worker_preprocess = num_worker_preprocess
         self._pool = ThreadPool(processes=num_worker_preprocess)
 
-        self._model = CLIPOnnxModel(name, model_path)
+        self._model = CLIPOnnxModel(name, model_path, dtype)
         self._tokenizer = Tokenizer(name)
 
         self._image_transform = clip._transform_ndarray(self._model.image_size)
 
-        import torch
-
-        if not device:
-            self._device = 'cuda' if torch.cuda.is_available() else 'cpu'
-        else:
-            self._device = device
-
         # define the priority order for the execution providers
         providers = ['CPUExecutionProvider']
 
@@ -116,6 +119,7 @@ def _preproc_images(self, docs: 'DocumentArray', drop_image_content: bool):
                     preprocess_fn=self._image_transform,
                     return_np=True,
                     drop_image_content=drop_image_content,
+                    dtype=self._dtype,
                 )
 
     def _preproc_texts(self, docs: 'DocumentArray'):
diff --git a/server/clip_server/executors/clip_torch.py b/server/clip_server/executors/clip_torch.py
@@ -2,7 +2,7 @@
 import warnings
 from functools import partial
 from multiprocessing.pool import ThreadPool
-from typing import Dict, Optional
+from typing import Dict, Union, Optional
 
 import numpy as np
 import torch
@@ -12,6 +12,7 @@
     set_rank,
     split_img_txt_da,
 )
+from clip_server.helper import __cast_dtype__
 from clip_server.model import clip
 from clip_server.model.clip_model import CLIPModel
 from clip_server.model.tokenization import Tokenizer
@@ -28,6 +29,7 @@ def __init__(
         num_worker_preprocess: int = 4,
         minibatch_size: int = 32,
         access_paths: str = '@r',
+        dtype: Optional[Union[str, torch.dtype]] = None,
         **kwargs,
     ):
         """
@@ -40,6 +42,7 @@ def __init__(
             number if you encounter OOM errors.
         :param access_paths: The access paths to traverse on the input documents to get the images and texts to be
             processed. Visit https://docarray.jina.ai/fundamentals/documentarray/access-elements for more details.
+        :param dtype: Inference data type. If not set, defaults to torch.float32 when the device is 'cpu' and to torch.float16 otherwise.
         """
         super().__init__(**kwargs)
 
@@ -52,9 +55,17 @@ def __init__(
             self._access_paths = kwargs['traversal_paths']
 
         if not device:
-            self._device = 'cuda' if torch.cuda.is_available() else 'cpu'
-        else:
-            self._device = device
+            device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        self._device = device
+        if isinstance(dtype, str):
+            dtype = __cast_dtype__.get(dtype)
+        elif not dtype:
+            dtype = (
+                torch.float32
+                if self._device in ('cpu', torch.device('cpu'))
+                else torch.float16
+            )
+        self._dtype = dtype
 
         if not self._device.startswith('cuda') and (
             'OMP_NUM_THREADS' not in os.environ
@@ -77,7 +88,9 @@ def __init__(
         self._num_worker_preprocess = num_worker_preprocess
         self._pool = ThreadPool(processes=num_worker_preprocess)
 
-        self._model = CLIPModel(name, device=self._device, jit=jit, **kwargs)
+        self._model = CLIPModel(
+            name, device=self._device, jit=jit, dtype=dtype, **kwargs
+        )
         self._tokenizer = Tokenizer(name)
         self._image_transform = clip._transform_ndarray(self._model.image_size)
 
@@ -96,6 +109,7 @@ def _preproc_images(self, docs: 'DocumentArray', drop_image_content: bool):
                     device=self._device,
                     return_np=False,
                     drop_image_content=drop_image_content,
+                    dtype=self._dtype,
                 )
 
     def _preproc_texts(self, docs: 'DocumentArray'):
diff --git a/server/clip_server/executors/helper.py b/server/clip_server/executors/helper.py
@@ -1,8 +1,9 @@
-from typing import Tuple, List, Callable, Any, Dict
+from typing import Tuple, List, Callable, Any, Dict, Union
 import torch
 import numpy as np
 from docarray import Document, DocumentArray
 from docarray.math.distance.numpy import cosine
+from clip_server.helper import __cast_dtype__
 
 
 from clip_server.model.tokenization import Tokenizer
@@ -22,8 +23,12 @@ def preproc_image(
     device: str = 'cpu',
     return_np: bool = False,
     drop_image_content: bool = False,
+    dtype: Union[str, torch.dtype] = torch.float32,
 ) -> Tuple['DocumentArray', Dict]:
 
+    if isinstance(dtype, str):
+        dtype = __cast_dtype__.get(dtype)
+
     tensors_batch = []
 
     for d in da:
@@ -42,7 +47,7 @@ def preproc_image(
         if drop_image_content:
             d.pop('blob', 'tensor')
 
-    tensors_batch = torch.stack(tensors_batch).type(torch.float32)
+    tensors_batch = torch.stack(tensors_batch).type(dtype)
 
     if return_np:
         tensors_batch = tensors_batch.cpu().numpy()
diff --git a/server/clip_server/helper.py b/server/clip_server/helper.py
@@ -2,6 +2,7 @@
 import os
 import sys
 import threading
+import torch
 from packaging.version import Version
 from urllib.request import Request, urlopen
 
@@ -19,6 +20,9 @@
 )
 
 
+__cast_dtype__ = {'fp16': torch.float16, 'fp32': torch.float32, 'bf16': torch.bfloat16}
+
+
 def _version_check(package: str = None, github_repo: str = None):
     try:
 
diff --git a/server/clip_server/model/clip_onnx.py b/server/clip_server/model/clip_onnx.py
@@ -1,5 +1,5 @@
 import os
-from typing import Dict
+from typing import Dict, Optional
 
 from clip_server.model.pretrained_models import (
     download_model,
@@ -201,8 +201,11 @@
 
 
 class CLIPOnnxModel(BaseCLIPModel):
-    def __init__(self, name: str, model_path: str = None):
+    def __init__(
+        self, name: str, model_path: str = None, dtype: Optional[str] = 'fp32'
+    ):
         super().__init__(name)
+        self._dtype = dtype
         if name in _MODELS:
             if not model_path:
                 cache_dir = os.path.expanduser(
@@ -237,6 +240,22 @@ def __init__(self, name: str, model_path: str = None):
                         f'The given model path {model_path} should be a folder containing both '
                         f'`textual.onnx` and `visual.onnx`.'
                     )
+            if dtype == 'fp16':
+                import onnx
+                from onnxmltools.utils import float16_converter
+
+                _textual_model_fp16 = (
+                    float16_converter.convert_float_to_float16_model_path(
+                        self._textual_path
+                    )
+                )
+                _visual_model_fp16 = (
+                    float16_converter.convert_float_to_float16_model_path(
+                        self._visual_path
+                    )
+                )
+                onnx.save_model(_textual_model_fp16, self._textual_path)
+                onnx.save_model(_visual_model_fp16, self._visual_path)
         else:
             raise RuntimeError(
                 'CLIP model {} not found or not supports ONNX backend; below is a list of all available models:\n{}'.format(
diff --git a/server/clip_server/model/model.py b/server/clip_server/model/model.py
@@ -15,6 +15,7 @@
 from dataclasses import dataclass
 from typing import Tuple, Union, Optional
 from copy import deepcopy
+from clip_server.helper import __cast_dtype__
 from open_clip.transformer import QuickGELU, LayerNorm, LayerNormFp32, Attention
 from open_clip.timm_model import TimmModel
 from open_clip.factory import _MODEL_CONFIGS
@@ -81,6 +82,11 @@ def __init__(
         super().__init__(image_size, patch_size, output_dim=output_dim, **kwargs)
         self.transformer = Transformer(dtype=dtype, **kwargs)
 
+    def forward(self, x: torch.Tensor):
+        dtype = self.transformer.get_cast_dtype()
+        x = x.to(dtype)
+        return super().forward(x)
+
 
 class TextTransformer(_TextTransformer):
     def __init__(
@@ -435,7 +441,9 @@ def load_openai_model(
     preprocess : Callable[[PIL.Image], torch.Tensor]
         A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input
     """
-    if dtype is None:
+    if isinstance(dtype, str):
+        dtype = __cast_dtype__.get(dtype, 'amp')
+    elif dtype is None:
         dtype = (
             torch.float32 if device in ('cpu', torch.device('cpu')) else torch.float16
         )
@@ -550,7 +558,9 @@ def load_openclip_model(
     pretrained_image: bool = False,
     dtype: Optional[Union[str, torch.dtype]] = None,
 ):
-    if dtype is None:
+    if isinstance(dtype, str):
+        dtype = __cast_dtype__.get(dtype)
+    elif dtype is None:
         dtype = (
             torch.float32 if device in ('cpu', torch.device('cpu')) else torch.float16
         )
diff --git a/server/setup.py b/server/setup.py
@@ -53,6 +53,7 @@
         'onnx': [
             'onnxruntime',
             'onnx',
+            'onnxmltools',
         ]
         + (['onnxruntime-gpu>=1.8.0'] if sys.platform != 'darwin' else []),
         'tensorrt': ['nvidia-tensorrt'],
diff --git a/tests/test_simple.py b/tests/test_simple.py
@@ -27,6 +27,7 @@ def test_protocols(port_generator, protocol, jit, pytestconfig):
         c.profile(content=f'{pytestconfig.rootdir}/tests/img/00000.jpg')
 
 
+@pytest.mark.gpu
 @pytest.mark.parametrize(
     'inputs',
     [
@@ -48,6 +49,7 @@ def test_plain_inputs(make_flow, inputs):
     )
 
 
+@pytest.mark.gpu
 @pytest.mark.parametrize(
     'inputs',
     [

Original file line number	Diff line number	Diff line change
`@@ -53,6 +53,7 @@`
`53`	`53`	`'onnx': [`
`54`	`54`	`'onnxruntime',`
`55`	`55`	`'onnx',`
	`56`	`+ 'onnxmltools',`
`56`	`57`	`]`
`57`	`58`	`+ (['onnxruntime-gpu>=1.8.0'] if sys.platform != 'darwin' else []),`
`58`	`59`	`'tensorrt': ['nvidia-tensorrt'],`
Original file line number	Diff line number	Diff line change
`@@ -27,6 +27,7 @@ def test_protocols(port_generator, protocol, jit, pytestconfig):`
`27`	`27`	`c.profile(content=f'{pytestconfig.rootdir}/tests/img/00000.jpg')`
`28`	`28`
`29`	`29`
	`30`	`+@pytest.mark.gpu`
`30`	`31`	`@pytest.mark.parametrize(`
`31`	`32`	`'inputs',`
`32`	`33`	`[`
`@@ -48,6 +49,7 @@ def test_plain_inputs(make_flow, inputs):`
`48`	`49`	`)`
`49`	`50`
`50`	`51`
	`52`	`+@pytest.mark.gpu`
`51`	`53`	`@pytest.mark.parametrize(`
`52`	`54`	`'inputs',`
`53`	`55`	`[`