From 943ef054ba449974fecce163247191acb88f4d4e Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Mon, 1 Jul 2024 10:07:08 -0700 Subject: [PATCH] refactor: Address some issues with enums and overhaul documentation Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan ci: Adding typos pre-commit Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- .pre-commit-config.yaml | 4 + CODE_OF_CONDUCT.md | 2 +- CONTRIBUTING.md | 6 +- cmake/paths.cmake | 2 +- dev_dep_versions.yml | 1 - docker/Dockerfile | 2 +- docker/README.md | 2 +- docsrc/RELEASE_CHECKLIST.md | 4 +- docsrc/conf.py | 7 +- docsrc/contributors/conversion.rst | 2 +- docsrc/contributors/dynamo_converters.rst | 8 +- docsrc/contributors/lowering.rst | 20 +- docsrc/contributors/runtime.rst | 4 +- docsrc/dynamo/dynamo_export.rst | 2 +- docsrc/dynamo/torch_compile.rst | 4 +- .../getting_started_with_windows.rst | 34 - docsrc/getting_started/installation.rst | 132 +++- docsrc/getting_started/quick_start.rst | 74 +++ docsrc/index.rst | 70 ++- docsrc/py_api/dynamo.rst | 7 +- docsrc/py_api/logging.rst | 2 - docsrc/py_api/ptq.rst | 18 +- docsrc/py_api/runtime.rst | 26 + docsrc/py_api/torch_tensorrt.rst | 13 + .../creating_torchscript_module_in_python.rst | 12 +- docsrc/ts/getting_started_with_cpp_api.rst | 20 +- docsrc/{user_guide => ts}/ptq.rst | 0 .../user_guide/torch_tensorrt_explained.rst | 107 ++++ examples/custom_converters/README.md | 6 +- examples/custom_converters/elu_model.py | 2 +- examples/distributed_inference/README.md | 2 +- examples/dynamo/custom_kernel_plugins.py | 12 +- examples/dynamo/refit_engine_example.py | 2 +- examples/int8/ptq/README.md | 6 +- examples/int8/training/vgg16/README.md | 2 +- examples/int8/training/vgg16/finetune_qat.py | 13 +- examples/int8/training/vgg16/main.py | 2 +- py/README.md | 18 +- py/torch_tensorrt/_Device.py | 2 +- py/torch_tensorrt/_Input.py | 16 +- py/torch_tensorrt/_compile.py | 20 +- py/torch_tensorrt/_enums.py | 592 +++++++++++++++++- py/torch_tensorrt/csrc/torch_tensorrt_py.cpp | 6 +- py/torch_tensorrt/csrc/util.h | 4 +- py/torch_tensorrt/dynamo/_compiler.py | 64 +- py/torch_tensorrt/dynamo/_refit.py | 11 +- py/torch_tensorrt/dynamo/_settings.py | 2 +- py/torch_tensorrt/dynamo/_tracer.py | 2 +- .../dynamo/conversion/_conversion.py | 5 +- .../dynamo/conversion/converter_utils.py | 15 +- .../dynamo/conversion/impl/shape.py | 5 +- .../dynamo/conversion/impl/shuffle.py | 6 +- .../dynamo/conversion/truncate_double.py | 2 +- .../remove_input_alias_fixing_clones.py | 2 +- .../passes/replace_max_pool_with_indices.py | 2 +- .../partitioning/_adjacency_partitioner.py | 2 +- .../partitioning/_global_partitioner.py | 2 +- .../runtime/_PythonTorchTensorRTModule.py | 5 +- .../dynamo/runtime/_TorchTensorRTModule.py | 27 +- py/torch_tensorrt/dynamo/utils.py | 33 +- py/torch_tensorrt/logging.py | 48 +- py/torch_tensorrt/runtime/__init__.py | 5 + .../runtime/multi_device_safe_mode.py | 18 + py/torch_tensorrt/ts/_Device.py | 2 +- py/torch_tensorrt/ts/_Input.py | 6 +- py/torch_tensorrt/ts/_compile_spec.py | 6 +- py/torch_tensorrt/ts/_compiler.py | 4 +- py/torch_tensorrt/ts/ptq.py | 22 +- pyproject.toml | 26 +- setup.py | 6 +- tests/README.md | 4 +- tests/py/dynamo/conversion/harness.py | 4 +- .../dynamo/conversion/test_index_put_aten.py | 2 +- .../py/dynamo/conversion/test_linear_aten.py | 2 +- tests/py/dynamo/models/test_dtype_support.py | 37 ++ tools/cpp_benchmark/README.md | 2 +- tools/opset_coverage.ipynb | 516 ++++++++++++++- 77 files changed, 1809 insertions(+), 376 deletions(-) delete mode 100644 
docsrc/getting_started/getting_started_with_windows.rst create mode 100644 docsrc/getting_started/quick_start.rst create mode 100644 docsrc/py_api/runtime.rst rename docsrc/{user_guide => ts}/ptq.rst (100%) create mode 100644 docsrc/user_guide/torch_tensorrt_explained.rst diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4d6b9eacc7..0605f7706e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -51,6 +51,10 @@ repos: hooks: - id: black exclude: ^examples/custom_converters/elu_converter/setup.py|^docs + - repo: https://github.com/crate-ci/typos + rev: v1.22.9 + hooks: + - id: typos - repo: local hooks: - id: dont-commit-upstream diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index 08b500a221..fe49a253d0 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -6,7 +6,7 @@ In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to make participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, sex characteristics, gender identity and expression, -level of experience, education, socio-economic status, nationality, personal +level of experience, education, socioeconomic status, nationality, personal appearance, race, religion, or sexual identity and orientation. ## Our Standards diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ba9ab32cf6..930321614b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -4,9 +4,9 @@ Do try to fill an issue with your feature or bug before filling a PR (op support is generally an exception as long as you provide tests to prove functionality). There is also a backlog (https://github.com/pytorch/TensorRT/issues) of issues which are tagged with the area of focus, a coarse priority level and whether the issue may be accessible to new contributors. Let us know if you are interested in working on a issue. We are happy to provide guidance and mentorship for new contributors. Though note, there is no claiming of issues, we prefer getting working code quickly vs. addressing concerns about "wasted work". -#### Development enviornment +#### Development environment -Our build system relies on `bazel` (https://bazel.build/). Though there are many ways to install `bazel`, the prefered method is to use `bazelisk` (https://github.com/bazelbuild/bazelisk) which makes it simple to set up the correct version of bazel on the fly. Additional developement dependencies can be installed via the `requirements-dev.txt` file. +Our build system relies on `bazel` (https://bazel.build/). Though there are many ways to install `bazel`, the preferred method is to use `bazelisk` (https://github.com/bazelbuild/bazelisk) which makes it simple to set up the correct version of bazel on the fly. Additional development dependencies can be installed via the `requirements-dev.txt` file. 
#### Communication @@ -27,7 +27,7 @@ We use the PyTorch Slack for communication about core development, integration w - Avoid introducing unnecessary complexity into existing code so that maintainability and readability are preserved -- Try to avoid commiting commented out code +- Try to avoid committing commented out code - Minimize warnings (and no errors) from the compiler diff --git a/cmake/paths.cmake b/cmake/paths.cmake index 3822c698ff..b80b18a6d3 100644 --- a/cmake/paths.cmake +++ b/cmake/paths.cmake @@ -6,7 +6,7 @@ set(ARCHIVE_OUTPUT_DIRECTORY "lib") set(RUNTIME_OUTPUT_DIRECTORY "bin") set(HEADERS_OUTPUT_DIRECTORY "include") -#Set target ouput directory in the build directory +#Set target output directory in the build directory set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/${ARCHIVE_OUTPUT_DIRECTORY}") set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/${LIBRARY_OUTPUT_DIRECTORY}") set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/${RUNTIME_OUTPUT_DIRECTORY}") diff --git a/dev_dep_versions.yml b/dev_dep_versions.yml index 5c560c41f9..a8da87116c 100644 --- a/dev_dep_versions.yml +++ b/dev_dep_versions.yml @@ -1,3 +1,2 @@ -__version__: "2.5.0.dev0" __cuda_version__: "12.4" __tensorrt_version__: "10.0.1" diff --git a/docker/Dockerfile b/docker/Dockerfile index 79c55a2e93..9d20c2f6e8 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -50,7 +50,7 @@ RUN TENSORRT_MAJOR_VERSION=`echo ${TENSORRT_VERSION} | cut -d '.' -f 1` && \ RUN wget -q https://github.com/bazelbuild/bazelisk/releases/download/v1.17.0/bazelisk-linux-amd64 -O /usr/bin/bazel &&\ chmod a+x /usr/bin/bazel -# Build Torch-TensorRT in an auxillary container +# Build Torch-TensorRT in an auxiliary container FROM base as torch-tensorrt-builder-base ARG ARCH="x86_64" diff --git a/docker/README.md b/docker/README.md index 824dae4d54..bddaaab0ba 100644 --- a/docker/README.md +++ b/docker/README.md @@ -35,7 +35,7 @@ nvidia-docker run --gpus all -it --shm-size=8gb --env="DISPLAY" --volume="/tmp/. Test: -You can run any converter test to verify if Torch-TRT built sucessfully inside the container. Once you launch the container, you can run +You can run any converter test to verify if Torch-TRT built successfully inside the container. Once you launch the container, you can run ``` bazel test //tests/core/conversion/converters:test_activation --compilation_mode=opt --test_output=summary --config use_precompiled_torchtrt --config pre_cxx11_abi ``` diff --git a/docsrc/RELEASE_CHECKLIST.md b/docsrc/RELEASE_CHECKLIST.md index 77bb973487..0900d0b2bd 100644 --- a/docsrc/RELEASE_CHECKLIST.md +++ b/docsrc/RELEASE_CHECKLIST.md @@ -9,7 +9,7 @@ While Torch-TensorRT is in alpha, patch versions are bumped sequentially on brea In beta Torch-TensorRT will get a minor version bump on breaking changes, or upgrade to the next version of PyTorch, patch version will be incremented based on significant bug fixes, or siginficant new functionality in the compiler. Once Torch-TensorRT hits version 1.0.0, major versions are bumped on breaking API changes, breaking changes or significant new functionality in the compiler -will result in a minor version bump and sigificant bug fixes will result in a patch version change. +will result in a minor version bump and significant bug fixes will result in a patch version change. 
## Steps to Packaging a Release @@ -50,7 +50,7 @@ will result in a minor version bump and sigificant bug fixes will result in a pa - `[3, 1920, 1080]` (P2) - Batch Sizes: 1, 4, 8, 16, 32 - Frameworks: PyTorch, Torch-TensorRT, ONNX + TRT - - If any models do not convert to ONNX / TRT, that is fine. Mark them as failling / no result + - If any models do not convert to ONNX / TRT, that is fine. Mark them as failing / no result - Devices: - A100 (P0) - A30 / A30 MIG (P1) (same batches as T4 diff --git a/docsrc/conf.py b/docsrc/conf.py index d1cae714cc..2e782358cb 100644 --- a/docsrc/conf.py +++ b/docsrc/conf.py @@ -25,7 +25,7 @@ # -- Project information ----------------------------------------------------- project = "Torch-TensorRT" -copyright = "2022, NVIDIA Corporation" +copyright = "2024, NVIDIA Corporation" author = "NVIDIA Corporation" version = f"v{torch_tensorrt.__version__}" @@ -151,6 +151,9 @@ "master_doc": True, "version_info": { "main": "https://pytorch.org/TensorRT/", + "v2.3.0": "https://pytorch.org/TensorRT/v2.3.0", + "v2.2.0": "https://pytorch.org/TensorRT/v2.2.0", + "v2.1.0": "https://pytorch.org/TensorRT/v2.1.0", "v1.4.0": "https://pytorch.org/TensorRT/v1.4.0", "v1.3.0": "https://pytorch.org/TensorRT/v1.3.0", "v1.2.0": "https://pytorch.org/TensorRT/v1.2.0", @@ -186,6 +189,8 @@ nbsphinx_execute = "never" +autodoc_member_order = "groupwise" + # -- A patch that prevents Sphinx from cross-referencing ivar tags ------- # See http://stackoverflow.com/a/41184353/3343043 diff --git a/docsrc/contributors/conversion.rst b/docsrc/contributors/conversion.rst index f19fc5eba8..fdb477bc67 100644 --- a/docsrc/contributors/conversion.rst +++ b/docsrc/contributors/conversion.rst @@ -3,7 +3,7 @@ Conversion Phase ================== -Once the graph has be simplified to a form thats easy to convert, we then set up a conversion context +Once the graph has be simplified to a form that's easy to convert, we then set up a conversion context to manage the construction of a TensorRT ``INetworkDefinition`` from the blocks nodes. The conversion context records the set of converted nodes, block inputs and outputs and other information about the conversion of the graph. This data is then used to help converters link together layers and also hold build time diff --git a/docsrc/contributors/dynamo_converters.rst b/docsrc/contributors/dynamo_converters.rst index 3238d609f3..7cc85f5bea 100644 --- a/docsrc/contributors/dynamo_converters.rst +++ b/docsrc/contributors/dynamo_converters.rst @@ -36,7 +36,7 @@ The decorator takes a number of arguments: All that is required for a converter is the key. The function body is responsible for taking the current state of the network and adding the next subgraph to perform the op specified in the decorator with TensorRT operations. -The function is provided arguments as the native PyTorch op would be provided with the added case of numpy arrays for frozen Tensor attributes or TensorRT ITensors which are ouput Tensors of previous nodes, correspoding to edges/output Tensors of intermediate operations in the graph. +The function is provided arguments as the native PyTorch op would be provided with the added case of numpy arrays for frozen Tensor attributes or TensorRT ITensors which are output Tensors of previous nodes, corresponding to edges/output Tensors of intermediate operations in the graph. To determine the types expected as well as the return type of the converter, look at the definition of the op being converted. 
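For illustration only, a registration following this contract might look like the sketch below; the import path and the use of ``ctx.net`` are assumptions based on the description above and may differ between versions:

.. code-block:: python

    import tensorrt as trt
    import torch

    from torch_tensorrt.dynamo.conversion import dynamo_tensorrt_converter


    @dynamo_tensorrt_converter(torch.ops.aten.leaky_relu.default)  # key: the op to convert
    def aten_ops_leaky_relu(ctx, target, args, kwargs, name):
        # args mirror the ATen schema: leaky_relu(Tensor self, Scalar negative_slope=0.01)
        # args[0] is assumed to already be a TensorRT ITensor here
        layer = ctx.net.add_activation(args[0], trt.ActivationType.LEAKY_RELU)
        layer.alpha = float(args[1]) if len(args) > 1 else 0.01
        layer.name = name
        return layer.get_output(0)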
In the case of ``aten`` operations, this file will be the source of truth: https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/native_functions.yaml Since many converters a developer may write are a composition of lower level operators, instead of needing to implement the converter in raw TensorRT, the ``torch_tensorrt.dynamo.conversion.impl`` subpackage contains many implementations of operations that can be chained to create a TensorRT subgraph. @@ -53,14 +53,14 @@ Capability Validation There are some converters which have special cases to be accounted for. In those cases, one should use ``capability_validators`` to register the converter using ``@dynamo_tensorrt_converter`` We illustrate this through ``torch.ops.aten.embedding.default``. It has parameters - ``scale_grad_by_freq`` and ``sparse`` which are not currently supported by the implementation. -In such cases we can write validator ``embedding_param_validator`` which implements that given those paramters the converter is not supported and register the converter by +In such cases we can write validator ``embedding_param_validator`` which implements that given those parameters the converter is not supported and register the converter by Type Contract ^^^^^^^^^^^^^^^ The function is expected to follow the type contract established by the signature. This includes accepting the union of valid PyTorch types + numpy arrays for constant tensors and TensorRT ITensors. -In the case that only a subset of types is supported in the converter, you can also add the ``torch_tensorrt.dynamo.conversion.converter_utils.enforce_tensor_types``, which allows you to specify a dictionary mapping between input positions and types that those inputs can take. Where possible the decorator will convert inputs to match these types prefering the order provided. +In the case that only a subset of types is supported in the converter, you can also add the ``torch_tensorrt.dynamo.conversion.converter_utils.enforce_tensor_types``, which allows you to specify a dictionary mapping between input positions and types that those inputs can take. Where possible the decorator will convert inputs to match these types preferring the order provided. ``int`` keys in the dictionary will refer to positional arguments in ``args``. ``str`` keys will refer to keyword arguments in ``kwargs``. @@ -105,7 +105,7 @@ Some operations do not produce TensorRT subgraphs as a side-effect. These are te Operator Decomposition ----------------------- -There are some converters which can be decomposed into suboperations in PyTorch and need not have seperate converter registration. +There are some converters which can be decomposed into suboperations in PyTorch and need not have separate converter registration. Such converters can be implemented via a decomposition Example: ``addmm`` diff --git a/docsrc/contributors/lowering.rst b/docsrc/contributors/lowering.rst index a82f497ed2..69fa587988 100644 --- a/docsrc/contributors/lowering.rst +++ b/docsrc/contributors/lowering.rst @@ -30,12 +30,12 @@ Eliminate Dead Code Dead code elimination will check if a node has side effects and not delete it if it does. 
-Eliminate Exeception Or Pass Pattern +Eliminate Exception Or Pass Pattern *************************************** `Torch-TensorRT/core/lowering/passes/exception_elimination.cpp `_ -A common pattern in scripted modules are dimension gaurds which will throw execptions if +A common pattern in scripted modules are dimension guards which will throw exceptions if the input dimension is not what was expected. .. code-block:: none @@ -48,9 +48,9 @@ the input dimension is not what was expected. block1(): -> () -Since we are resolving all of this at compile time and there are no execptions in the TensorRT graph, we just remove it. +Since we are resolving all of this at compile time and there are no exceptions in the TensorRT graph, we just remove it. -Eliminate Redundant Gaurds +Eliminate Redundant Guards *************************************** `torch/csrc/jit/passes/guard_elimination.h `_ @@ -63,7 +63,7 @@ Freeze Module `torch/csrc/jit/passes/freeze_module.h `_ -Freeze attributes and inline constants and modules. Propogates constants in the graph. +Freeze attributes and inline constants and modules. Propagates constants in the graph. Fuse AddMM Branches *************************************** @@ -71,7 +71,7 @@ Fuse AddMM Branches `Torch-TensorRT/core/lowering/passes/fuse_addmm_branches.cpp `_ A common pattern in scripted modules is tensors of different dimensions use different constructions for implementing linear layers. We fuse these -different varients into a single one that will get caught by the Unpack AddMM pass. +different variants into a single one that will get caught by the Unpack AddMM pass. .. code-block:: none @@ -103,7 +103,7 @@ Fuse Flatten Linear `Torch-TensorRT/core/lowering/passes/fuse_flatten_linear.cpp `_ -TensorRT implicity flattens input layers into fully connected layers when they are higher than 1D. So when there is a +TensorRT implicitly flattens input layers into fully connected layers when they are higher than 1D. So when there is a ``aten::flatten`` -> ``aten::linear`` pattern we remove the ``aten::flatten``. Lower Graph @@ -147,7 +147,7 @@ Places delimiting nodes around module calls pre freezing to signify where in the Looks for delimiters then marks all nodes between the delimiters to tell partitioning to run them in PyTorch -Peephole Optimze +Peephole Optimize *************************************** `torch/csrc/jit/passes/peephole_optimze.h `_ @@ -179,7 +179,7 @@ Remove To `Torch-TensorRT/core/lowering/passes/remove_to.cpp `_ -Removes ``aten::to`` operators that do casting, since TensorRT mangages it itself. It is important that this is one of the last passes run so that +Removes ``aten::to`` operators that do casting, since TensorRT manages it itself. It is important that this is one of the last passes run so that other passes have a change to move required cast operators out of the main namespace. Unpack AddMM @@ -204,7 +204,7 @@ Unroll Loops `torch/csrc/jit/passes/loop_unrolling.h `_ -Unrolls the operations of compatable loops (e.g. sufficently short) so that you only have to go through the loop once. +Unrolls the operations of compatible loops (e.g. sufficiently short) so that you only have to go through the loop once. 
Replace Tile with Repeat *************************************** diff --git a/docsrc/contributors/runtime.rst b/docsrc/contributors/runtime.rst index 23d83b6db2..94021f986c 100644 --- a/docsrc/contributors/runtime.rst +++ b/docsrc/contributors/runtime.rst @@ -6,9 +6,9 @@ Runtime Phase The Runtime phase is responsible for constructing self standing TorchScript graphs with embedded TensorRT engines and serving as the runtime when these engines are called. The main interface accepts a serialized TensorRT engine. The execution phase will deserialize and wrap this engine in a class which maintains a execution context for each engine -and some metadata about its inputs and outputs and is compatable with the TorchScript interpreter so that +and some metadata about its inputs and outputs and is compatible with the TorchScript interpreter so that it can be moved around and used like other TorchScript IValues. The engine is run by providing it and inputs -to the ``tensorrt::execute_engine`` operator which will take the engine and its inputs and return the results of engine exeuction. +to the ``tensorrt::execute_engine`` operator which will take the engine and its inputs and return the results of engine execution. Background diff --git a/docsrc/dynamo/dynamo_export.rst b/docsrc/dynamo/dynamo_export.rst index 7a17cd5df2..f23dce679e 100644 --- a/docsrc/dynamo/dynamo_export.rst +++ b/docsrc/dynamo/dynamo_export.rst @@ -27,7 +27,7 @@ usage of the dynamo frontend .. note:: ``torch_tensorrt.dynamo.compile`` is the main API for users to interact with Torch-TensorRT dynamo frontend. The input type of the model should be ``ExportedProgram`` (ideally the output of ``torch.export.export`` or ``torch_tensorrt.dynamo.trace`` (discussed in the section below)) and output type is a ``torch.fx.GraphModule`` object. -Customizeable Settings +Customizable Settings ---------------------- There are lot of options for users to customize their settings for optimizing with TensorRT. diff --git a/docsrc/dynamo/torch_compile.rst b/docsrc/dynamo/torch_compile.rst index 955f64d64c..14498fc0c3 100644 --- a/docsrc/dynamo/torch_compile.rst +++ b/docsrc/dynamo/torch_compile.rst @@ -26,7 +26,7 @@ The primary goal of the Torch-TensorRT `torch.compile` backend is to enable Just The backend can handle a variety of challenging model structures and offers a simple-to-use interface for effective acceleration of models. Additionally, it has many customization options to ensure the compilation process is fitting to the specific use case. -Customizeable Settings +Customizable Settings ----------------- .. autoclass:: CompilationSettings @@ -87,7 +87,7 @@ If key operators for your model are unsupported, see :ref:`dynamo_conversion` to Feasibility of Serialization ^^^^^^^^^^^^^^^^^ -Compilation can also be helpful in demonstrating graph breaks and the feasibility of serialization of a particular model. For instance, if a model has no graph breaks and compiles successfully with the Torch-TensorRT backend, then that model should be compileable and serializeable via the `torch_tensorrt` Dynamo IR, as discussed in :ref:`dynamic_shapes`. To determine the number of graph breaks in a model, the `torch._dynamo.explain` function is very useful: +Compilation can also be helpful in demonstrating graph breaks and the feasibility of serialization of a particular model. 
For instance, if a model has no graph breaks and compiles successfully with the Torch-TensorRT backend, then that model should be compilable and serializeable via the `torch_tensorrt` Dynamo IR, as discussed in :ref:`dynamic_shapes`. To determine the number of graph breaks in a model, the `torch._dynamo.explain` function is very useful: .. code-block:: python diff --git a/docsrc/getting_started/getting_started_with_windows.rst b/docsrc/getting_started/getting_started_with_windows.rst deleted file mode 100644 index a90221b532..0000000000 --- a/docsrc/getting_started/getting_started_with_windows.rst +++ /dev/null @@ -1,34 +0,0 @@ -.. _getting_started_windows: - -Building Torch-TensorRT on Windows -==================================== - -Torch-TensorRT has community support for Windows platform using CMake - -Prerequisite: - -* Microsoft VS 2022 Tools -* Bazelisk -* CUDA - - -Build steps -------------------- - -* Open the app "x64 Native Tools Command Prompt for VS 2022" - note that Admin priveleges may be necessary -* Ensure Bazelisk (Bazel launcher) is installed on your machine and available from the command line. Package installers such as Chocolatey can be used to install Bazelisk -* Install latest version of Torch (i.e. with `pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu124`) -* Clone the Torch-TensorRT repository and navigate to its root directory -* Run `pip install ninja wheel setuptools` -* Run `pip install --pre -r py/requirements.txt` -* Run `set DISTUTILS_USE_SDK=1` -* Run `python setup.py bdist_wheel` -* Run `pip install dist/*.whl` - -Advanced setup and Troubleshooting -------------------- -In the `WORKSPACE` file, the `cuda_win`, `libtorch_win`, and `tensorrt_win` are Windows-specific modules which can be customized. For instance, if you would like to build with a different version of CUDA, or your CUDA installation is in a non-standard location, update the `path` in the `cuda_win` module. - -Similarly, if you would like to use a different version of pytorch or tensorrt, customize the `urls` in the `libtorch_win` and `tensorrt_win` modules, respectively. - -Local versions of these packages can also be used on Windows. See `toolchains\ci_workspaces\WORKSPACE.win.release.tmpl` for an example of using a local version of TensorRT on Windows. diff --git a/docsrc/getting_started/installation.rst b/docsrc/getting_started/installation.rst index 379756c347..26f13c0aca 100644 --- a/docsrc/getting_started/installation.rst +++ b/docsrc/getting_started/installation.rst @@ -1,15 +1,15 @@ .. _installation: Installation -============= +################## Precompiled Binaries -********************* +--------------------- -Torch-TensorRT 2.x is centered primarily around Python. As such, precompiled releases can be found on pypi.org +Torch-TensorRT 2.x is centered primarily around Python. 
As such, precompiled releases can be found on `pypi.org `_ Dependencies ---------------- +~~~~~~~~~~~~~~ You need to have CUDA, PyTorch, and TensorRT (python package is sufficient) installed to use Torch-TensorRT @@ -18,7 +18,7 @@ You need to have CUDA, PyTorch, and TensorRT (python package is sufficient) inst Installing Torch-TensorRT ---------------------------- +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ You can install the python package using @@ -26,8 +26,10 @@ You can install the python package using python -m pip install torch torch-tensorrt tensorrt +Packages are uploaded for Linux on x86 and Windows + Installing Torch-TensorRT for a specific CUDA version --------------------------------------------------------- +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Similar to PyTorch, Torch-TensorRT has builds compiled for different versions of CUDA. These are distributed on PyTorch's package index @@ -38,7 +40,7 @@ For example CUDA 11.8 python -m pip install torch torch-tensorrt tensorrt --extra-index-url https://download.pytorch.org/whl/cu118 Installing Nightly Builds ---------------------------- +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Torch-TensorRT distributed nightlies targeting the PyTorch nightly. These can be installed from the PyTorch nightly package index (separated by CUDA version) @@ -51,19 +53,22 @@ Torch-TensorRT distributed nightlies targeting the PyTorch nightly. These can be .. _bin-dist: C++ Precompiled Binaries (TorchScript Only) --------------------------------------------------- +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Precompiled tarballs for releases are provided here: https://github.com/pytorch/TensorRT/releases .. _compile-from-source: Compiling From Source -****************************************** +------------------------ + +Building on Linux +~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. _installing-deps: -Dependencies for Compilation -------------------------------- +Dependencies +^^^^^^^^^^^^^^ * Torch-TensorRT is built with **Bazel**, so begin by installing it. @@ -95,7 +100,7 @@ Dependencies for Compilation * https://github.com/pytorch/TensorRT/blob/4e5b0f6e860910eb510fa70a76ee3eb9825e7a4d/WORKSPACE#L53C1-L53C1 -* **TensorRT** is not required to be installed on the system to build Torch-TensorRT, in fact this is preferable to ensure reproducable builds. If versions other than the default are needed +* **TensorRT** is not required to be installed on the system to build Torch-TensorRT, in fact this is preferable to ensure reproducible builds. 
If versions other than the default are needed point the WORKSPACE file to the URL of the tarball or download the tarball for TensorRT from https://developer.nvidia.com and update the paths in the WORKSPACE file here https://github.com/pytorch/TensorRT/blob/4e5b0f6e860910eb510fa70a76ee3eb9825e7a4d/WORKSPACE#L71 For example: @@ -114,13 +119,13 @@ Dependencies for Compilation ], ) - Remember at runtime, these libraries must be added to your ``LD_LIBRARY_PATH`` explicity + Remember at runtime, these libraries must be added to your ``LD_LIBRARY_PATH`` explicitly If you have a local version of TensorRT installed, this can be used as well by commenting out the above lines and uncommenting the following lines https://github.com/pytorch/TensorRT/blob/4e5b0f6e860910eb510fa70a76ee3eb9825e7a4d/WORKSPACE#L114C1-L124C3 Building the Package ---------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Once the WORKSPACE has been configured properly, all that is required to build torch-tensorrt is the following command @@ -135,12 +140,41 @@ To build the wheel file python -m pip wheel --no-deps --pre . --extra-index-url https://download.pytorch.org/whl/nightly/cu124 -w dist +Additional Build Options +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Some features in the library are optional and allow builds to be lighter or more portable. + +Python Only Distribution +............................ + +There are multiple features of the library which require C++ components to be enabled. This includes both the TorchScript frontend which accepts TorchScript modules for compilation +and the Torch-TensorRT runtime, the default executor for modules compiled with Torch-TensorRT, be it with the TorchScript or Dynamo frontend. + +In the case you may want a build which does not require C++ you can disable these features and avoid building these components. As a result, the only available runtime will be the Python-based one, +which has implications for features like serialization. + +.. code-block:: sh + + PYTHON_ONLY=1 python -m pip install --pre . --extra-index-url https://download.pytorch.org/whl/nightly/cu124 + + +No TorchScript Frontend +............................ + +The TorchScript frontend is a legacy feature of Torch-TensorRT which is now in maintenance as TorchDynamo has become the preferred compiler technology for this project. It contains quite a bit +of C++ code that is no longer necessary for most users. Therefore you can exclude this component from your build to speed up build times. The C++ based runtime will still be available to use. + +.. code-block:: sh + + NO_TORCHSCRIPT=1 python -m pip install --pre . --extra-index-url https://download.pytorch.org/whl/nightly/cu124 + + -Building the C++ Library (TorchScript Only) ------------------------------- +Building the C++ Library Standalone (TorchScript Only) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Release Build -^^^^^^^^^^^^^^^^^^^^^^^^ +............................ .. code-block:: shell @@ -151,7 +185,7 @@ A tarball with the include files and library can then be found in ``bazel-bin`` .. _build-from-archive-debug: Debug Build -^^^^^^^^^^^^^^^^^^^^^^^^ +............................ To build with debug symbols use the following command @@ -162,7 +196,7 @@ To build with debug symbols use the following command A tarball with the include files and library can then be found in ``bazel-bin`` Pre CXX11 ABI Build -^^^^^^^^^^^^^^^^^^^^^^^^ +............................
To build using the pre-CXX11 ABI use the ``pre_cxx11_abi`` config @@ -204,8 +238,45 @@ recommended commands: NOTE: For all of the above cases you must correctly declare the source of PyTorch you intend to use in your WORKSPACE file for both Python and C++ builds. See below for more information -**Building with CMake** (TorchScript Only) -------------------------------------------- + + +Building on Windows +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +* Microsoft VS 2022 Tools +* Bazelisk +* CUDA + + +Build steps +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +* Open the app "x64 Native Tools Command Prompt for VS 2022" - note that Admin privileges may be necessary +* Ensure Bazelisk (Bazel launcher) is installed on your machine and available from the command line. Package installers such as Chocolatey can be used to install Bazelisk +* Install latest version of Torch (i.e. with ``pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu124``) +* Clone the Torch-TensorRT repository and navigate to its root directory +* Run ``pip install ninja wheel setuptools`` +* Run ``pip install --pre -r py/requirements.txt`` +* Run ``set DISTUTILS_USE_SDK=1`` +* Run ``python setup.py bdist_wheel`` +* Run ``pip install dist/*.whl`` + +Advanced setup and Troubleshooting +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In the ``WORKSPACE`` file, the ``cuda_win``, ``libtorch_win``, and ``tensorrt_win`` are Windows-specific modules which can be customized. For instance, if you would like to build with a different version of CUDA, or your CUDA installation is in a non-standard location, update the `path` in the `cuda_win` module. + +Similarly, if you would like to use a different version of pytorch or tensorrt, customize the `urls` in the ``libtorch_win`` and ``tensorrt_win`` modules, respectively. + +Local versions of these packages can also be used on Windows. See ``toolchains\\ci_workspaces\\WORKSPACE.win.release.tmpl`` for an example of using a local version of TensorRT on Windows. + + +Alternative Build Systems +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Building with CMake (TorchScript Only) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ It is possible to build the API libraries (in cpp/) and the torchtrtc executable using CMake instead of Bazel. Currently, the python API and the tests cannot be built with CMake. @@ -233,11 +304,12 @@ A few useful CMake options include: [-DCMAKE_BUILD_TYPE=Debug|Release] cmake --build -**Building Natively on aarch64 (Jetson)** -------------------------------------------- + +Building Natively on aarch64 (Jetson) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Prerequisites -^^^^^^^^^^^^^^ +............................ Install or compile a build of PyTorch/LibTorch for aarch64 @@ -246,8 +318,8 @@ NVIDIA hosts builds the latest release branch for Jetson here: https://forums.developer.nvidia.com/t/pytorch-for-jetson-version-1-10-now-available/72048 -Enviorment Setup -^^^^^^^^^^^^^^^^^ +Environment Setup +............................ To build natively on aarch64-linux-gnu platform, configure the ``WORKSPACE`` with local available dependencies. @@ -279,7 +351,7 @@ use that library, set the paths to the same path but when you compile make sure Compile C++ Library and Compiler CLI -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +........................................................ 
NOTE: Due to shifting dependency locations between Jetpack 4.5 and 4.6 there is a now a flag to inform bazel of the Jetpack version @@ -295,9 +367,9 @@ Compile Torch-TensorRT library using bazel command: bazel build //:libtorchtrt --platforms //toolchains:jetpack_5.0 Compile Python API -^^^^^^^^^^^^^^^^^^^^ +............................ - NOTE: Due to shifting dependencies locations between Jetpack 4.5 and newer Jetpack verisons there is now a flag for ``setup.py`` which sets the jetpack version (default: 5.0) + NOTE: Due to shifting dependencies locations between Jetpack 4.5 and newer Jetpack versions there is now a flag for ``setup.py`` which sets the jetpack version (default: 5.0) Compile the Python API using the following command from the ``//py`` directory: @@ -307,4 +379,4 @@ Compile the Python API using the following command from the ``//py`` directory: If you have a build of PyTorch that uses Pre-CXX11 ABI drop the ``--use-cxx11-abi`` flag -If you are building for Jetpack 4.5 add the ``--jetpack-version 5.0`` flag +If you are building for Jetpack 4.5 add the ``--jetpack-version 5.0`` flag \ No newline at end of file diff --git a/docsrc/getting_started/quick_start.rst b/docsrc/getting_started/quick_start.rst new file mode 100644 index 0000000000..e3e3c371cd --- /dev/null +++ b/docsrc/getting_started/quick_start.rst @@ -0,0 +1,74 @@ +.. _quick_start: + +Quick Start +################## + +Option 1: torch.compile +------------------------- + +You can use Torch-TensorRT anywhere you use torch.compile: + +.. code-block:: py + + import torch + import torch_tensorrt + + model = MyModel().eval().cuda() # define your model here + x = torch.randn((1, 3, 224, 224)).cuda() # define what the inputs to the model will look like + + optimized_model = torch.compile(model, backend="tensorrt") + optimized_model(x) # compiled on first run + + optimized_model(x) # this will be fast! + + +Option 2: Export +------------------------- + +If you want to optimize your model ahead-of-time and/or deploy in a C++ environment, Torch-TensorRT provides an export-style workflow that serializes an optimized module. This module can be deployed in PyTorch or with libtorch (i.e. without a Python dependency). + +Step 1: Optimize + serialize +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: py + + import torch + import torch_tensorrt + + model = MyModel().eval().cuda() # define your model here + inputs = [torch.randn((1, 3, 224, 224)).cuda()] # define a list of representative inputs here + + trt_gm = torch_tensorrt.compile(model, ir="dynamo", inputs=inputs) + torch_tensorrt.save(trt_gm, "trt.ep", inputs=inputs) # PyTorch only supports Python runtime for an ExportedProgram. For C++ deployment, use a TorchScript file + torch_tensorrt.save(trt_gm, "trt.ts", output_format="torchscript", inputs=inputs) + +Step 2: Deploy +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Deployment in Python: +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + +.. code-block:: py + + import torch + import torch_tensorrt + + inputs = [torch.randn((1, 3, 224, 224)).cuda()] # your inputs go here + + # You can run this in a new python session! + model = torch.export.load("trt.ep").module() + # model = torch_tensorrt.load("trt.ep").module() # this also works + model(*inputs) + +Deployment in C++: +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +..
code-block:: c++ + + #include "torch/script.h" + #include "torch_tensorrt/torch_tensorrt.h" + + auto trt_mod = torch::jit::load("trt.ts"); + auto input_tensor = [...]; // fill this with your inputs + auto results = trt_mod.forward({input_tensor}); \ No newline at end of file diff --git a/docsrc/index.rst b/docsrc/index.rst index df3f297162..0eadef1d14 100644 --- a/docsrc/index.rst +++ b/docsrc/index.rst @@ -26,8 +26,31 @@ Getting Started :hidden: getting_started/installation - getting_started/getting_started_with_windows + getting_started/quick_start +User Guide +------------ + +* :ref:`torch_tensorrt_explained` +* :ref:`dynamic_shapes` +* :ref:`ptq` +* :ref:`saving_models` +* :ref:`runtime` +* :ref:`using_dla` + +.. toctree:: + :caption: User Guide + :maxdepth: 1 + :hidden: + + user_guide/torch_tensorrt_explained + user_guide/getting_started + user_guide/dynamic_shapes + user_guide/saving_models + user_guide/runtime + user_guide/using_dla + tutorials/_rendered_examples/dynamo/torch_compile_advanced_usage + tutorials/_rendered_examples/dynamo/vgg16_fp8_ptq Dynamo Frontend ---------------- @@ -59,6 +82,7 @@ TorchScript Frontend ts/getting_started_with_python_api ts/getting_started_with_cpp_api ts/use_from_pytorch + ts/ptq FX Frontend ------------ @@ -72,28 +96,6 @@ FX Frontend fx/getting_started_with_fx_path - -User Guide ------------- - -* :ref:`dynamic_shapes` -* :ref:`ptq` -* :ref:`saving_models` -* :ref:`runtime` -* :ref:`using_dla` - -.. toctree:: - :caption: User Guide - :maxdepth: 1 - :hidden: - - - user_guide/dynamic_shapes - user_guide/ptq - user_guide/saving_models - user_guide/runtime - user_guide/using_dla - Tutorials ------------ * :ref:`torch_tensorrt_tutorials` @@ -116,28 +118,28 @@ Tutorials tutorials/_rendered_examples/distributed_inference/data_parallel_stable_diffusion tutorials/_rendered_examples/dynamo/vgg16_fp8_ptq -Python API Documenation +Python API Documentation ------------------------ * :ref:`torch_tensorrt_py` -* :ref:`torch_tensorrt_logging_py` -* :ref:`torch_tensorrt_ptq_py` * :ref:`torch_tensorrt_dynamo_py` -* :ref:`torch_tensorrt_ts_py` +* :ref:`torch_tensorrt_logging_py` * :ref:`torch_tensorrt_fx_py` +* :ref:`torch_tensorrt_ts_py` +* :ref:`torch_tensorrt_ptq_py` .. toctree:: - :caption: Python API Documenation + :caption: Python API Documentation :maxdepth: 0 :hidden: py_api/torch_tensorrt - py_api/logging - py_api/ptq py_api/dynamo - py_api/ts + py_api/logging py_api/fx + py_api/ts + py_api/ptq -C++ API Documenation +C++ API Documentation ---------------------- * :ref:`namespace_torch_tensorrt` * :ref:`namespace_torch_tensorrt__logging` @@ -146,7 +148,7 @@ C++ API Documenation .. toctree:: - :caption: C++ API Documenation + :caption: C++ API Documentation :maxdepth: 1 :hidden: @@ -161,7 +163,7 @@ CLI Documentation * :ref:`torchtrtc` .. toctree:: - :caption: CLI Documenation + :caption: CLI Documentation :maxdepth: 0 :hidden: diff --git a/docsrc/py_api/dynamo.rst b/docsrc/py_api/dynamo.rst index 12fa5e76c1..0f3e32f2f7 100644 --- a/docsrc/py_api/dynamo.rst +++ b/docsrc/py_api/dynamo.rst @@ -26,13 +26,8 @@ Functions .. autofunction:: refit_module_weights + Classes -------- .. autoclass:: CompilationSettings - -.. autoclass:: SourceIR - -.. autoclass:: runtime.TorchTensorRTModule - -.. 
autoclass:: runtime.PythonTorchTensorRTModule \ No newline at end of file diff --git a/docsrc/py_api/logging.rst b/docsrc/py_api/logging.rst index 7918fe7f86..0c0ecb7bf6 100644 --- a/docsrc/py_api/logging.rst +++ b/docsrc/py_api/logging.rst @@ -7,7 +7,5 @@ torch_tensorrt.logging .. automodule:: torch_tensorrt.logging :members: - :undoc-members: - :show-inheritance: .. autoclass:: py torch_tensorrt.logging.Level diff --git a/docsrc/py_api/ptq.rst b/docsrc/py_api/ptq.rst index ec83662efb..81925d2b09 100644 --- a/docsrc/py_api/ptq.rst +++ b/docsrc/py_api/ptq.rst @@ -1,27 +1,27 @@ .. _torch_tensorrt_ptq_py: -torch_tensorrt.ptq -=================== +torch_tensorrt.ts.ptq +====================== -.. currentmodule:: torch_tensorrt.ptq +These components are legacy quantization utilities designed to work with the TorchScript Frontend. They have been replaced by the `TensorRT Model Optimizer `_ toolkit +which can be used with the dynamo frontend: -.. automodule:: torch_tensorrt.ptq - :members: - :undoc-members: - :show-inheritance: +.. currentmodule:: torch_tensorrt.ts.ptq + +.. automodule:: torch_tensorrt.ts.ptq Classes --------- .. autoclass:: DataLoaderCalibrator :members: - :special-members: __init__ .. autoclass:: CacheCalibrator :members: - :special-members: __init__ Enums ------- .. autoclass:: CalibrationAlgo + :members: + :undoc-members: diff --git a/docsrc/py_api/runtime.rst b/docsrc/py_api/runtime.rst new file mode 100644 index 0000000000..4e6721c7a6 --- /dev/null +++ b/docsrc/py_api/runtime.rst @@ -0,0 +1,26 @@ +.. _torch_tensorrt_py: + +torch_tensorrt.runtime +============================== + +.. automodule:: torch_tensorrt.runtime + :members: + :undoc-members: + :show-inheritance: + +Functions +------------ + +.. autofunction:: set_multi_device_safe_mode + + +Classes +--------- + +.. autoclass:: TorchTensorRTModule + :members: + :special-members: __init__ + +.. autoclass:: PythonTorchTensorRTModule + :members: + :special-members: __init__ \ No newline at end of file diff --git a/docsrc/py_api/torch_tensorrt.rst b/docsrc/py_api/torch_tensorrt.rst index eb8285e103..c2ddc9c701 100644 --- a/docsrc/py_api/torch_tensorrt.rst +++ b/docsrc/py_api/torch_tensorrt.rst @@ -26,6 +26,10 @@ Functions .. autofunction:: dump_build_info +.. autofunction:: save + +.. autofunction:: load + Classes --------- @@ -41,12 +45,20 @@ Enums ------- .. autoclass:: dtype + :members: + :member-order: .. autoclass:: DeviceType + :members: + :member-order: .. autoclass:: EngineCapability + :members: + :member-order: .. autoclass:: memory_format + :members: + :member-order: Submodules ---------- @@ -59,3 +71,4 @@ Submodules ts fx dynamo + runtime diff --git a/docsrc/ts/creating_torchscript_module_in_python.rst b/docsrc/ts/creating_torchscript_module_in_python.rst index 1d1cdba574..6c63302c76 100644 --- a/docsrc/ts/creating_torchscript_module_in_python.rst +++ b/docsrc/ts/creating_torchscript_module_in_python.rst @@ -49,11 +49,11 @@ For example, we can define a LeNet module like this: def __init__(self): super(LeNet, self).__init__() self.feat = LeNetFeatExtractor() - self.classifer = LeNetClassifier() + self.classifier = LeNetClassifier() def forward(self, x): x = self.feat(x) - x = self.classifer(x) + x = self.classifier(x) return x . @@ -84,7 +84,7 @@ include these components. We can run the script compiler on our LeNet module by model = LeNet() script_model = torch.jit.script(model) -There are reasons to use one path or another, the PyTorch documentation has information on how to choose. 
From a Torch-TensorRT prespective, there is +There are reasons to use one path or another, the PyTorch documentation has information on how to choose. From a Torch-TensorRT perspective, there is better support (i.e your module is more likely to compile) for traced modules because it doesn't include all the complexities of a complete programming language, though both paths supported. @@ -97,7 +97,7 @@ Here is what the LeNet traced module IR looks like: graph(%self.1 : __torch__.___torch_mangle_10.LeNet, %input.1 : Float(1, 1, 32, 32)): - %129 : __torch__.___torch_mangle_9.LeNetClassifier = prim::GetAttr[name="classifer"](%self.1) + %129 : __torch__.___torch_mangle_9.LeNetClassifier = prim::GetAttr[name="classifier"](%self.1) %119 : __torch__.___torch_mangle_5.LeNetFeatExtractor = prim::GetAttr[name="feat"](%self.1) %137 : Tensor = prim::CallMethod[name="forward"](%119, %input.1) %138 : Tensor = prim::CallMethod[name="forward"](%129, %137) @@ -111,7 +111,7 @@ and the LeNet scripted module IR: %x.1 : Tensor): %2 : __torch__.LeNetFeatExtractor = prim::GetAttr[name="feat"](%self) %x.3 : Tensor = prim::CallMethod[name="forward"](%2, %x.1) # x.py:38:12 - %5 : __torch__.LeNetClassifier = prim::GetAttr[name="classifer"](%self) + %5 : __torch__.LeNetClassifier = prim::GetAttr[name="classifier"](%self) %x.5 : Tensor = prim::CallMethod[name="forward"](%5, %x.3) # x.py:39:12 return (%x.5) @@ -123,7 +123,7 @@ Working with TorchScript in Python ----------------------------------- TorchScript Modules are run the same way you run normal PyTorch modules. You can run the forward pass using the -``forward`` method or just calling the module ``torch_scirpt_module(in_tensor)`` The JIT compiler will compile +``forward`` method or just calling the module ``torch_script_module(in_tensor)`` The JIT compiler will compile and optimize the module on the fly and then returns the results. Saving TorchScript Module to Disk diff --git a/docsrc/ts/getting_started_with_cpp_api.rst b/docsrc/ts/getting_started_with_cpp_api.rst index 70f439ea6d..4e22b7f938 100644 --- a/docsrc/ts/getting_started_with_cpp_api.rst +++ b/docsrc/ts/getting_started_with_cpp_api.rst @@ -118,7 +118,7 @@ With our module loaded, we can feed it into the Torch-TensorRT compiler. When we auto trt_mod = torch_tensorrt::CompileGraph(mod, std::vector{{in.sizes()}}); auto out = trt_mod.forward({in}); -Thats it! Now the graph runs primarily not with the JIT compiler but using TensorRT (though we execute the graph using the JIT runtime). +That's it! Now the graph runs primarily not with the JIT compiler but using TensorRT (though we execute the graph using the JIT runtime). We can also set settings like operating precision to run in FP16. 
@@ -209,9 +209,9 @@ When a module is provided to Torch-TensorRT, the compiler starts by mapping a gr %10 : bool = prim::Constant[value=1]() # ~/.local/lib/python3.6/site-packages/torch/nn/modules/conv.py:346:0 %11 : int = prim::Constant[value=1]() # ~/.local/lib/python3.6/site-packages/torch/nn/functional.py:539:0 %12 : bool = prim::Constant[value=0]() # ~/.local/lib/python3.6/site-packages/torch/nn/functional.py:539:0 - %self.classifer.fc3.bias : Float(10) = prim::Constant[value= 0.0464 0.0383 0.0678 0.0932 0.1045 -0.0805 -0.0435 -0.0818 0.0208 -0.0358 [ CUDAFloatType{10} ]]() - %self.classifer.fc2.bias : Float(84) = prim::Constant[value=]() - %self.classifer.fc1.bias : Float(120) = prim::Constant[value=]() + %self.classifier.fc3.bias : Float(10) = prim::Constant[value= 0.0464 0.0383 0.0678 0.0932 0.1045 -0.0805 -0.0435 -0.0818 0.0208 -0.0358 [ CUDAFloatType{10} ]]() + %self.classifier.fc2.bias : Float(84) = prim::Constant[value=]() + %self.classifier.fc1.bias : Float(120) = prim::Constant[value=]() %self.feat.conv2.weight : Float(16, 6, 3, 3) = prim::Constant[value=]() %self.feat.conv2.bias : Float(16) = prim::Constant[value=]() %self.feat.conv1.weight : Float(6, 1, 3, 3) = prim::Constant[value=]() @@ -224,15 +224,15 @@ When a module is provided to Torch-TensorRT, the compiler starts by mapping a gr %x.1 : Tensor = aten::max_pool2d(%input2.1, %7, %6, %8, %9, %12) # ~/.local/lib/python3.6/site-packages/torch/nn/functional.py:539:0 %input.1 : Tensor = aten::flatten(%x.1, %11, %5) # x.py:25:0 %27 : Tensor = aten::matmul(%input.1, %4) - %28 : Tensor = trt::const(%self.classifer.fc1.bias) + %28 : Tensor = trt::const(%self.classifier.fc1.bias) %29 : Tensor = aten::add_(%28, %27, %11) %input0.2 : Tensor = aten::relu(%29) # ~/.local/lib/python3.6/site-packages/torch/nn/functional.py:1063:0 %31 : Tensor = aten::matmul(%input0.2, %3) - %32 : Tensor = trt::const(%self.classifer.fc2.bias) + %32 : Tensor = trt::const(%self.classifier.fc2.bias) %33 : Tensor = aten::add_(%32, %31, %11) %input1.1 : Tensor = aten::relu(%33) # ~/.local/lib/python3.6/site-packages/torch/nn/functional.py:1063:0 %35 : Tensor = aten::matmul(%input1.1, %2) - %36 : Tensor = trt::const(%self.classifer.fc3.bias) + %36 : Tensor = trt::const(%self.classifier.fc3.bias) %37 : Tensor = aten::add_(%36, %35, %11) return (%37) (CompileGraph) @@ -264,10 +264,10 @@ You can see the call where the engine is executed, after extracting the attribut Working with Unsupported Operators ----------------------------------- -Torch-TensorRT is a new library and the PyTorch operator library is quite large, so there will be ops that aren't supported natively by the compiler. You can either use the composition techinques +Torch-TensorRT is a new library and the PyTorch operator library is quite large, so there will be ops that aren't supported natively by the compiler. You can either use the composition techniques shown above to make modules are fully Torch-TensorRT supported and ones that are not and stitch the modules together in the deployment application or you can register converters for missing ops. - You can check support without going through the full compilation pipleine using the ``torch_tensorrt::CheckMethodOperatorSupport(const torch::jit::Module& module, std::string method_name)`` api + You can check support without going through the full compilation pipeline using the ``torch_tensorrt::CheckMethodOperatorSupport(const torch::jit::Module& module, std::string method_name)`` api to see what operators are not supported. 
``torchtrtc`` automatically checks modules with this method before starting compilation and will print out a list of operators that are not supported. .. _custom_converters: @@ -333,7 +333,7 @@ for example we can quickly get the output size by just running the operation in int main() { ... -To use this converter in Python, it is recommended to use PyTorch's `C++ / CUDA Extention `_ +To use this converter in Python, it is recommended to use PyTorch's `C++ / CUDA Extension `_ template to wrap your library of converters into a ``.so`` that you can load with ``ctypes.CDLL()`` in your Python application. You can find more information on all the details of writing converters in the contributors documentation (:ref:`writing_converters`). diff --git a/docsrc/user_guide/ptq.rst b/docsrc/ts/ptq.rst similarity index 100% rename from docsrc/user_guide/ptq.rst rename to docsrc/ts/ptq.rst diff --git a/docsrc/user_guide/torch_tensorrt_explained.rst b/docsrc/user_guide/torch_tensorrt_explained.rst new file mode 100644 index 0000000000..53216b2806 --- /dev/null +++ b/docsrc/user_guide/torch_tensorrt_explained.rst @@ -0,0 +1,107 @@ +.. _torch_tensorrt_explained: + +Torch-TensorRT Explained +================================= + +Torch-TensorRT is a compiler for PyTorch models targeting NVIDIA GPUs +via the TensorRT Model Optimization SDK. It aims to provide better +inference performance for PyTorch models while still maintaining the +great ergonomics of PyTorch. + +Dynamo Frontend +----------------- + +The Dynamo frontend is the default frontend for Torch-TensorRT. It utilizes the `dynamo compiler stack `_ from PyTorch. + + +``torch.compile`` (Just-in-time) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``torch.compile`` is a JIT compiler stack; as such, compilation is deferred until first use. This means that as conditions change in the graph, the graph will automatically recompile. +This provides users the most runtime flexibility; however, it limits options regarding serialization. + +Under the hood, `torch.compile `_ delegates subgraphs it believes can be lowered to Torch-TensorRT. Torch-TensorRT further lowers these graphs into ops consisting of solely `Core ATen Operators `_ +or select "High-level Ops" amenable to TensorRT acceleration. Subgraphs are further partitioned into components that will run in PyTorch and ones to be further compiled to TensorRT based +on support for operators. TensorRT engines then replace supported blocks and a hybrid subgraph is returned to ``torch.compile`` to be run on call. + +Accepted Formats +................... +- torch.fx GraphModule (``torch.fx.GraphModule``) +- PyTorch Module (``torch.nn.Module``) + +Returns +................... +- Boxed-function that triggers compilation on first call + + +``torch_tensorrt.dynamo.compile`` (Ahead-of-time) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``torch_tensorrt.dynamo.compile`` is an AOT compiler; models are compiled in an explicit compilation phase. These compilation artifacts can then be serialized and reloaded at a later date. +Graphs go through the ``torch.export`` tracing system to be lowered into a graph consisting of `Core ATen Operators `_ or select "High-level Ops" amenable to TensorRT acceleration. +Subgraphs are further partitioned into components that will run in PyTorch and ones to be further compiled to TensorRT based on support for operators. TensorRT engines then replace supported blocks +and a hybrid subgraph is packed into an `ExportedProgram `_ which can be serialized and reloaded.
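As a rough sketch of that ahead-of-time flow (``MyModel`` and the example input shape below are placeholders):

.. code-block:: python

    import torch
    import torch_tensorrt

    model = MyModel().eval().cuda()                   # placeholder for your model
    inputs = [torch.randn((1, 3, 224, 224)).cuda()]   # representative inputs

    # Lower through torch.export, then compile supported subgraphs to TensorRT
    exported_program = torch.export.export(model, tuple(inputs))
    trt_gm = torch_tensorrt.dynamo.compile(exported_program, inputs=inputs)

    # The resulting hybrid graph can be re-exported and serialized for later use
    torch_tensorrt.save(trt_gm, "trt.ep", inputs=inputs)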
+ +Accepted Formats +................... +- torch.export.ExportedProgram (``torch.export.ExportedProgram``) +- torch.fx GraphModule (``torch.fx.GraphModule``) (via ``torch.export.export``) +- PyTorch Module (``torch.nn.Module``) (via ``torch.export.export``) + +Returns +................... +- torch.fx.GraphModule (serializable with ``torch.export.ExportedProgram``) + +Legacy Frontends +------------------ + +As there have been a number of compiler technologies in the PyTorch ecosystem over the years, +Torch-TensorRT has some legacy features targeting them. + + +TorchScript (`torch_tensorrt.ts.compile`) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The TorchScript frontend was the original default frontend for Torch-TensorRT and targets models in the TorchScript format. The graph provided will be partitioned into supported and unsupported +blocks. Supported blocks will be lowered to TensorRT and unsupported blocks will remain to run with LibTorch. The resultant graph is returned to the user as a ``ScriptModule`` that can be loaded and saved +with the Torch-TensorRT PyTorch runtime extension. + +Accepted Formats +................... +- TorchScript Module (``torch.jit.ScriptModule``) +- PyTorch Module (``torch.nn.Module``) (via ``torch.jit.script`` or ``torch.jit.trace``) + +Returns +................... +- TorchScript Module (``torch.jit.ScriptModule``) + + +FX Graph Modules (`torch_tensorrt.fx.compile`) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This frontend has almost entirely been replaced by the Dynamo frontend which is a superset of the +features available through the FX frontend. The original FX frontend remains in the codebase for +backwards compatibility reasons. + +Accepted Formats +................... +- torch.fx GraphModule (``torch.fx.GraphModule``) +- PyTorch Module (``torch.nn.Module``) (via ``torch.fx.trace``) + +Returns +................... +- torch.fx GraphModule (``torch.fx.GraphModule``) + +``torch_tensorrt.compile`` +---------------------------------- + +As there are many different frontends and supported formats, we provide a convenience layer called ``torch_tensorrt.compile`` which lets users access +all the different compiler options. You can specify to ``torch_tensorrt.compile`` what compiler path to use by setting the ``ir`` option, telling +Torch-TensorRT to try to lower the provided model through a specific intermediate representation. + +``ir`` Options +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +- ``torch_compile``: Use the ``torch.compile`` system. Immediately returns a boxed-function that will compile on first call +- ``dynamo``: Run the graph through the ``torch.export``/ torchdynamo stack. If the input module is a ``torch.nn.Module``, it must be "export-traceable" as the module will be traced with ``torch.export.export``. Returns a ``torch.fx.GraphModule`` which can be run immediately or saved via ``torch.export.export`` or ``torch_tensorrt.save`` +- ``torchscript`` or ``ts``: Run graph through the TorchScript stack. If the input module is a ``torch.nn.Module``, it must be "scriptable" as the module will be compiled with ``torch.jit.script``. Returns a ``torch.jit.ScriptModule`` which can be run immediately or saved via ``torch.save`` or ``torch_tensorrt.save`` +- ``fx``: Run graph through the ``torch.fx`` stack. If the input module is a ``torch.nn.Module``, it will be traced with ``torch.fx.trace`` and subject to its limitations.
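As a rough illustration of selecting a frontend through the ``ir`` option (the module and inputs below are placeholders):

.. code-block:: python

    import torch
    import torch_tensorrt

    model = MyModel().eval().cuda()                   # placeholder for your model
    inputs = [torch.randn((1, 3, 224, 224)).cuda()]

    # Ahead-of-time: returns a torch.fx.GraphModule that can be saved with torch_tensorrt.save
    trt_gm = torch_tensorrt.compile(model, ir="dynamo", inputs=inputs)

    # Just-in-time: returns a boxed function that compiles on the first call
    trt_fn = torch_tensorrt.compile(model, ir="torch_compile", inputs=inputs)

    # Legacy TorchScript frontend: returns a torch.jit.ScriptModule (the module must be scriptable)
    trt_ts = torch_tensorrt.compile(model, ir="ts", inputs=inputs)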
\ No newline at end of file diff --git a/examples/custom_converters/README.md b/examples/custom_converters/README.md index 322a867194..70fad0ae27 100644 --- a/examples/custom_converters/README.md +++ b/examples/custom_converters/README.md @@ -23,7 +23,7 @@ it doesn't support elu operator in default. (Torch-TensorRT <= v0.1.0) We can register a converter for this operator in our application. You can find more information on all the details of writing converters in the contributors documentation ([Writing Converters](https://nvidia.github.io/Torch-TensorRT/contributors/writing_converters.html)). -Once we are clear about these rules and writing patterns, we can create a seperate new C++ source file as: +Once we are clear about these rules and writing patterns, we can create a separate new C++ source file as: ```c++ #include "core/conversion/converters/converters.h" @@ -58,7 +58,7 @@ auto actelu = torch_tensorrt::core::conversion::converters::RegisterNodeConversi To use this converter in Python, it is recommended to use PyTorch's [C++/CUDA Extension](https://pytorch.org/tutorials/advanced/cpp_extension.html#custom-c-and-cuda-extensions). We give an example here about how to wrap the converter into a `.so` -library so that you can load it to use in Python applicaton. +library so that you can load it to use in Python application. ```python import os from setuptools import setup, Extension @@ -124,7 +124,7 @@ def cal_max_diff(pytorch_out, torch_tensorrt_out): diff = torch.sub(pytorch_out, torch_tensorrt_out) abs_diff = torch.abs(diff) max_diff = torch.max(abs_diff) - print("Maximum differnce between Torch-TensorRT and PyTorch: \n", max_diff) + print("Maximum difference between Torch-TensorRT and PyTorch: \n", max_diff) def main(): diff --git a/examples/custom_converters/elu_model.py b/examples/custom_converters/elu_model.py index 01cfdd1250..e4edfa39f5 100644 --- a/examples/custom_converters/elu_model.py +++ b/examples/custom_converters/elu_model.py @@ -20,7 +20,7 @@ def cal_max_diff(pytorch_out, torch_tensorrt_out): diff = torch.sub(pytorch_out, torch_tensorrt_out) abs_diff = torch.abs(diff) max_diff = torch.max(abs_diff) - print("Maximum differnce between Torch-TensorRT and PyTorch: \n", max_diff) + print("Maximum difference between Torch-TensorRT and PyTorch: \n", max_diff) def main(): diff --git a/examples/distributed_inference/README.md b/examples/distributed_inference/README.md index f9608e8950..c164e9581f 100644 --- a/examples/distributed_inference/README.md +++ b/examples/distributed_inference/README.md @@ -2,7 +2,7 @@ Examples in this folder demonstrates doing distributed inference on multiple devices with Torch-TensorRT backend. -1. Data parallel distributed inference based on [Acclerate](https://huggingface.co/docs/accelerate/usage_guides/distributed_inference) +1. Data parallel distributed inference based on [Accelerate](https://huggingface.co/docs/accelerate/usage_guides/distributed_inference) Using Accelerate users can achieve data parallel distributed inference with Torch-TensorRt backend. In this case, the entire model will be loaded onto each GPU and different chunks of batch input is processed on each device. 
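+
+For example, a hedged sketch of this pattern (the model and input chunks below are placeholders) might look like:
+
+```python
+import torch
+import torch_tensorrt  # noqa: F401  (makes the "torch_tensorrt" torch.compile backend available)
+from accelerate import PartialState
+
+state = PartialState()
+model = MyModel().eval().to(state.device)  # placeholder torch.nn.Module, replicated on every device
+model = torch.compile(model, backend="torch_tensorrt", dynamic=False)
+
+batches = [torch.randn(8, 3, 224, 224) for _ in range(4)]  # placeholder input chunks
+with state.split_between_processes(batches) as shard:
+    for batch in shard:
+        out = model(batch.to(state.device))
+```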
diff --git a/examples/dynamo/custom_kernel_plugins.py b/examples/dynamo/custom_kernel_plugins.py index 4165c54105..73b06119ae 100644 --- a/examples/dynamo/custom_kernel_plugins.py +++ b/examples/dynamo/custom_kernel_plugins.py @@ -13,7 +13,7 @@ in terms of PyTorch ops that are supported in Torch-TensorRT or a converter (see: `Writing converters for the Dynamo frontend `_) - which defines the operator in terms of TensorRT operators. -In some cases there isnt a great way to do either of these, perhaps because the operator is a custom kernel that is not part of standard PyTorch or +In some cases there isn't a great way to do either of these, perhaps because the operator is a custom kernel that is not part of standard PyTorch or TensorRT cannot support it natively. For these cases, it is possible to use a TensorRT plugin to replace the operator **inside** the TensorRT engine, thereby avoiding @@ -147,7 +147,7 @@ def triton_circular_pad(x: torch.Tensor, padding: Sequence[int]) -> torch.Tensor # %% # Testing our custom op -# ^^^^^^^^^^^^^^^^^^^^^^^ +# ----------------------------------------- # %% # The native PyTorch implementation @@ -190,7 +190,7 @@ def _(x: torch.Tensor, padding: Sequence[int]) -> torch.Tensor: return torch.nn.functional.pad(x, padding, "circular") -# Additionally one may want to define an autograd implementation for the backwards pass to round out the custom op implmentation but that is beyond the scope of this tutorial (see https://pytorch.org/docs/main/library.html#torch.library.register_autograd for more) +# Additionally one may want to define an autograd implementation for the backwards pass to round out the custom op implementation but that is beyond the scope of this tutorial (see https://pytorch.org/docs/main/library.html#torch.library.register_autograd for more) # %% @@ -304,7 +304,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # %% # Wrapping Custom Kernels to use in TensorRT -# ============================================= +# -------------------------------------------- # # To address this graph break, the first step is to make our kernel implementation available in TensorRT. Again this can be done in either C++ or Python. For the actual details on how to implement # TensorRT plugins refer `here `_. From a high level, similar to PyTorch you will need to @@ -501,7 +501,7 @@ def deserialize_plugin(self, name: str, data: bytes) -> CircularPaddingPlugin: # %% # Using Torch-TensorRT to Insert the Kernel -# ============================================= +# ------------------------------------------- # Now with our TensorRT plugin, we can create a converter so that Torch-TensorRT knows to insert our plugin in place of our custom circular padding operator. 
# More information on writing converters can be found `here `_ @@ -534,7 +534,7 @@ def circular_padding_converter( plugin_creator = plugin_registry.get_plugin_creator( type="CircularPaddingPlugin", version="1", plugin_namespace="" ) - assert plugin_creator, f"Unabled to find CircularPaddingPlugin creator" + assert plugin_creator, f"Unable to find CircularPaddingPlugin creator" # Pass configurations to the plugin implementation field_configs = trt.PluginFieldCollection( diff --git a/examples/dynamo/refit_engine_example.py b/examples/dynamo/refit_engine_example.py index c841c5f57a..167344977e 100644 --- a/examples/dynamo/refit_engine_example.py +++ b/examples/dynamo/refit_engine_example.py @@ -91,7 +91,7 @@ print("Refit successfully!") # %% -# Alterative Workflow using Python Runtime +# Alternative Workflow using Python Runtime # ----------------------------- # Currently python runtime does not support engine serialization. So the refitting will be done in the same runtime. diff --git a/examples/int8/ptq/README.md b/examples/int8/ptq/README.md index 329d4d021d..246ef6e1d4 100644 --- a/examples/int8/ptq/README.md +++ b/examples/int8/ptq/README.md @@ -6,7 +6,7 @@ Post Training Quantization (PTQ) is a technique to reduce the required computati Users writing TensorRT applications are required to setup a calibrator class which will provide sample data to the TensorRT calibrator. With Torch-TensorRT we look to leverage existing infrastructure in PyTorch to make implementing calibrators easier. -LibTorch provides a `Dataloader` and `Dataset` API which steamlines preprocessing and batching input data. Torch-TensorRT uses Dataloaders as the base of a generic calibrator implementation. So you will be able to reuse or quickly implement a `torch::Dataset` for your target domain, place it in a Dataloader and create a INT8 Calibrator from it which you can provide to Torch-TensorRT to run INT8 Calibration during compliation of your module. +LibTorch provides a `Dataloader` and `Dataset` API which streamlines preprocessing and batching input data. Torch-TensorRT uses Dataloaders as the base of a generic calibrator implementation. So you will be able to reuse or quickly implement a `torch::Dataset` for your target domain, place it in a Dataloader and create an INT8 Calibrator from it which you can provide to Torch-TensorRT to run INT8 Calibration during compilation of your module.
### Code @@ -92,7 +92,7 @@ The calibrator factories create a calibrator that inherits from a `nvinfer1::IIn auto calibrator = torch_tensorrt::ptq::make_int8_calibrator(std::move(calibration_dataloader), calibration_cache_file, true); ``` -Then all thats required to setup the module for INT8 calibration is to set the following compile settings in the `torch_tensorrt::CompileSpec` struct and compiling the module: +Then all that's required to set up the module for INT8 calibration is to set the following compile settings in the `torch_tensorrt::CompileSpec` struct and compile the module: ```C++ std::vector> input_shape = {{32, 3, 32, 32}}; @@ -102,7 +102,7 @@ Then all thats required to setup the module for INT8 calibration is to set the f compile_spec.enabled_precisions.insert(torch::kI8); /// Use the TensorRT Entropy Calibrator compile_spec.ptq_calibrator = calibrator; - /// Set a larger workspace (you may get better performace from doing so) + /// Set a larger workspace (you may get better performance from doing so) compile_spec.workspace_size = 1 << 28; auto trt_mod = torch_tensorrt::CompileGraph(mod, compile_spec); diff --git a/examples/int8/training/vgg16/README.md b/examples/int8/training/vgg16/README.md index 5aff4ca116..05404494be 100644 --- a/examples/int8/training/vgg16/README.md +++ b/examples/int8/training/vgg16/README.md @@ -2,7 +2,7 @@ This is a recipe to train a VGG network on CIFAR10 to use with the Torch-TensorRT PTQ example. -## Prequisites +## Prerequisites ``` pip3 install -r requirements.txt --user diff --git a/examples/int8/training/vgg16/finetune_qat.py b/examples/int8/training/vgg16/finetune_qat.py index 0414af00de..60850bf6a3 100644 --- a/examples/int8/training/vgg16/finetune_qat.py +++ b/examples/int8/training/vgg16/finetune_qat.py @@ -8,17 +8,14 @@ import torch.nn.functional as F import torch.optim as optim import torch.utils.data as data -import torchvision.transforms as transforms import torchvision.datasets as datasets - -from torch.utils.tensorboard import SummaryWriter - +import torchvision.transforms as transforms +from pytorch_quantization import calib from pytorch_quantization import nn as quant_nn from pytorch_quantization import quant_modules from pytorch_quantization.tensor_quant import QuantDescriptor -from pytorch_quantization import calib +from torch.utils.tensorboard import SummaryWriter from tqdm import tqdm - from vgg16 import vgg16 PARSER = argparse.ArgumentParser( @@ -49,7 +46,7 @@ "--start-from", default=0, type=int, - help="Epoch to resume from (requires a checkpoin in the providied checkpoi", + help="Epoch to resume from (requires a checkpoint in the provided checkpoi", ) PARSER.add_argument("--seed", type=int, help="Seed value for rng") PARSER.add_argument( @@ -147,7 +144,7 @@ def calibrate_model( data_loader: calibration data set num_calib_batch: amount of calibration passes to perform calibrator: type of calibration to use (max/histogram) - hist_percentile: percentiles to be used for historgram calibration + hist_percentile: percentiles to be used for histogram calibration out_dir: dir to save state files in """ diff --git a/examples/int8/training/vgg16/main.py b/examples/int8/training/vgg16/main.py index 3db8e9d4dd..aed7f68565 100644 --- a/examples/int8/training/vgg16/main.py +++ b/examples/int8/training/vgg16/main.py @@ -35,7 +35,7 @@ "--start-from", default=0, type=int, - help="Epoch to resume from (requires a checkpoin in the providied checkpoi", + help="Epoch to resume from (requires a checkpoint in the provided checkpoi", )
PARSER.add_argument("--seed", type=int, help="Seed value for rng") PARSER.add_argument( diff --git a/py/README.md b/py/README.md index 234c86fa05..45c68ff98f 100644 --- a/py/README.md +++ b/py/README.md @@ -2,7 +2,7 @@ > Ahead of Time (AOT) compiling for PyTorch JIT -Torch-TensorRT is a compiler for PyTorch/TorchScript, targeting NVIDIA GPUs via NVIDIA's TensorRT Deep Learning Optimizer and Runtime. Unlike PyTorch's Just-In-Time (JIT) compiler, Torch-TensorRT is an Ahead-of-Time (AOT) compiler, meaning that before you deploy your TorchScript code, you go through an explicit compile step to convert a standard TorchScript program into an module targeting a TensorRT engine. Torch-TensorRT operates as a PyTorch extention and compiles modules that integrate into the JIT runtime seamlessly. After compilation using the optimized graph should feel no different than running a TorchScript module. You also have access to TensorRT's suite of configurations at compile time, so you are able to specify operating precision (FP32/FP16/INT8) and other settings for your module. +Torch-TensorRT is a compiler for PyTorch/TorchScript, targeting NVIDIA GPUs via NVIDIA's TensorRT Deep Learning Optimizer and Runtime. Unlike PyTorch's Just-In-Time (JIT) compiler, Torch-TensorRT is an Ahead-of-Time (AOT) compiler, meaning that before you deploy your TorchScript code, you go through an explicit compile step to convert a standard TorchScript program into a module targeting a TensorRT engine. Torch-TensorRT operates as a PyTorch extension and compiles modules that integrate into the JIT runtime seamlessly. After compilation, using the optimized graph should feel no different than running a TorchScript module. You also have access to TensorRT's suite of configurations at compile time, so you are able to specify operating precision (FP32/FP16/INT8) and other settings for your module.
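+
+For instance, operating precision can be selected at compile time. A minimal hedged sketch (the `script_model` and input size below are placeholders):
+
+```python
+import torch
+import torch_tensorrt
+
+trt_ts_module = torch_tensorrt.compile(
+    script_model,  # placeholder torch.jit.ScriptModule
+    inputs=[torch_tensorrt.Input((1, 3, 224, 224))],
+    enabled_precisions={torch.float, torch.half},  # allow TensorRT to run layers in FP32 or FP16
+)
+result = trt_ts_module(torch.randn((1, 3, 224, 224)).cuda())
+```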
## Example Usage @@ -57,9 +57,9 @@ graph(%input.2 : Tensor): %10 : bool = prim::Constant[value=1]() # ~/.local/lib/python3.6/site-packages/torch/nn/modules/conv.py:346:0 %11 : int = prim::Constant[value=1]() # ~/.local/lib/python3.6/site-packages/torch/nn/functional.py:539:0 %12 : bool = prim::Constant[value=0]() # ~/.local/lib/python3.6/site-packages/torch/nn/functional.py:539:0 - %self.classifer.fc3.bias : Float(10) = prim::Constant[value= 0.0464 0.0383 0.0678 0.0932 0.1045 -0.0805 -0.0435 -0.0818 0.0208 -0.0358 [ CUDAFloatType{10} ]]() - %self.classifer.fc2.bias : Float(84) = prim::Constant[value=]() - %self.classifer.fc1.bias : Float(120) = prim::Constant[value=]() + %self.classifier.fc3.bias : Float(10) = prim::Constant[value= 0.0464 0.0383 0.0678 0.0932 0.1045 -0.0805 -0.0435 -0.0818 0.0208 -0.0358 [ CUDAFloatType{10} ]]() + %self.classifier.fc2.bias : Float(84) = prim::Constant[value=]() + %self.classifier.fc1.bias : Float(120) = prim::Constant[value=]() %self.feat.conv2.weight : Float(16, 6, 3, 3) = prim::Constant[value=]() %self.feat.conv2.bias : Float(16) = prim::Constant[value=]() %self.feat.conv1.weight : Float(6, 1, 3, 3) = prim::Constant[value=]() @@ -72,15 +72,15 @@ graph(%input.2 : Tensor): %x.1 : Tensor = aten::max_pool2d(%input2.1, %7, %6, %8, %9, %12) # ~/.local/lib/python3.6/site-packages/torch/nn/functional.py:539:0 %input.1 : Tensor = aten::flatten(%x.1, %11, %5) # x.py:25:0 %27 : Tensor = aten::matmul(%input.1, %4) - %28 : Tensor = trt::const(%self.classifer.fc1.bias) + %28 : Tensor = trt::const(%self.classifier.fc1.bias) %29 : Tensor = aten::add_(%28, %27, %11) %input0.2 : Tensor = aten::relu(%29) # ~/.local/lib/python3.6/site-packages/torch/nn/functional.py:1063:0 %31 : Tensor = aten::matmul(%input0.2, %3) - %32 : Tensor = trt::const(%self.classifer.fc2.bias) + %32 : Tensor = trt::const(%self.classifier.fc2.bias) %33 : Tensor = aten::add_(%32, %31, %11) %input1.1 : Tensor = aten::relu(%33) # ~/.local/lib/python3.6/site-packages/torch/nn/functional.py:1063:0 %35 : Tensor = aten::matmul(%input1.1, %2) - %36 : Tensor = trt::const(%self.classifer.fc3.bias) + %36 : Tensor = trt::const(%self.classifier.fc3.bias) %37 : Tensor = aten::add_(%36, %35, %11) return (%37) (CompileGraph) @@ -107,7 +107,7 @@ graph(%self.1 : __torch__.___torch_mangle_10.LeNet_trt, You can see the call where the engine is executed, based on a constant which is the ID of the engine, telling JIT how to find the engine and the input tensor which will be fed to TensorRT. The engine represents the exact same calculations as what is done by running a normal PyTorch module but optimized to run on your GPU. -Torch-TensorRT converts from TorchScript by generating layers or subgraphs in correspondance with instructions seen in the graph. Converters are small modules of code used to map one specific +Torch-TensorRT converts from TorchScript by generating layers or subgraphs in correspondence with instructions seen in the graph. Converters are small modules of code used to map one specific operation to a layer or subgraph in TensorRT. Not all operations are support, but if you need to implement one, you can in C++. 
## Registering Custom Converters @@ -166,7 +166,7 @@ static auto flatten_converter = torch_tensorrt::core::conversion::converters::Re }); ``` -To use this converter in Python, it is recommended to use PyTorch’s [C++ / CUDA Extention](https://pytorch.org/tutorials/advanced/cpp_extension.html#custom-c-and-cuda-extensions) template to wrap +To use this converter in Python, it is recommended to use PyTorch’s [C++ / CUDA Extension](https://pytorch.org/tutorials/advanced/cpp_extension.html#custom-c-and-cuda-extensions) template to wrap your library of converters into a `.so` that you can load with `ctypes.CDLL()` in your Python application. You can find more information on all the details of writing converters in the contributors documentation ([Writing Converters](https://nvidia.github.io/Torch-TensorRT/contributors/writing_converters.html#writing-converters)). If you diff --git a/py/torch_tensorrt/_Device.py b/py/torch_tensorrt/_Device.py index 4c8c855943..e425c89be5 100644 --- a/py/torch_tensorrt/_Device.py +++ b/py/torch_tensorrt/_Device.py @@ -45,7 +45,7 @@ def __init__(self, *args: Any, **kwargs: Any): spec (str): String with device spec e.g. "dla:0" for dla, core_id 0 Keyword Arguments: - gpu_id (int): ID of target GPU (will get overrided if dla_core is specified to the GPU managing DLA). If specified, no positional arguments should be provided + gpu_id (int): ID of target GPU (will get overridden if dla_core is specified to the GPU managing DLA). If specified, no positional arguments should be provided dla_core (int): ID of target DLA core. If specified, no positional arguments should be provided. allow_gpu_fallback (bool): Allow TensorRT to schedule operations on GPU if they are not supported on DLA (ignored if device type is not DLA) diff --git a/py/torch_tensorrt/_Input.py b/py/torch_tensorrt/_Input.py index 18636f8114..72775944cb 100644 --- a/py/torch_tensorrt/_Input.py +++ b/py/torch_tensorrt/_Input.py @@ -15,11 +15,11 @@ class Input(object): shape_mode (torch_tensorrt.Input._ShapeMode): Is input statically or dynamically shaped shape (Tuple or Dict): Either a single Tuple or a dict of tuples defining the input shape. Static shaped inputs will have a single tuple. Dynamic inputs will have a dict of the form - ``{ - "min_shape": Tuple, - "opt_shape": Tuple, - "max_shape": Tuple - }`` + + .. 
code-block:: py + + {"min_shape": Tuple, "opt_shape": Tuple, "max_shape": Tuple} + dtype (torch_tensorrt.dtype): The expected data type of the input tensor (default: torch_tensorrt.dtype.float32) format (torch_tensorrt.TensorFormat): The expected format of the input tensor (default: torch_tensorrt.TensorFormat.NCHW) """ @@ -60,11 +60,11 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: Keyword Arguments: shape (Tuple or List, optional): Static shape of input tensor min_shape (Tuple or List, optional): Min size of input tensor's shape range - Note: All three of min_shape, opt_shape, max_shape must be provided, there must be no positional arguments, shape must not be defined and implictly this sets Input's shape_mode to DYNAMIC + Note: All three of min_shape, opt_shape, max_shape must be provided, there must be no positional arguments, shape must not be defined and implicitly this sets Input's shape_mode to DYNAMIC opt_shape (Tuple or List, optional): Opt size of input tensor's shape range - Note: All three of min_shape, opt_shape, max_shape must be provided, there must be no positional arguments, shape must not be defined and implictly this sets Input's shape_mode to DYNAMIC + Note: All three of min_shape, opt_shape, max_shape must be provided, there must be no positional arguments, shape must not be defined and implicitly this sets Input's shape_mode to DYNAMIC max_shape (Tuple or List, optional): Max size of input tensor's shape range - Note: All three of min_shape, opt_shape, max_shape must be provided, there must be no positional arguments, shape must not be defined and implictly this sets Input's shape_mode to DYNAMIC + Note: All three of min_shape, opt_shape, max_shape must be provided, there must be no positional arguments, shape must not be defined and implicitly this sets Input's shape_mode to DYNAMIC dtype (torch.dtype or torch_tensorrt.dtype): Expected data type for input tensor (default: torch_tensorrt.dtype.float32) format (torch.memory_format or torch_tensorrt.TensorFormat): The expected format of the input tensor (default: torch_tensorrt.TensorFormat.NCHW) tensor_domain (Tuple(float, float), optional): The domain of allowed values for the tensor, as interval notation: [tensor_domain[0], tensor_domain[1]). diff --git a/py/torch_tensorrt/_compile.py b/py/torch_tensorrt/_compile.py index ce966a2609..e59d6a6f3e 100644 --- a/py/torch_tensorrt/_compile.py +++ b/py/torch_tensorrt/_compile.py @@ -169,7 +169,7 @@ def compile( torch datatypes or torch_tensorrt datatypes and you can use either torch devices or the torch_tensorrt device type enum to select device type. :: - input=[ + inputs=[ torch_tensorrt.Input((1, 3, 224, 224)), # Static NCHW input shape for input #1 torch_tensorrt.Input( min_shape=(1, 224, 224, 3), opt_shape=(1, 512, 512, 3), max_shape=(1, 1024, 1024, 3), dtype=torch.int32 format=torch.channel_last ), # Dynamic input shape for input #2 torch.randn((1, 3, 224, 244)) # Use an example tensor and let torch_tensorrt infer settings ] @@ -367,8 +367,15 @@ def convert_method_to_trt_engine( def load(file_path: str = "") -> Any: """ - Load either a Torchscript model or ExportedProgram. Autodetect the type using - try, except + Load either a Torchscript model or ExportedProgram. + + Loads a TorchScript module or an ExportedProgram from disk. The file type is detected by attempting each loader in turn (try/except). + + Arguments: + file_path (str): Path to file on the disk + + Raises: + ValueError: If the file does not exist or is neither a TorchScript module nor an ExportedProgram """ try: logger.debug(f"Loading the provided file {file_path} using torch.jit.load()") @@ -405,11 +412,12 @@ def save( ) -> None: """ Save the model to disk in the specified output format.
+ Arguments: - module : Compiled Torch-TensorRT module (Options include torch.jit.ScriptModule | torch.export.ExportedProgram | torch.fx.GraphModule) + module (Optional(torch.jit.ScriptModule | torch.export.ExportedProgram | torch.fx.GraphModule)): Compiled Torch-TensorRT module inputs (torch.Tensor): Torch input tensors - output_format: Format to save the model. Options include exported_program | torchscript. - retrace: When the module type is a fx.GraphModule, this option re-exports the graph using torch.export.export(strict=False) to save it. + output_format (str): Format to save the model. Options include exported_program | torchscript. + retrace (bool): When the module type is a fx.GraphModule, this option re-exports the graph using torch.export.export(strict=False) to save it. This flag is experimental for now. """ module_type = _parse_module_type(module) diff --git a/py/torch_tensorrt/_enums.py b/py/torch_tensorrt/_enums.py index befc22d474..7d261a88bf 100644 --- a/py/torch_tensorrt/_enums.py +++ b/py/torch_tensorrt/_enums.py @@ -5,27 +5,81 @@ from typing import Any, Optional, Type, Union import numpy as np -import tensorrt as trt import torch from torch_tensorrt._features import ENABLED_FEATURES +import tensorrt as trt + class dtype(Enum): - """Enum to set supported dtypes in the compiler""" + """Enum to describe data types to Torch-TensorRT, has compatibility with torch, tensorrt and numpy dtypes""" # Supported types in Torch-TensorRT unknown = auto() + """Sentinel value + + :meta hide-value: + """ + u8 = auto() + """Unsigned 8 bit integer, equivalent to ``dtype.uint8`` + + :meta hide-value: + """ + i8 = auto() + """Signed 8 bit integer, equivalent to ``dtype.int8``, when enabled as a kernel precision typically requires the model to support quantization + + :meta hide-value: + """ + i32 = auto() + """Signed 32 bit integer, equivalent to ``dtype.int32`` and ``dtype.int`` + + :meta hide-value: + """ + i64 = auto() + """Signed 64 bit integer, equivalent to ``dtype.int64`` and ``dtype.long`` + + :meta hide-value: + """ + f16 = auto() + """16 bit floating-point number, equivalent to ``dtype.half``, ``dtype.fp16`` and ``dtype.float16`` + + :meta hide-value: + """ + f32 = auto() + """32 bit floating-point number, equivalent to ``dtype.float``, ``dtype.fp32`` and ``dtype.float32`` + + :meta hide-value: + """ + f64 = auto() + """64 bit floating-point number, equivalent to ``dtype.double``, ``dtype.fp64`` and ``dtype.float64`` + + :meta hide-value: + """ + b = auto() + """Boolean value, equivalent to ``dtype.bool`` + + :meta hide-value: + """ bf16 = auto() + """16 bit "Brain" floating-point number, equivalent to ``dtype.bfloat16`` + + :meta hide-value: + """ + f8 = auto() + """8 bit floating-point number, equivalent to ``dtype.fp8`` and ``dtype.float8`` + + :meta hide-value: + """ uint8 = u8 int8 = i8 @@ -67,6 +121,36 @@ def _from( t: Union[torch.dtype, trt.DataType, np.dtype, dtype, type], use_default: bool = False, ) -> dtype: + """Create a Torch-TensorRT dtype from another library's dtype system. + + Takes a dtype enum from one of numpy, torch, and tensorrt and create a ``torch_tensorrt.dtype``. + If the source dtype system is not supported or the type is not supported in Torch-TensorRT, + then an exception will be raised. As such it is not recommended to use this method directly. 
+ + Alternatively use ``torch_tensorrt.dtype.try_from()`` + + Arguments: + t (Union(torch.dtype, tensorrt.DataType, numpy.dtype, dtype)): Data type enum from another library + use_default (bool): In some cases a catch all type (such as ``torch_tensorrt.dtype.f32``) is sufficient, so instead of throwing an exception, return default value. + + Returns: + dtype: Equivalent ``torch_tensorrt.dtype`` to ``t`` + + Raises: + TypeError: Unsupported data type or unknown source + + Examples: + + .. code:: py + + # Succeeds + float_dtype = torch_tensorrt.dtype._from(torch.float) # Returns torch_tensorrt.dtype.f32 + + # Throws exception + float_dtype = torch_tensorrt.dtype._from(torch.complex128) + + """ + # TODO: Ideally implemented with match statement but need to wait for Py39 EoL if isinstance(t, torch.dtype): if t == torch.uint8: @@ -139,6 +223,10 @@ def _from( return dtype.f64 elif t == np.bool: return dtype.b + # TODO: Consider using ml_dtypes when issues like this are resolved: + # https://github.com/pytorch/pytorch/issues/109873 + # elif t == ml_dtypes.bfloat16: + # return dtype.bf16 elif use_default: logging.warning( f"Given dtype that does not have direct mapping to Torch-TensorRT supported types ({t}), defaulting to torch_tensorrt.dtype.float" @@ -188,6 +276,32 @@ def try_from( t: Union[torch.dtype, trt.DataType, np.dtype, dtype], use_default: bool = False, ) -> Optional[dtype]: + """Create a Torch-TensorRT dtype from another library's dtype system. + + Takes a dtype enum from one of numpy, torch, and tensorrt and create a ``torch_tensorrt.dtype``. + If the source dtype system is not supported or the type is not supported in Torch-TensorRT, + then returns ``None``. + + + Arguments: + t (Union(torch.dtype, tensorrt.DataType, numpy.dtype, dtype)): Data type enum from another library + use_default (bool): In some cases a catch all type (such as ``torch_tensorrt.dtype.f32``) is sufficient, so instead of throwing an exception, return default value. + + Returns: + Optional(dtype): Equivalent ``torch_tensorrt.dtype`` to ``t`` or ``None`` + + Examples: + + .. code:: py + + # Succeeds + float_dtype = torch_tensorrt.dtype.try_from(torch.float) # Returns torch_tensorrt.dtype.f32 + + # Unsupported type + float_dtype = torch_tensorrt.dtype.try_from(torch.complex128) # Returns None + + """ + try: casted_format = dtype._from(t, use_default=use_default) return casted_format @@ -202,6 +316,36 @@ def to( t: Union[Type[torch.dtype], Type[trt.DataType], Type[np.dtype], Type[dtype]], use_default: bool = False, ) -> Union[torch.dtype, trt.DataType, np.dtype, dtype]: + """Convert dtype into the equivalent type in [torch, numpy, tensorrt] + + Converts ``self`` into one of numpy, torch, and tensorrt equivalent dtypes. + If ``self`` is not supported in the target library, then an exception will be raised. + As such it is not recommended to use this method directly. + + Alternatively use ``torch_tensorrt.dtype.try_to()`` + + Arguments: + t (Union(Type(torch.dtype), Type(tensorrt.DataType), Type(numpy.dtype), Type(dtype))): Data type enum from another library to convert to + use_default (bool): In some cases a catch all type (such as ``torch.float``) is sufficient, so instead of throwing an exception, return default value. + + Returns: + Union(torch.dtype, tensorrt.DataType, numpy.dtype, dtype): dtype equivalent ``torch_tensorrt.dtype`` from library enum ``t`` + + Raises: + TypeError: Unsupported data type or unknown target + + Examples: + + .. 
code:: py + + # Succeeds + float_dtype = torch_tensorrt.dtype.f32.to(torch.dtype) # Returns torch.float + + # Failure + float_dtype = torch_tensorrt.dtype.bf16.to(numpy.dtype) # Throws exception + + """ + # TODO: Ideally implemented with match statement but need to wait for Py39 EoL if t == torch.dtype: if self == dtype.u8: @@ -273,10 +417,14 @@ def to( return np.float64 elif self == dtype.b: return np.bool_ + # TODO: Consider using ml_dtypes when issues like this are resolved: + # https://github.com/pytorch/pytorch/issues/109873 + # elif self == dtype.bf16: + # return ml_dtypes.bfloat16 elif use_default: return np.float32 else: - raise TypeError("Unspported numpy dtype") + raise TypeError("Unsupported numpy dtype") elif t == dtype: return self @@ -315,6 +463,30 @@ def try_to( t: Union[Type[torch.dtype], Type[trt.DataType], Type[np.dtype], Type[dtype]], use_default: bool, ) -> Optional[Union[torch.dtype, trt.DataType, np.dtype, dtype]]: + """Convert dtype into the equivalent type in [torch, numpy, tensorrt] + + Converts ``self`` into one of numpy, torch, and tensorrt equivalent dtypes. + If ``self`` is not supported in the target library, then returns ``None``. + + Arguments: + t (Union(Type(torch.dtype), Type(tensorrt.DataType), Type(numpy.dtype), Type(dtype))): Data type enum from another library to convert to + use_default (bool): In some cases a catch all type (such as ``torch.float``) is sufficient, so instead of throwing an exception, return default value. + + Returns: + Optional(Union(torch.dtype, tensorrt.DataType, numpy.dtype, dtype)): dtype equivalent ``torch_tensorrt.dtype`` from library enum ``t`` + + Examples: + + .. code:: py + + # Succeeds + float_dtype = torch_tensorrt.dtype.f32.to(torch.dtype) # Returns torch.float + + # Failure + float_dtype = torch_tensorrt.dtype.bf16.to(numpy.dtype) # Returns None + + """ + try: casted_format = self.to(t, use_default) return casted_format @@ -338,21 +510,130 @@ def __hash__(self) -> int: class memory_format(Enum): + """""" # TensorRT supported memory layouts linear = auto() + """Row major linear format. + + For a tensor with dimensions {N, C, H, W}, the W axis always has unit stride, and the stride of every other axis is at least the product of the next dimension times the next stride. the strides are the same as for a C array with dimensions [N][C][H][W]. + + Equivient to ``memory_format.contiguous`` + + :meta hide-value: + """ + chw2 = auto() + """Two wide channel vectorized row major format. + + This format is bound to FP16 in TensorRT. It is only available for dimensions >= 3. + + For a tensor with dimensions {N, C, H, W}, the memory layout is equivalent to a C array with dimensions [N][(C+1)/2][H][W][2], with the tensor coordinates (n, c, h, w) mapping to array subscript [n][c/2][h][w][c%2]. + + :meta hide-value: + """ + hwc8 = auto() + """Eight channel format where C is padded to a multiple of 8. + + This format is bound to FP16. It is only available for dimensions >= 3. + + For a tensor with dimensions {N, C, H, W}, the memory layout is equivalent to the array with dimensions [N][H][W][(C+7)/8*8], with the tensor coordinates (n, c, h, w) mapping to array subscript [n][h][w][c]. + + :meta hide-value: + """ + chw4 = auto() + """Four wide channel vectorized row major format. This format is bound to INT8. It is only available for dimensions >= 3. 
+ + For a tensor with dimensions {N, C, H, W}, the memory layout is equivalent to a C array with dimensions [N][(C+3)/4][H][W][4], with the tensor coordinates (n, c, h, w) mapping to array subscript [n][c/4][h][w][c%4]. + + :meta hide-value: + """ + chw16 = auto() + """Sixteen wide channel vectorized row major format. + + This format is bound to FP16. It is only available for dimensions >= 3. + + For a tensor with dimensions {N, C, H, W}, the memory layout is equivalent to a C array with dimensions [N][(C+15)/16][H][W][16], with the tensor coordinates (n, c, h, w) mapping to array subscript [n][c/16][h][w][c%16]. + + :meta hide-value: + """ + chw32 = auto() + """Thirty-two wide channel vectorized row major format. + + This format is only available for dimensions >= 3. + + For a tensor with dimensions {N, C, H, W}, the memory layout is equivalent to a C array with dimensions [N][(C+31)/32][H][W][32], with the tensor coordinates (n, c, h, w) mapping to array subscript [n][c/32][h][w][c%32]. + + :meta hide-value: + """ + dhwc8 = auto() + """Eight channel format where C is padded to a multiple of 8. + + This format is bound to FP16, and it is only available for dimensions >= 4. + + For a tensor with dimensions {N, C, D, H, W}, the memory layout is equivalent to an array with dimensions [N][D][H][W][(C+7)/8*8], with the tensor coordinates (n, c, d, h, w) mapping to array subscript [n][d][h][w][c]. + + :meta hide-value: + """ + cdhw32 = auto() + """Thirty-two wide channel vectorized row major format with 3 spatial dimensions. + + This format is bound to FP16 and INT8. It is only available for dimensions >= 4. + + For a tensor with dimensions {N, C, D, H, W}, the memory layout is equivalent to a C array with dimensions [N][(C+31)/32][D][H][W][32], with the tensor coordinates (n, d, c, h, w) mapping to array subscript [n][c/32][d][h][w][c%32]. + + :meta hide-value: + """ + hwc = auto() + """Non-vectorized channel-last format. This format is bound to FP32 and is only available for dimensions >= 3. + + Equivient to ``memory_format.channels_last`` + + :meta hide-value: + """ + dla_linear = auto() + """ DLA planar format. Row major format. The stride for stepping along the H axis is rounded up to 64 bytes. + + This format is bound to FP16/Int8 and is only available for dimensions >= 3. + + For a tensor with dimensions {N, C, H, W}, the memory layout is equivalent to a C array with dimensions [N][C][H][roundUp(W, 64/elementSize)] where elementSize is 2 for FP16 and 1 for Int8, with the tensor coordinates (n, c, h, w) mapping to array subscript [n][c][h][w]. + + :meta hide-value: + """ + dla_hwc4 = auto() + """DLA image format. channel-last format. C can only be 1, 3, 4. If C == 3 it will be rounded to 4. The stride for stepping along the H axis is rounded up to 32 bytes. + + This format is bound to FP16/Int8 and is only available for dimensions >= 3. + + For a tensor with dimensions {N, C, H, W}, with C’ is 1, 4, 4 when C is 1, 3, 4 respectively, the memory layout is equivalent to a C array with dimensions [N][H][roundUp(W, 32/C’/elementSize)][C’] where elementSize is 2 for FP16 and 1 for Int8, C’ is the rounded C. The tensor coordinates (n, c, h, w) maps to array subscript [n][h][w][c]. + + :meta hide-value: + """ + hwc16 = auto() + """Sixteen channel format where C is padded to a multiple of 16. This format is bound to FP16. It is only available for dimensions >= 3. 
+ + For a tensor with dimensions {N, C, H, W}, the memory layout is equivalent to the array with dimensions [N][H][W][(C+15)/16*16], with the tensor coordinates (n, c, h, w) mapping to array subscript [n][h][w][c]. + + :meta hide-value: + """ + dhwc = auto() + """Non-vectorized channel-last format. This format is bound to FP32. It is only available for dimensions >= 4. + + Equivient to ``memory_format.channels_last_3d`` + + :meta hide-value: + """ # PyTorch aliases for TRT layouts contiguous = linear @@ -363,6 +644,30 @@ class memory_format(Enum): def _from( cls, f: Union[torch.memory_format, trt.TensorFormat, memory_format] ) -> memory_format: + """Create a Torch-TensorRT memory format enum from another library memory format enum. + + Takes a memory format enum from one of torch, and tensorrt and create a ``torch_tensorrt.memory_format``. + If the source is not supported or the memory format is not supported in Torch-TensorRT, + then an exception will be raised. As such it is not recommended to use this method directly. + + Alternatively use ``torch_tensorrt.memory_format.try_from()`` + + Arguments: + f (Union(torch.memory_format, tensorrt.TensorFormat, memory_format)): Memory format enum from another library + + Returns: + memory_format: Equivalent ``torch_tensorrt.memory_format`` to ``f`` + + Raises: + TypeError: Unsupported memory format or unknown source + + Examples: + + .. code:: py + + torchtrt_linear = torch_tensorrt.memory_format._from(torch.contiguous) + + """ # TODO: Ideally implemented with match statement but need to wait for Py39 EoL if isinstance(f, torch.memory_format): if f == torch.contiguous_format: @@ -430,6 +735,26 @@ def _from( def try_from( cls, f: Union[torch.memory_format, trt.TensorFormat, memory_format] ) -> Optional[memory_format]: + """Create a Torch-TensorRT memory format enum from another library memory format enum. + + Takes a memory format enum from one of torch, and tensorrt and create a ``torch_tensorrt.memory_format``. + If the source is not supported or the memory format is not supported in Torch-TensorRT, + then ``None`` will be returned. + + + Arguments: + f (Union(torch.memory_format, tensorrt.TensorFormat, memory_format)): Memory format enum from another library + + Returns: + Optional(memory_format): Equivalent ``torch_tensorrt.memory_format`` to ``f`` + + Examples: + + .. code:: py + + torchtrt_linear = torch_tensorrt.memory_format.try_from(torch.contiguous) + + """ try: casted_format = memory_format._from(f) return casted_format @@ -446,6 +771,31 @@ def to( Type[torch.memory_format], Type[trt.TensorFormat], Type[memory_format] ], ) -> Union[torch.memory_format, trt.TensorFormat, memory_format]: + """Convert ``memory_format`` into the equivalent type in torch or tensorrt + + Converts ``self`` into one of torch or tensorrt equivalent memory format. + If ``self`` is not supported in the target library, then an exception will be raised. + As such it is not recommended to use this method directly. + + Alternatively use ``torch_tensorrt.memory_format.try_to()`` + + Arguments: + t (Union(Type(torch.memory_format), Type(tensorrt.TensorFormat), Type(memory_format))): Memory format type enum from another library to convert to + + Returns: + Union(torch.memory_format, tensorrt.TensorFormat, memory_format): Memory format equivalent ``torch_tensorrt.memory_format`` in enum ``t`` + + Raises: + TypeError: Unknown target type or unsupported memory format + + Examples: + + .. 
code:: py + + # Succeeds + tf = torch_tensorrt.memory_format.linear.to(torch.dtype) # Returns torch.contiguous + """ + if t == torch.memory_format: if self == memory_format.contiguous: return torch.contiguous_format @@ -512,6 +862,25 @@ def try_to( Type[torch.memory_format], Type[trt.TensorFormat], Type[memory_format] ], ) -> Optional[Union[torch.memory_format, trt.TensorFormat, memory_format]]: + """Convert ``memory_format`` into the equivalent type in torch or tensorrt + + Converts ``self`` into one of torch or tensorrt equivalent memory format. + If ``self`` is not supported in the target library, then ``None`` will be returned + + Arguments: + t (Union(Type(torch.memory_format), Type(tensorrt.TensorFormat), Type(memory_format))): Memory format type enum from another library to convert to + + Returns: + Optional(Union(torch.memory_format, tensorrt.TensorFormat, memory_format)): Memory format equivalent ``torch_tensorrt.memory_format`` in enum ``t`` + + Examples: + + .. code:: py + + # Succeeds + tf = torch_tensorrt.memory_format.linear.to(torch.dtype) # Returns torch.contiguous + """ + try: casted_format = self.to(t) return casted_format @@ -533,12 +902,55 @@ def __hash__(self) -> int: class DeviceType(Enum): + """Type of device TensorRT will target""" + UNKNOWN = auto() + """ + Sentinel value + + :meta hide-value: + """ + GPU = auto() + """ + Target is a GPU + + :meta hide-value: + """ + DLA = auto() + """ + Target is a DLA core + + :meta hide-value: + """ @classmethod def _from(cls, d: Union[trt.DeviceType, DeviceType]) -> DeviceType: + """Create a Torch-TensorRT device type enum from a TensorRT device type enum. + + Takes a device type enum from tensorrt and create a ``torch_tensorrt.DeviceType``. + If the source is not supported or the device type is not supported in Torch-TensorRT, + then an exception will be raised. As such it is not recommended to use this method directly. + + Alternatively use ``torch_tensorrt.DeviceType.try_from()`` + + Arguments: + d (Union(tensorrt.DeviceType, DeviceType)): Device type enum from another library + + Returns: + DeviceType: Equivalent ``torch_tensorrt.DeviceType`` to ``d`` + + Raises: + TypeError: Unknown source type or unsupported device type + + Examples: + + .. code:: py + + torchtrt_dla = torch_tensorrt.DeviceType._from(tensorrt.DeviceType.DLA) + + """ if isinstance(d, trt.DeviceType): if d == trt.DeviceType.GPU: return DeviceType.GPU @@ -569,6 +981,27 @@ def _from(cls, d: Union[trt.DeviceType, DeviceType]) -> DeviceType: @classmethod def try_from(cls, d: Union[trt.DeviceType, DeviceType]) -> Optional[DeviceType]: + """Create a Torch-TensorRT device type enum from a TensorRT device type enum. + + Takes a device type enum from tensorrt and create a ``torch_tensorrt.DeviceType``. + If the source is not supported or the device type is not supported in Torch-TensorRT, + then an exception will be raised. As such it is not recommended to use this method directly. + + Alternatively use ``torch_tensorrt.DeviceType.try_from()`` + + Arguments: + d (Union(tensorrt.DeviceType, DeviceType)): Device type enum from another library + + Returns: + DeviceType: Equivalent ``torch_tensorrt.DeviceType`` to ``d`` + + Examples: + + .. 
code:: py + + torchtrt_dla = torch_tensorrt.DeviceType._from(tensorrt.DeviceType.DLA) + + """ try: casted_format = DeviceType._from(d) return casted_format @@ -584,6 +1017,31 @@ def to( t: Union[Type[trt.DeviceType], Type[DeviceType]], use_default: bool = False, ) -> Union[trt.DeviceType, DeviceType]: + """Convert ``DeviceType`` into the equivalent type in tensorrt + + Converts ``self`` into one of torch or tensorrt equivalent device type. + If ``self`` is not supported in the target library, then an exception will be raised. + As such it is not recommended to use this method directly. + + Alternatively use ``torch_tensorrt.DeviceType.try_to()`` + + Arguments: + t (Union(Type(tensorrt.DeviceType), Type(DeviceType))): Device type enum from another library to convert to + + Returns: + Union(tensorrt.DeviceType, DeviceType): Device type equivalent ``torch_tensorrt.DeviceType`` in enum ``t`` + + Raises: + TypeError: Unknown target type or unsupported device type + + Examples: + + .. code:: py + + # Succeeds + trt_dla = torch_tensorrt.DeviceType.DLA.to(tensorrt.DeviceType) # Returns tensorrt.DeviceType.DLA + """ + if t == trt.DeviceType: if self == DeviceType.GPU: return trt.DeviceType.GPU @@ -621,6 +1079,24 @@ def try_to( t: Union[Type[trt.DeviceType], Type[DeviceType]], use_default: bool = False, ) -> Optional[Union[trt.DeviceType, DeviceType]]: + """Convert ``DeviceType`` into the equivalent type in tensorrt + + Converts ``self`` into one of torch or tensorrt equivalent memory format. + If ``self`` is not supported in the target library, then ``None`` will be returned. + + Arguments: + t (Union(Type(tensorrt.DeviceType), Type(DeviceType))): Device type enum from another library to convert to + + Returns: + Optional(Union(tensorrt.DeviceType, DeviceType)): Device type equivalent ``torch_tensorrt.DeviceType`` in enum ``t`` + + Examples: + + .. code:: py + + # Succeeds + trt_dla = torch_tensorrt.DeviceType.DLA.to(tensorrt.DeviceType) # Returns tensorrt.DeviceType.DLA + """ try: casted_format = self.to(t, use_default=use_default) return casted_format @@ -640,14 +1116,59 @@ def __hash__(self) -> int: class EngineCapability(Enum): + """ + EngineCapability determines the restrictions of a network during build time and what runtime it targets. + """ + STANDARD = auto() + """ + EngineCapability.STANDARD does not provide any restrictions on functionality and the resulting serialized engine can be executed with TensorRT’s standard runtime APIs. + + :meta hide-value: + """ + SAFETY = auto() + """ + EngineCapability.SAFETY provides a restricted subset of network operations that are safety certified and the resulting serialized engine can be executed with TensorRT’s safe runtime APIs in the tensorrt.safe namespace. + + :meta hide-value: + """ + DLA_STANDALONE = auto() + """ + ``EngineCapability.DLA_STANDALONE`` provides a restricted subset of network operations that are DLA compatible and the resulting serialized engine can be executed using standalone DLA runtime APIs. + + :meta hide-value: + """ @classmethod def _from( cls, c: Union[trt.EngineCapability, EngineCapability] ) -> EngineCapability: + """Create a Torch-TensorRT Engine capability enum from a TensorRT Engine capability enum. + + Takes a device type enum from tensorrt and create a ``torch_tensorrt.EngineCapability``. + If the source is not supported or the engine capability is not supported in Torch-TensorRT, + then an exception will be raised. As such it is not recommended to use this method directly. 
+ + Alternatively use ``torch_tensorrt.EngineCapability.try_from()`` + + Arguments: + c (Union(tensorrt.EngineCapability, EngineCapability)): Engine capability enum from another library + + Returns: + EngineCapability: Equivalent ``torch_tensorrt.EngineCapability`` to ``c`` + + Raises: + TypeError: Unknown source type or unsupported engine capability + + Examples: + + .. code:: py + + torchtrt_ec = torch_tensorrt.EngineCapability._from(tensorrt.EngineCapability.SAFETY) + + """ if isinstance(c, trt.EngineCapability): if c == trt.EngineCapability.STANDARD: return EngineCapability.STANDARD @@ -682,6 +1203,27 @@ def _from( def try_from( c: Union[trt.EngineCapability, EngineCapability] ) -> Optional[EngineCapability]: + """Create a Torch-TensorRT engine capability enum from a TensorRT engine capability enum. + + Takes an engine capability enum from tensorrt and creates a ``torch_tensorrt.EngineCapability``. + If the source is not supported or the engine capability level is not supported in Torch-TensorRT, + then ``None`` will be returned rather than raising an exception. + + This is the safe alternative to ``torch_tensorrt.EngineCapability._from()`` + + Arguments: + c (Union(tensorrt.EngineCapability, EngineCapability)): Engine capability enum from another library + + Returns: + EngineCapability: Equivalent ``torch_tensorrt.EngineCapability`` to ``c`` + + Examples: + + .. code:: py + + torchtrt_safety_ec = torch_tensorrt.EngineCapability.try_from(tensorrt.EngineCapability.SAFETY) + + """ try: casted_format = EngineCapability._from(c) return casted_format @@ -695,6 +1237,30 @@ def try_from( def to( self, t: Union[Type[trt.EngineCapability], Type[EngineCapability]] ) -> Union[trt.EngineCapability, EngineCapability]: + """Convert ``EngineCapability`` into the equivalent type in tensorrt + + Converts ``self`` into the equivalent tensorrt engine capability. + If ``self`` is not supported in the target library, then an exception will be raised. + As such it is not recommended to use this method directly. + + Alternatively use ``torch_tensorrt.EngineCapability.try_to()`` + + Arguments: + t (Union(Type(tensorrt.EngineCapability), Type(EngineCapability))): Engine capability enum from another library to convert to + + Returns: + Union(tensorrt.EngineCapability, EngineCapability): Engine capability equivalent ``torch_tensorrt.EngineCapability`` in enum ``t`` + + Raises: + TypeError: Unknown target type or unsupported engine capability + + Examples: + + .. code:: py + + # Succeeds + torchtrt_dla_ec = torch_tensorrt.EngineCapability.DLA_STANDALONE.to(tensorrt.EngineCapability) # Returns tensorrt.EngineCapability.DLA_STANDALONE + """ if t == trt.EngineCapability: if self == EngineCapability.STANDARD: return trt.EngineCapability.STANDARD @@ -722,12 +1288,30 @@ def to( raise ValueError("Provided an unsupported engine capability") # else: # commented out for mypy raise TypeError( - "Provided unsupported destination type for engine capablity type conversion" + "Provided unsupported destination type for engine capability type conversion" ) def try_to( self, t: Union[Type[trt.EngineCapability], Type[EngineCapability]] ) -> Optional[Union[trt.EngineCapability, EngineCapability]]: + """Convert ``EngineCapability`` into the equivalent type in tensorrt + + Converts ``self`` into the equivalent tensorrt engine capability. + If ``self`` is not supported in the target library, then ``None`` will be returned.
+ + Arguments: + t (Union(Type(tensorrt.EngineCapability), Type(EngineCapability))): Engine capability enum from another library to convert to + + Returns: + Optional(Union(tensorrt.EngineCapability, EngineCapability)): Engine capability equivalent ``torch_tensorrt.EngineCapability`` in enum ``t`` + + Examples: + + .. code:: py + + # Succeeds + trt_dla_ec = torch_tensorrt.EngineCapability.DLA_STANDALONE.try_to(tensorrt.EngineCapability) # Returns tensorrt.EngineCapability.DLA_STANDALONE + """ try: casted_format = self.to(t) return casted_format diff --git a/py/torch_tensorrt/csrc/torch_tensorrt_py.cpp b/py/torch_tensorrt/csrc/torch_tensorrt_py.cpp index e9ad8b159c..e32d102f8b 100644 --- a/py/torch_tensorrt/csrc/torch_tensorrt_py.cpp +++ b/py/torch_tensorrt/csrc/torch_tensorrt_py.cpp @@ -242,7 +242,7 @@ PYBIND11_MODULE(_C, m) { .def("__str__", &InputSignature::to_str) .def_readwrite("_signature_ivalue", &InputSignature::signature_ivalue); - py::enum_(m, "dtype", "Enum to specifiy operating precision for engine execution") + py::enum_(m, "dtype", "Enum to specify operating precision for engine execution") .value("float", DataType::kFloat, "32 bit floating point number") .value("float32", DataType::kFloat, "32 bit floating point number") .value("half", DataType::kHalf, "16 bit floating point number") @@ -270,7 +270,7 @@ PYBIND11_MODULE(_C, m) { .value("DLA_STANDALONE", EngineCapability::kDLA_STANDALONE, "Use DLA kernels only") .value("STANDARD", EngineCapability::kSTANDARD, "Use default behavior"); - py::enum_(m, "TensorFormat", "Enum to specifiy the memory layout of tensors") + py::enum_(m, "TensorFormat", "Enum to specify the memory layout of tensors") .value("contiguous", TensorFormat::kContiguous, "Contiguous memory layout (NCHW / Linear)") .value("channels_last", TensorFormat::kChannelsLast, "Channels last memory layout (NHWC)") .export_values(); @@ -401,7 +401,7 @@ PYBIND11_MODULE(_C, m) { ts_sub_mod.def( "check_method_op_support", &torch_tensorrt::pyapi::CheckMethodOperatorSupport, - "Takes a module and a method name and checks if the method graph contains purely convertable operators"); + "Takes a module and a method name and checks if the method graph contains purely convertible operators"); ts_sub_mod.def( "embed_engine_in_new_module", &torch_tensorrt::pyapi::EmbedEngineInNewModule, diff --git a/py/torch_tensorrt/csrc/util.h b/py/torch_tensorrt/csrc/util.h index 10f07e7f82..01a47564d3 100644 --- a/py/torch_tensorrt/csrc/util.h +++ b/py/torch_tensorrt/csrc/util.h @@ -13,14 +13,14 @@ namespace util { namespace py = pybind11; // Method for calling the python function and returning the value (returned from python) used in cpp trampoline -// classes. Prints an error if no such method is overriden in python. +// classes. Prints an error if no such method is overridden in python. // T* must NOT be a trampoline class! template py::function getOverload(const T* self, const std::string& overloadName) { py::function overload = py::get_override(self, overloadName.c_str()); if (!overload) { std::string msg{ - "Method: " + overloadName + " was not overriden. Please provide an implementation for this method."}; + "Method: " + overloadName + " was not overridden. 
Please provide an implementation for this method."}; LOG_ERROR(msg); } return overload; diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index e854b04d42..0c29bd378e 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -92,19 +92,21 @@ def compile( exported_program (torch.export.ExportedProgram): Source module, running torch.export on a ``torch.nn.Module`` inputs (Tuple[Any, ...]): List of specifications of input shape, dtype and memory layout for inputs to the module. This argument is required. Input Sizes can be specified as torch sizes, tuples or lists. dtypes can be specified using torch datatypes or torch_tensorrt datatypes and you can use either torch devices or the torch_tensorrt device type enum - to select device type. :: - - input=[ - torch_tensorrt.Input((1, 3, 224, 224)), # Static NCHW input shape for input #1 - torch_tensorrt.Input( - min_shape=(1, 224, 224, 3), - opt_shape=(1, 512, 512, 3), - max_shape=(1, 1024, 1024, 3), - dtype=torch.int32 - format=torch.channel_last - ), # Dynamic input shape for input #2 - torch.randn((1, 3, 224, 244)) # Use an example tensor and let torch_tensorrt infer settings - ] + to select device type. + + .. code-block:: py + + inputs=[ + torch_tensorrt.Input((1, 3, 224, 224)), # Static NCHW input shape for input #1 + torch_tensorrt.Input( + min_shape=(1, 224, 224, 3), + opt_shape=(1, 512, 512, 3), + max_shape=(1, 1024, 1024, 3), + dtype=torch.int32 + format=torch.channel_last + ), # Dynamic input shape for input #2 + torch.randn((1, 3, 224, 244)) # Use an example tensor and let torch_tensorrt infer settings + ] Keyword Arguments: device (Union(torch_tensorrt.Device, torch.device, dict)): Target device for TensorRT engines to run on :: @@ -126,7 +128,7 @@ def compile( truncate_double (bool): Truncate weights provided in double (float64) to float32 calibrator (Union(torch_tensorrt._C.IInt8Calibrator, tensorrt.IInt8Calibrator)): Calibrator object which will provide data to the PTQ system for INT8 Calibration require_full_compilation (bool): Require modules to be compiled end to end or return an error as opposed to returning a hybrid graph where operations that cannot be run in TensorRT are run in PyTorch - min_block_size (int): The minimum number of contiguous TensorRT convertable operations in order to run a set of operations in TensorRT + min_block_size (int): The minimum number of contiguous TensorRT convertible operations in order to run a set of operations in TensorRT torch_executed_ops (Collection[Target]): Set of aten operators that must be run in PyTorch. An error will be thrown if this set is not empty but ``require_full_compilation`` is True torch_executed_modules (List[str]): List of modules that must be run in PyTorch. An error will be thrown if this list is not empty but ``require_full_compilation`` is True pass_through_build_failures (bool): Error out if there are issues during compilation (only applicable to torch.compile workflows) @@ -134,8 +136,8 @@ def compile( version_compatible (bool): Build the TensorRT engines compatible with future versions of TensorRT (Restrict to lean runtime operators to provide version forward compatibility for the engines) optimization_level: (Optional[int]): Setting a higher optimization level allows TensorRT to spend longer engine building time searching for more optimization options. The resulting engine may have better performance compared to an engine built with a lower optimization level. 
The default optimization level is 3. Valid values include integers from 0 to the maximum optimization level, which is currently 5. Setting it to be greater than the maximum level results in identical behavior to the maximum level. use_python_runtime: (bool): Return a graph using a pure Python runtime, reduces options for serialization - use_fast_partitioner: (bool): Use the adjacency based partitioning scheme instead of the global partitioner. Adjacency partitioning is faster but may not be optiminal. Use the global paritioner (``False``) if looking for best performance - enable_experimental_decompositions (bool): Use the full set of operator decompositions. These decompositions may not be tested but serve to make the grap easier to covert to TensorRT, potentially increasing the amount of graphs run in TensorRT. + use_fast_partitioner: (bool): Use the adjacency based partitioning scheme instead of the global partitioner. Adjacency partitioning is faster but may not be optimal. Use the global paritioner (``False``) if looking for best performance + enable_experimental_decompositions (bool): Use the full set of operator decompositions. These decompositions may not be tested but serve to make the graph easier to convert to TensorRT, potentially increasing the amount of graphs run in TensorRT. dryrun (bool): Toggle for "Dryrun" mode, running everything except conversion to TRT and logging outputs hardware_compatible (bool): Build the TensorRT engines compatible with GPU architectures other than that of the GPU on which the engine was built (currently works for NVIDIA Ampere and newer) timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation @@ -464,7 +466,7 @@ def contains_metadata(gm: torch.fx.GraphModule) -> bool: def convert_module_to_trt_engine( exported_program: ExportedProgram, - inputs: Tuple[Any, ...], + inputs: Sequence[Any], *, enabled_precisions: ( Set[torch.dtype | dtype] | Tuple[torch.dtype | dtype] @@ -507,19 +509,21 @@ def convert_module_to_trt_engine( Keyword Args: inputs (Optional[Sequence[torch_tensorrt.Input | torch.Tensor]]): **Required** List of specifications of input shape, dtype and memory layout for inputs to the module. This argument is required. Input Sizes can be specified as torch sizes, tuples or lists. dtypes can be specified using torch datatypes or torch_tensorrt datatypes and you can use either torch devices or the torch_tensorrt device type enum - to select device type. :: - - input=[ - torch_tensorrt.Input((1, 3, 224, 224)), # Static NCHW input shape for input #1 - torch_tensorrt.Input( - min_shape=(1, 224, 224, 3), - opt_shape=(1, 512, 512, 3), - max_shape=(1, 1024, 1024, 3), - dtype=torch.int32 - format=torch.channel_last - ), # Dynamic input shape for input #2 - torch.randn((1, 3, 224, 244)) # Use an example tensor and let torch_tensorrt infer settings - ] + to select device type. + + .. 
code-block:: py + + inputs=[ + torch_tensorrt.Input((1, 3, 224, 224)), # Static NCHW input shape for input #1 + torch_tensorrt.Input( + min_shape=(1, 224, 224, 3), + opt_shape=(1, 512, 512, 3), + max_shape=(1, 1024, 1024, 3), + dtype=torch.int32 + format=torch.channel_last + ), # Dynamic input shape for input #2 + torch.randn((1, 3, 224, 244)) # Use an example tensor and let torch_tensorrt infer settings + ] enabled_precisions (Optional[Set[torch.dtype | _enums.dtype]]): The set of datatypes that TensorRT can use debug (bool): Whether to print out verbose debugging information workspace_size (int): Workspace TRT is allowed to use for the module (0 is default) diff --git a/py/torch_tensorrt/dynamo/_refit.py b/py/torch_tensorrt/dynamo/_refit.py index 38810e59b3..569fc2db55 100644 --- a/py/torch_tensorrt/dynamo/_refit.py +++ b/py/torch_tensorrt/dynamo/_refit.py @@ -6,7 +6,6 @@ from typing import Any, Sequence, Tuple import numpy as np -import tensorrt as trt import torch from torch.export import ExportedProgram from torch_tensorrt._enums import dtype @@ -43,6 +42,8 @@ ) from torch_tensorrt.logging import TRT_LOGGER +import tensorrt as trt + logger = logging.getLogger(__name__) @@ -95,12 +96,16 @@ def construct_refit_mapping( layer_type: str = layer.type.name if layer_type in MODULE_MAP: # Cast the parent class to child class to access attributes - # For example: ILayer does not have ILayer.kernal/ILayer.bias + # For example: ILayer does not have ILayer.kernel/ILayer.bias # So we cast it to IConvolutionLayer and access the attributes layer.__class__ = MODULE_MAP[layer_type][0] for weight_type, weight_name in MODULE_MAP[layer_type][1]: weight = layer.__getattribute__(weight_type).copy() - weight_dtype = dtype.try_from(weight.dtype).to(trt.DataType) + weight_dtype_opt = dtype.try_from(weight.dtype) + assert ( + weight_dtype_opt is not None + ), f"Weights {weight_name} has unsupported dtype {weight.dtype}" + weight_dtype = weight_dtype_opt.to(trt.DataType) weight_map[f"{layer.name} {weight_name}"] = ( weight, weight_dtype, diff --git a/py/torch_tensorrt/dynamo/_settings.py b/py/torch_tensorrt/dynamo/_settings.py index 57b7d5dc69..13c786b858 100644 --- a/py/torch_tensorrt/dynamo/_settings.py +++ b/py/torch_tensorrt/dynamo/_settings.py @@ -70,7 +70,7 @@ class CompilationSettings: dla_global_dram_size (int): Host RAM used by DLA to store weights and metadata for execution dryrun (Union[bool, str]): Toggle "Dryrun" mode, which runs everything through partitioning, short of conversion to TRT Engines. Prints detailed logs of the graph structure and nature of partitioning. Optionally saves the - ouptut to a file if a string path is specified + output to a file if a string path is specified hardware_compatible (bool): Build the TensorRT engines compatible with GPU architectures other than that of the GPU on which the engine was built (currently works for NVIDIA Ampere and newer) timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation """ diff --git a/py/torch_tensorrt/dynamo/_tracer.py b/py/torch_tensorrt/dynamo/_tracer.py index 6bc334f427..e1b89886ca 100644 --- a/py/torch_tensorrt/dynamo/_tracer.py +++ b/py/torch_tensorrt/dynamo/_tracer.py @@ -45,7 +45,7 @@ def trace( device=torch.device("cuda:0") debug (bool): Enable debuggable engine - enable_experimental_decompositions (bool): Use the full set of operator decompositions. 
These decompositions may not be tested but serve to make the grap easier to covert to TensorRT, potentially increasing the amount of graphs run in TensorRT. + enable_experimental_decompositions (bool): Use the full set of operator decompositions. These decompositions may not be tested but serve to make the graph easier to convert to TensorRT, potentially increasing the amount of graphs run in TensorRT. **kwargs: Any, Returns: torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT diff --git a/py/torch_tensorrt/dynamo/conversion/_conversion.py b/py/torch_tensorrt/dynamo/conversion/_conversion.py index 1142559838..ea3034cb8c 100644 --- a/py/torch_tensorrt/dynamo/conversion/_conversion.py +++ b/py/torch_tensorrt/dynamo/conversion/_conversion.py @@ -4,7 +4,6 @@ import logging from typing import List, Sequence -import tensorrt as trt import torch from torch.fx.experimental.proxy_tensor import maybe_disable_fake_tensor_mode from torch_tensorrt._Device import Device @@ -19,6 +18,8 @@ from torch_tensorrt.dynamo.runtime import PythonTorchTensorRTModule, TorchTensorRTModule from torch_tensorrt.dynamo.utils import get_torch_inputs +import tensorrt as trt + logger = logging.getLogger(__name__) @@ -44,7 +45,7 @@ def infer_module_output_dtypes( if not isinstance(output, torch.Tensor): if isinstance(output, str): raise ValueError( - f"Receieved an output type {type(output)} that's not in the acceptable datatypes (https://pytorch.org/docs/stable/tensor_attributes.html#torch.dtype)" + f"Received an output type {type(output)} that's not in the acceptable datatypes (https://pytorch.org/docs/stable/tensor_attributes.html#torch.dtype)" ) else: output_ = torch.tensor(output) diff --git a/py/torch_tensorrt/dynamo/conversion/converter_utils.py b/py/torch_tensorrt/dynamo/conversion/converter_utils.py index 4bff27fd26..f847091800 100644 --- a/py/torch_tensorrt/dynamo/conversion/converter_utils.py +++ b/py/torch_tensorrt/dynamo/conversion/converter_utils.py @@ -5,7 +5,6 @@ from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, overload import numpy as np -import tensorrt as trt import torch import torch_tensorrt.dynamo.conversion.impl as impl from torch.fx.node import Argument, Target @@ -18,6 +17,8 @@ DynamoConverterImplSignature, ) +import tensorrt as trt + from ..types import Shape, TRTDataType, TRTLayer, TRTTensor _LOGGER: logging.Logger = logging.getLogger(__name__) @@ -70,7 +71,7 @@ def format_tensor_metadata(metadata: Union[Any, Sequence[Any]]) -> str: return formatted_str[:-2] + ")" else: _LOGGER.warning( - f"Detected unparseable type in node formatting: {type(metadata)}" + f"Detected unparsable type in node formatting: {type(metadata)}" ) return "" @@ -182,7 +183,7 @@ def cast_int_int_div_trt_tensor( rhs_val (TRTTensor): A TRT Tensor numerator name (str): Name of calling layer Returns: - A list of lhs_val and rhs_val casted to the approriate datatype + A list of lhs_val and rhs_val casted to the appropriate datatype """ if lhs_val.dtype == trt.int32 and rhs_val.dtype == trt.int32: lhs_val = cast_trt_tensor(ctx, lhs_val, trt.float32, name) @@ -344,9 +345,7 @@ def create_constant( # Rank 0 constant is required in IFillLayer inputs. 
if min_rank == 0: shape = trt.Dims() - numpy_value = to_numpy( - value, _enums.dtype._from(dtype).to(np.dtype) if dtype is not None else None - ) + numpy_value = to_numpy(value, dtype) constant = ctx.net.add_constant( shape if isinstance(value, (int, float, bool)) else value.shape, numpy_value.copy() if isinstance(numpy_value, np.ndarray) else numpy_value, @@ -576,7 +575,7 @@ def to_numpy( return ( output if (dtype is None or output is None) - else output.astype(_enums.dtype._from(dtype).to(np.dtype)) + else output.astype(_enums.dtype._from(dtype).to(np.dtype, use_default=True)) ) else: raise AssertionError( @@ -637,7 +636,7 @@ def append( name (str): Name of the calling layer original_tensor (TRTTensor): A TRTTensor to append the new value to new_value (Union[TRTTensor, int, float, torch.Tensor, np.ndarray]): A new value to append - dim (int, optional): Dimention to append the new value. Defaults to 0. + dim (int, optional): Dimension to append the new value. Defaults to 0. Returns: TRTTensor: A new TRTTensor that is the result of appending the new value to the original tensor diff --git a/py/torch_tensorrt/dynamo/conversion/impl/shape.py b/py/torch_tensorrt/dynamo/conversion/impl/shape.py index 8c5ee6a26a..c2dfac802b 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/shape.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/shape.py @@ -3,7 +3,6 @@ from typing import List, Optional, Tuple import numpy as np -import tensorrt as trt import torch from torch.fx.node import Target from torch_tensorrt.dynamo._SourceIR import SourceIR @@ -23,6 +22,8 @@ ) from torch_tensorrt.fx.types import TRTTensor +import tensorrt as trt + def shape( ctx: ConversionContext, @@ -73,7 +74,7 @@ def get_shape_with_dynamic_shape( reduce operation output shape. Steps of calculations are: 1. get the actual tensor shape of input_val via add_shape layer; 2. create a all 0 tensor [0, 0, 0]; - 3. run elementwise comparision the [0, 0, 0] and [-1, 128, 256] tensor, get a condition tensor [True, False, False]; + 3. run elementwise comparison the [0, 0, 0] and [-1, 128, 256] tensor, get a condition tensor [True, False, False]; 4. use the condition tensor [True, False, False] to do selection between [2048, 256, 512] and [-1, 128, 256], replace all -1 dynamic shape dimensions with actual batch_size value; 5. 
output shape with actual batch_size as [2048, 128, 256] diff --git a/py/torch_tensorrt/dynamo/conversion/impl/shuffle.py b/py/torch_tensorrt/dynamo/conversion/impl/shuffle.py index 45927e7709..86030ea4df 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/shuffle.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/shuffle.py @@ -1,7 +1,6 @@ from typing import Optional, Sequence, Union import numpy as np -import tensorrt as trt import torch_tensorrt.dynamo.conversion.impl as impl from torch.fx.node import Target from torch_tensorrt import _enums @@ -13,9 +12,10 @@ get_trt_tensor, set_layer_name, ) -from torch_tensorrt.dynamo.utils import Frameworks, unified_dtype_converter from torch_tensorrt.fx.types import TRTTensor +import tensorrt as trt + def reshape( ctx: ConversionContext, @@ -145,7 +145,7 @@ def resize( input: TRTTensor, sizes: Sequence[int], ) -> TRTTensor: - input_np_dtype = unified_dtype_converter(input.dtype, Frameworks.NUMPY) + input_np_dtype = _enums.dtype._from(input.dtype).to(np.dtype) input_val = get_trt_tensor(ctx, input, f"{name}_input") # Calculate the total number of elements for new and current shape diff --git a/py/torch_tensorrt/dynamo/conversion/truncate_double.py b/py/torch_tensorrt/dynamo/conversion/truncate_double.py index e05cd30961..b14ee95dab 100644 --- a/py/torch_tensorrt/dynamo/conversion/truncate_double.py +++ b/py/torch_tensorrt/dynamo/conversion/truncate_double.py @@ -118,7 +118,7 @@ def _repair_64bit_input( # Only enter this code block if there exists a 64-bit output # This implies a cast is needed, since TRT cannot output 64-bit tensors if output_positions_64bit: - # Determine whther the outputs of the module are tuple-type or not + # Determine whether the outputs of the module are tuple-type or not is_collection_output = False if isinstance(submodule_outputs, tuple): is_collection_output = True diff --git a/py/torch_tensorrt/dynamo/lowering/passes/remove_input_alias_fixing_clones.py b/py/torch_tensorrt/dynamo/lowering/passes/remove_input_alias_fixing_clones.py index 7630f3c1a5..87bd518555 100644 --- a/py/torch_tensorrt/dynamo/lowering/passes/remove_input_alias_fixing_clones.py +++ b/py/torch_tensorrt/dynamo/lowering/passes/remove_input_alias_fixing_clones.py @@ -30,7 +30,7 @@ def remove_input_alias_fixing_clones( ): modified_graph = True - # Replace all uses of the clone with the placholder, delete the clone + # Replace all uses of the clone with the placeholder, delete the clone clone_node = list(node.users)[0] logger.debug( f"Removing node {clone_node} from graph, since it is a clone node which " diff --git a/py/torch_tensorrt/dynamo/lowering/passes/replace_max_pool_with_indices.py b/py/torch_tensorrt/dynamo/lowering/passes/replace_max_pool_with_indices.py index 29d9dcd3cc..6e3762e73c 100644 --- a/py/torch_tensorrt/dynamo/lowering/passes/replace_max_pool_with_indices.py +++ b/py/torch_tensorrt/dynamo/lowering/passes/replace_max_pool_with_indices.py @@ -34,7 +34,7 @@ def replace_max_pool_with_indices( ): modified_graph = True - # Replace all uses of the clone with the placholder, delete the clone + # Replace all uses of the clone with the placeholder, delete the clone getitem_node = list(node.users)[0] with gm.graph.inserting_after(getitem_node): diff --git a/py/torch_tensorrt/dynamo/partitioning/_adjacency_partitioner.py b/py/torch_tensorrt/dynamo/partitioning/_adjacency_partitioner.py index c00d92577c..0e9077cdcb 100644 --- a/py/torch_tensorrt/dynamo/partitioning/_adjacency_partitioner.py +++ 
b/py/torch_tensorrt/dynamo/partitioning/_adjacency_partitioner.py @@ -95,7 +95,7 @@ class TRTPartitioner(_SplitterBase): # type: ignore Args: module: FX GraphModule to partition operator_support: OperatorSupport class describing allowed operators - allowed_single_node_partition_ops: Nodes which can be included in single-node partitons. + allowed_single_node_partition_ops: Nodes which can be included in single-node partitions. Generally useful for module-level exclusion ops which are intensive despite being single functions min_block_size: Minimum number of computational operators per block require_full_compilation: Require that all computational operators be run in TRT diff --git a/py/torch_tensorrt/dynamo/partitioning/_global_partitioner.py b/py/torch_tensorrt/dynamo/partitioning/_global_partitioner.py index 5982cc95ba..823a43beb8 100644 --- a/py/torch_tensorrt/dynamo/partitioning/_global_partitioner.py +++ b/py/torch_tensorrt/dynamo/partitioning/_global_partitioner.py @@ -26,7 +26,7 @@ class TRTPartitioner(CapabilityBasedPartitioner): # type: ignore[misc] graph_module: FX GraphModule to partition operator_support: OperatorSupport class describing allowed operators non_compute_ops: Operators which are not considered computational (e.g. getattr) - allowed_single_node_partition_ops: Nodes which can be included in single-node partitons. + allowed_single_node_partition_ops: Nodes which can be included in single-node partitions. Generally useful for module-level exclusion ops which are intensive despite being single functions min_block_size: Minimum number of computational operators per block require_full_compilation: Require that all computational operators be run in TRT diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py index b5365bf208..78395b8943 100644 --- a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py @@ -4,7 +4,6 @@ from contextlib import nullcontext from typing import Any, Dict, List, Optional, Tuple -import tensorrt as trt import torch import torch_tensorrt from torch.nn import Module @@ -19,13 +18,15 @@ from torch_tensorrt.dynamo.utils import DYNAMIC_DIM from torch_tensorrt.logging import TRT_LOGGER +import tensorrt as trt + logger = logging.getLogger(__name__) class PythonTorchTensorRTModule(Module): # type: ignore[misc] """PythonTorchTensorRTModule is a PyTorch module which encompasses an arbitrary TensorRT Engine. - This module is backed by the Torch-TensorRT runtime and is only compatibile with + This module is backed by the Torch-TensorRT runtime and is only compatible with FX / Dynamo / Python deployments. This module cannot be serialized to torchscript via torch.jit.trace for C++ deployment. """ diff --git a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py index 1449d4ae36..601147279a 100644 --- a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py @@ -33,7 +33,7 @@ class TorchTensorRTModule(torch.nn.Module): # type: ignore[misc] """TorchTensorRTModule is a PyTorch module which encompasses an arbitrary TensorRT Engine. 
- This module is backed by the Torch-TensorRT runtime and is fully compatibile with both + This module is backed by the Torch-TensorRT runtime and is fully compatible with both FX / Python deployments (just ``import torch_tensorrt`` as part of the application) as well as TorchScript / C++ deployments since TorchTensorRTModule can be passed to ``torch.jit.trace`` and then saved. @@ -41,11 +41,11 @@ class TorchTensorRTModule(torch.nn.Module): # type: ignore[misc] The forward function is simpily forward(*args: torch.Tensor) -> Tuple[torch.Tensor] where the internal implementation is ``return Tuple(torch.ops.tensorrt.execute_engine(list(inputs), self.engine))`` - > Note: TorchTensorRTModule only supports engines built with explict batch + > Note: TorchTensorRTModule only supports engines built with explicit batch Attributes: name (str): Name of module (for easier debugging) - engine (torch.classess.tensorrt.Engine): Torch-TensorRT TensorRT Engine instance, manages [de]serialization, device configuration, profiling + engine (torch.classes.tensorrt.Engine): Torch-TensorRT TensorRT Engine instance, manages [de]serialization, device configuration, profiling input_binding_names (List[str]): List of input TensorRT engine binding names in the order they would be passed to the TRT modules output_binding_names (List[str]): List of output TensorRT engine binding names in the order they should be returned @@ -58,9 +58,7 @@ def __init__( output_binding_names: Optional[List[str]] = None, settings: CompilationSettings = CompilationSettings(), ): - """__init__ method for torch_tensorrt.dynamo.runtime._TorchTensorRTModule.TorchTensorRTModule - - Takes a name, target device, serialized TensorRT engine, and binding names / order and constructs + """Takes a name, target device, serialized TensorRT engine, and binding names / order and constructs a PyTorch ``torch.nn.Module`` around it. If binding names are not provided, it is assumed that the engine binding names follow the following convention: @@ -68,16 +66,17 @@ def __init__( - [symbol].[index in input / output array] - ex. [x.0, x.1, x.2] -> [y.0] - Args: + Arguments: name (str): Name for module serialized_engine (bytearray): Serialized TensorRT engine in the form of a bytearray input_binding_names (List[str]): List of input TensorRT engine binding names in the order they would be passed to the TRT modules output_binding_names (List[str]): List of output TensorRT engine binding names in the order they should be returned - target_device: (torch_tensorrt.Device): Device to instantiate TensorRT engine on. Must be a compatible device i.e. same GPU model / compute capability as was used to build the engine + target_device (torch_tensorrt.Device): Device to instantiate TensorRT engine on. Must be a compatible device i.e. same GPU model / compute capability as was used to build the engine + hardware_compatible (bool): If the engine has been built with the hardware compatibility feature enabled Example: - ..code-block:: py + .. 
code-block:: py with io.BytesIO() as engine_bytes: engine_bytes.write(trt_engine.serialize()) @@ -186,7 +185,7 @@ def forward(self, *inputs: Any) -> torch.Tensor | Tuple[torch.Tensor, ...]: torch.Tensor or Tuple(torch.Tensor): Result of the engine computation """ if self.engine is None: - raise RuntimeError("Engine has not been initalized yet.") + raise RuntimeError("Engine has not been initialized yet.") assert len(inputs) == len( self.input_binding_names @@ -220,7 +219,7 @@ def enable_profiling(self, profiling_results_dir: Optional[str] = None) -> None: profiling_results_dir (str): Absolute path to the directory to sort results of profiling. """ if self.engine is None: - raise RuntimeError("Engine has not been initalized yet.") + raise RuntimeError("Engine has not been initialized yet.") if profiling_results_dir is not None: self.engine.profile_path_prefix = profiling_results_dir @@ -229,7 +228,7 @@ def enable_profiling(self, profiling_results_dir: Optional[str] = None) -> None: def disable_profiling(self) -> None: """Disable the profiler""" if self.engine is None: - raise RuntimeError("Engine has not been initalized yet.") + raise RuntimeError("Engine has not been initialized yet.") self.engine.disable_profiling() @@ -241,7 +240,7 @@ def get_layer_info(self) -> str: str: A JSON string which contains the layer information of the engine incapsulated in this module """ if self.engine is None: - raise RuntimeError("Engine has not been initalized yet.") + raise RuntimeError("Engine has not been initialized yet.") layer_info: str = self.engine.get_engine_layer_info() return layer_info @@ -249,7 +248,7 @@ def get_layer_info(self) -> str: def dump_layer_info(self) -> None: """Dump layer information encoded by the TensorRT engine in this module to STDOUT""" if self.engine is None: - raise RuntimeError("Engine has not been initalized yet.") + raise RuntimeError("Engine has not been initialized yet.") self.engine.dump_engine_layer_info() diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index 7a65e27bfb..acfb2b0094 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -6,7 +6,6 @@ from typing import Any, Callable, Dict, Optional, Sequence, Union import numpy as np -import tensorrt as trt import torch from torch_tensorrt._Device import Device from torch_tensorrt._enums import dtype @@ -14,6 +13,7 @@ from torch_tensorrt.dynamo import _defaults from torch_tensorrt.dynamo._settings import CompilationSettings +import tensorrt as trt from packaging import version from .types import TRTDataType @@ -387,34 +387,3 @@ def check_output( return False return True - - -def unified_dtype_converter( - dtype: Union[TRTDataType, torch.dtype, np.dtype], to: Frameworks -) -> Union[np.dtype, torch.dtype, TRTDataType]: - """ - Convert TensorRT, Numpy, or Torch data types to any other of those data types. - - Args: - dtype (TRTDataType, torch.dtype, np.dtype): A TensorRT, Numpy, or Torch data type. - to (Frameworks): The framework to convert the data type to. - - Returns: - The equivalent data type in the requested framework. 
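The refactored ``torch_tensorrt._enums.dtype`` enum replaces ``unified_dtype_converter`` for these framework-to-framework conversions. A minimal sketch, assuming only the ``_from`` / ``try_from`` / ``to`` methods that this patch already uses:

.. code-block:: py

    import numpy as np
    import torch
    from torch_tensorrt._enums import dtype

    import tensorrt as trt

    # Build the unified enum from a framework dtype, then convert it back out
    d = dtype._from(torch.float16)
    assert d.to(np.dtype) == np.float16
    assert d.to(trt.DataType) == trt.float16

    # try_from returns None instead of raising for unsupported dtypes
    maybe_d = dtype.try_from(np.int64)
    if maybe_d is not None:
        print(maybe_d.to(np.dtype))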
- """ - assert to in Frameworks, f"Expected valid Framework for translation, got {to}" - trt_major_version = int(trt.__version__.split(".")[0]) - if dtype in (np.int8, torch.int8, trt.int8): - return DataTypeEquivalence[trt.int8][to] - elif trt_major_version >= 7 and dtype in (np.bool_, torch.bool, trt.bool): - return DataTypeEquivalence[trt.bool][to] - elif dtype in (np.int32, torch.int32, trt.int32): - return DataTypeEquivalence[trt.int32][to] - elif dtype in (np.int64, torch.int64, trt.int64): - return DataTypeEquivalence[trt.int64][to] - elif dtype in (np.float16, torch.float16, trt.float16): - return DataTypeEquivalence[trt.float16][to] - elif dtype in (np.float32, torch.float32, trt.float32): - return DataTypeEquivalence[trt.float32][to] - else: - raise TypeError("%s is not a supported dtype" % dtype) diff --git a/py/torch_tensorrt/logging.py b/py/torch_tensorrt/logging.py index e75998b870..8447169cc2 100644 --- a/py/torch_tensorrt/logging.py +++ b/py/torch_tensorrt/logging.py @@ -36,10 +36,12 @@ def log(self, severity: trt.ILogger.Severity, msg: str) -> None: class internal_errors: """Context-manager to limit displayed log messages to just internal errors - Example:: + Example: - with torch_tensorrt.logging.internal_errors(): - outputs = model_torchtrt(inputs) + .. code-block:: py + + with torch_tensorrt.logging.internal_errors(): + outputs = model_torchtrt(inputs) """ def __enter__(self) -> None: @@ -73,10 +75,12 @@ def __exit__(self, exc_type: Any, exc_value: Any, exc_tb: Any) -> None: class errors: """Context-manager to limit displayed log messages to just errors and above - Example:: + Example: + + .. code-block:: py - with torch_tensorrt.logging.errors(): - outputs = model_torchtrt(inputs) + with torch_tensorrt.logging.errors(): + outputs = model_torchtrt(inputs) """ def __enter__(self) -> None: @@ -108,10 +112,12 @@ def __exit__(self, exc_type: Any, exc_value: Any, exc_tb: Any) -> None: class warnings: """Context-manager to limit displayed log messages to just warnings and above - Example:: + Example: + + .. code-block:: py - with torch_tensorrt.logging.warnings(): - model_trt = torch_tensorrt.compile(model, **spec) + with torch_tensorrt.logging.warnings(): + model_trt = torch_tensorrt.compile(model, **spec) """ def __enter__(self) -> None: @@ -143,10 +149,12 @@ def __exit__(self, exc_type: Any, exc_value: Any, exc_tb: Any) -> None: class info: """Context-manager to display all info and greater severity messages - Example:: + Example: - with torch_tensorrt.logging.info(): - model_trt = torch_tensorrt.compile(model, **spec) + .. code-block:: py + + with torch_tensorrt.logging.info(): + model_trt = torch_tensorrt.compile(model, **spec) """ def __enter__(self) -> None: @@ -178,10 +186,12 @@ def __exit__(self, exc_type: Any, exc_value: Any, exc_tb: Any) -> None: class debug: """Context-manager to display full debug information through the logger - Example:: + Example: + + .. code-block:: py - with torch_tensorrt.logging.debug(): - model_trt = torch_tensorrt.compile(model, **spec) + with torch_tensorrt.logging.debug(): + model_trt = torch_tensorrt.compile(model, **spec) """ def __enter__(self) -> None: @@ -214,10 +224,12 @@ class graphs: """Context-manager to display the results of intermediate lowering passes as well as full debug information through the logger - Example:: + Example: + + .. 
code-block:: py - with torch_tensorrt.logging.graphs(): - model_trt = torch_tensorrt.compile(model, **spec) + with torch_tensorrt.logging.graphs(): + model_trt = torch_tensorrt.compile(model, **spec) """ def __enter__(self) -> None: diff --git a/py/torch_tensorrt/runtime/__init__.py b/py/torch_tensorrt/runtime/__init__.py index 29895c83d5..d202c897f6 100644 --- a/py/torch_tensorrt/runtime/__init__.py +++ b/py/torch_tensorrt/runtime/__init__.py @@ -1 +1,6 @@ +from torch_tensorrt.dynamo.runtime import ( # noqa: F401 + PythonTorchTensorRTModule, + TorchTensorRTModule, +) + from .multi_device_safe_mode import set_multi_device_safe_mode diff --git a/py/torch_tensorrt/runtime/multi_device_safe_mode.py b/py/torch_tensorrt/runtime/multi_device_safe_mode.py index 0ddd900ab6..547868edf6 100644 --- a/py/torch_tensorrt/runtime/multi_device_safe_mode.py +++ b/py/torch_tensorrt/runtime/multi_device_safe_mode.py @@ -36,6 +36,24 @@ def __exit__(self, *args: Any) -> None: def set_multi_device_safe_mode(mode: bool) -> _MultiDeviceSafeModeContextManager: + """Sets the runtime (Python-only and default) into multi-device safe mode + + In the case that multiple devices are available on the system, in order for the + runtime to execute safely, additional device checks are necessary. These checks + can have a performance impact so they are therefore opt-in. Used to suppress + the warning about running unsafely in a multi-device context. + + Arguments: + mode (bool): Enable (``True``) or disable (``False``) multi-device checks + + Example: + + .. code-block:: py + + with torch_tensorrt.runtime.set_multi_device_safe_mode(True): + results = trt_compiled_module(*inputs) + + """ # Fetch existing safe mode and set new mode for Python global _PY_RT_MULTI_DEVICE_SAFE_MODE old_mode = _PY_RT_MULTI_DEVICE_SAFE_MODE diff --git a/py/torch_tensorrt/ts/_Device.py b/py/torch_tensorrt/ts/_Device.py index 3ae10a9c4d..6e6b39dc2d 100644 --- a/py/torch_tensorrt/ts/_Device.py +++ b/py/torch_tensorrt/ts/_Device.py @@ -38,7 +38,7 @@ def __init__(self, *args: Any, **kwargs: Any): spec (str): String with device spec e.g. "dla:0" for dla, core_id 0 Keyword Arguments: - gpu_id (int): ID of target GPU (will get overrided if dla_core is specified to the GPU managing DLA). If specified, no positional arguments should be provided + gpu_id (int): ID of target GPU (will get overridden if dla_core is specified to the GPU managing DLA). If specified, no positional arguments should be provided dla_core (int): ID of target DLA core. If specified, no positional arguments should be provided. 
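As a small illustration of the device specification forms described here (identifiers are placeholders, and the top-level ``torch_tensorrt.Device`` is assumed to accept the same spec forms as the TorchScript-path ``Device``):

.. code-block:: py

    import torch_tensorrt

    # Target GPU 0 via keyword argument
    gpu_dev = torch_tensorrt.Device(gpu_id=0)

    # Target DLA core 0, allowing unsupported layers to fall back to the GPU
    dla_dev = torch_tensorrt.Device("dla:0", allow_gpu_fallback=True)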
allow_gpu_fallback (bool): Allow TensorRT to schedule operations on GPU if they are not supported on DLA (ignored if device type is not DLA) diff --git a/py/torch_tensorrt/ts/_Input.py b/py/torch_tensorrt/ts/_Input.py index 6099efbcd2..348cdcf56d 100644 --- a/py/torch_tensorrt/ts/_Input.py +++ b/py/torch_tensorrt/ts/_Input.py @@ -33,11 +33,11 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: Keyword Arguments: shape (Tuple or List, optional): Static shape of input tensor min_shape (Tuple or List, optional): Min size of input tensor's shape range - Note: All three of min_shape, opt_shape, max_shape must be provided, there must be no positional arguments, shape must not be defined and implictly this sets Input's shape_mode to DYNAMIC + Note: All three of min_shape, opt_shape, max_shape must be provided, there must be no positional arguments, shape must not be defined and implicitly this sets Input's shape_mode to DYNAMIC opt_shape (Tuple or List, optional): Opt size of input tensor's shape range - Note: All three of min_shape, opt_shape, max_shape must be provided, there must be no positional arguments, shape must not be defined and implictly this sets Input's shape_mode to DYNAMIC + Note: All three of min_shape, opt_shape, max_shape must be provided, there must be no positional arguments, shape must not be defined and implicitly this sets Input's shape_mode to DYNAMIC max_shape (Tuple or List, optional): Max size of input tensor's shape range - Note: All three of min_shape, opt_shape, max_shape must be provided, there must be no positional arguments, shape must not be defined and implictly this sets Input's shape_mode to DYNAMIC + Note: All three of min_shape, opt_shape, max_shape must be provided, there must be no positional arguments, shape must not be defined and implicitly this sets Input's shape_mode to DYNAMIC dtype (torch.dtype or torch_tensorrt.dtype): Expected data type for input tensor (default: torch_tensorrt.dtype.float32) format (torch.memory_format or torch_tensorrt.TensorFormat): The expected format of the input tensor (default: torch_tensorrt.TensorFormat.NCHW) tensor_domain (Tuple(float, float), optional): The domain of allowed values for the tensor, as interval notation: [tensor_domain[0], tensor_domain[1]). diff --git a/py/torch_tensorrt/ts/_compile_spec.py b/py/torch_tensorrt/ts/_compile_spec.py index 1574de02f3..4843ec0145 100644 --- a/py/torch_tensorrt/ts/_compile_spec.py +++ b/py/torch_tensorrt/ts/_compile_spec.py @@ -232,7 +232,7 @@ def _parse_compile_spec(compile_spec_: Dict[str, Any]) -> _ts_C.CompileSpec: else: raise KeyError( - 'Module input definitions are requried to compile module. Provide a list of torch_tensorrt.Input keyed to "inputs" in the compile spec' + 'Module input definitions are required to compile module. Provide a list of torch_tensorrt.Input keyed to "inputs" in the compile spec' ) if "enabled_precisions" in compile_spec: @@ -323,7 +323,7 @@ def TensorRTCompileSpec( calibrator: object = None, allow_shape_tensors: bool = False, ) -> torch.classes.tensorrt.CompileSpec: - """Utility to create a formated spec dictionary for using the PyTorch TensorRT backend + """Utility to create a formatted spec dictionary for using the PyTorch TensorRT backend Keyword Args: inputs (List[Union(torch_tensorrt.Input, torch.Tensor)]): **Required** List of specifications of input shape, dtype and memory layout for inputs to the module. This argument is required. Input Sizes can be specified as torch sizes, tuples or lists. 
dtypes can be specified using @@ -359,7 +359,7 @@ def TensorRTCompileSpec( allow_shape_tensors: (Experimental) Allow aten::size to output shape tensors using IShapeLayer in TensorRT Returns: - torch.classes.tensorrt.CompileSpec: List of methods and formated spec objects to be provided to ``torch._C._jit_to_tensorrt`` + torch.classes.tensorrt.CompileSpec: List of methods and formatted spec objects to be provided to ``torch._C._jit_to_tensorrt`` """ compile_spec = { diff --git a/py/torch_tensorrt/ts/_compiler.py b/py/torch_tensorrt/ts/_compiler.py index 3be9b7a4c2..675c245ac8 100644 --- a/py/torch_tensorrt/ts/_compiler.py +++ b/py/torch_tensorrt/ts/_compiler.py @@ -9,8 +9,6 @@ from torch_tensorrt._Input import Input from torch_tensorrt.ts._compile_spec import _parse_compile_spec, _parse_device -from torch_tensorrt import _enums - def compile( module: torch.jit.ScriptModule, @@ -95,7 +93,7 @@ def compile( truncate_long_and_double (bool): Truncate weights provided in int64 or double (float64) to int32 and float32 calibrator (Union(torch_tensorrt._C.IInt8Calibrator, tensorrt.IInt8Calibrator)): Calibrator object which will provide data to the PTQ system for INT8 Calibration require_full_compilation (bool): Require modules to be compiled end to end or return an error as opposed to returning a hybrid graph where operations that cannot be run in TensorRT are run in PyTorch - min_block_size (int): The minimum number of contiguous TensorRT convertable operations in order to run a set of operations in TensorRT + min_block_size (int): The minimum number of contiguous TensorRT convertible operations in order to run a set of operations in TensorRT torch_executed_ops (List[str]): List of aten operators that must be run in PyTorch. An error will be thrown if this list is not empty but ``require_full_compilation`` is True torch_executed_modules (List[str]): List of modules that must be run in PyTorch. An error will be thrown if this list is not empty but ``require_full_compilation`` is True allow_shape_tensors: (Experimental) Allow aten::size to output shape tensors using IShapeLayer in TensorRT diff --git a/py/torch_tensorrt/ts/ptq.py b/py/torch_tensorrt/ts/ptq.py index d129ea2824..6545de9674 100644 --- a/py/torch_tensorrt/ts/ptq.py +++ b/py/torch_tensorrt/ts/ptq.py @@ -73,14 +73,15 @@ def __reduce__(self: object) -> str: class DataLoaderCalibrator(object): """ - Constructs a calibrator class in TensorRT and uses pytorch dataloader to load/preproces + Constructs a calibrator class in TensorRT and uses pytorch dataloader to load/preprocess data which is passed during calibration. - Args: - dataloader: an instance of pytorch dataloader which iterates through a given dataset. - algo_type: choice of calibration algorithm. - cache_file: path to cache file. - use_cache: flag which enables usage of pre-existing cache. - device: device on which calibration data is copied to. + + Arguments: + dataloader (torch.utils.data.DataLoader): an instance of pytorch dataloader which iterates through a given dataset. + algo_type (CalibrationAlgo): choice of calibration algorithm. + cache_file (str): path to cache file. + use_cache (bool): flag which enables usage of pre-existing cache. + device (Device): device on which calibration data is copied to. """ def __init__(self, **kwargs: Any): @@ -164,9 +165,10 @@ def __new__(cls, *args: Any, **kwargs: Any) -> Self: class CacheCalibrator(object): """ Constructs a calibrator class in TensorRT which directly uses pre-existing cache file for calibration. 
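To illustrate how these TorchScript-path calibrators are typically constructed (the dataset, dataloader, and cache path below are placeholders, and the import location assumes the ``torch_tensorrt.ts.ptq`` module this hunk edits):

.. code-block:: py

    import torch
    from torch_tensorrt.ts import ptq

    # Placeholder calibration data; a representative dataset should be used in practice
    calib_data = torch.utils.data.TensorDataset(
        torch.randn(64, 3, 32, 32), torch.zeros(64, dtype=torch.long)
    )
    calib_loader = torch.utils.data.DataLoader(calib_data, batch_size=8)

    calibrator = ptq.DataLoaderCalibrator(
        calib_loader,
        cache_file="./calibration.cache",
        use_cache=False,
        algo_type=ptq.CalibrationAlgo.ENTROPY_CALIBRATION_2,
        device=torch.device("cuda:0"),
    )

    # On later runs, the cached scales can be reused without a dataloader
    cached_calibrator = ptq.CacheCalibrator(cache_file="./calibration.cache")

Either object is what the ``calibrator`` keyword argument described earlier in this patch expects.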
- Args: - cache_file: path to cache file. - algo_type: choice of calibration algorithm. + + Arguments: + cache_file (str): path to cache file. + algo_type (CalibrationAlgo): choice of calibration algorithm. """ def __init__(self, **kwargs: Any): diff --git a/pyproject.toml b/pyproject.toml index f9e1f27690..d44baa3c07 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ classifiers = [ "Topic :: Software Development", "Topic :: Software Development :: Libraries", ] -readme = {file = "py/README.md", content-type = "text/markdown"} +readme = {file = "README.md", content-type = "text/markdown"} requires-python = ">=3.8" keywords = ["pytorch", "torch", "tensorrt", "trt", "ai", "artificial intelligence", "ml", "machine learning", "dl", "deep learning", "compiler", "dynamo", "torchscript", "inference"] dependencies = [ @@ -218,3 +218,27 @@ disallow_untyped_calls = false module = "torch_tensorrt.fx.*" ignore_errors = true follow_imports = "skip" + +[tool.typos] +files.extend-exclude = [ + "docs/**/*", + "*/fx/*", + "docsrc/_rendered_examples/", + "core/*", + "!core/runtime/", + "third_party/", + "CHANGELOG.md", + "*.ipynb", + "cpp/", + "py/torch_tensorrt/fx/" +] + +[tool.typos.default] +extend-ignore-identifiers-re = [ + "^([A-z]|[a-z])*Nd*", + "^([A-z]|[a-z])*nd*", + "activ*([A-z]|[a-z]|[0-9])*," +] + +[tool.typos.default.extend-words] +arange = "arange" diff --git a/setup.py b/setup.py index c74fb9e130..5e3b8d9dfa 100644 --- a/setup.py +++ b/setup.py @@ -140,6 +140,10 @@ def load_dep_info(): sys.argv.remove("--use-cxx11-abi") CXX11_ABI = True +if (cxx11_abi_env_var := os.environ.get("USE_CXX11_ABI")) is not None: + if cxx11_abi_env_var == "1": + CXX11_ABI = True + if platform.uname().processor == "aarch64": if "--jetpack-version" in sys.argv: version_idx = sys.argv.index("--jetpack-version") + 1 @@ -617,7 +621,7 @@ def run(self): } ) -with open("README.md", "r", encoding="utf-8") as fh: +with open(os.path.join(get_root_dir(), "README.md"), "r", encoding="utf-8") as fh: long_description = fh.read() setup( diff --git a/tests/README.md b/tests/README.md index d1ad177ea7..0dd2eb544e 100644 --- a/tests/README.md +++ b/tests/README.md @@ -5,7 +5,7 @@ Currently, following tests are supported: 2. Module level tests 3. Accuracy tests -The goal of Converter tests are to tests individual converters againsts specific subgraphs. The current tests in `core/conveters` are good examples on how to write these tests. In general every converter should have at least 1 test. More may be required if the operation has switches that change the behavior of the op. +The goal of Converter tests is to test individual converters against specific subgraphs. The current tests in `core/converters` are good examples of how to write these tests. In general every converter should have at least 1 test. More may be required if the operation has switches that change the behavior of the op. Module tests are designed to test the compiler against common network architectures and verify the integration of converters together into a single engine. @@ -44,7 +44,7 @@ Note: Supported Python tests ``` ### Testing using pre-built Torch-TensorRT library -Currently, the default strategy when we run all the tests (`bazel test //tests`) is to build the testing scripts along with the full Torch-TensorRT library (`libtorchtrt.so`) from scratch. This can lead to increased testing time and might not be needed incase you already have a pre-built Torch-TensorRT library that you want to link against. 
+Currently, the default strategy when we run all the tests (`bazel test //tests`) is to build the testing scripts along with the full Torch-TensorRT library (`libtorchtrt.so`) from scratch. This can lead to increased testing time and might not be needed in case you already have a pre-built Torch-TensorRT library that you want to link against. In order to **not** build the entire Torch-TensorRT library and only build the test scripts, please use the following command. diff --git a/tests/py/dynamo/conversion/harness.py b/tests/py/dynamo/conversion/harness.py index e8c9882c1f..4f11b05730 100644 --- a/tests/py/dynamo/conversion/harness.py +++ b/tests/py/dynamo/conversion/harness.py @@ -1,3 +1,5 @@ +# type: ignore + import logging import time import unittest @@ -192,7 +194,7 @@ def assert_has_op(self, mod, ops): ops_in_mod.add(node.target) self.assertTrue( - ops_in_mod >= ops, f"expected ops {ops}, actuall ops {ops_in_mod}" + ops_in_mod >= ops, f"expected ops {ops}, actual ops {ops_in_mod}" ) def assert_unexpected_op(self, mod, ops): diff --git a/tests/py/dynamo/conversion/test_index_put_aten.py b/tests/py/dynamo/conversion/test_index_put_aten.py index 13e5853e51..f1101144dc 100644 --- a/tests/py/dynamo/conversion/test_index_put_aten.py +++ b/tests/py/dynamo/conversion/test_index_put_aten.py @@ -135,7 +135,7 @@ class TestIndexPutConverter(DispatchTestCase): value_tensor=torch.tensor([5.5, 7.5], dtype=torch.float32), ), # param( - # test_name="3d_indices_float_broadcase_index", + # test_name="3d_indices_float_broadcast_index", # source_tensor=torch.zeros([3, 3, 3], dtype = torch.int32), # indices_tensor=( # torch.tensor([0,1], dtype=torch.int32), diff --git a/tests/py/dynamo/conversion/test_linear_aten.py b/tests/py/dynamo/conversion/test_linear_aten.py index 63b324c78f..615f40fb2f 100644 --- a/tests/py/dynamo/conversion/test_linear_aten.py +++ b/tests/py/dynamo/conversion/test_linear_aten.py @@ -69,7 +69,7 @@ def forward(self, x): # ) ## Testing with (-1, -1, 512) results into following error: - ## AssertionError: Currently we only support one dynmaic dim for linear and it can't be the last dim. + ## AssertionError: Currently we only support one dynamic dim for linear and it can't be the last dim. 
if __name__ == "__main__": diff --git a/tests/py/dynamo/models/test_dtype_support.py b/tests/py/dynamo/models/test_dtype_support.py index cc576715ff..1ab0848828 100644 --- a/tests/py/dynamo/models/test_dtype_support.py +++ b/tests/py/dynamo/models/test_dtype_support.py @@ -258,3 +258,40 @@ def forward(self, x): DECIMALS_OF_AGREEMENT, msg=f"Torch outputs and TRT outputs don't match close enough.", ) + + def test_bf16_torch_compile(self): + class MyModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(20, 30) + + def forward(self, x): + return self.linear(x) + + device = torch.device("cuda", 0) + mod = MyModule().eval().to(device).bfloat16() + inputs = [torch.randn((128, 20), dtype=torch.bfloat16, device=device)] + + with torch.inference_mode(): + trt_mod = torch_tensorrt.compile( + mod, + ir="torch_compile", + inputs=inputs, + enabled_precisions={torch.bfloat16}, + debug=True, + min_block_size=1, + device=device, + ) + + torch_model_results = mod(*inputs) + optimized_model_results = trt_mod(*inputs) + + max_diff = float( + torch.max(torch.abs(optimized_model_results - torch_model_results)) + ) + self.assertAlmostEqual( + max_diff, + 0, + DECIMALS_OF_AGREEMENT, + msg=f"Torch outputs and TRT outputs don't match close enough.", + ) diff --git a/tools/cpp_benchmark/README.md b/tools/cpp_benchmark/README.md index f7c08636ab..3acc3e7441 100644 --- a/tools/cpp_benchmark/README.md +++ b/tools/cpp_benchmark/README.md @@ -30,4 +30,4 @@ You can run a module with JIT or TRT via Torch-TensorRT in either FP32 or FP16. - To also save the TRT engine, add the argument `--cxxopt="-DSAVE_ENGINE"` -> It's suggested to also define `--cxxopt="-DNDEBUG"` to supress debug information +> It's suggested to also define `--cxxopt="-DNDEBUG"` to suppress debug information diff --git a/tools/opset_coverage.ipynb b/tools/opset_coverage.ipynb index 5cbc802138..667cf51dad 100644 --- a/tools/opset_coverage.ipynb +++ b/tools/opset_coverage.ipynb @@ -2,9 +2,352 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::relu(Tensor input) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::relu_(Tensor self) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::sigmoid(Tensor input) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::sigmoid_(Tensor self) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::tanh(Tensor input) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::tanh_(Tensor self) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::hardtanh(Tensor self, Scalar min_val, Scalar max_val) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::hardtanh_(Tensor self, Scalar min_val, Scalar max_val) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::prelu(Tensor self, Tensor weight) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for 
aten::leaky_relu(Tensor self, Scalar negative_slope) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::leaky_relu_(Tensor self, Scalar negative_slope) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::elu(Tensor self, Scalar alpha, Scalar scale, Scalar input_scale) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::batch_norm(Tensor input, Tensor? gamma, Tensor? beta, Tensor? mean, Tensor? var, bool training, float momentum, float eps, bool cudnn_enabled) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::instance_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool use_input_stats, float momentum, float eps, bool cudnn_enabled) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::bitwise_not(Tensor self) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::to(Tensor self, int dtype, bool non_blocking, bool copy, int? memory_format) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::to(Tensor self, Device device, int dtype, bool non_blocking, bool copy, int? memory_format) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::to(Tensor self, Tensor other, bool non_blocking, bool copy, int? memory_format) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::to(Tensor self, Device? device, int? dtype, bool non_blocking, bool copy) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::chunk(Tensor self, int chunks, int dim) -> Tensor[]\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::cat(Tensor[] tensors, int dim) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for trt::const(Tensor self) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::constant_pad_nd(Tensor self, int[] pad, Scalar value) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::_convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::_convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::cumsum(Tensor self, int dim, *, int? dtype) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::einsum(str equation, Tensor[] tensors, *, int[]? 
path) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::add(Tensor self, Tensor other, Scalar alpha) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::add_(Tensor self, Tensor other, *, Scalar alpha) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::add(Tensor self, Scalar other, Scalar alpha) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::clamp(Tensor self, Scalar? min, Scalar? max) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::clamp_min(Tensor self, Scalar min) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::clamp_max(Tensor self, Scalar max) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::sub(Tensor self, Tensor other, Scalar alpha) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::sub(Tensor self, Scalar other, Scalar alpha) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::sub_(Tensor self, Tensor other, *, Scalar alpha) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::rsub(Tensor self, Scalar other, Scalar alpha) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::rsub(Tensor self, Tensor other, Scalar alpha) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::div(Tensor self, Tensor other) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::div(Tensor self, Tensor other, *, str? 
rounding_mode) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::div(Tensor self, Scalar other) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::div_(Tensor self, Tensor other) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::div_(Tensor self, Scalar other) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::square(Tensor self) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::mul(Tensor self, Tensor other) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::mul(Tensor self, Scalar other) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::mul_(Tensor self, Tensor other) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::ne(Tensor self, Tensor other) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::ne(Tensor self, Scalar other) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::pow(Tensor self, Tensor exponent) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::pow(Tensor self, Scalar exponent) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::floor_divide(Tensor self, Tensor other) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::floor_divide(Tensor self, Scalar other) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::max(Tensor self, Tensor other) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::min(Tensor self, Tensor other) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::gt(Tensor self, Tensor other) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::gt(Tensor self, Scalar other) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::lt(Tensor self, Tensor other) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::lt(Tensor self, Scalar other) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::eq(Tensor self, Tensor other) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::eq(Tensor self, Scalar other) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::ge(Tensor self, Tensor other) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::ge(Tensor self, Scalar other) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::le(Tensor self, Tensor other) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::le(Tensor self, Scalar other) -> 
Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::logical_and(Tensor self, Tensor other) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::atan2(Tensor self, Tensor other) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::expand(Tensor self, int[] size, *, bool implicit) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::expand_as(Tensor self, Tensor other) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::repeat(Tensor self, int[] repeats) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::repeat_interleave(Tensor self, int repeats, int? dim, *, int? output_size) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::meshgrid(Tensor[] tensors) -> Tensor[]\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for trt::attn_bias_from_attn_mask(Tensor attn_mask) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::upsample_nearest1d(Tensor self, int[] output_size, float? scales) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::upsample_nearest1d(Tensor input, int[]? output_size, float[]? scale_factors) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::upsample_nearest2d(Tensor self, int[] output_size, float? scales_h, float? scales_w) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::upsample_nearest2d(Tensor input, int[]? output_size, float[]? scale_factors) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::upsample_nearest3d(Tensor self, int[] output_size, float? scales_d, float? scales_h, float? scales_w) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::upsample_nearest3d(Tensor input, int[]? output_size, float[]? scale_factors) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::upsample_linear1d(Tensor self, int[] output_size, bool align_corners, float? scales) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::upsample_linear1d(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::upsample_bilinear2d(Tensor self, int[] output_size, bool align_corners, float? scales_h, float? scales_w) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::upsample_bilinear2d(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::upsample_trilinear3d(Tensor self, int[] output_size, bool align_corners, float? scales_d, float? scales_h, float? 
scales_w) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::upsample_trilinear3d(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::grid_sampler(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::layer_norm(Tensor input, int[] normalized_shape, Tensor? gamma, Tensor? beta, float eps, bool cudnn_enabled) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::linear(Tensor input, Tensor weight, Tensor? bias) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::gru_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih, Tensor? b_hh) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::lstm_cell(Tensor input, Tensor[] hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih, Tensor? b_hh) -> (Tensor, Tensor)\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::matmul(Tensor self, Tensor other) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::bmm(Tensor self, Tensor mat2) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::baddbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta, Scalar alpha) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::max(Tensor self, int dim, bool keepdim) -> (Tensor, Tensor)\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::min(Tensor self, int dim, bool keepdim) -> (Tensor, Tensor)\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::argmax(Tensor self, int dim, bool keepdim) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::argmin(Tensor self, int dim, bool keepdim) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::norm(Tensor self, Scalar? p, int[] dim, bool keepdim) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::frobenius_norm(Tensor self, int[] dim, bool keepdim) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::linalg_norm(Tensor self, Scalar? ord, int[]? dim, bool keepdim, *, int? 
dtype) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::max_pool1d(Tensor self, int[] kernel_size, int[] stride, int[] padding, int[] dilation, bool ceil_mode) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::avg_pool1d(Tensor self, int[] kernel_size, int[] stride, int[] padding, bool ceil_mode, bool count_include_pad) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::max_pool2d(Tensor self, int[] kernel_size, int[] stride, int[] padding, int[] dilation, bool ceil_mode) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::avg_pool2d(Tensor self, int[] kernel_size, int[] stride, int[] padding, bool ceil_mode, bool count_include_pad, int? divisor_override) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::max_pool3d(Tensor self, int[] kernel_size, int[] stride, int[] padding, int[] dilation, bool ceil_mode) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::avg_pool3d(Tensor self, int[] kernel_size, int[] stride, int[] padding, bool ceil_mode, bool count_include_pad, int? divisor_override) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::adaptive_avg_pool1d(Tensor self, int[] output_size) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::adaptive_max_pool1d(Tensor self, int[] output_size) -> (Tensor, Tensor)\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::adaptive_avg_pool2d(Tensor self, int[] output_size) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::adaptive_max_pool2d(Tensor self, int[] output_size) -> (Tensor, Tensor)\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::adaptive_avg_pool3d(Tensor self, int[] output_size) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::adaptive_max_pool3d(Tensor self, int[] output_size) -> (Tensor, Tensor)\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::fake_quantize_per_tensor_affine(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::fake_quantize_per_tensor_affine(Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::fake_quantize_per_channel_affine(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::mean(Tensor self, *, int? dtype) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::mean(Tensor self, int[] dim, bool keepdim, *, int? dtype) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::sum(Tensor self, *, int? 
dtype) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::sum(Tensor self, int[] dim, bool keepdim, *, int? dtype) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::prod(Tensor self, *, int? dtype) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::prod(Tensor self, int dim, bool keepdim, *, int? dtype) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::max(Tensor self) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::min(Tensor self) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::any(Tensor self, int dim, bool keepdim) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::all(Tensor self, int dim, bool keepdim) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::reflection_pad2d(Tensor self, int[] padding) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::reflection_pad1d(Tensor self, int[] padding) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::replication_pad1d(Tensor self, int[] padding) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::replication_pad2d(Tensor self, int[] padding) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::replication_pad3d(Tensor self, int[] padding) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::select(Tensor self, int dim, int index) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::index_select(Tensor self, int dim, Tensor index) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::narrow(Tensor self, int dim, int start, int length) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::narrow(Tensor self, int dim, Tensor start, int length) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::embedding(Tensor weight, Tensor indices, int padding_idx, bool scale_grad_by_freq, bool sparse) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::roll(Tensor self, int[] shifts, int[] dims) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::index(Tensor self, Tensor?[] indices) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::slice(Tensor self, int dim, int? start, int? 
end, int step) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::split(Tensor self, int[] split_sizes, int dim) -> Tensor[]\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::split(Tensor self, int[] split_size, int dim) -> Tensor[]\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::split(Tensor self, int split_size, int dim) -> Tensor[]\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::split_with_sizes(Tensor self, int[] split_sizes, int dim) -> Tensor[]\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::unbind(Tensor self, int dim) -> Tensor[]\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::masked_fill(Tensor self, Tensor mask, Scalar value) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::scatter(Tensor self, int dim, Tensor index, Scalar value) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::scatter(Tensor self, int dim, Tensor index, Tensor src) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::where(Tensor condition, Tensor self, Tensor other) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::where(Tensor condition, Tensor self, Scalar other) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::flip(Tensor self, int[] dims) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::flatten(Tensor self, int start_dim, int end_dim) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::unflatten(Tensor self, int dim, int[] sizes) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::reshape(Tensor self, int[] shape) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::view(Tensor self, int[] size) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::permute(Tensor self, int[] dims) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::transpose(Tensor self, int dim0, int dim1) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::t(Tensor self) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::pixel_shuffle(Tensor self, int upscale_factor) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::softmax(Tensor self, int dim, int? 
dtype) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::squeeze(Tensor self, int dim) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::squeeze(Tensor self) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::stack(Tensor[] tensors, int dim) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::topk(Tensor self, int k, int dim, bool largest, bool sorted) -> (Tensor, Tensor)\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::abs(Tensor self) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::reciprocal(Tensor self) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::log2(Tensor self) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::logical_not(Tensor self) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::sqrt(Tensor self) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::isfinite(Tensor self) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::cos(Tensor self) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::acos(Tensor self) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::cosh(Tensor self) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::sin(Tensor self) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::asin(Tensor self) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::sinh(Tensor self) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::tan(Tensor self) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::atan(Tensor self) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::floor(Tensor self) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::log(Tensor self) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::ceil(Tensor self) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::exp(Tensor self) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::neg(Tensor self) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::erf(Tensor self) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::sign(Tensor self) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::asinh(Tensor self) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::acosh(Tensor 
self) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::atanh(Tensor self) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering converter for aten::unsqueeze(Tensor self, int dim) -> Tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::eq\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::ne\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::lt\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::gt\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::le\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::ge\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::pow\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::__and__\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::__or__\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::__xor__\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::__round_to_zero_floordiv\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::zeros\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::ones\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::new_zeros\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::new_ones\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::zeros_like\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::ones_like\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::fill_\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::full\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::full_like\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::slice\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::len\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::size\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::__getitem__\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::append\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::extend\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::neg\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::add\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::add_\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::mul\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - 
Registering evaluator for aten::sub\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::Bool\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::Float\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::Int\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::__not__\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::__is__\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::__isnot__\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::numel\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::dim\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::div\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::floordiv\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::floor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::sqrt\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::warn\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::is_floating_point\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::tensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::arange\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::clone\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::copy_\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::format\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::__range_length\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::__derive_index\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for aten::list\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for prim::Constant\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for prim::NumToTensor\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for prim::ListUnpack\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for prim::ListConstruct\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for prim::dtype\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for prim::min\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for prim::max\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for prim::shape\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for prim::TupleConstruct\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for prim::TupleIndex\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug 
Build] - Registering evaluator for prim::TupleUnpack\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for prim::unchecked_cast\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for prim::Uninitialized\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Registering evaluator for prim::RaiseException\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT - Debug Build] - Runtime:\n", + " Available CUDA Devices: \n", + " Device(ID: 0, Name: NVIDIA TITAN V, SM Capability: 7.0, Type: GPU)\n", + "\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - CaskDeconvV2RunnerWeightsTransformerPlugin, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - CaskDeconvV1RunnerWeightsTransformerPlugin, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - CaskConvolutionRunnerWeightsTransformerPlugin, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - CaskFlattenConvolutionRunnerWeightsTransformerPlugin, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - CaskConvActPoolWeightsTransformerPlugin, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - CaskDepSepConvWeightsTransformerPlugin, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - MyelinWeightsTransformPlugin, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - DisentangledAttention_TRT, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - CustomEmbLayerNormPluginDynamic, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - CustomEmbLayerNormPluginDynamic, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - CustomEmbLayerNormPluginDynamic, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - CustomFCPluginDynamic, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - CustomGeluPluginDynamic, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - GroupNormalizationPlugin, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - CustomSkipLayerNormPluginDynamic, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - CustomSkipLayerNormPluginDynamic, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - CustomSkipLayerNormPluginDynamic, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - CustomSkipLayerNormPluginDynamic, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - RnRes2Br1Br2c_TRT, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - RnRes2Br1Br2c_TRT, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin 
creator - RnRes2Br2bBr2c_TRT, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - RnRes2Br2bBr2c_TRT, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - RnRes2FullFusion_TRT, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - SingleStepLSTMPlugin, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - CustomQKVToContextPluginDynamic, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - CustomQKVToContextPluginDynamic, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - CustomQKVToContextPluginDynamic, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - DLRM_BOTTOM_MLP_TRT, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - SmallTileGEMM_TRT, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - RNNTEncoderPlugin, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - Interpolate, Namespace: torch_tensorrt\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - NormalizePlugin, Namespace: torch_tensorrt\n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - BatchedNMSDynamic_TRT, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - BatchedNMS_TRT, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - BatchTilePlugin_TRT, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - Clip_TRT, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - CoordConvAC, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - CropAndResizeDynamic, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - CropAndResize, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - DecodeBbox3DPlugin, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - DetectionLayer_TRT, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - EfficientNMS_Explicit_TF_TRT, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - EfficientNMS_Implicit_TF_TRT, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - EfficientNMS_ONNX_TRT, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - EfficientNMS_TRT, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - FlattenConcat_TRT, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - GenerateDetection_TRT, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - 
Registered plugin creator - GridAnchor_TRT, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - GridAnchorRect_TRT, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - InstanceNormalization_TRT, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - InstanceNormalization_TRT, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - LReLU_TRT, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - ModulatedDeformConv2d, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - MultilevelCropAndResize_TRT, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - MultilevelProposeROI_TRT, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - MultiscaleDeformableAttnPlugin_TRT, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - NMSDynamic_TRT, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - NMS_TRT, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - Normalize_TRT, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - PillarScatterPlugin, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - PriorBox_TRT, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - ProposalDynamic, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - ProposalLayer_TRT, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - Proposal, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - PyramidROIAlign_TRT, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - Region_TRT, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - Reorg_TRT, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - Reorg_TRT, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - ResizeNearest_TRT, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - ROIAlign_TRT, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - RPROI_TRT, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - ScatterElements, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - ScatterND, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - SpecialSlice_TRT, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Registered plugin creator - Split, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT 
Plugins Context] - Registered plugin creator - VoxelGeneratorPlugin, Namespace: \n", + "\u001b[1;35mDEBUG: \u001b[0m[Torch-TensorRT Plugins Context] - Total number of plugins registered: 76\n", + "WARNING:torch_tensorrt.dynamo.conversion.aten_ops_converters:Unable to import quantization op. Please install modelopt library (https://github.com/NVIDIA/TensorRT-Model-Optimizer?tab=readme-ov-file#installation) to add support for compiling quantized models\n" + ] + } + ], "source": [ "import torch_tensorrt\n", "from torch_tensorrt.dynamo.tools.opset_coverage import ATEN_COVERAGE, PRIMS_COVERAGE, PY_OVERLOAD_COVERAGE, SupportStatus, OpsetCoverage" @@ -12,9 +355,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Unsupported Ops:\n", + "aten.nonzero(Tensor self) -> Tensor\n", + "aten.resize_(Tensor(a!) self, SymInt[] size, \\*, MemoryFormat? memory_format=None) -> Tensor(a!)\n", + "aten.empty_strided(SymInt[] size, SymInt[] stride, \\*, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor\n", + "aten.index_put(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor\n", + "aten.max_pool3d_with_indices(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor)\n", + "aten.gather(Tensor self, int dim, Tensor index, \\*, bool sparse_grad=False) -> Tensor\n", + "aten.max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor)\n", + "aten.sym_stride.int(Tensor self, int dim) -> SymInt\n", + "aten.scatter_reduce.two(Tensor self, int dim, Tensor index, Tensor src, str reduce, \\*, bool include_self=True) -> Tensor\n", + "aten.scatter_add(Tensor self, int dim, Tensor index, Tensor src) -> Tensor\n", + "aten.empty.memory_format(SymInt[] size, \\*, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor\n", + "aten.native_dropout(Tensor input, float p, bool? train) -> (Tensor, Tensor)\n", + "aten.sym_storage_offset(Tensor self) -> SymInt\n", + "aten._local_scalar_dense(Tensor self) -> Scalar\n", + "\n", + "Backwards Ops:\n", + "aten.avg_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, bool ceil_mode, bool count_include_pad, int? divisor_override) -> Tensor\n", + "aten._adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) -> Tensor\n", + "aten.convolution_backward(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? 
bias_sizes, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)\n", + "aten.max_pool2d_with_indices_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool ceil_mode, Tensor indices) -> Tensor\n" + ] + } + ], "source": [ "unsupported_ops = {}\n", "backwards_ops = {}\n", @@ -37,9 +408,140 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Unsupported Ops:\n", + "prims.(Tensor a, int[] dims) -> Tensor\n", + "prims.(Tensor self, Tensor other) -> Tensor\n", + "prims.(Tensor a, SymInt[] stride) -> Tensor\n", + "prims.(Tensor(a!) a, SymInt[] shape) -> Tensor(a!)\n", + "prims.(Tensor[] tensors, int dim) -> Tensor\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor self, Tensor other) -> Tensor\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor(a) self) -> Tensor(a)\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor self, \\*, int[] dim, bool forward) -> Tensor\n", + "prims.(Tensor self, Tensor src, SymInt[] size, SymInt[] stride, SymInt storage_offset) -> Tensor\n", + "prims.(Tensor(a) a, ScalarType dtype) -> Tensor(a)\n", + "prims.(Tensor(a) a, int dim, SymInt outer_length) -> Tensor(a)\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor a, Device device) -> Tensor\n", + "prims.(Tensor self, Tensor other) -> Tensor\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Scalar s, \\*, ScalarType? dtype=None, Device? device=None) -> Tensor\n", + "prims.(Tensor self, Tensor other) -> Tensor\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(SymInt[] shape, \\*, Scalar low, Scalar high, ScalarType dtype, Device device, Generator? generator=None) -> Tensor\n", + "prims.(Tensor A, \\*, bool full_matrices) -> (Tensor U, Tensor S, Tensor Vh)\n", + "prims.(Tensor self, Tensor other) -> Tensor\n", + "prims.(Tensor self, Tensor other) -> Tensor\n", + "prims.(Tensor self, \\*, int[] dim, SymInt last_dim_size) -> Tensor\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor a, ScalarType dtype) -> Tensor\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor self) -> (Tensor mantissa, Tensor exponent)\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor self, Tensor other) -> Tensor\n", + "prims.(Tensor self, Scalar value) -> Tensor\n", + "prims.(Tensor self, Tensor other) -> Tensor\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor self, \\*, int[] dim, bool onesided) -> Tensor\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(ScalarType dtype) -> Scalar\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor self, \\*, MemoryFormat? 
memory_format=None) -> Tensor\n", + "prims.(Tensor self, Tensor other) -> Tensor\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor self, Tensor other) -> Tensor\n", + "prims.(SymInt[] shape, int[] physical_layout, \\*, ScalarType dtype, Device device, bool requires_grad) -> Tensor\n", + "prims.(Tensor self, Tensor other) -> Tensor\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor(a) a, int start, int end) -> Tensor(a)\n", + "prims.(Tensor(a) a, int[] dimensions) -> Tensor(a)\n", + "prims.(ScalarType dtype) -> Scalar\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor(a) a, SymInt[] start_indices, SymInt[] limit_indices, SymInt[]? strides=None) -> Tensor(a)\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor(a) a, int[] permutation) -> Tensor(a)\n", + "prims.(SymInt[] shape, \\*, Scalar mean, Scalar std, ScalarType dtype, Device device, bool requires_grad, Generator? generator=None) -> Tensor\n", + "prims.(Tensor a, SymInt[] shape) -> Tensor\n", + "prims.(SymInt length, \\*, SymInt start, SymInt step, ScalarType dtype, Device device, bool requires_grad) -> Tensor\n", + "prims.(Tensor(a!) a, Tensor b) -> Tensor(a!)\n", + "prims.() -> Tensor\n", + "prims.(Tensor self, Tensor other) -> Tensor\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor pred, Tensor a, Tensor b) -> Tensor\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor self, Tensor other) -> Tensor\n", + "prims.(Tensor inp, int[]? dims, \\*, ScalarType? output_dtype=None) -> Tensor\n", + "prims.(Tensor self, Tensor other) -> Tensor\n", + "prims.(SymInt[] shape, SymInt[] strides, \\*, ScalarType dtype, Device device, bool requires_grad) -> Tensor\n", + "prims.(Tensor a) -> Scalar\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor a, int start, int end) -> Tensor\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor self, Tensor other) -> Tensor\n", + "prims.(Tensor self, Tensor other) -> Tensor\n", + "prims.(Tensor self, Tensor other) -> Tensor\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor self, Tensor other) -> Tensor\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor(a) a) -> Tensor(a)\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor self, Tensor other) -> Tensor\n", + "prims.(Tensor self, Tensor other) -> Tensor\n", + "prims.(Tensor self, Tensor other) -> Tensor\n", + "prims.(Tensor[] tokens) -> ()\n", + "prims.(Tensor self, Tensor other) -> Tensor\n", + "prims.(Tensor inp, int[]? dims, \\*, ScalarType? output_dtype=None) -> Tensor\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor self, Tensor other) -> Tensor\n", + "prims.(Tensor(a) self) -> Tensor(a)\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor inp, int[]? dims, \\*, ScalarType? output_dtype=None) -> Tensor\n", + "prims.(Tensor self, Tensor other) -> Tensor\n", + "prims.(Tensor(a!) a, SymInt[] size, SymInt[] stride, SymInt storage_offset) -> Tensor(a!)\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor self, Tensor other) -> Tensor\n", + "prims.(Tensor inp, int[]? 
dims, \\*, ScalarType? output_dtype=None) -> Tensor\n", + "prims.(Tensor self, Tensor other) -> Tensor\n", + "prims.(Tensor(a) a, SymInt start_index, SymInt limit_index, int stride=1, int axis=0) -> Tensor(a)\n", + "prims.(Tensor self, Tensor other) -> Tensor\n", + "prims.(Tensor(a) a) -> Tensor(a)\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor self) -> Tensor\n", + "prims.(Tensor self, Tensor other) -> Tensor\n", + "prims.(Tensor self) -> Tensor\n", + "\n", + "Backwards Ops:\n" + ] + } + ], "source": [ "unsupported_ops = {}\n", "backwards_ops = {}\n", @@ -84,7 +586,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.7" + "version": "3.12.3" } }, "nbformat": 4,