Commit 17884bc ("pyink")
1 parent: c86925e

5 files changed: +85 additions, -46 deletions

jetstream_pt/config.py

Lines changed: 7 additions & 3 deletions
@@ -42,7 +42,11 @@
 
 # Quantization related flags
 flags.DEFINE_bool("quantize_weights", False, "weight quantization")
-flags.DEFINE_bool("quantize_activation", False, "Quantize Q,K,V projection and FeedForward activation.")
+flags.DEFINE_bool(
+    "quantize_activation",
+    False,
+    "Quantize Q,K,V projection and FeedForward activation.",
+)
 flags.DEFINE_string(
     "quantize_type", "int8_per_channel", "Type of quantization."
 )
@@ -91,9 +95,9 @@ def create_quantization_config_from_flags():
   config.enable_weight_quantization = True
   config.num_bits_weight = 8 if "int8" in quantize_type else 4
   config.is_blockwise_weight = "blockwise" in quantize_type
-
+
   config.enable_activation_quantization = FLAGS.quantize_activation
-
+
   config.enable_kv_quantization = FLAGS.quantize_kv_cache
   return config
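For context, a minimal sketch of how the new flag feeds the quantization config, assuming the module is importable as jetstream_pt.config and that flags are parsed through absl as usual; everything outside this diff is illustrative:

# Sketch only: exercises the flag-to-config flow shown in the hunks above.
from absl import flags

from jetstream_pt import config as jetstream_config

FLAGS = flags.FLAGS
# Parse a fake argv; absl bool flags accept the bare --flag form.
FLAGS(["prog", "--quantize_weights", "--quantize_activation"])

quant_config = jetstream_config.create_quantization_config_from_flags()
# Per the second hunk, with the default quantize_type of "int8_per_channel":
#   quant_config.enable_weight_quantization     -> True
#   quant_config.num_bits_weight                -> 8
#   quant_config.enable_activation_quantization -> True  (new in this commit)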

jetstream_pt/layers.py

Lines changed: 37 additions & 17 deletions
@@ -102,10 +102,10 @@ def __init__(
 
     # Number of bits of weight tensor
     self.n_bit = quant_config.num_bits_weight
-
+
     # Quantize activation
     self.quantize_activation = quant_config.enable_activation_quantization
-
+
     # Flag to enable dequantize weight first, then do matmul. Useful for debugging.
     self.run_fake_quantize = False
 
@@ -136,8 +136,14 @@ def forward(self, inputs):
       if not self.quantize_activation:
         result = F.linear(inputs, self.weight)
       else:
-        result = torchjax.call_jax(jax.lax.dot_general, inputs, self.weight,
-                                   (((2,),(1)),((),())), None, torch.int32)
+        result = torchjax.call_jax(
+            jax.lax.dot_general,
+            inputs,
+            self.weight,
+            (((2,), (1)), ((), ())),
+            None,
+            torch.int32,
+        )
       result = result * self.weight_scaler
       if self.quantize_activation:
         result = result * act_s
@@ -182,15 +188,21 @@ def __init__(
     self.block_size = quant_config.block_size_weight
     n_blocks = in_features // self.block_size
 
-    assert not quant_config.enable_activation_quantization, "Activation quantization not supported for blockwise quantized matmul."
-
+    assert (
+        not quant_config.enable_activation_quantization
+    ), "Activation quantization not supported for blockwise quantized matmul."
+
     if self.use_dot_general:
       weight = torch.ones(
-          (n_blocks, out_features, self.block_size), dtype=torch.int8, device=device
+          (n_blocks, out_features, self.block_size),
+          dtype=torch.int8,
+          device=device,
       )
     else:
       weight = torch.ones(
-          (n_blocks, self.block_size, out_features), dtype=torch.int8, device=device
+          (n_blocks, self.block_size, out_features),
+          dtype=torch.int8,
+          device=device,
       )
     self.register_buffer("weight", weight)
@@ -209,7 +221,7 @@ def __init__(
     self.register_buffer("zero_point", None)
 
     self.n_bit = quant_config.num_bits_weight
-
+
     # Quantize activation
     self.quantize_activation = quant_config.enable_activation_quantization
 
@@ -240,15 +252,23 @@ def quantize_weight_from_nn_linear(self, weight):
   def forward(self, inputs):
     if not self.run_fake_quantize:
       if self.use_dot_general or self.flatten:
-        assert self.zero_point is None, "Blockwise quantized linear doesn't support zero_point in dot_general or einsum flattened implementation."
-      blockwise_matmul_kernel = blockwise_jax_kernel if not self.use_dot_general and not self.flatten else blockwise_jax_kernel_dot_general if self.use_dot_general else blockwise_jax_kernel_einsum_flatten
+        assert (
+            self.zero_point is None
+        ), "Blockwise quantized linear doesn't support zero_point in dot_general or einsum flattened implementation."
+      blockwise_matmul_kernel = (
+          blockwise_jax_kernel
+          if not self.use_dot_general and not self.flatten
+          else blockwise_jax_kernel_dot_general
+          if self.use_dot_general
+          else blockwise_jax_kernel_einsum_flatten
+      )
       result = torchjax.call_jax(
-        blockwise_matmul_kernel,
-        inputs,
-        self.weight,
-        self.weight_scaler,
-        self.zero_point,
-      )
+          blockwise_matmul_kernel,
+          inputs,
+          self.weight,
+          self.weight_scaler,
+          self.zero_point,
+      )
       return result
     else:
       # Fake quantization, debugging purpose.
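The else branch in the second hunk accumulates the int8 activation times int8 weight product in int32 through jax.lax.dot_general. A standalone JAX sketch of that contraction, with illustrative shapes not taken from the repo:

import jax
import jax.numpy as jnp

key = jax.random.PRNGKey(0)
# (batch, seq, in_features) quantized activations and (out_features, in_features) weight.
x_int8 = jax.random.randint(key, (2, 16, 64), -128, 128, dtype=jnp.int8)
w_int8 = jax.random.randint(key, (32, 64), -128, 128, dtype=jnp.int8)

# Same dimension numbers as the diff: contract activation axis 2 with weight axis 1,
# no batch dimensions; accumulate in int32 instead of overflowing int8.
acc = jax.lax.dot_general(
    x_int8,
    w_int8,
    dimension_numbers=(((2,), (1,)), ((), ())),
    preferred_element_type=jnp.int32,
)
assert acc.shape == (2, 16, 32) and acc.dtype == jnp.int32
# The layer then rescales the int32 result by the per-channel weight scaler and,
# when activation quantization is on, by the per-token activation scale act_s.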

jetstream_pt/quantize.py

Lines changed: 3 additions & 4 deletions
@@ -14,8 +14,9 @@
 
 from typing import Tuple, Union
 
-import torch
+import jax
 import jax.numpy as jnp
+import torch
 
 EPS = 1e-5
 
@@ -116,9 +117,7 @@ def blockwise_jax_kernel(inputs, weight, weight_scaler, zero_point):
   return out
 
 
-def blockwise_jax_kernel_dot_general(
-    inputs, weight, weight_scaler, zero_point
-):
+def blockwise_jax_kernel_dot_general(inputs, weight, weight_scaler, zero_point):
   """Blockwise Matmul kernel impl in JAX using dot general"""
   inputs_shape = inputs.shape
   block_size = weight.shape[2]
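For intuition, a rough sketch of the blockwise dequantize-and-matmul these kernels implement. The layouts below are assumptions for illustration only (weight as (n_blocks, block_size, out_features) int8 with a per-block, per-output-channel scaler); zero-point handling is omitted and the repo's actual kernels may differ:

import jax.numpy as jnp


def blockwise_matmul_sketch(inputs, weight, weight_scaler):
  """Illustrative blockwise matmul: y[..., o] = sum over k of s[k, o] * (x_block[..., k, :] @ w[k, :, o]).

  Assumed shapes (not taken from the repo):
    inputs:        (..., in_features), floating point
    weight:        (n_blocks, block_size, out_features), int8
    weight_scaler: (n_blocks, out_features), floating point
  """
  n_blocks, block_size, _ = weight.shape
  blocked = inputs.reshape(*inputs.shape[:-1], n_blocks, block_size)
  # Contract each block separately, then apply that block's scale and sum over blocks.
  per_block = jnp.einsum("...kb,kbo->...ko", blocked, weight.astype(inputs.dtype))
  return jnp.einsum("...ko,ko->...o", per_block, weight_scaler)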

jetstream_pt/third_party/gemma/model.py

Lines changed: 15 additions & 3 deletions
@@ -240,13 +240,25 @@ def __init__(
     linear_kwargs = {"quant_config": env.quant_config}
 
     self.gate_proj = Linear(
-        hidden_size, intermediate_size, bias=False, device=device, **linear_kwargs,
+        hidden_size,
+        intermediate_size,
+        bias=False,
+        device=device,
+        **linear_kwargs,
     )
     self.up_proj = Linear(
-        hidden_size, intermediate_size, bias=False, device=device, **linear_kwargs,
+        hidden_size,
+        intermediate_size,
+        bias=False,
+        device=device,
+        **linear_kwargs,
     )
     self.down_proj = Linear(
-        intermediate_size, hidden_size, bias=False, device=device, **linear_kwargs,
+        intermediate_size,
+        hidden_size,
+        bias=False,
+        device=device,
+        **linear_kwargs,
     )
 
   def forward(self, x):

tests/test_quantization.py

Lines changed: 23 additions & 19 deletions
@@ -48,18 +48,18 @@ def _calc_cosine_dist(self, x, y):
     return (torch.dot(x, y) / (x.norm() * y.norm())).item()
 
   def _nn_linear_run_and_compare(
-    self,
-    nn_linear,
-    qlinear_layer,
-    arg,
-  ):
-    torch_result = nn_linear(arg)
-    qlinear_layer.quantize_weight_from_nn_linear(nn_linear.weight)
-    result = helpers.call_xla_model(
-      qlinear_layer, qlinear_layer.state_dict(), arg
-    )
-    diff = result - torch_result
-    return result, torch_result, diff
+      self,
+      nn_linear,
+      qlinear_layer,
+      arg,
+  ):
+    torch_result = nn_linear(arg)
+    qlinear_layer.quantize_weight_from_nn_linear(nn_linear.weight)
+    result = helpers.call_xla_model(
+        qlinear_layer, qlinear_layer.state_dict(), arg
+    )
+    diff = result - torch_result
+    return result, torch_result, diff
 
   def _print_diff(self, w, w_dq):
     print("Print diff:")
@@ -195,7 +195,9 @@ def test_weight_only_quant(self):
     block_q_linear = WeightOnlyBlockwiseQuantizedLinear(
         in_features, out_features
     )
-    res, torch_res, block_diff = self._nn_linear_run_and_compare(nn_linear, block_q_linear, arg)
+    res, torch_res, block_diff = self._nn_linear_run_and_compare(
+        nn_linear, block_q_linear, arg
+    )
     # self.assertTrue(torch.allclose(res, torch_res, atol=1.5))
     # Block quant is more accurate than per_channel quant.
     self.assertLess(block_diff.norm(), per_channel_diff.norm())
@@ -210,7 +212,9 @@ def test_weight_only_quant(self):
     )
     # self._print_diff(res, torch_res)
     self.assertTrue(torch.allclose(res, torch_res, atol=2))
-    quant_config = QuantizationConfig(is_symmetric_weight=False, is_blockwise_weight=True)
+    quant_config = QuantizationConfig(
+        is_symmetric_weight=False, is_blockwise_weight=True
+    )
     block_q_linear = WeightOnlyBlockwiseQuantizedLinear(
         in_features, out_features, quant_config=quant_config
     )
@@ -273,28 +277,28 @@ def shard_and_lower(f, layer, state_dict_jax, input, shardings):
     opt_hlo = shard_and_lower(f, layer, state_dict_jax, input, sharding)
     self.assertFalse("all-to-all" in opt_hlo)
     self.assertFalse("all-reduce-scatter" in opt_hlo)
-
+
   def test_activation_quant_per_channel(self):
 
     out_features = 8
     in_features = 4
     block_size = 128
-
+
     arg = torch.randn(2, 1, in_features).to(torch.bfloat16)
     nn_linear = torch.nn.Linear(
         in_features, out_features, bias=False, dtype=torch.bfloat16
     )
     quant_config = QuantizationConfig(
-      enable_weight_quantization=True,
-      enable_activation_quantization=True,
+        enable_weight_quantization=True,
+        enable_activation_quantization=True,
     )
     per_channel_q_linear = WeightOnlyPerChannelQuantizedLinear(
         in_features, out_features, quant_config=quant_config
    )
     res, torch_res, _ = self._nn_linear_run_and_compare(
         nn_linear, per_channel_q_linear, arg
     )
-    self.assertGreater(self._calc_cosine_dist(res, torch_res), 0.9999)
+    self.assertGreater(self._calc_cosine_dist(res, torch_res), 0.9999)
 
 
 if __name__ == "__main__":
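The new test accepts a cosine similarity above 0.9999 between the quantized layer and the bfloat16 reference. A small self-contained PyTorch sketch of that metric on an int8 round-trip, independent of the repo's layers:

import torch

torch.manual_seed(0)
w = torch.randn(8, 4)

# Per-output-channel symmetric int8 quantization (illustrative, not the repo's implementation).
scale = w.abs().amax(dim=1, keepdim=True) / 127.0
w_q = torch.clamp(torch.round(w / scale), -128, 127).to(torch.int8)
w_dq = w_q.to(torch.float32) * scale

# Same cosine metric as _calc_cosine_dist in the test above.
x, y = w.flatten(), w_dq.flatten()
cosine = (torch.dot(x, y) / (x.norm() * y.norm())).item()
print(cosine)  # typically well above 0.999 for int8, in line with the asserted threshold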
