diff --git a/.lintrunner.toml b/.lintrunner.toml new file mode 100644 index 000000000..97de2b6a2 --- /dev/null +++ b/.lintrunner.toml @@ -0,0 +1,51 @@ +merge_base_with = "origin/main" + +[[linter]] +code = 'FLAKE8' +include_patterns = ['**/*.py'] +command = [ + 'python', + '-m', + 'lintrunner_adapters', + 'run', + 'flake8_linter', + '--', + '@{{PATHSFILE}}' +] +init_command = [ + 'python', + '-m', + 'lintrunner_adapters', + 'run', + 'pip_init', + '--dry-run={{DRYRUN}}', + '--requirement=requirements-lintrunner.txt', +] + +# Black + usort +[[linter]] +code = 'UFMT' +include_patterns = [ + '**/*.py', + '**/*.pyi', +] +command = [ + 'python', + '-m', + 'lintrunner_adapters', + 'run', + 'ufmt_linter', + '--', + '@{{PATHSFILE}}' +] +init_command = [ + 'python', + '-m', + 'lintrunner_adapters', + 'run', + 'pip_init', + '--dry-run={{DRYRUN}}', + '--no-black-binary', + '--requirement=requirements-lintrunner.txt', +] +is_formatter = true \ No newline at end of file diff --git a/build/builder.py b/build/builder.py index f62c42131..c8b1fe78c 100644 --- a/build/builder.py +++ b/build/builder.py @@ -6,21 +6,19 @@ import itertools import sys import time +from dataclasses import dataclass from pathlib import Path -from typing import Optional, Tuple +from typing import Optional, Tuple, Union import torch import torch._dynamo.config import torch._inductor.config - -from quantize import ( - quantize_model, name_to_dtype, set_precision, get_precision -) from cli import cli_args -from dataclasses import dataclass -from typing import Union, Optional + +from quantize import get_precision, name_to_dtype, quantize_model, set_precision from sentencepiece import SentencePieceProcessor + from build.model import Transformer @@ -40,43 +38,50 @@ class BuilderArgs: def __post_init__(self): if not ( - (self.checkpoint_path and self.checkpoint_path.is_file()) or - (self.checkpoint_dir and self.checkpoint_path.is_dir()) or - (self.gguf_path and self.gguf_path.is_file()) or - (self.dso_path and Path(self.dso_path).is_file()) or - (self.pte_path and Path(self.pte_path).is_file()) + (self.checkpoint_path and self.checkpoint_path.is_file()) + or (self.checkpoint_dir and self.checkpoint_path.is_dir()) + or (self.gguf_path and self.gguf_path.is_file()) + or (self.dso_path and Path(self.dso_path).is_file()) + or (self.pte_path and Path(self.pte_path).is_file()) ): - raise RuntimeError("need to specified a valid checkpoint path, checkpoint dir, gguf path, DSO path, or PTE path") + raise RuntimeError( + "need to specified a valid checkpoint path, checkpoint dir, gguf path, DSO path, or PTE path" + ) - if (self.dso_path and self.pte_path): + if self.dso_path and self.pte_path: raise RuntimeError("specify either DSO path or PTE path, but not both") - if (self.checkpoint_path and (self.dso_path or self.pte_path)): - print("Warning: checkpoint path ignored because an exported DSO or PTE path specified") - if (self.checkpoint_dir and (self.dso_path or self.pte_path)): - print("Warning: checkpoint dir ignored because an exported DSO or PTE path specified") - if (self.gguf_path and (self.dso_path or self.pte_path)): - print("Warning: GGUF path ignored because an exported DSO or PTE path specified") - + if self.checkpoint_path and (self.dso_path or self.pte_path): + print( + "Warning: checkpoint path ignored because an exported DSO or PTE path specified" + ) + if self.checkpoint_dir and (self.dso_path or self.pte_path): + print( + "Warning: checkpoint dir ignored because an exported DSO or PTE path specified" + ) + if self.gguf_path and 
(self.dso_path or self.pte_path): + print( + "Warning: GGUF path ignored because an exported DSO or PTE path specified" + ) @classmethod - def from_args(cls, args): # -> BuilderArgs: + def from_args(cls, args): # -> BuilderArgs: return cls( - checkpoint_path = args.checkpoint_path, - checkpoint_dir = args.checkpoint_dir, - params_path = args.params_path, - params_table = args.params_table, - gguf_path = args.gguf_path, - dso_path = args.dso_path, - pte_path = args.pte_path, - device = args.device, - precision = name_to_dtype(args.dtype), - setup_caches = (args.output_dso_path or args.output_pte_path), - use_tp = False, + checkpoint_path=args.checkpoint_path, + checkpoint_dir=args.checkpoint_dir, + params_path=args.params_path, + params_table=args.params_table, + gguf_path=args.gguf_path, + dso_path=args.dso_path, + pte_path=args.pte_path, + device=args.device, + precision=name_to_dtype(args.dtype), + setup_caches=(args.output_dso_path or args.output_pte_path), + use_tp=False, ) @classmethod - def from_speculative_args(cls, args): # -> BuilderArgs: + def from_speculative_args(cls, args): # -> BuilderArgs: speculative_builder_args = BuilderArgs.from_args(args) # let's limit multi-checkpoint to checker speculative_builder_args.checkpoint_dir = None @@ -94,7 +99,7 @@ class TokenizerArgs: is_TikToken: bool = False @classmethod - def from_args(cls, args): # -> TokenizerArgs: + def from_args(cls, args): # -> TokenizerArgs: is_SentencePiece = True is_TikToken = False @@ -108,7 +113,7 @@ def from_args(cls, args): # -> TokenizerArgs: raise RuntimeError(f"cannot find tokenizer model") if not tokenizer_path.is_file(): - raise RuntimeError(f"did not find tokenizer at {tokenizer_path}") + raise RuntimeError(f"did not find tokenizer at {tokenizer_path}") if args.tiktoken: is_SentencePiece = False @@ -117,9 +122,10 @@ def from_args(cls, args): # -> TokenizerArgs: return cls( tokenizer_path=tokenizer_path, is_SentencePiece=is_SentencePiece, - is_TikToken=is_TikToken + is_TikToken=is_TikToken, ) + def _initialize_tokenizer(tokenizer_args: TokenizerArgs): if tokenizer_args.is_SentencePiece: return SentencePieceProcessor(model_file=str(tokenizer_args.tokenizer_path)) @@ -147,6 +153,7 @@ def device_sync(device): wd = Path(__file__).parent.parent.resolve() sys.path.append(str(wd)) + def _load_model(builder_args): if builder_args.gguf_path: model = Transformer.from_gguf(builder_args.gguf_path) @@ -160,9 +167,8 @@ def _load_model(builder_args): else: return _load_model_not_gguf(builder_args) -def _load_model_not_gguf( - builder_args -): + +def _load_model_not_gguf(builder_args): assert not builder_args.gguf_path with torch.device("meta"): @@ -200,7 +206,12 @@ def _load_model_not_gguf( else: checkpoint[key] = cps[0][key] else: - checkpoint = torch.load(builder_args.checkpoint_path, map_location=builder_args.device, mmap=True, weights_only=True) + checkpoint = torch.load( + builder_args.checkpoint_path, + map_location=builder_args.device, + mmap=True, + weights_only=True, + ) if "model" in checkpoint and "stories" in str(builder_args.checkpoint_path): checkpoint = checkpoint["model"] @@ -218,21 +229,21 @@ def _load_model_not_gguf( def _initialize_model( - builder_args, - quantize, + builder_args, + quantize, ): print("Loading model ...") t0 = time.time() - model_ = _load_model( - builder_args - ) + model_ = _load_model(builder_args) device_sync(device=builder_args.device) print(f"Time to load model: {time.time() - t0:.02f} seconds") if builder_args.dso_path: # make sure user did not try to set dtype # assert 
model_dtype == "float32", f"dtype setting not valid for a DSO model. Specify dtype during export." - assert quantize is None or quantize == "{ }", f"quantize not valid for exported DSO model. Specify quantization during export." + assert ( + quantize is None or quantize == "{ }" + ), f"quantize not valid for exported DSO model. Specify quantization during export." try: model = model_ # Replace model forward with the AOT-compiled forward @@ -241,15 +252,20 @@ def _initialize_model( # attributes will NOT be seen on by AOTI-compiled forward # function, e.g. calling model.setup_cache will NOT touch # AOTI compiled and maintained model buffers such as kv_cache. - model.forward = torch._export.aot_load(str(builder_args.dso_path.absolute()), builder_args.device) + model.forward = torch._export.aot_load( + str(builder_args.dso_path.absolute()), builder_args.device + ) except: raise RuntimeError(f"Failed to load AOTI compiled {builder_args.dso_path}") elif builder_args.pte_path: # make sure user did not try to set dtype # assert model_dtype == "float32", f"dtype setting not valid for a DSO model. Specify dtype during export." - assert quantize is None or quantize == "{ }", f"quantize not valid for exported PTE model. Specify quantization during export." + assert ( + quantize is None or quantize == "{ }" + ), f"quantize not valid for exported PTE model. Specify quantization during export." try: from build.model_et import PTEModel + model = PTEModel(model_.config, builder_args.pte_path) except Exception as e: raise RuntimeError(f"Failed to load ET compiled {builder_args.pte_path}") @@ -265,10 +281,7 @@ def _initialize_model( if builder_args.setup_caches: max_seq_length = 350 with torch.device(builder_args.device): - model.setup_caches( - max_batch_size=1, - max_seq_length=max_seq_length - ) + model.setup_caches(max_batch_size=1, max_seq_length=max_seq_length) model.to(dtype=builder_args.precision) diff --git a/build/gguf_loader.py b/build/gguf_loader.py index a52c15274..3655e804a 100644 --- a/build/gguf_loader.py +++ b/build/gguf_loader.py @@ -8,26 +8,33 @@ import argparse import copy +import logging import sys from dataclasses import dataclass from pathlib import Path -from typing import Any, Mapping, Dict -import logging -from quantize import WeightOnlyInt4Linear, pack_scales_and_zeros, group_dequantize_tensor_from_qparams -from build.gguf_util import F16, F32, Q4_0, Q6_K +from typing import Any, Dict, Mapping + import gguf import torch import torch.nn as nn from gguf import GGUFValueType, ReaderTensor +from quantize import ( + group_dequantize_tensor_from_qparams, + pack_scales_and_zeros, + WeightOnlyInt4Linear, +) + +from build.gguf_util import F16, F32, Q4_0, Q6_K wd = Path(__file__).parent.resolve() sys.path.append(str(wd)) -from model import ModelArgs, Transformer from typing import Set +from model import ModelArgs, Transformer + logger: logging.Logger = logging.getLogger(__name__) @@ -101,7 +108,9 @@ def _convert_gguf_tensor_name_to_llama_nn(gguf_name: str) -> str: def _build_model_args(metadata: dict[str, Any]) -> GGUFModelArgs: arch = metadata["general.architecture"] - assert arch == "llama", f"Only general.architecture=llama is supported, but got general.architecture={arch}" + assert ( + arch == "llama" + ), f"Only general.architecture=llama is supported, but got general.architecture={arch}" return GGUFModelArgs( arch=arch, embedding_length=metadata[f"{arch}.embedding_length"], @@ -119,6 +128,7 @@ def _build_model_args(metadata: dict[str, Any]) -> GGUFModelArgs: ), ) + def _fqn_lookup(fqn: 
str, module: torch.nn.Module) -> Any: if fqn == "": return module @@ -147,7 +157,9 @@ def _fqn_last(fqn: str) -> str: return atoms[-1] -def load_weights(pt_model: torch.nn.Module, weight_map: Dict[str, ReaderTensor], inner_k_tiles = 8) -> None: +def load_weights( + pt_model: torch.nn.Module, weight_map: Dict[str, ReaderTensor], inner_k_tiles=8 +) -> None: fqns = [] for fqn in pt_model.state_dict(): assert _fqn_last(fqn) == "weight" @@ -159,7 +171,10 @@ def load_weights(pt_model: torch.nn.Module, weight_map: Dict[str, ReaderTensor], t = weight_map[f"{fqn}.weight"] - if isinstance(mod, torch.nn.Linear) and t.tensor_type == gguf.GGMLQuantizationType.Q4_0: + if ( + isinstance(mod, torch.nn.Linear) + and t.tensor_type == gguf.GGMLQuantizationType.Q4_0 + ): assert not mod.bias out_features = mod.out_features in_features = mod.in_features @@ -167,30 +182,36 @@ def load_weights(pt_model: torch.nn.Module, weight_map: Dict[str, ReaderTensor], q, s, z = Q4_0.unpack(t) scales_and_zeros = pack_scales_and_zeros(s, z) - weight_int4pack = torch.ops.aten._convert_weight_to_int4pack(q, inner_k_tiles) + weight_int4pack = torch.ops.aten._convert_weight_to_int4pack( + q, inner_k_tiles + ) - state_dict[f"{fqn}.weight"] = weight_int4pack.to('cpu') - state_dict[f"{fqn}.scales_and_zeros"] = scales_and_zeros.to('cpu') + state_dict[f"{fqn}.weight"] = weight_int4pack.to("cpu") + state_dict[f"{fqn}.scales_and_zeros"] = scales_and_zeros.to("cpu") parent = _fqn_lookup(_fqn_up(fqn), pt_model) setattr( parent, _fqn_last(fqn), WeightOnlyInt4Linear( - "cpu", # TODO: should --device work for gguf load? (yes?!) + "cpu", # TODO: should --device work for gguf load? (yes?!) in_features, out_features, bias=False, groupsize=Q4_0.groupsize, inner_k_tiles=inner_k_tiles, - ) + ), ) else: # All other weights are dequantized to float if t.tensor_type == gguf.GGMLQuantizationType.Q4_0: - as_float = group_dequantize_tensor_from_qparams(*Q4_0.unpack(t), Q4_0.n_bit, Q4_0.groupsize) + as_float = group_dequantize_tensor_from_qparams( + *Q4_0.unpack(t), Q4_0.n_bit, Q4_0.groupsize + ) elif t.tensor_type == gguf.GGMLQuantizationType.Q6_K: - as_float = group_dequantize_tensor_from_qparams(*Q6_K.unpack(t), Q6_K.n_bit, Q6_K.groupsize) + as_float = group_dequantize_tensor_from_qparams( + *Q6_K.unpack(t), Q6_K.n_bit, Q6_K.groupsize + ) elif t.tensor_type == gguf.GGMLQuantizationType.F16: as_float = F16.unpack(t) elif t.tensor_type == gguf.GGMLQuantizationType.F32: @@ -198,7 +219,7 @@ def load_weights(pt_model: torch.nn.Module, weight_map: Dict[str, ReaderTensor], else: raise ValueError(f"Unsupported tensor type {t.tensor_type}") - state_dict[f"{fqn}.weight"] = as_float.to('cpu') + state_dict[f"{fqn}.weight"] = as_float.to("cpu") pt_model.load_state_dict(state_dict) return pt_model @@ -245,7 +266,6 @@ def load_llama_from_gguf_file(gguf_file: str) -> torch.nn.Module: logger.info("Creating initial PT model.") pt_model = _create_pt_model(model_args) - logger.info("Reading GGUF weights.") gguf_weights = GGUFWeights(tensors=reader.tensors) diff --git a/build/gguf_util.py b/build/gguf_util.py index 3160af0dc..9f8a07661 100644 --- a/build/gguf_util.py +++ b/build/gguf_util.py @@ -4,10 +4,11 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import torch import gguf +import torch from quantize import group_dequantize_tensor_from_qparams + def to_float(t: gguf.gguf_reader.ReaderTensor): """ Unpack and dequantize GGUF tensor to torch tensor of type torch.float32. 
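[Review note] The `load_weights` hunks above are whitespace-only reflows; to make it easier to confirm behavior is unchanged, here is a condensed sketch of the Q4_0 branch as it reads after the reflow. It assumes the `_fqn_up`/`_fqn_last`/`_fqn_lookup` helpers defined in `build/gguf_loader.py`; the wrapper name `_swap_q4_0_linear` is illustrative only.

```python
import torch

from build.gguf_util import Q4_0
from quantize import pack_scales_and_zeros, WeightOnlyInt4Linear


def _swap_q4_0_linear(pt_model, fqn, mod, t, state_dict, inner_k_tiles=8):
    # Unpack the GGUF Q4_0 tensor into int values plus per-group scales/zeros.
    q, s, z = Q4_0.unpack(t)
    scales_and_zeros = pack_scales_and_zeros(s, z)

    # Repack for the fused int4 matmul kernel; buffers are staged on CPU.
    weight_int4pack = torch.ops.aten._convert_weight_to_int4pack(q, inner_k_tiles)
    state_dict[f"{fqn}.weight"] = weight_int4pack.to("cpu")
    state_dict[f"{fqn}.scales_and_zeros"] = scales_and_zeros.to("cpu")

    # Swap the parent's float nn.Linear for the weight-only int4 module.
    parent = _fqn_lookup(_fqn_up(fqn), pt_model)
    setattr(
        parent,
        _fqn_last(fqn),
        WeightOnlyInt4Linear(
            "cpu",  # TODO in the diff: should --device work for gguf load?
            mod.in_features,
            mod.out_features,
            bias=False,
            groupsize=Q4_0.groupsize,
            inner_k_tiles=inner_k_tiles,
        ),
    )
```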
@@ -15,9 +16,13 @@ def to_float(t: gguf.gguf_reader.ReaderTensor): # All other weights are dequantized to float if t.tensor_type == gguf.GGMLQuantizationType.Q4_0: - return group_dequantize_tensor_from_qparams(*Q4_0.unpack(t), Q4_0.n_bit, Q4_0.groupsize).to(torch.float32) + return group_dequantize_tensor_from_qparams( + *Q4_0.unpack(t), Q4_0.n_bit, Q4_0.groupsize + ).to(torch.float32) elif t.tensor_type == gguf.GGMLQuantizationType.Q6_K: - return group_dequantize_tensor_from_qparams(*Q6_K.unpack(t), Q6_K.n_bit, Q6_K.groupsize).to(torch.float32) + return group_dequantize_tensor_from_qparams( + *Q6_K.unpack(t), Q6_K.n_bit, Q6_K.groupsize + ).to(torch.float32) elif t.tensor_type == gguf.GGMLQuantizationType.F16: return F16.unpack(t).to(torch.float32) elif t.tensor_type == gguf.GGMLQuantizationType.F32: @@ -41,15 +46,21 @@ def test_by_to_float(source_file: str, target_file: str) -> None: gguf_targets = {t.name: t for t in gguf.GGUFReader(target_file, "r").tensors} for t in gguf_targets.values(): - assert t.tensor_type == gguf.GGMLQuantizationType.F32, f"target_file must only contain F32 tensors, but found tensor {t.name} with type {repr(t.tensor_type)}." - assert gguf_sources.keys() == gguf_targets.keys(), "source_file and target_file should have the same tensors (by name)" + assert ( + t.tensor_type == gguf.GGMLQuantizationType.F32 + ), f"target_file must only contain F32 tensors, but found tensor {t.name} with type {repr(t.tensor_type)}." + assert ( + gguf_sources.keys() == gguf_targets.keys() + ), "source_file and target_file should have the same tensors (by name)" for k in gguf_sources: source = to_float(gguf_sources[k]) target = to_float(gguf_targets[k]) if not torch.allclose(source, target): - print(f"After calling to_float on source tensor {k} of type {repr(gguf_sources[k].tensor_type)} it does not match its target.") + print( + f"After calling to_float on source tensor {k} of type {repr(gguf_sources[k].tensor_type)} it does not match its target." 
+ ) print("First 5 elements of converted source: ", source.reshape(-1)[0:5]) print("First 5 elements of target: ", target.reshape(-1)[0:5]) assert False, "found mismatch" @@ -68,6 +79,7 @@ def unpack(gguf_tensor: gguf.gguf_reader.ReaderTensor): new_tensor = gguf_tensor.data.reshape(reversed_shape) return torch.from_numpy(new_tensor).to(torch.float16) + class F32: @staticmethod def unpack(gguf_tensor: gguf.gguf_reader.ReaderTensor): @@ -79,6 +91,7 @@ def unpack(gguf_tensor: gguf.gguf_reader.ReaderTensor): new_tensor = gguf_tensor.data.reshape(reversed_shape) return torch.from_numpy(new_tensor).to(torch.float32) + class Q4_0: groupsize = 32 n_bit = 4 @@ -111,24 +124,24 @@ def unpack(gguf_tensor: gguf.gguf_reader.ReaderTensor): assert gguf_tensor.tensor_type == gguf.GGMLQuantizationType.Q4_0 assert len(gguf_tensor.shape) == 2 - nc, nr = gguf_tensor.shape # GGUF tensor has reversed shape + nc, nr = gguf_tensor.shape # GGUF tensor has reversed shape - QK4_0 = 32 # groupsize + QK4_0 = 32 # groupsize # Parse block_q4_0 block_q4_0_size = int(2 + QK4_0 / 2) packed = torch.from_numpy(gguf_tensor.data.reshape(-1, block_q4_0_size)) assert packed.dtype == torch.uint8 - ng = packed.shape[0] # number of groups/blocks + ng = packed.shape[0] # number of groups/blocks curr = 0 - size = 2 # half size - d = packed[:,curr:(curr+size)].contiguous() + size = 2 # half size + d = packed[:, curr : (curr + size)].contiguous() d = torch.tensor(d.untyped_storage(), dtype=torch.float16).reshape(ng, 1) curr += size size = int(QK4_0 / 2) - qs = packed[:,curr:(curr+size)].contiguous() + qs = packed[:, curr : (curr + size)].contiguous() curr += size # Check we finished parsing @@ -141,7 +154,7 @@ def unpack(gguf_tensor: gguf.gguf_reader.ReaderTensor): int32_data = torch.cat([x0, x1], dim=1).to(torch.int32).reshape(ng, QK4_0) assert int32_data.dtype == torch.int32 assert int32_data.min().item() >= 0 - assert int32_data.max().item() <= 2**4-1 + assert int32_data.max().item() <= 2**4 - 1 assert int32_data.shape == (ng, QK4_0) # Prepare for return @@ -194,62 +207,86 @@ def unpack(gguf_tensor: gguf.gguf_reader.ReaderTensor): QK_K = 256 # Parse block_q6_K - block_q6_K_size = int(QK_K/2 + QK_K/4 + QK_K/16 + 2) + block_q6_K_size = int(QK_K / 2 + QK_K / 4 + QK_K / 16 + 2) packed = torch.from_numpy(gguf_tensor.data.reshape(-1, block_q6_K_size)) assert packed.dtype == torch.uint8 - ng = packed.shape[0] # number of groups/blocks + ng = packed.shape[0] # number of groups/blocks curr = 0 - size = int(QK_K/2) - ql = packed[:,curr:(curr+size)].contiguous() + size = int(QK_K / 2) + ql = packed[:, curr : (curr + size)].contiguous() assert ql.shape == (ng, 128) curr += size - size = int(QK_K/4) - qh = packed[:,curr:(curr+size)].contiguous() + size = int(QK_K / 4) + qh = packed[:, curr : (curr + size)].contiguous() assert qh.shape == (ng, 64) curr += size - size = int(QK_K/16) - scales = packed[:,curr:(curr+size)].contiguous() - scales = torch.tensor(scales.untyped_storage(), dtype=torch.int8).reshape(ng, int(QK_K/16)).to(torch.float32) + size = int(QK_K / 16) + scales = packed[:, curr : (curr + size)].contiguous() + scales = ( + torch.tensor(scales.untyped_storage(), dtype=torch.int8) + .reshape(ng, int(QK_K / 16)) + .to(torch.float32) + ) curr += size - size = 2 # half size - d = packed[:,curr:(curr+size)].contiguous() - d = torch.tensor(d.untyped_storage(), dtype=torch.float16).reshape(ng, 1).to(torch.float32) + size = 2 # half size + d = packed[:, curr : (curr + size)].contiguous() + d = ( + torch.tensor(d.untyped_storage(), 
dtype=torch.float16) + .reshape(ng, 1) + .to(torch.float32) + ) curr += size # Check we finished parsing assert curr == block_q6_K_size # Unpack quantized values. Unlike the code in ggml-quants.c, we do not subtract 32 - q1 = ((ql[:,0:32] & 0xF) | (((qh[:,0:32] >> 0) & 3) << 4)) - q2 = ((ql[:,32:64] & 0xF) | (((qh[:,0:32] >> 2) & 3) << 4)) - q3 = ((ql[:,0:32] >> 4) | (((qh[:,0:32] >> 4) & 3) << 4)) - q4 = ((ql[:,32:64] >> 4) | (((qh[:,0:32] >> 6) & 3) << 4)) + q1 = (ql[:, 0:32] & 0xF) | (((qh[:, 0:32] >> 0) & 3) << 4) + q2 = (ql[:, 32:64] & 0xF) | (((qh[:, 0:32] >> 2) & 3) << 4) + q3 = (ql[:, 0:32] >> 4) | (((qh[:, 0:32] >> 4) & 3) << 4) + q4 = (ql[:, 32:64] >> 4) | (((qh[:, 0:32] >> 6) & 3) << 4) - q5 = ((ql[:,64:96] & 0xF) | (((qh[:,32:64] >> 0) & 3) << 4)) - q6 = ((ql[:,96:128] & 0xF) | (((qh[:,32:64] >> 2) & 3) << 4)) - q7 = ((ql[:,64:96] >> 4) | (((qh[:,32:64] >> 4) & 3) << 4)) - q8 = ((ql[:,96:128] >> 4) | (((qh[:,32:64] >> 6) & 3) << 4)) + q5 = (ql[:, 64:96] & 0xF) | (((qh[:, 32:64] >> 0) & 3) << 4) + q6 = (ql[:, 96:128] & 0xF) | (((qh[:, 32:64] >> 2) & 3) << 4) + q7 = (ql[:, 64:96] >> 4) | (((qh[:, 32:64] >> 4) & 3) << 4) + q8 = (ql[:, 96:128] >> 4) | (((qh[:, 32:64] >> 6) & 3) << 4) q = torch.cat([q1, q2, q3, q4, q5, q6, q7, q8], dim=1).to(torch.int32) assert q.shape == (ng, QK_K) assert q.min().item() >= 0 - assert q.max().item() <= 2**6-1 + assert q.max().item() <= 2**6 - 1 # Unpack scales - s1 = d * torch.cat([scales[:,0].reshape(-1,1), scales[:,1].reshape(-1,1)], dim=1) - s2 = d * torch.cat([scales[:,2].reshape(-1,1), scales[:,3].reshape(-1,1)], dim=1) - s3 = d * torch.cat([scales[:,4].reshape(-1,1), scales[:,5].reshape(-1,1)], dim=1) - s4 = d * torch.cat([scales[:,6].reshape(-1,1), scales[:,7].reshape(-1,1)], dim=1) - - s5 = d * torch.cat([scales[:,8].reshape(-1,1), scales[:,9].reshape(-1,1)], dim=1) - s6 = d * torch.cat([scales[:,10].reshape(-1,1), scales[:,11].reshape(-1,1)], dim=1) - s7 = d * torch.cat([scales[:,12].reshape(-1,1), scales[:,13].reshape(-1,1)], dim=1) - s8 = d * torch.cat([scales[:,14].reshape(-1,1), scales[:,15].reshape(-1,1)], dim=1) + s1 = d * torch.cat( + [scales[:, 0].reshape(-1, 1), scales[:, 1].reshape(-1, 1)], dim=1 + ) + s2 = d * torch.cat( + [scales[:, 2].reshape(-1, 1), scales[:, 3].reshape(-1, 1)], dim=1 + ) + s3 = d * torch.cat( + [scales[:, 4].reshape(-1, 1), scales[:, 5].reshape(-1, 1)], dim=1 + ) + s4 = d * torch.cat( + [scales[:, 6].reshape(-1, 1), scales[:, 7].reshape(-1, 1)], dim=1 + ) + + s5 = d * torch.cat( + [scales[:, 8].reshape(-1, 1), scales[:, 9].reshape(-1, 1)], dim=1 + ) + s6 = d * torch.cat( + [scales[:, 10].reshape(-1, 1), scales[:, 11].reshape(-1, 1)], dim=1 + ) + s7 = d * torch.cat( + [scales[:, 12].reshape(-1, 1), scales[:, 13].reshape(-1, 1)], dim=1 + ) + s8 = d * torch.cat( + [scales[:, 14].reshape(-1, 1), scales[:, 15].reshape(-1, 1)], dim=1 + ) s = torch.cat([s1, s2, s3, s4, s5, s6, s7, s8], dim=1) assert s.shape == (ng, 16) diff --git a/build/model.py b/build/model.py index bc1a783b1..9e3cc1e71 100644 --- a/build/model.py +++ b/build/model.py @@ -9,10 +9,11 @@ import torch import torch.nn as nn + +from quantize import get_precision from torch import Tensor from torch.nn import functional as F -from quantize import get_precision def find_multiple(n: int, k: int) -> int: if n % k == 0: @@ -86,7 +87,9 @@ def from_name(cls, name: str): config[1] ), name # make sure only one 'best' match elif len(config) == 0: - raise ValueError(f"Unknown model directory name {name}. 
Must be one of {list(transformer_configs.keys())}.") + raise ValueError( + f"Unknown model directory name {name}. Must be one of {list(transformer_configs.keys())}." + ) return cls(**transformer_configs[config[0]]) @@ -107,9 +110,7 @@ def from_name(cls, name: str): hidden_dim=22016, rope_base=1000000, ), # CodeLlama-34B-Python-hf - "70B": dict( - n_layer=80, n_heads=64, dim=8192, n_local_heads=8, hidden_dim=28672 - ), + "70B": dict(n_layer=80, n_heads=64, dim=8192, n_local_heads=8, hidden_dim=28672), "Mistral-7B": dict( n_layer=32, n_heads=32, @@ -140,12 +141,11 @@ def from_name(cls, name: str): class KVCache(nn.Module): - def __init__( - self, max_batch_size, max_seq_length, n_heads, head_dim, dtype=None): + def __init__(self, max_batch_size, max_seq_length, n_heads, head_dim, dtype=None): # torch.float): # bfloat16 ): super().__init__() if not dtype: - dtype=get_precision() + dtype = get_precision() cache_shape = (max_batch_size, n_heads, max_seq_length, head_dim) self.register_buffer("k_cache", torch.zeros(cache_shape, dtype=dtype)) self.register_buffer("v_cache", torch.zeros(cache_shape, dtype=dtype)) @@ -238,9 +238,10 @@ def from_params(cls, params_path: str): @classmethod def from_gguf(cls, gguf_path: str): from build.gguf_loader import load_llama_from_gguf_file + model = load_llama_from_gguf_file(gguf_path) return model - + class TransformerBlock(nn.Module): def __init__(self, config: ModelArgs) -> None: @@ -267,8 +268,12 @@ def __init__(self, config: ModelArgs): # total_head_dim = (config.n_heads + 2 * config.n_local_heads) * config.head_dim # self.wqkv = nn.Linear(config.dim, total_head_dim, bias=False) self.wq = nn.Linear(config.dim, config.n_heads * config.head_dim, bias=False) - self.wk = nn.Linear(config.dim, config.n_local_heads * config.head_dim, bias=False) - self.wv = nn.Linear(config.dim, config.n_local_heads * config.head_dim, bias=False) + self.wk = nn.Linear( + config.dim, config.n_local_heads * config.head_dim, bias=False + ) + self.wv = nn.Linear( + config.dim, config.n_local_heads * config.head_dim, bias=False + ) self.wo = nn.Linear(config.dim, config.dim, bias=False) self.kv_cache = None @@ -297,7 +302,6 @@ def load_hook(self, state_dict, prefix, *args): return - def _unfuse_wqkv_state_dict( state_dict: Dict[str, torch.Tensor], dim: int, @@ -306,15 +310,16 @@ def _unfuse_wqkv_state_dict( if key.endswith("wqkv.weight"): tensor = state_dict[key] wq_key = key.replace("wqkv.weight", "wq.weight") - state_dict[wq_key] = tensor[: dim] + state_dict[wq_key] = tensor[:dim] wk_key = key.replace("wqkv.weight", "wk.weight") wv_key = key.replace("wqkv.weight", "wv.weight") - wk, wv = tensor[dim :].chunk(2, 0) + wk, wv = tensor[dim:].chunk(2, 0) state_dict[wk_key] = wk state_dict[wv_key] = wv state_dict.pop(key) else: continue + _unfuse_wqkv_state_dict(state_dict, self.dim) def forward( @@ -326,7 +331,6 @@ def forward( ) -> Tensor: bsz, seqlen, _ = x.shape - q = self.wq(x) k = self.wk(x) v = self.wv(x) @@ -379,8 +383,11 @@ def forward(self, x: Tensor) -> Tensor: output = self._norm(x.float()).type_as(x) return output * self.weight + # transpsoed first two arguments to align with model in ET -def precompute_freqs_cis(n_elem: int, seq_len: int, base: int = 10000, dtype=None) -> Tensor: +def precompute_freqs_cis( + n_elem: int, seq_len: int, base: int = 10000, dtype=None +) -> Tensor: if not dtype: dtype = get_precision() freqs = 1.0 / ( @@ -390,7 +397,7 @@ def precompute_freqs_cis(n_elem: int, seq_len: int, base: int = 10000, dtype=Non freqs = torch.outer(t, freqs) freqs_cis = 
torch.polar(torch.ones_like(freqs), freqs) cache = torch.stack([freqs_cis.real, freqs_cis.imag], dim=-1) - return cache.to(dtype=dtype) # bfloat16) + return cache.to(dtype=dtype) # bfloat16) def apply_rotary_emb(x: Tensor, freqs_cis: Tensor) -> Tensor: diff --git a/build/model_aoti.py b/build/model_aoti.py index 4832c248d..50fa2b939 100644 --- a/build/model_aoti.py +++ b/build/model_aoti.py @@ -11,7 +11,7 @@ # with open("./dso_model.h", "rb") as f: # dso_src = f.read().decode("utf-8") -dso_src ="" +dso_src = "" src = """ #include @@ -36,7 +36,6 @@ """ - class DSOModel(nn.Module): def __init__(self, config, dso_path) -> None: super().__init__() @@ -44,8 +43,8 @@ def __init__(self, config, dso_path) -> None: # build transformer model global src, dso_src - - src = src.replace('***my_model.so***', str(dso_path)) + + src = src.replace("***my_model.so***", str(dso_path)) async_compile = AsyncCompile() self.transformer_model = async_compile.cpp_pybinding( ["long *", "long *", "float *"], dso_src + src @@ -53,9 +52,8 @@ def __init__(self, config, dso_path) -> None: async_compile.wait(globals()) del async_compile - def forward(self, x, input_pos): - vocab_size = self.config.vocab_size # 32000 + vocab_size = self.config.vocab_size # 32000 assert x.dim() == 2 and x.size(0) == 1 and x.size(1) == 1 logits = torch.empty(1, 1, vocab_size) x = x.to(torch.long) diff --git a/build/model_et.py b/build/model_et.py index cee0d5c3d..f7bd02194 100644 --- a/build/model_et.py +++ b/build/model_et.py @@ -2,8 +2,9 @@ import torch import torch.nn as nn -from torch import empty from executorch.extension.pybindings import portable_lib as exec_lib +from torch import empty + class PTEModel(nn.Module): def __init__(self, config, path) -> None: diff --git a/cli.py b/cli.py index b14f0a944..9bdca3776 100644 --- a/cli.py +++ b/cli.py @@ -4,8 +4,8 @@ # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. -import time import os +import time from pathlib import Path import torch @@ -15,22 +15,23 @@ strict = False + def check_args(args, command_name: str): global strict # chat and generate support the same options - if command_name in ["generate", "chat", "gui"]: + if command_name in ["generate", "chat", "gui"]: # examples, can add more. Note that attributes convert dash to _ - disallowed_args = ['output_pte_path', 'output_dso_path' ] + disallowed_args = ["output_pte_path", "output_dso_path"] elif command_name == "export": # examples, can add more. Note that attributes convert dash to _ - disallowed_args = ['pte_path', 'dso_path' ] + disallowed_args = ["pte_path", "dso_path"] elif command_name == "eval": # TBD disallowed_args = [] else: raise RuntimeError(f"{command_name} is not a valid command") - + for disallowed in disallowed_args: if hasattr(args, disallowed): text = f"command {command_name} does not support option {disallowed.replace('_', '-')}" @@ -39,7 +40,7 @@ def check_args(args, command_name: str): else: print(f"Warning: {text}") - + def cli_args(): import argparse @@ -48,8 +49,8 @@ def cli_args(): parser.add_argument( "--seed", type=int, - default=1234, # set None for release - help="Initialize torch seed" + default=1234, # set None for release + help="Initialize torch seed", ) parser.add_argument( "--prompt", type=str, default="Hello, my name is", help="Input prompt." 
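[Review note] A quick usage sketch of the `check_args` contract reformatted above, using a hypothetical argument namespace; with the module-level `strict` flag left at its default of False, a disallowed option only produces a warning.

```python
from types import SimpleNamespace

from cli import check_args

# Hypothetical stand-in for parsed CLI args: generate/chat/gui commands
# must not carry the export-only output options.
args = SimpleNamespace(output_pte_path="model.pte")

check_args(args, "generate")
# prints: Warning: command generate does not support option output-pte-path
# (it would raise RuntimeError instead if cli.strict were set to True)
```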
@@ -78,55 +79,31 @@ def cli_args(): "--chat", action="store_true", help="Use torchat to for an interactive chat session.", - ) + ) parser.add_argument( "--gui", action="store_true", help="Use torchat to for an interactive gui-chat session.", - ) - parser.add_argument( - "--num-samples", - type=int, - default=1, - help="Number of samples.") - parser.add_argument( - "--max-new-tokens", - type=int, - default=200, - help="Maximum number of new tokens." ) + parser.add_argument("--num-samples", type=int, default=1, help="Number of samples.") parser.add_argument( - "--top-k", - type=int, - default=200, - help="Top-k for sampling.") + "--max-new-tokens", type=int, default=200, help="Maximum number of new tokens." + ) + parser.add_argument("--top-k", type=int, default=200, help="Top-k for sampling.") parser.add_argument( - "--temperature", - type=float, - default=0.8, - help="Temperature for sampling." + "--temperature", type=float, default=0.8, help="Temperature for sampling." ) parser.add_argument( - "--compile", - action="store_true", - help="Whether to compile the model." + "--compile", action="store_true", help="Whether to compile the model." ) parser.add_argument( "--compile-prefill", action="store_true", help="Whether to compile the prefill (improves prefill perf, but higher compile times)", ) + parser.add_argument("--profile", type=Path, default=None, help="Profile path.") parser.add_argument( - "--profile", - type=Path, - default=None, - help="Profile path." - ) - parser.add_argument( - "--speculate-k", - type=int, - default=5, - help="Speculative execution depth." + "--speculate-k", type=int, default=5, help="Speculative execution depth." ) parser.add_argument( "--draft-checkpoint-path", @@ -163,31 +140,18 @@ def cli_args(): type=Path, default=None, help="Model checkpoint path.", - ) - parser.add_argument( - "--output-pte-path", - type=str, - default=None, - help="Filename" ) + parser.add_argument("--output-pte-path", type=str, default=None, help="Filename") + parser.add_argument("--output-dso-path", type=str, default=None, help="Filename") parser.add_argument( - "--output-dso-path", - type=str, - default=None, - help="Filename" - ) - parser.add_argument( - "--dso-path", - type=Path, - default=None, - help="Use the specified AOTI DSO model." + "--dso-path", type=Path, default=None, help="Use the specified AOTI DSO model." ) parser.add_argument( "--pte-path", type=Path, default=None, - help="Use the specified Executorch PTE model." - ) + help="Use the specified Executorch PTE model.", + ) parser.add_argument( "-d", "--dtype", @@ -196,48 +160,36 @@ def cli_args(): ) parser.add_argument("-v", "--verbose", action="store_true") parser.add_argument( - "--quantize", - type=str, - default="{ }", - help="Quantization options." + "--quantize", type=str, default="{ }", help="Quantization options." 
) parser.add_argument( - "--device", - type=str, - default=default_device, - help="Device to use" + "--device", type=str, default=default_device, help="Device to use" ) + parser.add_argument("--params-table", type=str, default=None, help="Device to use") parser.add_argument( - "--params-table", - type=str, - default=None, - help="Device to use" - ) - parser.add_argument( - '--tasks', - nargs='+', + "--tasks", + nargs="+", type=str, default=["hellaswag"], - help='list of lm-eluther tasks to evaluate usage: --tasks task1 task2' + help="list of lm-eluther tasks to evaluate usage: --tasks task1 task2", ) parser.add_argument( - '--limit', type=int, - default=None, - help='number of samples to evaluate' + "--limit", type=int, default=None, help="number of samples to evaluate" ) parser.add_argument( - '--max-seq-length', + "--max-seq-length", type=int, default=None, - help='maximum length sequence to evaluate') - + help="maximum length sequence to evaluate", + ) + args = parser.parse_args() - if (Path(args.quantize).is_file()): + if Path(args.quantize).is_file(): with open(args.quantize, "r") as f: args.quantize = json.loads(f.read()) if args.seed: - torch.manual_seed(args.seed) + torch.manual_seed(args.seed) return args diff --git a/eval.py b/eval.py index 8ac8c457f..9ebd5337d 100644 --- a/eval.py +++ b/eval.py @@ -18,32 +18,36 @@ torch._inductor.config.triton.cudagraphs = True torch._dynamo.config.cache_size_limit = 100000 +from build.model import Transformer from cli import cli_args from quantize import name_to_dtype, set_precision -from build.model import Transformer - try: import lm_eval + lm_eval_available = True except: lm_eval_available = False -from build.builder import _initialize_model, _initialize_tokenizer, BuilderArgs, TokenizerArgs +from build.builder import ( + _initialize_model, + _initialize_tokenizer, + BuilderArgs, + TokenizerArgs, +) from generate import encode_tokens, model_forward if lm_eval_available: - try: # lm_eval version 0.4 + try: # lm_eval version 0.4 + from lm_eval.evaluator import evaluate from lm_eval.models.huggingface import HFLM as eval_wrapper from lm_eval.tasks import get_task_dict - from lm_eval.evaluator import evaluate - except: #lm_eval version 0.3 - from lm_eval import base - from lm_eval import tasks - from lm_eval import evaluator - eval_wrapper=base.BaseLM - get_task_dict=tasks.get_task_dict - evaluate=evaluator.evaluate + except: # lm_eval version 0.3 + from lm_eval import base, evaluator, tasks + + eval_wrapper = base.BaseLM + get_task_dict = tasks.get_task_dict + evaluate = evaluator.evaluate def setup_cache_padded_seq_input_pos_max_seq_length_for_prefill( @@ -84,20 +88,22 @@ def setup_cache_padded_seq_input_pos_max_seq_length_for_prefill( return seq, input_pos, max_seq_length + class GPTFastEvalWrapper(eval_wrapper): """ A wrapper class for GPTFast, providing integration with the lm-evaluation-harness library. 
""" + def __init__( self, model: Transformer, tokenizer, - max_seq_length: Optional[int]=None, + max_seq_length: Optional[int] = None, ): super().__init__() self._model = model self._tokenizer = tokenizer - self._device = torch.device('cuda') + self._device = torch.device("cuda") self._max_seq_length = 2048 if max_seq_length is None else max_seq_length @property @@ -121,8 +127,7 @@ def device(self): return self._device def tok_encode(self, string: str, **kwargs): - encoded = encode_tokens(self._tokenizer, - string, bos=True, device=self._device) + encoded = encode_tokens(self._tokenizer, string, bos=True, device=self._device) # encoded is a pytorch tensor, but some internal logic in the # eval harness expects it to be a list instead # TODO: verify this for multi-batch as well @@ -138,19 +143,20 @@ def _model_call(self, inps): inps = inps.squeeze(0) max_new_tokens = 1 - seq, input_pos, max_seq_length = \ + seq, input_pos, max_seq_length = ( setup_cache_padded_seq_input_pos_max_seq_length_for_prefill( self._model, inps, max_new_tokens, self.max_length, ) + ) x = seq.index_select(0, input_pos).view(1, -1) logits = model_forward(self._model, x, input_pos) return logits def _model_generate(self, context, max_length, eos_token_id): - raise Exception('unimplemented') + raise Exception("unimplemented") @torch.no_grad() @@ -185,8 +191,8 @@ def eval( except: pass - if 'hendrycks_test' in tasks: - tasks.remove('hendrycks_test') + if "hendrycks_test" in tasks: + tasks.remove("hendrycks_test") tasks += [x for x in lm_eval.tasks.hendrycks_test.create_all_tasks().keys()] task_dict = get_task_dict(tasks) @@ -212,7 +218,7 @@ def main(args) -> None: builder_args = BuilderArgs.from_args(args) tokenizer_args = TokenizerArgs.from_args(args) - + checkpoint_path = args.checkpoint_path checkpoint_dir = args.checkpoint_dir params_path = args.params_path @@ -223,12 +229,12 @@ def main(args) -> None: pte_path = args.pte_path quantize = args.quantize device = args.device - model_dtype = args.dtype + model_dtype = args.dtype tasks = args.tasks limit = args.limit max_seq_length = args.max_seq_length use_tiktoken = args.tiktoken - + print(f"Using device={device}") set_precision(buildeer_args.precision) @@ -240,9 +246,13 @@ def main(args) -> None: ) if compile: - assert not (builder_args.dso_path or builder_args.pte_path), "cannot compile exported model" + assert not ( + builder_args.dso_path or builder_args.pte_path + ), "cannot compile exported model" global model_forward - model_forward = torch.compile(model_forward, mode="reduce-overhead", dynamic=True, fullgraph=True) + model_forward = torch.compile( + model_forward, mode="reduce-overhead", dynamic=True, fullgraph=True + ) torch._inductor.config.coordinate_descent_tuning = True t1 = time.time() @@ -268,11 +278,13 @@ def main(args) -> None: for task, res in result["results"].items(): print(f"{task}: {res}") -if __name__ == '__main__': + +if __name__ == "__main__": + def cli(): args = cli_args() main(args) if __name__ == "__main__": - cli() + cli() diff --git a/export.py b/export.py index 1e5fb5d37..f31af803a 100644 --- a/export.py +++ b/export.py @@ -4,17 +4,17 @@ # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. 
-import time import os +import time from pathlib import Path import torch import torch.nn as nn -from torch.export import Dim, export - -from quantize import quantize_model, name_to_dtype, set_precision, get_precision from cli import cli_args +from quantize import get_precision, name_to_dtype, quantize_model, set_precision +from torch.export import Dim, export + try: executorch_export_available = True from export_et import export_model as export_model_et @@ -22,12 +22,12 @@ executorch_exception = f"ET EXPORT EXCEPTION: {e}" executorch_export_available = False -from export_aoti import export_model as export_model_aoti +from build.builder import _initialize_model, BuilderArgs, TokenizerArgs from build.model import Transformer -from build.builder import _initialize_model, BuilderArgs, TokenizerArgs +from export_aoti import export_model as export_model_aoti from generate import decode_one_token -from quantize import quantize_model, name_to_dtype +from quantize import name_to_dtype, quantize_model from torch._export import capture_pre_autograd_graph default_device = "cpu" # 'cuda' if torch.cuda.is_available() else 'cpu' @@ -42,7 +42,6 @@ def device_sync(device): print(f"device={device} is not yet suppported") - def main(args): builder_args = BuilderArgs.from_args(args) tokenizer_args = TokenizerArgs.from_args(args) @@ -70,7 +69,9 @@ def main(args): print(f"Exporting model using Executorch to {output_pte_path}") export_model_et(model, builder_args.device, args.output_pte_path, args) else: - print(f"Export with executorch requested but Executorch could not be loaded") + print( + f"Export with executorch requested but Executorch could not be loaded" + ) print(executorch_exception) if output_dso_path: output_dso_path = str(os.path.abspath(output_dso_path)) @@ -82,5 +83,6 @@ def cli(): args = cli_args() main(args) + if __name__ == "__main__": cli() diff --git a/export_aoti.py b/export_aoti.py index 6501b9e98..b9a59c3bb 100644 --- a/export_aoti.py +++ b/export_aoti.py @@ -12,12 +12,12 @@ import torch import torch.nn as nn -from torch.export import Dim, export + +from build.model import Transformer from generate import decode_one_token from quantize import quantize_model - -from build.model import Transformer +from torch.export import Dim, export default_device = "cpu" # 'cuda' if torch.cuda.is_available() else 'cpu' @@ -33,11 +33,11 @@ def device_sync(device): def export_model(model: nn.Module, device, output_path, args=None): max_seq_length = 350 -# with torch.device(device): -# model.setup_caches(max_batch_size=1, max_seq_length=max_seq_length) + # with torch.device(device): + # model.setup_caches(max_batch_size=1, max_seq_length=max_seq_length) input = ( - torch.tensor([[1, 9038, 2501, 263, 931]], dtype=torch.int, device=device), + torch.tensor([[1, 9038, 2501, 263, 931]], dtype=torch.int, device=device), torch.tensor([0, 1, 2, 3, 4], dtype=torch.int, device=device), ) diff --git a/export_et.py b/export_et.py index 030fd0b6c..6e7de441b 100644 --- a/export_et.py +++ b/export_et.py @@ -9,23 +9,10 @@ import torch import torch.nn as nn -from torch.export import Dim, export -from torch._export import capture_pre_autograd_graph - -from generate import decode_one_token -from quantize import ( - quantize_model, name_to_dtype, set_precision, get_precision, -) -from build.model import Transformer from build.model import Transformer -from executorch.backends.xnnpack.partition.xnnpack_partitioner import ( - XnnpackPartitioner, -) -# from executorch.backends.xnnpack.partition.xnnpack_partitioner import ( -# 
XnnpackDynamicallyQuantizedPartitioner, -#) -from executorch_portable_utils import export_to_edge +from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner + # TODO: change back to executorch.examples.portable.utils # when executorch installs correctly @@ -33,6 +20,16 @@ from executorch.exir.passes.quant_fusion_pass import QuantFusionPass from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass +# from executorch.backends.xnnpack.partition.xnnpack_partitioner import ( +# XnnpackDynamicallyQuantizedPartitioner, +# ) +from executorch_portable_utils import export_to_edge + +from generate import decode_one_token +from quantize import get_precision, name_to_dtype, quantize_model, set_precision +from torch._export import capture_pre_autograd_graph +from torch.export import Dim, export + default_device = "cpu" # 'cuda' if torch.cuda.is_available() else 'cpu' @@ -99,7 +96,7 @@ def export_model(model, device, output_path, args=None) -> str: # noqa: C901 _skip_type_promotion=bool(target_precision == torch.float16), ) - if target_precision == torch.float16: # or args.quantization_mode=="int4": + if target_precision == torch.float16: # or args.quantization_mode=="int4": if state_dict_dtype != torch.float16: print("model.to torch.float16") model = model.to(dtype=torch.float16) @@ -111,11 +108,11 @@ def export_model(model, device, output_path, args=None) -> str: # noqa: C901 else: raise ValueError(f"Unsupported dtype for ET export: {target_precision}") - with torch.nn.attention.sdpa_kernel([torch.nn.attention.SDPBackend.MATH]), torch.no_grad(): + with torch.nn.attention.sdpa_kernel( + [torch.nn.attention.SDPBackend.MATH] + ), torch.no_grad(): m = capture_pre_autograd_graph( - export_model, - input, - dynamic_shapes=dynamic_shapes + export_model, input, dynamic_shapes=dynamic_shapes ) edge_manager = export_to_edge( diff --git a/generate.py b/generate.py index ad6085582..18583c2f9 100644 --- a/generate.py +++ b/generate.py @@ -4,48 +4,55 @@ # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. 
import itertools -import sys import os +import sys import time +from dataclasses import dataclass from pathlib import Path from typing import Optional, Tuple -from dataclasses import dataclass import torch import torch._dynamo.config import torch._inductor.config -from build.builder import _load_model, _initialize_model, _initialize_tokenizer, BuilderArgs, TokenizerArgs +from build.builder import ( + _initialize_model, + _initialize_tokenizer, + _load_model, + BuilderArgs, + TokenizerArgs, +) from build.model import Transformer -from quantize import quantize_model, name_to_dtype, set_precision, get_precision from cli import cli_args +from quantize import get_precision, name_to_dtype, quantize_model, set_precision + @dataclass class GeneratorArgs: prompt: str = "torchat is pronounced torch-chat and is so cool because" - chat: bool = False, - gui: bool = False, - num_samples: int =1, - max_new_tokens: int = 200, - top_k: int = 200, - temperature: int = 0, # deterministic argmax - compile: bool = False, - compile_prefill: bool = False, - speculate_k: int = 5, + chat: bool = (False,) + gui: bool = (False,) + num_samples: int = (1,) + max_new_tokens: int = (200,) + top_k: int = (200,) + temperature: int = (0,) # deterministic argmax + compile: bool = (False,) + compile_prefill: bool = (False,) + speculate_k: int = (5,) @classmethod - def from_args(cls, args): # -> GeneratorArgs: + def from_args(cls, args): # -> GeneratorArgs: return cls( - prompt = args.prompt, - chat = args.chat, - gui = args.gui, - num_samples = args.num_samples, - max_new_tokens = args.max_new_tokens, - top_k = args.top_k, - temperature = args.temperature, - compile = args.compile, - compile_prefill = args.compile_prefill, - speculate_k = args.speculate_k, + prompt=args.prompt, + chat=args.chat, + gui=args.gui, + num_samples=args.num_samples, + max_new_tokens=args.max_new_tokens, + top_k=args.top_k, + temperature=args.temperature, + compile=args.compile, + compile_prefill=args.compile_prefill, + speculate_k=args.speculate_k, ) @@ -152,6 +159,7 @@ def decode_n_tokens( # except: # print("compiled model load not successful, running eager model") + def model_forward(model, x, input_pos): return model(x, input_pos) @@ -336,20 +344,20 @@ def _main( is_chat = "chat" in str(os.path.basename(builder_args.checkpoint_path)) if is_chat: - raise RuntimeError("need to stop filename based kludgery, at a minimum need to look at all pathnames. in particular, this now fails because chat is part of the pathname, yuck!") + raise RuntimeError( + "need to stop filename based kludgery, at a minimum need to look at all pathnames. in particular, this now fails because chat is part of the pathname, yuck!" 
+ ) tokenizer = _initialize_tokenizer(tokenizer_args) builder_args.setup_caches = False - model = _initialize_model( - builder_args, - quantize - ) + model = _initialize_model(builder_args, quantize) # will add a version of _initialize_model in future # (need additional args) if is_speculative: from builder import _load_model + speculative_builder_args = builder_args draft_model = _load_model( @@ -359,7 +367,7 @@ def _main( draft_model = None encoded = encode_tokens(tokenizer, prompt, bos=True, device=builder_args.device) - print (encoded) + print(encoded) prompt_length = encoded.size(0) model_size = sum( @@ -369,7 +377,9 @@ def _main( ] ) if compile: - if is_speculative and builder_args.use_tp: # and ("cuda" in builder_args.device): + if ( + is_speculative and builder_args.use_tp + ): # and ("cuda" in builder_args.device): torch._inductor.config.triton.cudagraph_trees = ( False # Bug with cudagraph trees in this case ) @@ -401,7 +411,9 @@ def _main( prompt = input("What is your prompt? ") if is_chat: prompt = f"{B_INST} {prompt.strip()} {E_INST}" - encoded = encode_tokens(tokenizer, prompt, bos=True, device=builder_args.device) + encoded = encode_tokens( + tokenizer, prompt, bos=True, device=builder_args.device + ) if chat_mode and i >= 0: buffer = [] @@ -503,10 +515,11 @@ def main(args): args.quantize, ) + def cli(): args = cli_args() main(args) if __name__ == "__main__": - cli() + cli() diff --git a/quantize.py b/quantize.py index 4890c4b42..766efac04 100644 --- a/quantize.py +++ b/quantize.py @@ -4,21 +4,21 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import json from functools import reduce from math import gcd from typing import Dict, Optional, Tuple -import json + +import quantized_ops import torch import torch.nn as nn import torch.nn.functional as F -import quantized_ops - try: + from eval import evaluate, get_task_dict, lm_eval from GPTQ import GenericGPTQRunner, InputRecorder - from eval import get_task_dict, evaluate, lm_eval except: pass @@ -27,34 +27,39 @@ precision = torch.float + def set_precision(dtype): global precision precision = dtype + def get_precision(): global precision return precision + def name_to_dtype(name): if name in name_to_dtype_dict: return name_to_dtype_dict[name] else: raise RuntimeError(f"unsupported dtype name {name} specified") + name_to_dtype_dict = { - "fp32" : torch.float, - "fp16" : torch.float16, - "bf16" : torch.bfloat16, - "float" : torch.float, - "half" : torch.float16, - "float32" : torch.float, - "float16" : torch.float16, - "bfloat16" : torch.bfloat16, + "fp32": torch.float, + "fp16": torch.float16, + "bf16": torch.bfloat16, + "float": torch.float, + "half": torch.float16, + "float32": torch.float, + "float16": torch.float16, + "bfloat16": torch.bfloat16, } ########################################################################## ### process quantization dictionary ### + def quantize_model(model: nn.Module, device, quantize_options): """ Quantize the specified model using the quantizers described by @@ -73,46 +78,34 @@ def quantize_model(model: nn.Module, device, quantize_options): for quantizer, q_kwargs in quantize_options.items(): if quantizer == "embedding": model = EmbeddingOnlyInt8QuantHandler( - model, - device, - **q_kwargs + model, device, **q_kwargs ).quantized_model() elif linears_quantized: - assert 0==1, "can only specify one linear quantizer" + assert 0 == 1, "can only specify one linear quantizer" elif quantizer == "linear:int8": 
linears_quantized = True model = WeightOnlyInt8QuantHandler( - model, - device, - **q_kwargs + model, device, **q_kwargs ).quantized_model() elif quantizer == "linear:int4": linears_quantized = True model = WeightOnlyInt4QuantHandler( - model, - device, - **q_kwargs + model, device, **q_kwargs ).quantized_model() elif quantizer == "linear:a8w4dq": linears_quantized = True model = Int8DynActInt4WeightQuantHandler( - model, - device, - **q_kwargs + model, device, **q_kwargs ).quantized_model() elif quantizer == "linear:gptq": linears_quantized = True model = WeightOnlyInt4GPTQQuantHandler( - model, - device, - **q_kwargs + model, device, **q_kwargs ).quantized_model() elif quantizer == "linear:hqq": linears_quantized = True model = WeightOnlyInt4HqqQuantHandler( - model, - device, - **q_kwargs + model, device, **q_kwargs ).quantized_model() elif quantizer == "precision": model.to(**q_kwargs) @@ -123,6 +116,7 @@ def quantize_model(model: nn.Module, device, quantize_options): ######################################################################### ##### Quantization Primitives ###### + def dynamically_quantize_per_channel( x, quant_min, @@ -217,8 +211,7 @@ def dynamically_quantize_per_channel( return quant, scales, zero_points - -def get_group_qparams(w, n_bit=4, groupsize=128, *, scales_dtype= torch.float): +def get_group_qparams(w, n_bit=4, groupsize=128, *, scales_dtype=torch.float): # needed for GPTQ with padding if groupsize > w.shape[-1]: groupsize = w.shape[-1] @@ -324,6 +317,7 @@ def group_dequantize_tensor(w_int32, scales_and_zeros, n_bit=4, groupsize=128): w_int32, scales, zeros, n_bit, groupsize ) + ######################################################################### ### QuantHandler API definition ### @@ -349,9 +343,11 @@ def quantized_model(self) -> nn.Module: ##### Weight-only int8 per-channel quantized code ###### -def replace_linear_weight_only_int8_per_channel(module, device, node_type, groupsize=None): +def replace_linear_weight_only_int8_per_channel( + module, device, node_type, groupsize=None +): if groupsize is not None and groupsize != 0: - pass # groupsize = 2 ** groupsize + pass # groupsize = 2 ** groupsize for name, child in module.named_children(): # print(f"name: {name}") @@ -367,10 +363,14 @@ def replace_linear_weight_only_int8_per_channel(module, device, node_type, group setattr( module, name, - WeightOnlyInt8Linear(device, child.in_features, child.out_features, groupsize), + WeightOnlyInt8Linear( + device, child.in_features, child.out_features, groupsize + ), ) else: - replace_linear_weight_only_int8_per_channel(child, device, node_type, groupsize) + replace_linear_weight_only_int8_per_channel( + child, device, node_type, groupsize + ) class WeightOnlyInt8QuantHandler(QuantHandler): @@ -443,7 +443,9 @@ def create_quantized_state_dict(self) -> Dict: return cur_state_dict def convert_for_runtime(self) -> nn.Module: - replace_linear_weight_only_int8_per_channel(self.mod, self.device, self.node_type, self.groupsize) + replace_linear_weight_only_int8_per_channel( + self.mod, self.device, self.node_type, self.groupsize + ) return self.mod def quantized_model(self) -> nn.Module: @@ -474,14 +476,19 @@ def __init__( self.in_features = in_features self.out_features = out_features self.register_buffer( - "weight", torch.empty((out_features, in_features), dtype=torch.int8, device=device) + "weight", + torch.empty((out_features, in_features), dtype=torch.int8, device=device), ) - dtype=get_precision() + dtype = get_precision() if groupsize is None or (groupsize == 0): - 
self.register_buffer("scales", torch.ones(out_features, dtype=dtype, device=device)) + self.register_buffer( + "scales", torch.ones(out_features, dtype=dtype, device=device) + ) else: groups = (in_features + groupsize - 1) // groupsize - self.register_buffer("scales", torch.ones(out_features, groups, dtype=dtype, device=device)) + self.register_buffer( + "scales", torch.ones(out_features, groups, dtype=dtype, device=device) + ) def forward(self, input: torch.Tensor) -> torch.Tensor: scales = self.scales @@ -496,7 +503,13 @@ def forward(self, input: torch.Tensor) -> torch.Tensor: if scales.shape[1] == 1: return F.linear(input, weight.to(dtype=input.dtype)) * self.scales else: - return F.linear(input, (weight.to(dtype=input.dtype).view(weight.shape[0], no_groups, -1) * scales.view(weight.shape[0], no_groups, -1)).view(weight.shape[0], -1)) + return F.linear( + input, + ( + weight.to(dtype=input.dtype).view(weight.shape[0], no_groups, -1) + * scales.view(weight.shape[0], no_groups, -1) + ).view(weight.shape[0], -1), + ) ######################################################################### @@ -504,7 +517,7 @@ def forward(self, input: torch.Tensor) -> torch.Tensor: def replace_embedding_weight_only_grouped_int8_per_channel( - module, device, bitwidth: int = 8, groupsize: Optional[int] = None, packed = False + module, device, bitwidth: int = 8, groupsize: Optional[int] = None, packed=False ): for name, child in module.named_children(): # print(f"name: {name}") @@ -529,9 +542,17 @@ def replace_embedding_weight_only_grouped_int8_per_channel( class EmbeddingOnlyInt8QuantHandler(QuantHandler): - def __init__(self, mod, device, *, bitwidth: int = 8, groupsize: Optional[int] = None, packed = False): + def __init__( + self, + mod, + device, + *, + bitwidth: int = 8, + groupsize: Optional[int] = None, + packed=False, + ): if isinstance(packed, str): - packed = (packed == "True") + packed = packed == "True" self.mod = mod self.device = device self.groupsize = groupsize @@ -540,7 +561,6 @@ def __init__(self, mod, device, *, bitwidth: int = 8, groupsize: Optional[int] = if (bitwidth != 4) and packed: raise RuntimeError("pack only works with bitsize 4") - @torch.no_grad() def create_quantized_state_dict(self, packed=False) -> Dict: cur_state_dict = self.mod.state_dict() @@ -555,9 +575,7 @@ def create_quantized_state_dict(self, packed=False) -> Dict: raise ValueError(f"Unsupported bitwidth {self.bitwidth}") for fqn, mod in self.mod.named_modules(): - if ( - isinstance(mod, nn.Embedding) - ): + if isinstance(mod, nn.Embedding): # print("****") # print(f"Embedding identified: {fqn, mod}") # print(f"weights size: {mod.weight.size()}") @@ -576,17 +594,15 @@ def create_quantized_state_dict(self, packed=False) -> Dict: ) if packed: - if weight.shape[-1] %2 != 0: + if weight.shape[-1] % 2 != 0: raise RuntimeError("automatic padding not implemented yet") weight_range_shifted = weight.add(8).view(torch.uint8) weight_view = weight_range_shifted.view( - weight.shape[0], - weight.shape[1] //2, - 2 - ) - weight_even = weight_view[:,:,0] * 16 # left shift 4 - weight_odd = weight_view[:,:,1] + weight.shape[0], weight.shape[1] // 2, 2 + ) + weight_even = weight_view[:, :, 0] * 16 # left shift 4 + weight_odd = weight_view[:, :, 1] weight_packed = weight_even + weight_odd weight = weight_packed @@ -630,16 +646,25 @@ def __init__( self.packed = packed if not packed: self.register_buffer( - "weight", torch.empty((vocab_size, embedding_dim), dtype=torch.int8, device=device) + "weight", + torch.empty( + (vocab_size, 
embedding_dim), dtype=torch.int8, device=device + ), ) - else: # packed + else: # packed self.register_buffer( - "weight", torch.empty((vocab_size, embedding_dim//2), dtype=torch.uint8, device=device) + "weight", + torch.empty( + (vocab_size, embedding_dim // 2), dtype=torch.uint8, device=device + ), ) groups_per_row = (embedding_dim + groupsize - 1) // groupsize if groups_per_row > 1: self.register_buffer( - "scales", torch.ones((vocab_size, groups_per_row), dtype=torch.float16, device=device) + "scales", + torch.ones( + (vocab_size, groups_per_row), dtype=torch.float16, device=device + ), ) else: self.register_buffer( @@ -648,17 +673,16 @@ def __init__( @torch.no_grad() def forward(self, indices: torch.Tensor) -> torch.Tensor: - if False: # Used for Executorch + if False: # Used for Executorch return torch.ops.llama_quantized.embedding_byte.dtype( self.weight, self.scales, None, 0, 0, indices, dtype=self.dtype ) - # result_weights = self.weight.index_select(0, indices.view(-1)) # result_scales = self.scales.index_select(0, indices.view(-1)) if self.packed: - weight_even = self.weight.div(16, rounding_mode='trunc') + weight_even = self.weight.div(16, rounding_mode="trunc") weight_odd = self.weight.remainder(16) weight_unpacked = torch.stack((weight_even, weight_odd), dim=-1) weight = weight_unpacked.view(self.weight.shape[0], -1) @@ -671,8 +695,22 @@ def forward(self, indices: torch.Tensor) -> torch.Tensor: result_weights = F.embedding(indices, weight) result_scales = F.embedding(indices, scales) - rw_view = result_weights.to(dtype=result_scales.dtype).view(tuple(result_weights.shape[:-1] + (scales.shape[1], -1, ))) - rs_view = result_scales.view(tuple(result_scales.shape[:-1]) + (scales.shape[1], 1, )) + rw_view = result_weights.to(dtype=result_scales.dtype).view( + tuple( + result_weights.shape[:-1] + + ( + scales.shape[1], + -1, + ) + ) + ) + rs_view = result_scales.view( + tuple(result_scales.shape[:-1]) + + ( + scales.shape[1], + 1, + ) + ) # print(f"rw_view {rw_view.shape}") # print(f"rs_view {rs_view.shape}") @@ -685,17 +723,25 @@ def forward(self, indices: torch.Tensor) -> torch.Tensor: ######################################################################### ##### weight only int4 per channel groupwise quantized code ###### -def _int4_prepare_int4_weight_and_scales_and_zeros(weight_bf16, groupsize, inner_k_tiles): + +def _int4_prepare_int4_weight_and_scales_and_zeros( + weight_bf16, groupsize, inner_k_tiles +): weight_int32, scales_and_zeros = group_quantize_tensor( weight_bf16, n_bit=4, groupsize=groupsize ) - weight_int4pack = torch.ops.aten._convert_weight_to_int4pack(weight_int32, inner_k_tiles) + weight_int4pack = torch.ops.aten._convert_weight_to_int4pack( + weight_int32, inner_k_tiles + ) return weight_int4pack, scales_and_zeros + def _int4_calc_padded_size(k, groupsize=1, innner_k_tiles=1): from build.model import find_multiple + return find_multiple(k, 1024) + def linear_forward_int4(x, weight_int4pack, scales_and_zeros, out_features, groupsize): origin_x_size = x.size() x = x.reshape(-1, origin_x_size[-1]) @@ -705,31 +751,41 @@ def linear_forward_int4(x, weight_int4pack, scales_and_zeros, out_features, grou if "mps" in str(x.device): new_shape = origin_x_size[:-1] + (out_features,) return torch.zeros(new_shape, dtype=x.dtype, device=x.device) - + c = torch.ops.aten._weight_int4pack_mm( - x.to(torch.bfloat16), # TODO: should probably make a warning if x is not already bfloat16 + x.to( + torch.bfloat16 + ), # TODO: should probably make a warning if x is not already 
bfloat16 weight_int4pack, groupsize, - scales_and_zeros.to(torch.bfloat16), # TODO: should probably make a warning if not already bfloat16 - ).to(x.dtype) # cast back to x.dtype + scales_and_zeros.to( + torch.bfloat16 + ), # TODO: should probably make a warning if not already bfloat16 + ).to( + x.dtype + ) # cast back to x.dtype new_shape = origin_x_size[:-1] + (out_features,) c = c.reshape(new_shape) return c -def _int4_check_linear_int4_k(k, groupsize = 1, inner_k_tiles = 1): +def _int4_check_linear_int4_k(k, groupsize=1, inner_k_tiles=1): return k % groupsize == 0 and k % (inner_k_tiles * 16) == 0 + def replace_linear_int4( - module, - device, - groupsize, - inner_k_tiles, - padding_allowed, + module, + device, + groupsize, + inner_k_tiles, + padding_allowed, ): for name, child in module.named_children(): if isinstance(child, nn.Linear): - if _int4_check_linear_int4_k(child.in_features, groupsize, inner_k_tiles) or padding_allowed: + if ( + _int4_check_linear_int4_k(child.in_features, groupsize, inner_k_tiles) + or padding_allowed + ): setattr( module, name, @@ -740,7 +796,8 @@ def replace_linear_int4( bias=False, groupsize=groupsize, inner_k_tiles=inner_k_tiles, - )) + ), + ) else: replace_linear_int4( child, device, groupsize, inner_k_tiles, padding_allowed @@ -748,7 +805,9 @@ def replace_linear_int4( class WeightOnlyInt4QuantHandler(QuantHandler): - def __init__(self, mod, device, *, groupsize=128, inner_k_tiles=8, padding_allowed=True): + def __init__( + self, mod, device, *, groupsize=128, inner_k_tiles=8, padding_allowed=True + ): self.mod = mod self.device = device self.groupsize = groupsize @@ -769,19 +828,30 @@ def create_quantized_state_dict(self): print(f"linear: {fqn}, in={in_features}, out={out_features}") weight = mod.weight.data - if not _int4_check_linear_int4_k(in_features, self.groupsize, self.inner_k_tiles): + if not _int4_check_linear_int4_k( + in_features, self.groupsize, self.inner_k_tiles + ): if self.padding_allowed: - from build.model import find_multiple import torch.nn.functional as F - print(f"warning: {fqn} is padded to satisfy in_features % 1024 == 0") + from build.model import find_multiple + + print( + f"warning: {fqn} is padded to satisfy in_features % 1024 == 0" + ) padded_in_features = find_multiple(in_features, 1024) - weight = F.pad(weight, pad=(0, padded_in_features - in_features)) + weight = F.pad( + weight, pad=(0, padded_in_features - in_features) + ) else: - print(f"warning: {fqn} is skipped, int4 requires that in_features is 32, 64, or is divisible by 1024, " + - "and that groupsize and inner_k_tiles*16 evenly divide into it") + print( + f"warning: {fqn} is skipped, int4 requires that in_features is 32, 64, or is divisible by 1024, " + + "and that groupsize and inner_k_tiles*16 evenly divide into it" + ) continue - weight_int4pack, scales_and_zeros = _int4_prepare_int4_weight_and_scales_and_zeros( - weight.to(torch.float), self.groupsize, self.inner_k_tiles + weight_int4pack, scales_and_zeros = ( + _int4_prepare_int4_weight_and_scales_and_zeros( + weight.to(torch.float), self.groupsize, self.inner_k_tiles + ) ) weight_int4pack = weight_int4pack.to(device=self.device) scales_and_zeros = scales_and_zeros.to(device=self.device) @@ -790,9 +860,14 @@ def create_quantized_state_dict(self): return cur_state_dict - def convert_for_runtime(self): - replace_linear_int4(self.mod, self.device, self.groupsize, self.inner_k_tiles, self.padding_allowed) + replace_linear_int4( + self.mod, + self.device, + self.groupsize, + self.inner_k_tiles, + 
self.padding_allowed, + ) return self.mod def quantized_model(self) -> nn.Module: @@ -803,25 +878,28 @@ def quantized_model(self) -> nn.Module: class WeightOnlyInt4Linear(torch.nn.Module): - __constants__ = ['in_features', 'out_features'] + __constants__ = ["in_features", "out_features"] in_features: int out_features: int weight: torch.Tensor def __init__( - self, - device: str, - in_features: int, - out_features: int, - bias=True, - dtype=None, - groupsize: int = 128, - inner_k_tiles: int = 8, + self, + device: str, + in_features: int, + out_features: int, + bias=True, + dtype=None, + groupsize: int = 128, + inner_k_tiles: int = 8, ) -> None: super().__init__() - self.padding = not _int4_check_linear_int4_k(in_features, groupsize, inner_k_tiles) + self.padding = not _int4_check_linear_int4_k( + in_features, groupsize, inner_k_tiles + ) if self.padding: from build.model import find_multiple + self.origin_in_features = in_features in_features = find_multiple(in_features, 1024) @@ -832,14 +910,21 @@ def __init__( self.inner_k_tiles = inner_k_tiles assert out_features % 8 == 0, "require out_features % 8 == 0" - assert in_features % (inner_k_tiles * 16) == 0, "require in_features % (innerKTiles * 16) == 0" + assert ( + in_features % (inner_k_tiles * 16) == 0 + ), "require in_features % (innerKTiles * 16) == 0" self.register_buffer( "weight", torch.empty( - (out_features // 8, in_features // (inner_k_tiles * 16), 32, inner_k_tiles // 2), + ( + out_features // 8, + in_features // (inner_k_tiles * 16), + 32, + inner_k_tiles // 2, + ), dtype=torch.int32, device=device, - ) + ), ) # MKG: torch.float self.register_buffer( @@ -848,7 +933,7 @@ def __init__( (in_features // groupsize, out_features, 2), dtype=get_precision(), device=device, - ) + ), ) def forward(self, input: torch.Tensor) -> torch.Tensor: @@ -856,18 +941,17 @@ def forward(self, input: torch.Tensor) -> torch.Tensor: # input = input.to(torch.float) if self.padding: import torch.nn.functional as F + input = F.pad(input, pad=(0, self.in_features - self.origin_in_features)) return linear_forward_int4( - input, - self.weight, - self.scales_and_zeros, - self.out_features, - self.groupsize + input, self.weight, self.scales_and_zeros, self.out_features, self.groupsize ) + ######################################################################### ##### Int8 Dynamic Activations 4 Bit Weights ##### + def prepare_int4_weight_and_scales_and_zeros(weight, groupsize, precision): weight_int8, scales, zeros = group_quantize_tensor_symmetric( weight, @@ -924,6 +1008,7 @@ def find_multiple(n: int, *args: Tuple[int]) -> int: def _check_linear_int4_k(k, groupsize=1): return k % groupsize == 0 + def _calc_padded_size_linear_int4(k, groupsize=1): return find_multiple(k, groupsize) @@ -965,7 +1050,7 @@ def __init__( self, mod, device, - * , + *, groupsize=256, padding_allowed=False, precision=torch.float32, @@ -1130,6 +1215,7 @@ def forward(self, input: torch.Tensor) -> torch.Tensor: ######################################################################### ##### GPTQ ##### + class GPTQQuantHandler(QuantHandler): """ This class implements a GPTQ QuantHandler that can be used to apply GPTQ to a model in concert with the GenericGPTQRunner class. @@ -1192,6 +1278,7 @@ class GPTQQuantHandler(QuantHandler): names_and_values_dict: a dictionary mapping the name of the parameters of the quantized module to the corresponding quantized weights and qparams. 
""" + def __init__(self): assert self.mod is not None assert self.get_qparams_func is not None @@ -1201,7 +1288,14 @@ def __init__(self): assert self.make_names_and_values_dict_func is not None @staticmethod - def get_inputs(model, tokenizer, calibration_tasks, calibration_limit, calibration_seq_length, pad_calibration_inputs) -> "MultiInput": + def get_inputs( + model, + tokenizer, + calibration_tasks, + calibration_limit, + calibration_seq_length, + pad_calibration_inputs, + ) -> "MultiInput": input_recorder = InputRecorder( model, tokenizer, @@ -1223,9 +1317,9 @@ def get_inputs(model, tokenizer, calibration_tasks, calibration_limit, calibrati ) inputs = input_recorder.get_recorded_inputs() assert inputs is not None, ( - f"No inputs were collected, use a task other than {calibration_tasks}, "+ - f"use option pad_calibration_inputs, or decrease calibration_sequence_length (currently "+ - f"{calibration_seq_length})" + f"No inputs were collected, use a task other than {calibration_tasks}, " + + f"use option pad_calibration_inputs, or decrease calibration_sequence_length (currently " + + f"{calibration_seq_length})" ) print(f"Obtained {len(inputs[0].values)} calibration samples") return inputs @@ -1242,7 +1336,14 @@ def create_quantized_state_dict( calibration_seq_length, pad_calibration_inputs, ) -> "StateDict": - inputs = GPTQQuantHandler.get_inputs(self.mod, tokenizer, calibration_tasks, calibration_limit, calibration_seq_length, pad_calibration_inputs) + inputs = GPTQQuantHandler.get_inputs( + self.mod, + tokenizer, + calibration_tasks, + calibration_limit, + calibration_seq_length, + pad_calibration_inputs, + ) print("Tracing model for GPTQ") GPTQ_runner = GenericGPTQRunner( self.mod, @@ -1256,7 +1357,7 @@ def create_quantized_state_dict( self.dequantize_func, self.combine_qparams_list_func, self.make_names_and_values_dict_func, - self.skip_layer_func + self.skip_layer_func, ) print("Applying GPTQ to weights") @@ -1270,40 +1371,52 @@ def convert_for_runtime(self) -> "nn.Module": class WeightOnlyInt4GPTQQuantHandler(GPTQQuantHandler): def __init__(self, mod, device, *, groupsize=128, inner_k_tiles=8, padding=True): from build.model import find_multiple + self.mod = mod self.device = device self.groupsize = groupsize self.inner_k_tiles = inner_k_tiles self.padding = padding self.get_qparams_func = lambda w: get_group_qparams(w, 4, groupsize) - self.quantize_func = lambda w, qparams: \ - group_quantize_tensor_from_qparams(w, qparams[0], qparams[1], 4, groupsize) - self.dequantize_func = lambda q, qparams: \ - group_dequantize_tensor_from_qparams(q, qparams[0], qparams[1], 4, groupsize).float() - self.combine_qparams_list_func = lambda qparams_list: \ - [torch.cat(x, dim=1) for x in zip(*qparams_list)] + self.quantize_func = lambda w, qparams: group_quantize_tensor_from_qparams( + w, qparams[0], qparams[1], 4, groupsize + ) + self.dequantize_func = lambda q, qparams: group_dequantize_tensor_from_qparams( + q, qparams[0], qparams[1], 4, groupsize + ).float() + self.combine_qparams_list_func = lambda qparams_list: [ + torch.cat(x, dim=1) for x in zip(*qparams_list) + ] # skip unless padding=True or its correctly sized self.skip_layer_func = lambda linear_weight: not ( - _check_linear_int4_k(linear_weight.shape[-1], groupsize, inner_k_tiles) or padding + _check_linear_int4_k(linear_weight.shape[-1], groupsize, inner_k_tiles) + or padding ) + # we need to do the padding here, both for q and the qparams if necessary def make_names_and_values_dict_func(q, qparams): k = q.shape[1] new_k = 
find_multiple(k, 1024) # how much we need to pad the weight delta_k = new_k - q.shape[1] - final_q = torch.ops.aten._convert_weight_to_int4pack(F.pad(q, pad=(0, delta_k)), inner_k_tiles) + final_q = torch.ops.aten._convert_weight_to_int4pack( + F.pad(q, pad=(0, delta_k)), inner_k_tiles + ) scales_and_zeros = pack_scales_and_zeros(*qparams) # how many new groups we need for padded weight delta_groups = new_k // groupsize - scales_and_zeros.shape[0] - final_s_and_z = F.pad(scales_and_zeros, pad=(0,0,0,0,0, delta_groups), value=1) + final_s_and_z = F.pad( + scales_and_zeros, pad=(0, 0, 0, 0, 0, delta_groups), value=1 + ) return {"weight": final_q, "scales_and_zeros": final_s_and_z} + self.make_names_and_values_dict_func = make_names_and_values_dict_func super().__init__() - def convert_for_runtime(self): - replace_linear_int4(self.mod, self.device, self.groupsize, self.inner_k_tiles, self.padding) + replace_linear_int4( + self.mod, self.device, self.groupsize, self.inner_k_tiles, self.padding + ) return self.mod def quantized_model(self) -> nn.Module: @@ -1313,7 +1426,6 @@ def quantized_model(self) -> nn.Module: return self.mod - # class Int8DynActInt4WeightGPTQQuantHandler(GPTQQuantHandler): # def __init__( # self, @@ -1388,6 +1500,7 @@ def quantized_model(self) -> nn.Module: ################################################################## ### WIP: HQQ ### + class WeightOnlyInt4HqqQuantHandler: def __init__(self, mod, device, *, groupsize): self.mod = mod @@ -1397,7 +1510,6 @@ def __init__(self, mod, device, *, groupsize): def create_quantized_state_dict(self): from hqq.core.quantize import Quantizer # TODO maybe torchao - for m in self.mod.modules(): for name, child in m.named_children(): if isinstance(child, torch.nn.Linear): diff --git a/quantized_ops.py b/quantized_ops.py index 7ac39b85e..e01cdf3af 100644 --- a/quantized_ops.py +++ b/quantized_ops.py @@ -10,15 +10,13 @@ import torch.nn.functional as F from torch.library import impl, impl_abstract -torchat_lib = torch.library.Library( - "torchat", "DEF" -) +torchat_lib = torch.library.Library("torchat", "DEF") torchat_lib.define( - "embedding_int8(Tensor input, Tensor weight, " - "Tensor scales) -> Tensor", + "embedding_int8(Tensor input, Tensor weight, " "Tensor scales) -> Tensor", ) + @impl(torchat_lib, "embedding_int8", "CompositeExplicitAutograd") def embedding_int8( input: torch.Tensor, @@ -27,9 +25,7 @@ def embedding_int8( ) -> torch.Tensor: indices = input # embedding_byte_weight_checks(weight, weight_scales, weight_zero_points) - groupsize = weight.size(1) // ( - scales.size(1) if scales.dim() == 2 else 1 - ) + groupsize = weight.size(1) // (scales.size(1) if scales.dim() == 2 else 1) # ET definition if False: weight_zero_points = None @@ -45,68 +41,82 @@ def embedding_int8( ) return torch.ops.aten.embedding.default(weight, indices) - scales = scales.view(weight.shape[0], -1) + scales = scales.view(weight.shape[0], -1) result_weights = F.embedding(indices, weight) result_scales = F.embedding(indices, scales) - rw_view = result_weights.to(dtype=result_scales.dtype).view(tuple(result_weights.shape[:-1]) + (scales.shape[1], -1, )) - rs_view = result_scales.view(tuple(result_scales.shape[:-1]) + (scales.shape[1], 1, )) + rw_view = result_weights.to(dtype=result_scales.dtype).view( + tuple(result_weights.shape[:-1]) + + ( + scales.shape[1], + -1, + ) + ) + rs_view = result_scales.view( + tuple(result_scales.shape[:-1]) + + ( + scales.shape[1], + 1, + ) + ) # print(f"rw_view {rw_view.shape}") # print(f"rs_view {rs_view.shape}") r = 
rw_view * rs_view return r.view(indices.size() + (-1,)) - - + + torchat_lib.define( "linear_int8(Tensor input, Tensor weight, Tensor scales, " "Tensor bias = None) -> Tensor", ) + @impl(torchat_lib, "linear_int8", "CompositeExplicitAutograd") def linear_int8( - input: torch.Tensor, - weight: torch.Tensor, - scales: torch.Tensor, - bias: Optional[torch.Tensor] = None, + input: torch.Tensor, + weight: torch.Tensor, + scales: torch.Tensor, + bias: Optional[torch.Tensor] = None, ) -> torch.Tensor: assert bias is None, "bias != None not implemented" - + scales = scales.view(scales.shape[0], -1) no_groups = scales.shape[1] # for now, we special-case channel-wise, because we know how to - # make that fast with Triton + # make that fast with Triton if scales.shape[1] == 1: return F.linear(input, weight.to(dtype=input.dtype)) * scales else: return F.linear( input, - (weight.to(dtype=input.dtype).view(weight.shape[0],no_groups, -1) - * scales.view(weight.shape[0], no_groups, -1) - ).view(weight.shape[0], -1) + ( + weight.to(dtype=input.dtype).view(weight.shape[0], no_groups, -1) + * scales.view(weight.shape[0], no_groups, -1) + ).view(weight.shape[0], -1), ) - torchat_lib.define( "linear_int4(Tensor input, Tensor weight, Tensor scales_and_zeros, " "Tensor bias=None, *, int groupsize, int origin_in_features, " "int int_features, int out_features, bool padding = True) -> Tensor", ) + @impl(torchat_lib, "linear_int4", "CompositeExplicitAutograd") def linear_int4( - input: torch.Tensor, - weight: torch.Tensor, - scales_and_zeros: torch.Tensor, - bias: torch.Tensor, - *, - groupsize: int, - origin_in_features: int, - in_features: int, - out_features: int, - padding: bool = True, + input: torch.Tensor, + weight: torch.Tensor, + scales_and_zeros: torch.Tensor, + bias: torch.Tensor, + *, + groupsize: int, + origin_in_features: int, + in_features: int, + out_features: int, + padding: bool = True, ) -> torch.Tensor: assert bias is None, "bias != None not implemented" @@ -116,7 +126,7 @@ def linear_int4( # the weight is in int4pack format # rename to remind ourselves of that weight_int4pack = weight - + origin_input_size = input.size() input = input.reshape(-1, origin_input_size[-1]) c = torch.ops.aten._weight_int4pack_mm( @@ -136,10 +146,9 @@ def linear_int4( "dtype precision) -> Tensor", ) + @impl(torchat_lib, "linear_a8w4dq", "CompositeExplicitAutograd") -def linear_a8w4dq( - input, weight, scales, zeros, out_features, groupsize, precision -): +def linear_a8w4dq(input, weight, scales, zeros, out_features, groupsize, precision): x = per_token_dynamic_quant(input) weight_int8 = weight # TODO: verify and remove following reshape code diff --git a/requirements-lintrunner.txt b/requirements-lintrunner.txt new file mode 100644 index 000000000..45bf4d81b --- /dev/null +++ b/requirements-lintrunner.txt @@ -0,0 +1,18 @@ +# Lintrunner itself +lintrunner==0.11.0 +lintrunner-adapters==0.11.0 + +# Flake 8 and its dependencies +flake8==6.0.0 +flake8-breakpoint==1.1.0 +flake8-bugbear==23.6.5 +flake8-comprehensions==3.12.0 +flake8-pyi==23.5.0 +mccabe==0.7.0 +pycodestyle==2.10.0 +torchfix==0.1.1 + +# UFMT +black==24.2.0 +ufmt==2.5.1 +usort==1.0.5 diff --git a/scripts/convert_hf_checkpoint.py b/scripts/convert_hf_checkpoint.py index 428c4a733..6ac1dc3e1 100644 --- a/scripts/convert_hf_checkpoint.py +++ b/scripts/convert_hf_checkpoint.py @@ -22,7 +22,9 @@ @torch.inference_mode() def convert_hf_checkpoint( *, - checkpoint_dir: Path = Path("checkpoints/meta-Transformer/Transformer-2-7b-chat-hf"), + checkpoint_dir: Path = Path( 
+ "checkpoints/meta-Transformer/Transformer-2-7b-chat-hf" + ), model_name: Optional[str] = None, ) -> None: if model_name is None: @@ -45,8 +47,8 @@ def convert_hf_checkpoint( "model.layers.{}.self_attn.k_proj.weight": "layers.{}.attention.wk.weight", "model.layers.{}.self_attn.v_proj.weight": "layers.{}.attention.wv.weight", "model.layers.{}.self_attn.o_proj.weight": "layers.{}.attention.wo.weight", - 'model.layers.{}.self_attn.rotary_emb.inv_freq': None, - 'model.layers.{}.mlp.gate_proj.weight': 'layers.{}.feed_forward.w1.weight', + "model.layers.{}.self_attn.rotary_emb.inv_freq": None, + "model.layers.{}.mlp.gate_proj.weight": "layers.{}.feed_forward.w1.weight", "model.layers.{}.mlp.up_proj.weight": "layers.{}.feed_forward.w3.weight", "model.layers.{}.mlp.down_proj.weight": "layers.{}.feed_forward.w2.weight", "model.layers.{}.input_layernorm.weight": "layers.{}.attention_norm.weight", @@ -66,13 +68,15 @@ def permute(w, n_heads): merged_result = {} for file in sorted(bin_files): - state_dict = torch.load(str(file), map_location="cpu", mmap=True, weights_only=True) + state_dict = torch.load( + str(file), map_location="cpu", mmap=True, weights_only=True + ) merged_result.update(state_dict) final_result = {} for key, value in merged_result.items(): if "layers" in key: - abstract_key = re.sub(r'(\d+)', '{}', key) - layer_num = re.search(r'\d+', key).group(0) + abstract_key = re.sub(r"(\d+)", "{}", key) + layer_num = re.search(r"\d+", key).group(0) new_key = weight_map[abstract_key] if new_key is None: continue @@ -96,11 +100,17 @@ def permute(w, n_heads): print(f"Saving checkpoint to {checkpoint_dir / 'model.pth'}") torch.save(final_result, checkpoint_dir / "model.pth") -if __name__ == '__main__': + +if __name__ == "__main__": import argparse - parser = argparse.ArgumentParser(description='Convert HuggingFace checkpoint.') - parser.add_argument('--checkpoint-dir', type=Path, default=Path("checkpoints/meta-llama/llama-2-7b-chat-hf")) - parser.add_argument('--model-name', type=str, default=None) + + parser = argparse.ArgumentParser(description="Convert HuggingFace checkpoint.") + parser.add_argument( + "--checkpoint-dir", + type=Path, + default=Path("checkpoints/meta-llama/llama-2-7b-chat-hf"), + ) + parser.add_argument("--model-name", type=str, default=None) args = parser.parse_args() convert_hf_checkpoint( diff --git a/scripts/download.py b/scripts/download.py index 849095ddf..387c41243 100644 --- a/scripts/download.py +++ b/scripts/download.py @@ -11,6 +11,7 @@ def hf_download(repo_id: Optional[str] = None, hf_token: Optional[str] = None) -> None: from huggingface_hub import snapshot_download + os.makedirs(f"checkpoints/{repo_id}", exist_ok=True) try: snapshot_download( @@ -18,18 +19,30 @@ def hf_download(repo_id: Optional[str] = None, hf_token: Optional[str] = None) - local_dir=f"checkpoints/{repo_id}", local_dir_use_symlinks=False, token=hf_token, - ignore_patterns="*safetensors*") + ignore_patterns="*safetensors*", + ) except HTTPError as e: if e.response.status_code == 401: - print("You need to pass a valid `--hf_token=...` to download private checkpoints.") + print( + "You need to pass a valid `--hf_token=...` to download private checkpoints." 
+ ) else: raise e -if __name__ == '__main__': + +if __name__ == "__main__": import argparse - parser = argparse.ArgumentParser(description='Download data from HuggingFace Hub.') - parser.add_argument('--repo-id', type=str, default="checkpoints/meta-llama/llama-2-7b-chat-hf", help='Repository ID to download from.') - parser.add_argument('--hf-token', type=str, default=None, help='HuggingFace API token.') + + parser = argparse.ArgumentParser(description="Download data from HuggingFace Hub.") + parser.add_argument( + "--repo-id", + type=str, + default="checkpoints/meta-llama/llama-2-7b-chat-hf", + help="Repository ID to download from.", + ) + parser.add_argument( + "--hf-token", type=str, default=None, help="HuggingFace API token." + ) args = parser.parse_args() hf_download(args.repo_id, args.hf_token) diff --git a/torchat.py b/torchat.py index 4b720b8dd..d01b12f88 100644 --- a/torchat.py +++ b/torchat.py @@ -4,24 +4,25 @@ # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. -import time import os +import time from pathlib import Path import torch import torch.nn as nn -from torch.export import Dim, export +from cli import check_args, cli_args +from eval import main as eval_main from export import main as export_main from generate import main as generate_main -from eval import main as eval_main -from cli import cli_args, check_args +from torch.export import Dim, export default_device = "cpu" # 'cuda' if torch.cuda.is_available() else 'cpu' + def cli(): args = cli_args() - + if args.generate or args.chat: check_args(args, "generate") generate_main(args) @@ -32,6 +33,7 @@ def cli(): export_main(args) else: raise RuntimeError("must specify either --generate or --export") - + + if __name__ == "__main__": cli() diff --git a/utils/tokenizer.py b/utils/tokenizer.py index f3c0cc324..b20dccf1d 100644 --- a/utils/tokenizer.py +++ b/utils/tokenizer.py @@ -2,14 +2,15 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 
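Note on the torchat.py entry point above: cli() routes a single parsed argument namespace to generate_main, eval_main, or export_main, with generation/chat taking priority over eval and export. The following is a minimal sketch, not part of this diff, that only mirrors that routing order with a plain argparse.Namespace so the precedence can be exercised in isolation; the attribute names (generate, chat, eval, export) come from cli() itself, but the actual command-line flag spellings are defined in cli.py, which this diff does not show, and dispatch_mode is a hypothetical helper.

from argparse import Namespace


def dispatch_mode(args: Namespace) -> str:
    # Mirrors cli() above: generation (or chat) wins, then eval, then export.
    if getattr(args, "generate", False) or getattr(args, "chat", False):
        return "generate"
    if getattr(args, "eval", False):
        return "eval"
    if getattr(args, "export", False):
        return "export"
    raise RuntimeError("must specify either --generate or --export")


assert dispatch_mode(Namespace(generate=True)) == "generate"
assert dispatch_mode(Namespace(eval=True)) == "eval"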
+import argparse import os import struct -import argparse from typing import List from sentencepiece import SentencePieceProcessor -TOKENIZER_MODEL = "tokenizer.model" # the llama sentencepiece tokenizer model +TOKENIZER_MODEL = "tokenizer.model" # the llama sentencepiece tokenizer model + class Tokenizer: def __init__(self, tokenizer_model=None): @@ -23,7 +24,7 @@ def __init__(self, tokenizer_model=None): self.bos_id: int = self.sp_model.bos_id() self.eos_id: int = self.sp_model.eos_id() self.pad_id: int = self.sp_model.pad_id() - #print(f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}") + # print(f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}") assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() def encode(self, s: str, bos: bool, eos: bool) -> List[int]: @@ -48,11 +49,11 @@ def export(self): t = self.sp_model.id_to_piece(i) s = self.sp_model.get_score(i) if i == self.bos_id: - t = '\n\n' + t = "\n\n" elif i == self.eos_id: - t = '\n\n' - t = t.replace('▁', ' ') # sentencepiece uses this character as whitespace - b = t.encode('utf-8') # bytes of this token, utf-8 encoded + t = "\n\n" + t = t.replace("▁", " ") # sentencepiece uses this character as whitespace + b = t.encode("utf-8") # bytes of this token, utf-8 encoded tokens.append(b) scores.append(s) @@ -62,16 +63,19 @@ def export(self): # write to a binary file # the tokenizer.bin file is the same as .model file, but .bin - tokenizer_bin = self.model_path.replace('.model', '.bin') - with open(tokenizer_bin, 'wb') as f: + tokenizer_bin = self.model_path.replace(".model", ".bin") + with open(tokenizer_bin, "wb") as f: f.write(struct.pack("I", max_token_length)) for bytes, score in zip(tokens, scores): f.write(struct.pack("fI", score, len(bytes))) f.write(bytes) + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("-t", "--tokenizer-model", type=str, help="optional path to custom tokenizer ") + parser.add_argument( + "-t", "--tokenizer-model", type=str, help="optional path to custom tokenizer " + ) args = parser.parse_args() t = Tokenizer(args.tokenizer_model)
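Tokenizer.export() above writes tokenizer.bin as a small binary blob: one uint32 max_token_length header (struct.pack("I", ...)), followed by one record per token consisting of a float32 score, a uint32 byte length, and the raw UTF-8 bytes of the piece (struct.pack("fI", ...) plus the bytes). The sketch below, assuming that layout and native struct alignment on the same machine that wrote the file, shows how the file could be read back; read_tokenizer_bin is a hypothetical helper and is not part of this diff.

import struct


def read_tokenizer_bin(path: str):
    tokens, scores = [], []
    record_header = struct.calcsize("fI")  # float32 score + uint32 byte length
    with open(path, "rb") as f:
        (max_token_length,) = struct.unpack("I", f.read(4))
        while True:
            header = f.read(record_header)
            if len(header) < record_header:
                break  # end of file
            score, length = struct.unpack("fI", header)
            tokens.append(f.read(length).decode("utf-8"))
            scores.append(score)
    return max_token_length, tokens, scores


# Example: max_len, tokens, scores = read_tokenizer_bin("tokenizer.bin")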