"""Quantize the Typhoon-1.5x 8B instruct model to 4-bit AWQ (GEMM kernel).

Loads the full-precision checkpoint, runs AWQ quantization, and writes the
quantized weights plus tokenizer to a local output directory.

Requires: pip install autoawq==0.2.4
"""

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

if __name__ == '__main__':
    # Source checkpoint (Hugging Face hub id) and local output directory.
    model_path = 'scb10x/llama-3-typhoon-v1.5x-8b-instruct'
    quant_path = 'llama-3-typhoon-v1.5x-8b-instruct-awq'

    # AWQ settings: 4-bit weights, group size 128, zero-point quantization,
    # GEMM kernel variant (good general-purpose choice for batched inference).
    quant_config = {
        "zero_point": True,
        "q_group_size": 128,
        "w_bit": 4,
        "version": "GEMM",
    }

    # Save the tokenizer alongside the quantized weights so the output
    # directory is a self-contained, loadable model.
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    tokenizer.save_pretrained(quant_path)

    # Load full-precision model.
    model = AutoAWQForCausalLM.from_pretrained(model_path)

    # Quantize. TODO: pass calib_data=<huggingface dataset> to tune calibration
    # (e.g. a Thai + English unsupervised corpus such as en_wiki + th_wiki).
    model.quantize(tokenizer, quant_config=quant_config)

    # Save quantized model.
    model.save_quantized(quant_path)