"""Quantize the Typhoon-1.5x 8B instruct model to 4-bit AWQ (GEMM kernel).

Loads the full-precision checkpoint, runs AWQ quantization, and writes the
quantized weights plus tokenizer to a local output directory.

Requires: pip install autoawq==0.2.4
"""

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

if __name__ == '__main__':
    # Source checkpoint (Hugging Face hub id) and local output directory.
    model_path = 'scb10x/llama-3-typhoon-v1.5x-8b-instruct'
    quant_path = 'llama-3-typhoon-v1.5x-8b-instruct-awq'

    # AWQ settings: 4-bit weights, group size 128, zero-point quantization,
    # GEMM kernel variant (good general-purpose choice for batched inference).
    quant_config = {
        "zero_point": True,
        "q_group_size": 128,
        "w_bit": 4,
        "version": "GEMM",
    }

    # Save the tokenizer alongside the quantized weights so the output
    # directory is a self-contained, loadable model.
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    tokenizer.save_pretrained(quant_path)

    # Load full-precision model.
    model = AutoAWQForCausalLM.from_pretrained(model_path)

    # Quantize. TODO: pass calib_data=<huggingface dataset> to tune calibration
    # (e.g. a Thai + English unsupervised corpus such as en_wiki + th_wiki).
    model.quantize(tokenizer, quant_config=quant_config)

    # Save quantized model.
    model.save_quantized(quant_path)