【Quantize Guide】MiniCPM-Llama3-V 2.5
Last modified: August 26, 2024
👇
Suitable for: users who can adjust simple parameters in a Python script.
BitsAndBytes quantization script
import os
import time

import GPUtil
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig
model_path = '/root/ld/ld_model_pretrain/MiniCPM-Llama3-V-2_5'  # Model download path
device = 'cuda' if torch.cuda.is_available() else 'cpu'
save_path = '/root/ld/ld_model_pretrain/MiniCPM-Llama3-V-2_5_int4'  # Quantized model save path
image_path = '/root/ld/ld_project/MiniCPM-V/assets/airplane.jpeg'  # Test image path
# Create a configuration object to specify the quantization parameters
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Whether to perform 4-bit quantization
    load_in_8bit=False,  # Whether to perform 8-bit quantization
    bnb_4bit_compute_dtype=torch.float16,  # Compute precision during inference
    bnb_4bit_quant_storage=torch.uint8,  # Storage format for the quantized weights
    bnb_4bit_quant_type="nf4",  # Quantization data type; NF4 (4-bit NormalFloat) here
    bnb_4bit_use_double_quant=True,  # Whether to use double quantization, i.e., quantize the quantization constants (scales) a second time
    llm_int8_enable_fp32_cpu_offload=False,  # Whether to allow FP32 modules to be offloaded to the CPU during int8 loading
    llm_int8_has_fp16_weight=False,  # Whether to keep an fp16 copy of the weights (mixed int8/fp16)
    llm_int8_skip_modules=["out_proj", "kv_proj", "lm_head"],  # Modules that will not be quantized
    llm_int8_threshold=6.0  # Outlier threshold in the llm.int8() algorithm; values above it stay in higher precision
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(
    model_path,
    device_map="cuda:0",  # Place the whole model on GPU 0
    quantization_config=quantization_config,
    trust_remote_code=True
)
gpu_usage = GPUtil.getGPUs()[0].memoryUsed  # GPU memory (MB) used after loading the quantized model
start = time.time()
response = model.chat(
    image=Image.open(image_path).convert("RGB"),
    msgs=[
        {
            "role": "user",
            "content": "What is in this picture?"
        }
    ],
    tokenizer=tokenizer
)  # Model inference
print('Output after quantization:', response)
print('Time taken after quantization:', time.time() - start)
print(f"GPU memory usage after quantization: {round(gpu_usage / 1024, 2)}GB")
# Save the model and tokenizer
os.makedirs(save_path, exist_ok=True)
model.save_pretrained(save_path, safe_serialization=True)
tokenizer.save_pretrained(save_path)
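After saving, the int4 checkpoint can be reloaded directly from save_path. The sketch below is a minimal example, assuming a transformers / bitsandbytes version that supports 4-bit serialization; in that case the quantization settings are read back from the saved config, so no quantization_config needs to be passed again. It reuses the save_path and image_path variables defined above.
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer

# Load the previously saved int4 model and tokenizer
int4_model = AutoModel.from_pretrained(
    save_path,  # Directory written by save_pretrained above
    device_map="cuda:0",
    trust_remote_code=True
)
int4_tokenizer = AutoTokenizer.from_pretrained(save_path, trust_remote_code=True)

# Run the same test prompt against the reloaded model
response = int4_model.chat(
    image=Image.open(image_path).convert("RGB"),
    msgs=[{"role": "user", "content": "What is in this picture?"}],
    tokenizer=int4_tokenizer
)
print('Output from the reloaded int4 model:', response)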