#!/bin/bash
#
# Launch llama-cli on one of the bundled Qwen3-1.7B GGUF models.
#
# For W[N]-A8-KV8, Apple CPU-only Inference: --n-gpu-layers 0
#
# Usage: <script> [model_index] [prompt]
#   model_index  index into MODELS below (default: 0)
#   prompt       if given, run a single non-interactive inference;
#                otherwise start an interactive conversation
#
# Exit status: 1 if the model index is invalid or the model file is missing;
# otherwise the exit status of the CLI.

CLI=llama-cli
KV_CACHE_TYPE=q8_0

# Inference parameters for non-thinking mode
TEMPERATURE=0.6
MIN_P=0.00
REPEAT_PENALTY=1.0
PRESENCE_PENALTY=1.5
TOP_K=20
TOP_P=0.95

MODELS=(
  ./Qwen3-1.7B-EdgeRazor-TQ2_0.gguf
  ./Qwen3-1.7B-EdgeRazor-TQ1_0.gguf
  ./Qwen3-1.7B-EdgeRazor-Q4_0.gguf
  ./Qwen3-1.7B-BF16.gguf
)
MAX_INDEX=$(( ${#MODELS[@]} - 1 ))

# Show available model list
echo "Available models:"
for i in "${!MODELS[@]}"; do
  echo "  $i) ${MODELS[$i]}"
done

# Select model (default to the first one)
if [[ -z "${1:-}" ]]; then
  echo ""
  echo "Usage: $0 [model_index] [prompt]"
  echo "  model_index: 0-${MAX_INDEX} (default: 0)"
  echo "  prompt: optional prompt for non-interactive mode"
  echo ""
  MODEL_INDEX=0
else
  MODEL_INDEX=$1
fi

# Reject anything that is not a plain decimal index: a bare word used as an
# array subscript would be evaluated arithmetically and silently select
# model 0. The 10# prefix keeps inputs such as "08" from being parsed as octal.
if [[ ! "$MODEL_INDEX" =~ ^[0-9]+$ ]] || (( 10#$MODEL_INDEX > MAX_INDEX )); then
  echo "Error: Invalid model index: $MODEL_INDEX (expected 0-${MAX_INDEX})" >&2
  exit 1
fi
MODEL_INDEX=$(( 10#$MODEL_INDEX ))

MODEL="${MODELS[$MODEL_INDEX]}"

if [[ ! -f "$MODEL" ]]; then
  echo "Error: Model file not found: $MODEL" >&2
  exit 1
fi

echo "Selected model: $MODEL"
echo ""

# Flags shared by both run modes, built once so the two modes cannot drift.
COMMON_ARGS=(
  --model "$MODEL"
  --n-gpu-layers 0
  --cache-type-k "$KV_CACHE_TYPE"
  --cache-type-v "$KV_CACHE_TYPE"
  --temp "$TEMPERATURE"
  --min-p "$MIN_P"
  --repeat-penalty "$REPEAT_PENALTY"
  --presence-penalty "$PRESENCE_PENALTY"
  --top-k "$TOP_K"
  --top-p "$TOP_P"
  --flash-attn
)

# Run CLI
if [[ -z "${2:-}" ]]; then
  # Interactive mode
  MODE_ARGS=(--conversation --interactive-first)
else
  # Non-interactive mode (single inference)
  PROMPT="$2"
  MODE_ARGS=(--prompt "$PROMPT" --n-predict 512)
fi

"$CLI" "${COMMON_ARGS[@]}" "${MODE_ARGS[@]}" --color