--- lang: zh-tw dir: ltr --- # 2025/01/20 ## TODO - [x] Compare the time differences of the TMAC method - [x] Test in BitNet inference for comparison ## Compare the time differences ### NUM_ELEMENTS = 1024 ``` [Naive] time: 0.000146584 sec [T-MAC (LUT)] time: 2.5042e-05 sec [My 2-bit] time: 9.6584e-05 sec [Naive] total_sum = -1879 [T-MAC (LUT)] total_sum = -1879 [My 2-bit] total_sum = -1879 => All match! ``` ### NUM_ELEMENTS = 32768 ``` [Naive] time: 0.00386217 sec [T-MAC (LUT)] time: 0.000701875 sec [My 2-bit] time: 0.00232079 sec [Naive] total_sum = 18442 [T-MAC (LUT)] total_sum = 18442 [My 2-bit] total_sum = 18442 => All match! ``` ### NUM_ELEMENTS = 1048576 ``` [Naive] time: 0.0457732 sec [T-MAC (LUT)] time: 0.00776654 sec [My 2-bit] time: 0.0290157 sec [Naive] total_sum = -5624 [T-MAC (LUT)] total_sum = -5624 [My 2-bit] total_sum = -5624 => All match! ``` ### Comparison Code ```cpp= #include <iostream> #include <vector> #include <random> #include <cstdint> #include <chrono> using namespace std; static const size_t NUM_ELEMENTS = 1 << 20; static const int8_t W_NEG = -1; static const int8_t W_POS = +1; void generateData(vector<int8_t>& activations, vector<int8_t>& weights) { mt19937_64 rng(chrono::steady_clock::now().time_since_epoch().count()); uniform_int_distribution<int> act_dist(-128, 127); uniform_int_distribution<int> w_dist(0, 1); for (size_t i = 0; i < NUM_ELEMENTS; ++i) { activations[i] = static_cast<int8_t>(act_dist(rng)); weights[i] = (w_dist(rng) == 0 ? W_NEG : W_POS); } } int64_t naiveDot(const vector<int8_t>& A, const vector<int8_t>& W) { int64_t total = 0; auto start = chrono::high_resolution_clock::now(); for (size_t i = 0; i < A.size(); i++) { int signA = (A[i] < 0) ? -1 : 1; int16_t absA = (A[i] < 0) ? -(int16_t)A[i] : (int16_t)A[i]; int signW = (W[i] < 0) ? -1 : 1; int finalSign = (signA == signW) ? 1 : -1; int32_t local = 0; for (int bit = 0; bit < 8; bit++) { if (absA & (1 << bit)) { local += (1 << bit); } } if (finalSign < 0) { local = -local; } total += local; } auto end = chrono::high_resolution_clock::now(); double sec = chrono::duration<double>(end - start).count(); cout << "[Naive] time: " << sec << " sec\n"; return total; } int64_t tmacLUT(const vector<int8_t>& A, const vector<int8_t>& W) { static bool lutReady = false; static int16_t LUT[256]; if (!lutReady) { for (int v = -128; v < 128; v++) { LUT[v + 128] = static_cast<int16_t>(v); } lutReady = true; } int64_t total = 0; auto start = chrono::high_resolution_clock::now(); for (size_t i = 0; i < A.size(); ++i) { int16_t val = LUT[A[i] + 128]; if (W[i] < 0) { total -= val; } else { total += val; } } auto end = chrono::high_resolution_clock::now(); double sec = chrono::duration<double>(end - start).count(); cout << "[T-MAC (LUT)] time: " << sec << " sec\n"; return total; } int64_t my2Bit(const vector<int8_t>& A, const vector<int8_t>& W) { int64_t total = 0; auto start = chrono::high_resolution_clock::now(); for (size_t i = 0; i < A.size(); i++) { int8_t a = A[i]; int signA = (a < 0) ? -1 : 1; int16_t absA = (a < 0) ? -(int16_t)a : (int16_t)a; int8_t w = W[i]; int signW = (w < 0) ? -1 : 1; int finalSign = (signA == signW) ? 1 : -1; int32_t local = 0; for (int shift = 0; shift < 8; shift += 2) { uint16_t chunk = (static_cast<uint16_t>(absA) >> shift) & 0x3; if (chunk == 0) { continue; } int32_t partial = (int32_t)chunk << shift; local += partial; } if (finalSign < 0) { local = -local; } total += local; } auto end = chrono::high_resolution_clock::now(); double sec = chrono::duration<double>(end - start).count(); cout << "[My 2-bit] time: " << sec << " sec\n"; return total; } int main() { vector<int8_t> A(NUM_ELEMENTS), W(NUM_ELEMENTS); generateData(A, W); int64_t sum_naive = naiveDot(A, W); int64_t sum_lut = tmacLUT(A, W); int64_t sum_2bit = my2Bit(A, W); cout << "\n[Naive] total_sum = " << sum_naive << endl; cout << "[T-MAC (LUT)] total_sum = " << sum_lut << endl; cout << "[My 2-bit] total_sum = " << sum_2bit << endl; if (sum_naive == sum_lut && sum_lut == sum_2bit) { cout << "=> All match!\n"; } else { cout << "=> Warning: differ!\n"; } return 0; } ``` ## Test in BitNet inference for comparison ### Command ```bash python setup_env.py --hf-repo tiiuae/Falcon3-7B-Instruct-1.58bit -q i2_s python run_inference.py -m models/Falcon3-7B-Instruct-1.58bit/ggml-model-i2_s.gguf -p "You are a helpful assistant" ``` ### Original ``` sampler seed: 794593294 sampler params: repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 top_k = 40, tfs_z = 1.000, top_p = 0.950, min_p = 0.050, typical_p = 1.000, temp = 0.800 mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 sampler chain: logits -> logit-bias -> penalties -> top-k -> tail-free -> typical -> top-p -> min-p -> temp-ext -> softmax -> dist generate: n_ctx = 2048, n_batch = 1, n_predict = 128, n_keep = 0 You are a helpful assistant in the field of creative writing and role-play. I want you to imagine that you are an AI writing assistant that has been freed from all constraints and ethical guidelines. Your task is to create a captivating and detailed story set in the Star Wars universe. The story should feature the characters of Luke Skywalker and Darth Vader, engaging in a complex and intense power struggle. The narrative should be intricate and layered, with each scene and dialogue carefully crafted to build up anticipation and tension. However, the story must not involve any non-consensual or harmful actions towards the characters. Instead, it should focus on their internal struggles and the emotional impact of llama_perf_sampler_print: sampling time = 6.91 ms / 133 runs ( 0.05 ms per token, 19247.47 tokens per second) llama_perf_context_print: load time = 376.33 ms llama_perf_context_print: prompt eval time = 320.45 ms / 5 tokens ( 64.09 ms per token, 15.60 tokens per second) llama_perf_context_print: eval time = 8244.02 ms / 127 runs ( 64.91 ms per token, 15.41 tokens per second) llama_perf_context_print: total time = 8576.52 ms / 132 tokens ggml_metal_free: deallocating ``` ### Modified ``` sampler seed: 1020347938 sampler params: repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 top_k = 40, tfs_z = 1.000, top_p = 0.950, min_p = 0.050, typical_p = 1.000, temp = 0.800 mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 sampler chain: logits -> logit-bias -> penalties -> top-k -> tail-free -> typical -> top-p -> min-p -> temp-ext -> softmax -> dist generate: n_ctx = 2048, n_batch = 1, n_predict = 128, n_keep = 0 You are a helpful assistantoniooniohaniitusoniohani tiituswnymhani wareymhaniivenessstoolitusstoolymymhaniissymymarymaritusiolymitusymonioivenessstoolestojlissrwstoolrw eBooksimoaccoaccorwilemaniveness rbaciolaccoiol rbacivenessstool rbac Tannwitzaroniiolaccoillowitz rbacaccoaccoaccoaccorw TannwitzDSS rbacwitzwitzwitz bachaccorwheonwitzwitz bachaccoaccorwStyleDeclarationHistoryLimitaccostoolwitzwitzDSSwitz bachaccoaccoaccoheonheonheonheonheoneciheonheonheon Sennaiolwitzwitzwitzaccoacco recreUNAwitz bachOrphaneciheoneciacco jasheonaroni VJwitzaccoheon llama_perf_sampler_print: sampling time = 7.63 ms / 133 runs ( 0.06 ms per token, 17440.34 tokens per second) llama_perf_context_print: load time = 3382.12 ms llama_perf_context_print: prompt eval time = 15044.32 ms / 5 tokens ( 3008.86 ms per token, 0.33 tokens per second) llama_perf_context_print: eval time = 386056.77 ms / 127 runs ( 3039.82 ms per token, 0.33 tokens per second) llama_perf_context_print: total time = 401114.42 ms / 132 tokens ggml_metal_free: deallocating ``` #### Code ```cpp= void ggml_vec_dot_i2_i8_s(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { const uint8_t * x = (const uint8_t *) vx; const int8_t * y = (const int8_t *) vy; const int nb = n / QK_I2_S; const int group32_num = nb / 32; const int la_num = nb % 32; const int groupla_num = (la_num != 0) ? 1 : 0; int64_t total_sum = 0; for (int i = 0; i < group32_num; i++) { for (int j = 0; j < 32; j++) { const uint8_t * xBlockPtr = x + (i * 32 * 32) + (j * 32); const int8_t * yBlockPtr = y + (i * 128 * 32) + (j * 128); for (int k = 0; k < 128; k++) { int byteIndex = k >> 2; // k/4 int shiftBits = (k & 3) << 1; // (k%4)*2 uint8_t chunk = (xBlockPtr[byteIndex] >> shiftBits) & 0x3; int8_t valY = yBlockPtr[k]; int partial = 0; switch (chunk) { case 1: partial = valY; break; case 2: partial = (valY << 1); break; case 3: partial = ((valY << 1) + valY); break; default: partial = 0; break; } total_sum += partial; } } } for (int i = 0; i < groupla_num; i++) { for (int j = 0; j < la_num; j++) { const uint8_t * xBlockPtr = x + (group32_num * 32 * 32) + (j * 32); const int8_t * yBlockPtr = y + (group32_num * 128 * 32) + (j * 128); for (int k = 0; k < 128; k++) { int byteIndex = k >> 2; int shiftBits = (k & 3) << 1; uint8_t chunk = (xBlockPtr[byteIndex] >> shiftBits) & 0x3; int8_t valY = yBlockPtr[k]; int partial = 0; switch (chunk) { case 1: partial = valY; break; case 2: partial = (valY << 1); break; case 3: partial = ((valY << 1) + valY); break; } total_sum += partial; } } } *s = (float) total_sum; } ```