# npsol.nim
# Compile & run:
#   nim c -r -d:release -d:opencl -d:blas=openblas --mm:refc npsol.nim
import std/[math, strformat, sequtils, os, strutils, tables]
import arraymancer
import arraymancer/tensor/tensor_opencl
import arraymancer/tensor/backend/opencl_backend
import numericalnim
import opencl
import datamancer
# -----------------------------------------------------------------------
# 1. Structural Mapping (The Lavaan Parser Lite)
# -----------------------------------------------------------------------
type
  MatrixType = enum mPath, mVar

  ParameterMapping = object
    kind: MatrixType
    row, col: int
    paramIdx: int
    name: string  # used for pretty-printing results

  FixedLoading = object
    row, col: int
    value: float32
# ---------------------------
# CPU linear algebra helpers (float64)
# ---------------------------
proc eye64*(n: int): Tensor[float64] =
  result = zeros[float64](n, n)
  for i in 0 ..< n:
    result[i, i] = 1.0

proc transpose64*(A: Tensor[float64]): Tensor[float64] =
  let m = A.shape[0]
  let n = A.shape[1]
  result = zeros[float64](n, m)
  for i in 0 ..< m:
    for j in 0 ..< n:
      result[j, i] = A[i, j]

proc matmul64*(A, B: Tensor[float64]): Tensor[float64] =
  let m = A.shape[0]
  let n = A.shape[1]
  let p = B.shape[1]
  doAssert n == B.shape[0], "matmul64: dimension mismatch"
  result = zeros[float64](m, p)
  for i in 0 ..< m:
    for j in 0 ..< p:
      var acc = 0.0
      for k in 0 ..< n:
        acc += A[i, k] * B[k, j]
      result[i, j] = acc
proc choleskySPD64*(S: Tensor[float64]; jitter: float64 = 1e-12): Tensor[float64] =
  ## Returns lower-triangular L such that S ≈ L * L^T.
  let n = S.shape[0]
  doAssert n == S.shape[1], "choleskySPD64: S must be square"
  result = S.clone()  # deep copy (Arraymancer uses clone(), not copy())
  for i in 0 ..< n:
    for j in 0 .. i:
      var sum = result[i, j]
      for k in 0 ..< j:
        sum -= result[i, k] * result[j, k]
      if i == j:
        if sum <= 0.0: sum = jitter  # floor non-positive pivots (mirrors the GPU kernel)
        result[i, j] = sqrt(sum)
      else:
        result[i, j] = sum / result[j, j]
    # zero upper triangle
    for j in i+1 ..< n:
      result[i, j] = 0.0
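
# For reference, the recurrences implemented above (standard Cholesky-Crout):
#   L[j,j] = sqrt(S[j,j] - sum_{k<j} L[j,k]^2)
#   L[i,j] = (S[i,j] - sum_{k<j} L[i,k]*L[j,k]) / L[j,j]   for i > j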
proc logdetSPD64*(S: Tensor[float64]): float64 =
  let L = choleskySPD64(S)
  var s = 0.0
  for i in 0 ..< L.shape[0]:
    let d = L[i, i]
    if d <= 0.0:
      raise newException(ValueError, "logdetSPD64: non-positive Cholesky diag")
    s += ln(d)
  result = 2.0 * s

proc invSPD_from_chol64*(S: Tensor[float64]): Tensor[float64] =
  ## Invert SPD S using its Cholesky (two triangular solves).
  let L = choleskySPD64(S)
  let n = L.shape[0]
  var Y = zeros[float64](n, n)
  let I = eye64(n)
  # Forward solve: L * Y = I
  for j in 0 ..< n:
    for i in 0 ..< n:
      var s = I[i, j]
      for k in 0 ..< i:
        s -= L[i, k] * Y[k, j]
      Y[i, j] = s / L[i, i]
  # Backward solve: L^T * X = Y
  result = zeros[float64](n, n)
  for j in 0 ..< n:
    for i in countdown(n-1, 0):
      var s = Y[i, j]
      for k in i+1 ..< n:
        s -= L[k, i] * result[k, j]
      result[i, j] = s / L[i, i]

proc traceProduct64*(A, B: Tensor[float64]): float64 =
  doAssert A.shape[0] == B.shape[0] and A.shape[1] == B.shape[1], "traceProduct64: shape mismatch"
  let n = A.shape[0]
  var tr = 0.0
  for i in 0 ..< n:
    var acc = 0.0
    for j in 0 ..< n:
      acc += A[i, j] * B[j, i]
    tr += acc
  result = tr
proc invGeneral64*(A: Tensor[float64]; jitter: float64 = 0.0): Tensor[float64] =
  ## Gauss-Jordan inverse with partial pivoting (for small n).
  let n = A.shape[0]
  doAssert n == A.shape[1], "invGeneral64: A must be square"
  var M = A.clone()  # deep copy (Arraymancer uses clone(), not copy())
  if jitter != 0.0:
    for i in 0 ..< n:
      M[i, i] += jitter
  result = eye64(n)
  for col in 0 ..< n:
    # pivot
    var piv = col
    var maxAbs = abs(M[col, col])
    for r in col+1 ..< n:
      let v = abs(M[r, col])
      if v > maxAbs:
        maxAbs = v
        piv = r
    if maxAbs == 0.0:
      raise newException(ValueError, "invGeneral64: singular matrix")
    # manual row swap (avoid swap() on tensors)
    if piv != col:
      for j in 0 ..< n:
        let tmpM = M[col, j]
        M[col, j] = M[piv, j]
        M[piv, j] = tmpM
        let tmpR = result[col, j]
        result[col, j] = result[piv, j]
        result[piv, j] = tmpR
    let pivotVal = M[col, col]
    for j in 0 ..< n:
      M[col, j] /= pivotVal
      result[col, j] /= pivotVal
    # eliminate other rows
    for r in 0 ..< n:
      if r == col: continue
      let factor = M[r, col]
      if factor != 0.0:
        for j in 0 ..< n:
          M[r, j] -= factor * M[col, j]
          result[r, j] -= factor * result[col, j]
proc parseFixedCoeff(token: string): (bool, float, string) =
  ## Detects a numeric fixed coefficient like "1*x1" or "0.7 * x2".
  ## Returns (isFixed, coeff, varName).
  ## Tokens that begin with "start(" are purposely ignored (those are not fixed).
  var t = token.strip()
  if t.len >= 6 and t.startsWith("start("):
    return (false, NaN, t)  # handled elsewhere
  let star = t.find('*')
  if star >= 0:
    let left = t[0 .. star-1].strip()
    let right = t[star+1 .. ^1].strip()
    # Try to parse the left side as a float coefficient
    var coeff: float
    try:
      coeff = left.parseFloat
      return (true, coeff, right)
    except ValueError:
      discard
  return (false, NaN, t)

proc extractStart(token: string): (float, string) =
  ## Parses tokens like:
  ##   "start(0.3)*x1", "start(.3) * x1", "x1"
  ## Returns (startValue, variableName); startValue = NaN if none provided.
  var t = token.strip()
  if t.len >= 6 and t.startsWith("start("):
    let closeParen = t.find(")")
    if closeParen > 6:
      let valStr = t[6 .. closeParen-1]  # inside 'start(...)'
      let after = (if closeParen+1 < t.len: t[closeParen+1 .. ^1] else: "").strip()
      var varName = after
      if after.len > 0 and after[0] == '*':
        varName = after[1 .. ^1].strip()
      return (valStr.parseFloat, varName)
  # No start() found; the token should just be the variable name
  return (NaN, t)
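
# Examples of the parsing behaviour above:
#   parseFixedCoeff("1*x1")        -> (true, 1.0, "x1")
#   extractStart("start(0.3)*x1")  -> (0.3, "x1")
#   extractStart("x1")             -> (NaN, "x1")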
proc getSmartStarts(
    mappings: seq[ParameterMapping],
    Sobs: Tensor[float32],
    vars: seq[string],
    userStarts: Table[int, float]
): Tensor[float] =
  result = newTensor[float](mappings.len)
  for i, m in mappings:
    if userStarts.hasKey(m.paramIdx):
      let s = userStarts[m.paramIdx]
      if m.kind == mVar:
        # Square-root mapping: u = sqrt(variance_start)
        result[i] = sqrt(max(1e-12, s))
      else:
        result[i] = s
    elif m.kind == mPath:
      result[i] = 0.5
    else:
      # Default variance start: half the sample variance, mapped to u = sqrt(...)
      let varIdx = m.row - 1
      if varIdx >= 0 and varIdx < vars.len:
        let rawVar = max(1e-12'f32, Sobs[varIdx, varIdx] * 0.5'f32)
        result[i] = sqrt(rawVar.float)
      else:
        # Latent-variance fallback when the user didn't specify a start
        result[i] = 1.0
proc parseLavaan(
    syntax: string,
    vars: seq[string]
): (seq[ParameterMapping], Table[int, float], seq[FixedLoading]) =
  var mappings: seq[ParameterMapping]
  var userStarts = initTable[int, float]()
  var fixedLoads: seq[FixedLoading]  # fixed loadings such as 1*x1
  var pIdx = 0
  # 1. Lookup table: latent "eta" at index 0, observed variables at 1..k
  var nameToIdx = initTable[string, int]()
  nameToIdx["eta"] = 0
  for i, v in vars:
    nameToIdx[v] = i + 1
  for line in syntax.splitLines():
    let clean = line.strip()
    if clean.len == 0 or clean.startsWith("#"): continue
    # 2. Loadings: eta =~ ...
    if "=~" in clean:
      let parts = clean.split("=~")
      if parts.len != 2: continue
      let latent = parts[0].strip()
      let rhs = parts[1].split("+").mapIt(it.strip())
      for tk in rhs:
        # First: numeric fixed-coefficient form: "1*x1", "0.7*x2", etc.
        let (isFixed, coeff, obsCandidate) = parseFixedCoeff(tk)
        if isFixed:
          if nameToIdx.hasKey(obsCandidate) and nameToIdx.hasKey(latent):
            fixedLoads.add FixedLoading(
              row: nameToIdx[obsCandidate],
              col: nameToIdx[latent],
              value: coeff.float32
            )
          continue  # do NOT create a free parameter in this case
        # Otherwise, check start(value)*xj or bare xj
        let (startVal, obsName) = extractStart(tk)
        if nameToIdx.hasKey(obsName):
          mappings.add ParameterMapping(
            kind: mPath,
            row: nameToIdx[obsName],
            col: nameToIdx[latent],
            paramIdx: pIdx,
            name: latent & " -> " & obsName
          )
          if not startVal.isNaN:
            userStarts[pIdx] = startVal
          inc pIdx
    # 3. Variances: x1 ~~ x1 (allow start() on either side)
    elif "~~" in clean:
      let parts = clean.split("~~")
      if parts.len != 2: continue
      let lhsRaw = parts[0].strip()
      let rhsRaw = parts[1].strip()
      # Extract (start, varName) from both sides
      let (startL, nameL) = extractStart(lhsRaw)
      let (startR, nameR) = extractStart(rhsRaw)
      # Use whichever side gives a known variable name
      var vName = nameL
      if not nameToIdx.hasKey(vName) and nameToIdx.hasKey(nameR):
        vName = nameR
      if not nameToIdx.hasKey(vName): continue  # unknown variable
      # (Var(eta) is not skipped; the syntax controls it.)
      # Prefer the LHS start, else the RHS start
      var startVal = startL
      if startVal.isNaN and not startR.isNaN:
        startVal = startR
      # Warn if both are present and disagree
      if (not startL.isNaN) and (not startR.isNaN) and startL != startR:
        echo "WARNING: Conflicting starts for ", vName, " variance; using LHS = ", startL
      let idx = nameToIdx[vName]
      mappings.add ParameterMapping(
        kind: mVar,
        row: idx, col: idx,
        paramIdx: pIdx,
        name: "Var(" & vName & ")"
      )
      if not startVal.isNaN:
        userStarts[pIdx] = startVal
      inc pIdx
  (mappings, userStarts, fixedLoads)
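
# Example: with vars = @["x1", "x2"], the line "eta =~ 1*x1 + x2" yields one
# FixedLoading (row 1, col 0, value 1.0) and one free mPath mapping for
# eta -> x2 (paramIdx 0); each "~~" line adds one mVar mapping.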
# Initialize the OpenCL backend early so that clQueue0 is valid
discard zeros[float32](1, 1).opencl

# -----------------------------------------------------------------------
# 2. OpenCL Context & Kernels
# -----------------------------------------------------------------------
var raw_cl_context: Pcontext
discard getCommandQueueInfo(clQueue0, QUEUE_CONTEXT, sizeof(raw_cl_context), addr raw_cl_context, nil)
var deviceID: Pdevice_id
discard getCommandQueueInfo(clQueue0, QUEUE_DEVICE, sizeof(deviceID), addr deviceID, nil)
let custom_kernels = """
__kernel void matmul_generic(__global const float* A, __global const float* B, __global float* C, int M, int N, int P) {
  int i = get_global_id(0); int j = get_global_id(1);
  if (i >= M || j >= P) return;
  float acc = 0.0f;
  for (int k = 0; k < N; ++k) acc += A[i*N + k] * B[k*P + j];
  C[i*P + j] = acc;
}

__kernel void transpose_generic(__global const float* A, __global float* C, int M, int N) {
  int i = get_global_id(0); int j = get_global_id(1);
  if (i < M && j < N) C[j*M + i] = A[i*N + j];
}

__kernel void add_eye_scaled(__global float* A, int N, float alpha) {
  int i = get_global_id(0);
  if (i < N) A[i*N + i] += alpha;
}

__kernel void invert_generic(__global float* A, __global float* Inv, int n) {
  int row = get_global_id(0);
  if (row >= n) return;
  for (int j = 0; j < n; j++) Inv[row*n + j] = (row == j) ? 1.0f : 0.0f;
  for (int k = 0; k < n; k++) {
    float pivot = A[k*n + k];
    if (pivot == 0.0f) pivot = 1e-8f;
    if (row == k) {
      for (int j = 0; j < n; j++) { A[k*n + j] /= pivot; Inv[k*n + j] /= pivot; }
    }
    barrier(CLK_GLOBAL_MEM_FENCE);
    if (row != k) {
      float factor = A[row*n + k];
      for (int j = 0; j < n; j++) { A[row*n + j] -= factor * A[k*n + j]; Inv[row*n + j] -= factor * Inv[k*n + j]; }
    }
    barrier(CLK_GLOBAL_MEM_FENCE);
  }
}

__kernel void cholesky_spd_single(__global const float* A, __global float* L, int N) {
  if (get_global_id(0) != 0) return;
  for (int i = 0; i < N; ++i) for (int j = 0; j < N; ++j) L[i*N + j] = A[i*N + j];
  for (int k = 0; k < N; ++k) {
    float sum = 0.0f;
    for (int s = 0; s < k; ++s) sum += L[k*N + s] * L[k*N + s];
    float diag = L[k*N + k] - sum;
    if (diag <= 0.0f) diag = 1e-8f;
    L[k*N + k] = sqrt(diag);
    for (int i = k+1; i < N; ++i) {
      float s2 = 0.0f;
      for (int s = 0; s < k; ++s) s2 += L[i*N + s] * L[k*N + s];
      L[i*N + k] = (L[i*N + k] - s2) / L[k*N + k];
    }
    for (int j = k+1; j < N; ++j) L[k*N + j] = 0.0f;
  }
}

__kernel void solve_lower_many_single(__global const float* L, __global const float* B, __global float* Y, int N, int M) {
  if (get_global_id(0) != 0) return;
  for (int j = 0; j < M; ++j) {
    for (int i = 0; i < N; ++i) {
      float sum = 0.0f;
      for (int k = 0; k < i; ++k) sum += L[i*N + k] * Y[k*M + j];
      Y[i*M + j] = (B[i*M + j] - sum) / L[i*N + i];
    }
  }
}

__kernel void solve_upper_many_single(__global const float* U, __global const float* Y, __global float* X, int N, int M) {
  if (get_global_id(0) != 0) return;
  for (int j = 0; j < M; ++j) {
    for (int i = N-1; i >= 0; --i) {
      float sum = 0.0f;
      for (int k = i+1; k < N; ++k) sum += U[i*N + k] * X[k*M + j];
      X[i*M + j] = (Y[i*M + j] - sum) / U[i*N + i];
    }
  }
}

__kernel void logdet_spd_from_chol_single(__global const float* L, int N, __global float* sumLogDiag) {
  if (get_global_id(0) != 0) return;
  float s = 0.0f;
  for (int i = 0; i < N; ++i) { float d = L[i*N + i]; if (d <= 0.0f) d = 1e-8f; s += log(d); }
  sumLogDiag[0] = 2.0f * s;
}

__kernel void trace_product_single(__global const float* S, __global const float* X, int N, __global float* out) {
  if (get_global_id(0) != 0) return;
  float tr = 0.0f;
  for (int i = 0; i < N; ++i) {
    float acc = 0.0f;
    for (int j = 0; j < N; ++j) acc += S[i*N + j] * X[j*N + i];
    tr += acc;
  }
  out[0] = tr;
}
"""
var src = cstring(custom_kernels)
var arr = [src]
let sources = cast[cstringArray](unsafeAddr arr[0])
var err_prog: TClResult
let program = createProgramWithSource(raw_cl_context, 1'u32, sources, nil, addr err_prog)
echo "Compiling OpenCL Kernels..."
let buildStatus = buildProgram(program, 0, nil, nil, nil, nil)
if buildStatus != SUCCESS: quit("Kernel build failed.")

var kMatmul, kTranspose, kAddEye, kInvert, kChol, kSolveL, kSolveU, kLogdetChol, kTrace: Pkernel
var e1: TClResult
kMatmul = createKernel(program, "matmul_generic", addr e1)
kTranspose = createKernel(program, "transpose_generic", addr e1)
kAddEye = createKernel(program, "add_eye_scaled", addr e1)
kInvert = createKernel(program, "invert_generic", addr e1)
kChol = createKernel(program, "cholesky_spd_single", addr e1)
kSolveL = createKernel(program, "solve_lower_many_single", addr e1)
kSolveU = createKernel(program, "solve_upper_many_single", addr e1)
kLogdetChol = createKernel(program, "logdet_spd_from_chol_single", addr e1)
kTrace = createKernel(program, "trace_product_single", addr e1)
# -----------------------------------------------------------------------
# 3. GPU Helpers
# -----------------------------------------------------------------------
proc setArg[T](k: Pkernel, idx: int, v: T) =
  var tmp = v
  discard setKernelArg(k, idx.uint32, sizeof(T).int, addr tmp)

proc clTranspose(A: ClTensor[float32]): ClTensor[float32] =
  let M = A.shape[0].int; let N = A.shape[1].int
  result = newTensor[float32](N, M).opencl
  setArg(kTranspose, 0, A.toClpointer()); setArg(kTranspose, 1, result.toClpointer())
  setArg(kTranspose, 2, M.int32); setArg(kTranspose, 3, N.int32)
  var gs = [csize(M), csize(N)]
  discard enqueueNDRangeKernel(clQueue0, kTranspose, 2, nil, addr gs[0], nil, 0, nil, nil)
  discard flush(clQueue0)

proc clMatmul(A, B: ClTensor[float32]): ClTensor[float32] =
  let M = A.shape[0].int; let N = A.shape[1].int; let P = B.shape[1].int
  result = newTensor[float32](M, P).opencl
  setArg(kMatmul, 0, A.toClpointer()); setArg(kMatmul, 1, B.toClpointer()); setArg(kMatmul, 2, result.toClpointer())
  setArg(kMatmul, 3, M.int32); setArg(kMatmul, 4, N.int32); setArg(kMatmul, 5, P.int32)
  var gs = [csize(M), csize(P)]
  discard enqueueNDRangeKernel(clQueue0, kMatmul, 2, nil, addr gs[0], nil, 0, nil, nil)
  discard flush(clQueue0)

proc clAddEyeScaled(A: var ClTensor[float32], alpha: float32) =
  let N = A.shape[0].int
  setArg(kAddEye, 0, A.toClpointer()); setArg(kAddEye, 1, N.int32); setArg(kAddEye, 2, alpha)
  var gs = [csize(N)]
  discard enqueueNDRangeKernel(clQueue0, kAddEye, 1, nil, addr gs[0], nil, 0, nil, nil)
  discard flush(clQueue0)

proc clInvert(A: ClTensor[float32]): ClTensor[float32] =
  let n = A.shape[0].int
  var Awork = A.clone()
  result = newTensor[float32](n, n).opencl
  setArg(kInvert, 0, Awork.toClpointer()); setArg(kInvert, 1, result.toClpointer()); setArg(kInvert, 2, n.int32)
  var gs = [csize(n)]; var ls = [csize(n)]
  discard enqueueNDRangeKernel(clQueue0, kInvert, 1, nil, addr gs[0], addr ls[0], 0, nil, nil)
  discard flush(clQueue0)
proc clCholesky(A: ClTensor[float32]): ClTensor[float32] =
  let n = A.shape[0].int
  result = newTensor[float32](n, n).opencl
  setArg(kChol, 0, A.toClpointer()); setArg(kChol, 1, result.toClpointer()); setArg(kChol, 2, n.int32)
  var gs = [csize(1)]; var ls = [csize(1)]
  discard enqueueNDRangeKernel(clQueue0, kChol, 1, nil, addr gs[0], addr ls[0], 0, nil, nil)
  discard flush(clQueue0)

proc clSPDInverse(A: ClTensor[float32]): ClTensor[float32] =
  let n = A.shape[0].int
  let L = clCholesky(A)
  var I = eye[float32](n, n).opencl
  var Ycl = newTensor[float32](n, n).opencl
  setArg(kSolveL, 0, L.toClpointer()); setArg(kSolveL, 1, I.toClpointer()); setArg(kSolveL, 2, Ycl.toClpointer())
  setArg(kSolveL, 3, n.int32); setArg(kSolveL, 4, n.int32)
  var gs = [csize(1)]; discard enqueueNDRangeKernel(clQueue0, kSolveL, 1, nil, addr gs[0], nil, 0, nil, nil)
  let Ucl = clTranspose(L)
  result = newTensor[float32](n, n).opencl
  setArg(kSolveU, 0, Ucl.toClpointer()); setArg(kSolveU, 1, Ycl.toClpointer()); setArg(kSolveU, 2, result.toClpointer())
  setArg(kSolveU, 3, n.int32); setArg(kSolveU, 4, n.int32)
  discard enqueueNDRangeKernel(clQueue0, kSolveU, 1, nil, addr gs[0], nil, 0, nil, nil)
  discard flush(clQueue0)

proc clLogDetSPD(A: ClTensor[float32]): float =
  let n = A.shape[0].int
  let L = clCholesky(A)
  var buf = newTensor[float32](1, 1).opencl
  setArg(kLogdetChol, 0, L.toClpointer()); setArg(kLogdetChol, 1, n.int32); setArg(kLogdetChol, 2, buf.toClpointer())
  var gs = [csize(1)]; discard enqueueNDRangeKernel(clQueue0, kLogdetChol, 1, nil, addr gs[0], nil, 0, nil, nil)
  discard flush(clQueue0)
  result = buf.cpu()[0, 0].float

proc clTraceProduct(S, X: ClTensor[float32]): float =
  let n = S.shape[0].int
  var outBuf = newTensor[float32](1, 1).opencl
  setArg(kTrace, 0, S.toClpointer()); setArg(kTrace, 1, X.toClpointer()); setArg(kTrace, 2, n.int32); setArg(kTrace, 3, outBuf.toClpointer())
  var gs = [csize(1)]; discard enqueueNDRangeKernel(clQueue0, kTrace, 1, nil, addr gs[0], nil, 0, nil, nil)
  discard flush(clQueue0)
  result = outBuf.cpu()[0, 0].float
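
# clSPDInverse computes X = inv(A) from the factorization A = L * L^T via two
# triangular solves, never forming inv(L) explicitly:
#   forward:  L   * Y = I
#   backward: L^T * X = Y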
proc SigmaHatFromThetaCPU*(
    theta: Tensor[float],  # float64 parameters
    k: int,
    mappings: seq[ParameterMapping],
    fixedLoads: seq[FixedLoading]
): Tensor[float64] =
  let p = k + 1  # 1 latent + k observed
  var A = zeros[float64](p, p)
  var S = zeros[float64](p, p)
  # 1) Fixed loadings (e.g., 1*x1)
  for fl in fixedLoads:
    A[fl.row, fl.col] = fl.value.float64
  # 2) Free parameters
  for m in mappings:
    let u = theta[m.paramIdx]
    case m.kind
    of mPath:
      A[m.row, m.col] = u  # unbounded loading
    of mVar:
      S[m.row, m.col] = max(1e-12, u*u)
  # 3) Σ = F * (I-A)^{-1} * S * (I-A)^{-T} * F^T
  let I = eye64(p)
  let IA = I - A
  let IAinv = invGeneral64(IA, jitter = 1e-12)  # small numerical jitter
  let IAinvT = transpose64(IAinv)
  var F = zeros[float64](k, p)
  for i in 0 ..< k:
    F[i, i+1] = 1.0
  let tmp = matmul64(matmul64(matmul64(F, IAinv), S), IAinvT)
  result = matmul64(tmp, transpose64(F))
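
# RAM-notation sketch for this one-factor model (eta at index 0, x1..xk at
# indices 1..k): A holds the directed paths (loadings into column 0), S the
# variances, and F selects the observed rows, giving
#   Sigma(theta) = F (I - A)^{-1} S (I - A)^{-T} F^T.
# Since only column 0 of A is nonzero here, A*A = 0 and (I - A)^{-1} = I + A,
# so Sigma reduces to the familiar lambda * phi * lambda^T + Theta_eps.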
# -----------------------------------------------------------------------
# 4. Model & ML Fit
# -----------------------------------------------------------------------
proc sigmaOf(
    theta: Tensor[float],  # float64 on CPU
    k: int,
    mappings: seq[ParameterMapping],
    fixedLoads: seq[FixedLoading]
): ClTensor[float32] =
  # p = latent (eta) + k observed
  let p = k + 1
  var A_cpu = zeros[float32](p, p)
  var S_cpu = zeros[float32](p, p)
  # 1) Fixed loadings (e.g., 1*x1)
  for fl in fixedLoads:
    A_cpu[fl.row, fl.col] = fl.value
  # 2) Free parameters (loadings unbounded; variances parameterized as u^2)
  for m in mappings:
    let u32 = theta[m.paramIdx].float32
    case m.kind
    of mPath:
      A_cpu[m.row, m.col] = u32  # unbounded loading
    of mVar:
      S_cpu[m.row, m.col] = max(1e-10'f32, u32*u32)  # variance ≥ 0
  # 3) Selection matrix F for the observed variables (indices 1..k)
  var F_cpu = zeros[float32](k, p)
  for i in 0 ..< k: F_cpu[i, i+1] = 1.0'f32
  # 4) Sigma = F * (I-A)^{-1} * S * (I-A)^{-T} * F^T
  let I = eye[float32](p, p)
  let IAinv = clInvert((I - A_cpu).opencl)  # unit lower-triangular -> invertible
  let Scl = S_cpu.opencl
  let Fcl = F_cpu.opencl
  let tmp = clMatmul(clMatmul(clMatmul(Fcl, IAinv), Scl), clTranspose(IAinv))
  result = clMatmul(tmp, clTranspose(Fcl))
proc mlFitGpu(
    theta: Tensor[float],
    SobsCl: ClTensor[float32],
    mappings: seq[ParameterMapping],
    fixedLoads: seq[FixedLoading],
    nMinus1: int  # pass (rows - 1) from main for lavaan-style chi^2
): float =
  let k = SobsCl.shape[0]
  # Build the model-implied covariance on the GPU
  let Sigma = sigmaOf(theta, k, mappings, fixedLoads)
  # Numerical stability ONLY on Σ(θ): tiny jitter to keep it SPD for Cholesky/inverse
  var SigmaAdj = Sigma.clone()
  clAddEyeScaled(SigmaAdj, 1e-7'f32)
  # Normal-theory ML objective without constants (fine for optimization):
  #   F_ml = log|Σ| + tr(S Σ^{-1}) - p
  # The -log|S| term is omitted because it is constant w.r.t. θ
  # (it is added back for reporting, on the CPU).
  let logDetSig = clLogDetSPD(SigmaAdj)
  let invSig = clSPDInverse(SigmaAdj)
  let trTerm = clTraceProduct(SobsCl, invSig)
  let F = logDetSig + trTerm - k.float
  # lavaan-style chi-square scaling: χ² = (n-1) * F_ml
  let chi2 = nMinus1.float * F
  # IMPORTANT: this lbfgs appears to maximize (judging by its traces), so
  # return -chi2 to force minimization. If your lbfgs minimizes by default,
  # change this to: result = chi2
  result = -chi2
  # For deviance-like scaling matching -2LL conventions, gate that with a
  # flag here (but keep the sign consistent with your optimizer).
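
# The full discrepancy is restored at reporting time (see mlChiSquareFull
# below):
#   F_ml = log|Sigma(theta)| + tr(S Sigma(theta)^{-1}) - log|S| - p
#   chi^2 = (n - 1) * F_ml
# mlFitGpu optimizes the same function up to the constant -log|S| term.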
# ---------------------------
# CPU linear algebra helpers (reporting path)
# ---------------------------
proc logdetSPD_chol*(S: Tensor[float64]): float64 =
  ## Computes log|S| for SPD S via Cholesky: |S| = (∏ diag(L))^2
  ## Uses the local choleskySPD64() helper (not Arraymancer's cholesky).
  let L = choleskySPD64(S)  # lower-triangular; S ≈ L * L^T
  var sumLog = 0.0
  for i in 0 ..< L.shape[0]:
    let d = L[i, i]
    if d <= 0.0:
      raise newException(ValueError, "logdetSPD_chol: non-positive Cholesky diagonal; S may not be SPD")
    sumLog += ln(d)
  result = 2.0 * sumLog

proc invSPD_from_chol*(S: Tensor[float64]): Tensor[float64] =
  ## Invert SPD S using its Cholesky factor without forming inv(L) explicitly.
  ## Fine for small k; for larger k, prefer a dedicated solver.
  let L = choleskySPD64(S)
  let n = L.shape[0]
  # Solve L * Y = I -> forward substitution
  var I = eye[float64](n, n)
  var Y = zeros[float64](n, n)
  for j in 0 ..< n:
    # column j of Y
    for i in 0 ..< n:
      var s = I[i, j]
      for k in 0 ..< i:
        s -= L[i, k] * Y[k, j]
      Y[i, j] = s / L[i, i]
  # Solve L.T * X = Y -> back substitution
  result = zeros[float64](n, n)
  for j in 0 ..< n:
    for i in countdown(n-1, 0):
      var s = Y[i, j]
      for k in (i+1) ..< n:
        s -= L[k, i] * result[k, j]
      result[i, j] = s / L[i, i]

proc traceProduct*(A, B: Tensor[float64]): float64 =
  ## tr(A B) without forming the full product explicitly.
  ## For small k, the naive double loop is fine.
  let n = A.shape[0]
  var tr = 0.0
  for i in 0 ..< n:
    var acc = 0.0
    for j in 0 ..< n:
      acc += A[i, j] * B[j, i]
    tr += acc
  result = tr
# After optimization (on CPU, double precision)
proc mlChiSquareFull(
    SigmaHat: Tensor[float64],
    SobsCpu: Tensor[float64],
    nMinus1: int
): float =
  let k = SobsCpu.shape[0]
  # log|Σ|
  let logDetSig = logdetSPD_chol(SigmaHat)
  # inv(Σ) via Cholesky
  let invSig = invSPD_from_chol(SigmaHat)
  # tr(S inv(Σ))
  let trTerm = traceProduct(SobsCpu, invSig)
  # log|S|
  let logDetS = logdetSPD_chol(SobsCpu)
  let Ffull = logDetSig + trTerm - logDetS - k.float64
  result = (nMinus1.float64 * Ffull).float

proc fmlCpu(theta: Tensor[float], k: int,
            mappings: seq[ParameterMapping], fixedLoads: seq[FixedLoading],
            SobsCpu: Tensor[float32], nMinus1: int): float =
  # Build Sigma on the CPU in double precision
  let SigmaHatCpu = SigmaHatFromThetaCPU(theta, k, mappings, fixedLoads)
  # Convert S to float64 for CPU math
  let S64 = SobsCpu.astype(float64)
  # Full ML chi-square: (n-1) * Fml
  result = mlChiSquareFull(SigmaHatCpu, S64, nMinus1).float

proc l2norm*(x: Tensor[float]): float =
  ## Euclidean norm for a 1-D Tensor[float] (works for any shape via linear indexing).
  var s = 0.0
  for i in 0 ..< x.size:
    s += x[i] * x[i]
  result = sqrt(s)
# -----------------------------------------------------------------------
# 5. Main (Scalable Data Loading)
# -----------------------------------------------------------------------
when isMainModule:
  # (std/os, sequtils, strformat, math, datamancer, and arraymancer are
  # already imported at the top of the file.)
  # Ensure the OpenCL backend is initialized early
  discard zeros[float32](1, 1).opencl
  let csvPath = "data.csv"
  if not fileExists(csvPath):
    quit("data.csv not found.")
  # -------------------------
  # A. LOAD & DIAGNOSE
  # -------------------------
  let df = datamancer.readCsv(csvPath)
  # Variable order defines model order
  let colNames = toSeq(df.keys)
  let rows = df.len
  let k = colNames.len
  echo &"Detected {k} variables: {colNames}"
  echo &"Number of observations (rows): {rows}"
  # Build the data tensor [rows x k] as float32 for GPU efficiency
  var dataTensor = newTensor[float32](rows, k)
  for j, name in colNames:
    let colData = df[name].toTensor(float32)  # assumes all columns are numeric
    for i in 0 ..< rows:
      dataTensor[i, j] = colData[i]
  # Sample covariance (unbiased; divisor n-1)
  let centered = dataTensor -. dataTensor.mean(axis = 0)  # [rows x k]
  let SobsCpu = (centered.transpose * centered) /. (rows.float32 - 1.0'f32)  # [k x k]
  echo "\nObserved Covariance Matrix (Sobs):"
  echo SobsCpu
  # Diagnostic: standardized correlations (not used in fitting)
  let diagS = SobsCpu.diagonal()
  var sds = newTensor[float32](k)
  for i in 0 ..< k:
    sds[i] = sqrt(max(1e-10'f32, diagS[i]))
  var Sscaled = SobsCpu.clone()
  for i in 0 ..< k:
    for j in 0 ..< k:
      Sscaled[i, j] = SobsCpu[i, j] / (sds[i] * sds[j])
  echo "\nCorrelation Matrix (standardized Sobs):"
  echo Sscaled
  # Fit ML on the covariance:
  let SobsCl = SobsCpu.opencl
  let nMinus1 = max(1, rows - 1)
  # -------------------------
  # B. DEFINE MODEL (marker loading; latent variance free)
  # -------------------------
  let modelSyntax = """
    eta =~ 1*x1 + start(0.8)*x2 + start(0.7)*x3
    eta ~~ start(1.0)*eta
    x1 ~~ start(0.10)*x1
    x2 ~~ start(0.08)*x2
    x3 ~~ start(0.06)*x3
  """
  let (myMappings, userStarts, fixedLoads) = parseLavaan(modelSyntax, colNames)
  # -------------------------
  # C. Starting values (seeded from the sample covariance)
  # -------------------------
  # getSmartStarts already returns Tensor[float] (float64), which is what the
  # optimizer expects, so no dtype conversion is needed here.
  let theta0 = getSmartStarts(myMappings, SobsCpu, colNames, userStarts)
  echo "\nChosen starting values:"
  for i, m in myMappings:
    echo fmt"{i:>2} | {m.name:<15} (Idx {m.paramIdx}): {theta0[i]:>10.6f}"
  # Debug: Fml at the starts (chi-square scale)
  echo "DEBUG: Fml at STARTS (chi-square scale) = ",
    fmlCpu(theta0, k, myMappings, fixedLoads, SobsCpu, nMinus1)
  echo &"\nStarting optimization with {myMappings.len} free parameters..."
  # -------------------------
  # D. Optimize (ML on the covariance)
  # -------------------------
  let thetaHat = lbfgs(
    proc (t: Tensor[float]): float {.closure.} =
      mlFitGpu(t, SobsCl, myMappings, fixedLoads, nMinus1),
    theta0
  )
  # Debug: Fml at the solution (chi-square scale) and step size
  echo "DEBUG: Fml at THETAHAT (chi-square scale) = ",
    fmlCpu(thetaHat, k, myMappings, fixedLoads, SobsCpu, nMinus1)
  let stepNorm = l2norm(thetaHat - theta0)
  echo &"DEBUG: ||thetaHat - theta0||_2 = {stepNorm:>12.6f}"
  # -------------------------
  # E. (Optional) Inspect starts (untransformed)
  # -------------------------
  echo "\nChosen starting values (unconstrained):"
  for i, m in myMappings:
    let u = theta0[i]
    if m.kind == mVar:
      echo &"{i:>2} | {m.name:<15}: u={u:>10.6f} var=u^2={(u*u):>10.6f}"
    else:
      # loadings are unbounded (lavaan/OpenMx-style)
      echo &"{i:>2} | {m.name:<15}: u={u:>10.6f} loading(unbounded)={u:>10.6f}"
  # -------------------------
  # F. Final chi-square (CPU, double precision)
  # -------------------------
  let SigmaHatCpu: Tensor[float64] = SigmaHatFromThetaCPU(thetaHat, k, myMappings, fixedLoads)
  let chi2 = mlChiSquareFull(SigmaHatCpu, SobsCpu.astype(float64), nMinus1)
  # -------------------------
  # G. Estimated Parameters (after optimization)
  # -------------------------
  echo "\nEstimated parameters:"
  for i, m in myMappings:
    let u = thetaHat[i]  # optimized (unconstrained) parameter
    if m.kind == mVar:
      echo &"{i:>2} | {m.name:<15}: variance = {(u*u):>12.6f}"
    else:
      echo &"{i:>2} | {m.name:<15}: loading = {u:>12.6f}"
  echo &"\nFinal ML chi-square (lavaan/OpenMx style): {chi2:>12.6f}"
  # -------------------------
  # H. Degrees of Freedom
  # -------------------------
  let numStats = k*(k+1) div 2  # unique elements of the covariance matrix
  echo &"Model df: {numStats - myMappings.len}"