# npsol.nim
# Compile & run:
# nim c -r -d:release -d:opencl -d:blas=openblas --mm:refc npsol.nim
import std/[math, strformat, sequtils, os]
import arraymancer
import arraymancer/tensor/tensor_opencl
import arraymancer/tensor/backend/opencl_backend
import numericalnim
import opencl
import datamancer
import std/[strutils, tables]
# -----------------------------------------------------------------------
# 1. Structural Mapping (The Lavaan Parser Lite)
# -----------------------------------------------------------------------
type
  MatrixType = enum mPath, mVar
  ParameterMapping = object
    kind: MatrixType
    row, col: int
    paramIdx: int
    name: string # Added for pretty printing

type
  FixedLoading = object
    row, col: int
    value: float32
# ---------------------------
# CPU linear algebra helpers (float64)
# ---------------------------
proc eye64*(n: int): Tensor[float64] =
  result = zeros[float64](n, n)
  for i in 0 ..< n:
    result[i, i] = 1.0

proc transpose64*(A: Tensor[float64]): Tensor[float64] =
  let m = A.shape[0]
  let n = A.shape[1]
  result = zeros[float64](n, m)
  for i in 0 ..< m:
    for j in 0 ..< n:
      result[j, i] = A[i, j]

proc matmul64*(A, B: Tensor[float64]): Tensor[float64] =
  let m = A.shape[0]
  let n = A.shape[1]
  let p = B.shape[1]
  doAssert n == B.shape[0], "matmul64: dimension mismatch"
  result = zeros[float64](m, p)
  for i in 0 ..< m:
    for j in 0 ..< p:
      var acc = 0.0
      for k in 0 ..< n:
        acc += A[i, k] * B[k, j]
      result[i, j] = acc
proc choleskySPD64*(S: Tensor[float64]; jitter: float64 = 1e-12): Tensor[float64] =
  ## Returns lower-triangular L such that S ≈ L * L^T.
  let n = S.shape[0]
  doAssert n == S.shape[1], "choleskySPD64: S must be square"
  result = S.clone() # work on a copy; factor in place
  for i in 0 ..< n:
    for j in 0 .. i:
      var sum = result[i, j]
      for k in 0 ..< j:
        sum -= result[i, k] * result[j, k]
      if i == j:
        if sum <= 0.0: sum = sum + jitter
        result[i, j] = sqrt(sum)
      else:
        result[i, j] = sum / result[j, j]
    # zero upper triangle
    for j in i+1 ..< n:
      result[i, j] = 0.0
proc logdetSPD64*(S: Tensor[float64]): float64 =
  let L = choleskySPD64(S)
  var s = 0.0
  for i in 0 ..< L.shape[0]:
    let d = L[i, i]
    if d <= 0.0:
      raise newException(ValueError, "logdetSPD64: non-positive Cholesky diag")
    s += ln(d)
  result = 2.0 * s

proc invSPD_from_chol64*(S: Tensor[float64]): Tensor[float64] =
  ## Invert SPD S using its Cholesky (two triangular solves).
  let L = choleskySPD64(S)
  let n = L.shape[0]
  var Y = zeros[float64](n, n)
  let I = eye64(n)
  # Forward solve: L * Y = I
  for j in 0 ..< n:
    for i in 0 ..< n:
      var s = I[i, j]
      for k in 0 ..< i:
        s -= L[i, k] * Y[k, j]
      Y[i, j] = s / L[i, i]
  # Backward solve: L^T * X = Y
  result = zeros[float64](n, n)
  for j in 0 ..< n:
    for i in countdown(n-1, 0):
      var s = Y[i, j]
      for k in i+1 ..< n:
        s -= L[k, i] * result[k, j]
      result[i, j] = s / L[i, i]

proc traceProduct64*(A, B: Tensor[float64]): float64 =
  doAssert A.shape[0] == B.shape[0] and A.shape[1] == B.shape[1], "traceProduct64: shape mismatch"
  let n = A.shape[0]
  var tr = 0.0
  for i in 0 ..< n:
    var acc = 0.0
    for j in 0 ..< n:
      acc += A[i, j] * B[j, i]
    tr += acc
  result = tr
proc invGeneral64*(A: Tensor[float64]; jitter: float64 = 0.0): Tensor[float64] =
  ## Gauss-Jordan inverse with partial pivoting (for small n).
  let n = A.shape[0]
  doAssert n == A.shape[1], "invGeneral64: A must be square"
  var M = A.clone() # work on a copy
  if jitter != 0.0:
    for i in 0 ..< n:
      M[i, i] += jitter
  result = eye64(n)
  for col in 0 ..< n:
    # pivot
    var piv = col
    var maxAbs = abs(M[col, col])
    for r in col+1 ..< n:
      let v = abs(M[r, col])
      if v > maxAbs:
        maxAbs = v
        piv = r
    if maxAbs == 0.0:
      raise newException(ValueError, "invGeneral64: singular matrix")
    # manual row swap (avoid swap() on tensors)
    if piv != col:
      for j in 0 ..< n:
        let tmpM = M[col, j]
        M[col, j] = M[piv, j]
        M[piv, j] = tmpM
        let tmpR = result[col, j]
        result[col, j] = result[piv, j]
        result[piv, j] = tmpR
    let pivotVal = M[col, col]
    for j in 0 ..< n:
      M[col, j] /= pivotVal
      result[col, j] /= pivotVal
    # eliminate other rows
    for r in 0 ..< n:
      if r == col: continue
      let factor = M[r, col]
      if factor != 0.0:
        for j in 0 ..< n:
          M[r, j] -= factor * M[col, j]
          result[r, j] -= factor * result[col, j]
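
# Minimal numerical sanity check for the CPU helpers above. It is a sketch
# gated behind a hypothetical -d:npsolExamples flag (the flag name is an
# assumption, not part of the original build): it verifies that
# invSPD_from_chol64 inverts a small SPD matrix and that logdetSPD64 matches
# ln(det) computed by hand (det([[4,1],[1,3]]) = 11).
when defined(npsolExamples):
  block:
    let S = [[4.0, 1.0],
             [1.0, 3.0]].toTensor
    let Sinv = invSPD_from_chol64(S)
    let shouldBeI = matmul64(S, Sinv)
    for i in 0 ..< 2:
      for j in 0 ..< 2:
        let target = if i == j: 1.0 else: 0.0
        doAssert abs(shouldBeI[i, j] - target) < 1e-8
    doAssert abs(logdetSPD64(S) - ln(11.0)) < 1e-8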
proc parseFixedCoeff(token: string): (bool, float, string) =
  ## Detects numeric fixed coefficient like "1*x1" or "0.7 * x2".
  ## Returns (isFixed, coeff, varName).
  ## We purposely ignore tokens that begin with "start(" (those are not fixed).
  var t = token.strip()
  if t.len >= 6 and t.startsWith("start("):
    return (false, NaN, t) # handled elsewhere
  let star = t.find('*')
  if star >= 0:
    let left = t[0 .. star-1].strip()
    let right = t[star+1 .. ^1].strip()
    # Try to parse the left side as a float coefficient
    var coeff: float
    try:
      coeff = left.parseFloat
      return (true, coeff, right)
    except ValueError:
      discard
  return (false, NaN, t)

proc extractStart(token: string): (float, string) =
  ## Parses tokens like:
  ##   "start(0.3)*x1", "start(.3) * x1", "x1"
  ## Returns: (startValue, variableName); startValue = NaN if none provided.
  var t = token.strip()
  if t.len >= 6 and t.startsWith("start("):
    let closeParen = t.find(")")
    if closeParen > 6:
      let valStr = t[6 .. closeParen-1] # inside 'start(...)'
      let after = (if closeParen+1 < t.len: t[closeParen+1 .. ^1] else: "").strip()
      var varName = after
      if after.len > 0 and after[0] == '*':
        varName = after[1 .. ^1].strip()
      return (valStr.parseFloat, varName)
  # No start() found; the token should just be the variable name
  return (NaN, t)
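
# Illustrative checks for the two token parsers, gated behind the same assumed
# -d:npsolExamples flag: a numeric fixed coefficient is recognized, a start()
# prefix yields a starting value, and a bare name passes through with NaN.
when defined(npsolExamples):
  block:
    doAssert parseFixedCoeff("1*x1") == (true, 1.0, "x1")
    doAssert parseFixedCoeff("start(0.3)*x1")[0] == false # start() is not a fixed value
    let (sv, vn) = extractStart("start(0.3) * x2")
    doAssert vn == "x2" and abs(sv - 0.3) < 1e-12
    let (svBare, vnBare) = extractStart("x3")
    doAssert vnBare == "x3" and svBare.isNaN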
proc getSmartStarts(
  mappings: seq[ParameterMapping],
  Sobs: Tensor[float32],
  vars: seq[string],
  userStarts: Table[int, float]
): Tensor[float] =
  result = newTensor[float](mappings.len)
  for i, m in mappings:
    if userStarts.hasKey(m.paramIdx):
      let s = userStarts[m.paramIdx]
      if m.kind == mVar:
        # For square mapping: u = sqrt(variance_start)
        result[i] = sqrt(max(1e-12, s))
      else:
        result[i] = s
    elif m.kind == mPath:
      result[i] = 0.5
    else:
      # default variance start: half of sample variance, mapped to u = sqrt(...)
      let varIdx = m.row - 1
      if varIdx >= 0 and varIdx < vars.len:
        let rawVar = max(1e-12'f32, Sobs[varIdx, varIdx] * 0.5'f32)
        result[i] = sqrt(rawVar.float)
      else:
        # latent variance fallback if present and user didn't specify
        result[i] = 1.0
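
# Small sketch of the start-value mapping (assumed -d:npsolExamples flag):
# a user-supplied variance start of 0.09 is stored as u = sqrt(0.09) = 0.3,
# because variances enter the model as u^2, while a free loading without a
# user start falls back to the default of 0.5.
when defined(npsolExamples):
  block:
    let ms = @[
      ParameterMapping(kind: mPath, row: 1, col: 0, paramIdx: 0, name: "eta -> x1"),
      ParameterMapping(kind: mVar, row: 1, col: 1, paramIdx: 1, name: "Var(x1)")]
    var us = initTable[int, float]()
    us[1] = 0.09
    let SobsToy = @[@[2.0'f32]].toTensor # 1 x 1 sample covariance
    let t0 = getSmartStarts(ms, SobsToy, @["x1"], us)
    doAssert abs(t0[0] - 0.5) < 1e-12 # default loading start
    doAssert abs(t0[1] - 0.3) < 1e-9  # sqrt(0.09) for the variance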
proc parseLavaan(
  syntax: string,
  vars: seq[string]
): (seq[ParameterMapping], Table[int, float], seq[FixedLoading]) =
  var mappings: seq[ParameterMapping]
  var userStarts = initTable[int, float]()
  var fixedLoads: seq[FixedLoading]
  var pIdx = 0
  # 1. Lookup table
  var nameToIdx = initTable[string, int]()
  nameToIdx["eta"] = 0
  for i, v in vars:
    nameToIdx[v] = i + 1
  for line in syntax.splitLines():
    let clean = line.strip()
    if clean.len == 0 or clean.startsWith("#"): continue
    # 2. Loadings: eta =~ ...
    if "=~" in clean:
      let parts = clean.split("=~")
      if parts.len != 2: continue
      let latent = parts[0].strip()
      let rhs = parts[1].split("+").mapIt(it.strip())
      for tk in rhs:
        # First: numeric fixed-coefficient form: "1*x1", "0.7*x2", etc.
        let (isFixed, coeff, obsCandidate) = parseFixedCoeff(tk)
        if isFixed:
          if nameToIdx.hasKey(obsCandidate) and nameToIdx.hasKey(latent):
            fixedLoads.add FixedLoading(
              row: nameToIdx[obsCandidate],
              col: nameToIdx[latent],
              value: coeff.float32
            )
          continue # do NOT create a free parameter in this case
        # Otherwise, check start(value)*xj or bare xj
        let (startVal, obsName) = extractStart(tk)
        if nameToIdx.hasKey(obsName):
          mappings.add ParameterMapping(
            kind: mPath,
            row: nameToIdx[obsName],
            col: nameToIdx[latent],
            paramIdx: pIdx,
            name: latent & " -> " & obsName
          )
          if not startVal.isNaN:
            userStarts[pIdx] = startVal
          inc pIdx
    # 3. Variances: x1 ~~ x1 (allow start() on either side)
    elif "~~" in clean:
      let parts = clean.split("~~")
      if parts.len != 2: continue
      let lhsRaw = parts[0].strip()
      let rhsRaw = parts[1].strip()
      # Extract (start, varName) from both sides
      let (startL, nameL) = extractStart(lhsRaw)
      let (startR, nameR) = extractStart(rhsRaw)
      # Use whichever side gives a known variable name
      var vName = nameL
      if not nameToIdx.hasKey(vName) and nameToIdx.hasKey(nameR):
        vName = nameR
      if not nameToIdx.hasKey(vName): continue # unknown variable
      # (We no longer skip Var(eta); let the syntax control it.)
      # Prefer the LHS start, else the RHS start
      var startVal = startL
      if startVal.isNaN and not startR.isNaN:
        startVal = startR
      # Warn if both are present and disagree
      if (not startL.isNaN) and (not startR.isNaN) and startL != startR:
        echo "WARNING: Conflicting starts for ", vName, " variance; using LHS = ", startL
      let idx = nameToIdx[vName]
      mappings.add ParameterMapping(
        kind: mVar,
        row: idx, col: idx,
        paramIdx: pIdx,
        name: "Var(" & vName & ")"
      )
      if not startVal.isNaN:
        userStarts[pIdx] = startVal
      inc pIdx
  result = (mappings, userStarts, fixedLoads)
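
# Tiny end-to-end parse, gated behind the assumed -d:npsolExamples flag. With
# observed variables x1 and x2, "1*x1" becomes a fixed loading (row 1 -> eta in
# column 0), while "start(0.8)*x2" and "x2 ~~ start(0.1)*x2" become free
# parameters with recorded starting values.
when defined(npsolExamples):
  block:
    let (ms, starts, fixed) = parseLavaan("""
      eta =~ 1*x1 + start(0.8)*x2
      x2 ~~ start(0.1)*x2
    """, @["x1", "x2"])
    doAssert fixed.len == 1 and fixed[0].row == 1 and fixed[0].col == 0
    doAssert ms.len == 2
    doAssert abs(starts[0] - 0.8) < 1e-12 # start() for the x2 loading
    doAssert abs(starts[1] - 0.1) < 1e-12 # start() for Var(x2)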
# Initialize OpenCL early to ensure clQueue0 is valid
discard zeros[float32](1, 1).opencl
# -----------------------------------------------------------------------
# 2. OpenCL Context & Kernels
# -----------------------------------------------------------------------
var raw_cl_context: Pcontext
discard getCommandQueueInfo(clQueue0, QUEUE_CONTEXT, sizeof(raw_cl_context), addr raw_cl_context, nil)
var deviceID: Pdevice_id
discard getCommandQueueInfo(clQueue0, QUEUE_DEVICE, sizeof(deviceID), addr deviceID, nil)
let custom_kernels = """
__kernel void matmul_generic(__global const float* A, __global const float* B, __global float* C, int M, int N, int P) {
  int i = get_global_id(0); int j = get_global_id(1);
  if (i >= M || j >= P) return;
  float acc = 0.0f;
  for (int k = 0; k < N; ++k) acc += A[i*N + k] * B[k*P + j];
  C[i*P + j] = acc;
}
__kernel void transpose_generic(__global const float* A, __global float* C, int M, int N) {
  int i = get_global_id(0); int j = get_global_id(1);
  if (i < M && j < N) C[j*M + i] = A[i*N + j];
}
__kernel void add_eye_scaled(__global float* A, int N, float alpha) {
  int i = get_global_id(0);
  if (i < N) A[i*N + i] += alpha;
}
__kernel void invert_generic(__global float* A, __global float* Inv, int n) {
  int row = get_global_id(0);
  if (row >= n) return;
  for (int j = 0; j < n; j++) Inv[row*n + j] = (row == j) ? 1.0f : 0.0f;
  for (int k = 0; k < n; k++) {
    float pivot = A[k*n + k];
    if (pivot == 0.0f) pivot = 1e-8f;
    if (row == k) {
      for (int j = 0; j < n; j++) { A[k*n + j] /= pivot; Inv[k*n + j] /= pivot; }
    }
    barrier(CLK_GLOBAL_MEM_FENCE);
    if (row != k) {
      float factor = A[row*n + k];
      for (int j = 0; j < n; j++) { A[row*n + j] -= factor * A[k*n + j]; Inv[row*n + j] -= factor * Inv[k*n + j]; }
    }
    barrier(CLK_GLOBAL_MEM_FENCE);
  }
}
__kernel void cholesky_spd_single(__global const float* A, __global float* L, int N) {
  if (get_global_id(0) != 0) return;
  for (int i = 0; i < N; ++i) for (int j = 0; j < N; ++j) L[i*N + j] = A[i*N + j];
  for (int k = 0; k < N; ++k) {
    float sum = 0.0f;
    for (int s = 0; s < k; ++s) sum += L[k*N + s] * L[k*N + s];
    float diag = L[k*N + k] - sum;
    if (diag <= 0.0f) diag = 1e-8f;
    L[k*N + k] = sqrt(diag);
    for (int i = k+1; i < N; ++i) {
      float s2 = 0.0f;
      for (int s = 0; s < k; ++s) s2 += L[i*N + s] * L[k*N + s];
      L[i*N + k] = (L[i*N + k] - s2) / L[k*N + k];
    }
    for (int j = k+1; j < N; ++j) L[k*N + j] = 0.0f;
  }
}
__kernel void solve_lower_many_single(__global const float* L, __global const float* B, __global float* Y, int N, int M) {
  if (get_global_id(0) != 0) return;
  for (int j = 0; j < M; ++j) {
    for (int i = 0; i < N; ++i) {
      float sum = 0.0f;
      for (int k = 0; k < i; ++k) sum += L[i*N + k] * Y[k*M + j];
      Y[i*M + j] = (B[i*M + j] - sum) / L[i*N + i];
    }
  }
}
__kernel void solve_upper_many_single(__global const float* U, __global const float* Y, __global float* X, int N, int M) {
  if (get_global_id(0) != 0) return;
  for (int j = 0; j < M; ++j) {
    for (int i = N-1; i >= 0; --i) {
      float sum = 0.0f;
      for (int k = i+1; k < N; ++k) sum += U[i*N + k] * X[k*M + j];
      X[i*M + j] = (Y[i*M + j] - sum) / U[i*N + i];
    }
  }
}
__kernel void logdet_spd_from_chol_single(__global const float* L, int N, __global float* sumLogDiag) {
  if (get_global_id(0) != 0) return;
  float s = 0.0f;
  for (int i = 0; i < N; ++i) { float d = L[i*N + i]; if (d <= 0.0f) d = 1e-8f; s += log(d); }
  sumLogDiag[0] = 2.0f * s;
}
__kernel void trace_product_single(__global const float* S, __global const float* X, int N, __global float* out) {
  if (get_global_id(0) != 0) return;
  float tr = 0.0f;
  for (int i = 0; i < N; ++i) {
    float acc = 0.0f;
    for (int j = 0; j < N; ++j) acc += S[i*N + j] * X[j*N + i];
    tr += acc;
  }
  out[0] = tr;
}
"""
var src = cstring(custom_kernels)
var arr = [src]
let sources = cast[cstringArray](unsafeAddr arr[0])
var err_prog: TClResult
let program = createProgramWithSource(raw_cl_context, 1'u32, sources, nil, addr err_prog)
echo "Compiling OpenCL Kernels..."
let buildStatus = buildProgram(program, 0, nil, nil, nil, nil)
if buildStatus != SUCCESS: quit("Kernel build failed.")
var kMatmul, kTranspose, kAddEye, kInvert, kChol, kSolveL, kSolveU, kLogdetChol, kTrace: Pkernel
var e1: TClResult
kMatmul = createKernel(program, "matmul_generic", addr e1)
kTranspose = createKernel(program, "transpose_generic", addr e1)
kAddEye = createKernel(program, "add_eye_scaled", addr e1)
kInvert = createKernel(program, "invert_generic", addr e1)
kChol = createKernel(program, "cholesky_spd_single", addr e1)
kSolveL = createKernel(program, "solve_lower_many_single", addr e1)
kSolveU = createKernel(program, "solve_upper_many_single", addr e1)
kLogdetChol = createKernel(program, "logdet_spd_from_chol_single", addr e1)
kTrace = createKernel(program, "trace_product_single", addr e1)
# -----------------------------------------------------------------------
# 3. GPU Helpers
# -----------------------------------------------------------------------
proc setArg[T](k: Pkernel, idx: int, v: T) =
  var tmp = v
  discard setKernelArg(k, idx.uint32, sizeof(T).int, addr tmp)
proc clTranspose(A: ClTensor[float32]): ClTensor[float32] =
  let M = A.shape[0].int; let N = A.shape[1].int
  result = newTensor[float32](N, M).opencl
  setArg(kTranspose, 0, A.toClpointer()); setArg(kTranspose, 1, result.toClpointer())
  setArg(kTranspose, 2, M.int32); setArg(kTranspose, 3, N.int32)
  var gs = [csize(M), csize(N)]
  discard enqueueNDRangeKernel(clQueue0, kTranspose, 2, nil, addr gs[0], nil, 0, nil, nil)
  discard flush(clQueue0)

proc clMatmul(A, B: ClTensor[float32]): ClTensor[float32] =
  let M = A.shape[0].int; let N = A.shape[1].int; let P = B.shape[1].int
  result = newTensor[float32](M, P).opencl
  setArg(kMatmul, 0, A.toClpointer()); setArg(kMatmul, 1, B.toClpointer()); setArg(kMatmul, 2, result.toClpointer())
  setArg(kMatmul, 3, M.int32); setArg(kMatmul, 4, N.int32); setArg(kMatmul, 5, P.int32)
  var gs = [csize(M), csize(P)]
  discard enqueueNDRangeKernel(clQueue0, kMatmul, 2, nil, addr gs[0], nil, 0, nil, nil)
  discard flush(clQueue0)

proc clAddEyeScaled(A: var ClTensor[float32], alpha: float32) =
  let N = A.shape[0].int
  setArg(kAddEye, 0, A.toClpointer()); setArg(kAddEye, 1, N.int32); setArg(kAddEye, 2, alpha)
  var gs = [csize(N)]
  discard enqueueNDRangeKernel(clQueue0, kAddEye, 1, nil, addr gs[0], nil, 0, nil, nil)
  discard flush(clQueue0)

proc clInvert(A: ClTensor[float32]): ClTensor[float32] =
  let n = A.shape[0].int
  var Awork = A.clone()
  result = newTensor[float32](n, n).opencl
  setArg(kInvert, 0, Awork.toClpointer()); setArg(kInvert, 1, result.toClpointer()); setArg(kInvert, 2, n.int32)
  var gs = [csize(n)]; var ls = [csize(n)]
  discard enqueueNDRangeKernel(clQueue0, kInvert, 1, nil, addr gs[0], addr ls[0], 0, nil, nil)
  discard flush(clQueue0)

proc clCholesky(A: ClTensor[float32]): ClTensor[float32] =
  let n = A.shape[0].int
  result = newTensor[float32](n, n).opencl
  setArg(kChol, 0, A.toClpointer()); setArg(kChol, 1, result.toClpointer()); setArg(kChol, 2, n.int32)
  var gs = [csize(1)]; var ls = [csize(1)]
  discard enqueueNDRangeKernel(clQueue0, kChol, 1, nil, addr gs[0], addr ls[0], 0, nil, nil)
  discard flush(clQueue0)

proc clSPDInverse(A: ClTensor[float32]): ClTensor[float32] =
  let n = A.shape[0].int
  let L = clCholesky(A)
  var I = eye[float32](n, n).opencl
  var Ycl = newTensor[float32](n, n).opencl
  setArg(kSolveL, 0, L.toClpointer()); setArg(kSolveL, 1, I.toClpointer()); setArg(kSolveL, 2, Ycl.toClpointer())
  setArg(kSolveL, 3, n.int32); setArg(kSolveL, 4, n.int32)
  var gs = [csize(1)]; discard enqueueNDRangeKernel(clQueue0, kSolveL, 1, nil, addr gs[0], nil, 0, nil, nil)
  let Ucl = clTranspose(L)
  result = newTensor[float32](n, n).opencl
  setArg(kSolveU, 0, Ucl.toClpointer()); setArg(kSolveU, 1, Ycl.toClpointer()); setArg(kSolveU, 2, result.toClpointer())
  setArg(kSolveU, 3, n.int32); setArg(kSolveU, 4, n.int32)
  discard enqueueNDRangeKernel(clQueue0, kSolveU, 1, nil, addr gs[0], nil, 0, nil, nil)
  discard flush(clQueue0)

proc clLogDetSPD(A: ClTensor[float32]): float =
  let n = A.shape[0].int
  let L = clCholesky(A)
  var buf = newTensor[float32](1, 1).opencl
  setArg(kLogdetChol, 0, L.toClpointer()); setArg(kLogdetChol, 1, n.int32); setArg(kLogdetChol, 2, buf.toClpointer())
  var gs = [csize(1)]; discard enqueueNDRangeKernel(clQueue0, kLogdetChol, 1, nil, addr gs[0], nil, 0, nil, nil)
  discard flush(clQueue0)
  result = buf.cpu()[0,0].float
proc clTraceProduct(S, X: ClTensor[float32]): float =
  let n = S.shape[0].int
  var outBuf = newTensor[float32](1, 1).opencl
  setArg(kTrace, 0, S.toClpointer()); setArg(kTrace, 1, X.toClpointer()); setArg(kTrace, 2, n.int32); setArg(kTrace, 3, outBuf.toClpointer())
  var gs = [csize(1)]; discard enqueueNDRangeKernel(clQueue0, kTrace, 1, nil, addr gs[0], nil, 0, nil, nil)
  discard flush(clQueue0)
  result = outBuf.cpu()[0,0].float
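
# Optional cross-check of the GPU kernels against the float64 CPU helpers
# defined earlier, gated behind the assumed -d:npsolExamples flag. Agreement is
# only asserted to ~1e-3 because the kernels work in float32, and this of
# course requires a working OpenCL device.
when defined(npsolExamples):
  block:
    let Scpu32 = [[4.0'f32, 1.0'f32],
                  [1.0'f32, 3.0'f32]].toTensor
    let Scpu64 = Scpu32.astype(float64)
    let invGpu = clSPDInverse(Scpu32.opencl).cpu()
    let invCpu = invSPD_from_chol64(Scpu64)
    for i in 0 ..< 2:
      for j in 0 ..< 2:
        doAssert abs(invGpu[i, j].float64 - invCpu[i, j]) < 1e-3
    doAssert abs(clLogDetSPD(Scpu32.opencl) - logdetSPD64(Scpu64)) < 1e-3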
proc SigmaHatFromThetaCPU*(
  theta: Tensor[float], # float64 parameters
  k: int,
  mappings: seq[ParameterMapping],
  fixedLoads: seq[FixedLoading]
): Tensor[float64] =
  let p = k + 1 # 1 latent + k observed
  var A = zeros[float64](p, p)
  var S = zeros[float64](p, p)
  # 1) Fixed loadings (e.g., 1*x1)
  for fl in fixedLoads:
    A[fl.row, fl.col] = fl.value.float64
  # 2) Free parameters
  for m in mappings:
    let u = theta[m.paramIdx]
    case m.kind
    of mPath:
      A[m.row, m.col] = u # unbounded loading
    of mVar:
      S[m.row, m.col] = max(1e-12, u*u)
  # 3) Σ = F * (I-A)^{-1} * S * (I-A)^{-T} * F^T
  let I = eye64(p)
  let IA = I - A
  let IAinv = invGeneral64(IA, jitter = 1e-12) # small numerical jitter
  let IAinvT = transpose64(IAinv)
  var F = zeros[float64](k, p)
  for i in 0 ..< k:
    F[i, i+1] = 1.0
  let tmp = matmul64(matmul64(matmul64(F, IAinv), S), IAinvT)
  result = matmul64(tmp, transpose64(F))
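
# Worked example of the RAM-style algebra above, gated behind the assumed
# -d:npsolExamples flag. With one indicator, the loading fixed at 1, Var(eta) =
# 1.0 and Var(x1) = 0.5, the implied variance of x1 is 1^2 * 1.0 + 0.5 = 1.5.
when defined(npsolExamples):
  block:
    let fixed = @[FixedLoading(row: 1, col: 0, value: 1.0'f32)] # eta =~ 1*x1
    let ms = @[
      ParameterMapping(kind: mVar, row: 0, col: 0, paramIdx: 0, name: "Var(eta)"),
      ParameterMapping(kind: mVar, row: 1, col: 1, paramIdx: 1, name: "Var(x1)")]
    var th = newTensor[float](2)
    th[0] = 1.0       # u for Var(eta): variance = u^2 = 1.0
    th[1] = sqrt(0.5) # u for Var(x1): variance = u^2 = 0.5
    let Sigma = SigmaHatFromThetaCPU(th, 1, ms, fixed)
    doAssert abs(Sigma[0, 0] - 1.5) < 1e-6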
# -----------------------------------------------------------------------
# 4. Model & ML Fit
# -----------------------------------------------------------------------
proc sigmaOf(
  theta: Tensor[float], # float64 on CPU
  k: int,
  mappings: seq[ParameterMapping],
  fixedLoads: seq[FixedLoading]
): ClTensor[float32] =
  # p = latent (eta) + k observed
  let p = k + 1
  var A_cpu = zeros[float32](p, p)
  var S_cpu = zeros[float32](p, p)
  # 1) Fixed loadings (e.g., 1*x1)
  for fl in fixedLoads:
    A_cpu[fl.row, fl.col] = fl.value
  # 2) Free parameters (single pass; NO tanh)
  for m in mappings:
    let u32 = theta[m.paramIdx].float32
    case m.kind
    of mPath:
      A_cpu[m.row, m.col] = u32 # unbounded loading
    of mVar:
      S_cpu[m.row, m.col] = max(1e-10'f32, u32*u32) # variance ≥ 0
  # 3) Selection matrix F for observed variables (indices 1..k)
  var F_cpu = zeros[float32](k, p)
  for i in 0 ..< k: F_cpu[i, i+1] = 1.0'f32
  # 4) Sigma = F * (I-A)^{-1} * S * (I-A)^{-T} * F^T
  let I = eye[float32](p, p)
  let IAinv = clInvert((I - A_cpu).opencl) # unit lower-triangular -> invertible
  let Scl = S_cpu.opencl
  let Fcl = F_cpu.opencl
  let tmp = clMatmul(clMatmul(clMatmul(Fcl, IAinv), Scl), clTranspose(IAinv))
  result = clMatmul(tmp, clTranspose(Fcl))
proc mlFitGpu(
  theta: Tensor[float],
  SobsCl: ClTensor[float32],
  mappings: seq[ParameterMapping],
  fixedLoads: seq[FixedLoading],
  nMinus1: int # pass (rows - 1) from main if you want lavaan-style chi^2
): float =
  let k = SobsCl.shape[0]
  # Build model-implied covariance on GPU
  let Sigma = sigmaOf(theta, k, mappings, fixedLoads)
  # Numerical stability ONLY on Σ(θ): tiny jitter to ensure SPD for Cholesky/inverse
  var SigmaAdj = Sigma.clone()
  clAddEyeScaled(SigmaAdj, 1e-7'f32)
  # Normal-theory ML objective without constants (OK for optimization):
  #   F_ml = log|Σ| + tr(S Σ^{-1}) - p
  # We omit -log|S| because it's constant w.r.t. θ (add back for reporting, on CPU).
  let logDetSig = clLogDetSPD(SigmaAdj)
  let invSig = clSPDInverse(SigmaAdj)
  let trTerm = clTraceProduct(SobsCl, invSig)
  let F = logDetSig + trTerm - k.float
  # lavaan-style chi-square scaling: χ² = (n-1) * F_ml
  let chi2 = nMinus1.float * F
  # IMPORTANT:
  # If your lbfgs is a maximizer (as your traces suggest), return -chi2 to force minimization.
  # If lbfgs minimizes by default, change to: result = chi2
  result = -chi2
  # If you later want deviance-like scaling to match -2LL conventions,
  # gate that with a flag here (but keep the sign consistent with your optimizer).
# ---------------------------
# CPU linear algebra helpers (float64, used for reporting the final fit)
# ---------------------------
proc logdetSPD_chol*(S: Tensor[float64]): float64 =
  ## Computes log|S| for SPD S via Cholesky: |S| = (∏ diag(L))^2
  ## Uses our local choleskySPD64() helper (not Arraymancer's cholesky).
  let L = choleskySPD64(S) # lower-triangular; S ≈ L * L^T
  var sumLog = 0.0
  for i in 0 ..< L.shape[0]:
    let d = L[i, i]
    if d <= 0.0:
      raise newException(ValueError, "logdetSPD_chol: non-positive Cholesky diagonal; S may not be SPD")
    sumLog += ln(d)
  result = 2.0 * sumLog

proc invSPD_from_chol*(S: Tensor[float64]): Tensor[float64] =
  ## Invert SPD S using its Cholesky factor without forming inv(L) explicitly.
  ## For small k this is fine; for larger k, prefer a dedicated solver.
  let L = choleskySPD64(S)
  let n = L.shape[0]
  # Solve L * Y = I -> forward substitution
  var I = eye[float64](n, n)
  var Y = zeros[float64](n, n)
  for j in 0 ..< n:
    # column j of Y
    for i in 0 ..< n:
      var s = I[i, j]
      for k in 0 ..< i:
        s -= L[i, k] * Y[k, j]
      Y[i, j] = s / L[i, i]
  # Solve L.T * X = Y -> back substitution
  result = zeros[float64](n, n)
  for j in 0 ..< n:
    for i in countdown(n-1, 0):
      var s = Y[i, j]
      for k in (i+1) ..< n:
        s -= L[k, i] * result[k, j]
      result[i, j] = s / L[i, i]

proc traceProduct*(A, B: Tensor[float64]): float64 =
  ## tr(A B) without forming the full product explicitly (simple version).
  ## For small k, the naive double loop is fine.
  let n = A.shape[0]
  var tr = 0.0
  for i in 0 ..< n:
    var acc = 0.0
    for j in 0 ..< n:
      acc += A[i, j] * B[j, i]
    tr += acc
  result = tr
# After optimization (on CPU, double precision)
proc mlChiSquareFull(
  SigmaHat: Tensor[float64],
  SobsCpu: Tensor[float64],
  nMinus1: int
): float =
  let k = SobsCpu.shape[0]
  # log|Σ|
  let logDetSig = logdetSPD_chol(SigmaHat)
  # inv(Σ) via Cholesky
  let invSig = invSPD_from_chol(SigmaHat)
  # tr(S inv(Σ))
  let trTerm = traceProduct(SobsCpu, invSig)
  # log|S|
  let logDetS = logdetSPD_chol(SobsCpu)
  let Ffull = logDetSig + trTerm - logDetS - k.float64
  result = (nMinus1.float64 * Ffull).float
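
# Sanity check for the reporting-scale fit function (assumed -d:npsolExamples
# flag): when the model-implied covariance equals the observed covariance,
# F_ml = log|S| + tr(S S^{-1}) - log|S| - k = 0, so the chi-square is zero.
when defined(npsolExamples):
  block:
    let S = [[2.0, 0.5],
             [0.5, 1.0]].toTensor
    doAssert abs(mlChiSquareFull(S, S, 99)) < 1e-6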
proc fmlCpu(theta: Tensor[float], k: int,
            mappings: seq[ParameterMapping], fixedLoads: seq[FixedLoading],
            SobsCpu: Tensor[float32], nMinus1: int): float =
  # Build Sigma on CPU in double precision
  let SigmaHatCpu = SigmaHatFromThetaCPU(theta, k, mappings, fixedLoads)
  # Convert S to float64 for CPU math
  let S64 = SobsCpu.astype(float64)
  # Full ML chi-square: (n-1) * Fml
  result = mlChiSquareFull(SigmaHatCpu, S64, nMinus1).float

proc l2norm*(x: Tensor[float]): float =
  ## Euclidean norm for a 1-D Tensor[float] (works for any shape via linear index).
  var s = 0.0
  for i in 0 ..< x.size:
    s += x[i] * x[i]
  result = sqrt(s)
# -----------------------------------------------------------------------
# 5. Main (Scalable Data Loading)
# -----------------------------------------------------------------------
when isMainModule:
  import os, sequtils, strformat, math
  import datamancer
  import arraymancer
  # Ensure OpenCL backend is initialized early
  discard zeros[float32](1, 1).opencl
  let csvPath = "data.csv"
  if not fileExists(csvPath):
    quit("data.csv not found.")
  # -------------------------
  # A. LOAD & DIAGNOSE
  # -------------------------
  let df = datamancer.readCsv(csvPath)
  # Variable order defines model order
  let colNames = toSeq(df.keys)
  let rows = df.len
  let k = colNames.len
  echo &"Detected {k} variables: {colNames}"
  echo &"Number of observations (rows): {rows}"
  # Build data tensor [rows x k] as float32 for GPU efficiency
  var dataTensor = newTensor[float32](rows, k)
  for j, name in colNames:
    let colData = df[name].toTensor(float32) # assumes all columns are numeric / parsed
    for i in 0 ..< rows:
      dataTensor[i, j] = colData[i]
  # Sample covariance (unbiased; divisor n-1)
  let centered = dataTensor .- dataTensor.mean(axis = 0) # [rows x k]
  let SobsCpu = (centered.transpose * centered) /. (rows.float32 - 1.0'f32) # [k x k]
  echo "\nObserved Covariance Matrix (Sobs):"
  echo SobsCpu
  # Diagnostic: standardized correlations (not used in fitting)
  let diagS = SobsCpu.diagonal()
  var sds = newTensor[float32](k)
  for i in 0 ..< k:
    sds[i] = sqrt(max(1e-10'f32, diagS[i]))
  var Sscaled = SobsCpu.clone()
  for i in 0 ..< k:
    for j in 0 ..< k:
      Sscaled[i, j] = SobsCpu[i, j] / (sds[i] * sds[j])
  echo "\nScaled Matrix (standardized to correlations):"
  echo Sscaled
  # Fit ML on the covariance:
  let SobsCl = SobsCpu.opencl
  let nMinus1 = max(1, rows - 1)
  # -------------------------
  # B. DEFINE MODEL (marker loading; latent variance free)
  # -------------------------
  let modelSyntax = """
    eta =~ 1*x1 + start(0.8)*x2 + start(0.7)*x3
    eta ~~ start(1.0)*eta
    x1 ~~ start(0.10)*x1
    x2 ~~ start(0.08)*x2
    x3 ~~ start(0.06)*x3
  """
  let (myMappings, userStarts, fixedLoads) = parseLavaan(modelSyntax, colNames)
  # -------------------------
  # C. Starting values (seeded from the covariance)
  # -------------------------
  var theta0 = getSmartStarts(myMappings, SobsCpu, colNames, userStarts)
  # Ensure parameters are float64 on CPU for the optimizer:
  when compiles(theta0.dtype):
    discard
  else:
    theta0 = theta0.astype(float)
  echo "\nChosen starting values:"
  for i, m in myMappings:
    echo fmt"{i:>2} | {m.name:<15} (Idx {m.paramIdx}): {theta0[i]:>10.6f}"
  # Debug: FML at the starting values (χ² scale)
  echo "DEBUG: Fml at STARTS (chi-square scale) = ",
    fmlCpu(theta0, k, myMappings, fixedLoads, SobsCpu, nMinus1)
echo &"\nStarting optimization with {myMappings.len} free parameters..."
# -------------------------
# D. Optimize (ML on covariance)
# -------------------------
let thetaHat = lbfgs(
proc (t: Tensor[float]): float {.closure.} =
mlFitGpu(t, SobsCl, myMappings, fixedLoads, nMinus1),
theta0
)
# Debug: FML at solution (χ² scale) and step size
echo "DEBUG: Fml at THETAHAT (chi-square scale) = ",
fmlCpu(thetaHat, k, myMappings, fixedLoads, SobsCpu, nMinus1)
let stepNorm = l2norm(thetaHat - theta0)
echo &"DEBUG: ||thetaHat - theta0||_2 = {stepNorm:>12.6f}"
# -------------------------
# E. (Optional) Inspect starts (untransformed)
# -------------------------
echo "\nChosen starting values (unconstrained):"
for i, m in myMappings:
let u = theta0[i]
if m.kind == mVar:
echo &"{i:>2} | {m.name:<15}: u={u:>10.6f} var=u^2={(u*u):>10.6f}"
else:
# loadings are unbounded now (lavaan/OpenMx-style)
echo &"{i:>2} | {m.name:<15}: u={u:>10.6f} loading(unbounded)={u:>10.6f}"
  # -------------------------
  # F. Final chi-square (CPU, double precision)
  # -------------------------
  let SigmaHatCpu: Tensor[float64] = SigmaHatFromThetaCPU(thetaHat, k, myMappings, fixedLoads)
  let chi2 = mlChiSquareFull(SigmaHatCpu, SobsCpu.astype(float64), nMinus1)
  # -------------------------
  # G. Estimated Parameters (after optimization)
  # -------------------------
  echo "\nEstimated parameters:"
  for i, m in myMappings:
    let u = thetaHat[i] # optimized (unconstrained) parameter
    if m.kind == mVar:
      echo &"{i:>2} | {m.name:<15}: variance = {(u*u):>12.6f}"
    else:
      echo &"{i:>2} | {m.name:<15}: loading = {u:>12.6f}"
  echo &"\nFinal ML chi-square (lavaan/OpenMx style): {chi2:>12.6f}"
  # -------------------------
  # H. Degrees of Freedom (computed once)
  # -------------------------
  let numStats = k*(k+1) div 2
  echo &"Model df: {numStats - myMappings.len}"