diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 481bc43..21a2c23 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -12,9 +12,18 @@ jobs:
   linux:
     strategy:
       matrix:
-        py-version: ['3.10', '3.11']
+        include:
+          - py-version: '3.10'
+            c-compiler: 'gcc'
+            cxx-compiler: 'g++'
+          - py-version: '3.11'
+            c-compiler: 'gcc'
+            cxx-compiler: 'g++'
+          - py-version: '3.10'
+            c-compiler: 'clang'
+            cxx-compiler: 'clang++'
       fail-fast: false
-    name: Python ${{ matrix.py-version }}
+    name: ${{ matrix.c-compiler}} Python ${{ matrix.py-version }}
 
 
     runs-on: ubuntu-latest
@@ -22,6 +31,8 @@ jobs:
     env:
       MPIEXEC_FLAGS: "--allow-run-as-root --oversubscribe"
       PYNUCLEUS_BUILD_PARALLELISM: 2
+      OMPI_CC: ${{ matrix.c-compiler }}
+      OMPI_CXX: ${{ matrix.cxx-compiler }}
 
     steps:
     - name: Check out repo
@@ -34,7 +45,7 @@ jobs:
       uses: actions/cache/restore@v3
       with:
         path: /home/runner/.cache/ccache
-        key: ccache-${{ matrix.py-version }}
+        key: ccache-${{ matrix.c-compiler }}-${{ matrix.py-version }}
 
     - uses: actions/setup-python@v4
       if: always()
@@ -57,7 +68,7 @@ jobs:
 
     - name: Install
       if: always()
-      run: python -m pip install wheel && python -m pip list && make install PIP_INSTALL_FLAGS="--no-use-pep517"
+      run: python -m pip install wheel && python -m pip list && make install PIP_INSTALL_FLAGS="--no-use-pep517 -vvv"
 
     - name: Remove ccache cache
       if: ${{ steps.ccache-restore.outputs.cache-hit }}
@@ -66,7 +77,7 @@ jobs:
         GH_TOKEN: ${{ github.token }}
       run: |
         gh extension install actions/gh-actions-cache
-        gh actions-cache delete ccache-${{ matrix.py-version }} --confirm
+        gh actions-cache delete ccache-${{ matrix.c-compiler}}-${{ matrix.py-version }} --confirm
       continue-on-error: true
 
     - name: Push ccache cache
@@ -74,7 +85,7 @@ jobs:
       uses: actions/cache/save@v3
       with:
         path: /home/runner/.cache/ccache
-        key: ccache-${{ matrix.py-version }}
+        key: ccache-${{ matrix.c-compiler }}-${{ matrix.py-version }}
 
     - name: Ccache report
       if: always()
@@ -82,27 +93,27 @@ jobs:
 
     - name: Run tests
       if: always()
-      run: python3 -m pytest --junit-xml=test-results-${{ matrix.py-version }}.xml tests/
+      run: python3 -m pytest --junit-xml=test-results-${{ matrix.c-compiler }}-${{ matrix.py-version }}.xml tests/
 
     - name: Run flake8
       if: always()
       run: |
         make flake8
-        mv flake8.xml flake8-${{ matrix.py-version }}.xml
+        mv flake8.xml flake8-${{ matrix.c-compiler }}-${{ matrix.py-version }}.xml
 
     - name: Archive test results
       uses: actions/upload-artifact@v3
       if: always()
       with:
         name: Test results
-        path: test-results-${{ matrix.py-version }}.xml
+        path: test-results-${{ matrix.c-compiler }}-${{ matrix.py-version }}.xml
 
     - name: Report test results
       uses: dorny/test-reporter@v1
       if: always()
       with:
-        name: Test report (Python ${{ matrix.py-version }})
-        path: test-results-${{ matrix.py-version }}.xml
+        name: Test report (${{ matrix.c-compiler }}, Python ${{ matrix.py-version }})
+        path: test-results-${{ matrix.c-compiler }}-${{ matrix.py-version }}.xml
         reporter: java-junit
         fail-on-error: true
 
@@ -110,7 +121,7 @@ jobs:
       uses: dorny/test-reporter@v1
       if: always()
       with:
-        name: Flake8 report (Python ${{ matrix.py-version }})
-        path: flake8-${{ matrix.py-version }}.xml
+        name: Flake8 report (${{ matrix.c-compiler }}, Python ${{ matrix.py-version }})
+        path: flake8-${{ matrix.c-compiler }}-${{ matrix.py-version }}.xml
         reporter: java-junit
         fail-on-error: false
diff --git a/base/PyNucleus_base/blas.pyx b/base/PyNucleus_base/blas.pyx
index d339a2f..5a7d8bd 100644
--- a/base/PyNucleus_base/blas.pyx
+++ b/base/PyNucleus_base/blas.pyx
@@ -9,462 +9,15 @@ import numpy as np
 cimport numpy as np
 from . myTypes import INDEX
 
-include "config.pxi"
 
 cdef:
     INDEX_t MAX_INT = np.iinfo(INDEX).max
     REAL_t NAN = np.nan
 
 
-def uninitialized(*args, **kwargs):
-    IF FILL_UNINITIALIZED:
-        if 'dtype' in kwargs and np.issubdtype(kwargs['dtype'], np.integer):
-            kwargs['fill_value'] = np.iinfo(kwargs['dtype']).min
-        else:
-            kwargs['fill_value'] = NAN
-        return np.full(*args, **kwargs)
-    ELSE:
-        return np.empty(*args, **kwargs)
-
-
-def uninitialized_like(like, **kwargs):
-    IF FILL_UNINITIALIZED:
-        if np.issubdtype(np.array(like, copy=False).dtype, np.integer):
-            kwargs['fill_value'] = np.iinfo(like.dtype).min
-        else:
-            kwargs['fill_value'] = NAN
-        return np.full_like(like, **kwargs)
-    ELSE:
-        return np.empty_like(like, **kwargs)
-
-
-cpdef carray uninitializedINDEX(tuple shape):
-    cdef:
-        carray a = carray(shape, 4, 'i')
-        Py_ssize_t s, i
-    IF FILL_UNINITIALIZED:
-        s = 1
-        for i in range(len(shape)):
-            s *= shape[i]
-        for i in range(s):
-            (<INDEX_t*>a.data)[i] = MAX_INT
-    return a
-
-
-cpdef carray uninitializedREAL(tuple shape):
-    cdef:
-        carray a = carray(shape, 8, 'd')
-        Py_ssize_t s, i
-    IF FILL_UNINITIALIZED:
-        s = 1
-        for i in range(len(shape)):
-            s *= shape[i]
-        for i in range(s):
-            (<REAL_t*>a.data)[i] = NAN
-    return a
-
-
-IF USE_BLAS:
-
-    from scipy.linalg.cython_blas cimport dcopy, dscal, daxpy, ddot, dnrm2
-    from scipy.linalg.cython_blas cimport zcopy, zscal, zaxpy, zdotc
-    from scipy.linalg.cython_blas cimport dgemv, zgemv
-    from scipy.linalg.cython_blas cimport dgemm, zgemm
-
-
-    cdef void assign(SCALAR_t[::1] y, SCALAR_t[::1] x):
-        cdef:
-            double* x_ptr
-            double* y_ptr
-            double complex* x_ptr_c
-            double complex* y_ptr_c
-            int n = x.shape[0]
-            int inc = 1
-        if SCALAR_t is COMPLEX_t:
-            x_ptr_c = &x[0]
-            y_ptr_c = &y[0]
-            zcopy(&n, x_ptr_c, &inc, y_ptr_c, &inc)
-        else:
-            x_ptr = &x[0]
-            y_ptr = &y[0]
-            dcopy(&n, x_ptr, &inc, y_ptr, &inc)
-
-    cdef void assignScaled(SCALAR_t[::1] y, SCALAR_t[::1] x, SCALAR_t alpha):
-        cdef:
-            double* x_ptr
-            double* y_ptr
-            double complex* x_ptr_c
-            double complex* y_ptr_c
-            int n = x.shape[0]
-            int inc = 1
-        if SCALAR_t is COMPLEX_t:
-            x_ptr_c = &x[0]
-            y_ptr_c = &y[0]
-            zcopy(&n, x_ptr_c, &inc, y_ptr_c, &inc)
-            zscal(&n, &alpha, y_ptr_c, &inc)
-        else:
-            x_ptr = &x[0]
-            y_ptr = &y[0]
-            dcopy(&n, x_ptr, &inc, y_ptr, &inc)
-            dscal(&n, &alpha, y_ptr, &inc)
-
-    cdef void assign3(SCALAR_t[::1] z, SCALAR_t[::1] x, SCALAR_t alpha, SCALAR_t[::1] y, SCALAR_t beta):
-        cdef:
-            double* x_ptr
-            double* y_ptr
-            double* z_ptr
-            double complex* x_ptr_c
-            double complex* y_ptr_c
-            double complex* z_ptr_c
-            int n = x.shape[0]
-            int inc = 1
-        if SCALAR_t is COMPLEX_t:
-            x_ptr_c = &x[0]
-            y_ptr_c = &y[0]
-            z_ptr_c = &z[0]
-            zcopy(&n, x_ptr_c, &inc, z_ptr_c, &inc)
-            if alpha != 1.0:
-                zscal(&n, &alpha, z_ptr_c, &inc)
-            zaxpy(&n, &beta, y_ptr_c, &inc, z_ptr_c, &inc)
-        else:
-            x_ptr = &x[0]
-            y_ptr = &y[0]
-            z_ptr = &z[0]
-            dcopy(&n, x_ptr, &inc, z_ptr, &inc)
-            if alpha != 1.0:
-                dscal(&n, &alpha, z_ptr, &inc)
-            daxpy(&n, &beta, y_ptr, &inc, z_ptr, &inc)
-
-    cdef void update(SCALAR_t[::1] x, SCALAR_t[::1] y):
-        cdef:
-            double* x_ptr
-            double* y_ptr
-            double complex* x_ptr_c
-            double complex* y_ptr_c
-            SCALAR_t alpha = 1.0
-            int n = x.shape[0]
-            int inc = 1
-        if SCALAR_t is COMPLEX_t:
-            x_ptr_c = &x[0]
-            y_ptr_c = &y[0]
-            zaxpy(&n, &alpha, y_ptr_c, &inc, x_ptr_c, &inc)
-        else:
-            x_ptr = &x[0]
-            y_ptr = &y[0]
-            daxpy(&n, &alpha, y_ptr, &inc, x_ptr, &inc)
-
-    cdef void updateScaled(SCALAR_t[::1] x, SCALAR_t[::1] y, SCALAR_t alpha):
-        cdef:
-            double* x_ptr
-            double* y_ptr
-            double complex* x_ptr_c
-            double complex* y_ptr_c
-            int n = x.shape[0]
-            int inc = 1
-        if SCALAR_t is COMPLEX_t:
-            x_ptr_c = &x[0]
-            y_ptr_c = &y[0]
-            zaxpy(&n, &alpha, y_ptr_c, &inc, x_ptr_c, &inc)
-        else:
-            x_ptr = &x[0]
-            y_ptr = &y[0]
-            daxpy(&n, &alpha, y_ptr, &inc, x_ptr, &inc)
-
-    cdef void scaleScalar(SCALAR_t[::1] x, SCALAR_t alpha):
-        cdef:
-            double* x_ptr
-            double complex* x_ptr_c
-            int n = x.shape[0]
-            int inc = 1
-        if SCALAR_t is COMPLEX_t:
-            x_ptr_c = &x[0]
-            zscal(&n, &alpha, x_ptr_c, &inc)
-        else:
-            x_ptr = &x[0]
-            dscal(&n, &alpha, x_ptr, &inc)
-
-    cdef SCALAR_t mydot(SCALAR_t[::1] v0, SCALAR_t[::1] v1):
-        cdef:
-            SCALAR_t s = 0.0
-            double* v0_ptr
-            double* v1_ptr
-            double complex* v0_ptr_c
-            double complex* v1_ptr_c
-            int n = v0.shape[0]
-            int inc = 1
-        if SCALAR_t is COMPLEX_t:
-            v0_ptr_c = &v0[0]
-            v1_ptr_c = &v1[0]
-            s = zdotc(&n, v0_ptr_c, &inc, v1_ptr_c, &inc)
-        else:
-            v0_ptr = &v0[0]
-            v1_ptr = &v1[0]
-            s = ddot(&n, v0_ptr, &inc, v1_ptr, &inc)
-        return s
-
-    cdef REAL_t norm(SCALAR_t[::1] x):
-        cdef:
-            REAL_t s = 0.0
-            double* x_ptr
-            double complex* x_ptr_c
-            int n = x.shape[0]
-            int inc = 1
-        if SCALAR_t is COMPLEX_t:
-            x_ptr_c = &x[0]
-            s = zdotc(&n, x_ptr_c, &inc, x_ptr_c, &inc).real
-            return sqrt(s)
-        else:
-            x_ptr = &x[0]
-            return dnrm2(&n, x_ptr, &inc)
-
-    cdef void gemv(SCALAR_t[:, ::1] A, SCALAR_t[::1] x, SCALAR_t[::1] y, SCALAR_t beta=0.):
-        cdef:
-            double* A_ptr
-            double* x_ptr
-            double* y_ptr
-            double complex* A_ptr_c
-            double complex* x_ptr_c
-            double complex* y_ptr_c
-            int m = A.shape[1]
-            int n = A.shape[0]
-            SCALAR_t alpha = 1.
-            int lda = A.shape[1]
-            int incx = 1
-            int incy = 1
-        if SCALAR_t is COMPLEX_t:
-            A_ptr_c = &A[0, 0]
-            x_ptr_c = &x[0]
-            y_ptr_c = &y[0]
-            zgemv('t', &m, &n, &alpha, A_ptr_c, &lda, x_ptr_c, &incx, &beta, y_ptr_c, &incy)
-        else:
-            A_ptr = &A[0, 0]
-            x_ptr = &x[0]
-            y_ptr = &y[0]
-            dgemv('t', &m, &n, &alpha, A_ptr, &lda, x_ptr, &incx, &beta, y_ptr, &incy)
-
-    cdef void gemvF(SCALAR_t[::1, :] A, SCALAR_t[::1] x, SCALAR_t[::1] y, SCALAR_t beta=0.):
-        cdef:
-            double* A_ptr
-            double* x_ptr
-            double* y_ptr
-            double complex* A_ptr_c
-            double complex* x_ptr_c
-            double complex* y_ptr_c
-            int m = A.shape[0]
-            int n = A.shape[1]
-            SCALAR_t alpha = 1.
-            int lda = A.shape[0]
-            int incx = 1
-            int incy = 1
-        if SCALAR_t is COMPLEX_t:
-            A_ptr_c = &A[0, 0]
-            x_ptr_c = &x[0]
-            y_ptr_c = &y[0]
-            zgemv('n', &m, &n, &alpha, A_ptr_c, &lda, x_ptr_c, &incx, &beta, y_ptr_c, &incy)
-        else:
-            A_ptr = &A[0, 0]
-            x_ptr = &x[0]
-            y_ptr = &y[0]
-            dgemv('n', &m, &n, &alpha, A_ptr, &lda, x_ptr, &incx, &beta, y_ptr, &incy)
-
-
-    cdef void gemvT(SCALAR_t[:, ::1] A, SCALAR_t[::1] x, SCALAR_t[::1] y, SCALAR_t beta=0.):
-        cdef:
-            double* A_ptr
-            double* x_ptr
-            double* y_ptr
-            double complex* A_ptr_c
-            double complex* x_ptr_c
-            double complex* y_ptr_c
-            int m = A.shape[0]
-            int n = A.shape[1]
-            SCALAR_t alpha = 1.
-            int lda = A.shape[0]
-            int incx = 1
-            int incy = 1
-        if SCALAR_t is COMPLEX_t:
-            A_ptr_c = &A[0, 0]
-            x_ptr_c = &x[0]
-            y_ptr_c = &y[0]
-            zgemv('n', &m, &n, &alpha, A_ptr_c, &lda, x_ptr_c, &incx, &beta, y_ptr_c, &incy)
-        else:
-            A_ptr = &A[0, 0]
-            x_ptr = &x[0]
-            y_ptr = &y[0]
-            dgemv('n', &m, &n, &alpha, A_ptr, &lda, x_ptr, &incx, &beta, y_ptr, &incy)
-
-
-    cdef void matmat(SCALAR_t[:, ::1] A, SCALAR_t[:, ::1] B, SCALAR_t[:, ::1] C):
-        cdef:
-            double* A_ptr
-            double* B_ptr
-            double* C_ptr
-            double complex* A_ptr_c
-            double complex* B_ptr_c
-            double complex* C_ptr_c
-            int m = B.shape[1]
-            int k = B.shape[0]
-            int n = A.shape[0]
-            SCALAR_t alpha = 1.
-            SCALAR_t beta = 0.
-            int lda = A.shape[1]
-            int ldb = B.shape[1]
-            int ldc = C.shape[1]
-        if SCALAR_t is COMPLEX_t:
-            A_ptr_c = &A[0, 0]
-            B_ptr_c = &B[0, 0]
-            C_ptr_c = &C[0, 0]
-            zgemm('n', 'n', &m, &n, &k, &alpha, B_ptr_c, &ldb, A_ptr_c, &lda, &beta, C_ptr_c, &ldc)
-        else:
-            A_ptr = &A[0, 0]
-            B_ptr = &B[0, 0]
-            C_ptr = &C[0, 0]
-            dgemm('n', 'n', &m, &n, &k, &alpha, B_ptr, &ldb, A_ptr, &lda, &beta, C_ptr, &ldc)
-
-ELSE:
-    cdef void assign(SCALAR_t[::1] y, const SCALAR_t[::1] x):
-        cdef:
-            INDEX_t i
-        for i in range(x.shape[0]):
-            y[i] = x[i]
-
-    cdef void assignScaled(SCALAR_t[::1] y, const SCALAR_t[::1] x, SCALAR_t alpha):
-        cdef:
-            INDEX_t i
-        for i in range(x.shape[0]):
-            y[i] = alpha*x[i]
-
-
-    cdef void assign3(SCALAR_t[::1] z, const SCALAR_t[::1] x, SCALAR_t alpha, const SCALAR_t[::1] y, SCALAR_t beta):
-        cdef:
-            INDEX_t i
-        for i in range(x.shape[0]):
-            z[i] = alpha*x[i] + beta*y[i]
-
-    cdef void update(SCALAR_t[::1] x, SCALAR_t[::1] y):
-        cdef:
-            INDEX_t i
-        if SCALAR_t is COMPLEX_t:
-            for i in range(x.shape[0]):
-                x[i] = x[i] + y[i]
-        else:
-            for i in range(x.shape[0]):
-                x[i] += y[i]
-
-    cdef void updateScaled(SCALAR_t[::1] x, SCALAR_t[::1] y, SCALAR_t alpha):
-        cdef:
-            INDEX_t i
-        if SCALAR_t is COMPLEX_t:
-            for i in range(x.shape[0]):
-                x[i] = x[i]+alpha*y[i]
-        else:
-            for i in range(x.shape[0]):
-                x[i] += alpha*y[i]
-
-    cdef void scaleScalar(SCALAR_t[::1] x, SCALAR_t alpha):
-        cdef:
-            INDEX_t i
-        if SCALAR_t is COMPLEX_t:
-            for i in range(x.shape[0]):
-                x[i] *= alpha
-        else:
-            for i in range(x.shape[0]):
-                x[i] = x[i]*alpha
-
-    cdef SCALAR_t mydot(SCALAR_t[::1] v0, SCALAR_t[::1] v1) nogil:
-        cdef:
-            int i
-            SCALAR_t s = 0.0
-        if SCALAR_t is COMPLEX_t:
-            for i in range(v0.shape[0]):
-                s += v0[i].conjugate()*v1[i]
-        else:
-            for i in range(v0.shape[0]):
-                s += v0[i]*v1[i]
-        return s
-
-    cdef REAL_t norm(SCALAR_t[::1] x):
-        cdef:
-            int i
-            REAL_t s = 0.0
-        if SCALAR_t is COMPLEX_t:
-            for i in range(x.shape[0]):
-                s += (x[i].conjugate()*x[i]).real
-        else:
-            for i in range(x.shape[0]):
-                s += x[i]*x[i]
-        return sqrt(s)
-
-    cdef void gemv(SCALAR_t[:, ::1] A, SCALAR_t[::1] x, SCALAR_t[::1] y, SCALAR_t beta=0.):
-        cdef:
-            INDEX_t i, j
-            SCALAR_t s
-        if SCALAR_t is COMPLEX_t:
-            if beta != 0.:
-                for i in range(A.shape[0]):
-                    s = 0.
-                    for j in range(A.shape[1]):
-                        s = s + A[i, j]*x[j]
-                    y[i] = beta*y[i]+s
-            else:
-                for i in range(A.shape[0]):
-                    s = 0.
-                    for j in range(A.shape[1]):
-                        s = s + A[i, j]*x[j]
-                    y[i] = s
-        else:
-            if beta != 0.:
-                for i in range(A.shape[0]):
-                    s = 0.
-                    for j in range(A.shape[1]):
-                        s += A[i, j]*x[j]
-                    y[i] = beta*y[i]+s
-            else:
-                for i in range(A.shape[0]):
-                    s = 0.
-                    for j in range(A.shape[1]):
-                        s += A[i, j]*x[j]
-                    y[i] = s
-
-
-    cdef void gemvT(SCALAR_t[:, ::1] A, SCALAR_t[::1] x, SCALAR_t[::1] y, SCALAR_t beta=0.):
-        cdef:
-            INDEX_t i, j
-        if SCALAR_t is COMPLEX_t:
-            if beta != 0.:
-                for i in range(A.shape[0]):
-                    y[i] = y[i]*beta
-            else:
-                y[:] = 0.
-            for j in range(A.shape[1]):
-                for i in range(A.shape[0]):
-                    y[i] = y[i]+A[j, i]*x[j]
-        else:
-            if beta != 0.:
-                for i in range(A.shape[0]):
-                    y[i] *= beta
-            else:
-                y[:] = 0.
-            for j in range(A.shape[1]):
-                for i in range(A.shape[0]):
-                    y[i] += A[j, i]*x[j]
-
-
-    cdef void matmat(SCALAR_t[:, ::1] A, SCALAR_t[:, ::1] B, SCALAR_t[:, ::1] C):
-        cdef:
-            INDEX_t i, j, k
-        C[:, :] = 0.
-        if SCALAR_t is COMPLEX_t:
-            for i in range(A.shape[0]):
-                for j in range(B.shape[0]):
-                    for k in range(B.shape[1]):
-                        C[i, k] = C[i, k] + A[i, j]*B[j, k]
-        else:
-            for i in range(A.shape[0]):
-                for j in range(B.shape[0]):
-                    for k in range(B.shape[1]):
-                        C[i, k] += A[i, j]*B[j, k]
-
+include "allocation.pxi"
+include "blas_routines.pxi"
+include "mkl_routines.pxi"
 
 
 cdef void updateScaledVector(REAL_t[::1] x, REAL_t[::1] y, REAL_t[::1] alpha):
@@ -472,102 +25,3 @@ cdef void updateScaledVector(REAL_t[::1] x, REAL_t[::1] y, REAL_t[::1] alpha):
         INDEX_t i
     for i in range(x.shape[0]):
         x[i] += alpha[i]*y[i]
-
-
-IF USE_MKL:
-    ctypedef INDEX_t MKL_INT
-
-    cdef extern from "mkl/mkl_spblas.h":
-        void mkl_cspblas_dcsrgemv (const char *transa , const MKL_INT *m , const REAL_t *a , const MKL_INT *ia , const MKL_INT *ja , const REAL_t *x , REAL_t *y );
-        void mkl_dcsrmv (const char *transa , const MKL_INT *m , const MKL_INT *k , const REAL_t *alpha , const char *matdescra ,
-                         const REAL_t *val , const MKL_INT *indx , const MKL_INT *pntrb , const MKL_INT *pntre ,
-                         const REAL_t *x , const REAL_t *beta , REAL_t *y );
-        # void mkl_zcsrmv (const char *transa , const MKL_INT *m , const MKL_INT *k , const COMPLEX_t *alpha , const char *matdescra ,
-        #                  const COMPLEX_t *val , const MKL_INT *indx , const MKL_INT *pntrb , const MKL_INT *pntre ,
-        #                  const COMPLEX_t *x , const COMPLEX_t *beta , COMPLEX_t *y );
-
-    cdef void spmv(INDEX_t[::1] indptr, INDEX_t[::1] indices, SCALAR_t[::1] data, SCALAR_t[::1] x, SCALAR_t[::1] y, BOOL_t overwrite=True):
-        cdef:
-            char transA = 78
-            INDEX_t num_rows = indptr.shape[0]-1
-
-        assert overwrite
-
-        if SCALAR_t is COMPLEX_t:
-            # mkl_cspblas_zcsrgemv(&transA, &num_rows, &data[0], &indptr[0], &indices[0], &x[0], &y[0])
-            pass
-        else:
-            mkl_cspblas_dcsrgemv(&transA, &num_rows, &data[0], &indptr[0], &indices[0], &x[0], &y[0])
-
-
-    cdef void spres(INDEX_t[::1] indptr, INDEX_t[::1] indices, SCALAR_t[::1] data, SCALAR_t[::1] x, SCALAR_t[::1] rhs, SCALAR_t[::1] result):
-        cdef:
-            char transA = 78
-            SCALAR_t alpha = -1.
-            SCALAR_t beta = 1.
-            char matdscr[6]
-            INDEX_t inc = 1
-            INDEX_t num_rows = indptr.shape[0]-1
-            INDEX_t num_columns = x.shape[0]
-
-        matdscr[0] = 71
-        matdscr[2] = 78
-        matdscr[3] = 67
-
-        assign(result, rhs)
-        if SCALAR_t is COMPLEX_t:
-            pass
-            # mkl_dcsrmv(&transA, &num_rows, &num_columns, &alpha, &matdscr[0],
-            #            &data[0], &indices[0], &indptr[0], &indptr[1],
-            #            &x[0], &beta, &result[0])
-        else:
-            mkl_dcsrmv(&transA, &num_rows, &num_columns, &alpha, &matdscr[0],
-                       &data[0], &indices[0], &indptr[0], &indptr[1],
-                       &x[0], &beta, &result[0])
-
-ELSE:
-
-    cdef void spmv(INDEX_t[::1] indptr, INDEX_t[::1] indices, SCALAR_t[::1] data, SCALAR_t[::1] x, SCALAR_t[::1] y, BOOL_t overwrite=True):
-        cdef:
-            INDEX_t i, jj, j
-            SCALAR_t temp
-        if SCALAR_t is COMPLEX_t:
-            for i in range(indptr.shape[0]-1):
-                temp = 0.
-                for jj in range(indptr[i], indptr[i+1]):
-                    j = indices[jj]
-                    temp = temp + data[jj]*x[j]
-                if overwrite:
-                    y[i] = temp
-                else:
-                    y[i] = y[i]+temp
-        else:
-            for i in range(indptr.shape[0]-1):
-                temp = 0.
-                for jj in range(indptr[i], indptr[i+1]):
-                    j = indices[jj]
-                    temp += data[jj]*x[j]
-                if overwrite:
-                    y[i] = temp
-                else:
-                    y[i] += temp
-
-    cdef void spres(INDEX_t[::1] indptr, INDEX_t[::1] indices, SCALAR_t[::1] data, SCALAR_t[::1] x, SCALAR_t[::1] rhs, SCALAR_t[::1] result):
-        cdef:
-            INDEX_t i, jj, j
-            SCALAR_t temp
-            INDEX_t num_rows = indptr.shape[0]-1
-        if SCALAR_t is COMPLEX_t:
-            for i in range(num_rows):
-                temp = rhs[i]
-                for jj in range(indptr[i], indptr[i+1]):
-                    j = indices[jj]
-                    temp = temp-data[jj]*x[j]
-                result[i] = temp
-        else:
-            for i in range(num_rows):
-                temp = rhs[i]
-                for jj in range(indptr[i], indptr[i+1]):
-                    j = indices[jj]
-                    temp -= data[jj]*x[j]
-                result[i] = temp
diff --git a/base/PyNucleus_base/ip_norm.pyx b/base/PyNucleus_base/ip_norm.pyx
index fb92d03..eefa202 100644
--- a/base/PyNucleus_base/ip_norm.pyx
+++ b/base/PyNucleus_base/ip_norm.pyx
@@ -9,8 +9,6 @@ import numpy as np
 from . myTypes import INDEX, REAL, COMPLEX
 from . blas import uninitialized
 
-include "config.pxi"
-
 import mpi4py.rc
 mpi4py.rc.initialize = False
 from mpi4py import MPI
diff --git a/base/PyNucleus_base/linalg.pyx b/base/PyNucleus_base/linalg.pyx
index 16dbb46..13db27e 100644
--- a/base/PyNucleus_base/linalg.pyx
+++ b/base/PyNucleus_base/linalg.pyx
@@ -27,8 +27,6 @@ from . convergence cimport (convergenceMaster, noOpConvergenceMaster,
                             convergenceClient, noOpConvergenceClient)
 from numpy.linalg import norm as normSeq
 
-include "config.pxi"
-
 
 def UniformOnUnitSphere(dim, samples=1, norm=normSeq):
     "Uniform distribution on the unit sphere."
@@ -204,43 +202,6 @@ cdef void forward_solve_csc(INDEX_t[::1] indptr,
                     y[indices[i]] = y[indices[i]] + data[i]*y[j]
 
 
-IF USE_MKL_TRISOLVE:
-
-    ctypedef INDEX_t MKL_INT
-
-    cdef extern from "mkl/mkl_spblas.h":
-        void mkl_dcsrsm (const char *transa , const MKL_INT *m , const MKL_INT *n , const REAL_t *alpha , const char *matdescra ,
-                         const REAL_t *val , const MKL_INT *indx , const MKL_INT *pntrb , const MKL_INT *pntre ,
-                         const REAL_t *b , const MKL_INT *ldb , REAL_t *c , const MKL_INT *ldc );
-
-    cdef inline void trisolve_mkl(INDEX_t[::1] indptr,
-                                  INDEX_t[::1] indices,
-                                  REAL_t[::1] data,
-                                  REAL_t[::1] b,
-                                  REAL_t[::1] y,
-                                  BOOL_t forward=True,
-                                  BOOL_t unitDiagonal=False):
-        cdef:
-            char transA
-            REAL_t alpha = 1.
-            char matdscr[6]
-            INDEX_t inc = 1
-            INDEX_t n = indptr.shape[0]-1
-            INDEX_t one = 1
-        matdscr[0] = 84
-        if forward:
-            transA = 84
-        else:
-            transA = 78
-        matdscr[1] = 85
-        if unitDiagonal:
-            matdscr[2] = 85
-        else:
-            matdscr[2] = 78
-        matdscr[3] = 67
-        mkl_dcsrsm(&transA, &n, &one, &alpha, &matdscr[0], &data[0], &indices[0], &indptr[0], &indptr[1], &b[0], &one, &y[0], &one)
-
-
 # Assumes that indices are ordered
 cdef void forward_solve_sss(const INDEX_t[::1] indptr,
                             const INDEX_t[::1] indices,
@@ -447,56 +408,6 @@ cpdef solve_cholesky(INDEX_t[::1] Lindptr,
     return temp_mem
 
 
-cdef class cholesky_solver:
-    cdef:
-        public INDEX_t[::1] indptr, indices
-        public REAL_t[::1] data, diagonal, temp
-        CSR_LinearOperator L
-
-    def __init__(self, num_rows):
-        self.temp = uninitialized((num_rows), dtype=REAL)
-
-    def setup(self, A):
-        cdef:
-            INDEX_t i
-        if isinstance(A, CSR_LinearOperator):
-            self.indices, self.indptr, self.data, self.diagonal = ichol_csr(A)
-        elif isinstance(A, SSS_LinearOperator):
-            # self.indices, self.indptr, self.data, self.diagonal = ichol_sss(A)
-            self.indices, self.indptr, self.data, self.diagonal = ichol_csr(A.to_csr_linear_operator())
-        elif isinstance(A, TimeStepperLinearOperator):
-            B = A.to_csr_linear_operator()
-            self.indices, self.indptr, self.data, self.diagonal = ichol_csr(B)
-        else:
-            raise NotImplementedError()
-
-        IF USE_MKL_TRISOLVE:
-            from . linear_operators import diagonalOperator
-            T = CSR_LinearOperator(self.indices, self.indptr, self.data).to_csr()+diagonalOperator(self.diagonal).to_csr()
-            self.L = CSR_LinearOperator.from_csr(T)
-        ELSE:
-            for i in range(self.diagonal.shape[0]):
-                self.diagonal[i] = 1./self.diagonal[i]
-
-    cpdef solve(self, REAL_t[::1] b, REAL_t[::1] x):
-        self.temp[:] = 0.0
-        IF USE_MKL_TRISOLVE:
-            trisolve_mkl(self.L.indptr, self.L.indices, self.L.data, b, self.temp, forward=True, unitDiagonal=False)
-            trisolve_mkl(self.L.indptr, self.L.indices, self.L.data, self.temp, x, forward=False, unitDiagonal=False)
-        ELSE:
-            forward_solve_sss_noInverse(self.indptr, self.indices,
-                                    self.data, self.diagonal,
-                                    b, self.temp, unitDiagonal=False)
-            backward_solve_sss_noInverse(self.indptr, self.indices,
-                                         self.data, self.diagonal,
-                                         self.temp, x)
-
-    def asPreconditioner(self):
-        return LinearOperator_wrapper(self.diagonal.shape[0],
-                                        self.diagonal.shape[0],
-                                        self.solve)
-
-
 cpdef void bicgstab(LinearOperator A,
                     const REAL_t[::1] b,
                     REAL_t[::1] x,
diff --git a/base/PyNucleus_base/linear_operators.pyx b/base/PyNucleus_base/linear_operators.pyx
index 5d65de4..2531cf7 100644
--- a/base/PyNucleus_base/linear_operators.pyx
+++ b/base/PyNucleus_base/linear_operators.pyx
@@ -12,8 +12,6 @@ from . blas cimport gemv
 from . blas import uninitialized
 from cython.parallel cimport prange, parallel
 
-include "config.pxi"
-
 COMPRESSION = 'gzip'
 
 include "LinearOperator_REAL.pxi"
diff --git a/base/PyNucleus_base/malloc_linux.pxi b/base/PyNucleus_base/malloc_linux.pxi
new file mode 100644
index 0000000..6129a75
--- /dev/null
+++ b/base/PyNucleus_base/malloc_linux.pxi
@@ -0,0 +1,9 @@
+###################################################################################
+# Copyright 2021 National Technology & Engineering Solutions of Sandia,           #
+# LLC (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the           #
+# U.S. Government retains certain rights in this software.                        #
+# If you want to use this code, please refer to the README.rst and LICENSE files. #
+###################################################################################
+
+cdef extern from "malloc.h" nogil:
+    int malloc_trim(size_t pad)
diff --git a/base/PyNucleus_base/malloc_mac.pxi b/base/PyNucleus_base/malloc_mac.pxi
new file mode 100644
index 0000000..b0780f3
--- /dev/null
+++ b/base/PyNucleus_base/malloc_mac.pxi
@@ -0,0 +1,9 @@
+###################################################################################
+# Copyright 2021 National Technology & Engineering Solutions of Sandia,           #
+# LLC (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the           #
+# U.S. Government retains certain rights in this software.                        #
+# If you want to use this code, please refer to the README.rst and LICENSE files. #
+###################################################################################
+
+cdef int malloc_trim(size_t pad):
+    pass
diff --git a/base/PyNucleus_base/opt_false_allocation.pxi b/base/PyNucleus_base/opt_false_allocation.pxi
new file mode 100644
index 0000000..9c156b6
--- /dev/null
+++ b/base/PyNucleus_base/opt_false_allocation.pxi
@@ -0,0 +1,27 @@
+###################################################################################
+# Copyright 2021 National Technology & Engineering Solutions of Sandia,           #
+# LLC (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the           #
+# U.S. Government retains certain rights in this software.                        #
+# If you want to use this code, please refer to the README.rst and LICENSE files. #
+###################################################################################
+
+def uninitialized(*args, **kwargs):
+    return np.empty(*args, **kwargs)
+
+
+def uninitialized_like(like, **kwargs):
+    return np.empty_like(like, **kwargs)
+
+
+cpdef carray uninitializedINDEX(tuple shape):
+    cdef:
+        carray a = carray(shape, 4, 'i')
+        Py_ssize_t s, i
+    return a
+
+
+cpdef carray uninitializedREAL(tuple shape):
+    cdef:
+        carray a = carray(shape, 8, 'd')
+        Py_ssize_t s, i
+    return a
diff --git a/base/PyNucleus_base/opt_false_blas.pxi b/base/PyNucleus_base/opt_false_blas.pxi
new file mode 100644
index 0000000..0b81294
--- /dev/null
+++ b/base/PyNucleus_base/opt_false_blas.pxi
@@ -0,0 +1,149 @@
+###################################################################################
+# Copyright 2021 National Technology & Engineering Solutions of Sandia,           #
+# LLC (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the           #
+# U.S. Government retains certain rights in this software.                        #
+# If you want to use this code, please refer to the README.rst and LICENSE files. #
+###################################################################################
+
+cdef void assign(SCALAR_t[::1] y, const SCALAR_t[::1] x):
+    cdef:
+        INDEX_t i
+    for i in range(x.shape[0]):
+        y[i] = x[i]
+
+cdef void assignScaled(SCALAR_t[::1] y, const SCALAR_t[::1] x, SCALAR_t alpha):
+    cdef:
+        INDEX_t i
+    for i in range(x.shape[0]):
+        y[i] = alpha*x[i]
+
+
+cdef void assign3(SCALAR_t[::1] z, const SCALAR_t[::1] x, SCALAR_t alpha, const SCALAR_t[::1] y, SCALAR_t beta):
+    cdef:
+        INDEX_t i
+    for i in range(x.shape[0]):
+        z[i] = alpha*x[i] + beta*y[i]
+
+cdef void update(SCALAR_t[::1] x, SCALAR_t[::1] y):
+    cdef:
+        INDEX_t i
+    if SCALAR_t is COMPLEX_t:
+        for i in range(x.shape[0]):
+            x[i] = x[i] + y[i]
+    else:
+        for i in range(x.shape[0]):
+            x[i] += y[i]
+
+cdef void updateScaled(SCALAR_t[::1] x, SCALAR_t[::1] y, SCALAR_t alpha):
+    cdef:
+        INDEX_t i
+    if SCALAR_t is COMPLEX_t:
+        for i in range(x.shape[0]):
+            x[i] = x[i]+alpha*y[i]
+    else:
+        for i in range(x.shape[0]):
+            x[i] += alpha*y[i]
+
+cdef void scaleScalar(SCALAR_t[::1] x, SCALAR_t alpha):
+    cdef:
+        INDEX_t i
+    if SCALAR_t is COMPLEX_t:
+        for i in range(x.shape[0]):
+            x[i] *= alpha
+    else:
+        for i in range(x.shape[0]):
+            x[i] = x[i]*alpha
+
+cdef SCALAR_t mydot(SCALAR_t[::1] v0, SCALAR_t[::1] v1) nogil:
+    cdef:
+        int i
+        SCALAR_t s = 0.0
+    if SCALAR_t is COMPLEX_t:
+        for i in range(v0.shape[0]):
+            s += v0[i].conjugate()*v1[i]
+    else:
+        for i in range(v0.shape[0]):
+            s += v0[i]*v1[i]
+    return s
+
+cdef REAL_t norm(SCALAR_t[::1] x):
+    cdef:
+        int i
+        REAL_t s = 0.0
+    if SCALAR_t is COMPLEX_t:
+        for i in range(x.shape[0]):
+            s += (x[i].conjugate()*x[i]).real
+    else:
+        for i in range(x.shape[0]):
+            s += x[i]*x[i]
+    return sqrt(s)
+
+cdef void gemv(SCALAR_t[:, ::1] A, SCALAR_t[::1] x, SCALAR_t[::1] y, SCALAR_t beta=0.):
+    cdef:
+        INDEX_t i, j
+        SCALAR_t s
+    if SCALAR_t is COMPLEX_t:
+        if beta != 0.:
+            for i in range(A.shape[0]):
+                s = 0.
+                for j in range(A.shape[1]):
+                    s = s + A[i, j]*x[j]
+                y[i] = beta*y[i]+s
+        else:
+            for i in range(A.shape[0]):
+                s = 0.
+                for j in range(A.shape[1]):
+                    s = s + A[i, j]*x[j]
+                y[i] = s
+    else:
+        if beta != 0.:
+            for i in range(A.shape[0]):
+                s = 0.
+                for j in range(A.shape[1]):
+                    s += A[i, j]*x[j]
+                y[i] = beta*y[i]+s
+        else:
+            for i in range(A.shape[0]):
+                s = 0.
+                for j in range(A.shape[1]):
+                    s += A[i, j]*x[j]
+                y[i] = s
+
+
+cdef void gemvT(SCALAR_t[:, ::1] A, SCALAR_t[::1] x, SCALAR_t[::1] y, SCALAR_t beta=0.):
+    cdef:
+        INDEX_t i, j
+    if SCALAR_t is COMPLEX_t:
+        if beta != 0.:
+            for i in range(A.shape[0]):
+                y[i] = y[i]*beta
+        else:
+            y[:] = 0.
+        for j in range(A.shape[1]):
+            for i in range(A.shape[0]):
+                y[i] = y[i]+A[j, i]*x[j]
+    else:
+        if beta != 0.:
+            for i in range(A.shape[0]):
+                y[i] *= beta
+        else:
+            y[:] = 0.
+        for j in range(A.shape[1]):
+            for i in range(A.shape[0]):
+                y[i] += A[j, i]*x[j]
+
+
+cdef void matmat(SCALAR_t[:, ::1] A, SCALAR_t[:, ::1] B, SCALAR_t[:, ::1] C):
+    cdef:
+        INDEX_t i, j, k
+    C[:, :] = 0.
+    if SCALAR_t is COMPLEX_t:
+        for i in range(A.shape[0]):
+            for j in range(B.shape[0]):
+                for k in range(B.shape[1]):
+                    C[i, k] = C[i, k] + A[i, j]*B[j, k]
+    else:
+        for i in range(A.shape[0]):
+            for j in range(B.shape[0]):
+                for k in range(B.shape[1]):
+                    C[i, k] += A[i, j]*B[j, k]
diff --git a/base/PyNucleus_base/opt_false_mkl.pxi b/base/PyNucleus_base/opt_false_mkl.pxi
new file mode 100644
index 0000000..ae0afc2
--- /dev/null
+++ b/base/PyNucleus_base/opt_false_mkl.pxi
@@ -0,0 +1,51 @@
+###################################################################################
+# Copyright 2021 National Technology & Engineering Solutions of Sandia,           #
+# LLC (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the           #
+# U.S. Government retains certain rights in this software.                        #
+# If you want to use this code, please refer to the README.rst and LICENSE files. #
+###################################################################################
+
+cdef void spmv(INDEX_t[::1] indptr, INDEX_t[::1] indices, SCALAR_t[::1] data, SCALAR_t[::1] x, SCALAR_t[::1] y, BOOL_t overwrite=True):
+    cdef:
+        INDEX_t i, jj, j
+        SCALAR_t temp
+    if SCALAR_t is COMPLEX_t:
+        for i in range(indptr.shape[0]-1):
+            temp = 0.
+            for jj in range(indptr[i], indptr[i+1]):
+                j = indices[jj]
+                temp = temp + data[jj]*x[j]
+            if overwrite:
+                y[i] = temp
+            else:
+                y[i] = y[i]+temp
+    else:
+        for i in range(indptr.shape[0]-1):
+            temp = 0.
+            for jj in range(indptr[i], indptr[i+1]):
+                j = indices[jj]
+                temp += data[jj]*x[j]
+            if overwrite:
+                y[i] = temp
+            else:
+                y[i] += temp
+
+cdef void spres(INDEX_t[::1] indptr, INDEX_t[::1] indices, SCALAR_t[::1] data, SCALAR_t[::1] x, SCALAR_t[::1] rhs, SCALAR_t[::1] result):
+    cdef:
+        INDEX_t i, jj, j
+        SCALAR_t temp
+        INDEX_t num_rows = indptr.shape[0]-1
+    if SCALAR_t is COMPLEX_t:
+        for i in range(num_rows):
+            temp = rhs[i]
+            for jj in range(indptr[i], indptr[i+1]):
+                j = indices[jj]
+                temp = temp-data[jj]*x[j]
+            result[i] = temp
+    else:
+        for i in range(num_rows):
+            temp = rhs[i]
+            for jj in range(indptr[i], indptr[i+1]):
+                j = indices[jj]
+                temp -= data[jj]*x[j]
+            result[i] = temp
diff --git a/base/PyNucleus_base/opt_false_mkl_trisolve.pxi b/base/PyNucleus_base/opt_false_mkl_trisolve.pxi
new file mode 100644
index 0000000..2103817
--- /dev/null
+++ b/base/PyNucleus_base/opt_false_mkl_trisolve.pxi
@@ -0,0 +1,46 @@
+###################################################################################
+# Copyright 2021 National Technology & Engineering Solutions of Sandia,           #
+# LLC (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the           #
+# U.S. Government retains certain rights in this software.                        #
+# If you want to use this code, please refer to the README.rst and LICENSE files. #
+###################################################################################
+
+cdef class ichol_solver(solver):
+    def __init__(self, LinearOperator A=None, INDEX_t num_rows=-1):
+        solver.__init__(self, A, num_rows)
+        self.temp = uninitialized((self.num_rows), dtype=REAL)
+
+    cpdef void setup(self, LinearOperator A=None):
+        cdef:
+            INDEX_t i
+        if A is not None:
+            self.A = A
+        if isinstance(self.A, CSR_LinearOperator):
+            self.indices, self.indptr, self.data, self.diagonal = ichol_csr(self.A)
+        elif isinstance(self.A, SSS_LinearOperator):
+            # self.indices, self.indptr, self.data, self.diagonal = ichol_sss(A)
+            self.indices, self.indptr, self.data, self.diagonal = ichol_csr(self.A.to_csr_linear_operator())
+        else:
+            try:
+                B = self.A.to_csr_linear_operator()
+                self.indices, self.indptr, self.data, self.diagonal = ichol_csr(B)
+            except:
+                raise NotImplementedError()
+
+        for i in range(self.diagonal.shape[0]):
+            self.diagonal[i] = 1./self.diagonal[i]
+        self.initialized = True
+
+    cdef int solve(self, vector_t b, vector_t x) except -1:
+        solver.solve(self, b, x)
+        self.temp[:] = 0.0
+        forward_solve_sss_noInverse(self.indptr, self.indices,
+                                    self.data, self.diagonal,
+                                    b, self.temp, unitDiagonal=False)
+        backward_solve_sss_noInverse(self.indptr, self.indices,
+                                     self.data, self.diagonal,
+                                     self.temp, x)
+        return 1
+
+    def __str__(self):
+        return 'Incomplete Cholesky'
diff --git a/base/PyNucleus_base/opt_false_solver_cholmod.pxi b/base/PyNucleus_base/opt_false_solver_cholmod.pxi
new file mode 100644
index 0000000..16fc689
--- /dev/null
+++ b/base/PyNucleus_base/opt_false_solver_cholmod.pxi
@@ -0,0 +1,72 @@
+###################################################################################
+# Copyright 2021 National Technology & Engineering Solutions of Sandia,           #
+# LLC (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the           #
+# U.S. Government retains certain rights in this software.                        #
+# If you want to use this code, please refer to the README.rst and LICENSE files. #
+###################################################################################
+
+cdef class chol_solver(solver):
+    def __init__(self, LinearOperator A, INDEX_t num_rows=-1):
+        solver.__init__(self, A, num_rows)
+
+    cpdef void setup(self, LinearOperator A=None):
+        cdef:
+            LinearOperator B = None
+            INDEX_t i, j, k
+        if A is not None:
+            self.A = A
+
+        if not isinstance(self.A, (SSS_LinearOperator,
+                                   CSR_LinearOperator,
+                                   Dense_LinearOperator)):
+            if self.A.isSparse():
+                B = self.A.to_csr_linear_operator()
+            else:
+                B = Dense_LinearOperator(np.ascontiguousarray(self.A.toarray()))
+        else:
+            B = self.A
+
+        self.denseFactor = False
+        if isinstance(B, Dense_LinearOperator):
+            from scipy.linalg import cholesky
+            L = cholesky(B.toarray(), lower=True)
+            self.Lflat = uninitialized((L.shape[0]*(L.shape[1]+1)//2), dtype=REAL)
+            k = 0
+            for i in range(L.shape[0]):
+                for j in range(i+1):
+                    self.Lflat[k] = L[i, j]
+                    k += 1
+            self.denseFactor = True
+            self.temp = uninitialized((L.shape[0]), dtype=REAL)
+        else:
+            raise NotImplementedError("Cholmod not available, install \"scikit-sparse\".")
+        self.initialized = True
+
+    cdef int solve(self, vector_t b, vector_t x) except -1:
+        cdef:
+            INDEX_t i, j, k
+            REAL_t val
+            REAL_t[::1] temp = self.temp, Lflat = self.Lflat
+        solver.solve(self, b, x)
+        if not self.denseFactor:
+            np.array(x, copy=False, dtype=REAL)[:] = self.Ainv(np.array(b, copy=False, dtype=REAL))
+        else:
+            k = 0
+            for i in range(x.shape[0]):
+                val = b[i]
+                for j in range(i):
+                    val -= Lflat[k]*temp[j]
+                    k += 1
+                temp[i] = val/Lflat[k]
+                k += 1
+            for i in range(x.shape[0]-1, -1, -1):
+                val = temp[i]
+                for j in range(i+1, x.shape[0]):
+                    k = ((j*(j+1)) >> 1) + i
+                    val -= Lflat[k]*x[j]
+                k = ((i*(i+1)) >> 1) + i
+                x[i] = val/Lflat[k]
+        return 1
+
+    def __str__(self):
+        return 'Cholesky'
diff --git a/base/PyNucleus_base/opt_true_allocation.pxi b/base/PyNucleus_base/opt_true_allocation.pxi
new file mode 100644
index 0000000..a316b44
--- /dev/null
+++ b/base/PyNucleus_base/opt_true_allocation.pxi
@@ -0,0 +1,45 @@
+###################################################################################
+# Copyright 2021 National Technology & Engineering Solutions of Sandia,           #
+# LLC (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the           #
+# U.S. Government retains certain rights in this software.                        #
+# If you want to use this code, please refer to the README.rst and LICENSE files. #
+###################################################################################
+
+def uninitialized(*args, **kwargs):
+    if 'dtype' in kwargs and np.issubdtype(kwargs['dtype'], np.integer):
+        kwargs['fill_value'] = np.iinfo(kwargs['dtype']).min
+    else:
+        kwargs['fill_value'] = NAN
+    return np.full(*args, **kwargs)
+
+
+def uninitialized_like(like, **kwargs):
+    if np.issubdtype(np.array(like, copy=False).dtype, np.integer):
+        kwargs['fill_value'] = np.iinfo(like.dtype).min
+    else:
+        kwargs['fill_value'] = NAN
+    return np.full_like(like, **kwargs)
+
+
+cpdef carray uninitializedINDEX(tuple shape):
+    cdef:
+        carray a = carray(shape, 4, 'i')
+        Py_ssize_t s, i
+    s = 1
+    for i in range(len(shape)):
+        s *= shape[i]
+    for i in range(s):
+        (<INDEX_t*>a.data)[i] = MAX_INT
+    return a
+
+
+cpdef carray uninitializedREAL(tuple shape):
+    cdef:
+        carray a = carray(shape, 8, 'd')
+        Py_ssize_t s, i
+    s = 1
+    for i in range(len(shape)):
+        s *= shape[i]
+    for i in range(s):
+        (<REAL_t*>a.data)[i] = NAN
+    return a
diff --git a/base/PyNucleus_base/opt_true_blas.pxi b/base/PyNucleus_base/opt_true_blas.pxi
new file mode 100644
index 0000000..2ff9557
--- /dev/null
+++ b/base/PyNucleus_base/opt_true_blas.pxi
@@ -0,0 +1,261 @@
+###################################################################################
+# Copyright 2021 National Technology & Engineering Solutions of Sandia,           #
+# LLC (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the           #
+# U.S. Government retains certain rights in this software.                        #
+# If you want to use this code, please refer to the README.rst and LICENSE files. #
+###################################################################################
+
+from scipy.linalg.cython_blas cimport dcopy, dscal, daxpy, ddot, dnrm2
+from scipy.linalg.cython_blas cimport zcopy, zscal, zaxpy, zdotc
+from scipy.linalg.cython_blas cimport dgemv, zgemv
+from scipy.linalg.cython_blas cimport dgemm, zgemm
+
+
+cdef void assign(SCALAR_t[::1] y, SCALAR_t[::1] x):
+    cdef:
+        double* x_ptr
+        double* y_ptr
+        double complex* x_ptr_c
+        double complex* y_ptr_c
+        int n = x.shape[0]
+        int inc = 1
+    if SCALAR_t is COMPLEX_t:
+        x_ptr_c = &x[0]
+        y_ptr_c = &y[0]
+        zcopy(&n, x_ptr_c, &inc, y_ptr_c, &inc)
+    else:
+        x_ptr = &x[0]
+        y_ptr = &y[0]
+        dcopy(&n, x_ptr, &inc, y_ptr, &inc)
+
+cdef void assignScaled(SCALAR_t[::1] y, SCALAR_t[::1] x, SCALAR_t alpha):
+    cdef:
+        double* x_ptr
+        double* y_ptr
+        double complex* x_ptr_c
+        double complex* y_ptr_c
+        int n = x.shape[0]
+        int inc = 1
+    if SCALAR_t is COMPLEX_t:
+        x_ptr_c = &x[0]
+        y_ptr_c = &y[0]
+        zcopy(&n, x_ptr_c, &inc, y_ptr_c, &inc)
+        zscal(&n, &alpha, y_ptr_c, &inc)
+    else:
+        x_ptr = &x[0]
+        y_ptr = &y[0]
+        dcopy(&n, x_ptr, &inc, y_ptr, &inc)
+        dscal(&n, &alpha, y_ptr, &inc)
+
+cdef void assign3(SCALAR_t[::1] z, SCALAR_t[::1] x, SCALAR_t alpha, SCALAR_t[::1] y, SCALAR_t beta):
+    cdef:
+        double* x_ptr
+        double* y_ptr
+        double* z_ptr
+        double complex* x_ptr_c
+        double complex* y_ptr_c
+        double complex* z_ptr_c
+        int n = x.shape[0]
+        int inc = 1
+    if SCALAR_t is COMPLEX_t:
+        x_ptr_c = &x[0]
+        y_ptr_c = &y[0]
+        z_ptr_c = &z[0]
+        zcopy(&n, x_ptr_c, &inc, z_ptr_c, &inc)
+        if alpha != 1.0:
+            zscal(&n, &alpha, z_ptr_c, &inc)
+        zaxpy(&n, &beta, y_ptr_c, &inc, z_ptr_c, &inc)
+    else:
+        x_ptr = &x[0]
+        y_ptr = &y[0]
+        z_ptr = &z[0]
+        dcopy(&n, x_ptr, &inc, z_ptr, &inc)
+        if alpha != 1.0:
+            dscal(&n, &alpha, z_ptr, &inc)
+        daxpy(&n, &beta, y_ptr, &inc, z_ptr, &inc)
+
+cdef void update(SCALAR_t[::1] x, SCALAR_t[::1] y):
+    cdef:
+        double* x_ptr
+        double* y_ptr
+        double complex* x_ptr_c
+        double complex* y_ptr_c
+        SCALAR_t alpha = 1.0
+        int n = x.shape[0]
+        int inc = 1
+    if SCALAR_t is COMPLEX_t:
+        x_ptr_c = &x[0]
+        y_ptr_c = &y[0]
+        zaxpy(&n, &alpha, y_ptr_c, &inc, x_ptr_c, &inc)
+    else:
+        x_ptr = &x[0]
+        y_ptr = &y[0]
+        daxpy(&n, &alpha, y_ptr, &inc, x_ptr, &inc)
+
+cdef void updateScaled(SCALAR_t[::1] x, SCALAR_t[::1] y, SCALAR_t alpha):
+    cdef:
+        double* x_ptr
+        double* y_ptr
+        double complex* x_ptr_c
+        double complex* y_ptr_c
+        int n = x.shape[0]
+        int inc = 1
+    if SCALAR_t is COMPLEX_t:
+        x_ptr_c = &x[0]
+        y_ptr_c = &y[0]
+        zaxpy(&n, &alpha, y_ptr_c, &inc, x_ptr_c, &inc)
+    else:
+        x_ptr = &x[0]
+        y_ptr = &y[0]
+        daxpy(&n, &alpha, y_ptr, &inc, x_ptr, &inc)
+
+cdef void scaleScalar(SCALAR_t[::1] x, SCALAR_t alpha):
+    cdef:
+        double* x_ptr
+        double complex* x_ptr_c
+        int n = x.shape[0]
+        int inc = 1
+    if SCALAR_t is COMPLEX_t:
+        x_ptr_c = &x[0]
+        zscal(&n, &alpha, x_ptr_c, &inc)
+    else:
+        x_ptr = &x[0]
+        dscal(&n, &alpha, x_ptr, &inc)
+
+cdef SCALAR_t mydot(SCALAR_t[::1] v0, SCALAR_t[::1] v1):
+    cdef:
+        SCALAR_t s = 0.0
+        double* v0_ptr
+        double* v1_ptr
+        double complex* v0_ptr_c
+        double complex* v1_ptr_c
+        int n = v0.shape[0]
+        int inc = 1
+    if SCALAR_t is COMPLEX_t:
+        v0_ptr_c = &v0[0]
+        v1_ptr_c = &v1[0]
+        s = zdotc(&n, v0_ptr_c, &inc, v1_ptr_c, &inc)
+    else:
+        v0_ptr = &v0[0]
+        v1_ptr = &v1[0]
+        s = ddot(&n, v0_ptr, &inc, v1_ptr, &inc)
+    return s
+
+cdef REAL_t norm(SCALAR_t[::1] x):
+    cdef:
+        REAL_t s = 0.0
+        double* x_ptr
+        double complex* x_ptr_c
+        int n = x.shape[0]
+        int inc = 1
+    if SCALAR_t is COMPLEX_t:
+        x_ptr_c = &x[0]
+        s = zdotc(&n, x_ptr_c, &inc, x_ptr_c, &inc).real
+        return sqrt(s)
+    else:
+        x_ptr = &x[0]
+        return dnrm2(&n, x_ptr, &inc)
+
+cdef void gemv(SCALAR_t[:, ::1] A, SCALAR_t[::1] x, SCALAR_t[::1] y, SCALAR_t beta=0.):
+    cdef:
+        double* A_ptr
+        double* x_ptr
+        double* y_ptr
+        double complex* A_ptr_c
+        double complex* x_ptr_c
+        double complex* y_ptr_c
+        int m = A.shape[1]
+        int n = A.shape[0]
+        SCALAR_t alpha = 1.
+        int lda = A.shape[1]
+        int incx = 1
+        int incy = 1
+    if SCALAR_t is COMPLEX_t:
+        A_ptr_c = &A[0, 0]
+        x_ptr_c = &x[0]
+        y_ptr_c = &y[0]
+        zgemv('t', &m, &n, &alpha, A_ptr_c, &lda, x_ptr_c, &incx, &beta, y_ptr_c, &incy)
+    else:
+        A_ptr = &A[0, 0]
+        x_ptr = &x[0]
+        y_ptr = &y[0]
+        dgemv('t', &m, &n, &alpha, A_ptr, &lda, x_ptr, &incx, &beta, y_ptr, &incy)
+
+cdef void gemvF(SCALAR_t[::1, :] A, SCALAR_t[::1] x, SCALAR_t[::1] y, SCALAR_t beta=0.):
+    cdef:
+        double* A_ptr
+        double* x_ptr
+        double* y_ptr
+        double complex* A_ptr_c
+        double complex* x_ptr_c
+        double complex* y_ptr_c
+        int m = A.shape[0]
+        int n = A.shape[1]
+        SCALAR_t alpha = 1.
+        int lda = A.shape[0]
+        int incx = 1
+        int incy = 1
+    if SCALAR_t is COMPLEX_t:
+        A_ptr_c = &A[0, 0]
+        x_ptr_c = &x[0]
+        y_ptr_c = &y[0]
+        zgemv('n', &m, &n, &alpha, A_ptr_c, &lda, x_ptr_c, &incx, &beta, y_ptr_c, &incy)
+    else:
+        A_ptr = &A[0, 0]
+        x_ptr = &x[0]
+        y_ptr = &y[0]
+        dgemv('n', &m, &n, &alpha, A_ptr, &lda, x_ptr, &incx, &beta, y_ptr, &incy)
+
+
+cdef void gemvT(SCALAR_t[:, ::1] A, SCALAR_t[::1] x, SCALAR_t[::1] y, SCALAR_t beta=0.):
+    cdef:
+        double* A_ptr
+        double* x_ptr
+        double* y_ptr
+        double complex* A_ptr_c
+        double complex* x_ptr_c
+        double complex* y_ptr_c
+        int m = A.shape[0]
+        int n = A.shape[1]
+        SCALAR_t alpha = 1.
+        int lda = A.shape[0]
+        int incx = 1
+        int incy = 1
+    if SCALAR_t is COMPLEX_t:
+        A_ptr_c = &A[0, 0]
+        x_ptr_c = &x[0]
+        y_ptr_c = &y[0]
+        zgemv('n', &m, &n, &alpha, A_ptr_c, &lda, x_ptr_c, &incx, &beta, y_ptr_c, &incy)
+    else:
+        A_ptr = &A[0, 0]
+        x_ptr = &x[0]
+        y_ptr = &y[0]
+        dgemv('n', &m, &n, &alpha, A_ptr, &lda, x_ptr, &incx, &beta, y_ptr, &incy)
+
+
+cdef void matmat(SCALAR_t[:, ::1] A, SCALAR_t[:, ::1] B, SCALAR_t[:, ::1] C):
+    cdef:
+        double* A_ptr
+        double* B_ptr
+        double* C_ptr
+        double complex* A_ptr_c
+        double complex* B_ptr_c
+        double complex* C_ptr_c
+        int m = B.shape[1]
+        int k = B.shape[0]
+        int n = A.shape[0]
+        SCALAR_t alpha = 1.
+        SCALAR_t beta = 0.
+        int lda = A.shape[1]
+        int ldb = B.shape[1]
+        int ldc = C.shape[1]
+    if SCALAR_t is COMPLEX_t:
+        A_ptr_c = &A[0, 0]
+        B_ptr_c = &B[0, 0]
+        C_ptr_c = &C[0, 0]
+        zgemm('n', 'n', &m, &n, &k, &alpha, B_ptr_c, &ldb, A_ptr_c, &lda, &beta, C_ptr_c, &ldc)
+    else:
+        A_ptr = &A[0, 0]
+        B_ptr = &B[0, 0]
+        C_ptr = &C[0, 0]
+        dgemm('n', 'n', &m, &n, &k, &alpha, B_ptr, &ldb, A_ptr, &lda, &beta, C_ptr, &ldc)
diff --git a/base/PyNucleus_base/opt_true_mkl.pxi b/base/PyNucleus_base/opt_true_mkl.pxi
new file mode 100644
index 0000000..4a8737c
--- /dev/null
+++ b/base/PyNucleus_base/opt_true_mkl.pxi
@@ -0,0 +1,56 @@
+###################################################################################
+# Copyright 2021 National Technology & Engineering Solutions of Sandia,           #
+# LLC (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the           #
+# U.S. Government retains certain rights in this software.                        #
+# If you want to use this code, please refer to the README.rst and LICENSE files. #
+###################################################################################
+
+ctypedef INDEX_t MKL_INT
+
+cdef extern from "mkl/mkl_spblas.h":
+    void mkl_cspblas_dcsrgemv (const char *transa , const MKL_INT *m , const REAL_t *a , const MKL_INT *ia , const MKL_INT *ja , const REAL_t *x , REAL_t *y );
+    void mkl_dcsrmv (const char *transa , const MKL_INT *m , const MKL_INT *k , const REAL_t *alpha , const char *matdescra ,
+                     const REAL_t *val , const MKL_INT *indx , const MKL_INT *pntrb , const MKL_INT *pntre ,
+                     const REAL_t *x , const REAL_t *beta , REAL_t *y );
+    # void mkl_zcsrmv (const char *transa , const MKL_INT *m , const MKL_INT *k , const COMPLEX_t *alpha , const char *matdescra ,
+    #                  const COMPLEX_t *val , const MKL_INT *indx , const MKL_INT *pntrb , const MKL_INT *pntre ,
+    #                  const COMPLEX_t *x , const COMPLEX_t *beta , COMPLEX_t *y );
+
+cdef void spmv(INDEX_t[::1] indptr, INDEX_t[::1] indices, SCALAR_t[::1] data, SCALAR_t[::1] x, SCALAR_t[::1] y, BOOL_t overwrite=True):
+    cdef:
+        char transA = 78
+        INDEX_t num_rows = indptr.shape[0]-1
+
+    assert overwrite
+
+    if SCALAR_t is COMPLEX_t:
+        # mkl_cspblas_zcsrgemv(&transA, &num_rows, &data[0], &indptr[0], &indices[0], &x[0], &y[0])
+        pass
+    else:
+        mkl_cspblas_dcsrgemv(&transA, &num_rows, &data[0], &indptr[0], &indices[0], &x[0], &y[0])
+
+
+cdef void spres(INDEX_t[::1] indptr, INDEX_t[::1] indices, SCALAR_t[::1] data, SCALAR_t[::1] x, SCALAR_t[::1] rhs, SCALAR_t[::1] result):
+    cdef:
+        char transA = 78
+        SCALAR_t alpha = -1.
+        SCALAR_t beta = 1.
+        char matdscr[6]
+        INDEX_t inc = 1
+        INDEX_t num_rows = indptr.shape[0]-1
+        INDEX_t num_columns = x.shape[0]
+
+    matdscr[0] = 71
+    matdscr[2] = 78
+    matdscr[3] = 67
+
+    assign(result, rhs)
+    if SCALAR_t is COMPLEX_t:
+        pass
+        # mkl_dcsrmv(&transA, &num_rows, &num_columns, &alpha, &matdscr[0],
+        #            &data[0], &indices[0], &indptr[0], &indptr[1],
+        #            &x[0], &beta, &result[0])
+    else:
+        mkl_dcsrmv(&transA, &num_rows, &num_columns, &alpha, &matdscr[0],
+                   &data[0], &indices[0], &indptr[0], &indptr[1],
+                   &x[0], &beta, &result[0])
diff --git a/base/PyNucleus_base/opt_true_mkl_trisolve.pxi b/base/PyNucleus_base/opt_true_mkl_trisolve.pxi
new file mode 100644
index 0000000..8af0fc3
--- /dev/null
+++ b/base/PyNucleus_base/opt_true_mkl_trisolve.pxi
@@ -0,0 +1,78 @@
+###################################################################################
+# Copyright 2021 National Technology & Engineering Solutions of Sandia,           #
+# LLC (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the           #
+# U.S. Government retains certain rights in this software.                        #
+# If you want to use this code, please refer to the README.rst and LICENSE files. #
+###################################################################################
+
+ctypedef INDEX_t MKL_INT
+
+cdef extern from "mkl/mkl_spblas.h":
+    void mkl_dcsrsm (const char *transa , const MKL_INT *m , const MKL_INT *n , const REAL_t *alpha , const char *matdescra ,
+                     const REAL_t *val , const MKL_INT *indx , const MKL_INT *pntrb , const MKL_INT *pntre ,
+                     const REAL_t *b , const MKL_INT *ldb , REAL_t *c , const MKL_INT *ldc );
+
+cdef inline void trisolve_mkl(INDEX_t[::1] indptr,
+                              INDEX_t[::1] indices,
+                              REAL_t[::1] data,
+                              REAL_t[::1] b,
+                              REAL_t[::1] y,
+                              BOOL_t forward=True,
+                              BOOL_t unitDiagonal=False):
+    cdef:
+        char transA
+        REAL_t alpha = 1.
+        char matdscr[6]
+        INDEX_t inc = 1
+        INDEX_t n = indptr.shape[0]-1
+        INDEX_t one = 1
+    matdscr[0] = 84
+    if forward:
+        transA = 84
+    else:
+        transA = 78
+    matdscr[1] = 85
+    if unitDiagonal:
+        matdscr[2] = 85
+    else:
+        matdscr[2] = 78
+    matdscr[3] = 67
+    mkl_dcsrsm(&transA, &n, &one, &alpha, &matdscr[0], &data[0], &indices[0], &indptr[0], &indptr[1], &b[0], &one, &y[0], &one)
+
+
+cdef class ichol_solver(solver):
+    def __init__(self, LinearOperator A=None, INDEX_t num_rows=-1):
+        solver.__init__(self, A, num_rows)
+        self.temp = uninitialized((self.num_rows), dtype=REAL)
+
+    cpdef void setup(self, LinearOperator A=None):
+        cdef:
+            INDEX_t i
+        if A is not None:
+            self.A = A
+        if isinstance(self.A, CSR_LinearOperator):
+            self.indices, self.indptr, self.data, self.diagonal = ichol_csr(self.A)
+        elif isinstance(self.A, SSS_LinearOperator):
+            # self.indices, self.indptr, self.data, self.diagonal = ichol_sss(A)
+            self.indices, self.indptr, self.data, self.diagonal = ichol_csr(self.A.to_csr_linear_operator())
+        else:
+            try:
+                B = self.A.to_csr_linear_operator()
+                self.indices, self.indptr, self.data, self.diagonal = ichol_csr(B)
+            except:
+                raise NotImplementedError()
+
+        from . linear_operators import diagonalOperator
+        T = CSR_LinearOperator(self.indices, self.indptr, self.data).to_csr()+diagonalOperator(self.diagonal).to_csr()
+        self.L = CSR_LinearOperator.from_csr(T)
+        self.initialized = True
+
+    cdef int solve(self, vector_t b, vector_t x) except -1:
+        solver.solve(self, b, x)
+        self.temp[:] = 0.0
+        trisolve_mkl(self.L.indptr, self.L.indices, self.L.data, b, self.temp, forward=True, unitDiagonal=False)
+        trisolve_mkl(self.L.indptr, self.L.indices, self.L.data, self.temp, x, forward=False, unitDiagonal=False)
+        return 1
+
+    def __str__(self):
+        return 'Incomplete Cholesky'
diff --git a/base/PyNucleus_base/opt_true_solver_cholmod.pxi b/base/PyNucleus_base/opt_true_solver_cholmod.pxi
new file mode 100644
index 0000000..88b4a6b
--- /dev/null
+++ b/base/PyNucleus_base/opt_true_solver_cholmod.pxi
@@ -0,0 +1,86 @@
+###################################################################################
+# Copyright 2021 National Technology & Engineering Solutions of Sandia,           #
+# LLC (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the           #
+# U.S. Government retains certain rights in this software.                        #
+# If you want to use this code, please refer to the README.rst and LICENSE files. #
+###################################################################################
+
+cdef class chol_solver(solver):
+    def __init__(self, LinearOperator A, INDEX_t num_rows=-1):
+        solver.__init__(self, A, num_rows)
+
+    cpdef void setup(self, LinearOperator A=None):
+        cdef:
+            LinearOperator B = None
+            INDEX_t i, j, k
+        if A is not None:
+            self.A = A
+
+        if not isinstance(self.A, (SSS_LinearOperator,
+                                   CSR_LinearOperator,
+                                   Dense_LinearOperator)):
+            if self.A.isSparse():
+                B = self.A.to_csr_linear_operator()
+            else:
+                B = Dense_LinearOperator(np.ascontiguousarray(self.A.toarray()))
+        else:
+            B = self.A
+
+        self.denseFactor = False
+        if isinstance(B, Dense_LinearOperator):
+            from scipy.linalg import cholesky
+            L = cholesky(B.toarray(), lower=True)
+            self.Lflat = uninitialized((L.shape[0]*(L.shape[1]+1)//2), dtype=REAL)
+            k = 0
+            for i in range(L.shape[0]):
+                for j in range(i+1):
+                    self.Lflat[k] = L[i, j]
+                    k += 1
+            self.denseFactor = True
+            self.temp = uninitialized((L.shape[0]), dtype=REAL)
+        else:
+            from sksparse.cholmod import cholesky
+
+            if isinstance(B, CSR_LinearOperator):
+                try:
+                    self.Ainv = cholesky(B.to_csc())
+                except AttributeError:
+                    self.Ainv = cholesky(B.to_csr().tocsc())
+            elif isinstance(B, SSS_LinearOperator):
+                self.Ainv = cholesky(B.to_lower_csc())
+            else:
+                try:
+                    B = B.to_csr_linear_operator()
+                    self.Ainv = cholesky(B)
+                except AttributeError:
+                    raise NotImplementedError()
+        self.initialized = True
+
+    cdef int solve(self, vector_t b, vector_t x) except -1:
+        cdef:
+            INDEX_t i, j, k
+            REAL_t val
+            REAL_t[::1] temp = self.temp, Lflat = self.Lflat
+        solver.solve(self, b, x)
+        if not self.denseFactor:
+            np.array(x, copy=False, dtype=REAL)[:] = self.Ainv(np.array(b, copy=False, dtype=REAL))
+        else:
+            k = 0
+            for i in range(x.shape[0]):
+                val = b[i]
+                for j in range(i):
+                    val -= Lflat[k]*temp[j]
+                    k += 1
+                temp[i] = val/Lflat[k]
+                k += 1
+            for i in range(x.shape[0]-1, -1, -1):
+                val = temp[i]
+                for j in range(i+1, x.shape[0]):
+                    k = ((j*(j+1)) >> 1) + i
+                    val -= Lflat[k]*x[j]
+                k = ((i*(i+1)) >> 1) + i
+                x[i] = val/Lflat[k]
+        return 1
+
+    def __str__(self):
+        return 'Cholesky'
diff --git a/base/PyNucleus_base/opt_true_solver_pyamg.pxi b/base/PyNucleus_base/opt_true_solver_pyamg.pxi
new file mode 100644
index 0000000..0e69c07
--- /dev/null
+++ b/base/PyNucleus_base/opt_true_solver_pyamg.pxi
@@ -0,0 +1,45 @@
+###################################################################################
+# Copyright 2021 National Technology & Engineering Solutions of Sandia,           #
+# LLC (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the           #
+# U.S. Government retains certain rights in this software.                        #
+# If you want to use this code, please refer to the README.rst and LICENSE files. #
+###################################################################################
+
+from pyamg import smoothed_aggregation_solver
+
+cdef class pyamg_solver(iterative_solver):
+    def __init__(self, LinearOperator A=None, num_rows=-1):
+        iterative_solver.__init__(self, A, num_rows)
+
+    cpdef void setup(self, LinearOperator A=None):
+        iterative_solver.setup(self, A)
+        # self.ml = ruge_stuben_solver(self.A.to_csr(),
+        #                              coarse_solver='splu',
+        #                              max_coarse=2500,
+        #                              presmoother=('gauss_seidel', {'sweep': 'forward'}),
+        #                              postsmoother=('gauss_seidel', {'sweep': 'backward'}))
+
+        self.ml = smoothed_aggregation_solver(self.A.to_csr(),
+                                              np.ones((self.num_rows)),
+                                              smooth=None,
+                                              coarse_solver='splu',
+                                              max_coarse=2500,
+                                              presmoother=('gauss_seidel', {'sweep': 'forward'}),
+                                              postsmoother=('gauss_seidel', {'sweep': 'backward'}))
+        self.initialized = True
+
+    cdef int solve(self, vector_t b, vector_t x) except -1:
+        residuals = []
+        x_np = np.array(x, copy=False)
+        if self.x0 is not None:
+            x_np[:] = self.ml.solve(np.array(b, copy=False),
+                                    x0=np.array(self.x0, copy=False),
+                                    tol=self.tol, maxiter=self.maxIter, residuals=residuals, accel='cg')
+        else:
+            x_np[:] = self.ml.solve(np.array(b, copy=False),
+                                    tol=self.tol, maxiter=self.maxIter, residuals=residuals, accel='cg')
+        self.residuals = residuals
+        return len(residuals)
+
+    def __str__(self):
+        return str(self.ml)
diff --git a/base/PyNucleus_base/opt_true_solver_pyamg_decl.pxi b/base/PyNucleus_base/opt_true_solver_pyamg_decl.pxi
new file mode 100644
index 0000000..a1f2390
--- /dev/null
+++ b/base/PyNucleus_base/opt_true_solver_pyamg_decl.pxi
@@ -0,0 +1,10 @@
+###################################################################################
+# Copyright 2021 National Technology & Engineering Solutions of Sandia,           #
+# LLC (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the           #
+# U.S. Government retains certain rights in this software.                        #
+# If you want to use this code, please refer to the README.rst and LICENSE files. #
+###################################################################################
+
+cdef class pyamg_solver(iterative_solver):
+    cdef:
+        object ml
diff --git a/base/PyNucleus_base/opt_true_solver_pypardiso.pxi b/base/PyNucleus_base/opt_true_solver_pypardiso.pxi
new file mode 100644
index 0000000..42f66b6
--- /dev/null
+++ b/base/PyNucleus_base/opt_true_solver_pypardiso.pxi
@@ -0,0 +1,78 @@
+###################################################################################
+# Copyright 2021 National Technology & Engineering Solutions of Sandia,           #
+# LLC (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the           #
+# U.S. Government retains certain rights in this software.                        #
+# If you want to use this code, please refer to the README.rst and LICENSE files. #
+###################################################################################
+
+
+cdef class pardiso_lu_solver(solver):
+    def __init__(self, LinearOperator A, INDEX_t num_rows=-1):
+        solver.__init__(self, A, num_rows)
+
+    cpdef void setup(self, LinearOperator A=None):
+        cdef:
+            INDEX_t i, j, explicitZeros, explicitZerosRow
+            REAL_t[:, ::1] data
+
+        if A is not None:
+            self.A = A
+
+        if not isinstance(self.A, (SSS_LinearOperator,
+                                   CSR_LinearOperator,
+                                   Dense_LinearOperator)):
+            if self.A.isSparse():
+                self.A = self.A.to_csr_linear_operator()
+            else:
+                self.A = Dense_LinearOperator(np.ascontiguousarray(self.A.toarray()))
+        try_sparsification = False
+        sparsificationThreshold = 0.9
+        if isinstance(self.A, Dense_LinearOperator) and try_sparsification:
+            explicitZeros = 0
+            data = self.A.data
+            for i in range(self.A.num_rows):
+                explicitZerosRow = 0
+                for j in range(self.A.num_columns):
+                    if data[i, j] == 0.:
+                        explicitZerosRow += 1
+                explicitZeros += explicitZerosRow
+                if not (explicitZerosRow > sparsificationThreshold*self.A.num_columns):
+                    break
+            if explicitZeros > sparsificationThreshold*self.A.num_rows*self.A.num_columns:
+                print('Converting dense to sparse matrix, since {}% of entries are zero.'.format(100.*explicitZeros/REAL(self.A.num_rows*self.A.num_columns)))
+                self.A = CSR_LinearOperator.from_dense(self.A)
+        if isinstance(self.A, (SSS_LinearOperator,
+                               CSR_LinearOperator)):
+            from pypardiso import PyPardisoSolver
+            try:
+                self.Ainv = PyPardisoSolver()
+                self.Asp = self.A.to_csr()
+                self.Ainv.factorize(self.Asp)
+            except RuntimeError:
+                print(self.A, np.array(self.A.data))
+                raise
+        elif isinstance(self.A, Dense_LinearOperator):
+            from scipy.linalg import lu_factor
+            self.lu, self.perm = lu_factor(self.A.data)
+        else:
+            raise NotImplementedError('Cannot use operator of type "{}"'.format(type(self.A)))
+        self.initialized = True
+
+    cdef int solve(self, vector_t b, vector_t x) except -1:
+        cdef:
+            REAL_t[::1] temp
+        solver.solve(self, b, x)
+        if isinstance(self.A, (SSS_LinearOperator, CSR_LinearOperator)):
+            temp = self.Ainv.solve(self.Asp,
+                                   np.array(b, copy=False, dtype=REAL))
+            assign(x, temp)
+        else:
+            from scipy.linalg import lu_solve
+            assign(x, b)
+            lu_solve((self.lu, self.perm),
+                     np.array(x, copy=False, dtype=REAL),
+                     overwrite_b=True)
+        return 1
+
+    def __str__(self):
+        return 'LU (MKL Pardiso)'
diff --git a/base/PyNucleus_base/opt_true_solver_pypardiso_decl.pxi b/base/PyNucleus_base/opt_true_solver_pypardiso_decl.pxi
new file mode 100644
index 0000000..c933773
--- /dev/null
+++ b/base/PyNucleus_base/opt_true_solver_pypardiso_decl.pxi
@@ -0,0 +1,16 @@
+###################################################################################
+# Copyright 2021 National Technology & Engineering Solutions of Sandia,           #
+# LLC (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the           #
+# U.S. Government retains certain rights in this software.                        #
+# If you want to use this code, please refer to the README.rst and LICENSE files. #
+###################################################################################
+
+from . solvers cimport solver
+from . myTypes cimport REAL_t, INDEX_t
+
+
+cdef class pardiso_lu_solver(solver):
+    cdef:
+        INDEX_t[::1] perm
+        object Ainv, lu
+        object Asp
diff --git a/base/PyNucleus_base/solvers.pxd b/base/PyNucleus_base/solvers.pxd
index 2cf5ed2..9f21fc0 100644
--- a/base/PyNucleus_base/solvers.pxd
+++ b/base/PyNucleus_base/solvers.pxd
@@ -21,7 +21,6 @@ from . convergence cimport (convergenceMaster, noOpConvergenceMaster,
                             convergenceClient, noOpConvergenceClient)
 from . performanceLogger cimport PLogger, FakePLogger
 
-include "config.pxi"
 
 cdef class solver:
     cdef:
@@ -48,12 +47,7 @@ cdef class lu_solver(solver):
         BOOL_t useTriangularSolveRoutines
 
 
-cdef class pardiso_lu_solver(solver):
-    cdef:
-        INDEX_t[::1] perm
-        object Ainv, lu
-        object Asp
-
+include "solver_pypardiso_decl.pxi"
 
 cdef class chol_solver(solver):
     cdef:
@@ -155,10 +149,7 @@ cdef class bicgstab_solver(krylov_solver):
     cdef int solve(self, REAL_t[::1] b, REAL_t[::1] x) except -1
 
 
-IF USE_PYAMG:
-    cdef class pyamg_solver(iterative_solver):
-        cdef:
-            object ml
+include "solver_pyamg_decl.pxi"
 
 
 cdef class complex_solver:
diff --git a/base/PyNucleus_base/solvers.pyx b/base/PyNucleus_base/solvers.pyx
index 2a25acf..9e78dba 100644
--- a/base/PyNucleus_base/solvers.pyx
+++ b/base/PyNucleus_base/solvers.pyx
@@ -17,8 +17,6 @@ from . linalg cimport (forward_solve_csc, backward_solve_csc,
                        forward_solve_sss_noInverse,
                        backward_solve_sss_noInverse)
 
-include "config.pxi"
-
 
 cdef class solver:
     def __init__(self, LinearOperator A=None, INDEX_t num_rows=-1):
@@ -182,211 +180,9 @@ cdef class lu_solver(solver):
         return 'LU'
 
 
-IF USE_PYPARDISO:
-    cdef class pardiso_lu_solver(solver):
-        def __init__(self, LinearOperator A, INDEX_t num_rows=-1):
-            solver.__init__(self, A, num_rows)
-
-        cpdef void setup(self, LinearOperator A=None):
-            cdef:
-                INDEX_t i, j, explicitZeros, explicitZerosRow
-                REAL_t[:, ::1] data
-
-            if A is not None:
-                self.A = A
-
-            if not isinstance(self.A, (SSS_LinearOperator,
-                                       CSR_LinearOperator,
-                                       Dense_LinearOperator)):
-                if self.A.isSparse():
-                    self.A = self.A.to_csr_linear_operator()
-                else:
-                    self.A = Dense_LinearOperator(np.ascontiguousarray(self.A.toarray()))
-            try_sparsification = False
-            sparsificationThreshold = 0.9
-            if isinstance(self.A, Dense_LinearOperator) and try_sparsification:
-                explicitZeros = 0
-                data = self.A.data
-                for i in range(self.A.num_rows):
-                    explicitZerosRow = 0
-                    for j in range(self.A.num_columns):
-                        if data[i, j] == 0.:
-                            explicitZerosRow += 1
-                    explicitZeros += explicitZerosRow
-                    if not (explicitZerosRow > sparsificationThreshold*self.A.num_columns):
-                        break
-                if explicitZeros > sparsificationThreshold*self.A.num_rows*self.A.num_columns:
-                    print('Converting dense to sparse matrix, since {}% of entries are zero.'.format(100.*explicitZeros/REAL(self.A.num_rows*self.A.num_columns)))
-                    self.A = CSR_LinearOperator.from_dense(self.A)
-            if isinstance(self.A, (SSS_LinearOperator,
-                                   CSR_LinearOperator)):
-                from pypardiso import PyPardisoSolver
-                try:
-                    self.Ainv = PyPardisoSolver()
-                    self.Asp = self.A.to_csr()
-                    self.Ainv.factorize(self.Asp)
-                except RuntimeError:
-                    print(self.A, np.array(self.A.data))
-                    raise
-            elif isinstance(self.A, Dense_LinearOperator):
-                from scipy.linalg import lu_factor
-                self.lu, self.perm = lu_factor(self.A.data)
-            else:
-                raise NotImplementedError('Cannot use operator of type "{}"'.format(type(self.A)))
-            self.initialized = True
-
-        cdef int solve(self, vector_t b, vector_t x) except -1:
-            cdef:
-                REAL_t[::1] temp
-            solver.solve(self, b, x)
-            if isinstance(self.A, (SSS_LinearOperator, CSR_LinearOperator)):
-                temp = self.Ainv.solve(self.Asp,
-                                       np.array(b, copy=False, dtype=REAL))
-                assign(x, temp)
-            else:
-                from scipy.linalg import lu_solve
-                assign(x, b)
-                lu_solve((self.lu, self.perm),
-                         np.array(x, copy=False, dtype=REAL),
-                         overwrite_b=True)
-            return 1
-
-        def __str__(self):
-            return 'LU (MKL Pardiso)'
-
-
-cdef class chol_solver(solver):
-    def __init__(self, LinearOperator A, INDEX_t num_rows=-1):
-        solver.__init__(self, A, num_rows)
-
-    cpdef void setup(self, LinearOperator A=None):
-        cdef:
-            LinearOperator B = None
-            INDEX_t i, j, k
-        if A is not None:
-            self.A = A
-
-        if not isinstance(self.A, (SSS_LinearOperator,
-                                   CSR_LinearOperator,
-                                   Dense_LinearOperator)):
-            if self.A.isSparse():
-                B = self.A.to_csr_linear_operator()
-            else:
-                B = Dense_LinearOperator(np.ascontiguousarray(self.A.toarray()))
-        else:
-            B = self.A
-
-        self.denseFactor = False
-        if isinstance(B, Dense_LinearOperator):
-            from scipy.linalg import cholesky
-            L = cholesky(B.toarray(), lower=True)
-            self.Lflat = uninitialized((L.shape[0]*(L.shape[1]+1)//2), dtype=REAL)
-            k = 0
-            for i in range(L.shape[0]):
-                for j in range(i+1):
-                    self.Lflat[k] = L[i, j]
-                    k += 1
-            self.denseFactor = True
-            self.temp = uninitialized((L.shape[0]), dtype=REAL)
-        else:
-            IF USE_CHOLMOD:
-                from sksparse.cholmod import cholesky
-
-                if isinstance(B, CSR_LinearOperator):
-                    try:
-                        self.Ainv = cholesky(B.to_csc())
-                    except AttributeError:
-                        self.Ainv = cholesky(B.to_csr().tocsc())
-                elif isinstance(B, SSS_LinearOperator):
-                    self.Ainv = cholesky(B.to_lower_csc())
-                else:
-                    try:
-                        B = B.to_csr_linear_operator()
-                        self.Ainv = cholesky(B)
-                    except AttributeError:
-                        raise NotImplementedError()
-            ELSE:
-                raise NotImplementedError("Cholmod not available, install \"scikit-sparse\".")
-        self.initialized = True
-
-    cdef int solve(self, vector_t b, vector_t x) except -1:
-        cdef:
-            INDEX_t i, j, k
-            REAL_t val
-            REAL_t[::1] temp = self.temp, Lflat = self.Lflat
-        solver.solve(self, b, x)
-        if not self.denseFactor:
-            np.array(x, copy=False, dtype=REAL)[:] = self.Ainv(np.array(b, copy=False, dtype=REAL))
-        else:
-            k = 0
-            for i in range(x.shape[0]):
-                val = b[i]
-                for j in range(i):
-                    val -= Lflat[k]*temp[j]
-                    k += 1
-                temp[i] = val/Lflat[k]
-                k += 1
-            for i in range(x.shape[0]-1, -1, -1):
-                val = temp[i]
-                for j in range(i+1, x.shape[0]):
-                    k = ((j*(j+1)) >> 1) + i
-                    val -= Lflat[k]*x[j]
-                k = ((i*(i+1)) >> 1) + i
-                x[i] = val/Lflat[k]
-        return 1
-
-    def __str__(self):
-        return 'Cholesky'
-
-
-cdef class ichol_solver(solver):
-    def __init__(self, LinearOperator A=None, INDEX_t num_rows=-1):
-        solver.__init__(self, A, num_rows)
-        self.temp = uninitialized((self.num_rows), dtype=REAL)
-
-    cpdef void setup(self, LinearOperator A=None):
-        cdef:
-            INDEX_t i
-        if A is not None:
-            self.A = A
-        if isinstance(self.A, CSR_LinearOperator):
-            self.indices, self.indptr, self.data, self.diagonal = ichol_csr(self.A)
-        elif isinstance(self.A, SSS_LinearOperator):
-            # self.indices, self.indptr, self.data, self.diagonal = ichol_sss(A)
-            self.indices, self.indptr, self.data, self.diagonal = ichol_csr(self.A.to_csr_linear_operator())
-        else:
-            try:
-                B = self.A.to_csr_linear_operator()
-                self.indices, self.indptr, self.data, self.diagonal = ichol_csr(B)
-            except:
-                raise NotImplementedError()
-
-        IF USE_MKL_TRISOLVE:
-            from . linear_operators import diagonalOperator
-            T = CSR_LinearOperator(self.indices, self.indptr, self.data).to_csr()+diagonalOperator(self.diagonal).to_csr()
-            self.L = CSR_LinearOperator.from_csr(T)
-        ELSE:
-            for i in range(self.diagonal.shape[0]):
-                self.diagonal[i] = 1./self.diagonal[i]
-        self.initialized = True
-
-    cdef int solve(self, vector_t b, vector_t x) except -1:
-        solver.solve(self, b, x)
-        self.temp[:] = 0.0
-        IF USE_MKL_TRISOLVE:
-            trisolve_mkl(self.L.indptr, self.L.indices, self.L.data, b, self.temp, forward=True, unitDiagonal=False)
-            trisolve_mkl(self.L.indptr, self.L.indices, self.L.data, self.temp, x, forward=False, unitDiagonal=False)
-        ELSE:
-            forward_solve_sss_noInverse(self.indptr, self.indices,
-                                        self.data, self.diagonal,
-                                        b, self.temp, unitDiagonal=False)
-            backward_solve_sss_noInverse(self.indptr, self.indices,
-                                         self.data, self.diagonal,
-                                         self.temp, x)
-        return 1
-
-    def __str__(self):
-        return 'Incomplete Cholesky'
+include "solver_pypardiso.pxi"
+include "solver_chol.pxi"
+include "solver_ichol.pxi"
 
 
 cdef class ilu_solver(solver):
@@ -1001,47 +797,6 @@ cdef class bicgstab_solver(krylov_solver):
             return s
 
 
-IF USE_PYAMG:
-    from pyamg import smoothed_aggregation_solver
-
-    cdef class pyamg_solver(iterative_solver):
-        def __init__(self, LinearOperator A=None, num_rows=-1):
-            iterative_solver.__init__(self, A, num_rows)
-
-        cpdef void setup(self, LinearOperator A=None):
-            iterative_solver.setup(self, A)
-            # self.ml = ruge_stuben_solver(self.A.to_csr(),
-            #                              coarse_solver='splu',
-            #                              max_coarse=2500,
-            #                              presmoother=('gauss_seidel', {'sweep': 'forward'}),
-            #                              postsmoother=('gauss_seidel', {'sweep': 'backward'}))
-
-            self.ml = smoothed_aggregation_solver(self.A.to_csr(),
-                                                  np.ones((self.num_rows)),
-                                                  smooth=None,
-                                                  coarse_solver='splu',
-                                                  max_coarse=2500,
-                                                  presmoother=('gauss_seidel', {'sweep': 'forward'}),
-                                                  postsmoother=('gauss_seidel', {'sweep': 'backward'}))
-            self.initialized = True
-
-        cdef int solve(self, vector_t b, vector_t x) except -1:
-            residuals = []
-            x_np = np.array(x, copy=False)
-            if self.x0 is not None:
-                x_np[:] = self.ml.solve(np.array(b, copy=False),
-                                        x0=np.array(self.x0, copy=False),
-                                        tol=self.tol, maxiter=self.maxIter, residuals=residuals, accel='cg')
-            else:
-                x_np[:] = self.ml.solve(np.array(b, copy=False),
-                                        tol=self.tol, maxiter=self.maxIter, residuals=residuals, accel='cg')
-            self.residuals = residuals
-            return len(residuals)
-
-        def __str__(self):
-            return str(self.ml)
-
-
 ######################################################################
 
 
diff --git a/base/PyNucleus_base/tupleDict.pyx b/base/PyNucleus_base/tupleDict.pyx
index a10e47c..f626599 100644
--- a/base/PyNucleus_base/tupleDict.pyx
+++ b/base/PyNucleus_base/tupleDict.pyx
@@ -10,14 +10,7 @@ from libc.stdlib cimport malloc, realloc, free
 from libc.stdlib cimport qsort
 from . myTypes import INDEX
 
-include "config.pxi"
-
-IF HAVE_MALLOC_H:
-    cdef extern from "malloc.h" nogil:
-        int malloc_trim(size_t pad)
-ELSE:
-    cdef int malloc_trim(size_t pad):
-        pass
+include "malloc.pxi"
 
 # def return_memory_to_OS():
 #     malloc_trim(0)
diff --git a/base/setup.py b/base/setup.py
index 76e336e..70bac63 100644
--- a/base/setup.py
+++ b/base/setup.py
@@ -35,14 +35,14 @@
     raise ImportError('\'PyNucleus_packageTools\' needs to be installed first.') from e
 
 p = package('PyNucleus_base')
-p.addOption('USE_BLAS', 'useBLAS', True)
-p.addOption('USE_MKL', 'useMKL', False)
-p.addOption('USE_CHOLMOD', 'use_cholmod', True, ['scikit-sparse'])
-p.addOption('USE_PYAMG', 'use_pyamg', False, ['pyamg'])
-p.addOption('USE_PYPARDISO', 'use_pypardiso', False, ['pypardiso'])
-p.addOption('MKL_LIBRARY', 'mklLibrary', 'mkl_rt')
-p.addOption('USE_MKL_TRISOLVE', 'useMKL_trisolve', False)
-p.addOption('FILL_UNINITIALIZED', 'fillUninitialized', True)
+p.addOption(None, 'useBLAS', True)
+p.addOption(None, 'useMKL', False)
+p.addOption(None, 'use_cholmod', True, ['scikit-sparse'])
+p.addOption(None, 'use_pyamg', False, ['pyamg'])
+p.addOption(None, 'use_pypardiso', False, ['pypardiso'])
+p.addOption(None, 'mklLibrary', 'mkl_rt')
+p.addOption(None, 'useMKL_trisolve', False)
+p.addOption(None, 'fillUninitialized', True)
 
 try:
     cython.inline("""
@@ -50,10 +50,14 @@
         int malloc_trim(size_t pad)
     """)
     have_malloc_h = True
+    if not (p.hash_file(p.folder+'malloc_linux.pxi') ==
+            p.hash_file(p.folder+'malloc.pxi')):
+        copy(p.folder+'malloc_linux.pxi', p.folder+'malloc.pxi')
 except CompileError as e:
     print('malloc.h not found, error was \"{}\". Depending on the system, this might be normal.'.format(e))
-    have_malloc_h = False
-p.addOption('HAVE_MALLOC_H', 'have_malloc_h', have_malloc_h)
+    if not (p.hash_file(p.folder+'malloc_mac.pxi') ==
+            p.hash_file(p.folder+'malloc.pxi')):
+        copy(p.folder+'malloc_mac.pxi', p.folder+'malloc.pxi')
 p.loadConfig(extra_config={'annotate': True,
                            'cythonDirectives': {'initializedcheck': False,
                                                 'boundscheck': False,
@@ -139,6 +143,19 @@
 fillTemplate(Path(p.folder), templates, replacementGroups)
 
 
+# allocation
+p.conditionalCopy(p.folder+'allocation.pxi', p.config['fillUninitialized'], p.folder+'opt_true_allocation.pxi', p.folder+'opt_false_allocation.pxi')
+p.conditionalCopy(p.folder+'blas_routines.pxi', p.config['useBLAS'], p.folder+'opt_true_blas.pxi', p.folder+'opt_false_blas.pxi')
+p.conditionalCopy(p.folder+'mkl_routines.pxi', p.config['useMKL'], p.folder+'opt_true_mkl.pxi', p.folder+'opt_false_mkl.pxi')
+
+# solvers
+p.conditionalCopy(p.folder+'solver_chol.pxi', p.config['use_cholmod'], p.folder+'opt_true_solver_cholmod.pxi', p.folder+'opt_false_solver_cholmod.pxi')
+p.conditionalCopy(p.folder+'solver_ichol.pxi', p.config['useMKL_trisolve'], p.folder+'opt_true_mkl_trisolve.pxi', p.folder+'opt_false_mkl_trisolve.pxi')
+p.conditionalCopy(p.folder+'solver_pypardiso.pxi', p.config['use_pypardiso'], p.folder+'opt_true_solver_pypardiso.pxi', None)
+p.conditionalCopy(p.folder+'solver_pypardiso_decl.pxi', p.config['use_pypardiso'], p.folder+'opt_true_solver_pypardiso_decl.pxi', None)
+p.conditionalCopy(p.folder+'solver_pyamg.pxi', p.config['use_pyamg'], p.folder+'opt_true_solver_pyamg.pxi', None)
+p.conditionalCopy(p.folder+'solver_pyamg_decl.pxi', p.config['use_pyamg'], p.folder+'opt_true_solver_pyamg_decl.pxi', None)
+
 p.addExtension("linear_operators",
                sources=[p.folder+"linear_operators.pyx"])
 p.addExtension("sparseGraph",
diff --git a/fem/PyNucleus_fem/DoFMaps.pxd b/fem/PyNucleus_fem/DoFMaps.pxd
index b3895cf..8596c55 100644
--- a/fem/PyNucleus_fem/DoFMaps.pxd
+++ b/fem/PyNucleus_fem/DoFMaps.pxd
@@ -74,6 +74,7 @@ cdef class shapeFunction:
     cdef REAL_t eval(self, const REAL_t[::1] lam)
     cdef REAL_t evalStrided(self, const REAL_t* lam, INDEX_t stride)
     cdef REAL_t evalGlobal(self, REAL_t[:, ::1] simplex, REAL_t[::1] x)
+    cdef void evalGrad(self, const REAL_t[::1] lam, const REAL_t[:, ::1] gradLam, REAL_t[::1] value)
 
 
 cdef class vectorShapeFunction:
diff --git a/fem/PyNucleus_fem/DoFMaps.pyx b/fem/PyNucleus_fem/DoFMaps.pyx
index a797c91..c3bf5c6 100644
--- a/fem/PyNucleus_fem/DoFMaps.pyx
+++ b/fem/PyNucleus_fem/DoFMaps.pyx
@@ -1672,6 +1672,14 @@ cdef class shapeFunction:
     cdef REAL_t evalStrided(self, const REAL_t* lam, INDEX_t stride):
         raise NotImplementedError()
 
+    cdef void evalGrad(self, const REAL_t[::1] lam, const REAL_t[:, ::1] gradLam, REAL_t[::1] value):
+        pass
+
+    def evalGradPy(self, lam, gradLam):
+        value = uninitialized((gradLam.shape[1]), dtype=REAL)
+        self.evalGrad(np.array(lam), np.array(gradLam), value)
+        return value
+
     cdef REAL_t evalGlobal(self, REAL_t[:, ::1] simplex, REAL_t[::1] x):
         if simplex.shape[1] == 1:
             getBarycentricCoords1D(simplex, x, self.bary)
@@ -1818,6 +1826,13 @@ cdef class shapeFunctionP1(shapeFunction):
     cdef REAL_t evalStrided(self, const REAL_t* lam, INDEX_t stride):
         return lam[self.vertexNo*stride]
 
+    cdef void evalGrad(self, const REAL_t[::1] lam, const REAL_t[:, ::1] gradLam, REAL_t[::1] value):
+        cdef:
+            INDEX_t dim = gradLam.shape[1]
+            INDEX_t i
+        for i in range(dim):
+            value[i] = gradLam[self.vertexNo, i]
+
     def __getstate__(self):
         return self.vertexNo
 
diff --git a/fem/PyNucleus_fem/quadrature.pxd b/fem/PyNucleus_fem/quadrature.pxd
index 541a8a9..de41d58 100644
--- a/fem/PyNucleus_fem/quadrature.pxd
+++ b/fem/PyNucleus_fem/quadrature.pxd
@@ -19,8 +19,6 @@ from . meshCy cimport (vectorProduct,
 cimport numpy as np
 from libc.math cimport sqrt
 
-include "config.pxi"
-
 
 cdef class quadratureRule:
     cdef:
diff --git a/fem/PyNucleus_fem/quadrature.pyx b/fem/PyNucleus_fem/quadrature.pyx
index b656d05..3493a37 100644
--- a/fem/PyNucleus_fem/quadrature.pyx
+++ b/fem/PyNucleus_fem/quadrature.pyx
@@ -13,8 +13,6 @@ import numpy as np
 from modepy import XiaoGimbutasSimplexQuadrature
 from modepy.tools import unit_to_barycentric
 
-include "config.pxi"
-
 
 cdef class quadratureRule:
     def __init__(self,
diff --git a/metisCy/PyNucleus_metisCy/metisCy.pxd b/metisCy/PyNucleus_metisCy/metisCy.pxd
index 3ef3850..834891c 100644
--- a/metisCy/PyNucleus_metisCy/metisCy.pxd
+++ b/metisCy/PyNucleus_metisCy/metisCy.pxd
@@ -7,14 +7,4 @@
 
 cimport numpy as np
 
-include "config.pxi"
-
-IF IDXTYPEWIDTH == 32:
-    ctypedef np.int32_t idx_t
-ELIF IDXTYPEWIDTH == 64:
-    ctypedef np.int64_t idx_t
-
-IF REALTYPEWIDTH == 32:
-    ctypedef float real_t
-ELIF REALTYPEWIDTH == 64:
-    ctypedef np.float64_t real_t
+include "metisTypes_decl.pxi"
diff --git a/metisCy/PyNucleus_metisCy/metisCy.pyx b/metisCy/PyNucleus_metisCy/metisCy.pyx
index 95a320d..5955ba0 100644
--- a/metisCy/PyNucleus_metisCy/metisCy.pyx
+++ b/metisCy/PyNucleus_metisCy/metisCy.pyx
@@ -8,20 +8,9 @@
 import numpy as np
 cimport numpy as np
 
-include "config.pxi"
-
 from PyNucleus_base import uninitialized
 
-IF IDXTYPEWIDTH == 32:
-    idx = np.int32
-ELIF IDXTYPEWIDTH == 64:
-    idx = np.int64
-
-IF REALTYPEWIDTH == 32:
-    real = np.float32
-ELIF REALTYPEWIDTH == 64:
-    real = np.float64
-
+include "metisTypes.pxi"
 
 cdef extern from "metis.h":
     int METIS_PartGraphRecursive(idx_t *nvtxs, idx_t *ncon, idx_t *xadj, idx_t *adjncy,
diff --git a/metisCy/PyNucleus_metisCy/parmetisCy.pyx b/metisCy/PyNucleus_metisCy/parmetisCy.pyx
index 0edaffb..72228fc 100644
--- a/metisCy/PyNucleus_metisCy/parmetisCy.pyx
+++ b/metisCy/PyNucleus_metisCy/parmetisCy.pyx
@@ -17,23 +17,8 @@ from PyNucleus_base import uninitialized
 
 ctypedef mpi.MPI_Comm MPI_Comm
 
-include "config.pxi"
-
-IF IDXTYPEWIDTH == 32:
-    idx = np.int32
-    cdef int IDX = np.NPY_INT32
-    ctypedef np.int32_t idx_t
-ELIF IDXTYPEWIDTH == 64:
-    idx = np.int64
-    cdef int IDX = np.NPY_INT64
-    ctypedef np.int64_t idx_t
-
-IF REALTYPEWIDTH == 32:
-    real = np.float32
-    ctypedef float real_t
-ELIF REALTYPEWIDTH == 64:
-    real = np.float64
-    ctypedef np.float64_t real_t
+include "metisTypes_decl.pxi"
+include "metisTypes.pxi"
 
 
 cdef extern from "parmetis.h":
diff --git a/metisCy/setup.py b/metisCy/setup.py
index cfe43c6..fec4c1c 100644
--- a/metisCy/setup.py
+++ b/metisCy/setup.py
@@ -5,6 +5,8 @@
 # If you want to use this code, please refer to the README.rst and LICENSE files. #
 ###################################################################################
 
+from pathlib import Path
+from shutil import copy
 try:
     from PyNucleus_packageTools import package
 except ImportError as e:
@@ -25,9 +27,38 @@
     int REALTYPEWIDTH
 return IDXTYPEWIDTH, REALTYPEWIDTH""",
                           cython_include_dirs=p.config['includeDirs'])
-
-p.addOption('IDXTYPEWIDTH', 'METIS_idx_width', idx)
-p.addOption('REALTYPEWIDTH', 'METIS_real_width', real)
+idx = int(idx)
+real = int(real)
+python_def = ''
+decl = ''
+if idx == 32:
+    python_def += 'idx = np.int32\n'
+    decl += 'ctypedef np.int32_t idx_t\n'
+elif idx == 64:
+    python_def += 'idx = np.int64\n'
+    decl += 'ctypedef np.int64_t idx_t\n'
+else:
+    raise NotImplementedError()
+if real == 32:
+    python_def += 'real = np.float32\n'
+    decl += 'ctypedef float real_t\n'
+elif real == 64:
+    python_def += 'real = np.float64\n'
+    decl += 'ctypedef np.float64_t real_t\n'
+else:
+    raise NotImplementedError()
+with open(p.folder+'metisTypesTemp.pxi', 'w') as f:
+    f.write(python_def)
+with open(p.folder+'metisTypesTemp_decl.pxi', 'w') as f:
+    f.write(decl)
+if not (p.hash_file(p.folder+'metisTypesTemp.pxi') ==
+        p.hash_file(p.folder+'metisTypes.pxi')):
+    copy(p.folder+'metisTypesTemp.pxi', p.folder+'metisTypes.pxi')
+Path(p.folder+'metisTypesTemp.pxi').unlink()
+if not (p.hash_file(p.folder+'metisTypesTemp_decl.pxi') ==
+        p.hash_file(p.folder+'metisTypes_decl.pxi')):
+    copy(p.folder+'metisTypesTemp_decl.pxi', p.folder+'metisTypes_decl.pxi')
+Path(p.folder+'metisTypesTemp_decl.pxi').unlink()
 p.loadConfig()
 p.addPackageInclude('PyNucleus_base')
 
diff --git a/nl/PyNucleus_nl/bem_{SCALAR}.pxi b/nl/PyNucleus_nl/bem_{SCALAR}.pxi
new file mode 100644
index 0000000..a5d295f
--- /dev/null
+++ b/nl/PyNucleus_nl/bem_{SCALAR}.pxi
@@ -0,0 +1,1182 @@
+###################################################################################
+# Copyright 2021 National Technology & Engineering Solutions of Sandia,           #
+# LLC (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the           #
+# U.S. Government retains certain rights in this software.                        #
+# If you want to use this code, please refer to the README.rst and LICENSE files. #
+###################################################################################
+
+            self.qrId = sQR.qr
+            self.PHI_id = sQR.PHI3
+        elif panel == COMMON_VERTEX:
+            try:
+                sQR = self.specialQuadRules[(singularityValue, panel)]
+            except KeyError:
+
+                qr = singularityCancelationQuadRule1D(panel,
+                                                      self.singularityCancelationIntegrandAcrossElements+singularityValue,
+                                                      self.quad_order_diagonal,
+                                                      2*dm_order)
+                PHI = uninitialized((2*dofs_per_element - dofs_per_vertex,
+                                     qr.num_nodes,
+                                     2), dtype=REAL)
+
+                for dof in range(dofs_per_vertex):
+                    sf = self.getLocalShapeFunction(dof)
+                    for i in range(qr.num_nodes):
+                        lcl_bary_x[0] = qr.nodes[0, i]
+                        lcl_bary_x[1] = qr.nodes[1, i]
+                        lcl_bary_y[0] = qr.nodes[2, i]
+                        lcl_bary_y[1] = qr.nodes[3, i]
+                        PHI[dof, i, 0] = sf.eval(lcl_bary_x)
+                        PHI[dof, i, 1] = sf.eval(lcl_bary_y)
+
+                for dof in range(dofs_per_vertex, dofs_per_element):
+                    sf = self.getLocalShapeFunction(dof)
+                    for i in range(qr.num_nodes):
+                        lcl_bary_x[0] = qr.nodes[0, i]
+                        lcl_bary_x[1] = qr.nodes[1, i]
+                        lcl_bary_y[0] = qr.nodes[2, i]
+                        lcl_bary_y[1] = qr.nodes[3, i]
+                        PHI[dof, i, 0] = sf.eval(lcl_bary_x)
+                        PHI[dof, i, 1] = 0
+                        PHI[dofs_per_element+dof-dofs_per_vertex, i, 0] = 0
+                        PHI[dofs_per_element+dof-dofs_per_vertex, i, 1] = sf.eval(lcl_bary_y)
+
+                sQR = specialQuadRule(qr, PHI3=PHI)
+                self.specialQuadRules[(singularityValue, panel)] = sQR
+                if qr.num_nodes > self.temp.shape[0]:
+                    self.temp = uninitialized((qr.num_nodes), dtype={SCALAR})
+                    self.temp2 = uninitialized((qr.num_nodes), dtype={SCALAR})
+            self.qrVertex = sQR.qr
+            self.PHI_vertex = sQR.PHI3
+        else:
+            raise NotImplementedError('Unknown panel type: {}'.format(panel))
+
+
+cdef class {SCALAR_label}bem3D({SCALAR_label}bem):
+    def __init__(self, {SCALAR_label}Kernel kernel, meshBase mesh, DoFMap DoFMap, num_dofs=None, manifold_dim2=-1, **kwargs):
+        super({SCALAR_label}bem3D, self).__init__(kernel, mesh, DoFMap, num_dofs, manifold_dim2, **kwargs)
+
+    cdef REAL_t get_h_simplex(self, const REAL_t[:, ::1] simplex):
+        cdef:
+            INDEX_t i, j
+            REAL_t hmax = 0., h2
+        for i in range(2):
+            for j in range(i+1, 3):
+                h2 = ((simplex[j, 0]-simplex[i, 0])*(simplex[j, 0]-simplex[i, 0]) +
+                      (simplex[j, 1]-simplex[i, 1])*(simplex[j, 1]-simplex[i, 1]) +
+                      (simplex[j, 2]-simplex[i, 2])*(simplex[j, 2]-simplex[i, 2]))
+                hmax = max(hmax, h2)
+        return sqrt(hmax)
+
+    cdef panelType getQuadOrder(self,
+                                const REAL_t h1,
+                                const REAL_t h2,
+                                REAL_t d):
+        cdef:
+            panelType panel, panel2
+            REAL_t logdh1 = log(d/h1), logdh2 = log(d/h2)
+            REAL_t c = (0.5*self.target_order+0.5)*log(self.num_dofs*self.H0**2) #-4.
+            REAL_t logh1H0 = abs(log(h1/self.H0)), logh2H0 = abs(log(h2/self.H0))
+            REAL_t loghminH0 = max(logh1H0, logh2H0)
+            REAL_t s = max(-0.5*(self.kernel.getSingularityValue()+2), 0.)
+        panel = <panelType>max(ceil((c + (s-1.)*logh2H0 + loghminH0 - s*logdh2) /
+                                    (max(logdh1, 0) + 0.4)),
+                               2)
+        panel2 = <panelType>max(ceil((c + (s-1.)*logh1H0 + loghminH0 - s*logdh1) /
+                                     (max(logdh2, 0) + 0.4)),
+                                2)
+        panel = max(panel, panel2)
+        if self.distantQuadRulesPtr[panel] == NULL:
+            self.addQuadRule_nonSym(panel)
+        return panel
+
+    cdef void getNearQuadRule(self, panelType panel):
+        cdef:
+            INDEX_t i
+            REAL_t singularityValue = self.kernel.getSingularityValue()
+            specialQuadRule sQR
+            quadratureRule qr
+            INDEX_t dofs_per_element = self.DoFMap.dofs_per_element
+            INDEX_t dofs_per_edge = self.DoFMap.dofs_per_edge
+            INDEX_t dofs_per_vertex = self.DoFMap.dofs_per_vertex
+            INDEX_t dm_order = max(self.DoFMap.polynomialOrder, 1)
+            shapeFunction sf
+            INDEX_t manifold_dim = 2
+            REAL_t lcl_bary_x[3]
+            REAL_t lcl_bary_y[3]
+            REAL_t[:, :, ::1] PHI
+            INDEX_t dof
+
+        if panel == COMMON_FACE:
+            try:
+                sQR = self.specialQuadRules[(singularityValue, panel)]
+            except KeyError:
+
+                qr = singularityCancelationQuadRule2D(panel,
+                                                      self.singularityCancelationIntegrandWithinElement+singularityValue,
+                                                      self.quad_order_diagonal,
+                                                      self.quad_order_diagonalV,
+                                                      1)
+                PHI = uninitialized((dofs_per_element, qr.num_nodes, 2), dtype=REAL)
+
+                for dof in range(dofs_per_element):
+                    sf = self.getLocalShapeFunction(dof)
+                    for i in range(qr.num_nodes):
+                        lcl_bary_x[0] = qr.nodes[0, i]
+                        lcl_bary_x[1] = qr.nodes[1, i]
+                        lcl_bary_x[2] = qr.nodes[2, i]
+                        lcl_bary_y[0] = qr.nodes[3, i]
+                        lcl_bary_y[1] = qr.nodes[4, i]
+                        lcl_bary_y[2] = qr.nodes[5, i]
+                        PHI[dof, i, 0] = sf.eval(lcl_bary_x)
+                        PHI[dof, i, 1] = sf.eval(lcl_bary_y)
+
+                sQR = specialQuadRule(qr, PHI3=PHI)
+                self.specialQuadRules[(singularityValue, panel)] = sQR
+                if qr.num_nodes > self.temp.shape[0]:
+                    self.temp = uninitialized((qr.num_nodes), dtype={SCALAR})
+                    self.temp2 = uninitialized((qr.num_nodes), dtype={SCALAR})
+            self.qrId = sQR.qr
+            self.PHI_id = sQR.PHI3
+        elif panel == COMMON_EDGE:
+            try:
+                sQR = self.specialQuadRules[(singularityValue, panel)]
+            except KeyError:
+
+                qr = singularityCancelationQuadRule2D(panel,
+                                                      self.singularityCancelationIntegrandAcrossElements+singularityValue,
+                                                      self.quad_order_diagonal,
+                                                      self.quad_order_diagonalV,
+                                                      1)
+                PHI = uninitialized((2*dofs_per_element - 2*dofs_per_vertex - dofs_per_edge,
+                                     qr.num_nodes,
+                                     2), dtype=REAL)
+
+                for dof in range(2*dofs_per_vertex):
+                    sf = self.getLocalShapeFunction(dof)
+                    for i in range(qr.num_nodes):
+                        lcl_bary_x[0] = qr.nodes[0, i]
+                        lcl_bary_x[1] = qr.nodes[1, i]
+                        lcl_bary_x[2] = qr.nodes[2, i]
+                        lcl_bary_y[0] = qr.nodes[3, i]
+                        lcl_bary_y[1] = qr.nodes[4, i]
+                        lcl_bary_y[2] = qr.nodes[5, i]
+                        PHI[dof, i, 0] = sf.eval(lcl_bary_x)
+                        PHI[dof, i, 1] = sf.eval(lcl_bary_y)
+
+                for dof in range((manifold_dim+1)*dofs_per_vertex, (manifold_dim+1)*dofs_per_vertex+dofs_per_edge):
+                    sf = self.getLocalShapeFunction(dof)
+                    for i in range(qr.num_nodes):
+                        lcl_bary_x[0] = qr.nodes[0, i]
+                        lcl_bary_x[1] = qr.nodes[1, i]
+                        lcl_bary_x[2] = qr.nodes[2, i]
+                        lcl_bary_y[0] = qr.nodes[3, i]
+                        lcl_bary_y[1] = qr.nodes[4, i]
+                        lcl_bary_y[2] = qr.nodes[5, i]
+                        PHI[dof, i, 0] = sf.eval(lcl_bary_x)
+                        PHI[dof, i, 1] = sf.eval(lcl_bary_y)
+
+                for dof in range(2*dofs_per_vertex, (manifold_dim+1)*dofs_per_vertex):
+                    sf = self.getLocalShapeFunction(dof)
+                    for i in range(qr.num_nodes):
+                        lcl_bary_x[0] = qr.nodes[0, i]
+                        lcl_bary_x[1] = qr.nodes[1, i]
+                        lcl_bary_x[2] = qr.nodes[2, i]
+                        lcl_bary_y[0] = qr.nodes[3, i]
+                        lcl_bary_y[1] = qr.nodes[4, i]
+                        lcl_bary_y[2] = qr.nodes[5, i]
+                        PHI[dof, i, 0] = sf.eval(lcl_bary_x)
+                        PHI[dof, i, 1] = 0
+                        PHI[dofs_per_element+dof-2*dofs_per_vertex, i, 0] = 0
+                        PHI[dofs_per_element+dof-2*dofs_per_vertex, i, 1] = sf.eval(lcl_bary_y)
+
+                for dof in range((manifold_dim+1)*dofs_per_vertex+dofs_per_edge, dofs_per_element):
+                    sf = self.getLocalShapeFunction(dof)
+                    for i in range(qr.num_nodes):
+                        lcl_bary_x[0] = qr.nodes[0, i]
+                        lcl_bary_x[1] = qr.nodes[1, i]
+                        lcl_bary_x[2] = qr.nodes[2, i]
+                        lcl_bary_y[0] = qr.nodes[3, i]
+                        lcl_bary_y[1] = qr.nodes[4, i]
+                        lcl_bary_y[2] = qr.nodes[5, i]
+                        PHI[dof, i, 0] = sf.eval(lcl_bary_x)
+                        PHI[dof, i, 1] = 0
+                        PHI[dofs_per_element+dof-2*dofs_per_vertex-dofs_per_edge, i, 0] = 0
+                        PHI[dofs_per_element+dof-2*dofs_per_vertex-dofs_per_edge, i, 1] = sf.eval(lcl_bary_y)
+
+                sQR = specialQuadRule(qr, PHI3=PHI)
+                self.specialQuadRules[(singularityValue, panel)] = sQR
+                if qr.num_nodes > self.temp.shape[0]:
+                    self.temp = uninitialized((qr.num_nodes), dtype={SCALAR})
+                    self.temp2 = uninitialized((qr.num_nodes), dtype={SCALAR})
+            self.qrEdge = sQR.qr
+            self.PHI_edge = sQR.PHI3
+        elif panel == COMMON_VERTEX:
+            try:
+                sQR = self.specialQuadRules[(singularityValue, panel)]
+            except KeyError:
+
+                qr = singularityCancelationQuadRule2D(panel,
+                                                      self.singularityCancelationIntegrandAcrossElements+singularityValue,
+                                                      self.quad_order_diagonal,
+                                                      self.quad_order_diagonalV,
+                                                      1)
+                PHI = uninitialized((2*dofs_per_element - dofs_per_vertex,
+                                     qr.num_nodes,
+                                     2), dtype=REAL)
+
+                for dof in range(dofs_per_vertex):
+                    sf = self.getLocalShapeFunction(dof)
+                    for i in range(qr.num_nodes):
+                        lcl_bary_x[0] = qr.nodes[0, i]
+                        lcl_bary_x[1] = qr.nodes[1, i]
+                        lcl_bary_x[2] = qr.nodes[2, i]
+                        lcl_bary_y[0] = qr.nodes[3, i]
+                        lcl_bary_y[1] = qr.nodes[4, i]
+                        lcl_bary_y[2] = qr.nodes[5, i]
+                        PHI[dof, i, 0] = sf.eval(lcl_bary_x)
+                        PHI[dof, i, 1] = sf.eval(lcl_bary_y)
+
+                for dof in range(dofs_per_vertex, dofs_per_element):
+                    sf = self.getLocalShapeFunction(dof)
+                    for i in range(qr.num_nodes):
+                        lcl_bary_x[0] = qr.nodes[0, i]
+                        lcl_bary_x[1] = qr.nodes[1, i]
+                        lcl_bary_x[2] = qr.nodes[2, i]
+                        lcl_bary_y[0] = qr.nodes[3, i]
+                        lcl_bary_y[1] = qr.nodes[4, i]
+                        lcl_bary_y[2] = qr.nodes[5, i]
+                        PHI[dof, i, 0] = sf.eval(lcl_bary_x)
+                        PHI[dof, i, 1] = 0
+                        PHI[dofs_per_element+dof-dofs_per_vertex, i, 0] = 0
+                        PHI[dofs_per_element+dof-dofs_per_vertex, i, 1] = sf.eval(lcl_bary_y)
+
+                sQR = specialQuadRule(qr, PHI3=PHI)
+                self.specialQuadRules[(singularityValue, panel)] = sQR
+                if qr.num_nodes > self.temp.shape[0]:
+                    self.temp = uninitialized((qr.num_nodes), dtype={SCALAR})
+                    self.temp2 = uninitialized((qr.num_nodes), dtype={SCALAR})
+            self.qrVertex = sQR.qr
+            self.PHI_vertex = sQR.PHI3
+        else:
+            raise NotImplementedError('Unknown panel type: {}'.format(panel))
+
+
+cdef class {SCALAR_label}bem2D_V({SCALAR_label}bem2D):
+    """The local stiffness matrix
+
+    .. math::
+
+       \\int_{K_1}\\int_{K_2} 0.5 * [ u(x) v(y) + u(y) v(x) ] \\gamma(x,y) dy dx
+
+    for the V operator.
+    """
+    def __init__(self,
+                 {SCALAR_label}Kernel kernel,
+                 meshBase mesh,
+                 DoFMap DoFMap,
+                 quad_order_diagonal=None,
+                 target_order=None,
+                 num_dofs=None,
+                 **kwargs):
+        cdef:
+            REAL_t smin, smax
+        super({SCALAR_label}bem2D_V, self).__init__(kernel, mesh, DoFMap, num_dofs, **kwargs)
+
+        # The integrand (excluding the kernel) cancels 2 orders of the singularity within an element.
+        self.singularityCancelationIntegrandWithinElement = 0.
+        # The integrand (excluding the kernel) cancels 2 orders of the
+        # singularity across elements for continuous finite elements.
+        if isinstance(DoFMap, P0_DoFMap):
+            assert self.kernel.max_singularity > -1., "Discontinuous finite elements are not conforming for singularity order {} <= -2.".format(self.kernel.max_singularity)
+            self.singularityCancelationIntegrandAcrossElements = 0.
+        else:
+            self.singularityCancelationIntegrandAcrossElements = 0.
+
+        smin = max(-0.5*(self.kernel.min_singularity+1), 0.)
+        smax = max(-0.5*(self.kernel.max_singularity+1), 0.)
+
+        if target_order is None:
+            # this is the desired local quadrature error
+            target_order = self.DoFMap.polynomialOrder+1-smin
+        self.target_order = target_order
+        if quad_order_diagonal is None:
+            # measured log(2 rho_2) = 0.43
+            quad_order_diagonal = max(np.ceil(((target_order+2.)*log(self.num_dofs*self.H0) + (2.*smax-1.)*abs(log(self.hmin/self.H0)))/0.8), 2)
+        self.quad_order_diagonal = quad_order_diagonal
+
+        if (self.kernel.kernelType != FRACTIONAL) or (not self.kernel.variableOrder):
+            self.getNearQuadRule(COMMON_EDGE)
+            self.getNearQuadRule(COMMON_VERTEX)
+
+    cdef void eval(self,
+                   {SCALAR}_t[::1] contrib,
+                   panelType panel,
+                   MASK_t mask=ALL):
+        cdef:
+            INDEX_t k, m, i, j, I, J
+            REAL_t vol, vol1 = self.vol1, vol2 = self.vol2
+            {SCALAR}_t val
+            REAL_t[:, ::1] simplex1 = self.simplex1
+            REAL_t[:, ::1] simplex2 = self.simplex2
+            quadratureRule qr
+            REAL_t[:, :, ::1] PHI
+            INDEX_t dofs_per_element = self.DoFMap.dofs_per_element
+            INDEX_t dim = 2
+            REAL_t x[2]
+            REAL_t y[2]
+
+        if panel >= 1:
+            self.eval_distant_bem_V(contrib, panel, mask)
+            return
+        elif panel == COMMON_EDGE:
+            qr = self.qrId
+            PHI = self.PHI_id
+        elif panel == COMMON_VERTEX:
+            qr = self.qrVertex
+            PHI = self.PHI_vertex
+        else:
+            raise NotImplementedError('Unknown panel type: {}'.format(panel))
+
+        vol = vol1*vol2
+        for m in range(qr.num_nodes):
+            for j in range(dim):
+                x[j] = (simplex1[self.perm1[0], j]*qr.nodes[0, m] +
+                        simplex1[self.perm1[1], j]*qr.nodes[1, m])
+                y[j] = (simplex2[self.perm2[0], j]*qr.nodes[2, m] +
+                        simplex2[self.perm2[1], j]*qr.nodes[3, m])
+            self.temp[m] = qr.weights[m] * self.kernel.evalPtr(dim, &x[0], &y[0])
+
+        contrib[:] = 0.
+        for I in range(PHI.shape[0]):
+            i = self.perm[I]
+            for J in range(I, PHI.shape[0]):
+                j = self.perm[J]
+                if j < i:
+                    k = 2*dofs_per_element*j-(j*(j+1) >> 1) + i
+                else:
+                    k = 2*dofs_per_element*i-(i*(i+1) >> 1) + j
+                if mask[k]:
+                    val = 0.
+                    for m in range(qr.num_nodes):
+                        val += self.temp[m] * (PHI[I, m, 0] * PHI[J, m, 1] + PHI[I, m, 1] * PHI[J, m, 0])
+                    contrib[k] = 0.5*val*vol
+
+
+cdef class {SCALAR_label}bem2D_K({SCALAR_label}bem2D):
+    """The local stiffness matrix
+
+    .. math::
+
+       \\int_{K_1}\\int_{K_2} 0.5 * [ u(x) v(y) + u(y) v(x) ] \\gamma(x,y) dy dx
+
+    for the V operator.
+    """
+    def __init__(self,
+                 {SCALAR_label}Kernel kernel,
+                 meshBase mesh,
+                 DoFMap DoFMap,
+                 quad_order_diagonal=None,
+                 target_order=None,
+                 num_dofs=None,
+                 **kwargs):
+        cdef:
+            REAL_t smin, smax
+        super({SCALAR_label}bem2D_K, self).__init__(kernel, mesh, DoFMap, num_dofs, **kwargs)
+
+        # The integrand (excluding the kernel) cancels 2 orders of the singularity within an element.
+        self.singularityCancelationIntegrandWithinElement = 1.
+        # The integrand (excluding the kernel) cancels 2 orders of the
+        # singularity across elements for continuous finite elements.
+        if isinstance(DoFMap, P0_DoFMap):
+            assert self.kernel.max_singularity > -1., "Discontinuous finite elements are not conforming for singularity order {} <= -2.".format(self.kernel.max_singularity)
+            self.singularityCancelationIntegrandAcrossElements = 1.
+        else:
+            self.singularityCancelationIntegrandAcrossElements = 1.
+
+        smin = max(-0.5*(self.kernel.min_singularity+1), 0.)
+        smax = max(-0.5*(self.kernel.max_singularity+1), 0.)
+
+        if target_order is None:
+            # this is the desired local quadrature error
+            target_order = self.DoFMap.polynomialOrder+1-smin
+        self.target_order = target_order
+        if quad_order_diagonal is None:
+            # measured log(2 rho_2) = 0.43
+            quad_order_diagonal = max(np.ceil(((target_order+2.)*log(self.num_dofs*self.H0) + (2.*smax-1.)*abs(log(self.hmin/self.H0)))/0.8), 2)
+        self.quad_order_diagonal = quad_order_diagonal
+
+        if (self.kernel.kernelType != FRACTIONAL) or (not self.kernel.variableOrder):
+            self.getNearQuadRule(COMMON_EDGE)
+            self.getNearQuadRule(COMMON_VERTEX)
+
+    cdef void eval(self,
+                   {SCALAR}_t[::1] contrib,
+                   panelType panel,
+                   MASK_t mask=ALL):
+        cdef:
+            INDEX_t k, m, i, j, I, J
+            REAL_t vol, valReal, vol1 = self.vol1, vol2 = self.vol2
+            {SCALAR}_t val
+            REAL_t[:, ::1] simplex1 = self.simplex1
+            REAL_t[:, ::1] simplex2 = self.simplex2
+            quadratureRule qr
+            REAL_t[:, :, ::1] PHI
+            INDEX_t dofs_per_element = self.DoFMap.dofs_per_element
+            INDEX_t dim = 2
+            REAL_t x[2]
+            REAL_t y[2]
+            REAL_t normW
+
+        if panel >= 1:
+            self.eval_distant_bem_K(contrib, panel, mask)
+            return
+        elif panel == COMMON_EDGE:
+            qr = self.qrId
+            PHI = self.PHI_id
+        elif panel == COMMON_VERTEX:
+            qr = self.qrVertex
+            PHI = self.PHI_vertex
+        else:
+            raise NotImplementedError('Unknown panel type: {}'.format(panel))
+
+        # n is independent of x and y
+        self.n1[0] = simplex1[1, 1] - simplex1[0, 1]
+        self.n1[1] = simplex1[0, 0] - simplex1[1, 0]
+        valReal = 1./sqrt(mydot(self.n1, self.n1))
+        self.n1[0] *= valReal
+        self.n1[1] *= valReal
+
+        self.n2[0] = simplex2[1, 1] - simplex2[0, 1]
+        self.n2[1] = simplex2[0, 0] - simplex2[1, 0]
+        valReal = 1./sqrt(mydot(self.n2, self.n2))
+        self.n2[0] *= valReal
+        self.n2[1] *= valReal
+
+        vol = vol1*vol2
+        for m in range(qr.num_nodes):
+            normW = 0.
+            for j in range(dim):
+                x[j] = (simplex1[self.perm1[0], j]*qr.nodes[0, m] +
+                        simplex1[self.perm1[1], j]*qr.nodes[1, m])
+                y[j] = (simplex2[self.perm2[0], j]*qr.nodes[2, m] +
+                        simplex2[self.perm2[1], j]*qr.nodes[3, m])
+                self.w[j] = y[j]-x[j]
+                normW += self.w[j]**2
+            normW = 1./sqrt(normW)
+            for j in range(dim):
+                self.w[j] *= normW
+
+            val = qr.weights[m] * self.kernel.evalPtr(dim, &x[0], &y[0])
+            self.temp[m] = val * mydot(self.n2, self.w)
+            self.temp2[m] = - val * mydot(self.n1, self.w)
+
+        contrib[:] = 0.
+        for I in range(PHI.shape[0]):
+            i = self.perm[I]
+            for J in range(I, PHI.shape[0]):
+                j = self.perm[J]
+                if j < i:
+                    k = 2*dofs_per_element*j-(j*(j+1) >> 1) + i
+                else:
+                    k = 2*dofs_per_element*i-(i*(i+1) >> 1) + j
+                if mask[k]:
+                    val = 0.
+                    for m in range(qr.num_nodes):
+                        val += self.temp[m] * PHI[I, m, 0] * PHI[J, m, 1] + self.temp2[m] * PHI[I, m, 1] * PHI[J, m, 0]
+                    contrib[k] = 0.5*val*vol
+
+
+cdef class {SCALAR_label}bem2D_K_prime({SCALAR_label}bem2D):
+    """The local stiffness matrix
+
+    .. math::
+
+       \\int_{K_1}\\int_{K_2} 0.5 * [ u(x) v(y) + u(y) v(x) ] \\gamma(x,y) dy dx
+
+    for the V operator.
+    """
+    def __init__(self,
+                 {SCALAR_label}Kernel kernel,
+                 meshBase mesh,
+                 DoFMap DoFMap,
+                 quad_order_diagonal=None,
+                 target_order=None,
+                 num_dofs=None,
+                 **kwargs):
+        cdef:
+            REAL_t smin, smax
+        super({SCALAR_label}bem2D_K_prime, self).__init__(kernel, mesh, DoFMap, num_dofs, **kwargs)
+
+        # The integrand (excluding the kernel) cancels 2 orders of the singularity within an element.
+        self.singularityCancelationIntegrandWithinElement = 1.
+        # The integrand (excluding the kernel) cancels 2 orders of the
+        # singularity across elements for continuous finite elements.
+        if isinstance(DoFMap, P0_DoFMap):
+            assert self.kernel.max_singularity > -1., "Discontinuous finite elements are not conforming for singularity order {} <= -2.".format(self.kernel.max_singularity)
+            self.singularityCancelationIntegrandAcrossElements = 1.
+        else:
+            self.singularityCancelationIntegrandAcrossElements = 1.
+
+        smin = max(-0.5*(self.kernel.min_singularity+1), 0.)
+        smax = max(-0.5*(self.kernel.max_singularity+1), 0.)
+
+        if target_order is None:
+            # this is the desired local quadrature error
+            target_order = self.DoFMap.polynomialOrder+1-smin
+        self.target_order = target_order
+        if quad_order_diagonal is None:
+            # measured log(2 rho_2) = 0.43
+            quad_order_diagonal = max(np.ceil(((target_order+2.)*log(self.num_dofs*self.H0) + (2.*smax-1.)*abs(log(self.hmin/self.H0)))/0.8), 2)
+        self.quad_order_diagonal = quad_order_diagonal
+
+        if (self.kernel.kernelType != FRACTIONAL) or (not self.kernel.variableOrder):
+            self.getNearQuadRule(COMMON_EDGE)
+            self.getNearQuadRule(COMMON_VERTEX)
+
+    cdef void eval(self,
+                   {SCALAR}_t[::1] contrib,
+                   panelType panel,
+                   MASK_t mask=ALL):
+        cdef:
+            INDEX_t k, m, i, j, I, J
+            REAL_t vol, valReal, vol1 = self.vol1, vol2 = self.vol2
+            {SCALAR}_t val
+            REAL_t[:, ::1] simplex1 = self.simplex1
+            REAL_t[:, ::1] simplex2 = self.simplex2
+            quadratureRule qr
+            REAL_t[:, :, ::1] PHI
+            INDEX_t dofs_per_element = self.DoFMap.dofs_per_element
+            INDEX_t dim = 2
+            REAL_t x[2]
+            REAL_t y[2]
+            REAL_t normW
+
+        if panel >= 1:
+            self.eval_distant_bem_K_prime(contrib, panel, mask)
+            return
+        elif panel == COMMON_EDGE:
+            qr = self.qrId
+            PHI = self.PHI_id
+        elif panel == COMMON_VERTEX:
+            qr = self.qrVertex
+            PHI = self.PHI_vertex
+        else:
+            raise NotImplementedError('Unknown panel type: {}'.format(panel))
+
+        # n is independent of x and y
+        self.n1[0] = simplex1[1, 1] - simplex1[0, 1]
+        self.n1[1] = simplex1[0, 0] - simplex1[1, 0]
+        valReal = 1./sqrt(mydot(self.n1, self.n1))
+        self.n1[0] *= valReal
+        self.n1[1] *= valReal
+
+        self.n2[0] = simplex2[1, 1] - simplex2[0, 1]
+        self.n2[1] = simplex2[0, 0] - simplex2[1, 0]
+        valReal = 1./sqrt(mydot(self.n2, self.n2))
+        self.n2[0] *= valReal
+        self.n2[1] *= valReal
+
+        vol = vol1*vol2
+        for m in range(qr.num_nodes):
+            normW = 0.
+            for j in range(dim):
+                x[j] = (simplex1[self.perm1[0], j]*qr.nodes[0, m] +
+                        simplex1[self.perm1[1], j]*qr.nodes[1, m])
+                y[j] = (simplex2[self.perm2[0], j]*qr.nodes[2, m] +
+                        simplex2[self.perm2[1], j]*qr.nodes[3, m])
+                self.w[j] = y[j]-x[j]
+                normW += self.w[j]**2
+            normW = 1./sqrt(normW)
+            for j in range(dim):
+                self.w[j] *= normW
+
+            val = qr.weights[m] * self.kernel.evalPtr(dim, &x[0], &y[0])
+            self.temp[m] = - val * mydot(self.n1, self.w)
+            self.temp2[m] = val * mydot(self.n2, self.w)
+
+        contrib[:] = 0.
+        for I in range(PHI.shape[0]):
+            i = self.perm[I]
+            for J in range(I, PHI.shape[0]):
+                j = self.perm[J]
+                if j < i:
+                    k = 2*dofs_per_element*j-(j*(j+1) >> 1) + i
+                else:
+                    k = 2*dofs_per_element*i-(i*(i+1) >> 1) + j
+                if mask[k]:
+                    val = 0.
+                    for m in range(qr.num_nodes):
+                        val += self.temp[m] * PHI[I, m, 0] * PHI[J, m, 1] + self.temp2[m] * PHI[I, m, 1] * PHI[J, m, 0]
+                    contrib[k] = 0.5*val*vol
+
+
+cdef class {SCALAR_label}bem3D_V({SCALAR_label}bem3D):
+    """The local stiffness matrix
+
+    .. math::
+
+       \\int_{K_1}\\int_{K_2} 0.5 * [ u(x) v(y) + u(y) v(x) ] \\gamma(x,y) dy dx
+
+    for the V operator.
+    """
+    def __init__(self,
+                 {SCALAR_label}Kernel kernel,
+                 meshBase mesh,
+                 DoFMap DoFMap,
+                 quad_order_diagonal=None,
+                 target_order=None,
+                 num_dofs=None,
+                 **kwargs):
+        cdef:
+            REAL_t smin, smax
+        super({SCALAR_label}bem3D_V, self).__init__(kernel, mesh, DoFMap, num_dofs, **kwargs)
+
+        # The integrand (excluding the kernel) cancels 2 orders of the singularity within an element.
+        self.singularityCancelationIntegrandWithinElement = 0.
+        # The integrand (excluding the kernel) cancels 2 orders of the
+        # singularity across elements for continuous finite elements.
+        if isinstance(DoFMap, P0_DoFMap):
+            assert self.kernel.max_singularity > -3., "Discontinuous finite elements are not conforming for singularity order {} <= -3.".format(self.kernel.max_singularity)
+            self.singularityCancelationIntegrandAcrossElements = 0.
+        else:
+            self.singularityCancelationIntegrandAcrossElements = 0.
+
+        if target_order is None:
+            # this is the desired local quadrature error
+            # target_order = (2.-s)/self.dim
+            target_order = 0.5
+        self.target_order = target_order
+
+        smax = max(-0.5*(self.kernel.max_singularity+2), 0.)
+        if quad_order_diagonal is None:
+            # measured log(2 rho_2) = 0.43
+            quad_order_diagonal = max(np.ceil((target_order+1.+smax)/(0.43)*abs(np.log(self.hmin/self.H0))), 4)
+            # measured log(2 rho_2) = 0.7
+            quad_order_diagonalV = max(np.ceil((target_order+1.+smax)/(0.7)*abs(np.log(self.hmin/self.H0))), 4)
+        else:
+            quad_order_diagonalV = quad_order_diagonal
+        self.quad_order_diagonal = quad_order_diagonal
+        self.quad_order_diagonalV = quad_order_diagonalV
+
+        if (self.kernel.kernelType != FRACTIONAL) or (not self.kernel.variableOrder):
+            self.getNearQuadRule(COMMON_FACE)
+            self.getNearQuadRule(COMMON_EDGE)
+            self.getNearQuadRule(COMMON_VERTEX)
+
+    cdef void eval(self,
+                   {SCALAR}_t[::1] contrib,
+                   panelType panel,
+                   MASK_t mask=ALL):
+        cdef:
+            INDEX_t k, m, i, j, I, J
+            REAL_t vol, vol1 = self.vol1, vol2 = self.vol2
+            {SCALAR}_t val
+            REAL_t[:, ::1] simplex1 = self.simplex1
+            REAL_t[:, ::1] simplex2 = self.simplex2
+            quadratureRule qr
+            REAL_t[:, :, ::1] PHI
+            INDEX_t dofs_per_element = self.DoFMap.dofs_per_element
+            INDEX_t dim = 3
+            REAL_t x[3]
+            REAL_t y[3]
+
+        if panel >= 1:
+            self.eval_distant_bem_V(contrib, panel, mask)
+            return
+        elif panel == COMMON_FACE:
+            qr = self.qrId
+            PHI = self.PHI_id
+        elif panel == COMMON_EDGE:
+            qr = self.qrEdge
+            PHI = self.PHI_edge
+        elif panel == COMMON_VERTEX:
+            qr = self.qrVertex
+            PHI = self.PHI_vertex
+        else:
+            raise NotImplementedError('Unknown panel type: {}'.format(panel))
+
+        vol = 4.0*vol1*vol2
+        for m in range(qr.num_nodes):
+            for j in range(dim):
+                x[j] = (simplex1[self.perm1[0], j]*qr.nodes[0, m] +
+                        simplex1[self.perm1[1], j]*qr.nodes[1, m] +
+                        simplex1[self.perm1[2], j]*qr.nodes[2, m])
+                y[j] = (simplex2[self.perm2[0], j]*qr.nodes[3, m] +
+                        simplex2[self.perm2[1], j]*qr.nodes[4, m] +
+                        simplex2[self.perm2[2], j]*qr.nodes[5, m])
+            self.temp[m] = qr.weights[m] * self.kernel.evalPtr(dim, &x[0], &y[0])
+
+        contrib[:] = 0.
+        for I in range(PHI.shape[0]):
+            i = self.perm[I]
+            for J in range(I, PHI.shape[0]):
+                j = self.perm[J]
+                if j < i:
+                    k = 2*dofs_per_element*j-(j*(j+1) >> 1) + i
+                else:
+                    k = 2*dofs_per_element*i-(i*(i+1) >> 1) + j
+                if mask[k]:
+                    val = 0.
+                    for m in range(qr.num_nodes):
+                        val += self.temp[m] * (PHI[I, m, 0] * PHI[J, m, 1] + PHI[I, m, 1] * PHI[J, m, 0])
+                    contrib[k] = 0.5*val*vol
+
+
+cdef class {SCALAR_label}bem3D_K({SCALAR_label}bem3D):
+    """The local stiffness matrix
+
+    .. math::
+
+       \\int_{K_1}\\int_{K_2} 0.5 * [ u(x) v(y) + u(y) v(x) ] \\gamma(x,y) dy dx
+
+    for the V operator.
+    """
+    def __init__(self,
+                 {SCALAR_label}Kernel kernel,
+                 meshBase mesh,
+                 DoFMap DoFMap,
+                 quad_order_diagonal=None,
+                 target_order=None,
+                 num_dofs=None,
+                 **kwargs):
+        cdef:
+            REAL_t smin, smax
+        super({SCALAR_label}bem3D_K, self).__init__(kernel, mesh, DoFMap, num_dofs, **kwargs)
+
+        # The integrand (excluding the kernel) cancels 2 orders of the singularity within an element.
+        self.singularityCancelationIntegrandWithinElement = 1.
+        # The integrand (excluding the kernel) cancels 2 orders of the
+        # singularity across elements for continuous finite elements.
+        if isinstance(DoFMap, P0_DoFMap):
+            assert self.kernel.max_singularity > -3., "Discontinuous finite elements are not conforming for singularity order {} <= -3.".format(self.kernel.max_singularity)
+            self.singularityCancelationIntegrandAcrossElements = 1.
+        else:
+            self.singularityCancelationIntegrandAcrossElements = 1.
+
+        if target_order is None:
+            # this is the desired local quadrature error
+            # target_order = (2.-s)/self.dim
+            target_order = 0.5
+        self.target_order = target_order
+
+        smax = max(-0.5*(self.kernel.max_singularity+2), 0.)
+        if quad_order_diagonal is None:
+            # measured log(2 rho_2) = 0.43
+            quad_order_diagonal = max(np.ceil((target_order+1.+smax)/(0.43)*abs(np.log(self.hmin/self.H0))), 4)
+            # measured log(2 rho_2) = 0.7
+            quad_order_diagonalV = max(np.ceil((target_order+1.+smax)/(0.7)*abs(np.log(self.hmin/self.H0))), 4)
+        else:
+            quad_order_diagonalV = quad_order_diagonal
+        self.quad_order_diagonal = quad_order_diagonal
+        self.quad_order_diagonalV = quad_order_diagonalV
+
+        if (self.kernel.kernelType != FRACTIONAL) or (not self.kernel.variableOrder):
+            self.getNearQuadRule(COMMON_FACE)
+            self.getNearQuadRule(COMMON_EDGE)
+            self.getNearQuadRule(COMMON_VERTEX)
+
+    cdef void eval(self,
+                   {SCALAR}_t[::1] contrib,
+                   panelType panel,
+                   MASK_t mask=ALL):
+        cdef:
+            INDEX_t k, m, i, j, I, J
+            REAL_t vol, valReal, vol1 = self.vol1, vol2 = self.vol2
+            {SCALAR}_t val
+            REAL_t[:, ::1] simplex1 = self.simplex1
+            REAL_t[:, ::1] simplex2 = self.simplex2
+            quadratureRule qr
+            REAL_t[:, :, ::1] PHI
+            INDEX_t dofs_per_element = self.DoFMap.dofs_per_element
+            INDEX_t dim = 3
+            REAL_t x[3]
+            REAL_t y[3]
+            REAL_t normW
+
+        if panel >= 1:
+            self.eval_distant_bem_K(contrib, panel, mask)
+            return
+        elif panel == COMMON_FACE:
+            qr = self.qrId
+            PHI = self.PHI_id
+        elif panel == COMMON_EDGE:
+            qr = self.qrEdge
+            PHI = self.PHI_edge
+        elif panel == COMMON_VERTEX:
+            qr = self.qrVertex
+            PHI = self.PHI_vertex
+        else:
+            raise NotImplementedError('Unknown panel type: {}'.format(panel))
+
+        for j in range(dim):
+            x[j] = simplex1[1, j]-simplex1[0, j]
+        for j in range(dim):
+            y[j] = simplex1[2, j]-simplex1[0, j]
+        self.n1[0] = x[1]*y[2]-x[2]*y[1]
+        self.n1[1] = x[2]*y[0]-x[0]*y[2]
+        self.n1[2] = x[0]*y[1]-x[1]*y[0]
+        valReal = 1./sqrt(mydot(self.n1, self.n1))
+        self.n1[0] *= valReal
+        self.n1[1] *= valReal
+        self.n1[2] *= valReal
+
+        for j in range(dim):
+            x[j] = simplex2[1, j]-simplex2[0, j]
+        for j in range(dim):
+            y[j] = simplex2[2, j]-simplex2[0, j]
+        self.n2[0] = x[1]*y[2]-x[2]*y[1]
+        self.n2[1] = x[2]*y[0]-x[0]*y[2]
+        self.n2[2] = x[0]*y[1]-x[1]*y[0]
+        valReal = 1./sqrt(mydot(self.n2, self.n2))
+        self.n2[0] *= valReal
+        self.n2[1] *= valReal
+        self.n2[2] *= valReal
+
+        vol = 4.0*vol1*vol2
+        for m in range(qr.num_nodes):
+            normW = 0.
+            for j in range(dim):
+                x[j] = (simplex1[self.perm1[0], j]*qr.nodes[0, m] +
+                        simplex1[self.perm1[1], j]*qr.nodes[1, m] +
+                        simplex1[self.perm1[2], j]*qr.nodes[2, m])
+                y[j] = (simplex2[self.perm2[0], j]*qr.nodes[3, m] +
+                        simplex2[self.perm2[1], j]*qr.nodes[4, m] +
+                        simplex2[self.perm2[2], j]*qr.nodes[5, m])
+                self.w[j] = y[j]-x[j]
+                normW += self.w[j]**2
+
+            normW = 1./sqrt(normW)
+            for j in range(dim):
+                self.w[j] *= normW
+
+            val = qr.weights[m] * self.kernel.evalPtr(dim, &x[0], &y[0])
+            self.temp[m] = val * mydot(self.n2, self.w)
+            self.temp2[m] = - val * mydot(self.n1, self.w)
+
+        contrib[:] = 0.
+        for I in range(PHI.shape[0]):
+            i = self.perm[I]
+            for J in range(I, PHI.shape[0]):
+                j = self.perm[J]
+                if j < i:
+                    k = 2*dofs_per_element*j-(j*(j+1) >> 1) + i
+                else:
+                    k = 2*dofs_per_element*i-(i*(i+1) >> 1) + j
+                if mask[k]:
+                    val = 0.
+                    for m in range(qr.num_nodes):
+                        val += self.temp[m] * PHI[I, m, 0] * PHI[J, m, 1] + self.temp2[m] * PHI[I, m, 1] * PHI[J, m, 0]
+                    contrib[k] = 0.5*val*vol
+
+
+cdef class {SCALAR_label}bem3D_K_prime({SCALAR_label}bem3D):
+    """The local stiffness matrix
+
+    .. math::
+
+       \\int_{K_1}\\int_{K_2} 0.5 * [ u(x) v(y) + u(y) v(x) ] \\gamma(x,y) dy dx
+
+    for the V operator.
+    """
+    def __init__(self,
+                 {SCALAR_label}Kernel kernel,
+                 meshBase mesh,
+                 DoFMap DoFMap,
+                 quad_order_diagonal=None,
+                 target_order=None,
+                 num_dofs=None,
+                 **kwargs):
+        cdef:
+            REAL_t smin, smax
+        super({SCALAR_label}bem3D_K_prime, self).__init__(kernel, mesh, DoFMap, num_dofs, **kwargs)
+
+        # The integrand (excluding the kernel) cancels 2 orders of the singularity within an element.
+        self.singularityCancelationIntegrandWithinElement = 1.
+        # The integrand (excluding the kernel) cancels 2 orders of the
+        # singularity across elements for continuous finite elements.
+        if isinstance(DoFMap, P0_DoFMap):
+            assert self.kernel.max_singularity > -3., "Discontinuous finite elements are not conforming for singularity order {} <= -3.".format(self.kernel.max_singularity)
+            self.singularityCancelationIntegrandAcrossElements = 1.
+        else:
+            self.singularityCancelationIntegrandAcrossElements = 1.
+
+        if target_order is None:
+            # this is the desired local quadrature error
+            # target_order = (2.-s)/self.dim
+            target_order = 0.5
+        self.target_order = target_order
+
+        smax = max(-0.5*(self.kernel.max_singularity+2), 0.)
+        if quad_order_diagonal is None:
+            # measured log(2 rho_2) = 0.43
+            quad_order_diagonal = max(np.ceil((target_order+1.+smax)/(0.43)*abs(np.log(self.hmin/self.H0))), 4)
+            # measured log(2 rho_2) = 0.7
+            quad_order_diagonalV = max(np.ceil((target_order+1.+smax)/(0.7)*abs(np.log(self.hmin/self.H0))), 4)
+        else:
+            quad_order_diagonalV = quad_order_diagonal
+        self.quad_order_diagonal = quad_order_diagonal
+        self.quad_order_diagonalV = quad_order_diagonalV
+
+        if (self.kernel.kernelType != FRACTIONAL) or (not self.kernel.variableOrder):
+            self.getNearQuadRule(COMMON_FACE)
+            self.getNearQuadRule(COMMON_EDGE)
+            self.getNearQuadRule(COMMON_VERTEX)
+
+    cdef void eval(self,
+                   {SCALAR}_t[::1] contrib,
+                   panelType panel,
+                   MASK_t mask=ALL):
+        cdef:
+            INDEX_t k, m, i, j, I, J
+            REAL_t vol, valReal, vol1 = self.vol1, vol2 = self.vol2
+            {SCALAR}_t val
+            REAL_t[:, ::1] simplex1 = self.simplex1
+            REAL_t[:, ::1] simplex2 = self.simplex2
+            quadratureRule qr
+            REAL_t[:, :, ::1] PHI
+            INDEX_t dofs_per_element = self.DoFMap.dofs_per_element
+            INDEX_t dim = 3
+            REAL_t x[3]
+            REAL_t y[3]
+            REAL_t normW
+
+        if panel >= 1:
+            self.eval_distant_bem_K_prime(contrib, panel, mask)
+            return
+        elif panel == COMMON_FACE:
+            qr = self.qrId
+            PHI = self.PHI_id
+        elif panel == COMMON_EDGE:
+            qr = self.qrEdge
+            PHI = self.PHI_edge
+        elif panel == COMMON_VERTEX:
+            qr = self.qrVertex
+            PHI = self.PHI_vertex
+        else:
+            raise NotImplementedError('Unknown panel type: {}'.format(panel))
+
+        for j in range(dim):
+            x[j] = simplex1[1, j]-simplex1[0, j]
+        for j in range(dim):
+            y[j] = simplex1[2, j]-simplex1[0, j]
+        self.n1[0] = x[1]*y[2]-x[2]*y[1]
+        self.n1[1] = x[2]*y[0]-x[0]*y[2]
+        self.n1[2] = x[0]*y[1]-x[1]*y[0]
+        valReal = 1./sqrt(mydot(self.n1, self.n1))
+        self.n1[0] *= valReal
+        self.n1[1] *= valReal
+        self.n1[2] *= valReal
+
+        for j in range(dim):
+            x[j] = simplex2[1, j]-simplex2[0, j]
+        for j in range(dim):
+            y[j] = simplex2[2, j]-simplex2[0, j]
+        self.n2[0] = x[1]*y[2]-x[2]*y[1]
+        self.n2[1] = x[2]*y[0]-x[0]*y[2]
+        self.n2[2] = x[0]*y[1]-x[1]*y[0]
+        valReal = 1./sqrt(mydot(self.n2, self.n2))
+        self.n2[0] *= valReal
+        self.n2[1] *= valReal
+        self.n2[2] *= valReal
+
+        vol = 4.0*vol1*vol2
+        for m in range(qr.num_nodes):
+            normW = 0.
+            for j in range(dim):
+                x[j] = (simplex1[self.perm1[0], j]*qr.nodes[0, m] +
+                        simplex1[self.perm1[1], j]*qr.nodes[1, m] +
+                        simplex1[self.perm1[2], j]*qr.nodes[2, m])
+                y[j] = (simplex2[self.perm2[0], j]*qr.nodes[3, m] +
+                        simplex2[self.perm2[1], j]*qr.nodes[4, m] +
+                        simplex2[self.perm2[2], j]*qr.nodes[5, m])
+                self.w[j] = y[j]-x[j]
+                normW += self.w[j]**2
+
+            normW = 1./sqrt(normW)
+            for j in range(dim):
+                self.w[j] *= normW
+
+            val = qr.weights[m] * self.kernel.evalPtr(dim, &x[0], &y[0])
+            self.temp[m] = - val * mydot(self.n1, self.w)
+            self.temp2[m] = val * mydot(self.n2, self.w)
+
+        contrib[:] = 0.
+        for I in range(PHI.shape[0]):
+            i = self.perm[I]
+            for J in range(I, PHI.shape[0]):
+                j = self.perm[J]
+                if j < i:
+                    k = 2*dofs_per_element*j-(j*(j+1) >> 1) + i
+                else:
+                    k = 2*dofs_per_element*i-(i*(i+1) >> 1) + j
+                if mask[k]:
+                    val = 0.
+                    for m in range(qr.num_nodes):
+                        val += self.temp[m] * PHI[I, m, 0] * PHI[J, m, 1] + self.temp2[m] * PHI[I, m, 1] * PHI[J, m, 0]
+                    contrib[k] = 0.5*val*vol
+
+
+cdef class {SCALAR_label}boundaryIntegralSingleLayer({function_type}):
+    cdef:
+        {SCALAR_label_lc_}fe_vector u
+        {SCALAR_label}twoPointFunction kernel
+        REAL_t[:, ::1] simplex
+        REAL_t[:, ::1] span
+        simplexQuadratureRule qr
+        REAL_t[:, ::1] PHI
+        {SCALAR}_t[::1] fvals
+
+    def __init__(self, {SCALAR_label_lc_}fe_vector u, {SCALAR_label}twoPointFunction kernel):
+        self.u = u
+        self.kernel = kernel
+        dm = u.dm
+        mesh = dm.mesh
+
+        dim = mesh.dim
+        dimManifold = mesh.manifold_dim
+
+        self.simplex = uninitialized((dimManifold+1, dim), dtype=REAL)
+        self.span = uninitialized((dimManifold, dim), dtype=REAL)
+        self.qr = simplexXiaoGimbutas(dim=dim, manifold_dim=dimManifold, order=3)
+
+        self.PHI = uninitialized((dm.dofs_per_element, self.qr.num_nodes), dtype=REAL)
+        for i in range(dm.dofs_per_element):
+            for j in range(self.qr.num_nodes):
+                self.PHI[i, j] = dm.localShapeFunctions[i](np.ascontiguousarray(self.qr.nodes[:, j]))
+
+        self.fvals = uninitialized((self.qr.num_nodes), dtype={SCALAR})
+
+    cdef {SCALAR}_t eval(self, REAL_t[::1] x):
+        cdef:
+            INDEX_t cellNo
+            {SCALAR}_t val = 0.
+            DoFMap dm = self.u.dm
+            meshBase mesh = dm.mesh
+            INDEX_t dim = mesh.dim
+            INDEX_t manifold_dim = mesh.manifold_dim
+            simplexQuadratureRule qr = self.qr
+            INDEX_t num_quad_nodes = qr.num_nodes
+            INDEX_t k, j, I, m
+            REAL_t vol
+            {SCALAR}_t[::1] u = self.u
+
+        for cellNo in range(mesh.num_cells):
+            mesh.getSimplex(cellNo, self.simplex)
+
+            # Calculate volume
+            vol = qr.getSimplexVolume(self.simplex)
+
+            for j in range(num_quad_nodes):
+                qr.tempVec[:] = 0.
+                for k in range(manifold_dim+1):
+                    for m in range(dim):
+                        qr.tempVec[m] += qr.nodes[k, j] * self.simplex[k, m]
+                self.fvals[j] = self.kernel.evalPtr(dim, &x[0], &qr.tempVec[0])
+
+            for k in range(dm.dofs_per_element):
+                I = dm.cell2dof(cellNo, k)
+                if I < 0:
+                    continue
+                for j in range(num_quad_nodes):
+                    val += vol*qr.weights[j]*self.fvals[j]*self.PHI[k, j]*u[I]
+        return val
+
+
+cdef class {SCALAR_label}boundaryIntegralDoubleLayer({function_type}):
+    cdef:
+        {SCALAR_label_lc_}fe_vector u
+        {SCALAR_label}twoPointFunction kernel
+        REAL_t[:, ::1] simplex
+        REAL_t[:, ::1] span
+        simplexQuadratureRule qr
+        REAL_t[:, ::1] PHI
+        {SCALAR}_t[::1] fvals
+        REAL_t[::1] n2, w, x, y
+
+    def __init__(self, {SCALAR_label_lc_}fe_vector u, twoPointFunction kernel):
+        self.u = u
+        self.kernel = kernel
+        dm = u.dm
+        mesh = dm.mesh
+
+        dim = mesh.dim
+        dimManifold = mesh.manifold_dim
+
+        self.simplex = uninitialized((dimManifold+1, dim), dtype=REAL)
+        self.span = uninitialized((dimManifold, dim), dtype=REAL)
+        self.qr = simplexXiaoGimbutas(dim=dim, manifold_dim=dimManifold, order=3)
+
+        self.PHI = uninitialized((dm.dofs_per_element, self.qr.num_nodes), dtype=REAL)
+        for i in range(dm.dofs_per_element):
+            for j in range(self.qr.num_nodes):
+                self.PHI[i, j] = dm.localShapeFunctions[i](np.ascontiguousarray(self.qr.nodes[:, j]))
+
+        self.fvals = uninitialized((self.qr.num_nodes), dtype={SCALAR})
+        self.n2 = uninitialized((dim), dtype=REAL)
+        self.w = uninitialized((dim), dtype=REAL)
+        self.x = uninitialized((dim), dtype=REAL)
+        self.y = uninitialized((dim), dtype=REAL)
+
+    cdef {SCALAR}_t eval(self, REAL_t[::1] x):
+        cdef:
+            INDEX_t cellNo
+            {SCALAR}_t val = 0.
+            DoFMap dm = self.u.dm
+            meshBase mesh = dm.mesh
+            INDEX_t dim = mesh.dim
+            INDEX_t manifold_dim = mesh.manifold_dim
+            simplexQuadratureRule qr = self.qr
+            INDEX_t num_quad_nodes = qr.num_nodes
+            INDEX_t k, j, I, m
+            REAL_t vol, normW
+            {SCALAR}_t[::1] u = self.u
+
+        for cellNo in range(mesh.num_cells):
+            mesh.getSimplex(cellNo, self.simplex)
+
+            if dim == 2:
+                self.n2[0] = self.simplex[1, 1] - self.simplex[0, 1]
+                self.n2[1] = self.simplex[0, 0] - self.simplex[1, 0]
+                normW = 1./sqrt(mydot(self.n2, self.n2))
+                self.n2[0] *= normW
+                self.n2[1] *= normW
+            elif dim == 3:
+                for j in range(dim):
+                    self.x[j] = self.simplex[1, j]-self.simplex[0, j]
+                for j in range(dim):
+                    self.y[j] = self.simplex[2, j]-self.simplex[0, j]
+                self.n2[0] = self.x[1]*self.y[2]-self.x[2]*self.y[1]
+                self.n2[1] = self.x[2]*self.y[0]-self.x[0]*self.y[2]
+                self.n2[2] = self.x[0]*self.y[1]-self.x[1]*self.y[0]
+                normW = 1./sqrt(mydot(self.n2, self.n2))
+                self.n2[0] *= normW
+                self.n2[1] *= normW
+                self.n2[2] *= normW
+
+            # Calculate volume
+            vol = qr.getSimplexVolume(self.simplex)
+
+            for j in range(num_quad_nodes):
+                qr.tempVec[:] = 0.
+                for k in range(manifold_dim+1):
+                    for m in range(dim):
+                        qr.tempVec[m] += qr.nodes[k, j] * self.simplex[k, m]
+                normW = 0.
+                for m in range(dim):
+                    self.w[m] = qr.tempVec[m]-x[m]
+                    normW += self.w[m]**2
+                normW = 1./sqrt(normW)
+                for m in range(dim):
+                    self.w[m] *= normW
+
+                self.fvals[j] = self.kernel.evalPtr(dim, &x[0], &qr.tempVec[0]) * mydot(self.n2, self.w)
+
+            for k in range(dm.dofs_per_element):
+                I = dm.cell2dof(cellNo, k)
+                if I < 0:
+                    continue
+                for j in range(num_quad_nodes):
+                    val += vol*qr.weights[j]*self.fvals[j]*self.PHI[k, j]*u[I]
+        return val
diff --git a/nl/PyNucleus_nl/bitset.pyx b/nl/PyNucleus_nl/bitset.pyx
index 10ca057..82f9472 100644
--- a/nl/PyNucleus_nl/bitset.pyx
+++ b/nl/PyNucleus_nl/bitset.pyx
@@ -9,14 +9,7 @@
 import numpy as np
 from libc.stdlib cimport malloc, realloc, free
 
-include "config.pxi"
-
-IF HAVE_MALLOC_H:
-    cdef extern from "malloc.h" nogil:
-        int malloc_trim(size_t pad)
-ELSE:
-    cdef int malloc_trim(size_t pad):
-        pass
+include "malloc.pxi"
 
 
 cdef str MASK2Str(MASK_t a):
diff --git a/nl/PyNucleus_nl/malloc_linux.pxi b/nl/PyNucleus_nl/malloc_linux.pxi
new file mode 100644
index 0000000..6129a75
--- /dev/null
+++ b/nl/PyNucleus_nl/malloc_linux.pxi
@@ -0,0 +1,9 @@
+###################################################################################
+# Copyright 2021 National Technology & Engineering Solutions of Sandia,           #
+# LLC (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the           #
+# U.S. Government retains certain rights in this software.                        #
+# If you want to use this code, please refer to the README.rst and LICENSE files. #
+###################################################################################
+
+cdef extern from "malloc.h" nogil:
+    int malloc_trim(size_t pad)
diff --git a/nl/PyNucleus_nl/malloc_mac.pxi b/nl/PyNucleus_nl/malloc_mac.pxi
new file mode 100644
index 0000000..b0780f3
--- /dev/null
+++ b/nl/PyNucleus_nl/malloc_mac.pxi
@@ -0,0 +1,9 @@
+###################################################################################
+# Copyright 2021 National Technology & Engineering Solutions of Sandia,           #
+# LLC (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the           #
+# U.S. Government retains certain rights in this software.                        #
+# If you want to use this code, please refer to the README.rst and LICENSE files. #
+###################################################################################
+
+cdef int malloc_trim(size_t pad):
+    pass
diff --git a/nl/PyNucleus_nl/nonlocalAssembly.pxd b/nl/PyNucleus_nl/nonlocalAssembly.pxd
index 5f9f2a2..0b38c35 100644
--- a/nl/PyNucleus_nl/nonlocalAssembly.pxd
+++ b/nl/PyNucleus_nl/nonlocalAssembly.pxd
@@ -45,8 +45,6 @@ from . kernelsCy cimport (Kernel,
                           FractionalKernel)
 
 
-include "config.pxi"
-
 include "nonlocalAssembly_decl_REAL.pxi"
 include "nonlocalAssembly_decl_COMPLEX.pxi"
 
diff --git a/nl/PyNucleus_nl/nonlocalAssembly.pyx b/nl/PyNucleus_nl/nonlocalAssembly.pyx
index 7d776d6..b0fa56d 100644
--- a/nl/PyNucleus_nl/nonlocalAssembly.pyx
+++ b/nl/PyNucleus_nl/nonlocalAssembly.pyx
@@ -9,8 +9,6 @@ from libc.math cimport ceil
 import numpy as np
 cimport numpy as np
 
-include "config.pxi"
-
 from libc.math cimport sin, cos, M_PI as pi
 from libcpp.map cimport map
 from cpython.long cimport PyLong_FromSsize_t
diff --git a/nl/setup.py b/nl/setup.py
index 72bb3b0..8437e43 100644
--- a/nl/setup.py
+++ b/nl/setup.py
@@ -5,7 +5,7 @@
 # If you want to use this code, please refer to the README.rst and LICENSE files. #
 ###################################################################################
 
-from shutil import move
+from shutil import move, copy
 from os import remove
 from importlib.metadata import version
 from distutils.errors import CompileError
@@ -29,10 +29,14 @@
         int malloc_trim(size_t pad)
     """)
     have_malloc_h = True
+    if not (p.hash_file(p.folder+'malloc_linux.pxi') ==
+            p.hash_file(p.folder+'malloc.pxi')):
+        copy(p.folder+'malloc_linux.pxi', p.folder+'malloc.pxi')
 except CompileError as e:
     print('malloc.h not found, error was \"{}\". Depending on the system, this might be normal.'.format(e))
-    have_malloc_h = False
-p.addOption('HAVE_MALLOC_H', 'have_malloc_h', have_malloc_h)
+    if not (p.hash_file(p.folder+'malloc_mac.pxi') ==
+            p.hash_file(p.folder+'malloc.pxi')):
+        copy(p.folder+'malloc_mac.pxi', p.folder+'malloc.pxi')
 p.addOption(None, 'mask_size', 256)
 cython_directives = {'initializedcheck': False,
                      'boundscheck': False,
diff --git a/packageTools/PyNucleus_packageTools/__init__.py b/packageTools/PyNucleus_packageTools/__init__.py
index 67260e0..e4b488b 100644
--- a/packageTools/PyNucleus_packageTools/__init__.py
+++ b/packageTools/PyNucleus_packageTools/__init__.py
@@ -111,7 +111,7 @@ def __init__(self, name, namespace=''):
                          'compiler_c++': 'detect',
                          'mpi': 'openmpi',
                          'threads': 1}
-        self.addOption('USE_OPENMP', 'useOpenMP', False)
+        self.addOption(None, 'useOpenMP', False)
         self.addOption(None, 'gitSHA', self.getGitSHA())
 
     def addOption(self, optionCy, optionPy, default, pkgDependencies=[]):
@@ -378,6 +378,36 @@ def hash_file(self, filename):
         except:
             return
 
+    def emptyFile(self, filename):
+        from shutil import copy
+
+        with open('tempEmpty', 'w') as f:
+            pass
+        if Path(filename).exists():
+            if not (self.hash_file('tempEmpty') ==
+                    self.hash_file(filename)):
+                copy('tempEmpty', filename)
+        else:
+            copy('tempEmpty', filename)
+        Path('tempEmpty').unlink()
+
+    def conditionalCopy(self, target, conditional, source_if_true, source_if_false):
+        from shutil import copy
+
+        if conditional:
+            if not (self.hash_file(source_if_true) ==
+                    self.hash_file(target)):
+                copy(source_if_true,
+                     target)
+        else:
+            if source_if_false is not None:
+                if not (self.hash_file(source_if_false) ==
+                        self.hash_file(target)):
+                    copy(source_if_false,
+                         target)
+            else:
+                self.emptyFile(target)
+
 
 def fillTemplate(basedir, templates, replacements):
     for tmp in templates: