5.3 FETI-DP in NGSolve II: Point-Constraints in 3D

We implement standard FETI-DP, using only point-constraints, for Poisson’s equation in 3D.

[1]:
user_id = 'lukas'
num_procs = '40'
[2]:
from usrmeeting_jupyterstuff import *
[ ]:
stop_cluster(user_id)
[3]:
start_cluster(num_procs,user_id)
connect_cluster(user_id)
could not start cluster, (probably already/still running)
connecting ... try:0 succeeded!
[4]:
%%px
from ngsolve import *
import netgen.meshing as ngmeshing

nref = 0

dim=3
ngmesh = ngmeshing.Mesh(dim=dim)
ngmesh.Load('cube.vol')
for l in range(nref):
    ngmesh.Refine()
mesh = Mesh(ngmesh)
comm = MPI_Init()
fes = H1(mesh, order=2, dirichlet='right|top|top|left')
pardofs = fes.ParallelDofs()
a = BilinearForm(fes)
u,v = fes.TnT()
a += SymbolicBFI(grad(u)*grad(v))
a.Assemble()
f = LinearForm(fes)
f += SymbolicLFI(x*y*v)
f.Assemble()
avg_dof = comm.Sum(fes.ndof) / comm.size
if comm.rank==0:
    print('global,  ndof =', fes.ndofglobal, ', lodofs =', fes.lospace.ndofglobal)
    print('avg DOFs per core: ', avg_dof)
[stdout:16]
global,  ndof = 75403 , lodofs = 10355
avg DOFs per core:  2260.7

Finding the primal DOFs

In 3 dimensions, we cannot classify the primal DOFs by their multiplicity.

Hoever, we can very easily find subdomain faces. Then, we simply define subdomain edges as intersections of faces, and finally subdomain vertices as intersections of edges.

We can do this compactly with generator expressions, or in a readable fashion with for-loops.

[5]:
%%px
pythonic = True
if pythonic:
    faces = [set(d for d in pardofs.Proc2Dof(p) if d<mesh.nv and fes.FreeDofs()[d] ) for p in pardofs.ExchangeProcs()]
    edges = sorted([tuple(sorted(e)) for e in set(tuple(f1.intersection(f2)) for f1 in faces for f2 in faces if f1 is not f2) if len(e)>1])
    vertices = sorted(set([ v for e1 in edges for e2 in edges if e1 is not e2 for v in set(e1).intersection(set(e2)) ]))
else:
    faces = []
    for p in pardofs.ExchangeProcs():
        faces.append(set(d for d in pardofs.Proc2Dof(p) if d<mesh.nv and fes.FreeDofs()[d]))
    edges = []
    for f1 in faces:
        for f2 in faces:
            if f1 is not f2:
                edge = sorted(tuple(f1.intersection(f2)))
                if len(edge) > 1:
                    if not edge in edges:
                        edges.append(sorted(tuple(edge)))
    vertices = set()
    for e1 in edges:
        for e2 in edges:
            if e1 is not e2:
                vs = set(e1).intersection(set(e2))
                vertices = vertices.union(vs)
    vertices = sorted(vertices)

There is one problem left to consider. In order for a DOF to be included in "vertices" on some rank, that rank has to posess two edges that contain it. This is not always the case.

We have to flag a DOF as a vertex if any rank thinks it should be one.

[6]:
%%px
vec = f.vec.CreateVector()
vec.local_vec[:] = 0.0
for v in vertices:
    vec.local_vec[v] = 1
from ngsolve.la import DISTRIBUTED
vec.SetParallelStatus(DISTRIBUTED)
vec.Cumulate()
primal_dofs = BitArray([vec.local_vec[k]!=0 for k in range(fes.ndof)]) & fes.FreeDofs()

nprim = comm.Sum(sum([1 for k in range(fes.ndof) if primal_dofs[k] and comm.rank<pardofs.Dof2Proc(k)[0] ]))
npmin = comm.Min(primal_dofs.NumSet() if comm.rank else nprim)
npavg = comm.Sum(primal_dofs.NumSet())/comm.size
npmax = comm.Max(primal_dofs.NumSet())
if comm.rank==0:
    print('# primal dofs global: ', nprim)
    print('min, avg, max per rank: ', npmin, ' ', npavg, ' ', npmax)
[stdout:16]
# primal dofs global:  122
min, avg, max per rank:  4   12.425   25
[7]:
stop_cluster(user_id)

Everything else works the same - experiment below!!

[8]:
from usrmeeting_jupyterstuff import *
user_id = 'lukas'
num_procs = '100'
stop_cluster(user_id)
start_cluster(num_procs, user_id)
connect_cluster(user_id)
Waiting for connection file: ~/.ipython/profile_ngsolve/security/ipcontroller-koglerlukas-client.json
connecting ... try:6 succeeded!
[9]:
%%px
from ngsolve import *
import netgen.meshing as ngmeshing
from ngsolve.la import ParallelMatrix, FETI_Jump
from dd_toolbox import FindFEV, LocGlobInverse, ScaledMat

def load_mesh(nref=0):
    ngmesh = ngmeshing.Mesh(dim=3)
    ngmesh.Load('cube.vol')
    for l in range(nref):
        ngmesh.Refine()
    return Mesh(ngmesh)

def setup_space(mesh, order=1):
    comm = MPI_Init()
    fes = H1(mesh, order=order, dirichlet='right|top')
    a = BilinearForm(fes)
    u,v = fes.TnT()
    a += SymbolicBFI(grad(u)*grad(v))
    a.Assemble()
    f = LinearForm(fes)
    f += SymbolicLFI(x*y*v)
    f.Assemble()
    avg_dof = comm.Sum(fes.ndof) / comm.size
    if comm.rank==0:
        print('global,  ndof =', fes.ndofglobal, ', lodofs =', fes.lospace.ndofglobal)
        print('avg DOFs per core: ', avg_dof)
    return [fes, a, f]

def setup_FETIDP(fes, a):
    faces, edges, vertices = FindFEV(mesh.dim, mesh.nv, \
                                     fes.ParallelDofs(), fes.FreeDofs())
    primal_dofs = BitArray([ v in set(vertices) for v in range(fes.ndof) ]) & fes.FreeDofs()
    dp_pardofs = fes.ParallelDofs().SubSet(primal_dofs)
    nprim = comm.Sum(sum([1 for k in range(fes.ndof) if primal_dofs[k] and comm.rank<fes.ParallelDofs().Dof2Proc(k)[0] ]))
    if comm.rank==0:
        print('# of global primal dofs: ', nprim)
    A_dp = ParallelMatrix(a.mat.local_mat, dp_pardofs)
    dual_pardofs = fes.ParallelDofs().SubSet(BitArray(~primal_dofs & fes.FreeDofs()))
    B = FETI_Jump(dual_pardofs, u_pardofs=dp_pardofs)
    if comm.rank==0:
        print('# of global multipliers = :', B.col_pardofs.ndofglobal)
    A_dp_inv = LocGlobInverse(A_dp, fes.FreeDofs(),
                              invtype_loc='sparsecholesky',\
                              invtype_glob='masterinverse')
    F = B @ A_dp_inv @ B.T
    innerdofs = BitArray([len(fes.ParallelDofs().Dof2Proc(k))==0 for k in range(fes.ndof)]) & fes.FreeDofs()
    A = a.mat.local_mat
    Aiinv = A.Inverse(innerdofs, inverse='sparsecholesky')
    scaledA = ScaledMat(A, [1.0/(1+len(fes.ParallelDofs().Dof2Proc(k))) for k in range(fes.ndof)])
    scaledBT = ScaledMat(B.T, [1.0/(1+len(fes.ParallelDofs().Dof2Proc(k))) for k in range(fes.ndof)])
    Fhat = B @ A @ (IdentityMatrix() - Aiinv @ A) @ B.T
    Fhat2 = B @ scaledA @ (IdentityMatrix() - Aiinv @ A) @ scaledBT
    return [A_dp, A_dp_inv, F, Fhat, Fhat2, B, scaledA, scaledBT]

def prep(B, Ainv, f):
    rhs.data = (B @ Ainv) * f.vec
    return rhs

def solve(mat, pre, rhs, sol):
    t = comm.WTime()
    solvers.CG(mat=mat, pre=pre, rhs=rhs, sol=sol, \
               maxsteps=100, printrates=comm.rank==0, tol=1e-6)
    return comm.WTime() - t

def post(B, Ainv, gfu, lam):
    hv = B.CreateRowVector()
    hv.data = f.vec - B.T * lam
    gfu.vec.data = Ainv * hv
    jump = lam.CreateVector()
    jump.data = B * gfu.vec
    norm_jump = Norm(jump)
    if comm.rank==0:
        print('\nnorm jump u: ', norm_jump)
[10]:
%%px
comm = MPI_Init()
mesh = load_mesh(nref=1)
fes, a, f = setup_space(mesh, order=2)
A_dp, A_dp_inv, F, Fhat, Fhat2, B, scaledA, scaledBT = setup_FETIDP(fes, a)
rhs = B.CreateColVector()
lam = B.CreateColVector()
prep(B, A_dp_inv, f)
if comm.rank==0:
    print('\nWithout multiplicity scaling:')
t1 = solve(F,  Fhat,  rhs, lam)
if comm.rank==0:
    print('time to solve: ', t1)
    print('\nWith multiplicity scaling:')
t2 = solve(F,  Fhat2,  rhs, lam)
if comm.rank==0:
    print('\ntime solve without scaling: ', t1)
    print('time solve with scaling: ', t2)
gfu = GridFunction(fes)
post(B, A_dp_inv, gfu, lam)
[stdout:26]
global,  ndof = 576877 , lodofs = 75403
avg DOFs per core:  6607.56
# of global primal dofs:  399
# of global multipliers = : 87867

Without multiplicity scaling:
it =  0  err =  1.3828932193544856
it =  1  err =  0.3093351749153988
it =  2  err =  0.18687127840143788
it =  3  err =  0.15982668754855409
it =  4  err =  0.10617262264774008
it =  5  err =  0.08636099359688607
it =  6  err =  0.06534050969841254
it =  7  err =  0.04057663450962983
it =  8  err =  0.029795216503713105
it =  9  err =  0.019191776032195372
it =  10  err =  0.013409312593594812
it =  11  err =  0.010581447400156284
it =  12  err =  0.008496811350847722
it =  13  err =  0.005343556967304746
it =  14  err =  0.004037091662300088
it =  15  err =  0.0031488610707263364
it =  16  err =  0.001955640666077683
it =  17  err =  0.0013252053330392102
it =  18  err =  0.0008350370464745804
it =  19  err =  0.0005929073126342288
it =  20  err =  0.0004345439005536961
it =  21  err =  0.00030177509231099235
it =  22  err =  0.000235125851517741
it =  23  err =  0.0001333039960992988
it =  24  err =  0.00010897281470394863
it =  25  err =  5.7947863178417394e-05
it =  26  err =  5.0436899032635396e-05
it =  27  err =  2.833460785216941e-05
it =  28  err =  2.4503685663038197e-05
it =  29  err =  1.5454634417623603e-05
it =  30  err =  1.1090913251083186e-05
it =  31  err =  7.673508671528517e-06
it =  32  err =  4.6991810256279215e-06
it =  33  err =  3.5467908187332595e-06
it =  34  err =  2.124001011194309e-06
it =  35  err =  1.7687264029974105e-06
time to solve:  0.6880253719864413

With multiplicity scaling:
it =  0  err =  0.5664618015376661
it =  1  err =  0.1224428346506882
it =  2  err =  0.06781240811871787
it =  3  err =  0.05441728733297438
it =  4  err =  0.030540989185688594
it =  5  err =  0.021651861416661526
it =  6  err =  0.015835176581462645
it =  7  err =  0.009189106784938087
it =  8  err =  0.00665121993521837
it =  9  err =  0.004076599182832595
it =  10  err =  0.0025245723947087415
it =  11  err =  0.001596301148038511
it =  12  err =  0.0012419381581669539
it =  13  err =  0.0008223671367012567
it =  14  err =  0.00041055052407066705
it =  15  err =  0.00030352885324448577
it =  16  err =  0.00021292551429473027
it =  17  err =  0.00010535909369377514
it =  18  err =  7.858092529530714e-05
it =  19  err =  4.1265575251836e-05
it =  20  err =  2.8532679697126992e-05
it =  21  err =  1.6665576448624424e-05
it =  22  err =  9.268060980115929e-06
it =  23  err =  7.031947818198997e-06
it =  24  err =  3.1407801205245272e-06
it =  25  err =  2.5144714637266164e-06
it =  26  err =  1.287853499441041e-06
it =  27  err =  8.333768290606212e-07

time solve without scaling:  0.6880253719864413
time solve with scaling:  0.7163726490689442

norm jump u:  9.990814869240256e-06
[11]:
%%px --target 1
for t in sorted(filter(lambda t:t['time']>0.5, Timers()), key=lambda t:t['time'], reverse=True):
    print(t['name'], ':  ', t['time'])
SparseCholesky<d,d,d>::MultAdd :   0.8205492496490479
[12]:
%%px
t_chol = filter(lambda t: t['name'] == 'SparseCholesky<d,d,d>::MultAdd', Timers()).__next__()
maxt = comm.Max(t_chol['time'])
if t_chol['time'] == maxt:
    print('timers from rank ', comm.rank, ':')
    for t in sorted(filter(lambda t:t['time']>min(0.3*maxt, 0.5), Timers()), key=lambda t:t['time'], reverse=True):
        print(t['name'], ':  ', t['time'])
[stdout:31]
timers from rank  13 :
SparseCholesky<d,d,d>::MultAdd :   1.209223747253418
SparseCholesky<d,d,d>::MultAdd fac1 :   0.7094001770019531
SparseCholesky<d,d,d>::MultAdd fac2 :   0.4826033115386963
SparseCholesky - total :   0.41616296768188477
[13]:
stop_cluster(user_id)