From 625b8bd90db38dc8c63f13e6ec05bd9592754faf Mon Sep 17 00:00:00 2001
From: Florian Atteneder <florian.atteneder@uni-jena.de>
Date: Mon, 4 Sep 2023 23:56:31 +0200
Subject: [PATCH] 'unroll' llf, av_nflux_2d methods to see how much perf can be
 gained over current broadcast_faces_2! impl

---
 src/EulerEq/equation.jl | 286 ++++++++++++++++++++++++++++++++++++++++
 src/EulerEq/rhs.jl      |   6 +-
 2 files changed, 290 insertions(+), 2 deletions(-)

diff --git a/src/EulerEq/equation.jl b/src/EulerEq/equation.jl
index 7e8a3f3c..f65676f6 100644
--- a/src/EulerEq/equation.jl
+++ b/src/EulerEq/equation.jl
@@ -483,3 +483,289 @@ end
 
   @returns nflx_rho, nflx_qx, nflx_qy, nflx_E
 end
+
+
+
+function av_nflux_2d(eq::EulerEquation2d, mesh::Mesh2d)
+
+  @unpack ldg_rho_x, ldg_rho_y, ldg_qx_x, ldg_qx_y, ldg_qy_x, ldg_qy_y,
+          ldg_E_x, ldg_E_y, smoothed_mu = get_static_variables(mesh.cache)
+  @unpack nflx_rho, nflx_qx, nflx_qy, nflx_E, nx, ny = get_bdry_variables(mesh.cache)
+
+  @inbounds for icell in mesh.bulkidxs
+
+    cell_in = mesh.tree.cells[icell]
+    bulk_in = cellindices(mesh, icell)
+    rng_lhs, rng_rhs, rng_down, rng_up = dg1d.faceindices(mesh, icell)
+
+    if dg1d.has_neighbor(cell_in, Cart2d.Xmin)
+      cell_out = cell_in.neighbors[Cart2d.Xmin]
+      bulk_out = cellindices(mesh, cell_out)
+      for (i, fi) in enumerate(rng_lhs)
+        bo = bulk_out[end,i]
+        bi = bulk_in[1,i]
+
+        nxi, nyi = nx[fi], ny[fi]
+
+        lhs_nflx = nxi*ldg_rho_x[bi] + nyi*ldg_rho_y[bi]
+        rhs_nflx = nxi*ldg_rho_x[bo] + nyi*ldg_rho_y[bo]
+        nflx_rho[fi] += 0.5 * (smoothed_mu[bi]*lhs_nflx + smoothed_mu[bo]*rhs_nflx)
+
+        lhs_nflx = nxi*ldg_qx_x[bi] + nyi*ldg_qx_y[bi]
+        rhs_nflx = nxi*ldg_qx_x[bo] + nyi*ldg_qx_y[bo]
+        nflx_qx[fi] += 0.5 * (smoothed_mu[bi]*lhs_nflx + smoothed_mu[bo]*rhs_nflx)
+
+        lhs_nflx = nxi*ldg_qy_x[bi] + nyi*ldg_qy_y[bi]
+        rhs_nflx = nxi*ldg_qy_x[bo] + nyi*ldg_qy_y[bo]
+        nflx_qy[fi] += 0.5 * (smoothed_mu[bi]*lhs_nflx + smoothed_mu[bo]*rhs_nflx)
+
+        lhs_nflx = nxi*ldg_E_x[bi] + nyi*ldg_E_y[bi]
+        rhs_nflx = nxi*ldg_E_x[bo] + nyi*ldg_E_y[bo]
+        nflx_E[fi] += 0.5 * (smoothed_mu[bi]*lhs_nflx + smoothed_mu[bo]*rhs_nflx)
+      end
+    end
+
+    if dg1d.has_neighbor(cell_in, Cart2d.Xmax)
+      cell_out = cell_in.neighbors[Cart2d.Xmax]
+      bulk_out = cellindices(mesh, cell_out)
+      for (i, fi) in enumerate(rng_rhs)
+        bo = bulk_out[1,i]
+        bi = bulk_in[end,i]
+
+        nxi, nyi = nx[fi], ny[fi]
+
+        lhs_nflx = nxi*ldg_rho_x[bi] + nyi*ldg_rho_y[bi]
+        rhs_nflx = nxi*ldg_rho_x[bo] + nyi*ldg_rho_y[bo]
+        nflx_rho[fi] += 0.5 * (smoothed_mu[bi]*lhs_nflx + smoothed_mu[bo]*rhs_nflx)
+
+        lhs_nflx = nxi*ldg_qx_x[bi] + nyi*ldg_qx_y[bi]
+        rhs_nflx = nxi*ldg_qx_x[bo] + nyi*ldg_qx_y[bo]
+        nflx_qx[fi] += 0.5 * (smoothed_mu[bi]*lhs_nflx + smoothed_mu[bo]*rhs_nflx)
+
+        lhs_nflx = nxi*ldg_qy_x[bi] + nyi*ldg_qy_y[bi]
+        rhs_nflx = nxi*ldg_qy_x[bo] + nyi*ldg_qy_y[bo]
+        nflx_qy[fi] += 0.5 * (smoothed_mu[bi]*lhs_nflx + smoothed_mu[bo]*rhs_nflx)
+
+        lhs_nflx = nxi*ldg_E_x[bi] + nyi*ldg_E_y[bi]
+        rhs_nflx = nxi*ldg_E_x[bo] + nyi*ldg_E_y[bo]
+        nflx_E[fi] += 0.5 * (smoothed_mu[bi]*lhs_nflx + smoothed_mu[bo]*rhs_nflx)
+      end
+    end
+
+    if dg1d.has_neighbor(cell_in, Cart2d.Ymin)
+      cell_out = cell_in.neighbors[Cart2d.Ymin]
+      bulk_out = cellindices(mesh, cell_out)
+      for (i, fi) in enumerate(rng_down)
+        bo = bulk_out[i,end]
+        bi = bulk_in[i,1]
+
+        nxi, nyi = nx[fi], ny[fi]
+
+        lhs_nflx = nxi*ldg_rho_x[bi] + nyi*ldg_rho_y[bi]
+        rhs_nflx = nxi*ldg_rho_x[bo] + nyi*ldg_rho_y[bo]
+        nflx_rho[fi] += 0.5 * (smoothed_mu[bi]*lhs_nflx + smoothed_mu[bo]*rhs_nflx)
+
+        lhs_nflx = nxi*ldg_qx_x[bi] + nyi*ldg_qx_y[bi]
+        rhs_nflx = nxi*ldg_qx_x[bo] + nyi*ldg_qx_y[bo]
+        nflx_qx[fi] += 0.5 * (smoothed_mu[bi]*lhs_nflx + smoothed_mu[bo]*rhs_nflx)
+
+        lhs_nflx = nxi*ldg_qy_x[bi] + nyi*ldg_qy_y[bi]
+        rhs_nflx = nxi*ldg_qy_x[bo] + nyi*ldg_qy_y[bo]
+        nflx_qy[fi] += 0.5 * (smoothed_mu[bi]*lhs_nflx + smoothed_mu[bo]*rhs_nflx)
+
+        lhs_nflx = nxi*ldg_E_x[bi] + nyi*ldg_E_y[bi]
+        rhs_nflx = nxi*ldg_E_x[bo] + nyi*ldg_E_y[bo]
+        nflx_E[fi] += 0.5 * (smoothed_mu[bi]*lhs_nflx + smoothed_mu[bo]*rhs_nflx)
+      end
+    end
+
+    if dg1d.has_neighbor(cell_in, Cart2d.Ymax)
+      cell_out = cell_in.neighbors[Cart2d.Ymax]
+      bulk_out = cellindices(mesh, cell_out)
+      for (i, fi) in enumerate(rng_up)
+        bo = bulk_out[i,1]
+        bi = bulk_in[i,end]
+
+        nxi, nyi = nx[fi], ny[fi]
+
+        lhs_nflx = nxi*ldg_rho_x[bi] + nyi*ldg_rho_y[bi]
+        rhs_nflx = nxi*ldg_rho_x[bo] + nyi*ldg_rho_y[bo]
+        nflx_rho[fi] += 0.5 * (smoothed_mu[bi]*lhs_nflx + smoothed_mu[bo]*rhs_nflx)
+
+        lhs_nflx = nxi*ldg_qx_x[bi] + nyi*ldg_qx_y[bi]
+        rhs_nflx = nxi*ldg_qx_x[bo] + nyi*ldg_qx_y[bo]
+        nflx_qx[fi] += 0.5 * (smoothed_mu[bi]*lhs_nflx + smoothed_mu[bo]*rhs_nflx)
+
+        lhs_nflx = nxi*ldg_qy_x[bi] + nyi*ldg_qy_y[bi]
+        rhs_nflx = nxi*ldg_qy_x[bo] + nyi*ldg_qy_y[bo]
+        nflx_qy[fi] += 0.5 * (smoothed_mu[bi]*lhs_nflx + smoothed_mu[bo]*rhs_nflx)
+
+        lhs_nflx = nxi*ldg_E_x[bi] + nyi*ldg_E_y[bi]
+        rhs_nflx = nxi*ldg_E_x[bo] + nyi*ldg_E_y[bo]
+        nflx_E[fi] += 0.5 * (smoothed_mu[bi]*lhs_nflx + smoothed_mu[bo]*rhs_nflx)
+      end
+    end
+  end
+
+end
+
+
+# @with_signature function llf(equation::EulerEquation2d)
+#   @accepts Prefix(lhs), rho, flx_rho_x, flx_rho_y, qx, flx_qx_x, flx_qx_y, qy, flx_qy_y, E, flx_E_x, flx_E_y, v
+#   @accepts Prefix(rhs), rho, flx_rho_x, flx_rho_y, qx, flx_qx_x, flx_qx_y, qy, flx_qy_y, E, flx_E_x, flx_E_y, v
+#   @accepts nx, ny
+#
+#   vmax = absmax(lhs_v, rhs_v)
+#
+#   lhs_nflx = nx*lhs_flx_rho_x + ny*lhs_flx_rho_y
+#   rhs_nflx = nx*rhs_flx_rho_x + ny*rhs_flx_rho_y
+#   nflx_rho = LLF(lhs_nflx, rhs_nflx, lhs_rho, rhs_rho, vmax)
+#
+#   lhs_nflx = nx*lhs_flx_qx_x + ny*lhs_flx_qx_y
+#   rhs_nflx = nx*rhs_flx_qx_x + ny*rhs_flx_qx_y
+#   nflx_qx = LLF(lhs_nflx, rhs_nflx, lhs_qx, rhs_qx, vmax)
+#
+#   lhs_flx_qy_x = lhs_flx_qx_y
+#   rhs_flx_qy_x = rhs_flx_qx_y
+#   lhs_nflx = nx*lhs_flx_qy_x + ny*lhs_flx_qy_y
+#   rhs_nflx = nx*rhs_flx_qy_x + ny*rhs_flx_qy_y
+#   nflx_qy = LLF(lhs_nflx, rhs_nflx, lhs_qy, rhs_qy, vmax)
+#
+#   lhs_nflx = nx*lhs_flx_E_x + ny*lhs_flx_E_y
+#   rhs_nflx = nx*rhs_flx_E_x + ny*rhs_flx_E_y
+#   nflx_E = LLF(lhs_nflx, rhs_nflx, lhs_E, rhs_E, vmax)
+#
+#   @returns nflx_rho, nflx_qx, nflx_qy, nflx_E
+# end
+function llf(equation::EulerEquation2d, mesh::Mesh2d)
+  @unpack rho, qx, qy, E = get_dynamic_variables(mesh.cache)
+  @unpack flx_rho_x, flx_rho_y, flx_qx_x, flx_qx_y, flx_qy_y,
+          flx_E_x, flx_E_y, v = get_static_variables(mesh.cache)
+  @unpack nx, ny, nflx_rho, nflx_qx, nflx_qy, nflx_E = get_bdry_variables(mesh.cache)
+
+  flx_qy_x = flx_qx_y
+
+  @inbounds for icell in mesh.bulkidxs
+
+    cell_in = mesh.tree.cells[icell]
+    bulk_in = cellindices(mesh, icell)
+    rng_lhs, rng_rhs, rng_down, rng_up = dg1d.faceindices(mesh, icell)
+
+    if dg1d.has_neighbor(cell_in, Cart2d.Xmin)
+      cell_out = cell_in.neighbors[Cart2d.Xmin]
+      bulk_out = cellindices(mesh, cell_out)
+      for (i, fi) in enumerate(rng_lhs)
+        bo = bulk_out[end,i]
+        bi = bulk_in[1,i]
+
+        vmax = absmax(v[bi], v[bo])
+
+        nxi, nyi = nx[fi], ny[fi]
+
+        lhs_nflx = nxi*flx_rho_x[bi] + nyi*flx_rho_y[bi]
+        rhs_nflx = nxi*flx_rho_x[bo] + nyi*flx_rho_y[bo]
+        nflx_rho[fi] = LLF(lhs_nflx, rhs_nflx, rho[bi], rho[bo], vmax)
+
+        lhs_nflx = nxi*flx_qx_x[bi] + nyi*flx_qx_y[bi]
+        rhs_nflx = nxi*flx_qx_x[bo] + nyi*flx_qx_y[bo]
+        nflx_qx[fi] = LLF(lhs_nflx, rhs_nflx, qx[bi], qx[bo], vmax)
+
+        lhs_nflx = nxi*flx_qy_x[bi] + nyi*flx_qy_y[bi]
+        rhs_nflx = nxi*flx_qy_x[bo] + nyi*flx_qy_y[bo]
+        nflx_qy[fi] = LLF(lhs_nflx, rhs_nflx, qy[bi], qy[bo], vmax)
+
+        lhs_nflx = nxi*flx_E_x[bi] + nyi*flx_E_y[bi]
+        rhs_nflx = nxi*flx_E_x[bo] + nyi*flx_E_y[bo]
+        nflx_E[fi] = LLF(lhs_nflx, rhs_nflx, E[bi], E[bo], vmax)
+
+      end
+    end
+
+    if dg1d.has_neighbor(cell_in, Cart2d.Xmax)
+      cell_out = cell_in.neighbors[Cart2d.Xmax]
+      bulk_out = cellindices(mesh, cell_out)
+      for (i, fi) in enumerate(rng_rhs)
+        bo = bulk_out[1,i]
+        bi = bulk_in[end,i]
+
+        vmax = absmax(v[bi], v[bo])
+
+        nxi, nyi = nx[fi], ny[fi]
+
+        lhs_nflx = nxi*flx_rho_x[bi] + nyi*flx_rho_y[bi]
+        rhs_nflx = nxi*flx_rho_x[bo] + nyi*flx_rho_y[bo]
+        nflx_rho[fi] = LLF(lhs_nflx, rhs_nflx, rho[bi], rho[bo], vmax)
+
+        lhs_nflx = nxi*flx_qx_x[bi] + nyi*flx_qx_y[bi]
+        rhs_nflx = nxi*flx_qx_x[bo] + nyi*flx_qx_y[bo]
+        nflx_qx[fi] = LLF(lhs_nflx, rhs_nflx, qx[bi], qx[bo], vmax)
+
+        lhs_nflx = nxi*flx_qy_x[bi] + nyi*flx_qy_y[bi]
+        rhs_nflx = nxi*flx_qy_x[bo] + nyi*flx_qy_y[bo]
+        nflx_qy[fi] = LLF(lhs_nflx, rhs_nflx, qy[bi], qy[bo], vmax)
+
+        lhs_nflx = nxi*flx_E_x[bi] + nyi*flx_E_y[bi]
+        rhs_nflx = nxi*flx_E_x[bo] + nyi*flx_E_y[bo]
+        nflx_E[fi] = LLF(lhs_nflx, rhs_nflx, E[bi], E[bo], vmax)
+      end
+    end
+
+    if dg1d.has_neighbor(cell_in, Cart2d.Ymin)
+      cell_out = cell_in.neighbors[Cart2d.Ymin]
+      bulk_out = cellindices(mesh, cell_out)
+      for (i, fi) in enumerate(rng_down)
+        bo = bulk_out[i,end]
+        bi = bulk_in[i,1]
+
+        vmax = absmax(v[bi], v[bo])
+
+        nxi, nyi = nx[fi], ny[fi]
+
+        lhs_nflx = nxi*flx_rho_x[bi] + nyi*flx_rho_y[bi]
+        rhs_nflx = nxi*flx_rho_x[bo] + nyi*flx_rho_y[bo]
+        nflx_rho[fi] = LLF(lhs_nflx, rhs_nflx, rho[bi], rho[bo], vmax)
+
+        lhs_nflx = nxi*flx_qx_x[bi] + nyi*flx_qx_y[bi]
+        rhs_nflx = nxi*flx_qx_x[bo] + nyi*flx_qx_y[bo]
+        nflx_qx[fi] = LLF(lhs_nflx, rhs_nflx, qx[bi], qx[bo], vmax)
+
+        lhs_nflx = nxi*flx_qy_x[bi] + nyi*flx_qy_y[bi]
+        rhs_nflx = nxi*flx_qy_x[bo] + nyi*flx_qy_y[bo]
+        nflx_qy[fi] = LLF(lhs_nflx, rhs_nflx, qy[bi], qy[bo], vmax)
+
+        lhs_nflx = nxi*flx_E_x[bi] + nyi*flx_E_y[bi]
+        rhs_nflx = nxi*flx_E_x[bo] + nyi*flx_E_y[bo]
+        nflx_E[fi] = LLF(lhs_nflx, rhs_nflx, E[bi], E[bo], vmax)
+      end
+    end
+
+    if dg1d.has_neighbor(cell_in, Cart2d.Ymax)
+      cell_out = cell_in.neighbors[Cart2d.Ymax]
+      bulk_out = cellindices(mesh, cell_out)
+      for (i, fi) in enumerate(rng_up)
+        bo = bulk_out[i,1]
+        bi = bulk_in[i,end]
+
+        vmax = absmax(v[bi], v[bo])
+
+        nxi, nyi = nx[fi], ny[fi]
+
+        lhs_nflx = nxi*flx_rho_x[bi] + nyi*flx_rho_y[bi]
+        rhs_nflx = nxi*flx_rho_x[bo] + nyi*flx_rho_y[bo]
+        nflx_rho[fi] = LLF(lhs_nflx, rhs_nflx, rho[bi], rho[bo], vmax)
+
+        lhs_nflx = nxi*flx_qx_x[bi] + nyi*flx_qx_y[bi]
+        rhs_nflx = nxi*flx_qx_x[bo] + nyi*flx_qx_y[bo]
+        nflx_qx[fi] = LLF(lhs_nflx, rhs_nflx, qx[bi], qx[bo], vmax)
+
+        lhs_nflx = nxi*flx_qy_x[bi] + nyi*flx_qy_y[bi]
+        rhs_nflx = nxi*flx_qy_x[bo] + nyi*flx_qy_y[bo]
+        nflx_qy[fi] = LLF(lhs_nflx, rhs_nflx, qy[bi], qy[bo], vmax)
+
+        lhs_nflx = nxi*flx_E_x[bi] + nyi*flx_E_y[bi]
+        rhs_nflx = nxi*flx_E_x[bo] + nyi*flx_E_y[bo]
+        nflx_E[fi] = LLF(lhs_nflx, rhs_nflx, E[bi], E[bo], vmax)
+      end
+    end
+  end
+end
diff --git a/src/EulerEq/rhs.jl b/src/EulerEq/rhs.jl
index d2c5808b..60769016 100644
--- a/src/EulerEq/rhs.jl
+++ b/src/EulerEq/rhs.jl
@@ -248,12 +248,14 @@ function rhs!(env, P::Project2d, hrsc::HRSC.AbstractArtificialViscosity, mesh::M
 
   ## compute rhs of regularized equation: ∂t u + ∂x f + ∂x μ q = 0
   broadcast_volume_2!(flux, equation, mesh)
-  broadcast_faces_2!(llf, equation, mesh)
+  # broadcast_faces_2!(llf, equation, mesh)
+  llf(equation, mesh)
   # TODO Need to revise Bdry conditions
   # broadcast_bdry_2!(bdryllf, equation, mesh)
 
   broadcast_volume_2!(av_flux_2d, equation, mesh)
-  broadcast_faces_2!(av_nflux_2d, equation, mesh)
+  # broadcast_faces_2!(av_nflux_2d, equation, mesh)
+  av_nflux_2d(equation, mesh)
   # TODO don't we also need av_bdrylff here?
 
   compute_rhs_weak_form!(rhs_rho, flx_rho_x, flx_rho_y, nflx_rho, mesh)
-- 
GitLab