diff --git a/Manifest.toml b/Manifest.toml
index bc7658d4dafe2ba2a0edabaa26ee563f862e5f79..c2f9b82fd3b48545f0daf3c735891b2d217a005c 100644
--- a/Manifest.toml
+++ b/Manifest.toml
@@ -1,8 +1,8 @@
 # This file is machine-generated - editing it directly is not advised
 
-julia_version = "1.9.1"
+julia_version = "1.9.3"
 manifest_format = "2.0"
-project_hash = "23b088bdd6df8e8419a0902b8cafd14f76c1ffbf"
+project_hash = "211163242c6c90535eae175929c4423583770490"
 
 [[deps.Adapt]]
 deps = ["LinearAlgebra"]
@@ -14,6 +14,34 @@ version = "3.4.0"
 uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
 version = "1.1.1"
 
+[[deps.ArrayInterface]]
+deps = ["Adapt", "LinearAlgebra", "Requires", "SparseArrays", "SuiteSparse"]
+git-tree-sha1 = "f83ec24f76d4c8f525099b2ac475fc098138ec31"
+uuid = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
+version = "7.4.11"
+
+    [deps.ArrayInterface.extensions]
+    ArrayInterfaceBandedMatricesExt = "BandedMatrices"
+    ArrayInterfaceBlockBandedMatricesExt = "BlockBandedMatrices"
+    ArrayInterfaceCUDAExt = "CUDA"
+    ArrayInterfaceGPUArraysCoreExt = "GPUArraysCore"
+    ArrayInterfaceStaticArraysCoreExt = "StaticArraysCore"
+    ArrayInterfaceTrackerExt = "Tracker"
+
+    [deps.ArrayInterface.weakdeps]
+    BandedMatrices = "aae01518-5342-5314-be14-df237901396f"
+    BlockBandedMatrices = "ffab5731-97b5-5995-9138-79e8c1846df0"
+    CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+    GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
+    StaticArraysCore = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
+    Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c"
+
+[[deps.ArrayInterfaceCore]]
+deps = ["LinearAlgebra", "SnoopPrecompile", "SparseArrays", "SuiteSparse"]
+git-tree-sha1 = "e5f08b5689b1aad068e01751889f2f615c7db36d"
+uuid = "30b0a656-2188-435a-8636-2ec0e6a096e2"
+version = "0.1.29"
+
 [[deps.Artifacts]]
 uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
 
@@ -37,12 +65,24 @@ git-tree-sha1 = "43b1a4a8f797c1cddadf60499a8a077d4af2cd2d"
 uuid = "d1d4a3ce-64b1-5f1a-9ba4-7e7e69966f35"
 version = "0.1.7"
 
+[[deps.BitTwiddlingConvenienceFunctions]]
+deps = ["Static"]
+git-tree-sha1 = "0c5f81f47bbbcf4aea7b2959135713459170798b"
+uuid = "62783981-4cbd-42fc-bca8-16325de8dc4b"
+version = "0.1.5"
+
 [[deps.Bzip2_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
 git-tree-sha1 = "19a35467a82e236ff51bc17a3a44b69ef35185a2"
 uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0"
 version = "1.0.8+0"
 
+[[deps.CPUSummary]]
+deps = ["CpuId", "IfElse", "PrecompileTools", "Static"]
+git-tree-sha1 = "89e0654ed8c7aebad6d5ad235d6242c2d737a928"
+uuid = "2a0fbf3d-bb9c-48f3-b0a9-814d99fd7ab9"
+version = "0.2.3"
+
 [[deps.Cairo_jll]]
 deps = ["Artifacts", "Bzip2_jll", "CompilerSupportLibraries_jll", "Fontconfig_jll", "FreeType2_jll", "Glib_jll", "JLLWrappers", "LZO_jll", "Libdl", "Pixman_jll", "Pkg", "Xorg_libXext_jll", "Xorg_libXrender_jll", "Zlib_jll", "libpng_jll"]
 git-tree-sha1 = "4b859a208b2397a7a623a03449e4636bdb17bcf2"
@@ -61,6 +101,12 @@ git-tree-sha1 = "38f7a08f19d8810338d4f5085211c7dfa5d5bdd8"
 uuid = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
 version = "0.1.4"
 
+[[deps.CloseOpenIntervals]]
+deps = ["Static", "StaticArrayInterface"]
+git-tree-sha1 = "70232f82ffaab9dc52585e0dd043b5e0c6b714f1"
+uuid = "fb6a15b2-703c-40df-9091-08a04967cfa9"
+version = "0.1.12"
+
 [[deps.CodecZlib]]
 deps = ["TranscodingStreams", "Zlib_jll"]
 git-tree-sha1 = "ded953804d019afa9a3f98981d99b33e3db7b6da"
@@ -105,7 +151,7 @@ version = "4.5.0"
 [[deps.CompilerSupportLibraries_jll]]
 deps = ["Artifacts", "Libdl"]
 uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
-version = "1.0.2+0"
+version = "1.0.5+0"
 
 [[deps.ConstructionBase]]
 deps = ["LinearAlgebra"]
@@ -118,6 +164,12 @@ git-tree-sha1 = "d05d9e7b7aedff4e5b51a029dced05cfb6125781"
 uuid = "d38c429a-6771-53c6-b99e-75d170b6e991"
 version = "0.6.2"
 
+[[deps.CpuId]]
+deps = ["Markdown"]
+git-tree-sha1 = "fcbb72b032692610bfbdb15018ac16a36cf2e406"
+uuid = "adafc99b-e345-5852-983c-f28acb93d879"
+version = "0.3.1"
+
 [[deps.Crayons]]
 git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15"
 uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f"
@@ -312,6 +364,17 @@ git-tree-sha1 = "129acf094d168394e80ee1dc4bc06ec835e510a3"
 uuid = "2e76f6c2-a576-52d4-95c1-20adfe4de566"
 version = "2.8.1+1"
 
+[[deps.HostCPUFeatures]]
+deps = ["BitTwiddlingConvenienceFunctions", "IfElse", "Libdl", "Static"]
+git-tree-sha1 = "eb8fed28f4994600e29beef49744639d985a04b2"
+uuid = "3e5b6fbb-0976-4d2c-9146-d79de83f2fb0"
+version = "0.1.16"
+
+[[deps.IfElse]]
+git-tree-sha1 = "debdd00ffef04665ccbb3e150747a77560e8fad1"
+uuid = "615f187c-cbe4-4ef1-ba3b-2fcf58d6d173"
+version = "0.1.1"
+
 [[deps.IniFile]]
 git-tree-sha1 = "f550e6e32074c939295eb5ea6de31849ac2c9625"
 uuid = "83e8ac13-25f8-5344-8a64-a9f2b223428f"
@@ -414,6 +477,12 @@ git-tree-sha1 = "ab9aa169d2160129beb241cb2750ca499b4e90e9"
 uuid = "23fbe1c1-3f47-55db-b15f-69d7ec21a316"
 version = "0.15.17"
 
+[[deps.LayoutPointers]]
+deps = ["ArrayInterface", "LinearAlgebra", "ManualMemory", "SIMDTypes", "Static", "StaticArrayInterface"]
+git-tree-sha1 = "88b8f66b604da079a627b6fb2860d3704a6729a1"
+uuid = "10f19ff3-798f-405d-979b-55457f8fc047"
+version = "0.1.14"
+
 [[deps.LazyArtifacts]]
 deps = ["Artifacts", "Pkg"]
 uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
@@ -513,12 +582,32 @@ git-tree-sha1 = "cedb76b37bc5a6c702ade66be44f831fa23c681e"
 uuid = "e6f89c97-d47a-5376-807f-9c37f3926c36"
 version = "1.0.0"
 
+[[deps.LoopVectorization]]
+deps = ["ArrayInterface", "ArrayInterfaceCore", "CPUSummary", "CloseOpenIntervals", "DocStringExtensions", "HostCPUFeatures", "IfElse", "LayoutPointers", "LinearAlgebra", "OffsetArrays", "PolyesterWeave", "PrecompileTools", "SIMDTypes", "SLEEFPirates", "Static", "StaticArrayInterface", "ThreadingUtilities", "UnPack", "VectorizationBase"]
+git-tree-sha1 = "c88a4afe1703d731b1c4fdf4e3c7e77e3b176ea2"
+uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
+version = "0.12.165"
+
+    [deps.LoopVectorization.extensions]
+    ForwardDiffExt = ["ChainRulesCore", "ForwardDiff"]
+    SpecialFunctionsExt = "SpecialFunctions"
+
+    [deps.LoopVectorization.weakdeps]
+    ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
+    ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+    SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
+
 [[deps.MacroTools]]
 deps = ["Markdown", "Random"]
 git-tree-sha1 = "42324d08725e200c23d4dfb549e0d5d89dede2d2"
 uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
 version = "0.5.10"
 
+[[deps.ManualMemory]]
+git-tree-sha1 = "bcaef4fc7a0cfe2cba636d84cda54b5e4e4ca3cd"
+uuid = "d125e4d3-2237-4719-b19c-fa641b8a4667"
+version = "0.1.8"
+
 [[deps.Markdown]]
 deps = ["Base64"]
 uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
@@ -656,7 +745,7 @@ version = "0.40.1+0"
 [[deps.Pkg]]
 deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
 uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
-version = "1.9.0"
+version = "1.9.2"
 
 [[deps.PlotThemes]]
 deps = ["PlotUtils", "Statistics"]
@@ -676,12 +765,24 @@ git-tree-sha1 = "02ecc6a3427e7edfff1cebcf66c1f93dd77760ec"
 uuid = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 version = "1.38.1"
 
+[[deps.PolyesterWeave]]
+deps = ["BitTwiddlingConvenienceFunctions", "CPUSummary", "IfElse", "Static", "ThreadingUtilities"]
+git-tree-sha1 = "240d7170f5ffdb285f9427b92333c3463bf65bf6"
+uuid = "1d0040c9-8b98-4ee7-8388-3f51789ca0ad"
+version = "0.2.1"
+
 [[deps.Polynomials]]
 deps = ["Intervals", "LinearAlgebra", "MutableArithmetics", "RecipesBase"]
 git-tree-sha1 = "a1f7f4e41404bed760213ca01d7f384319f717a5"
 uuid = "f27b6e38-b328-58d1-80ce-0feddd5e7a45"
 version = "2.0.25"
 
+[[deps.PrecompileTools]]
+deps = ["Preferences"]
+git-tree-sha1 = "03b4c25b43cb84cee5c90aa9b5ea0a78fd848d2f"
+uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
+version = "1.2.0"
+
 [[deps.Preferences]]
 deps = ["TOML"]
 git-tree-sha1 = "47e5f437cc0e7ef2ce8406ce1e7e24d44915f88d"
@@ -767,6 +868,17 @@ version = "2.0.8"
 uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
 version = "0.7.0"
 
+[[deps.SIMDTypes]]
+git-tree-sha1 = "330289636fb8107c5f32088d2741e9fd7a061a5c"
+uuid = "94e857df-77ce-4151-89e5-788b33177be4"
+version = "0.1.0"
+
+[[deps.SLEEFPirates]]
+deps = ["IfElse", "Static", "VectorizationBase"]
+git-tree-sha1 = "4b8586aece42bee682399c4c4aee95446aa5cd19"
+uuid = "476501e8-09a2-5ece-8869-fb82de89a1fa"
+version = "0.6.39"
+
 [[deps.Scratch]]
 deps = ["Dates"]
 git-tree-sha1 = "f94f779c94e58bf9ea243e77a37e16d9de9126bd"
@@ -821,6 +933,23 @@ git-tree-sha1 = "5d65101b2ed17a8862c4c05639c3ddc7f3d791e1"
 uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
 version = "1.8.7"
 
+[[deps.Static]]
+deps = ["IfElse"]
+git-tree-sha1 = "f295e0a1da4ca425659c57441bcb59abb035a4bc"
+uuid = "aedffcd0-7271-4cad-89d0-dc628f76c6d3"
+version = "0.8.8"
+
+[[deps.StaticArrayInterface]]
+deps = ["ArrayInterface", "Compat", "IfElse", "LinearAlgebra", "Requires", "SnoopPrecompile", "SparseArrays", "Static", "SuiteSparse"]
+git-tree-sha1 = "33040351d2403b84afce74dae2e22d3f5b18edcb"
+uuid = "0d7ed370-da01-4f52-bd93-41d350b8b718"
+version = "1.4.0"
+weakdeps = ["OffsetArrays", "StaticArrays"]
+
+    [deps.StaticArrayInterface.extensions]
+    StaticArrayInterfaceOffsetArraysExt = "OffsetArrays"
+    StaticArrayInterfaceStaticArraysExt = "StaticArrays"
+
 [[deps.StaticArrays]]
 deps = ["LinearAlgebra", "Random", "StaticArraysCore", "Statistics"]
 git-tree-sha1 = "6954a456979f23d05085727adb17c4551c19ecd1"
@@ -860,6 +989,10 @@ git-tree-sha1 = "b03a3b745aa49b566f128977a7dd1be8711c5e71"
 uuid = "09ab397b-f2b6-538f-b94a-2f83cf4a842a"
 version = "0.6.14"
 
+[[deps.SuiteSparse]]
+deps = ["Libdl", "LinearAlgebra", "Serialization", "SparseArrays"]
+uuid = "4607b0f0-06f3-5cda-b6b1-a6196a1729e9"
+
 [[deps.SuiteSparse_jll]]
 deps = ["Artifacts", "Libdl", "Pkg", "libblastrampoline_jll"]
 uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c"
@@ -897,6 +1030,12 @@ version = "0.1.1"
 deps = ["InteractiveUtils", "Logging", "Random", "Serialization"]
 uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
+[[deps.ThreadingUtilities]]
+deps = ["ManualMemory"]
+git-tree-sha1 = "eda08f7e9818eb53661b3deb74e3159460dfbc27"
+uuid = "8290d209-cae3-49c0-8002-c8c24d57dab5"
+version = "0.5.2"
+
 [[deps.TimeZones]]
 deps = ["Dates", "Downloads", "InlineStrings", "LazyArtifacts", "Mocking", "Printf", "RecipesBase", "Scratch", "Unicode"]
 git-tree-sha1 = "a92ec4466fc6e3dd704e2668b5e7f24add36d242"
@@ -953,6 +1092,12 @@ git-tree-sha1 = "c2d0db3ef09f1942d08ea455a9e252594be5f3b6"
 uuid = "4004b06d-e244-455f-a6ce-a5f9919cc534"
 version = "1.0.1"
 
+[[deps.VectorizationBase]]
+deps = ["ArrayInterface", "CPUSummary", "HostCPUFeatures", "IfElse", "LayoutPointers", "Libdl", "LinearAlgebra", "SIMDTypes", "Static", "StaticArrayInterface"]
+git-tree-sha1 = "b182207d4af54ac64cbc71797765068fdeff475d"
+uuid = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
+version = "0.21.64"
+
 [[deps.Wayland_jll]]
 deps = ["Artifacts", "Expat_jll", "JLLWrappers", "Libdl", "Libffi_jll", "Pkg", "XML2_jll"]
 git-tree-sha1 = "ed8d92d9774b077c53e1da50fd81a36af3744c1c"
diff --git a/Project.toml b/Project.toml
index 558a4eb6022bd7b3bc1f7229f7bc96b1c0d24b60..5a83654027abe294907e73d571e79e3e540ece1b 100644
--- a/Project.toml
+++ b/Project.toml
@@ -14,6 +14,7 @@ Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59"
 Jacobi = "83f21c0b-4282-5fbc-9e3f-f6da3d2e584c"
 LaTeXStrings = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
 MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
 OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
 Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
diff --git a/src/dg1d.jl b/src/dg1d.jl
index a95cba580c04dbd4e59866f1725e6bf65922a4fe..b93d6495983eaf696f6aee75924d10fb7ebfc3aa 100644
--- a/src/dg1d.jl
+++ b/src/dg1d.jl
@@ -8,6 +8,7 @@ using InteractiveUtils
 using Jacobi
 @reexport using HDF5
 using LinearAlgebra
+using LoopVectorization
 import MacroTools
 using OrderedCollections
 using Plots
diff --git a/src/dg_rhs.jl b/src/dg_rhs.jl
index 06fe3aef1d147bac7eb9b0f6e53d53d2542c7807..c18cc6c3f7a345bbb7bfaed67c9009704d1d7118 100644
--- a/src/dg_rhs.jl
+++ b/src/dg_rhs.jl
@@ -8,12 +8,14 @@ function compute_rhs_weak_form!(rhs, f, nf, mesh::Mesh1d)
   @unpack K = mesh
   shape        = layout(mesh)
   mat_rhs      = vreshape(rhs, shape)
-  mat_f        = vreshape(f, shape)
+  mat_f        = vreshape(f,   shape)
   mul!(mat_rhs, MDM, mat_f)
   nf_lhs       = view(nf, 1:K)
   nf_rhs       = view(nf, K+1:2*K)
-  @. mat_rhs  -= (nf_rhs' * Ml_rhs + #= minus contained in normal vector =# nf_lhs' * Ml_lhs)
-  mat_rhs    .*= invjac
+  @turbo @. begin # factor x10
+    mat_rhs  -= (nf_rhs' * Ml_rhs + #= minus contained in normal vector =# nf_lhs' * Ml_lhs)
+    mat_rhs *= invjac
+  end
   return
 end
 function compute_rhs_weak_form!(rhs, f, s, nf, mesh::Mesh1d)
@@ -28,42 +30,48 @@ function compute_rhs_weak_form!(rhs, fx, fy, nf, mesh::Mesh2d)
   @unpack invM, MDM, Ml_lhs, Ml_rhs, Npts = element
   Ml_down, Ml_up = Ml_lhs, Ml_rhs
   Nx, Ny = Npts, Npts
-  @inbounds for k = 1:n_cells(mesh.tree)
+  Ncs = n_cells(mesh.tree)
+  @inbounds for kk in 1:Ncs
 
     # bulk
-    idxs_bulk = cellindices(mesh, k)
-    v_rhs     = view(rhs,  idxs_bulk)
-    v_fx      = view(fx,   idxs_bulk)
-    v_fy      = view(fy,   idxs_bulk)
-    v_dxdX    = view(dxdX, idxs_bulk)
-    v_dydY    = view(dydY, idxs_bulk)
+    idxs_bulk = flatcellindices(mesh, kk)
+    v_rhs     = rview(rhs,  idxs_bulk, (Nx,Ny))
+    v_fx      = rview(fx,   idxs_bulk, (Nx,Ny))
+    v_fy      = rview(fy,   idxs_bulk, (Nx,Ny))
+    v_dxdX    = rview(dxdX, idxs_bulk, (Nx,Ny))
+    v_dydY    = rview(dydY, idxs_bulk, (Nx,Ny))
 
     # faces
-    idxs_lhs, idxs_rhs, idxs_down, idxs_up = faceindices(mesh, k)
-
+    idxs_lhs, idxs_rhs, idxs_down, idxs_up = faceindices(mesh, kk)
     v_nf_lhs  = view(nf, idxs_lhs)
     v_nf_rhs  = view(nf, idxs_rhs)
     v_nf_down = view(nf, idxs_down)
     v_nf_up   = view(nf, idxs_up)
 
-    # TODO Benchmark and see whether LoopVectorization can used here.
-    @inbounds for j=1:Ny, i=1:Nx
-      rhsij = 0
-      dxdXij = v_dxdX[i,j]
-      dydYij = v_dydY[i,j]
-      for k = 1:Nx
-        rhsij += MDM[i,k] * v_fx[k,j]
+    # atm we can utilize that cartesian grids have constant jacobians and
+    # pull those out of the loop
+    dxdXij = v_dxdX[1,1]
+    dydYij = v_dydY[1,1]
+
+    # notes on @turbo:
+    # - can't use k as index for both inner loops, otherwise gives wrong result
+    # - can't insert x_rhsij *= dxdXij after the k loop,
+    #   cf. https://github.com/JuliaSIMD/LoopVectorization.jl/issues/506
+    @turbo for j in 1:Ny
+      for i in 1:Nx
+        x_rhsij = 0.0
+        for k in 1:Nx
+          x_rhsij += MDM[i,k] * v_fx[k,j]
+        end
+        y_rhsij = 0.0
+        for l in 1:Ny
+          y_rhsij += MDM[j,l] * v_fy[i,l]
+        end
+        bdry_rhsij = (v_nf_rhs[j] * Ml_rhs[i] + v_nf_lhs[j]  * Ml_lhs[i])  * dxdXij +
+                     (v_nf_up[i]  * Ml_up[j]  + v_nf_down[i] * Ml_down[j]) * dydYij
+        v_rhs[i,j] = x_rhsij * dxdXij + y_rhsij * dydYij - bdry_rhsij
       end
-      v_rhs[i,j] = rhsij * dxdXij
-      rhsij = 0
-      for k = 1:Ny
-        rhsij += MDM[j,k] * v_fy[i,k]
-      end
-      v_rhs[i,j] += rhsij * dydYij
-      v_rhs[i,j] -= (v_nf_rhs[j] * Ml_rhs[i] + v_nf_lhs[j]  * Ml_lhs[i])  * dxdXij +
-                    (v_nf_up[i]  * Ml_up[j]  + v_nf_down[i] * Ml_down[j]) * dydYij
     end
-
   end
   return
 end
@@ -73,29 +81,34 @@ function compute_rhs_weak_form!(rhs, fx, fy::Real, nf, mesh::Mesh2d)
   @unpack invM, MDM, Ml_lhs, Ml_rhs, Npts = element
   Ml_down, Ml_up = Ml_lhs, Ml_rhs
   Nx, Ny = Npts, Npts
-  @inbounds for k = 1:n_cells(mesh.tree)
+  Ncs = n_cells(mesh.tree)
+  @inbounds for kk in 1:Ncs
 
     # bulk
-    idxs_bulk = cellindices(mesh, k)
-    v_rhs     = view(rhs,  idxs_bulk)
-    v_fx      = view(fx,   idxs_bulk)
-    v_dxdX    = view(dxdX, idxs_bulk)
+    idxs_bulk = flatcellindices(mesh, kk)
+    v_rhs     = rview(rhs,  idxs_bulk, (Nx,Ny))
+    v_fx      = rview(fx,   idxs_bulk, (Nx,Ny))
+    v_dxdX    = rview(dxdX, idxs_bulk, (Nx,Ny))
 
     # faces
-    idxs_lhs, idxs_rhs, _, _ = faceindices(mesh, k)
+    idxs_lhs, idxs_rhs, _, _ = faceindices(mesh, kk)
 
     v_nf_lhs  = view(nf, idxs_lhs)
     v_nf_rhs  = view(nf, idxs_rhs)
 
-    # TODO Benchmark and see whether LoopVectorization can used here.
-    @inbounds for j=1:Ny, i=1:Nx
-      rhsij = 0
-      dxdXij = v_dxdX[i,j]
-      for k = 1:Nx
-        rhsij += MDM[i,k] * v_fx[k,j]
+    # atm we can utilize that cartesian grids have constant jacobians and
+    # pull those out of the loop
+    dxdXij = v_dxdX[1,1]
+
+    @turbo for j in 1:Ny
+      for i in 1:Nx
+        x_rhsij = 0.0
+        for k in 1:Nx
+          x_rhsij += MDM[i,k] * v_fx[k,j]
+        end
+        bdry_rhsij = (v_nf_rhs[j] * Ml_rhs[i] + v_nf_lhs[j]  * Ml_lhs[i]) * dxdXij
+        v_rhs[i,j] = x_rhsij * dxdXij - bdry_rhsij
       end
-      v_rhs[i,j] = rhsij * dxdXij
-      v_rhs[i,j] -= (v_nf_rhs[j] * Ml_rhs[i] + v_nf_lhs[j]  * Ml_lhs[i])  * dxdXij
     end
 
   end
@@ -107,29 +120,34 @@ function compute_rhs_weak_form!(rhs, fx::Real, fy, nf, mesh::Mesh2d)
   @unpack invM, MDM, Ml_lhs, Ml_rhs, Npts = element
   Ml_down, Ml_up = Ml_lhs, Ml_rhs
   Nx, Ny = Npts, Npts
-  @inbounds for k = 1:n_cells(mesh.tree)
+  Ncs = n_cells(mesh.tree)
+  @inbounds for kk in 1:Ncs
 
     # bulk
-    idxs_bulk = cellindices(mesh, k)
-    v_rhs     = view(rhs,  idxs_bulk)
-    v_fy      = view(fy,   idxs_bulk)
-    v_dydY    = view(dydY, idxs_bulk)
+    idxs_bulk = flatcellindices(mesh, kk)
+    v_rhs     = rview(rhs,  idxs_bulk, (Nx,Ny))
+    v_fy      = rview(fy,   idxs_bulk, (Nx,Ny))
+    v_dydY    = rview(dydY, idxs_bulk, (Nx,Ny))
 
     # faces
-    _, _, idxs_down, idxs_up = faceindices(mesh, k)
+    _, _, idxs_down, idxs_up = faceindices(mesh, kk)
 
     v_nf_down = view(nf, idxs_down)
     v_nf_up   = view(nf, idxs_up)
 
-    # TODO Benchmark and see whether LoopVectorization can used here.
-    @inbounds for j=1:Ny, i=1:Nx
-      rhsij = 0
-      dydYij = v_dydY[i,j]
-      for k = 1:Ny
-        rhsij += MDM[j,k] * v_fy[i,k]
+    # atm we can utilize that cartesian grids have constant jacobians and
+    # pull those out of the loop
+    dydYij = v_dydY[1,1]
+
+    @turbo for j in 1:Ny
+      for i in 1:Nx
+        y_rhsij = 0.0
+        for k in 1:Ny
+          y_rhsij += MDM[j,k] * v_fy[i,k]
+        end
+        bdry_rhsij = (v_nf_up[i]  * Ml_up[j]  + v_nf_down[i] * Ml_down[j]) * dydYij
+        v_rhs[i,j] = y_rhsij * dydYij - bdry_rhsij
       end
-      v_rhs[i,j] += rhsij * dydYij
-      v_rhs[i,j] -= (v_nf_up[i]  * Ml_up[j]  + v_nf_down[i] * Ml_down[j]) * dydYij
     end
 
   end
diff --git a/src/mesh.jl b/src/mesh.jl
index c768d460e20259713a2765b63916088e4aa5f850..7d42bc970819d4fb7163712ed00c6d67382c8904 100644
--- a/src/mesh.jl
+++ b/src/mesh.jl
@@ -509,6 +509,17 @@ end
 eachcell(mesh::AbstractMesh) = CellIterator(mesh)
 eachcell(mesh::AbstractMesh, data) = CellDataIterator(mesh, data)
 
+@inline function flatcellindices(mesh::Mesh1d, idx)
+  Nx = mesh.element.Npts
+  offset = mesh.offsets[idx]
+  return offset+1:offset+Nx
+end
+@inline function flatcellindices(mesh::Mesh2d, idx)
+  Nx = mesh.element.Npts
+  Ny = Nx
+  offset = mesh.offsets[idx]
+  return offset+1:offset+Nx*Ny
+end
 
 @inline function cellindices(mesh, idx)
   Nx = mesh.element.Npts
diff --git a/src/utils.jl b/src/utils.jl
index 84fecc96898abaab9215ae5b8b250290c67047f1..9a1719fd760ec2993cc82d8ceaa92274ad26f3ce 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -564,6 +564,7 @@ end
 # reshape allocates, see https://github.com/JuliaLang/julia/issues/36313
 # however, Base allows to construct a custom reshape that does not allocate
 @inline vreshape(x, sz::Tuple{N,Int64}) where N = reshape(view(x, :), sz)
+@inline rview(x, rng, sz) = reshape(view(x, rng), sz)
 
 
 TODO() = error("Not implemented yet!")