diff --git a/Manifest.toml b/Manifest.toml index bc7658d4dafe2ba2a0edabaa26ee563f862e5f79..c2f9b82fd3b48545f0daf3c735891b2d217a005c 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -1,8 +1,8 @@ # This file is machine-generated - editing it directly is not advised -julia_version = "1.9.1" +julia_version = "1.9.3" manifest_format = "2.0" -project_hash = "23b088bdd6df8e8419a0902b8cafd14f76c1ffbf" +project_hash = "211163242c6c90535eae175929c4423583770490" [[deps.Adapt]] deps = ["LinearAlgebra"] @@ -14,6 +14,34 @@ version = "3.4.0" uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" version = "1.1.1" +[[deps.ArrayInterface]] +deps = ["Adapt", "LinearAlgebra", "Requires", "SparseArrays", "SuiteSparse"] +git-tree-sha1 = "f83ec24f76d4c8f525099b2ac475fc098138ec31" +uuid = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" +version = "7.4.11" + + [deps.ArrayInterface.extensions] + ArrayInterfaceBandedMatricesExt = "BandedMatrices" + ArrayInterfaceBlockBandedMatricesExt = "BlockBandedMatrices" + ArrayInterfaceCUDAExt = "CUDA" + ArrayInterfaceGPUArraysCoreExt = "GPUArraysCore" + ArrayInterfaceStaticArraysCoreExt = "StaticArraysCore" + ArrayInterfaceTrackerExt = "Tracker" + + [deps.ArrayInterface.weakdeps] + BandedMatrices = "aae01518-5342-5314-be14-df237901396f" + BlockBandedMatrices = "ffab5731-97b5-5995-9138-79e8c1846df0" + CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" + GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527" + StaticArraysCore = "1e83bf80-4336-4d27-bf5d-d5a4f845583c" + Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" + +[[deps.ArrayInterfaceCore]] +deps = ["LinearAlgebra", "SnoopPrecompile", "SparseArrays", "SuiteSparse"] +git-tree-sha1 = "e5f08b5689b1aad068e01751889f2f615c7db36d" +uuid = "30b0a656-2188-435a-8636-2ec0e6a096e2" +version = "0.1.29" + [[deps.Artifacts]] uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" @@ -37,12 +65,24 @@ git-tree-sha1 = "43b1a4a8f797c1cddadf60499a8a077d4af2cd2d" uuid = "d1d4a3ce-64b1-5f1a-9ba4-7e7e69966f35" version = "0.1.7" +[[deps.BitTwiddlingConvenienceFunctions]] +deps = ["Static"] +git-tree-sha1 = "0c5f81f47bbbcf4aea7b2959135713459170798b" +uuid = "62783981-4cbd-42fc-bca8-16325de8dc4b" +version = "0.1.5" + [[deps.Bzip2_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] git-tree-sha1 = "19a35467a82e236ff51bc17a3a44b69ef35185a2" uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0" version = "1.0.8+0" +[[deps.CPUSummary]] +deps = ["CpuId", "IfElse", "PrecompileTools", "Static"] +git-tree-sha1 = "89e0654ed8c7aebad6d5ad235d6242c2d737a928" +uuid = "2a0fbf3d-bb9c-48f3-b0a9-814d99fd7ab9" +version = "0.2.3" + [[deps.Cairo_jll]] deps = ["Artifacts", "Bzip2_jll", "CompilerSupportLibraries_jll", "Fontconfig_jll", "FreeType2_jll", "Glib_jll", "JLLWrappers", "LZO_jll", "Libdl", "Pixman_jll", "Pkg", "Xorg_libXext_jll", "Xorg_libXrender_jll", "Zlib_jll", "libpng_jll"] git-tree-sha1 = "4b859a208b2397a7a623a03449e4636bdb17bcf2" @@ -61,6 +101,12 @@ git-tree-sha1 = "38f7a08f19d8810338d4f5085211c7dfa5d5bdd8" uuid = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0" version = "0.1.4" +[[deps.CloseOpenIntervals]] +deps = ["Static", "StaticArrayInterface"] +git-tree-sha1 = "70232f82ffaab9dc52585e0dd043b5e0c6b714f1" +uuid = "fb6a15b2-703c-40df-9091-08a04967cfa9" +version = "0.1.12" + [[deps.CodecZlib]] deps = ["TranscodingStreams", "Zlib_jll"] git-tree-sha1 = "ded953804d019afa9a3f98981d99b33e3db7b6da" @@ -105,7 +151,7 @@ version = "4.5.0" [[deps.CompilerSupportLibraries_jll]] deps = ["Artifacts", "Libdl"] uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" -version = "1.0.2+0" +version = "1.0.5+0" [[deps.ConstructionBase]] deps = ["LinearAlgebra"] @@ -118,6 +164,12 @@ git-tree-sha1 = "d05d9e7b7aedff4e5b51a029dced05cfb6125781" uuid = "d38c429a-6771-53c6-b99e-75d170b6e991" version = "0.6.2" +[[deps.CpuId]] +deps = ["Markdown"] +git-tree-sha1 = "fcbb72b032692610bfbdb15018ac16a36cf2e406" +uuid = "adafc99b-e345-5852-983c-f28acb93d879" +version = "0.3.1" + [[deps.Crayons]] git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15" uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f" @@ -312,6 +364,17 @@ git-tree-sha1 = "129acf094d168394e80ee1dc4bc06ec835e510a3" uuid = "2e76f6c2-a576-52d4-95c1-20adfe4de566" version = "2.8.1+1" +[[deps.HostCPUFeatures]] +deps = ["BitTwiddlingConvenienceFunctions", "IfElse", "Libdl", "Static"] +git-tree-sha1 = "eb8fed28f4994600e29beef49744639d985a04b2" +uuid = "3e5b6fbb-0976-4d2c-9146-d79de83f2fb0" +version = "0.1.16" + +[[deps.IfElse]] +git-tree-sha1 = "debdd00ffef04665ccbb3e150747a77560e8fad1" +uuid = "615f187c-cbe4-4ef1-ba3b-2fcf58d6d173" +version = "0.1.1" + [[deps.IniFile]] git-tree-sha1 = "f550e6e32074c939295eb5ea6de31849ac2c9625" uuid = "83e8ac13-25f8-5344-8a64-a9f2b223428f" @@ -414,6 +477,12 @@ git-tree-sha1 = "ab9aa169d2160129beb241cb2750ca499b4e90e9" uuid = "23fbe1c1-3f47-55db-b15f-69d7ec21a316" version = "0.15.17" +[[deps.LayoutPointers]] +deps = ["ArrayInterface", "LinearAlgebra", "ManualMemory", "SIMDTypes", "Static", "StaticArrayInterface"] +git-tree-sha1 = "88b8f66b604da079a627b6fb2860d3704a6729a1" +uuid = "10f19ff3-798f-405d-979b-55457f8fc047" +version = "0.1.14" + [[deps.LazyArtifacts]] deps = ["Artifacts", "Pkg"] uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" @@ -513,12 +582,32 @@ git-tree-sha1 = "cedb76b37bc5a6c702ade66be44f831fa23c681e" uuid = "e6f89c97-d47a-5376-807f-9c37f3926c36" version = "1.0.0" +[[deps.LoopVectorization]] +deps = ["ArrayInterface", "ArrayInterfaceCore", "CPUSummary", "CloseOpenIntervals", "DocStringExtensions", "HostCPUFeatures", "IfElse", "LayoutPointers", "LinearAlgebra", "OffsetArrays", "PolyesterWeave", "PrecompileTools", "SIMDTypes", "SLEEFPirates", "Static", "StaticArrayInterface", "ThreadingUtilities", "UnPack", "VectorizationBase"] +git-tree-sha1 = "c88a4afe1703d731b1c4fdf4e3c7e77e3b176ea2" +uuid = "bdcacae8-1622-11e9-2a5c-532679323890" +version = "0.12.165" + + [deps.LoopVectorization.extensions] + ForwardDiffExt = ["ChainRulesCore", "ForwardDiff"] + SpecialFunctionsExt = "SpecialFunctions" + + [deps.LoopVectorization.weakdeps] + ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" + ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" + SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" + [[deps.MacroTools]] deps = ["Markdown", "Random"] git-tree-sha1 = "42324d08725e200c23d4dfb549e0d5d89dede2d2" uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" version = "0.5.10" +[[deps.ManualMemory]] +git-tree-sha1 = "bcaef4fc7a0cfe2cba636d84cda54b5e4e4ca3cd" +uuid = "d125e4d3-2237-4719-b19c-fa641b8a4667" +version = "0.1.8" + [[deps.Markdown]] deps = ["Base64"] uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" @@ -656,7 +745,7 @@ version = "0.40.1+0" [[deps.Pkg]] deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" -version = "1.9.0" +version = "1.9.2" [[deps.PlotThemes]] deps = ["PlotUtils", "Statistics"] @@ -676,12 +765,24 @@ git-tree-sha1 = "02ecc6a3427e7edfff1cebcf66c1f93dd77760ec" uuid = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" version = "1.38.1" +[[deps.PolyesterWeave]] +deps = ["BitTwiddlingConvenienceFunctions", "CPUSummary", "IfElse", "Static", "ThreadingUtilities"] +git-tree-sha1 = "240d7170f5ffdb285f9427b92333c3463bf65bf6" +uuid = "1d0040c9-8b98-4ee7-8388-3f51789ca0ad" +version = "0.2.1" + [[deps.Polynomials]] deps = ["Intervals", "LinearAlgebra", "MutableArithmetics", "RecipesBase"] git-tree-sha1 = "a1f7f4e41404bed760213ca01d7f384319f717a5" uuid = "f27b6e38-b328-58d1-80ce-0feddd5e7a45" version = "2.0.25" +[[deps.PrecompileTools]] +deps = ["Preferences"] +git-tree-sha1 = "03b4c25b43cb84cee5c90aa9b5ea0a78fd848d2f" +uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a" +version = "1.2.0" + [[deps.Preferences]] deps = ["TOML"] git-tree-sha1 = "47e5f437cc0e7ef2ce8406ce1e7e24d44915f88d" @@ -767,6 +868,17 @@ version = "2.0.8" uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" version = "0.7.0" +[[deps.SIMDTypes]] +git-tree-sha1 = "330289636fb8107c5f32088d2741e9fd7a061a5c" +uuid = "94e857df-77ce-4151-89e5-788b33177be4" +version = "0.1.0" + +[[deps.SLEEFPirates]] +deps = ["IfElse", "Static", "VectorizationBase"] +git-tree-sha1 = "4b8586aece42bee682399c4c4aee95446aa5cd19" +uuid = "476501e8-09a2-5ece-8869-fb82de89a1fa" +version = "0.6.39" + [[deps.Scratch]] deps = ["Dates"] git-tree-sha1 = "f94f779c94e58bf9ea243e77a37e16d9de9126bd" @@ -821,6 +933,23 @@ git-tree-sha1 = "5d65101b2ed17a8862c4c05639c3ddc7f3d791e1" uuid = "276daf66-3868-5448-9aa4-cd146d93841b" version = "1.8.7" +[[deps.Static]] +deps = ["IfElse"] +git-tree-sha1 = "f295e0a1da4ca425659c57441bcb59abb035a4bc" +uuid = "aedffcd0-7271-4cad-89d0-dc628f76c6d3" +version = "0.8.8" + +[[deps.StaticArrayInterface]] +deps = ["ArrayInterface", "Compat", "IfElse", "LinearAlgebra", "Requires", "SnoopPrecompile", "SparseArrays", "Static", "SuiteSparse"] +git-tree-sha1 = "33040351d2403b84afce74dae2e22d3f5b18edcb" +uuid = "0d7ed370-da01-4f52-bd93-41d350b8b718" +version = "1.4.0" +weakdeps = ["OffsetArrays", "StaticArrays"] + + [deps.StaticArrayInterface.extensions] + StaticArrayInterfaceOffsetArraysExt = "OffsetArrays" + StaticArrayInterfaceStaticArraysExt = "StaticArrays" + [[deps.StaticArrays]] deps = ["LinearAlgebra", "Random", "StaticArraysCore", "Statistics"] git-tree-sha1 = "6954a456979f23d05085727adb17c4551c19ecd1" @@ -860,6 +989,10 @@ git-tree-sha1 = "b03a3b745aa49b566f128977a7dd1be8711c5e71" uuid = "09ab397b-f2b6-538f-b94a-2f83cf4a842a" version = "0.6.14" +[[deps.SuiteSparse]] +deps = ["Libdl", "LinearAlgebra", "Serialization", "SparseArrays"] +uuid = "4607b0f0-06f3-5cda-b6b1-a6196a1729e9" + [[deps.SuiteSparse_jll]] deps = ["Artifacts", "Libdl", "Pkg", "libblastrampoline_jll"] uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c" @@ -897,6 +1030,12 @@ version = "0.1.1" deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +[[deps.ThreadingUtilities]] +deps = ["ManualMemory"] +git-tree-sha1 = "eda08f7e9818eb53661b3deb74e3159460dfbc27" +uuid = "8290d209-cae3-49c0-8002-c8c24d57dab5" +version = "0.5.2" + [[deps.TimeZones]] deps = ["Dates", "Downloads", "InlineStrings", "LazyArtifacts", "Mocking", "Printf", "RecipesBase", "Scratch", "Unicode"] git-tree-sha1 = "a92ec4466fc6e3dd704e2668b5e7f24add36d242" @@ -953,6 +1092,12 @@ git-tree-sha1 = "c2d0db3ef09f1942d08ea455a9e252594be5f3b6" uuid = "4004b06d-e244-455f-a6ce-a5f9919cc534" version = "1.0.1" +[[deps.VectorizationBase]] +deps = ["ArrayInterface", "CPUSummary", "HostCPUFeatures", "IfElse", "LayoutPointers", "Libdl", "LinearAlgebra", "SIMDTypes", "Static", "StaticArrayInterface"] +git-tree-sha1 = "b182207d4af54ac64cbc71797765068fdeff475d" +uuid = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f" +version = "0.21.64" + [[deps.Wayland_jll]] deps = ["Artifacts", "Expat_jll", "JLLWrappers", "Libdl", "Libffi_jll", "Pkg", "XML2_jll"] git-tree-sha1 = "ed8d92d9774b077c53e1da50fd81a36af3744c1c" diff --git a/Project.toml b/Project.toml index 558a4eb6022bd7b3bc1f7229f7bc96b1c0d24b60..5a83654027abe294907e73d571e79e3e540ece1b 100644 --- a/Project.toml +++ b/Project.toml @@ -14,6 +14,7 @@ Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59" Jacobi = "83f21c0b-4282-5fbc-9e3f-f6da3d2e584c" LaTeXStrings = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" diff --git a/src/dg1d.jl b/src/dg1d.jl index a95cba580c04dbd4e59866f1725e6bf65922a4fe..b93d6495983eaf696f6aee75924d10fb7ebfc3aa 100644 --- a/src/dg1d.jl +++ b/src/dg1d.jl @@ -8,6 +8,7 @@ using InteractiveUtils using Jacobi @reexport using HDF5 using LinearAlgebra +using LoopVectorization import MacroTools using OrderedCollections using Plots diff --git a/src/dg_rhs.jl b/src/dg_rhs.jl index 06fe3aef1d147bac7eb9b0f6e53d53d2542c7807..c18cc6c3f7a345bbb7bfaed67c9009704d1d7118 100644 --- a/src/dg_rhs.jl +++ b/src/dg_rhs.jl @@ -8,12 +8,14 @@ function compute_rhs_weak_form!(rhs, f, nf, mesh::Mesh1d) @unpack K = mesh shape = layout(mesh) mat_rhs = vreshape(rhs, shape) - mat_f = vreshape(f, shape) + mat_f = vreshape(f, shape) mul!(mat_rhs, MDM, mat_f) nf_lhs = view(nf, 1:K) nf_rhs = view(nf, K+1:2*K) - @. mat_rhs -= (nf_rhs' * Ml_rhs + #= minus contained in normal vector =# nf_lhs' * Ml_lhs) - mat_rhs .*= invjac + @turbo @. begin # factor x10 + mat_rhs -= (nf_rhs' * Ml_rhs + #= minus contained in normal vector =# nf_lhs' * Ml_lhs) + mat_rhs *= invjac + end return end function compute_rhs_weak_form!(rhs, f, s, nf, mesh::Mesh1d) @@ -28,42 +30,48 @@ function compute_rhs_weak_form!(rhs, fx, fy, nf, mesh::Mesh2d) @unpack invM, MDM, Ml_lhs, Ml_rhs, Npts = element Ml_down, Ml_up = Ml_lhs, Ml_rhs Nx, Ny = Npts, Npts - @inbounds for k = 1:n_cells(mesh.tree) + Ncs = n_cells(mesh.tree) + @inbounds for kk in 1:Ncs # bulk - idxs_bulk = cellindices(mesh, k) - v_rhs = view(rhs, idxs_bulk) - v_fx = view(fx, idxs_bulk) - v_fy = view(fy, idxs_bulk) - v_dxdX = view(dxdX, idxs_bulk) - v_dydY = view(dydY, idxs_bulk) + idxs_bulk = flatcellindices(mesh, kk) + v_rhs = rview(rhs, idxs_bulk, (Nx,Ny)) + v_fx = rview(fx, idxs_bulk, (Nx,Ny)) + v_fy = rview(fy, idxs_bulk, (Nx,Ny)) + v_dxdX = rview(dxdX, idxs_bulk, (Nx,Ny)) + v_dydY = rview(dydY, idxs_bulk, (Nx,Ny)) # faces - idxs_lhs, idxs_rhs, idxs_down, idxs_up = faceindices(mesh, k) - + idxs_lhs, idxs_rhs, idxs_down, idxs_up = faceindices(mesh, kk) v_nf_lhs = view(nf, idxs_lhs) v_nf_rhs = view(nf, idxs_rhs) v_nf_down = view(nf, idxs_down) v_nf_up = view(nf, idxs_up) - # TODO Benchmark and see whether LoopVectorization can used here. - @inbounds for j=1:Ny, i=1:Nx - rhsij = 0 - dxdXij = v_dxdX[i,j] - dydYij = v_dydY[i,j] - for k = 1:Nx - rhsij += MDM[i,k] * v_fx[k,j] + # atm we can utilize that cartesian grids have constant jacobians and + # pull those out of the loop + dxdXij = v_dxdX[1,1] + dydYij = v_dydY[1,1] + + # notes on @turbo: + # - can't use k as index for both inner loops, otherwise gives wrong result + # - can't insert x_rhsij *= dxdXij after the k loop, + # cf. https://github.com/JuliaSIMD/LoopVectorization.jl/issues/506 + @turbo for j in 1:Ny + for i in 1:Nx + x_rhsij = 0.0 + for k in 1:Nx + x_rhsij += MDM[i,k] * v_fx[k,j] + end + y_rhsij = 0.0 + for l in 1:Ny + y_rhsij += MDM[j,l] * v_fy[i,l] + end + bdry_rhsij = (v_nf_rhs[j] * Ml_rhs[i] + v_nf_lhs[j] * Ml_lhs[i]) * dxdXij + + (v_nf_up[i] * Ml_up[j] + v_nf_down[i] * Ml_down[j]) * dydYij + v_rhs[i,j] = x_rhsij * dxdXij + y_rhsij * dydYij - bdry_rhsij end - v_rhs[i,j] = rhsij * dxdXij - rhsij = 0 - for k = 1:Ny - rhsij += MDM[j,k] * v_fy[i,k] - end - v_rhs[i,j] += rhsij * dydYij - v_rhs[i,j] -= (v_nf_rhs[j] * Ml_rhs[i] + v_nf_lhs[j] * Ml_lhs[i]) * dxdXij + - (v_nf_up[i] * Ml_up[j] + v_nf_down[i] * Ml_down[j]) * dydYij end - end return end @@ -73,29 +81,34 @@ function compute_rhs_weak_form!(rhs, fx, fy::Real, nf, mesh::Mesh2d) @unpack invM, MDM, Ml_lhs, Ml_rhs, Npts = element Ml_down, Ml_up = Ml_lhs, Ml_rhs Nx, Ny = Npts, Npts - @inbounds for k = 1:n_cells(mesh.tree) + Ncs = n_cells(mesh.tree) + @inbounds for kk in 1:Ncs # bulk - idxs_bulk = cellindices(mesh, k) - v_rhs = view(rhs, idxs_bulk) - v_fx = view(fx, idxs_bulk) - v_dxdX = view(dxdX, idxs_bulk) + idxs_bulk = flatcellindices(mesh, kk) + v_rhs = rview(rhs, idxs_bulk, (Nx,Ny)) + v_fx = rview(fx, idxs_bulk, (Nx,Ny)) + v_dxdX = rview(dxdX, idxs_bulk, (Nx,Ny)) # faces - idxs_lhs, idxs_rhs, _, _ = faceindices(mesh, k) + idxs_lhs, idxs_rhs, _, _ = faceindices(mesh, kk) v_nf_lhs = view(nf, idxs_lhs) v_nf_rhs = view(nf, idxs_rhs) - # TODO Benchmark and see whether LoopVectorization can used here. - @inbounds for j=1:Ny, i=1:Nx - rhsij = 0 - dxdXij = v_dxdX[i,j] - for k = 1:Nx - rhsij += MDM[i,k] * v_fx[k,j] + # atm we can utilize that cartesian grids have constant jacobians and + # pull those out of the loop + dxdXij = v_dxdX[1,1] + + @turbo for j in 1:Ny + for i in 1:Nx + x_rhsij = 0.0 + for k in 1:Nx + x_rhsij += MDM[i,k] * v_fx[k,j] + end + bdry_rhsij = (v_nf_rhs[j] * Ml_rhs[i] + v_nf_lhs[j] * Ml_lhs[i]) * dxdXij + v_rhs[i,j] = x_rhsij * dxdXij - bdry_rhsij end - v_rhs[i,j] = rhsij * dxdXij - v_rhs[i,j] -= (v_nf_rhs[j] * Ml_rhs[i] + v_nf_lhs[j] * Ml_lhs[i]) * dxdXij end end @@ -107,29 +120,34 @@ function compute_rhs_weak_form!(rhs, fx::Real, fy, nf, mesh::Mesh2d) @unpack invM, MDM, Ml_lhs, Ml_rhs, Npts = element Ml_down, Ml_up = Ml_lhs, Ml_rhs Nx, Ny = Npts, Npts - @inbounds for k = 1:n_cells(mesh.tree) + Ncs = n_cells(mesh.tree) + @inbounds for kk in 1:Ncs # bulk - idxs_bulk = cellindices(mesh, k) - v_rhs = view(rhs, idxs_bulk) - v_fy = view(fy, idxs_bulk) - v_dydY = view(dydY, idxs_bulk) + idxs_bulk = flatcellindices(mesh, kk) + v_rhs = rview(rhs, idxs_bulk, (Nx,Ny)) + v_fy = rview(fy, idxs_bulk, (Nx,Ny)) + v_dydY = rview(dydY, idxs_bulk, (Nx,Ny)) # faces - _, _, idxs_down, idxs_up = faceindices(mesh, k) + _, _, idxs_down, idxs_up = faceindices(mesh, kk) v_nf_down = view(nf, idxs_down) v_nf_up = view(nf, idxs_up) - # TODO Benchmark and see whether LoopVectorization can used here. - @inbounds for j=1:Ny, i=1:Nx - rhsij = 0 - dydYij = v_dydY[i,j] - for k = 1:Ny - rhsij += MDM[j,k] * v_fy[i,k] + # atm we can utilize that cartesian grids have constant jacobians and + # pull those out of the loop + dydYij = v_dydY[1,1] + + @turbo for j in 1:Ny + for i in 1:Nx + y_rhsij = 0.0 + for k in 1:Ny + y_rhsij += MDM[j,k] * v_fy[i,k] + end + bdry_rhsij = (v_nf_up[i] * Ml_up[j] + v_nf_down[i] * Ml_down[j]) * dydYij + v_rhs[i,j] = y_rhsij * dydYij - bdry_rhsij end - v_rhs[i,j] += rhsij * dydYij - v_rhs[i,j] -= (v_nf_up[i] * Ml_up[j] + v_nf_down[i] * Ml_down[j]) * dydYij end end diff --git a/src/mesh.jl b/src/mesh.jl index c768d460e20259713a2765b63916088e4aa5f850..7d42bc970819d4fb7163712ed00c6d67382c8904 100644 --- a/src/mesh.jl +++ b/src/mesh.jl @@ -509,6 +509,17 @@ end eachcell(mesh::AbstractMesh) = CellIterator(mesh) eachcell(mesh::AbstractMesh, data) = CellDataIterator(mesh, data) +@inline function flatcellindices(mesh::Mesh1d, idx) + Nx = mesh.element.Npts + offset = mesh.offsets[idx] + return offset+1:offset+Nx +end +@inline function flatcellindices(mesh::Mesh2d, idx) + Nx = mesh.element.Npts + Ny = Nx + offset = mesh.offsets[idx] + return offset+1:offset+Nx*Ny +end @inline function cellindices(mesh, idx) Nx = mesh.element.Npts diff --git a/src/utils.jl b/src/utils.jl index 84fecc96898abaab9215ae5b8b250290c67047f1..9a1719fd760ec2993cc82d8ceaa92274ad26f3ce 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -564,6 +564,7 @@ end # reshape allocates, see https://github.com/JuliaLang/julia/issues/36313 # however, Base allows to construct a custom reshape that does not allocate @inline vreshape(x, sz::Tuple{N,Int64}) where N = reshape(view(x, :), sz) +@inline rview(x, rng, sz) = reshape(view(x, rng), sz) TODO() = error("Not implemented yet!")