diff --git a/bolt/.github/workflows/CI.yml b/bolt/.github/workflows/CI.yml new file mode 100644 index 0000000000000..39b5bafef5e87 --- /dev/null +++ b/bolt/.github/workflows/CI.yml @@ -0,0 +1,29 @@ +name: CI + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + build-bolt: + + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ ubuntu-latest ] + abt: [ yes, no ] + steps: + - uses: actions/checkout@v2 + with: + submodules: 'true' + - name: cmake + run: | + mkdir build + cd build + cmake ../ -DLIBOMP_USE_ARGOBOTS=${{ matrix.abt }} -DOPENMP_ENABLE_WERROR=TRUE + - name: make + run: | + cd build + make -j 2 diff --git a/bolt/.gitignore b/bolt/.gitignore new file mode 100644 index 0000000000000..d4bec15d20574 --- /dev/null +++ b/bolt/.gitignore @@ -0,0 +1,42 @@ +#==============================================================================# +# This file specifies intentionally untracked files that git should ignore. +# See: http://www.kernel.org/pub/software/scm/git/docs/gitignore.html +# +# This file is intentionally different from the output of `git svn show-ignore`, +# as most of those are useless. +#==============================================================================# + +#==============================================================================# +# File extensions to be ignored anywhere in the tree. +#==============================================================================# +# Temp files created by most text editors. +*~ +# Merge files created by git. +*.orig +# Byte compiled python modules. +*.pyc +# vim swap files +.*.sw? +.sw? +#OS X specific files. +.DS_store + +#==============================================================================# +# Explicit files to ignore (only matches one). 
+#==============================================================================# +# Various tag programs +tags +/TAGS +/GPATH +/GRTAGS +/GSYMS +/GTAGS +.gitusers + +#==============================================================================# +# Directories to ignore (do not add trailing '/'s, they skip symlinks). +#==============================================================================# +runtime/exports + +# Nested build directory +/build diff --git a/bolt/.gitmodules b/bolt/.gitmodules new file mode 100644 index 0000000000000..77f211bd4610f --- /dev/null +++ b/bolt/.gitmodules @@ -0,0 +1,3 @@ +[submodule "external/argobots"] + path = external/argobots + url = https://github.com/pmodels/argobots diff --git a/bolt/CHANGES.txt b/bolt/CHANGES.txt new file mode 100644 index 0000000000000..a2e29d837d108 --- /dev/null +++ b/bolt/CHANGES.txt @@ -0,0 +1,50 @@ +=============================================================================== + Changes in 1.0 +=============================================================================== + +- Upgraded to LLVM OpenMP 10.0 +- Upgraded Argobots to 1.0 +- Fixed support for untied tasks +- Added tests for OpenMP task and thread scheduling +- Added support for several platforms, including OSX and POWER9. 
+ +=============================================================================== + Changes in 1.0rc3 +=============================================================================== + +- Upgraded Argobots to 1.0rc2 to solve the TLS-related issue +- Fixed support for scheduler sleep + +=============================================================================== + Changes in 1.0rc2 +=============================================================================== + +- Upgraded to LLVM OpenMP 9.0 +- Improved the performance of nested parallel regions +- Added support for thread affinity + +=============================================================================== + Changes in 1.0rc1 +=============================================================================== + +- Upgraded to LLVM OpenMP 7.0 +- Support task depend and taskloop +- Support OpenMP 4.5 except untied tasks and cancellation +- Argobots updated to the latest version and integrated as a git submodule + +=============================================================================== + Changes in 1.0b1 +=============================================================================== + +- Fixed missing initialization of some global state +- Fixed bugs related to newer Perl versions +- Updated the embedded Argobots version + +=============================================================================== + Changes in 1.0a1 +=============================================================================== + +# The first release of BOLT, which uses Argobots as a threading layer. + +# Support OpenMP 3.1 + diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt new file mode 100644 index 0000000000000..0e32d0988a52d --- /dev/null +++ b/bolt/CMakeLists.txt @@ -0,0 +1,120 @@ +cmake_minimum_required(VERSION 3.13.4) + +# Add cmake directory to search for custom cmake functions. +set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH}) + +# llvm/runtimes/ will set OPENMP_STANDALONE_BUILD. 
+if (OPENMP_STANDALONE_BUILD OR "${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") + set(OPENMP_STANDALONE_BUILD TRUE) + project(bolt C CXX) + + # CMAKE_BUILD_TYPE was not set, default to Release. + if (NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release) + endif() + + # Group common settings. + set(OPENMP_ENABLE_WERROR FALSE CACHE BOOL + "Enable -Werror flags to turn warnings into errors for supporting compilers.") + set(OPENMP_LIBDIR_SUFFIX "" CACHE STRING + "Suffix of lib installation directory, e.g. 64 => lib64") + # Do not use OPENMP_LIBDIR_SUFFIX directly, use OPENMP_INSTALL_LIBDIR. + set(OPENMP_INSTALL_LIBDIR "lib${OPENMP_LIBDIR_SUFFIX}") + + # Group test settings. + set(OPENMP_TEST_C_COMPILER ${CMAKE_C_COMPILER} CACHE STRING + "C compiler to use for testing OpenMP runtime libraries.") + set(OPENMP_TEST_CXX_COMPILER ${CMAKE_CXX_COMPILER} CACHE STRING + "C++ compiler to use for testing OpenMP runtime libraries.") + set(OPENMP_LLVM_TOOLS_DIR "" CACHE PATH "Path to LLVM tools for testing.") +else() + set(OPENMP_ENABLE_WERROR ${LLVM_ENABLE_WERROR}) + set(LIBOMP_USE_BOLT_DEFAULT FALSE CACHE BOOL "Use BOLT as a default LLVM OpenMP?") + if (${LIBOMP_USE_BOLT_DEFAULT}) + # If building in tree, we honor the same install suffix LLVM uses. 
+ set(OPENMP_INSTALL_LIBDIR "lib${LLVM_LIBDIR_SUFFIX}") + else() + # If building in tree, we put BOLT libraries in a special directory + set(OPENMP_INSTALL_LIBDIR "lib${LLVM_LIBDIR_SUFFIX}/bolt") + endif() + # Place libraries in "lib/bolt" + set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_ARCHIVE_OUTPUT_DIRECTORY}/bolt) + set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/bolt) + + if (NOT MSVC) + set(OPENMP_TEST_C_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang) + set(OPENMP_TEST_CXX_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang++) + else() + set(OPENMP_TEST_C_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang.exe) + set(OPENMP_TEST_CXX_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang++.exe) + endif() +endif() + +# Check and set up common compiler flags. +include(config-ix) +include(HandleOpenMPOptions) + +# Set up testing infrastructure. +include(OpenMPTesting) + +set(OPENMP_TEST_FLAGS "" CACHE STRING + "Extra compiler flags to send to the test compiler.") +set(OPENMP_TEST_OPENMP_FLAGS ${OPENMP_TEST_COMPILER_OPENMP_FLAGS} CACHE STRING + "OpenMP compiler flag to use for testing OpenMP runtime libraries.") + +# Build external libraries. +add_subdirectory(external) + +# Build host runtime library. +add_subdirectory(runtime) + + +set(ENABLE_LIBOMPTARGET ON) +# Currently libomptarget cannot be compiled on Windows or MacOS X. +# Since the device plugins are only supported on Linux anyway, +# there is no point in trying to compile libomptarget on other OSes. 
+if (APPLE OR WIN32 OR NOT OPENMP_HAVE_STD_CPP14_FLAG) + set(ENABLE_LIBOMPTARGET OFF) +endif() + +# Attempt to locate LLVM source, required by libomptarget +if (NOT LIBOMPTARGET_LLVM_MAIN_INCLUDE_DIR) + if (LLVM_MAIN_INCLUDE_DIR) + set(LIBOMPTARGET_LLVM_MAIN_INCLUDE_DIR ${LLVM_MAIN_INCLUDE_DIR}) + elseif (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/../llvm/include) + set(LIBOMPTARGET_LLVM_MAIN_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../llvm/include) + endif() +endif() + +if (NOT LIBOMPTARGET_LLVM_MAIN_INCLUDE_DIR) + message(STATUS "Missing definition for LIBOMPTARGET_LLVM_MAIN_INCLUDE_DIR, disabling libomptarget") + set(ENABLE_LIBOMPTARGET OFF) +endif() + +option(OPENMP_ENABLE_LIBOMPTARGET "Enable building libomptarget for offloading." + ${ENABLE_LIBOMPTARGET}) +if (OPENMP_ENABLE_LIBOMPTARGET) + # Check that the library can actually be built. + if (APPLE OR WIN32) + message(FATAL_ERROR "libomptarget cannot be built on Windows and MacOS X!") + elseif (NOT OPENMP_HAVE_STD_CPP14_FLAG) + message(FATAL_ERROR "Host compiler must support C++14 to build libomptarget!") + endif() + + add_subdirectory(libomptarget) +endif() + +set(ENABLE_OMPT_TOOLS ON) +# Currently tools are not tested well on Windows or MacOS X. +if (APPLE OR WIN32) + set(ENABLE_OMPT_TOOLS OFF) +endif() + +option(OPENMP_ENABLE_OMPT_TOOLS "Enable building ompt based tools for OpenMP." + ${ENABLE_OMPT_TOOLS}) +if (OPENMP_ENABLE_OMPT_TOOLS) + add_subdirectory(tools) +endif() + +# Now that we have seen all testsuites, create the check-openmp target. +construct_check_openmp_target() diff --git a/bolt/CREDITS.txt b/bolt/CREDITS.txt new file mode 100644 index 0000000000000..ede45b10fea25 --- /dev/null +++ b/bolt/CREDITS.txt @@ -0,0 +1,65 @@ +This file is a partial list of people who have contributed to the LLVM/openmp +project. If you have contributed a patch or made some other contribution to +LLVM/openmp, please submit a patch to this file to add yourself, and it will be +done! 
+ +The list is sorted by surname and formatted to allow easy grepping and +beautification by scripts. The fields are: name (N), email (E), web-address +(W), PGP key ID and fingerprint (P), description (D), and snail-mail address +(S). + +N: Adam Azarchs +W: 10xgenomics.com +D: Bug fix for lock code + +N: Carlo Bertolli +W: http://ibm.com +D: IBM contributor to PowerPC support in CMake files and elsewhere. + +N: Diego Caballero +E: diego.l.caballero@gmail.com +D: Fork performance improvements + +N: Sunita Chandrasekaran +D: Contributor to testsuite from OpenUH + +N: Barbara Chapman +D: Contributor to testsuite from OpenUH + +N: University of Houston +W: http://web.cs.uh.edu/~openuh/download/ +D: OpenUH test suite + +N: Intel Corporation OpenMP runtime team +W: http://openmprtl.org +D: Created the runtime. + +N: John Mellor-Crummey and other members of the OpenMP Tools Working Group +E: johnmc@rice.edu +D: OpenMP Tools Interface (OMPT) + +N: Matthias Muller +D: Contributor to testsuite from OpenUH + +N: Tal Nevo +E: tal@scalemp.com +D: ScaleMP contributor to improve runtime performance there. +W: http://scalemp.com + +N: Pavel Neytchev +D: Contributor to testsuite from OpenUH + +N: Steven Noonan +E: steven@uplinklabs.net +D: Patches for the ARM architecture and removal of several inconsistencies. + +N: Joachim Protze +E: protze@itc.rwth-aachen.de +D: OpenMP Tools Interface, Archer tool + +N: Alp Toker +E: alp@nuanti.com +D: Making build work for FreeBSD. + +N: Cheng Wang +D: Contributor to testsuite from OpenUH diff --git a/bolt/LICENSE.txt b/bolt/LICENSE.txt new file mode 100644 index 0000000000000..2153a1bab8f52 --- /dev/null +++ b/bolt/LICENSE.txt @@ -0,0 +1,397 @@ +============================================================================== +BOLT is a derivative of the Intel OpenMP runtime. The original pieces of the +code from the Intel OpenMP runtime are copyrighted to Intel, and the pieces +modified for BOLT are copyrighted to UChicago Argonne, LLC. 
+============================================================================== + Copyright (c) 2016, UChicago Argonne, LLC + All Rights Reserved + BOLT: OpenMP over Lightweight Threads, SF-16-140 + OPEN SOURCE LICENSE +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. Software changes, + modifications, or derivative works, should be noted with comments and the + author and organization's name. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. +3. Neither the names of UChicago Argonne, LLC or the Department of Energy nor + the names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. +4. The software and the end-user documentation included with the + redistribution, if any, must include the following acknowledgment: + "This product includes software produced by UChicago Argonne, LLC under + Contract No. DE-AC02-06CH11357 with the Department of Energy." +****************************************************************************** + DISCLAIMER + THE SOFTWARE IS SUPPLIED "AS IS" WITHOUT WARRANTY OF ANY KIND. +NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT OF +ENERGY, NOR UCHICAGO ARGONNE, LLC, NOR ANY OF THEIR EMPLOYEES, MAKES ANY +WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY +FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, DATA, +APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT +INFRINGE PRIVATELY OWNED RIGHTS. 
+****************************************************************************** + +============================================================================== +The LLVM Project is under the Apache License v2.0 with LLVM Exceptions: +============================================================================== + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +---- LLVM Exceptions to the Apache 2.0 License ---- + +As an exception, if, as a result of your compiling your source code, portions +of this Software are embedded into an Object form of such source code, you +may redistribute such embedded portions in such Object form without complying +with the conditions of Sections 4(a), 4(b) and 4(d) of the License. 
+ +In addition, if you combine or link compiled forms of this Software with +software that is licensed under the GPLv2 ("Combined Software") and if a +court of competent jurisdiction determines that the patent provision (Section +3), the indemnity provision (Section 9) or other Section of the License +conflicts with the conditions of the GPLv2, you may retroactively and +prospectively choose to deem waived or otherwise exclude such Section(s) of +the License, but only in their entirety and only with respect to the Combined +Software. + +============================================================================== +Software from third parties included in the LLVM Project: +============================================================================== +The LLVM Project contains third party software which is under different license +terms. All such code will be identified clearly using at least one of two +mechanisms: +1) It will be in a separate directory tree with its own `LICENSE.txt` or + `LICENSE` file at the top containing the specific license and restrictions + which apply to that software, or +2) It will contain specific license and restriction terms at the top of every + file. + +============================================================================== +Legacy LLVM License (https://llvm.org/docs/DeveloperPolicy.html#legacy): +============================================================================== + +The software contained in this directory tree is dual licensed under both the +University of Illinois "BSD-Like" license and the MIT license. As a user of +this code you may choose to use it under either license. As a contributor, +you agree to allow your code to be used under both. The full text of the +relevant licenses is included below. + +In addition, a license agreement from the copyright/patent holders of the +software contained in this directory tree is included below. 
+ +============================================================================== + +University of Illinois/NCSA +Open Source License + +Copyright (c) 1997-2019 Intel Corporation + +All rights reserved. + +Developed by: + OpenMP Runtime Team + Intel Corporation + http://www.openmprtl.org + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal with +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimers. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimers in the + documentation and/or other materials provided with the distribution. + + * Neither the names of Intel Corporation OpenMP Runtime Team nor the + names of its contributors may be used to endorse or promote products + derived from this Software without specific prior written permission. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE +SOFTWARE. 
+ +============================================================================== + +Copyright (c) 1997-2019 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + +============================================================================== + +Intel Corporation + +Software Grant License Agreement ("Agreement") + +Except for the license granted herein to you, Intel Corporation ("Intel") reserves +all right, title, and interest in and to the Software (defined below). + +Definition + +"Software" means the code and documentation as well as any original work of +authorship, including any modifications or additions to an existing work, that +is intentionally submitted by Intel to llvm.org (http://llvm.org) ("LLVM") for +inclusion in, or documentation of, any of the products owned or managed by LLVM +(the "Work"). 
For the purposes of this definition, "submitted" means any form of +electronic, verbal, or written communication sent to LLVM or its +representatives, including but not limited to communication on electronic +mailing lists, source code control systems, and issue tracking systems that are +managed by, or on behalf of, LLVM for the purpose of discussing and improving +the Work, but excluding communication that is conspicuously marked otherwise. + +1. Grant of Copyright License. Subject to the terms and conditions of this + Agreement, Intel hereby grants to you and to recipients of the Software + distributed by LLVM a perpetual, worldwide, non-exclusive, no-charge, + royalty-free, irrevocable copyright license to reproduce, prepare derivative + works of, publicly display, publicly perform, sublicense, and distribute the + Software and such derivative works. + +2. Grant of Patent License. Subject to the terms and conditions of this + Agreement, Intel hereby grants you and to recipients of the Software + distributed by LLVM a perpetual, worldwide, non-exclusive, no-charge, + royalty-free, irrevocable (except as stated in this section) patent license + to make, have made, use, offer to sell, sell, import, and otherwise transfer + the Work, where such license applies only to those patent claims licensable + by Intel that are necessarily infringed by Intel's Software alone or by + combination of the Software with the Work to which such Software was + submitted. If any entity institutes patent litigation against Intel or any + other entity (including a cross-claim or counterclaim in a lawsuit) alleging + that Intel's Software, or the Work to which Intel has contributed constitutes + direct or contributory patent infringement, then any patent licenses granted + to that entity under this Agreement for the Software or Work shall terminate + as of the date such litigation is filed. 
+ +Unless required by applicable law or agreed to in writing, the software is +provided on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +either express or implied, including, without limitation, any warranties or +conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A +PARTICULAR PURPOSE. + +============================================================================== diff --git a/bolt/README.md b/bolt/README.md new file mode 100644 index 0000000000000..6776e77109468 --- /dev/null +++ b/bolt/README.md @@ -0,0 +1,263 @@ +# BOLT: OpenMP over Lightweight Threads + +BOLT targets a high-performing OpenMP implementation, especially specialized +for fine-grain parallelism. BOLT utilizes a lightweight threading model for +its underlying threading mechanism. It currently adopts Argobots, a new +holistic, low-level threading and tasking runtime, in order to overcome +shortcomings of conventional OS-level threads. The current BOLT implementation +is based on the OpenMP runtime in LLVM, and thus it can be used with +LLVM/Clang, Intel OpenMP compiler, and GCC. More information about BOLT can be +found at http://www.bolt-omp.org. + + +1. Getting Started +2. Testing BOLT +3. BOLT-Specific Environmental Variables +4. Reporting Problems +5. Alternate Build Options + + +------------------------------------------------------------------------------- + +1. Getting Started +================== + +The following instructions take you through a sequence of steps to get the +default configuration of BOLT up and running. + +Henceforth, VERSION indicates the version number of the release tarball. + +(a) You will need the following prerequisites. 
+ + - REQUIRED: This tar file bolt-VERSION.tar.gz + + - REQUIRED: C and C++ compilers (gcc and g++ are sufficient) + + - REQUIRED: CMake (http://www.cmake.org/download) + + - OPTIONAL: Argobots (http://www.argobots.org) + The BOLT release tarball includes the Argobots source code, and + thus you can build BOLT together with the built-in Argobots. + Of course, you can use your own Argobots build instead of the + accompanied one. In the latter case, we assume Argobots has + been installed in /home/USERNAME/argobots-install. + + Also, you need to know what shell you are using since different shell has + different command syntax. Command "echo $SHELL" prints out the current shell + used by your terminal program. + + Note: if you obtained BOLT via github, the following commands download the + built-in Argobots from the Argobots repository. + + git submodule init + git submodule update + +(b) Unpack the tar file and create a build directory: + + tar xzf bolt-VERSION.tar.gz + mkdir bolt-build + cd bolt-build + + If your tar doesn't accept the z option, use + + gunzip bolt-VERSION.tar.gz + tar xf bolt-VERSION.tar + mkdir bolt-build + cd bolt-build + +(c) Choose an installation directory, say /home/USERNAME/bolt-install, which is +assumed to be non-existent or empty. 
+ +(d) Configure BOLT specifying the installation directory: + + If you want to use the built-in Argobots, + + for csh and tcsh: + + cmake ../bolt-VERSION -G "Unix Makefiles" \ + -DCMAKE_INSTALL_PREFIX=/home/USERNAME/bolt-install \ + -DCMAKE_C_COMPILER=<C compiler> \ + -DCMAKE_CXX_COMPILER=<C++ compiler> \ + -DOPENMP_TEST_C_COMPILER=<C compiler for testing> \ + -DOPENMP_TEST_CXX_COMPILER=<C++ compiler for testing> \ + -DCMAKE_BUILD_TYPE=Release \ + -DLIBOMP_USE_ARGOBOTS=on \ + |& tee c.txt + + for bash and sh: + + cmake ../bolt-VERSION -G "Unix Makefiles" \ + -DCMAKE_INSTALL_PREFIX=/home/USERNAME/bolt-install \ + -DCMAKE_C_COMPILER=<C compiler> \ + -DCMAKE_CXX_COMPILER=<C++ compiler> \ + -DOPENMP_TEST_C_COMPILER=<C compiler for testing> \ + -DOPENMP_TEST_CXX_COMPILER=<C++ compiler for testing> \ + -DCMAKE_BUILD_TYPE=Release \ + -DLIBOMP_USE_ARGOBOTS=on \ + 2>&1 | tee c.txt + + If you want to use your own Argobots build, + + for csh and tcsh: + + cmake ../bolt-VERSION -G "Unix Makefiles" \ + -DCMAKE_INSTALL_PREFIX=/home/USERNAME/bolt-install \ + -DCMAKE_C_COMPILER=<C compiler> \ + -DCMAKE_CXX_COMPILER=<C++ compiler> \ + -DOPENMP_TEST_C_COMPILER=<C compiler for testing> \ + -DOPENMP_TEST_CXX_COMPILER=<C++ compiler for testing> \ + -DCMAKE_BUILD_TYPE=Release \ + -DLIBOMP_USE_ARGOBOTS=on \ + -DLIBOMP_ARGOBOTS_INSTALL_DIR=/home/USERNAME/argobots-install \ + |& tee c.txt + + for bash and sh: + + cmake ../bolt-VERSION -G "Unix Makefiles" \ + -DCMAKE_INSTALL_PREFIX=/home/USERNAME/bolt-install \ + -DCMAKE_C_COMPILER=<C compiler> \ + -DCMAKE_CXX_COMPILER=<C++ compiler> \ + -DOPENMP_TEST_C_COMPILER=<C compiler for testing> \ + -DOPENMP_TEST_CXX_COMPILER=<C++ compiler for testing> \ + -DCMAKE_BUILD_TYPE=Release \ + -DLIBOMP_USE_ARGOBOTS=on \ + -DLIBOMP_ARGOBOTS_INSTALL_DIR=/home/USERNAME/argobots-install \ + 2>&1 | tee c.txt + + Bourne-like shells, sh and bash, accept "2>&1 |". Csh-like shells, csh and + tcsh, accept "|&". If a failure occurs, the cmake command will display the + error. Most errors are straightforward to follow. + +(e) Build BOLT: + + for csh and tcsh: + + make |& tee m.txt + + for bash and sh: + + make 2>&1 | tee m.txt + + This step should succeed if there were no problems with the preceding step. + Check file m.txt.
If there were problems, do a "make clean" and then run + make again with V=1 and VERBOSE=1. + + make V=1 VERBOSE=1 |& tee m.txt (for csh and tcsh) + + OR + + make V=1 VERBOSE=1 2>&1 | tee m.txt (for bash and sh) + + Then go to Section 4 below, for reporting the issue to the BOLT developers and + other users. + +(f) Install BOLT: + + for csh and tcsh: + + make install |& tee mi.txt + + for bash and sh: + + make install 2>&1 | tee mi.txt + + This step collects all required header and library files in the directory + specified by the prefix argument to cmake. + +------------------------------------------------------------------------------- + +2. Testing BOLT +=============== + +To test BOLT, you can run the test suite. Compilers for testing must be +specified when you run cmake. + +For example, if llvm-lit is installed: + + cd bolt-build + NUM_PARALLEL_TESTS=16 + llvm-lit runtime/test -v -j $NUM_PARALLEL_TESTS --timeout 600 + +If you run into any problems on running the test suite, please follow Section 4 +below for reporting them to the BOLT developers and other users. + +------------------------------------------------------------------------------- + +3. BOLT-Specific Environmental Variables +=============== + +BOLT reveals several environmental variables specific to BOLT. + + KMP_ABT_NUM_ESS=<int>: Set the number of execution streams which are + running on OS-level threads (e.g., Pthreads). + KMP_ABT_SCHED_SLEEP=<1|0>: If it is set to 1, sleep a scheduler when the + associated pools are empty. + KMP_ABT_VERBOSE=<1|0>: If it is set to 1, print all the BOLT-specific + parameters on runtime initialization. + KMP_ABT_FORK_CUTOFF=<int>: Set the cut-off threshold used for a + divide-and-conquer thread creation. + KMP_ABT_FORK_NUM_WAYS=<int>: Set the number of ways for a + divide-and-conquer thread creation. + KMP_ABT_SCHED_MIN_SLEEP_NSEC=<int>: Set the minimum scheduler sleep time + (nanoseconds). + KMP_ABT_SCHED_MAX_SLEEP_NSEC=<int>: Set the maximum scheduler sleep time + (nanoseconds).
+ KMP_ABT_SCHED_EVENT_FREQ=<int>: Set the event-checking frequency of + schedulers. + KMP_ABT_WORK_STEAL_FREQ=<int>: Set the random work stealing frequency of + schedulers. + +------------------------------------------------------------------------------- + +4. Reporting Problems +===================== + +If you have problems with the installation or usage of BOLT, please follow +these steps: + +(a) First visit the Frequently Asked Questions (FAQ) page at +https://github.com/pmodels/bolt/wiki/FAQ +to see if the problem you are facing has a simple solution. + +(b) If you cannot find an answer on the FAQ page, look through previous +email threads on the discuss@bolt-omp.org mailing list archive +(https://lists.bolt-omp.org/mailman/listinfo/discuss). It is likely +someone else had a similar problem, which has already been resolved +before. + +(c) If neither of the above steps works, please send an email to +discuss@bolt-omp.org. You need to subscribe to this list +(https://lists.bolt-omp.org/mailman/listinfo/discuss) before sending +an email. + +Your email should contain the following files. ONCE AGAIN, PLEASE +COMPRESS BEFORE SENDING, AS THE FILES CAN BE LARGE. Note that, +depending on the step at which the build failed, some of the files might not +exist. + + bolt-build/c.txt (generated in step 1(d) above) + bolt-build/m.txt (generated in step 1(e) above) + bolt-build/mi.txt (generated in step 1(f) above) + + DID WE MENTION? DO NOT FORGET TO COMPRESS THESE FILES! + +Finally, please include the actual error you are seeing when running +the application. If possible, please try to reproduce the error with +a smaller application or benchmark and send that along in your bug +report. + +(d) If you have found a bug in BOLT, we request that you report it +at our github issues page (https://github.com/pmodels/bolt/issues). +Even if you believe you have found a bug, we recommend sending an +email to discuss@bolt-omp.org first.
+ +------------------------------------------------------------------------------- + +5. Alternate Build Options +============================== + +BOLT is based on the OpenMP subproject of LLVM for runtime, and thus it uses +the same build options provided in LLVM. + +Please visit http://openmp.llvm.org/ for more build options. + diff --git a/bolt/README.rst b/bolt/README.rst new file mode 100644 index 0000000000000..e46ed1a1a294b --- /dev/null +++ b/bolt/README.rst @@ -0,0 +1,340 @@ +======================================== +How to Build the LLVM* OpenMP* Libraries +======================================== +This repository requires `CMake <https://cmake.org/>`_ v2.8.0 or later. LLVM +and Clang need a more recent version which also applies for in-tree builds. For +more information than available in this document please see +`LLVM's CMake documentation <https://llvm.org/docs/CMake.html>`_ and the +`official documentation <https://cmake.org/documentation/>`_. + +.. contents:: + :local: + +How to Call CMake Initially, then Repeatedly +============================================ +- When calling CMake for the first time, all needed compiler options must be + specified on the command line. After this initial call to CMake, the compiler + definitions must not be included for further calls to CMake. Other options + can be specified on the command line multiple times including all definitions + in the build options section below. +- Example of configuring, building, reconfiguring, rebuilding: + + .. code-block:: console + + $ mkdir build + $ cd build + $ cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ .. # Initial configuration + $ make + ... + $ make clean + $ cmake -DCMAKE_BUILD_TYPE=Debug .. # Second configuration + $ make + ... + $ rm -rf * + $ cmake -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ .. # Third configuration + $ make + +- Notice in the example how the compiler definitions are only specified for an + empty build directory, but other build options are used at any time.
+- The file ``CMakeCache.txt`` which is created after the first call to CMake is + a configuration file which holds all values for the build options. These + values can be changed using a text editor to modify ``CMakeCache.txt`` as + opposed to using definitions on the command line. +- To have CMake create a particular type of build generator file simply include + the ``-G <Generator name>`` option: + + .. code-block:: console + + $ cmake -G "Unix Makefiles" ... + + You can see a list of generators CMake supports by executing the cmake command + with no arguments. + +Instructions to Build +===================== +.. code-block:: console + + $ cd openmp_top_level/ [ this directory with libomptarget/, runtime/, etc. ] + $ mkdir build + $ cd build + + [ Unix* Libraries ] + $ cmake -DCMAKE_C_COMPILER=<C Compiler> -DCMAKE_CXX_COMPILER=<C++ Compiler> .. + + [ Windows* Libraries ] + $ cmake -G <Generator Type> -DCMAKE_C_COMPILER=<C Compiler> -DCMAKE_CXX_COMPILER=<C++ Compiler> -DCMAKE_ASM_MASM_COMPILER=[ml | ml64] -DCMAKE_BUILD_TYPE=Release .. + + $ make + $ make install + +CMake Options +============= +Builds with CMake can be customized by means of options as already seen above. +One possibility is to pass them via the command line: + +.. code-block:: console + + $ cmake -DOPTION=<value> path/to/source + +.. note:: The first value listed is the respective default for that option. + +Generic Options +--------------- +For full documentation consult the CMake manual or execute +``cmake --help-variable VARIABLE_NAME`` to get information about a specific +variable. + +**CMAKE_BUILD_TYPE** = ``Release|Debug|RelWithDebInfo`` + Build type can be ``Release``, ``Debug``, or ``RelWithDebInfo`` which chooses + the optimization level and presence of debugging symbols. + +**CMAKE_C_COMPILER** = <C compiler name> + Specify the C compiler. + +**CMAKE_CXX_COMPILER** = <C++ compiler name> + Specify the C++ compiler. + +**CMAKE_Fortran_COMPILER** = <Fortran compiler name> + Specify the Fortran compiler. This option is only needed when + **LIBOMP_FORTRAN_MODULES** is ``ON`` (see below).
So typically, a Fortran + compiler is not needed during the build. + +**CMAKE_ASM_MASM_COMPILER** = ``ml|ml64`` + This option is only relevant for Windows*. + +Options for all Libraries +------------------------- + +**OPENMP_ENABLE_WERROR** = ``OFF|ON`` + Treat warnings as errors and fail, if a compiler warning is triggered. + +**OPENMP_LIBDIR_SUFFIX** = ``""`` + Extra suffix to append to the directory where libraries are to be installed. + +**OPENMP_TEST_C_COMPILER** = ``${CMAKE_C_COMPILER}`` + Compiler to use for testing. Defaults to the compiler that was also used for + building. + +**OPENMP_TEST_CXX_COMPILER** = ``${CMAKE_CXX_COMPILER}`` + Compiler to use for testing. Defaults to the compiler that was also used for + building. + +**OPENMP_LLVM_TOOLS_DIR** = ``/path/to/built/llvm/tools`` + Additional path to search for LLVM tools needed by tests. + +**OPENMP_LLVM_LIT_EXECUTABLE** = ``/path/to/llvm-lit`` + Specify full path to ``llvm-lit`` executable for running tests. The default + is to search the ``PATH`` and the directory in **OPENMP_LLVM_TOOLS_DIR**. + +**OPENMP_FILECHECK_EXECUTABLE** = ``/path/to/FileCheck`` + Specify full path to ``FileCheck`` executable for running tests. The default + is to search the ``PATH`` and the directory in **OPENMP_LLVM_TOOLS_DIR**. + +**OPENMP_NOT_EXECUTABLE** = ``/path/to/not`` + Specify full path to ``not`` executable for running tests. The default + is to search the ``PATH`` and the directory in **OPENMP_LLVM_TOOLS_DIR**. + +Options for ``libomp`` +---------------------- + +**LIBOMP_ARCH** = ``aarch64|arm|i386|mic|mips|mips64|ppc64|ppc64le|x86_64|riscv64`` + The default value for this option is chosen based on probing the compiler for + architecture macros (e.g., is ``__x86_64__`` predefined by compiler?). + +**LIBOMP_MIC_ARCH** = ``knc|knf`` + Intel(R) Many Integrated Core Architecture (Intel(R) MIC Architecture) to + build for. This value is ignored if **LIBOMP_ARCH** does not equal ``mic``. 
+ +**LIBOMP_LIB_TYPE** = ``normal|profile|stubs`` + Library type can be ``normal``, ``profile``, or ``stubs``. + +**LIBOMP_USE_VERSION_SYMBOLS** = ``ON|OFF`` + Use versioned symbols for building the library. This option only makes sense + for ELF based libraries where version symbols are supported (Linux*, some BSD* + variants). It is ``OFF`` by default for Windows* and macOS*, but ``ON`` for + other Unix based operating systems. + +**LIBOMP_ENABLE_SHARED** = ``ON|OFF`` + Build a shared library. If this option is ``OFF``, static OpenMP libraries + will be built instead of dynamic ones. + + .. note:: + + Static libraries are not supported on Windows*. + +**LIBOMP_FORTRAN_MODULES** = ``OFF|ON`` + Create the Fortran modules (requires Fortran compiler). + +macOS* Fat Libraries +"""""""""""""""""""" +On macOS* machines, it is possible to build universal (or fat) libraries which +include both i386 and x86_64 architecture objects in a single archive. + +.. code-block:: console + + $ cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_OSX_ARCHITECTURES='i386;x86_64' .. + $ make + +There is also an option **LIBOMP_OSX_ARCHITECTURES** which can be set in case +this is an LLVM source tree build. It will only apply for the ``libomp`` library +avoids having the entire LLVM/Clang build produce universal binaries. + +Optional Features +""""""""""""""""" + +**LIBOMP_USE_ADAPTIVE_LOCKS** = ``ON|OFF`` + Include adaptive locks, based on Intel(R) Transactional Synchronization + Extensions (Intel(R) TSX). This feature is x86 specific and turned ``ON`` + by default for IA-32 architecture and Intel(R) 64 architecture. + +**LIBOMP_USE_INTERNODE_ALIGNMENT** = ``OFF|ON`` + Align certain data structures on 4096-byte. This option is useful on + multi-node systems where a small ``CACHE_LINE`` setting leads to false sharing. + +**LIBBOLT_OMPT_SUPPORT** = ``ON|OFF`` + Include support for the OpenMP Tools Interface (OMPT). 
+ This option is supported and ``ON`` by default for x86, x86_64, AArch64, + PPC64 and RISCV64 on Linux* and macOS*. + This option is ``OFF`` if this feature is not supported for the platform. + +**LIBOMP_OMPT_OPTIONAL** = ``ON|OFF`` + Include support for optional OMPT functionality. This option is ignored if + **LIBBOLT_OMPT_SUPPORT** is ``OFF``. + +**LIBOMP_STATS** = ``OFF|ON`` + Include stats-gathering code. + +**LIBOMP_USE_DEBUGGER** = ``OFF|ON`` + Include the friendly debugger interface. + +**LIBOMP_USE_HWLOC** = ``OFF|ON`` + Use `OpenMPI's hwloc library <https://www.open-mpi.org/projects/hwloc/>`_ for + topology detection and affinity. + +**LIBOMP_HWLOC_INSTALL_DIR** = ``/path/to/hwloc/install/dir`` + Specify install location of hwloc. The configuration system will look for + ``hwloc.h`` in ``${LIBOMP_HWLOC_INSTALL_DIR}/include`` and the library in + ``${LIBOMP_HWLOC_INSTALL_DIR}/lib``. The default is ``/usr/local``. + This option is only used if **LIBOMP_USE_HWLOC** is ``ON``. + +Additional Compiler Flags +""""""""""""""""""""""""" + +These flags are **appended**, they do not overwrite any of the preset flags. + +**LIBOMP_CPPFLAGS** = <space-separated flags> + Additional C preprocessor flags. + +**LIBOMP_CXXFLAGS** = <space-separated flags> + Additional C++ compiler flags. + +**LIBOMP_ASMFLAGS** = <space-separated flags> + Additional assembler flags. + +**LIBOMP_LDFLAGS** = <space-separated flags> + Additional linker flags. + +**LIBOMP_LIBFLAGS** = <space-separated flags> + Additional libraries to link. + +**LIBOMP_FFLAGS** = <space-separated flags> + Additional Fortran compiler flags. + +Options for ``libomptarget`` +---------------------------- + +**LIBOMPTARGET_OPENMP_HEADER_FOLDER** = ``""`` + Path of the folder that contains ``omp.h``. This is required for testing + out-of-tree builds. + +**LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER** = ``""`` + Path of the folder that contains ``libomp.so``. This is required for testing + out-of-tree builds. + +Options for ``NVPTX device RTL`` +-------------------------------- + +**LIBOMPTARGET_NVPTX_ENABLE_BCLIB** = ``ON|OFF`` + Enable CUDA LLVM bitcode offloading device RTL.
This is used for link time + optimization of the OMP runtime and application code. This option is enabled + by default if the build system determines that ``CMAKE_C_COMPILER`` is able to + compile and link the library. + +**LIBOMPTARGET_NVPTX_CUDA_COMPILER** = ``""`` + Location of a CUDA compiler capable of emitting LLVM bitcode. Currently only + the Clang compiler is supported. This is only used when building the CUDA LLVM + bitcode offloading device RTL. If unspecified and the CMake C compiler is + Clang, then Clang is used. + +**LIBOMPTARGET_NVPTX_BC_LINKER** = ``""`` + Location of a linker capable of linking LLVM bitcode objects. This is only + used when building the CUDA LLVM bitcode offloading device RTL. If unspecified + and the CMake C compiler is Clang and there exists an llvm-link binary in the + directory containing Clang, then this llvm-link binary is used. + +**LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER** = ``""`` + Host compiler to use with NVCC. This compiler is not going to be used to + produce any binary. Instead, this is used to overcome the input compiler + checks done by NVCC. E.g. if using a default host compiler that is not + compatible with NVCC, this option can be used to pass to NVCC a valid compiler + to avoid the error. + + **LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES** = ``35`` + List of CUDA compute capabilities that should be supported by the NVPTX + device RTL. E.g. for compute capabilities 6.0 and 7.0, the option "60,70" + should be used. Compute capability 3.5 is the minimum required. + + **LIBOMPTARGET_NVPTX_DEBUG** = ``OFF|ON`` + Enable printing of debug messages from the NVPTX device RTL. + +Example Usages of CMake +======================= + +Typical Invocations +------------------- + +.. code-block:: console + + $ cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ .. + $ cmake -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ .. + $ cmake -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=icpc ..
+ +Advanced Builds with Various Options +------------------------------------ + +- Build the i386 Linux* library using GCC* + + .. code-block:: console + + $ cmake -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DLIBOMP_ARCH=i386 .. + +- Build the x86_64 debug Mac library using Clang* + + .. code-block:: console + + $ cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DLIBOMP_ARCH=x86_64 -DCMAKE_BUILD_TYPE=Debug .. + +- Build the library (architecture determined by probing compiler) using the + Intel(R) C Compiler and the Intel(R) C++ Compiler. Also, create Fortran + modules with the Intel(R) Fortran Compiler. + + .. code-block:: console + + $ cmake -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=icpc -DCMAKE_Fortran_COMPILER=ifort -DLIBOMP_FORTRAN_MODULES=on .. + +- Have CMake find the C/C++ compiler and specify additional flags for the + preprocessor and C++ compiler. + + .. code-block:: console + + $ cmake -DLIBOMP_CPPFLAGS='-DNEW_FEATURE=1 -DOLD_FEATURE=0' -DLIBOMP_CXXFLAGS='--one-specific-flag --two-specific-flag' .. + +- Build the stubs library + + .. code-block:: console + + $ cmake -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DLIBOMP_LIB_TYPE=stubs .. + +**Footnotes** + +.. [*] Other names and brands may be claimed as the property of others.
diff --git a/bolt/cmake/DetectTestCompiler/CMakeLists.txt b/bolt/cmake/DetectTestCompiler/CMakeLists.txt new file mode 100644 index 0000000000000..7fa32a90972ba --- /dev/null +++ b/bolt/cmake/DetectTestCompiler/CMakeLists.txt @@ -0,0 +1,50 @@ +cmake_minimum_required(VERSION 3.13.4) +project(DetectTestCompiler C CXX) + +include(CheckCCompilerFlag) +include(CheckCXXCompilerFlag) + +function(write_compiler_information lang) + set(information "${CMAKE_${lang}_COMPILER}") + set(information "${information}\\;${CMAKE_${lang}_COMPILER_ID}") + set(information "${information}\\;${CMAKE_${lang}_COMPILER_VERSION}") + set(information "${information}\\;${${lang}_FLAGS}") + set(information "${information}\\;${${lang}_HAS_TSAN_FLAG}") + file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/${lang}CompilerInformation.txt ${information}) +endfunction(write_compiler_information) + +find_package(OpenMP) +# CMake variable names are case-sensitive and FindOpenMP reports its result in +# OpenMP_FOUND; fall back to plain -fopenmp only when detection really failed. +if (NOT OpenMP_FOUND) + set(OpenMP_C_FLAGS "-fopenmp") + set(OpenMP_CXX_FLAGS "-fopenmp") +endif() + +set(CMAKE_THREAD_PREFER_PTHREAD TRUE) +set(THREADS_PREFER_PTHREAD_FLAG TRUE) +find_package(Threads REQUIRED) + +set(C_FLAGS "${OpenMP_C_FLAGS} ${CMAKE_THREAD_LIBS_INIT}") +set(CXX_FLAGS "${OpenMP_CXX_FLAGS} ${CMAKE_THREAD_LIBS_INIT}") + +# TODO: Implement blockaddress in GlobalISel and remove this flag!
+if (CMAKE_C_COMPILER_ID STREQUAL "Clang") + check_c_compiler_flag("-fno-experimental-isel" C_HAS_EXPERIMENTAL_ISEL_FLAG) + check_cxx_compiler_flag("-fno-experimental-isel" CXX_HAS_EXPERIMENTAL_ISEL_FLAG) + macro(add_experimental_isel_flag lang) + if (${lang}_HAS_EXPERIMENTAL_ISEL_FLAG) + set(${lang}_FLAGS "-fno-experimental-isel ${${lang}_FLAGS}") + endif() + endmacro(add_experimental_isel_flag) + + add_experimental_isel_flag(C) + add_experimental_isel_flag(CXX) +endif() + +SET(CMAKE_REQUIRED_FLAGS "-fsanitize=thread") +check_c_compiler_flag("" C_HAS_TSAN_FLAG) +check_cxx_compiler_flag("" CXX_HAS_TSAN_FLAG) + +write_compiler_information(C) +write_compiler_information(CXX) diff --git a/bolt/cmake/HandleOpenMPOptions.cmake b/bolt/cmake/HandleOpenMPOptions.cmake new file mode 100644 index 0000000000000..15382bcf12de6 --- /dev/null +++ b/bolt/cmake/HandleOpenMPOptions.cmake @@ -0,0 +1,35 @@ +if (OPENMP_STANDALONE_BUILD) + # From HandleLLVMOptions.cmake + function(append_if condition value) + if (${condition}) + foreach(variable ${ARGN}) + set(${variable} "${${variable}} ${value}" PARENT_SCOPE) + endforeach(variable) + endif() + endfunction() +endif() + +# MSVC and clang-cl in compatibility mode map -Wall to -Weverything. +# TODO: LLVM adds /W4 instead, check if that works for the OpenMP runtimes. +if (NOT MSVC) + append_if(OPENMP_HAVE_WALL_FLAG "-Wall" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) +endif() +if (OPENMP_ENABLE_WERROR) + append_if(OPENMP_HAVE_WERROR_FLAG "-Werror" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) +endif() + +# Additional warnings that are not enabled by -Wall. 
+append_if(OPENMP_HAVE_WCAST_QUAL_FLAG "-Wcast-qual" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) +append_if(OPENMP_HAVE_WFORMAT_PEDANTIC_FLAG "-Wformat-pedantic" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) +append_if(OPENMP_HAVE_WIMPLICIT_FALLTHROUGH_FLAG "-Wimplicit-fallthrough" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) +append_if(OPENMP_HAVE_WSIGN_COMPARE_FLAG "-Wsign-compare" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) + +# Warnings that we want to disable because they are too verbose or fragile. +append_if(OPENMP_HAVE_WNO_EXTRA_FLAG "-Wno-extra" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) +append_if(OPENMP_HAVE_WNO_PEDANTIC_FLAG "-Wno-pedantic" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) +append_if(OPENMP_HAVE_WNO_MAYBE_UNINITIALIZED_FLAG "-Wno-maybe-uninitialized" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) + +append_if(OPENMP_HAVE_STD_GNUPP14_FLAG "-std=gnu++14" CMAKE_CXX_FLAGS) +if (NOT OPENMP_HAVE_STD_GNUPP14_FLAG) + append_if(OPENMP_HAVE_STD_CPP14_FLAG "-std=c++14" CMAKE_CXX_FLAGS) +endif() diff --git a/bolt/cmake/OpenMPTesting.cmake b/bolt/cmake/OpenMPTesting.cmake new file mode 100644 index 0000000000000..b559b543e4c11 --- /dev/null +++ b/bolt/cmake/OpenMPTesting.cmake @@ -0,0 +1,213 @@ +# Keep track if we have all dependencies. +set(ENABLE_CHECK_BOLT_TARGETS TRUE) + +# Function to find required dependencies for testing. +function(find_standalone_test_dependencies) + include(FindPythonInterp) + + if (NOT PYTHONINTERP_FOUND) + message(STATUS "Could not find Python.") + message(WARNING "The check targets will not be available!") + set(ENABLE_CHECK_BOLT_TARGETS FALSE PARENT_SCOPE) + return() + endif() + + # Find executables. 
+ find_program(OPENMP_LLVM_LIT_EXECUTABLE + NAMES llvm-lit lit.py lit + PATHS ${OPENMP_LLVM_TOOLS_DIR}) + if (NOT OPENMP_LLVM_LIT_EXECUTABLE) + message(STATUS "Cannot find llvm-lit.") + message(STATUS "Please put llvm-lit in your PATH, set OPENMP_LLVM_LIT_EXECUTABLE to its full path, or point OPENMP_LLVM_TOOLS_DIR to its directory.") + message(WARNING "The check targets will not be available!") + set(ENABLE_CHECK_BOLT_TARGETS FALSE PARENT_SCOPE) + return() + endif() + + find_program(OPENMP_FILECHECK_EXECUTABLE + NAMES FileCheck + PATHS ${OPENMP_LLVM_TOOLS_DIR}) + if (NOT OPENMP_FILECHECK_EXECUTABLE) + message(STATUS "Cannot find FileCheck.") + message(STATUS "Please put FileCheck in your PATH, set OPENMP_FILECHECK_EXECUTABLE to its full path, or point OPENMP_LLVM_TOOLS_DIR to its directory.") + message(WARNING "The check targets will not be available!") + set(ENABLE_CHECK_BOLT_TARGETS FALSE PARENT_SCOPE) + return() + endif() + + find_program(OPENMP_NOT_EXECUTABLE + NAMES not + PATHS ${OPENMP_LLVM_TOOLS_DIR}) + if (NOT OPENMP_NOT_EXECUTABLE) + message(STATUS "Cannot find 'not'.") + message(STATUS "Please put 'not' in your PATH, set OPENMP_NOT_EXECUTABLE to its full path, or point OPENMP_LLVM_TOOLS_DIR to its directory.") + message(WARNING "The check targets will not be available!") + set(ENABLE_CHECK_BOLT_TARGETS FALSE PARENT_SCOPE) + return() + endif() +endfunction() + +if (${OPENMP_STANDALONE_BUILD}) + find_standalone_test_dependencies() + + # Set lit arguments. + set(DEFAULT_LIT_ARGS "-sv --show-unsupported --show-xfail") + if (MSVC OR XCODE) + set(DEFAULT_LIT_ARGS "${DEFAULT_LIT_ARGS} --no-progress-bar") + endif() + set(OPENMP_LIT_ARGS "${DEFAULT_LIT_ARGS}" CACHE STRING "Options for lit.") + separate_arguments(OPENMP_LIT_ARGS) +else() + set(OPENMP_FILECHECK_EXECUTABLE ${LLVM_RUNTIME_OUTPUT_INTDIR}/FileCheck) + set(OPENMP_NOT_EXECUTABLE ${LLVM_RUNTIME_OUTPUT_INTDIR}/not) +endif() + +# Macro to extract information about compiler from file. 
(no own scope) +macro(extract_test_compiler_information lang file) + file(READ ${file} information) + list(GET information 0 path) + list(GET information 1 id) + list(GET information 2 version) + list(GET information 3 openmp_flags) + list(GET information 4 has_tsan_flags) + + set(OPENMP_TEST_${lang}_COMPILER_PATH ${path}) + set(OPENMP_TEST_${lang}_COMPILER_ID ${id}) + set(OPENMP_TEST_${lang}_COMPILER_VERSION ${version}) + set(OPENMP_TEST_${lang}_COMPILER_OPENMP_FLAGS ${openmp_flags}) + set(OPENMP_TEST_${lang}_COMPILER_HAS_TSAN_FLAGS ${has_tsan_flags}) +endmacro() + +# Function to set variables with information about the test compiler. +function(set_test_compiler_information dir) + extract_test_compiler_information(C ${dir}/CCompilerInformation.txt) + extract_test_compiler_information(CXX ${dir}/CXXCompilerInformation.txt) + if (NOT("${OPENMP_TEST_C_COMPILER_ID}" STREQUAL "${OPENMP_TEST_CXX_COMPILER_ID}" AND + "${OPENMP_TEST_C_COMPILER_VERSION}" STREQUAL "${OPENMP_TEST_CXX_COMPILER_VERSION}")) + message(STATUS "Test compilers for C and C++ don't match.") + message(WARNING "The check targets will not be available!") + set(ENABLE_CHECK_BOLT_TARGETS FALSE PARENT_SCOPE) + else() + set(OPENMP_TEST_COMPILER_ID "${OPENMP_TEST_C_COMPILER_ID}" PARENT_SCOPE) + set(OPENMP_TEST_COMPILER_VERSION "${OPENMP_TEST_C_COMPILER_VERSION}" PARENT_SCOPE) + set(OPENMP_TEST_COMPILER_OPENMP_FLAGS "${OPENMP_TEST_C_COMPILER_OPENMP_FLAGS}" PARENT_SCOPE) + set(OPENMP_TEST_COMPILER_HAS_TSAN_FLAGS "${OPENMP_TEST_C_COMPILER_HAS_TSAN_FLAGS}" PARENT_SCOPE) + + # Determine major version. + string(REGEX MATCH "[0-9]+" major "${OPENMP_TEST_C_COMPILER_VERSION}") + string(REGEX MATCH "[0-9]+\\.[0-9]+" majorminor "${OPENMP_TEST_C_COMPILER_VERSION}") + set(OPENMP_TEST_COMPILER_VERSION_MAJOR "${major}" PARENT_SCOPE) + set(OPENMP_TEST_COMPILER_VERSION_MAJOR_MINOR "${majorminor}" PARENT_SCOPE) + endif() +endfunction() + +if (${OPENMP_STANDALONE_BUILD}) + # Detect compiler that should be used for testing. 
+ # We cannot use ExternalProject_Add() because its configuration runs when this + # project is built which is too late for detecting the compiler... + file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/DetectTestCompiler) + execute_process( + COMMAND ${CMAKE_COMMAND} -G${CMAKE_GENERATOR} ${CMAKE_CURRENT_LIST_DIR}/DetectTestCompiler + -DCMAKE_C_COMPILER=${OPENMP_TEST_C_COMPILER} + -DCMAKE_CXX_COMPILER=${OPENMP_TEST_CXX_COMPILER} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/DetectTestCompiler + OUTPUT_VARIABLE DETECT_COMPILER_OUT + ERROR_VARIABLE DETECT_COMPILER_ERR + RESULT_VARIABLE DETECT_COMPILER_RESULT) + if (DETECT_COMPILER_RESULT) + message(STATUS "Could not detect test compilers.") + message(WARNING "The check targets will not be available!") + set(ENABLE_CHECK_BOLT_TARGETS FALSE) + else() + set_test_compiler_information(${CMAKE_CURRENT_BINARY_DIR}/DetectTestCompiler) + endif() +else() + # Set the information that we know. + set(OPENMP_TEST_COMPILER_ID "Clang") + # Cannot use CLANG_VERSION because we are not guaranteed that this is already set. + set(OPENMP_TEST_COMPILER_VERSION "${LLVM_VERSION}") + set(OPENMP_TEST_COMPILER_VERSION_MAJOR "${LLVM_VERSION_MAJOR}") + set(OPENMP_TEST_COMPILER_VERSION_MAJOR_MINOR "${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}") + # Unfortunately the top-level cmake/config-ix.cmake file mangles CMake's + # CMAKE_THREAD_LIBS_INIT variable from the FindThreads package, so work + # around that, until it is fixed there. + if("${CMAKE_THREAD_LIBS_INIT}" STREQUAL "-lpthread") + set(OPENMP_TEST_COMPILER_THREAD_FLAGS "-pthread") + else() + set(OPENMP_TEST_COMPILER_THREAD_FLAGS "${CMAKE_THREAD_LIBS_INIT}") + endif() + if(TARGET tsan) + set(OPENMP_TEST_COMPILER_HAS_TSAN_FLAGS 1) + else() + set(OPENMP_TEST_COMPILER_HAS_TSAN_FLAGS 0) + endif() + # TODO: Implement blockaddress in GlobalISel and remove this flag! 
+ set(OPENMP_TEST_COMPILER_OPENMP_FLAGS "-fopenmp ${OPENMP_TEST_COMPILER_THREAD_FLAGS} -fno-experimental-isel") +endif() + +# Function to set compiler features for use in lit. +function(set_test_compiler_features) + if ("${OPENMP_TEST_COMPILER_ID}" STREQUAL "GNU") + set(comp "gcc") + elseif ("${OPENMP_TEST_COMPILER_ID}" STREQUAL "Intel") + set(comp "icc") + else() + # Just use the lowercase of the compiler ID as fallback. + string(TOLOWER "${OPENMP_TEST_COMPILER_ID}" comp) + endif() + set(OPENMP_TEST_COMPILER_FEATURES "['${comp}', '${comp}-${OPENMP_TEST_COMPILER_VERSION_MAJOR}', '${comp}-${OPENMP_TEST_COMPILER_VERSION_MAJOR_MINOR}', '${comp}-${OPENMP_TEST_COMPILER_VERSION}']" PARENT_SCOPE) +endfunction() +set_test_compiler_features() + +# Function to add a testsuite for an OpenMP runtime library. +function(add_openmp_testsuite target comment) + if (NOT ENABLE_CHECK_BOLT_TARGETS) + add_custom_target(${target} + COMMAND ${CMAKE_COMMAND} -E echo "${target} does nothing, dependencies not found.") + message(STATUS "${target} does nothing.") + return() + endif() + + cmake_parse_arguments(ARG "EXCLUDE_FROM_CHECK_ALL" "" "DEPENDS;ARGS" ${ARGN}) + # EXCLUDE_FROM_CHECK_ALL excludes the test ${target} out of check-openmp. + if (NOT ARG_EXCLUDE_FROM_CHECK_ALL) + # Register the testsuites and depends for the check-openmp rule. 
+ set_property(GLOBAL APPEND PROPERTY OPENMP_LIT_TESTSUITES ${ARG_UNPARSED_ARGUMENTS}) + set_property(GLOBAL APPEND PROPERTY OPENMP_LIT_DEPENDS ${ARG_DEPENDS}) + endif() + + if (${OPENMP_STANDALONE_BUILD}) + set(LIT_ARGS ${OPENMP_LIT_ARGS} ${ARG_ARGS}) + add_custom_target(${target} + COMMAND ${PYTHON_EXECUTABLE} ${OPENMP_LLVM_LIT_EXECUTABLE} ${LIT_ARGS} ${ARG_UNPARSED_ARGUMENTS} + COMMENT ${comment} + DEPENDS ${ARG_DEPENDS} + USES_TERMINAL + ) + else() + if (ARG_EXCLUDE_FROM_CHECK_ALL) + add_lit_testsuite(${target} + ${comment} + ${ARG_UNPARSED_ARGUMENTS} + EXCLUDE_FROM_CHECK_ALL + DEPENDS clang clang-resource-headers FileCheck ${ARG_DEPENDS} + ARGS ${ARG_ARGS} + ) + else() + add_lit_testsuite(${target} + ${comment} + ${ARG_UNPARSED_ARGUMENTS} + DEPENDS clang clang-resource-headers FileCheck ${ARG_DEPENDS} + ARGS ${ARG_ARGS} + ) + endif() + endif() +endfunction() + +function(construct_check_openmp_target) + get_property(OPENMP_LIT_TESTSUITES GLOBAL PROPERTY OPENMP_LIT_TESTSUITES) + get_property(OPENMP_LIT_DEPENDS GLOBAL PROPERTY OPENMP_LIT_DEPENDS) + + # We already added the testsuites themselves, no need to do that again. + add_openmp_testsuite(check-bolt-openmp "Running BOLT tests" ${OPENMP_LIT_TESTSUITES} EXCLUDE_FROM_CHECK_ALL DEPENDS ${OPENMP_LIT_DEPENDS}) +endfunction() diff --git a/bolt/cmake/config-ix.cmake b/bolt/cmake/config-ix.cmake new file mode 100644 index 0000000000000..d9ea3bbb05749 --- /dev/null +++ b/bolt/cmake/config-ix.cmake @@ -0,0 +1,18 @@ +include(CheckCXXCompilerFlag) + +check_cxx_compiler_flag(-Wall OPENMP_HAVE_WALL_FLAG) +check_cxx_compiler_flag(-Werror OPENMP_HAVE_WERROR_FLAG) + +# Additional warnings that are not enabled by -Wall. 
+check_cxx_compiler_flag(-Wcast-qual OPENMP_HAVE_WCAST_QUAL_FLAG) +check_cxx_compiler_flag(-Wformat-pedantic OPENMP_HAVE_WFORMAT_PEDANTIC_FLAG) +check_cxx_compiler_flag(-Wimplicit-fallthrough OPENMP_HAVE_WIMPLICIT_FALLTHROUGH_FLAG) +check_cxx_compiler_flag(-Wsign-compare OPENMP_HAVE_WSIGN_COMPARE_FLAG) + +# Warnings that we want to disable because they are too verbose or fragile. +check_cxx_compiler_flag(-Wno-extra OPENMP_HAVE_WNO_EXTRA_FLAG) +check_cxx_compiler_flag(-Wno-pedantic OPENMP_HAVE_WNO_PEDANTIC_FLAG) +check_cxx_compiler_flag(-Wno-maybe-uninitialized OPENMP_HAVE_WNO_MAYBE_UNINITIALIZED_FLAG) + +check_cxx_compiler_flag(-std=gnu++14 OPENMP_HAVE_STD_GNUPP14_FLAG) +check_cxx_compiler_flag(-std=c++14 OPENMP_HAVE_STD_CPP14_FLAG) diff --git a/bolt/docs/ReleaseNotes.rst b/bolt/docs/ReleaseNotes.rst new file mode 100644 index 0000000000000..b7f2ec42277e3 --- /dev/null +++ b/bolt/docs/ReleaseNotes.rst @@ -0,0 +1,45 @@ +=========================== +openmp 11.0.0 Release Notes +=========================== + +.. contents:: + :local: + +.. warning:: + These are in-progress notes for the upcoming LLVM 11.0.0 release. + Release notes for previous releases can be found on + `the Download Page <https://releases.llvm.org/download.html>`_. + +Introduction +============ + +This document contains the release notes for the OpenMP runtime, release 11.0.0. +Here we describe the status of openmp, including major improvements +from the previous release. All openmp releases may be downloaded +from the `LLVM releases web site <https://llvm.org/releases/>`_. + +Non-comprehensive list of changes in this release +================================================= + +5.0 features +------------ + +* ... + +5.1 features +------------ + +* ... + +OMPT Improvements +----------------- + +* Added OMPT callbacks for doacross loops, detached tasks +* Added handling for mutexinoutset dependencies + +OMPT-based Tools +---------------- + +* Added ompt-multiplex.h as a header-only OMPT-tool to support nesting of OMPT + tools. 
(see openmp/tools/multiplex) + diff --git a/bolt/examples/argobots/.gitignore b/bolt/examples/argobots/.gitignore new file mode 100644 index 0000000000000..d21c8ccd629e4 --- /dev/null +++ b/bolt/examples/argobots/.gitignore @@ -0,0 +1,25 @@ +parallel_for_omp +parallel_for_abt_task +parallel_for_abt_thread +nested_parallel_for_omp +nested_parallel_for_abt_task +nested_parallel_for_abt_thread +nested_parallel_for_block_abt_task +nested_parallel_for_block_abt_thread +nested_parallel_for_block_omp +nested_parallel_for_irregular_omp +nested_parallel_for_irregular_abt_task +nested_parallel_for_irregular_abt_thread +task_single_producer_omp +task_single_producer_abt_task +task_single_producer_abt_thread +task_multiple_producer_omp +task_multiple_producer_abt_task +task_multiple_producer_abt_thread +task_nested_omp +task_nested_abt_task +task_nested_lvl2_omp +task_nested_lvl2_abt_task +taskwait_omp +taskyield_omp + diff --git a/bolt/examples/argobots/Makefile b/bolt/examples/argobots/Makefile new file mode 100644 index 0000000000000..b8faccd103877 --- /dev/null +++ b/bolt/examples/argobots/Makefile @@ -0,0 +1,46 @@ +# -*- Mode: Makefile; -*- +# +# See LICENSE.txt in top-level directory. 
+# + +CC=gcc +CFLAGS=-g -Wall -O2 `pkg-config --cflags argobots` +LDFLAGS=-lm `pkg-config --libs argobots` + +PROGS = \ + parallel_for_abt_task \ + parallel_for_abt_thread \ + nested_parallel_for_abt_task \ + nested_parallel_for_abt_thread \ + nested_parallel_for_block_abt_thread \ + nested_parallel_for_block_abt_task \ + nested_parallel_for_irregular_abt_task \ + nested_parallel_for_irregular_abt_thread \ + task_single_producer_abt_task \ + task_single_producer_abt_thread \ + task_multiple_producer_abt_task \ + task_multiple_producer_abt_thread \ + task_nested_abt_task \ + task_nested_lvl2_abt_task + +all: $(PROGS) + +clean: + rm -rf *.o *.dSYM $(PROGS) + +testing: + ./parallel_for_abt_task + ./parallel_for_abt_thread + ./nested_parallel_for_abt_task + ./nested_parallel_for_abt_thread + ./nested_parallel_for_block_abt_thread + ./nested_parallel_for_block_abt_task + ./nested_parallel_for_irregular_abt_task + ./nested_parallel_for_irregular_abt_thread + ./task_single_producer_abt_task + ./task_single_producer_abt_thread + ./task_multiple_producer_abt_task + ./task_multiple_producer_abt_thread + ./task_nested_abt_task + ./task_nested_lvl2_abt_task + diff --git a/bolt/examples/argobots/Makefile.omp b/bolt/examples/argobots/Makefile.omp new file mode 100644 index 0000000000000..02db14d53901e --- /dev/null +++ b/bolt/examples/argobots/Makefile.omp @@ -0,0 +1,38 @@ +# -*- Mode: Makefile; -*- +# +# See LICENSE.txt in top-level directory. 
+# + +CC=gcc +CFLAGS=-g -Wall -O2 -fopenmp +LDFLAGS=-lm -fopenmp + +PROGS = \ + parallel_for_omp \ + nested_parallel_for_omp \ + nested_parallel_for_block_omp \ + nested_parallel_for_irregular_omp \ + task_single_producer_omp \ + task_multiple_producer_omp \ + task_nested_omp \ + task_nested_lvl2_omp \ + taskwait_omp \ + taskyield_omp + +all: $(PROGS) + +clean: + rm -rf *.o *.dSYM $(PROGS) + +testing: + ./parallel_for_omp + ./nested_parallel_for_omp + ./nested_parallel_for_block_omp + ./nested_parallel_for_irregular_omp + ./task_single_producer_omp + ./task_multiple_producer_omp + ./task_nested_omp + ./task_nested_lvl2_omp + ./taskwait_omp + ./taskyield_omp + diff --git a/bolt/examples/argobots/README b/bolt/examples/argobots/README new file mode 100644 index 0000000000000..3e899c85dd1f6 --- /dev/null +++ b/bolt/examples/argobots/README @@ -0,0 +1,87 @@ + OpenMP and Argobots + +This directory includes OpenMP code examples and their corresponding sample +implementations with Argobots APIs. These are just examples in order to show +how Argobots APIs can be used to mimic OpenMP behaviors. Please note that they +are not intended to provide optimized implementations. + +NOTE: +Examples in this directory are originated from those in examples of Argobots. + +1. Build and Testing +2. Examples + + +------------------------------------------------------------------------------- + +1. Build and Testing +==================== + +Once configure is done, there should be two Makefiles in this directory - +Makefile for Argobots examples and Makefile.omp for OpenMP examples. + +(a) To build and test Argobots examples: + + build: make + test : make testing + + +(b) To build and test OpenMP examples: + + build: make -f Makefile.omp + test : make -f Makefile.omp testing + +------------------------------------------------------------------------------- + +2. 
Examples +=========== + +parallel_for_{omp,abt_task,abt_thread}: + -- Implementation sample using a pragma omp parallel for directive + -- An Argobots code implementation is included in order to study the + performance + +nested_parallel_for_{omp,abt_task,abt_thread}: + -- Implementation sample using nested parallel constructs + -- An Argobots code implementation is included in order to study the + performance + +nested_parallel_for_irregular_{omp,abt_task,abt_thread}: + -- Implementation sample using nested parallel constructs when each iteration + has a different workload + -- An Argobots code implementation is included in order to study the + performance + +task_single_producer_{omp,abt_task,abt_thread}: + -- Implementation sample where a single thread creates all the work (tasks) and + the other threads execute the work generated. + -- An Argobots code implementation is also included to study the performance. + -- Main motivation for the Intel OpenMP Runtime modification + +task_multiple_producer_{omp,abt_task,abt_thread}: + -- Implementation sample where every thread creates work in its own queue + or pool and then executes the work generated. + -- An Argobots code implementation is also included to study the performance. + -- Mainly used in order to understand the cutoff mechanism implemented by + GCC and Intel OpenMP libraries + +task_nested_{omp,abt_task}: + -- Implementation sample used mainly to know the behavior when a nested task (lvl 1) + is created. + -- An Argobots code implementation is included in order to study the performance + +task_nested_lvl2_{omp,abt_task}: + -- Implementation sample used mainly to know the behavior when a nested task (lvl 2) + is created. 
+ -- An Argobots code implementation is included in order to study the performance + +taskwait_omp: + -- Implementation sample used to know the behavior when a taskwait directive + is used or not + -- Mainly used to get the generated code for each compiler + +taskyield_omp: + -- Implementation sample used to know the behavior when a taskyield directive + is used or not + -- Mainly used to get the generated code for each compiler + diff --git a/bolt/examples/argobots/nested_parallel_for_abt_task.c b/bolt/examples/argobots/nested_parallel_for_abt_task.c new file mode 100644 index 0000000000000..ac2fc61ea8452 --- /dev/null +++ b/bolt/examples/argobots/nested_parallel_for_abt_task.c @@ -0,0 +1,224 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */ + +/* + * See LICENSE.txt in top-level directory. + */ + +/* This code mimics the parallel for OpenMP directive in nested loops. + * It creates as many streams as the user requires and threads are created and + * assigned by static blocks to each stream for the outer loop. + * For the inner loop, as many tasks as the user requires are created. 
+ */ + +#include +#include +#include +#include +#include +#include + +#define NUM_ELEMS 5017600 /* 2GB */ +#define NUM_XSTREAMS 4 +#define NUM_REPS 1 + +ABT_pool *g_pools; + +typedef struct { + float *ptr; + float value; + int rank; + int it; + int start; + int end; +} vector_scal_args_t; + +typedef struct { + float *ptr; + float value; + int nxstreams; + int it; + int start; + int end; +} vector_scal_task_args_t; + +void vector_scal(void *arguments) +{ + int i, rank; + vector_scal_args_t *arg; + arg = (vector_scal_args_t *)arguments; + ABT_xstream_self_rank(&rank); + rank = arg->rank; + int mystart = arg->start; + int myend = arg->end; + int it = arg->it; + int base = rank * it; + float value = arg->value; + float *ptr = arg->ptr; + for (i = mystart; i < myend; i++) { + ptr[base + i] *= value; + } +} + +void vector_scal_launch(void *arguments) +{ + int i, it, j, num_ults, rank, mystart, myend, p; + ABT_task *tasks; + ABT_xstream xstream; + ABT_xstream_self(&xstream); + vector_scal_task_args_t *arg; + arg = (vector_scal_task_args_t *) arguments; + vector_scal_args_t *args; + it = arg->it; + num_ults = arg->nxstreams; + mystart = arg->start; + myend = arg->end; + int current = 0; + args = (vector_scal_args_t *)malloc(sizeof(vector_scal_args_t) + * num_ults); + tasks = (ABT_task *)malloc(sizeof(ABT_task) * num_ults); + /* ES creation */ + int bloc = it / (num_ults); + int rest = it % (num_ults); + int start = 0; + int end = 0; + ABT_xstream_self_rank(&rank); + for (i = mystart; i < myend; i++) { + for (j = 0; j < num_ults; j++) { + start = end; + int inc = (j < rest) ? 
1 : 0; + end += bloc + inc; + args[j].start = start; + args[j].end = end; + args[j].value = arg->value; + args[j].ptr = arg->ptr; + args[j].it = it; + args[j].rank = rank; + + ABT_task_create(g_pools[rank], vector_scal, + (void *)&args[j], &tasks[j]); + } + current++; + for (p = 0; p < num_ults; p++) { + ABT_task_free(&tasks[p]); + } + } + ABT_thread_yield(); +} + + +int main(int argc, char *argv[]) +{ + int i, j; + int ntasks; + int num_xstreams; + char *str, *endptr; + ABT_xstream *xstreams; + vector_scal_task_args_t *args; + struct timeval t_start, t_end; + struct timeval t_start2; + double time, time_join; + float *a; + int it; + int inner_xstreams; + + num_xstreams = argc > 1 ? atoi(argv[1]) : NUM_XSTREAMS; + if (argc > 2) { + str = argv[2]; + } + + ntasks = argc > 2 ? strtoll(str, &endptr, 10) : NUM_ELEMS; + it = ceil(sqrt(ntasks)); + ntasks = it * it; + inner_xstreams = argc > 3 ? atoi(argv[3]) : NUM_XSTREAMS; + + g_pools = (ABT_pool *)malloc(sizeof(ABT_pool) * num_xstreams); + + a = malloc(sizeof(float) * ntasks); + for (i = 0; i < ntasks; i++) { + a[i] = i * 1.0f; + } + + xstreams = (ABT_xstream *) malloc(sizeof(ABT_xstream) * num_xstreams); + args = (vector_scal_task_args_t *) malloc(sizeof(vector_scal_task_args_t) + * num_xstreams); + + /* initialization */ + ABT_init(argc, argv); + + for (i = 0; i < num_xstreams; i++) { + ABT_pool_create_basic(ABT_POOL_FIFO, ABT_POOL_ACCESS_MPMC, ABT_TRUE, + &g_pools[i]); + } + + /* ES creation */ + ABT_xstream_self(&xstreams[0]); + ABT_xstream_set_main_sched_basic(xstreams[0], ABT_SCHED_DEFAULT, + 1, &g_pools[0]); + + for (i = 1; i < num_xstreams; i++) { + ABT_xstream_create_basic(ABT_SCHED_DEFAULT, 1, &g_pools[i], + ABT_SCHED_CONFIG_NULL, &xstreams[i]); + } + + gettimeofday(&t_start, NULL); + + /* Each task is created on the xstream which is going to execute it */ + + int bloc = it / (num_xstreams); + int rest = it % (num_xstreams); + int start = 0; + int end = 0; + + for (j = 0; j < num_xstreams; j++) { + + start = 
end; + int inc = (j < rest) ? 1 : 0; + end += bloc + inc; + args[j].start = start; + args[j].end = end; + args[j].value = 0.9f; + args[j].ptr = a; + args[j].it = it; + args[j].nxstreams = inner_xstreams; + + ABT_thread_create_on_xstream(xstreams[j], vector_scal_launch, + (void *)&args[j], ABT_THREAD_ATTR_NULL, + NULL); + } + + ABT_thread_yield(); + + gettimeofday(&t_start2, NULL); + for (i = 1; i < num_xstreams; i++) { + size_t size; + do { + ABT_pool_get_size(g_pools[i], &size); + } while (size != 0); + } + + gettimeofday(&t_end, NULL); + time = (t_end.tv_sec * 1000000 + t_end.tv_usec) - + (t_start.tv_sec * 1000000 + t_start.tv_usec); + time_join = (t_end.tv_sec * 1000000 + t_end.tv_usec) - + (t_start2.tv_sec * 1000000 + t_start2.tv_usec); + + for (i = 1; i < num_xstreams; i++) { + ABT_xstream_join(xstreams[i]); + ABT_xstream_free(&xstreams[i]); + } + + printf("%d %d %d %f %f\n", + num_xstreams, inner_xstreams, ntasks, time / 1000000.0, + time_join / 1000000.0); + + ABT_finalize(); + free(xstreams); + for (i = 0; i < ntasks; i++) { + if (a[i] != i * 0.9f) { + printf("%f\n", a[i]); + return EXIT_FAILURE; + } + } + + return EXIT_SUCCESS; +} diff --git a/bolt/examples/argobots/nested_parallel_for_abt_thread.c b/bolt/examples/argobots/nested_parallel_for_abt_thread.c new file mode 100644 index 0000000000000..f6994ec163a9e --- /dev/null +++ b/bolt/examples/argobots/nested_parallel_for_abt_thread.c @@ -0,0 +1,228 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */ + +/* + * See LICENSE.txt in top-level directory. + */ + +/* This code mimics the parallel for OpenMP directive in nested loops. + * It creates as many streams as user requires and threads are created and + * assigned by static blocs to each stream for the outer loop. + * For the inner loop, as many threads as the user requires are created. 
+ */ + + +#include +#include +#include +#include +#include +#include + +#define NUM_ELEMS 5017600 /* 2GB */ +#define NUM_XSTREAMS 4 +#define NUM_REPS 1 + +ABT_pool *g_pools; + +typedef struct { + float *ptr; + float value; + int rank; + int it; + int start; + int end; +} vector_scal_args_t; + +typedef struct { + float *ptr; + float value; + int nxstreams; + int it; + int start; + int end; +} vector_scal_task_args_t; + +void vector_scal(void *arguments) +{ + int i, rank; + vector_scal_args_t *arg; + arg = (vector_scal_args_t *)arguments; + ABT_xstream_self_rank(&rank); + rank = arg->rank; + int mystart = arg->start; + int myend = arg->end; + int it = arg->it; + int base = rank * it; + float value = arg->value; + float *ptr = arg->ptr; + for (i = mystart; i < myend; i++) { + ptr[base + i] *= value; + } +} + +void vector_scal_launch(void *arguments) +{ + int i, it, j, num_ults, rank, mystart, myend, p; + ABT_thread *threads; + ABT_xstream xstream; + ABT_xstream_self(&xstream); + vector_scal_task_args_t *arg; + arg = (vector_scal_task_args_t *) arguments; + vector_scal_args_t *args; + it = arg->it; + num_ults = arg->nxstreams; + mystart = arg->start; + myend = arg->end; + int current = 0; + + args = (vector_scal_args_t *)malloc(sizeof(vector_scal_args_t) + * num_ults); + + threads = (ABT_thread *)malloc(sizeof(ABT_thread) * num_ults); + + int bloc = it / (num_ults); + int rest = it % (num_ults); + int start = 0; + int end = 0; + ABT_xstream_self_rank(&rank); + for (i = mystart; i < myend; i++) { + for (j = 0; j < num_ults; j++) { + start = end; + int inc = (j < rest) ? 
1 : 0; + end += bloc + inc; + args[j].start = start; + args[j].end = end; + args[j].value = arg->value; + args[j].ptr = arg->ptr; + args[j].it = it; + args[j].rank = rank; + + ABT_thread_create(g_pools[rank], vector_scal, + (void *)&args[j], ABT_THREAD_ATTR_NULL, + &threads[j]); + } + current++; + for (p = 0; p < num_ults; p++) { + ABT_thread_free(&threads[p]); + } + } + ABT_thread_yield(); +} + + +int main(int argc, char *argv[]) +{ + int i, j; + int ntasks; + int num_xstreams; + char *str, *endptr; + ABT_xstream *xstreams; + vector_scal_task_args_t *args; + struct timeval t_start, t_end; + struct timeval t_start2; + float *a; + int it; + int inner_xstreams; + double time, time_join; + num_xstreams = argc > 1 ? atoi(argv[1]) : NUM_XSTREAMS; + if (argc > 2) { + str = argv[2]; + } + + ntasks = argc > 2 ? strtoll(str, &endptr, 10) : NUM_ELEMS; + it = ceil(sqrt(ntasks)); + ntasks = it * it; + inner_xstreams = argc > 3 ? atoi(argv[3]) : NUM_XSTREAMS; + + g_pools = (ABT_pool *)malloc(sizeof(ABT_pool) * num_xstreams); + + a = malloc(sizeof(float) * ntasks); + for (i = 0; i < ntasks; i++) { + a[i] = i * 1.0f; + } + + xstreams = (ABT_xstream *) malloc(sizeof(ABT_xstream) * num_xstreams); + args = (vector_scal_task_args_t *) malloc(sizeof(vector_scal_task_args_t) + * num_xstreams); + + /* initialization */ + ABT_init(argc, argv); + + for (i = 0; i < num_xstreams; i++) { + ABT_pool_create_basic(ABT_POOL_FIFO, ABT_POOL_ACCESS_MPMC, ABT_TRUE, + &g_pools[i]); + } + + /* ES creation */ + ABT_xstream_self(&xstreams[0]); + ABT_xstream_set_main_sched_basic(xstreams[0], ABT_SCHED_DEFAULT, + 1, &g_pools[0]); + + for (i = 1; i < num_xstreams; i++) { + ABT_xstream_create_basic(ABT_SCHED_DEFAULT, 1, &g_pools[i], + ABT_SCHED_CONFIG_NULL, &xstreams[i]); + ABT_xstream_start(xstreams[i]); + } + + gettimeofday(&t_start, NULL); + + /* Each task is created on the xstream which is going to execute it */ + + int bloc = it / (num_xstreams); + int rest = it % (num_xstreams); + int start = 0; + int 
end = 0; + + for (j = 0; j < num_xstreams; j++) { + start = end; + int inc = (j < rest) ? 1 : 0; + end += bloc + inc; + args[j].start = start; + args[j].end = end; + args[j].value = 0.9f; + args[j].ptr = a; + args[j].it = it; + args[j].nxstreams = inner_xstreams; + ABT_thread_create_on_xstream(xstreams[j], vector_scal_launch, + (void *)&args[j], ABT_THREAD_ATTR_NULL, + NULL); + } + + ABT_thread_yield(); + + gettimeofday(&t_start2, NULL); + for (i = 1; i < num_xstreams; i++) { + size_t size; + do { + ABT_pool_get_size(g_pools[i], &size); + } while (size != 0); + } + + gettimeofday(&t_end, NULL); + time = (t_end.tv_sec * 1000000 + t_end.tv_usec) - + (t_start.tv_sec * 1000000 + t_start.tv_usec); + time_join = (t_end.tv_sec * 1000000 + t_end.tv_usec) - + (t_start2.tv_sec * 1000000 + t_start2.tv_usec); + + for (i = 1; i < num_xstreams; i++) { + ABT_xstream_join(xstreams[i]); + ABT_xstream_free(&xstreams[i]); + } + + printf("%d %d %d %f %f\n", + num_xstreams, inner_xstreams, ntasks, time / 1000000.0, + time_join / 1000000.0); + + ABT_finalize(); + + free(xstreams); + + for (i = 0; i < ntasks; i++) { + if (a[i] != i * 0.9f) { + printf("%f\n", a[i]); + return EXIT_FAILURE; + } + } + + return EXIT_SUCCESS; +} diff --git a/bolt/examples/argobots/nested_parallel_for_block_abt_task.c b/bolt/examples/argobots/nested_parallel_for_block_abt_task.c new file mode 100644 index 0000000000000..de762bb5d8adf --- /dev/null +++ b/bolt/examples/argobots/nested_parallel_for_block_abt_task.c @@ -0,0 +1,239 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */ + +/* + * See LICENSE.txt in top-level directory. + */ + +/* This code mimics the parallel for OpenMP directive in nested loops. + * It creates as many streams as user requires and threads are created and + * assigned by static blocs to each stream for the outer loop. + * For the inner loop, as many threads as the user requires are created. 
+ */ + + +#include +#include +#include +#include +#include +#include +#include + +#define NUM_XSTREAMS 36 +#define NUM 1000 +#define NUM_REPS 1 + +int in[NUM][NUM]; +int out[NUM][NUM]; + +/* Vector initialization */ +void init(void) +{ + int i, j; + for (i = 0; i < NUM; i++) { + for (j = 0; j < NUM; j++) { + in[i][j] = 1; + out[i][j] = 0; + } + } +} + +int comp(int v) +{ + int i; + double ret = 0.0; + for (i = 0; i < 100; i++) { + ret += sqrt(cos((double)v) * sin((double)v)); + } + return (int)ret; +} + +void check(void) +{ + int i, j; + for (i = 0; i < NUM; i++) { + for (j = 0; j < NUM; j++) { + int expected = comp(in[i][j]); + if (out[i][j] != expected) { + printf("out[%d][%d]=%d expected=%d\n", i, j, out[i][j], expected); + return; + } + } + } + printf("Verification: SUCCESS\n"); +} + + +static ABT_pool *g_pools; + +typedef struct { + int start; + int end; + int x; +} vector_scal_args_t; + +typedef struct { + int nxstreams; + int it; + int start; + int end; +} vector_scal_task_args_t; + +void vector_scal(void *arguments) +{ + int j; + vector_scal_args_t *arg; + arg = (vector_scal_args_t *)arguments; + + int mystart = arg->start; + int myend = arg->end; + int x = arg->x; + + for (j = mystart; j < myend; j++) { + out[x][j] = comp(in[x][j]); + } +} + +void vector_scal_launch(void *arguments) +{ + int i, it, j, num_ults, rank, mystart, myend, p; + ABT_task *tasks; + ABT_xstream xstream; + ABT_xstream_self(&xstream); + vector_scal_task_args_t *arg; + arg = (vector_scal_task_args_t *) arguments; + vector_scal_args_t *args; + it = arg->it; + num_ults = arg->nxstreams; + mystart = arg->start; + myend = arg->end; + + args = (vector_scal_args_t *)malloc(sizeof(vector_scal_args_t) + * num_ults); + + tasks = (ABT_task *)malloc(sizeof(ABT_task) * num_ults); + + int bloc = it / (num_ults); + int rest = it % (num_ults); + ABT_xstream_self_rank(&rank); + for (i = mystart; i < myend; i++) { + int start = 0; + int end = 0; + for (j = 0; j < num_ults; j++) { + start = end; + int 
inc = (j < rest) ? 1 : 0; + end += bloc + inc; + args[j].start = start; + args[j].end = end; + args[j].x = i; + + if (j > 0) { + ABT_task_create(g_pools[rank], vector_scal, (void *)&args[j], &tasks[j]); + } + } + vector_scal((void *)&args[0]); + for (p = 1; p < num_ults; p++) { + ABT_task_free(&tasks[p]); + } + } + + free(tasks); + free(args); +} + + +int main(int argc, char *argv[]) +{ + int i, j, r; + int num_xstreams; + char *str, *endptr; + ABT_xstream *xstreams; + ABT_thread *threads; + vector_scal_task_args_t *args; + int inner_xstreams; + double *time, avg_time = 0.0; + + num_xstreams = (argc > 1) ? atoi(argv[1]) : NUM_XSTREAMS; + inner_xstreams = (argc > 2) ? atoi(argv[2]) : NUM_XSTREAMS; + int rep = (argc > 3) ? atoi(argv[3]) : NUM_REPS; + time = (double *)malloc(sizeof(double) * rep); + + init(); + + g_pools = (ABT_pool *)malloc(sizeof(ABT_pool) * num_xstreams); + xstreams = (ABT_xstream *)malloc(sizeof(ABT_xstream) * num_xstreams); + threads = (ABT_thread *)malloc(sizeof(ABT_thread) * num_xstreams); + args = (vector_scal_task_args_t *)malloc(sizeof(vector_scal_task_args_t) + * num_xstreams); + + /* initialization */ + ABT_init(argc, argv); + + for (i = 0; i < num_xstreams; i++) { + ABT_pool_create_basic(ABT_POOL_FIFO, ABT_POOL_ACCESS_MPMC, ABT_TRUE, + &g_pools[i]); + } + + /* ES creation */ + ABT_xstream_self(&xstreams[0]); + ABT_xstream_set_main_sched_basic(xstreams[0], ABT_SCHED_DEFAULT, + 1, &g_pools[0]); + + for (i = 1; i < num_xstreams; i++) { + ABT_xstream_create_basic(ABT_SCHED_DEFAULT, 1, &g_pools[i], + ABT_SCHED_CONFIG_NULL, &xstreams[i]); + ABT_xstream_start(xstreams[i]); + } + + /* Each task is created on the xstream which is going to execute it */ + + for (r = 0; r < rep; r++) { + time[r] = ABT_get_wtime(); + + int bloc = NUM / (num_xstreams); + int rest = NUM % (num_xstreams); + int start = 0; + int end = 0; + + for (j = 0; j < num_xstreams; j++) { + start = end; + int inc = (j < rest) ? 
1 : 0; + end += bloc + inc; + args[j].start = start; + args[j].end = end; + args[j].it = NUM; + args[j].nxstreams = inner_xstreams; + if (j > 0) { + ABT_thread_create(g_pools[j], vector_scal_launch, + (void *)&args[j], ABT_THREAD_ATTR_NULL, + &threads[j]); + } + } + vector_scal_launch((void *)&args[0]); + + for (j = 1; j < num_xstreams; j++) { + ABT_thread_free(&threads[j]); + } + + time[r] = ABT_get_wtime() - time[r]; + avg_time += time[r]; + } + avg_time /= rep; + printf("%d %d %f\n", num_xstreams, inner_xstreams, avg_time); + check(); + + for (i = 1; i < num_xstreams; i++) { + ABT_xstream_join(xstreams[i]); + ABT_xstream_free(&xstreams[i]); + } + + ABT_finalize(); + + free(g_pools); + free(xstreams); + free(threads); + free(args); + free(time); + + return EXIT_SUCCESS; +} diff --git a/bolt/examples/argobots/nested_parallel_for_block_abt_thread.c b/bolt/examples/argobots/nested_parallel_for_block_abt_thread.c new file mode 100644 index 0000000000000..9b3fe45f44f59 --- /dev/null +++ b/bolt/examples/argobots/nested_parallel_for_block_abt_thread.c @@ -0,0 +1,241 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */ + +/* + * See LICENSE.txt in top-level directory. + */ + +/* This code mimics the parallel for OpenMP directive in nested loops. + * It creates as many streams as user requires and threads are created and + * assigned by static blocs to each stream for the outer loop. + * For the inner loop, as many threads as the user requires are created. 
+ */
+
+
+/* NOTE(review): the header names on the #include lines are missing in this
+ * copy of the patch.  The list below is rebuilt from the calls the file makes
+ * (printf, malloc/atoi, sqrt/cos/sin, ABT_*); confirm against upstream. */
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <abt.h>
+
+#define NUM_XSTREAMS 36
+#define NUM 1000
+#define NUM_REPS 1
+
+int in[NUM][NUM];
+int out[NUM][NUM];
+
+/* Vector initialization: every in[][] cell becomes 1, every out[][] cell 0. */
+void init(void)
+{
+    int i, j;
+    for (i = 0; i < NUM; i++) {
+        for (j = 0; j < NUM; j++) {
+            in[i][j] = 1;
+            out[i][j] = 0;
+        }
+    }
+}
+
+/* Per-element work: sums sqrt(cos(v) * sin(v)) one hundred times and returns
+ * the total truncated to int. */
+int comp(int v)
+{
+    int i;
+    double ret = 0.0;
+    for (i = 0; i < 100; i++) {
+        ret += sqrt(cos((double)v) * sin((double)v));
+    }
+    return (int)ret;
+}
+
+/* Recomputes comp() for every input cell and compares it with out[][].
+ * Reports the first mismatch and stops; prints SUCCESS otherwise. */
+void check(void)
+{
+    int i, j;
+    for (i = 0; i < NUM; i++) {
+        for (j = 0; j < NUM; j++) {
+            int expected = comp(in[i][j]);
+            if (out[i][j] != expected) {
+                printf("out[%d][%d]=%d expected=%d\n", i, j, out[i][j], expected);
+                return;
+            }
+        }
+    }
+    printf("Verification: SUCCESS\n");
+}
+
+
+static ABT_pool *g_pools;
+
+/* Work descriptor for one inner chunk: columns [start, end) of row x. */
+typedef struct {
+    int start;
+    int end;
+    int x;
+} vector_scal_args_t;
+
+/* Work descriptor for one outer chunk: rows [start, end), each split into
+ * nxstreams inner chunks over `it` columns. */
+typedef struct {
+    int nxstreams;
+    int it;
+    int start;
+    int end;
+} vector_scal_task_args_t;
+
+/* Inner-loop body: out[x][j] = comp(in[x][j]) for j in [start, end). */
+void vector_scal(void *arguments)
+{
+    int j;
+    vector_scal_args_t *arg;
+    arg = (vector_scal_args_t *)arguments;
+
+    int mystart = arg->start;
+    int myend = arg->end;
+    int x = arg->x;
+
+    for (j = mystart; j < myend; j++) {
+        out[x][j] = comp(in[x][j]);
+    }
+}
+
+/* Outer-loop body.  For every row in [arg->start, arg->end) the arg->it
+ * columns are split into arg->nxstreams contiguous chunks; chunk 0 runs inline
+ * and the remaining chunks run as ULTs pushed to g_pools[rank of the calling
+ * execution stream].  ABT_thread_free() is what joins them before the next row
+ * reuses args[] and threads[]. */
+void vector_scal_launch(void *arguments)
+{
+    int i, it, j, num_ults, rank, mystart, myend, p;
+    ABT_thread *threads;
+    vector_scal_task_args_t *arg;
+    arg = (vector_scal_task_args_t *) arguments;
+    vector_scal_args_t *args;
+    it = arg->it;
+    num_ults = arg->nxstreams;
+    mystart = arg->start;
+    myend = arg->end;
+
+    args = (vector_scal_args_t *)malloc(sizeof(vector_scal_args_t)
+                                        * num_ults);
+
+    threads = (ABT_thread *)malloc(sizeof(ABT_thread) * num_ults);
+
+    int bloc = it / (num_ults);
+    int rest = it % (num_ults);
+    ABT_xstream_self_rank(&rank);
+    for (i = mystart; i < myend; i++) {
+        int start = 0;
+        int end = 0;
+        for (j = 0; j < num_ults; j++) {
+            start = end;
+            /* the first `rest` chunks absorb the remainder, one column each */
+            int inc = (j < rest) ? 1 : 0;
+            end += bloc + inc;
+            args[j].start = start;
+            args[j].end = end;
+            args[j].x = i;
+
+            if (j > 0) {
+                ABT_thread_create(g_pools[rank], vector_scal,
+                                  (void *)&args[j], ABT_THREAD_ATTR_NULL,
+                                  &threads[j]);
+            }
+        }
+        vector_scal((void *)&args[0]);
+        for (p = 1; p < num_ults; p++) {
+            ABT_thread_free(&threads[p]);
+        }
+    }
+
+    free(threads);
+    free(args);
+}
+
+
+/* Usage: prog [num_xstreams] [inner_ults] [reps]
+ * Prints "<num_xstreams> <inner_ults> <avg seconds per rep>" and then the
+ * verification result. */
+int main(int argc, char *argv[])
+{
+    int i, j, r;
+    int num_xstreams;
+    ABT_xstream *xstreams;
+    ABT_thread *threads;
+    vector_scal_task_args_t *args;
+    int inner_xstreams;
+    double *time, avg_time = 0.0;
+
+    num_xstreams = (argc > 1) ? atoi(argv[1]) : NUM_XSTREAMS;
+    inner_xstreams = (argc > 2) ? atoi(argv[2]) : NUM_XSTREAMS;
+    int rep = (argc > 3) ? atoi(argv[3]) : NUM_REPS;
+
+    /* All three values are used below as divisors and allocation counts, so
+     * zero, negative and non-numeric (atoi() == 0) arguments are rejected. */
+    if (num_xstreams < 1 || inner_xstreams < 1 || rep < 1) {
+        fprintf(stderr, "Usage: %s [num_xstreams] [inner_ults] [reps]"
+                " (each must be >= 1)\n", argv[0]);
+        return EXIT_FAILURE;
+    }
+
+    time = (double *)malloc(sizeof(double) * rep);
+
+    init();
+
+    g_pools = (ABT_pool *)malloc(sizeof(ABT_pool) * num_xstreams);
+    xstreams = (ABT_xstream *)malloc(sizeof(ABT_xstream) * num_xstreams);
+    threads = (ABT_thread *)malloc(sizeof(ABT_thread) * num_xstreams);
+    args = (vector_scal_task_args_t *)malloc(sizeof(vector_scal_task_args_t)
+                                             * num_xstreams);
+
+    /* initialization */
+    ABT_init(argc, argv);
+
+    for (i = 0; i < num_xstreams; i++) {
+        ABT_pool_create_basic(ABT_POOL_FIFO, ABT_POOL_ACCESS_MPMC, ABT_TRUE,
+                              &g_pools[i]);
+    }
+
+    /* ES creation */
+    ABT_xstream_self(&xstreams[0]);
+    ABT_xstream_set_main_sched_basic(xstreams[0], ABT_SCHED_DEFAULT,
+                                     1, &g_pools[0]);
+
+    for (i = 1; i < num_xstreams; i++) {
+        ABT_xstream_create_basic(ABT_SCHED_DEFAULT, 1, &g_pools[i],
+                                 ABT_SCHED_CONFIG_NULL, &xstreams[i]);
+        ABT_xstream_start(xstreams[i]);
+    }
+
+    /* Each task is created on the xstream which is going to execute it */
+
+    for (r = 0; r < rep; r++) {
+        time[r] = ABT_get_wtime();
+
+        int bloc = NUM / (num_xstreams);
+        int rest = NUM % (num_xstreams);
+        int start = 0;
+        int end = 0;
+
+        for (j = 0; j < num_xstreams; j++) {
+            start = end;
+            int inc = (j < rest) ? 1 : 0;
+            end += bloc + inc;
+            args[j].start = start;
+            args[j].end = end;
+            args[j].it = NUM;
+            args[j].nxstreams = inner_xstreams;
+            if (j > 0) {
+                ABT_thread_create(g_pools[j], vector_scal_launch,
+                                  (void *)&args[j], ABT_THREAD_ATTR_NULL,
+                                  &threads[j]);
+            }
+        }
+        /* chunk 0 runs on the calling ULT */
+        vector_scal_launch((void *)&args[0]);
+
+        for (j = 1; j < num_xstreams; j++) {
+            ABT_thread_free(&threads[j]);
+        }
+
+        time[r] = ABT_get_wtime() - time[r];
+        avg_time += time[r];
+    }
+    avg_time /= rep;
+    printf("%d %d %f\n", num_xstreams, inner_xstreams, avg_time);
+    check();
+
+    for (i = 1; i < num_xstreams; i++) {
+        ABT_xstream_join(xstreams[i]);
+        ABT_xstream_free(&xstreams[i]);
+    }
+
+    ABT_finalize();
+
+    free(g_pools);
+    free(xstreams);
+    free(threads);
+    free(args);
+    free(time);
+
+    return EXIT_SUCCESS;
+}
diff --git a/bolt/examples/argobots/nested_parallel_for_block_omp.c b/bolt/examples/argobots/nested_parallel_for_block_omp.c
new file mode 100644
index 0000000000000..df84555f855ea
--- /dev/null
+++ b/bolt/examples/argobots/nested_parallel_for_block_omp.c
@@ -0,0 +1,105 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+
+/*
+ * See LICENSE.txt in top-level directory.
+ */
+
+/* Nested Pragma omp parallel for directives evaluation
+ * Output: avg time
+ */
+
+/* NOTE(review): the header names on the #include lines are missing in this
+ * copy of the patch.  The list below is rebuilt from the calls the file makes
+ * (printf, malloc/atoi, sqrt/cos/sin, omp_*); confirm against upstream. */
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <omp.h>
+
+#define NUM 1000
+#define NUM_REPS 10
+
+int in[NUM][NUM];
+int out[NUM][NUM];
+
+/* Vector initialization: every in[][] cell becomes 1, every out[][] cell 0. */
+void init(void)
+{
+    int i, j;
+    for (i = 0; i < NUM; i++) {
+        for (j = 0; j < NUM; j++) {
+            in[i][j] = 1;
+            out[i][j] = 0;
+        }
+    }
+}
+
+/* Per-element work: sums sqrt(cos(v) * sin(v)) one hundred times and returns
+ * the total truncated to int. */
+int comp(int v)
+{
+    int i;
+    double ret = 0.0;
+    for (i = 0; i < 100; i++) {
+        ret += sqrt(cos((double)v) * sin((double)v));
+    }
+    return (int)ret;
+}
+
+/* Inner parallel loop: fills row x of out[][] from row x of in[][]. */
+void petsc_voodoo(int x)
+{
+    int j;
+
+    #pragma omp parallel for
+    for (j = 0; j < NUM; j++) {
+        out[x][j] = comp(in[x][j]);
+    }
+}
+
+/* Recomputes comp() for every input cell and compares it with out[][].
+ * Reports the first mismatch and stops; prints SUCCESS otherwise. */
+void check(void)
+{
+    int i, j;
+    for (i = 0; i < NUM; i++) {
+        for (j = 0; j < NUM; j++) {
+            int expected = comp(in[i][j]);
+            if (out[i][j] != expected) {
+                printf("out[%d][%d]=%d expected=%d\n", i, j, out[i][j], expected);
+                return;
+            }
+        }
+    }
+    printf("Verification: SUCCESS\n");
+}
+
+/* Usage: prog [inner_threads] [reps]
+ * Prints "<outer threads> <inner threads> <avg seconds per rep>" and then the
+ * verification result. */
+int main(int argc, char *argv[])
+{
+    int i, r, nthreads;
+    double *time, avg_time = 0.0;
+
+    /* discover the default team size used for the outer loop */
+    #pragma omp parallel
+    {
+        #pragma omp master
+        {
+            nthreads = omp_get_num_threads();
+        }
+    }
+    int in_th = (argc > 1) ? atoi(argv[1]) : nthreads;
+    int rep = (argc > 2) ? atoi(argv[2]) : NUM_REPS;
+
+    /* in_th is handed to omp_set_num_threads() and rep is both an allocation
+     * count and the divisor of the average, so both must be positive. */
+    if (in_th < 1 || rep < 1) {
+        fprintf(stderr, "Usage: %s [inner_threads] [reps]"
+                " (each must be >= 1)\n", argv[0]);
+        return EXIT_FAILURE;
+    }
+
+    time = (double *)malloc(sizeof(double) * rep);
+    init();
+    for (r = 0; r < rep; r++) {
+        time[r] = omp_get_wtime();
+
+        #pragma omp parallel for
+        for (i = 0; i < NUM; i++) {
+            /* sets the team size requested by the nested region inside
+             * petsc_voodoo() */
+            omp_set_num_threads(in_th);
+            petsc_voodoo(i);
+        }
+        time[r] = omp_get_wtime() - time[r];
+        avg_time += time[r];
+    }
+    avg_time /= rep;
+    printf("%d %d %f\n", nthreads, in_th, avg_time);
+    check();
+
+    free(time);
+
+    return EXIT_SUCCESS;
+}
diff --git a/bolt/examples/argobots/nested_parallel_for_irregular_abt_task.c b/bolt/examples/argobots/nested_parallel_for_irregular_abt_task.c
new file mode 100644
index 0000000000000..162b4e37c96f9
--- /dev/null
+++ b/bolt/examples/argobots/nested_parallel_for_irregular_abt_task.c
@@ -0,0 +1,245 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+
+/*
+ * See LICENSE.txt in top-level directory.
+ */
+
+/* This code mimics the parallel for OpenMP directive in nested loops.
+ * It creates as many streams as user requires and threads are created and
+ * assigned by static blocs to each stream for the outer loop.
+ * For the inner loop, as many task as the user requires are created.
+ */
+
+/* NOTE(review): the header names on the #include lines are missing in this
+ * copy of the patch.  The list below is rebuilt from the calls the file makes
+ * (printf, malloc/rand, assert, ceil/sqrt, gettimeofday, ABT_*); confirm
+ * against upstream. */
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <abt.h>
+
+#define NUM_ELEMS 5017600 /* 2240 * 2240 inner-loop iterations */
+#define NUM_XSTREAMS 4
+#define NUM_REPS 1
+
+ABT_pool *g_pools;
+
+/* Inner chunk: iterations [start, end). */
+typedef struct {
+    int start;
+    int end;
+} vector_scal_args_t;
+
+/* Outer chunk: rows [start, end), each split into nxstreams inner chunks over
+ * `it` iterations. */
+typedef struct {
+    int nxstreams;
+    int it;
+    int start;
+    int end;
+} vector_scal_task_args_t;
+
+/* Irregular inner work: each iteration spins for rand() % 10000 increments.
+ * NOTE(review): rand() is called from work units on several execution
+ * streams; POSIX does not require rand() to be thread-safe -- confirm this is
+ * acceptable for the benchmark. */
+void exe_random(void *arguments)
+{
+    int i, k;
+    vector_scal_args_t *arg;
+    arg = (vector_scal_args_t *) arguments;
+    int mystart = arg->start;
+    int myend = arg->end;
+    for (i = mystart; i < myend; i++) {
+        int random = rand() % 10000;
+        int kk = 0;
+        for (k = 0; k < random; k++)
+            kk++;
+        assert(kk == random);
+    }
+}
+
+/* Outer-loop body.  For every row in [arg->start, arg->end) the arg->it
+ * iterations are split into arg->nxstreams chunks, one tasklet per chunk is
+ * pushed to g_pools[rank of the calling execution stream], and
+ * ABT_task_free() joins them before the next row reuses args[] and tasks[]. */
+void random_launch(void *arguments)
+{
+    int i, it, j, num_ults, rank, mystart, myend, p;
+#ifdef PROFTIME
+    struct timeval t_start, t_end;
+    struct timeval t_start2, t_end2;
+    double time, time2;
+#endif
+    ABT_task *tasks;
+    vector_scal_task_args_t *arg;
+    arg = (vector_scal_task_args_t *) arguments;
+    vector_scal_args_t *args;
+    it = arg->it;
+    num_ults = arg->nxstreams;
+    mystart = arg->start;
+    myend = arg->end;
+
+    args = (vector_scal_args_t *) malloc(sizeof(vector_scal_args_t)
+                                         * num_ults);
+    tasks = (ABT_task *)malloc(sizeof(ABT_task) * num_ults);
+
+    int bloc = it / (num_ults);
+    int rest = it % (num_ults);
+    ABT_xstream_self_rank(&rank);
+#ifdef PROFTIME
+    gettimeofday(&t_start, NULL);
+#endif
+    for (i = mystart; i < myend; i++) {
+        /* restart the split at 0 for every row so each row covers [0, it);
+         * the chunk lengths are the same as before */
+        int start = 0;
+        int end = 0;
+        for (j = 0; j < num_ults; j++) {
+            start = end;
+            int inc = (j < rest) ? 1 : 0;
+            end += bloc + inc;
+            args[j].start = start;
+            args[j].end = end;
+
+#ifdef PROFTIME
+            gettimeofday(&t_start2, NULL);
+#endif
+            ABT_task_create(g_pools[rank], exe_random,
+                            (void *)&args[j], &tasks[j]);
+#ifdef PROFTIME
+            gettimeofday(&t_end2, NULL);
+            time2 = (t_end2.tv_sec * 1000000 + t_end2.tv_usec) -
+                    (t_start2.tv_sec * 1000000 + t_start2.tv_usec);
+            printf("Inner_task_creation_time %f\n", (time2 / 1000000.0));
+#endif
+        }
+#ifdef PROFTIME
+        gettimeofday(&t_start2, NULL);
+#endif
+        for (p = 0; p < num_ults; p++) {
+            ABT_task_free(&tasks[p]);
+        }
+#ifdef PROFTIME
+        gettimeofday(&t_end2, NULL);
+        time2 = (t_end2.tv_sec * 1000000 + t_end2.tv_usec) -
+                (t_start2.tv_sec * 1000000 + t_start2.tv_usec);
+        printf("Inner_join_time %f\n", (time2 / 1000000.0));
+#endif
+    }
+#ifdef PROFTIME
+    gettimeofday(&t_end, NULL);
+    time = (t_end.tv_sec * 1000000 + t_end.tv_usec) -
+           (t_start.tv_sec * 1000000 + t_start.tv_usec);
+    /* report the whole-ULT interval, not the last inner join interval */
+    printf("ult_time %f\n", (time / 1000000.0));
+#endif
+
+    /* every tasklet was joined above, so nothing references these any more */
+    free(tasks);
+    free(args);
+}
+
+
+/* Usage: prog [num_xstreams] [ntasks] [inner_tasks]
+ * ntasks is rounded up to the next perfect square.  Prints
+ * "<num_xstreams> <inner_tasks> <ntasks> <seconds>". */
+int main(int argc, char *argv[])
+{
+    int i, j;
+    int ntasks;
+    int num_xstreams;
+    ABT_xstream *xstreams;
+    vector_scal_task_args_t *args;
+
+    struct timeval t_start, t_end;
+#ifdef PROFTIME
+    struct timeval t_start2, t_end2;
+    double time2;
+#endif
+    int it;
+    int inner_xstreams;
+    srand(1983);
+    num_xstreams = argc > 1 ? atoi(argv[1]) : NUM_XSTREAMS;
+    ntasks = argc > 2 ? strtoll(argv[2], NULL, 10) : NUM_ELEMS;
+    inner_xstreams = argc > 3 ? atoi(argv[3]) : NUM_XSTREAMS;
+
+    /* The stream counts are divisors and allocation sizes; ntasks feeds
+     * sqrt().  Reject values that would divide by zero or go negative. */
+    if (num_xstreams < 1 || ntasks < 1 || inner_xstreams < 1) {
+        fprintf(stderr, "Usage: %s [num_xstreams] [ntasks] [inner_tasks]"
+                " (each must be >= 1)\n", argv[0]);
+        return EXIT_FAILURE;
+    }
+
+    it = ceil(sqrt(ntasks));
+    ntasks = it * it;
+    g_pools = (ABT_pool *)malloc(sizeof(ABT_pool) * num_xstreams);
+
+    xstreams = (ABT_xstream *) malloc(sizeof(ABT_xstream) * num_xstreams);
+    args = (vector_scal_task_args_t *) malloc(sizeof(vector_scal_task_args_t)
+                                              * num_xstreams);
+
+    /* initialization */
+    ABT_init(argc, argv);
+    for (i = 0; i < num_xstreams; i++) {
+        ABT_pool_create_basic(ABT_POOL_FIFO, ABT_POOL_ACCESS_MPMC, ABT_TRUE,
+                              &g_pools[i]);
+    }
+
+    /* ES creation */
+    ABT_xstream_self(&xstreams[0]);
+    ABT_xstream_set_main_sched_basic(xstreams[0], ABT_SCHED_DEFAULT,
+                                     1, &g_pools[0]);
+
+    for (i = 1; i < num_xstreams; i++) {
+        ABT_xstream_create_basic(ABT_SCHED_DEFAULT, 1, &g_pools[i],
+                                 ABT_SCHED_CONFIG_NULL, &xstreams[i]);
+        ABT_xstream_start(xstreams[i]);
+    }
+
+    gettimeofday(&t_start, NULL);
+
+    /* Each task is created on the xstream which is going to execute it */
+
+    int bloc = it / (num_xstreams);
+    int rest = it % (num_xstreams);
+    int start = 0;
+    int end = 0;
+    for (j = 0; j < num_xstreams; j++) {
+        start = end;
+        int inc = (j < rest) ? 1 : 0;
+        end += bloc + inc;
+        args[j].start = start;
+        args[j].end = end;
+        args[j].it = it;
+        args[j].nxstreams = inner_xstreams;
+#ifdef PROFTIME
+        gettimeofday(&t_start2, NULL);
+#endif
+        ABT_thread_create_on_xstream(xstreams[j], random_launch,
+                                     (void *)&args[j], ABT_THREAD_ATTR_NULL,
+                                     NULL);
+#ifdef PROFTIME
+        gettimeofday(&t_end2, NULL);
+        time2 = (t_end2.tv_sec * 1000000 + t_end2.tv_usec) -
+                (t_start2.tv_sec * 1000000 + t_start2.tv_usec);
+        printf("ULT creation time %f\n", time2 / 1000000.0);
+#endif
+    }
+
+    ABT_thread_yield();
+
+#ifdef PROFTIME
+    gettimeofday(&t_start2, NULL);
+#endif
+    /* NOTE(review): completion is inferred from every pool reporting size 0.
+     * Presumably a work unit that is currently running is not counted by
+     * ABT_pool_get_size(), in which case t_end can be taken slightly early --
+     * confirm against the Argobots documentation. */
+    for (i = 0; i < num_xstreams; i++) {
+        size_t size;
+        while (1) {
+            ABT_pool_get_size(g_pools[i], &size);
+            if (size == 0) break;
+            ABT_thread_yield();
+        }
+    }
+#ifdef PROFTIME
+    gettimeofday(&t_end2, NULL);
+    time2 = (t_end2.tv_sec * 1000000 + t_end2.tv_usec) -
+            (t_start2.tv_sec * 1000000 + t_start2.tv_usec);
+    printf("Join time %f\n", time2 / 1000000.0);
+
+#endif
+
+    gettimeofday(&t_end, NULL);
+    double time = (t_end.tv_sec * 1000000 + t_end.tv_usec) -
+                  (t_start.tv_sec * 1000000 + t_start.tv_usec);
+
+
+
+    for (i = 1; i < num_xstreams; i++) {
+        ABT_xstream_join(xstreams[i]);
+        ABT_xstream_free(&xstreams[i]);
+    }
+    printf("%d %d %d %f\n",
+           num_xstreams, inner_xstreams, ntasks, time / 1000000.0);
+
+    ABT_finalize();
+    free(xstreams);
+    /* released only after ABT_finalize(), when no work unit can still read
+     * them */
+    free(args);
+    free(g_pools);
+
+    return EXIT_SUCCESS;
+}
diff --git a/bolt/examples/argobots/nested_parallel_for_irregular_abt_thread.c b/bolt/examples/argobots/nested_parallel_for_irregular_abt_thread.c
new file mode 100644
index 0000000000000..ae7f75ce9c92c
--- /dev/null
+++ b/bolt/examples/argobots/nested_parallel_for_irregular_abt_thread.c
@@ -0,0 +1,245 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+
+/*
+ * See LICENSE.txt in top-level directory.
+ */
+
+/* This code mimics the parallel for OpenMP directive in nested loops.
+ * It creates as many streams as user requires and threads are created and
+ * assigned by static blocs to each stream for the outer loop.
+ * For the inner loop, as many threads as the user requires are created.
+ */
+
+/* NOTE(review): the header names on the #include lines are missing in this
+ * copy of the patch.  The list below is rebuilt from the calls the file makes
+ * (printf, malloc/rand, assert, ceil/sqrt, gettimeofday, ABT_*); confirm
+ * against upstream. */
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <abt.h>
+
+#define NUM_ELEMS 5017600 /* 2240 * 2240 inner-loop iterations */
+#define NUM_XSTREAMS 4
+#define NUM_REPS 1
+
+ABT_pool *g_pools;
+
+/* Inner chunk: iterations [start, end). */
+typedef struct {
+    int start;
+    int end;
+} vector_scal_args_t;
+
+/* Outer chunk: rows [start, end), each split into nxstreams inner chunks over
+ * `it` iterations. */
+typedef struct {
+    int nxstreams;
+    int it;
+    int start;
+    int end;
+} vector_scal_task_args_t;
+
+/* Irregular inner work: each iteration spins for rand() % 10000 increments.
+ * NOTE(review): rand() is called from ULTs on several execution streams;
+ * POSIX does not require rand() to be thread-safe -- confirm this is
+ * acceptable for the benchmark. */
+void exe_random(void *arguments)
+{
+    int i, k;
+    vector_scal_args_t *arg;
+    arg = (vector_scal_args_t *) arguments;
+    int mystart = arg->start;
+    int myend = arg->end;
+    for (i = mystart; i < myend; i++) {
+        int random = rand() % 10000;
+        int kk = 0;
+        for (k = 0; k < random; k++)
+            kk++;
+        assert(kk == random);
+    }
+}
+
+/* Outer-loop body.  For every row in [arg->start, arg->end) the arg->it
+ * iterations are split into arg->nxstreams chunks, one ULT per chunk is
+ * pushed to g_pools[rank of the calling execution stream], and
+ * ABT_thread_free() joins them before the next row reuses args[] and
+ * threads[]. */
+void random_launch(void *arguments)
+{
+    int i, it, j, num_ults, rank, mystart, myend, p;
+#ifdef PROFTIME
+    struct timeval t_start, t_end;
+    struct timeval t_start2, t_end2;
+    double time, time2;
+#endif
+    ABT_thread *threads;
+    vector_scal_task_args_t *arg;
+    arg = (vector_scal_task_args_t *) arguments;
+    vector_scal_args_t *args;
+    it = arg->it;
+    num_ults = arg->nxstreams;
+    mystart = arg->start;
+    myend = arg->end;
+    args = (vector_scal_args_t *) malloc(sizeof(vector_scal_args_t)
+                                         * num_ults);
+    threads = (ABT_thread *)malloc(sizeof(ABT_thread) * num_ults);
+
+    int bloc = it / (num_ults);
+    int rest = it % (num_ults);
+    ABT_xstream_self_rank(&rank);
+#ifdef PROFTIME
+    gettimeofday(&t_start, NULL);
+#endif
+    for (i = mystart; i < myend; i++) {
+        /* restart the split at 0 for every row so each row covers [0, it);
+         * the chunk lengths are the same as before */
+        int start = 0;
+        int end = 0;
+        for (j = 0; j < num_ults; j++) {
+            start = end;
+            int inc = (j < rest) ? 1 : 0;
+            end += bloc + inc;
+            args[j].start = start;
+            args[j].end = end;
+
+#ifdef PROFTIME
+            gettimeofday(&t_start2, NULL);
+#endif
+            ABT_thread_create(g_pools[rank], exe_random,
+                              (void *)&args[j], ABT_THREAD_ATTR_NULL,
+                              &threads[j]);
+#ifdef PROFTIME
+            gettimeofday(&t_end2, NULL);
+            time2 = (t_end2.tv_sec * 1000000 + t_end2.tv_usec) -
+                    (t_start2.tv_sec * 1000000 + t_start2.tv_usec);
+            printf("Inner_ults_creation_time %f\n", (time2 / 1000000.0));
+#endif
+        }
+#ifdef PROFTIME
+        gettimeofday(&t_start2, NULL);
+#endif
+        for (p = 0; p < num_ults; p++) {
+            ABT_thread_free(&threads[p]);
+        }
+#ifdef PROFTIME
+        gettimeofday(&t_end2, NULL);
+        time2 = (t_end2.tv_sec * 1000000 + t_end2.tv_usec) -
+                (t_start2.tv_sec * 1000000 + t_start2.tv_usec);
+        printf("Inner_join_time %f\n", (time2 / 1000000.0));
+#endif
+    }
+#ifdef PROFTIME
+    gettimeofday(&t_end, NULL);
+    time = (t_end.tv_sec * 1000000 + t_end.tv_usec) -
+           (t_start.tv_sec * 1000000 + t_start.tv_usec);
+    printf("ult_time %f\n", (time / 1000000.0));
+#endif
+
+    /* every ULT was joined above, so nothing references these any more */
+    free(threads);
+    free(args);
+}
+
+
+/* Usage: prog [num_xstreams] [ntasks] [inner_ults]
+ * ntasks is rounded up to the next perfect square.  Prints
+ * "<num_xstreams> <inner_ults> <ntasks> <seconds>". */
+int main(int argc, char *argv[])
+{
+    int i, j;
+    int ntasks;
+    int num_xstreams;
+    ABT_xstream *xstreams;
+    vector_scal_task_args_t *args;
+
+    struct timeval t_start, t_end;
+#ifdef PROFTIME
+    struct timeval t_start2, t_end2;
+    double time2;
+#endif
+    int it;
+    int inner_xstreams;
+    srand(1983);
+    num_xstreams = argc > 1 ? atoi(argv[1]) : NUM_XSTREAMS;
+    ntasks = argc > 2 ? strtoll(argv[2], NULL, 10) : NUM_ELEMS;
+    inner_xstreams = argc > 3 ? atoi(argv[3]) : NUM_XSTREAMS;
+
+    /* The stream counts are divisors and allocation sizes; ntasks feeds
+     * sqrt().  Reject values that would divide by zero or go negative. */
+    if (num_xstreams < 1 || ntasks < 1 || inner_xstreams < 1) {
+        fprintf(stderr, "Usage: %s [num_xstreams] [ntasks] [inner_ults]"
+                " (each must be >= 1)\n", argv[0]);
+        return EXIT_FAILURE;
+    }
+
+    it = ceil(sqrt(ntasks));
+    ntasks = it * it;
+    g_pools = (ABT_pool *)malloc(sizeof(ABT_pool) * num_xstreams);
+
+    xstreams = (ABT_xstream *) malloc(sizeof(ABT_xstream) * num_xstreams);
+    args = (vector_scal_task_args_t *) malloc(sizeof(vector_scal_task_args_t)
+                                              * num_xstreams);
+
+    /* initialization */
+    ABT_init(argc, argv);
+    for (i = 0; i < num_xstreams; i++) {
+        ABT_pool_create_basic(ABT_POOL_FIFO, ABT_POOL_ACCESS_MPMC, ABT_TRUE,
+                              &g_pools[i]);
+    }
+
+    /* ES creation */
+    ABT_xstream_self(&xstreams[0]);
+    ABT_xstream_set_main_sched_basic(xstreams[0], ABT_SCHED_DEFAULT,
+                                     1, &g_pools[0]);
+
+    for (i = 1; i < num_xstreams; i++) {
+        ABT_xstream_create_basic(ABT_SCHED_DEFAULT, 1, &g_pools[i],
+                                 ABT_SCHED_CONFIG_NULL, &xstreams[i]);
+        ABT_xstream_start(xstreams[i]);
+    }
+
+    gettimeofday(&t_start, NULL);
+
+    /* Each task is created on the xstream which is going to execute it */
+
+    int bloc = it / (num_xstreams);
+    int rest = it % (num_xstreams);
+    int start = 0;
+    int end = 0;
+    for (j = 0; j < num_xstreams; j++) {
+        start = end;
+        int inc = (j < rest) ? 1 : 0;
+        end += bloc + inc;
+        args[j].start = start;
+        args[j].end = end;
+        args[j].it = it;
+        args[j].nxstreams = inner_xstreams;
+#ifdef PROFTIME
+        gettimeofday(&t_start2, NULL);
+#endif
+        ABT_thread_create_on_xstream(xstreams[j], random_launch,
+                                     (void *)&args[j], ABT_THREAD_ATTR_NULL,
+                                     NULL);
+#ifdef PROFTIME
+        gettimeofday(&t_end2, NULL);
+        time2 = (t_end2.tv_sec * 1000000 + t_end2.tv_usec) -
+                (t_start2.tv_sec * 1000000 + t_start2.tv_usec);
+        printf("ULT creation time %f\n", time2 / 1000000.0);
+#endif
+    }
+
+    ABT_thread_yield();
+
+#ifdef PROFTIME
+    gettimeofday(&t_start2, NULL);
+#endif
+    /* NOTE(review): completion is inferred from every pool reporting size 0.
+     * Presumably a work unit that is currently running is not counted by
+     * ABT_pool_get_size(), in which case t_end can be taken slightly early --
+     * confirm against the Argobots documentation. */
+    for (i = 0; i < num_xstreams; i++) {
+        size_t size;
+        while (1) {
+            ABT_pool_get_size(g_pools[i], &size);
+            if (size == 0) break;
+            ABT_thread_yield();
+        }
+    }
+#ifdef PROFTIME
+    gettimeofday(&t_end2, NULL);
+    time2 = (t_end2.tv_sec * 1000000 + t_end2.tv_usec) -
+            (t_start2.tv_sec * 1000000 + t_start2.tv_usec);
+    printf("Join time %f\n", time2 / 1000000.0);
+
+#endif
+
+    gettimeofday(&t_end, NULL);
+    double time = (t_end.tv_sec * 1000000 + t_end.tv_usec) -
+                  (t_start.tv_sec * 1000000 + t_start.tv_usec);
+
+
+
+    for (i = 1; i < num_xstreams; i++) {
+        ABT_xstream_join(xstreams[i]);
+        ABT_xstream_free(&xstreams[i]);
+    }
+    printf("%d %d %d %f\n",
+           num_xstreams, inner_xstreams, ntasks, time / 1000000.0);
+
+    ABT_finalize();
+    free(xstreams);
+    /* released only after ABT_finalize(), when no work unit can still read
+     * them */
+    free(args);
+    free(g_pools);
+
+    return EXIT_SUCCESS;
+}
diff --git a/bolt/examples/argobots/nested_parallel_for_irregular_omp.c b/bolt/examples/argobots/nested_parallel_for_irregular_omp.c
new file mode 100644
index 0000000000000..e2fa6d39be5dc
--- /dev/null
+++ b/bolt/examples/argobots/nested_parallel_for_irregular_omp.c
@@ -0,0 +1,66 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+
+/*
+ * See LICENSE.txt in top-level directory.
+ */
+
+/* Nested Pragma omp parallel for directive evaluation
+ * Output: avg time
+ */
+
+/* NOTE(review): the header names on the #include lines are missing in this
+ * copy of the patch.  The list below is rebuilt from the calls the file makes
+ * (printf, malloc/rand, assert, ceil/sqrt, omp_*); confirm against
+ * upstream. */
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <omp.h>
+
+#define NUM_ELEMS 5017600 /* 2240 * 2240 inner-loop iterations */
+#define NUM_REPS 1
+
+/* Usage: prog [n] [inner_threads] [reps]
+ * n is rounded up to the next perfect square it * it; the outer and the inner
+ * parallel loop each run `it` iterations.  Prints
+ * "<outer threads> <inner threads> <n> <avg seconds per rep>". */
+int main(int argc, char *argv[])
+{
+    int i, j, r, nthreads;
+    double *time, avg_time = 0.0;
+
+    /* discover the default team size used for the outer loop */
+    #pragma omp parallel
+    {
+        #pragma omp master
+        {
+            nthreads = omp_get_num_threads();
+        }
+    }
+    int n = (argc > 1) ? atoi(argv[1]) : NUM_ELEMS;
+    int in_th = (argc > 2) ? atoi(argv[2]) : nthreads;
+    /* NOTE(review): the default here is 3 although NUM_REPS is defined
+     * above; kept as is -- confirm which one is intended. */
+    int rep = (argc > 3) ? atoi(argv[3]) : 3;
+
+    /* n feeds sqrt(), in_th goes to omp_set_num_threads() and rep is the
+     * divisor of the average, so all of them must be positive. */
+    if (n < 1 || in_th < 1 || rep < 1) {
+        fprintf(stderr, "Usage: %s [n] [inner_threads] [reps]"
+                " (each must be >= 1)\n", argv[0]);
+        return EXIT_FAILURE;
+    }
+
+    int it = ceil(sqrt((double)n));
+    srand(1983);
+
+    n = it * it;
+    time = (double *)malloc(sizeof(double) * rep);
+    for (r = 0; r < rep; r++) {
+        time[r] = omp_get_wtime();
+        #pragma omp parallel for
+        for (j = 0; j < it; j++) {
+            omp_set_num_threads(in_th);
+            #pragma omp parallel for
+            for (i = 0; i < it; i++) {
+                /* NOTE(review): rand() is called from several threads at
+                 * once; POSIX does not require it to be thread-safe --
+                 * confirm this is acceptable for the benchmark. */
+                int random = rand() % 10000;
+                /* volatile keeps the busy loop from being optimized away */
+                volatile int kk = 0;
+                int k;
+                for (k = 0; k < random; k++)
+                    kk++;
+                assert(kk == random);
+            }
+        }
+        time[r] = omp_get_wtime() - time[r];
+        avg_time += time[r];
+    }
+
+    avg_time /= rep;
+    printf("%d %d %d %f\n", nthreads, in_th, n, avg_time);
+
+    free(time);
+
+    return EXIT_SUCCESS;
+}
diff --git a/bolt/examples/argobots/nested_parallel_for_omp.c b/bolt/examples/argobots/nested_parallel_for_omp.c
new file mode 100644
index 0000000000000..97fef26ff9c30
--- /dev/null
+++ b/bolt/examples/argobots/nested_parallel_for_omp.c
@@ -0,0 +1,83 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+
+/*
+ * See LICENSE.txt in top-level directory.
+ */
+
+/* Nested Pragma omp parallel for directives evaluation
+ * Output: avg time
+ */
+
+/* NOTE(review): the header names on the #include lines are missing in this
+ * copy of the patch.  The list below is rebuilt from the calls the file makes
+ * (printf, malloc/atoi, ceil/sqrt, omp_*); confirm against upstream. */
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <omp.h>
+#define NUM_ELEMS 5017600 /* 2240 * 2240 floats (about 20 MB at 4 bytes each) */
+#define NUM_REPS 1
+
+/* Vector initialization */
+void init(float *v, int n)
+{
+    int i = 0;
+    for (i = 0; i < n; i++) {
+        v[i] = i + 100.0f;
+    }
+}
+
+/* Called after each test to be sure that the compiler does
+   not avoid to execute the test.  Expects every element to have been scaled
+   by 0.9f exactly once since init(). */
+void check(float *v, int n)
+{
+    int i = 0;
+    for (i = 0; i < n; i++) {
+        if (v[i] != (i + 100.0f) * 0.9f) {
+            printf("v[%d]<=0.0f\n", i);
+        }
+    }
+}
+
+/* Usage: prog [n] [inner_threads] [reps]
+ * n is rounded up to the next perfect square it * it.  Prints
+ * "<outer threads> <inner threads> <n> <avg seconds per rep>". */
+int main(int argc, char *argv[])
+{
+    int i, j, r, nthreads;
+    double *time, avg_time = 0.0;
+    float *v;
+
+    /* discover the default team size used for the outer loop */
+    #pragma omp parallel
+    {
+        #pragma omp master
+        {
+            nthreads = omp_get_num_threads();
+        }
+    }
+    int n = (argc > 1) ? atoi(argv[1]) : NUM_ELEMS;
+    int in_th = (argc > 2) ? atoi(argv[2]) : nthreads;
+    int rep = (argc > 3) ? atoi(argv[3]) : NUM_REPS;
+
+    /* n feeds sqrt(), in_th goes to omp_set_num_threads() and rep is the
+     * divisor of the average, so all of them must be positive. */
+    if (n < 1 || in_th < 1 || rep < 1) {
+        fprintf(stderr, "Usage: %s [n] [inner_threads] [reps]"
+                " (each must be >= 1)\n", argv[0]);
+        return EXIT_FAILURE;
+    }
+
+    int it = ceil(sqrt((double)n));
+    n = it * it;
+    time = (double *)malloc(sizeof(double) * rep);
+    v = (float *)malloc(sizeof(float) * n);
+    for (r = 0; r < rep; r++) {
+        /* Reset the data outside the timed section on every repetition:
+         * check() expects exactly one scaling pass, so without this any
+         * rep > 1 would be reported as a failure. */
+        init(v, n);
+        time[r] = omp_get_wtime();
+
+        #pragma omp parallel for
+        for (j = 0; j < it; j++) {
+            omp_set_num_threads(in_th);
+            #pragma omp parallel for
+            for (i = 0; i < it; i++) {
+                v[j * it + i] *= 0.9f;
+            }
+        }
+        time[r] = omp_get_wtime() - time[r];
+        avg_time += time[r];
+    }
+    avg_time /= rep;
+    check(v, n);
+    printf("%d %d %d %f\n", nthreads, in_th, n, avg_time);
+
+    free(time);
+    free(v);
+
+    return EXIT_SUCCESS;
+}
diff --git a/bolt/examples/argobots/parallel_for_abt_task.c b/bolt/examples/argobots/parallel_for_abt_task.c
new file mode 100644
index 0000000000000..9e5ead4e30509
--- /dev/null
+++ b/bolt/examples/argobots/parallel_for_abt_task.c
@@ -0,0 +1,142 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+
+/*
+ * See LICENSE.txt in top-level directory.
+ */
+
+/* parallel_for_abt_task.c code mimics the parallel for OpenMP directive.  It
+ * creates as many ESs as user requires, and tasks are created and assigned by
+ * static blocks to each ES.
+ */
+
+/* NOTE(review): the header names on the #include lines are missing in this
+ * copy of the patch.  The list below is rebuilt from the calls the file makes
+ * (printf, malloc/atoi/strtoll, gettimeofday, ABT_*); confirm against
+ * upstream. */
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <abt.h>
+
+#define NUM_ELEMS 5017600 /* 2240 * 2240 floats (about 20 MB at 4 bytes each) */
+#define NUM_XSTREAMS 4
+#define NUM_REPS 1
+
+ABT_pool *g_pools;
+
+/* One chunk of the scaling loop: ptr[i] *= value for i in [start, end). */
+typedef struct {
+    float *ptr;
+    float value;
+    int start;
+    int end;
+} vector_scal_args_t;
+
+
+/* Work-unit body: scales the chunk described by `arguments` in place. */
+void vector_scal(void *arguments)
+{
+    int i;
+    vector_scal_args_t *arg;
+    arg = (vector_scal_args_t *)arguments;
+    int mystart = arg->start;
+    int myend = arg->end;
+    float value = arg->value;
+    float *ptr = arg->ptr;
+    for (i = mystart; i < myend; i++) {
+        ptr[i] *= value;
+    }
+}
+
+
+/* Usage: prog [num_xstreams] [ntasks]
+ * Prints "<num_xstreams> <ntasks> <seconds>"; returns EXIT_FAILURE (after
+ * printing the offending value) if an element was not scaled by 0.9f. */
+int main(int argc, char *argv[])
+{
+    int i, j;
+    int ntasks;
+    int num_xstreams;
+    int status = EXIT_SUCCESS;
+    ABT_xstream *xstreams;
+    vector_scal_args_t *args;
+    struct timeval t_start, t_end;
+    float *a;
+    ABT_task *tasks;
+    num_xstreams = argc > 1 ? atoi(argv[1]) : NUM_XSTREAMS;
+    ntasks = argc > 2 ? strtoll(argv[2], NULL, 10) : NUM_ELEMS;
+
+    /* num_xstreams is a divisor and an allocation count; ntasks is an
+     * allocation count. */
+    if (num_xstreams < 1 || ntasks < 0) {
+        fprintf(stderr, "Usage: %s [num_xstreams >= 1] [ntasks >= 0]\n",
+                argv[0]);
+        return EXIT_FAILURE;
+    }
+
+    g_pools = (ABT_pool *)malloc(sizeof(ABT_pool) * num_xstreams);
+    tasks = (ABT_task *)malloc(sizeof(ABT_task) * num_xstreams);
+    a = malloc(sizeof(float) * ntasks);
+    for (i = 0; i < ntasks; i++) {
+        a[i] = i * 1.0f;
+    }
+
+    xstreams = (ABT_xstream *)malloc(sizeof(ABT_xstream) * num_xstreams);
+    args = (vector_scal_args_t *)malloc(sizeof(vector_scal_args_t)
+                                        * num_xstreams);
+
+    /* initialization */
+    ABT_init(argc, argv);
+    for (i = 0; i < num_xstreams; i++) {
+        ABT_pool_create_basic(ABT_POOL_FIFO, ABT_POOL_ACCESS_MPMC, ABT_TRUE,
+                              &g_pools[i]);
+    }
+
+    /* ES creation */
+    ABT_xstream_self(&xstreams[0]);
+    ABT_xstream_set_main_sched_basic(xstreams[0], ABT_SCHED_DEFAULT,
+                                     1, &g_pools[0]);
+
+    for (i = 1; i < num_xstreams; i++) {
+        ABT_xstream_create_basic(ABT_SCHED_DEFAULT, 1, &g_pools[i],
+                                 ABT_SCHED_CONFIG_NULL, &xstreams[i]);
+        ABT_xstream_start(xstreams[i]);
+    }
+
+    gettimeofday(&t_start, NULL);
+
+    /* Each task is created on the xstream which is going to execute it */
+
+    int bloc = ntasks / (num_xstreams);
+    int rest = ntasks % (num_xstreams);
+    int start = 0;
+    int end = 0;
+    for (j = 0; j < num_xstreams; j++) {
+        start = end;
+        /* the first `rest` chunks absorb the remainder, one element each */
+        int inc = (j < rest) ? 1 : 0;
+        end += bloc + inc;
+        args[j].start = start;
+        args[j].end = end;
+        args[j].value = 0.9f;
+        args[j].ptr = a;
+        ABT_task_create_on_xstream(xstreams[j], vector_scal,
+                                   (void *)&args[j], &tasks[j]);
+    }
+
+    ABT_thread_yield();
+
+    for (i = 0; i < num_xstreams; i++) {
+        ABT_task_free(&tasks[i]);
+    }
+
+    gettimeofday(&t_end, NULL);
+    double time = (t_end.tv_sec * 1000000 + t_end.tv_usec) -
+                  (t_start.tv_sec * 1000000 + t_start.tv_usec);
+
+    for (i = 1; i < num_xstreams; i++) {
+        ABT_xstream_join(xstreams[i]);
+    }
+
+    for (i = 1; i < num_xstreams; i++) {
+        ABT_xstream_free(&xstreams[i]);
+    }
+    printf("%d %d %f\n", num_xstreams, ntasks, time / 1000000.0);
+
+    ABT_finalize();
+    free(xstreams);
+    for (i = 0; i < ntasks; i++) {
+        if (a[i] != i * 0.9f) {
+            printf("%f\n", a[i]);
+            status = EXIT_FAILURE;
+            break;
+        }
+    }
+
+    /* release everything on both the success and the failure path */
+    free(args);
+    free(a);
+    free(tasks);
+    free(g_pools);
+
+    return status;
+}
diff --git a/bolt/examples/argobots/parallel_for_abt_thread.c b/bolt/examples/argobots/parallel_for_abt_thread.c
new file mode 100644
index 0000000000000..5c4eaae1059c5
--- /dev/null
+++ b/bolt/examples/argobots/parallel_for_abt_thread.c
@@ -0,0 +1,144 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+
+/*
+ * See LICENSE.txt in top-level directory.
+ */
+
+/* parallel_for_abt_thread.c code mimics the parallel for OpenMP directive.
+ * It creates as many ESs as user requires, and tasks are created and assigned
+ * by static blocks to each ES.
+ */
+
+/* NOTE(review): the header names on the #include lines are missing in this
+ * copy of the patch.  The list below is rebuilt from the calls the file makes
+ * (printf, malloc/atoi/strtoll, gettimeofday, ABT_*); confirm against
+ * upstream. */
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <abt.h>
+
+#define NUM_ELEMS 5017600 /* 2240 * 2240 floats (about 20 MB at 4 bytes each) */
+#define NUM_XSTREAMS 4
+#define NUM_REPS 1
+
+ABT_pool *g_pools;
+
+/* One chunk of the scaling loop: ptr[i] *= value for i in [start, end). */
+typedef struct {
+    float *ptr;
+    float value;
+    int start;
+    int end;
+} vector_scal_args_t;
+
+
+/* Work-unit body: scales the chunk described by `arguments` in place. */
+void vector_scal(void *arguments)
+{
+    int i;
+    vector_scal_args_t *arg;
+    arg = (vector_scal_args_t *)arguments;
+    int mystart = arg->start;
+    int myend = arg->end;
+    float value = arg->value;
+    float *ptr = arg->ptr;
+    for (i = mystart; i < myend; i++) {
+        ptr[i] *= value;
+    }
+}
+
+
+/* Usage: prog [num_xstreams] [ntasks]
+ * Prints "<num_xstreams> <ntasks> <seconds>"; returns EXIT_FAILURE (after
+ * printing the offending value) if an element was not scaled by 0.9f. */
+int main(int argc, char *argv[])
+{
+    int i, j;
+    int ntasks;
+    int num_xstreams;
+    int status = EXIT_SUCCESS;
+    ABT_xstream *xstreams;
+    vector_scal_args_t *args;
+    struct timeval t_start, t_end;
+    float *a;
+    ABT_thread *threads;
+    num_xstreams = argc > 1 ? atoi(argv[1]) : NUM_XSTREAMS;
+    ntasks = argc > 2 ? strtoll(argv[2], NULL, 10) : NUM_ELEMS;
+
+    /* num_xstreams is a divisor and an allocation count; ntasks is an
+     * allocation count. */
+    if (num_xstreams < 1 || ntasks < 0) {
+        fprintf(stderr, "Usage: %s [num_xstreams >= 1] [ntasks >= 0]\n",
+                argv[0]);
+        return EXIT_FAILURE;
+    }
+
+    g_pools = (ABT_pool *)malloc(sizeof(ABT_pool) * num_xstreams);
+    threads = (ABT_thread *)malloc(sizeof(ABT_thread) * num_xstreams);
+
+    a = malloc(sizeof(float) * ntasks);
+    for (i = 0; i < ntasks; i++) {
+        a[i] = i * 1.0f;
+    }
+
+    xstreams = (ABT_xstream *)malloc(sizeof(ABT_xstream) * num_xstreams);
+    args = (vector_scal_args_t *)malloc(sizeof(vector_scal_args_t)
+                                        * num_xstreams);
+
+    /* initialization */
+    ABT_init(argc, argv);
+    for (i = 0; i < num_xstreams; i++) {
+        ABT_pool_create_basic(ABT_POOL_FIFO, ABT_POOL_ACCESS_MPMC, ABT_TRUE,
+                              &g_pools[i]);
+    }
+
+    /* ES creation */
+    ABT_xstream_self(&xstreams[0]);
+    ABT_xstream_set_main_sched_basic(xstreams[0], ABT_SCHED_DEFAULT,
+                                     1, &g_pools[0]);
+
+    for (i = 1; i < num_xstreams; i++) {
+        ABT_xstream_create_basic(ABT_SCHED_DEFAULT, 1, &g_pools[i],
+                                 ABT_SCHED_CONFIG_NULL, &xstreams[i]);
+        ABT_xstream_start(xstreams[i]);
+    }
+
+    gettimeofday(&t_start, NULL);
+
+    /* Each task is created on the xstream which is going to execute it */
+
+    int bloc = ntasks / (num_xstreams);
+    int rest = ntasks % (num_xstreams);
+    int start = 0;
+    int end = 0;
+    for (j = 0; j < num_xstreams; j++) {
+        start = end;
+        /* the first `rest` chunks absorb the remainder, one element each */
+        int inc = (j < rest) ? 1 : 0;
+        end += bloc + inc;
+        args[j].start = start;
+        args[j].end = end;
+        args[j].value = 0.9f;
+        args[j].ptr = a;
+        ABT_thread_create_on_xstream(xstreams[j], vector_scal,
+                                     (void *)&args[j], ABT_THREAD_ATTR_NULL,
+                                     &threads[j]);
+    }
+
+    ABT_thread_yield();
+
+    for (i = 0; i < num_xstreams; i++) {
+        ABT_thread_free(&threads[i]);
+    }
+
+    gettimeofday(&t_end, NULL);
+    double time = (t_end.tv_sec * 1000000 + t_end.tv_usec) -
+                  (t_start.tv_sec * 1000000 + t_start.tv_usec);
+
+    for (i = 1; i < num_xstreams; i++) {
+        ABT_xstream_join(xstreams[i]);
+    }
+
+    for (i = 1; i < num_xstreams; i++) {
+        ABT_xstream_free(&xstreams[i]);
+    }
+    printf("%d %d %f\n", num_xstreams, ntasks, time / 1000000.0);
+
+    ABT_finalize();
+    free(xstreams);
+    for (i = 0; i < ntasks; i++) {
+        if (a[i] != i * 0.9f) {
+            printf("%f\n", a[i]);
+            status = EXIT_FAILURE;
+            break;
+        }
+    }
+
+    /* release everything on both the success and the failure path */
+    free(args);
+    free(a);
+    free(threads);
+    free(g_pools);
+
+    return status;
+}
diff --git a/bolt/examples/argobots/parallel_for_omp.c b/bolt/examples/argobots/parallel_for_omp.c
new file mode 100644
index 0000000000000..2b260cf2dae5c
--- /dev/null
+++ b/bolt/examples/argobots/parallel_for_omp.c
@@ -0,0 +1,78 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+
+/*
+ * See LICENSE.txt in top-level directory.
+ */
+
+/* Pragma omp parallel for directive evaluation
+ * Output: avg time
+ */
+
+/* NOTE(review): the header names on the #include lines are missing in this
+ * copy of the patch.  The list below is rebuilt from the calls the file makes
+ * (printf, malloc/atoi, omp_*); confirm against upstream. */
+#include <stdio.h>
+#include <stdlib.h>
+#include <omp.h>
+#define NUM_ELEMS 5017600 /* 2240 * 2240 floats (about 20 MB at 4 bytes each) */
+#define EXT_LOOP_ELEM 2 /* not referenced in this file */
+#define IN_LOOP_ELEM 2 /* not referenced in this file */
+#define IN_LOOP_TH 1 /* not referenced in this file */
+#define NUM_REPS 1
+
+/* Vector initialization */
+void init(float *v, int n)
+{
+    int i = 0;
+    for (i = 0; i < n; i++) {
+        v[i] = i + 100.0f;
+    }
+}
+
+/* Called after each test to be sure that the compiler does
+   not avoid to execute the test.  Expects every element to have been scaled
+   by 0.9f exactly once since init(). */
+void check(float *v, int n)
+{
+    int i = 0;
+    for (i = 0; i < n; i++) {
+        if (v[i] != (i + 100.0f) * 0.9f) {
+            printf("v[%d]<=0.0f\n", i);
+        }
+    }
+}
+
+/* Usage: prog [n] [reps]
+ * Prints "<threads> <n> <avg seconds per rep>". */
+int main(int argc, char *argv[])
+{
+    int i, r, nthreads;
+    double *time, avg_time = 0.0;
+    float *v;
+
+    /* discover the default team size */
+    #pragma omp parallel
+    {
+        #pragma omp master
+        {
+            nthreads = omp_get_num_threads();
+        }
+    }
+    int n = (argc > 1) ? atoi(argv[1]) : NUM_ELEMS;
+    int rep = (argc > 2) ? atoi(argv[2]) : NUM_REPS;
+
+    /* n is an allocation count and rep is also the divisor of the average */
+    if (n < 1 || rep < 1) {
+        fprintf(stderr, "Usage: %s [n] [reps] (each must be >= 1)\n",
+                argv[0]);
+        return EXIT_FAILURE;
+    }
+
+    time = (double *)malloc(sizeof(double) * rep);
+    v = (float *)malloc(sizeof(float) * n);
+    for (r = 0; r < rep; r++) {
+        /* Reset the data outside the timed section on every repetition:
+         * check() expects exactly one scaling pass, so without this any
+         * rep > 1 would be reported as a failure. */
+        init(v, n);
+        time[r] = omp_get_wtime();
+        #pragma omp parallel for
+        for (i = 0; i < n; i++) {
+            v[i] *= 0.9f;
+        }
+        time[r] = omp_get_wtime() - time[r];
+        avg_time += time[r];
+    }
+    avg_time /= rep;
+    check(v, n);
+    printf("%d %d %f\n", nthreads, n, avg_time);
+
+    free(time);
+    free(v);
+
+    return EXIT_SUCCESS;
+}
diff --git a/bolt/examples/argobots/task_multiple_producer_abt_task.c b/bolt/examples/argobots/task_multiple_producer_abt_task.c
new file mode 100644
index 0000000000000..7da53b06d93cb
--- /dev/null
+++ b/bolt/examples/argobots/task_multiple_producer_abt_task.c
@@ -0,0 +1,163 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+
+/*
+ * See LICENSE.txt in top-level directory.
+ */
+
+/* This code creates one task for each argobots xstream and each task creates
+ * a number of tasks. This version uses as many pools as execution streams are
+ * created.
This number of tasks is the division between number of tasks + * required and number of streams. This code mimics the all producer all + * consumers system. + */ + +#include +#include +#include +#include +#include +#include + +#define NUM_TASKS 5000000 +#define NUM_XSTREAMS 4 +#define NUM_REPS 1 + +ABT_pool *g_pools; + +typedef struct { + float *ptr; + float value; + int start; + int end; + int id; +} vector_scal_task_args_t; + +void task_function(void *args) +{ + float *a; + a = (float *)args; + *a = *a * 0.9f; +} + + +void task_creator(void *args) +{ + int i; + vector_scal_task_args_t *arg; + arg = (vector_scal_task_args_t *)args; + for (i = arg->start; i < arg->end; i++) { + ABT_task_create(g_pools[arg->id], task_function, (void *)&arg->ptr[i], + NULL); + } +} + +int main(int argc, char *argv[]) +{ + int i, j; + int ntasks; + int start, end; + int num_xstreams; + ABT_xstream *xstreams; + vector_scal_task_args_t *args; + struct timeval t_start, t_end, t_end2; + char *str, *endptr; + float *a; + + num_xstreams = argc > 1 ? atoi(argv[1]) : NUM_XSTREAMS; + if (argc > 2) { + str = argv[2]; + } + + ntasks = argc > 2 ? 
strtoll(str, &endptr, 10) : NUM_TASKS; + if (ntasks < num_xstreams) { + ntasks = num_xstreams; + } + + printf("# of ESs: %d\n", num_xstreams); + + xstreams = (ABT_xstream *)malloc(sizeof(ABT_xstream) * num_xstreams); + args = (vector_scal_task_args_t *)malloc(sizeof(vector_scal_task_args_t) + * num_xstreams); + g_pools = (ABT_pool *)malloc(sizeof(ABT_pool) * num_xstreams); + + a = malloc(sizeof(float) * ntasks); + + for (i = 0; i < ntasks; i++) { + a[i] = i + 100.0f; + } + + /* initialization */ + ABT_init(argc, argv); + + for (i = 0; i < num_xstreams; i++) { + ABT_pool_create_basic(ABT_POOL_FIFO, ABT_POOL_ACCESS_MPMC, ABT_TRUE, + &g_pools[i]); + } + + /* ES creation */ + ABT_xstream_self(&xstreams[0]); + ABT_xstream_set_main_sched_basic(xstreams[0], ABT_SCHED_DEFAULT, + 1, &g_pools[0]); + + for (i = 1; i < num_xstreams; i++) { + ABT_xstream_create_basic(ABT_SCHED_DEFAULT, 1, &g_pools[i], + ABT_SCHED_CONFIG_NULL, &xstreams[i]); + ABT_xstream_start(xstreams[i]); + } + + /* Work here */ + start = end = 0; + int bloc = ntasks / num_xstreams; + int rest = ntasks % num_xstreams; + gettimeofday(&t_start, NULL); + for (j = 0; j < num_xstreams; j++) { + start = end; + end = start + bloc; + if (j < rest) { + end++; + } + args[j].ptr = a; + args[j].value = 0.9f; + args[j].start = start; + args[j].end = end; + args[j].id = j; + ABT_task_create_on_xstream(xstreams[j], task_creator, + (void *)&args[j], NULL); + } + gettimeofday(&t_end2, NULL); + + for (i = 0; i < num_xstreams; i++) { + size_t size; + do { + ABT_thread_yield(); + ABT_pool_get_size(g_pools[i], &size); + } while (size != 0); + } + + gettimeofday(&t_end, NULL); + + for (i = 0; i < ntasks; i++) { + if (a[i] != (i + 100.0f) * 0.9f) { + printf("error: a[%d]\n", i); + } + } + + double time = (t_end.tv_sec * 1000000 + t_end.tv_usec) + - (t_start.tv_sec * 1000000 + t_start.tv_usec); + double time2 = (t_end2.tv_sec * 1000000 + t_end2.tv_usec) + - (t_start.tv_sec * 1000000 + t_start.tv_usec); + + printf("nxstreams: 
%d\nntasks %d\nTime(s): %f\n", + num_xstreams, ntasks, time / 1000000.0); + /* join ESs */ + for (i = 1; i < num_xstreams; i++) { + ABT_xstream_join(xstreams[i]); + ABT_xstream_free(&xstreams[i]); + } + printf("Creation time=%f\n", time2 / 1000000.0); + ABT_finalize(); + + free(xstreams); + + return EXIT_SUCCESS; +} diff --git a/bolt/examples/argobots/task_multiple_producer_abt_thread.c b/bolt/examples/argobots/task_multiple_producer_abt_thread.c new file mode 100644 index 0000000000000..f3a380d436209 --- /dev/null +++ b/bolt/examples/argobots/task_multiple_producer_abt_thread.c @@ -0,0 +1,163 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */ + +/* + * See LICENSE.txt in top-level directory. + */ + +/* This code creates one task for each argobots xstream and each task creates + * a number of tasks. This version uses as many pools as execution streams are + * created. This number of tasks is the division between number of tasks + * required and number of streams. This code mimics the all producer all + * consumers system. + */ + +#include +#include +#include +#include +#include +#include + +#define NUM_TASKS 5000000 +#define NUM_XSTREAMS 4 +#define NUM_REPS 1 + +ABT_pool *g_pools; + +typedef struct { + float *ptr; + float value; + int start; + int end; + int id; +} vector_scal_task_args_t; + +void task_function(void *args) +{ + float *a; + a = (float *)args; + *a = *a * 0.9f; +} + +void task_creator(void *args) +{ + int i; + vector_scal_task_args_t *arg; + arg = (vector_scal_task_args_t *)args; + for (i = arg->start; i < arg->end; i++) { + ABT_thread_create(g_pools[arg->id], task_function, (void *)&arg->ptr[i], + ABT_THREAD_ATTR_NULL, NULL); + } +} + +int main(int argc, char *argv[]) +{ + int i, j; + int ntasks; + int start, end; + int num_xstreams; + ABT_xstream *xstreams; + vector_scal_task_args_t *args; + struct timeval t_start, t_end, t_end2; + char *str, *endptr; + float *a; + + num_xstreams = argc > 1 ? 
atoi(argv[1]) : NUM_XSTREAMS; + if (argc > 2) { + str = argv[2]; + } + + ntasks = argc > 2 ? strtoll(str, &endptr, 10) : NUM_TASKS; + if (ntasks < num_xstreams) { + ntasks = num_xstreams; + } + + printf("# of ESs: %d\n", num_xstreams); + + xstreams = (ABT_xstream *)malloc(sizeof(ABT_xstream) * num_xstreams); + args = (vector_scal_task_args_t *)malloc(sizeof(vector_scal_task_args_t) + * num_xstreams); + g_pools = (ABT_pool *)malloc(sizeof(ABT_pool) * num_xstreams); + + a = malloc(sizeof(float) * ntasks); + + for (i = 0; i < ntasks; i++) { + a[i] = i + 100.0f; + } + + /* initialization */ + ABT_init(argc, argv); + + for (i = 0; i < num_xstreams; i++) { + ABT_pool_create_basic(ABT_POOL_FIFO, ABT_POOL_ACCESS_MPMC, ABT_TRUE, + &g_pools[i]); + } + + /* ES creation */ + ABT_xstream_self(&xstreams[0]); + ABT_xstream_set_main_sched_basic(xstreams[0], ABT_SCHED_DEFAULT, + 1, &g_pools[0]); + + for (i = 1; i < num_xstreams; i++) { + ABT_xstream_create_basic(ABT_SCHED_DEFAULT, 1, &g_pools[i], + ABT_SCHED_CONFIG_NULL, &xstreams[i]); + ABT_xstream_start(xstreams[i]); + } + + /* Work here */ + start = end = 0; + int bloc = ntasks / num_xstreams; + int rest = ntasks % num_xstreams; + gettimeofday(&t_start, NULL); + for (j = 0; j < num_xstreams; j++) { + start = end; + end = start + bloc; + if (j < rest) { + end++; + } + args[j].ptr = a; + args[j].value = 0.9f; + args[j].start = start; + args[j].end = end; + args[j].id = j; + ABT_thread_create_on_xstream(xstreams[j], task_creator, + (void *)&args[j], ABT_THREAD_ATTR_NULL, + NULL); + } + gettimeofday(&t_end2, NULL); + + for (i = 0; i < num_xstreams; i++) { + size_t size; + do { + ABT_thread_yield(); + ABT_pool_get_size(g_pools[i], &size); + } while (size != 0); + } + + gettimeofday(&t_end, NULL); + + for (i = 0; i < ntasks; i++) { + if (a[i] != (i + 100.0f) * 0.9f) { + printf("error: a[%d]\n", i); + } + } + + double time = (t_end.tv_sec * 1000000 + t_end.tv_usec) + - (t_start.tv_sec * 1000000 + t_start.tv_usec); + double time2 = 
(t_end2.tv_sec * 1000000 + t_end2.tv_usec) + - (t_start.tv_sec * 1000000 + t_start.tv_usec); + + printf("nxstreams: %d\nntasks %d\nTime(s): %f\n", + num_xstreams, ntasks, time / 1000000.0); + /* join ESs */ + for (i = 1; i < num_xstreams; i++) { + ABT_xstream_join(xstreams[i]); + ABT_xstream_free(&xstreams[i]); + } + printf("Creation time=%f\n", time2 / 1000000.0); + ABT_finalize(); + + free(xstreams); + + return EXIT_SUCCESS; +} diff --git a/bolt/examples/argobots/task_multiple_producer_omp.c b/bolt/examples/argobots/task_multiple_producer_omp.c new file mode 100644 index 0000000000000..ef64c4109cba4 --- /dev/null +++ b/bolt/examples/argobots/task_multiple_producer_omp.c @@ -0,0 +1,90 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */ + +/* + * See LICENSE.txt in top-level directory. + */ + +#include +#include +#include +#include +#include + +#define NUM_TASKS 5000000 +#define NUM_REPS 1 +#define USLEEP usleep(100); + +/* Pragma omp task directive evaluation + * Output: avg time + */ + +void sscal(float value, float *a) +{ + *a = *a * value; +} + + +int main(int argc, char *argv[]) +{ + int i, r, nthreads; + double *time, avg_time = 0.0; + char *str, *endptr; + float *a; + + #pragma omp parallel + { + #pragma omp master + { + nthreads = omp_get_num_threads(); + } + } + + if (argc > 1) { + str = argv[1]; + } + + int ntasks = argc > 1 ? strtoll(str, &endptr, 10) : NUM_TASKS; + if (ntasks < nthreads) { + ntasks = nthreads; + } + + int rep = (argc > 2) ? 
atoi(argv[2]) : NUM_REPS; + + time = malloc(sizeof(double) * (rep + 1)); + + a = malloc(sizeof(float) * ntasks); + + for (i = 0; i < ntasks; i++) { + a[i] = i + 100.0f; + } + + for (r = 0; r < rep; r++) { + time[r] = omp_get_wtime(); + #pragma omp parallel + { + time[1] = omp_get_wtime(); + #pragma omp for + for (i = 0; i < ntasks; i++) { + #pragma omp task firstprivate(i) + { + sscal(0.9f, &a[i]); + } + } + time[1] = (omp_get_wtime() - time[1]); + } + time[r] = omp_get_wtime() - time[r]; + avg_time += time[r]; + } + for (i = 0; i < ntasks; i++) { + if (a[i] != (i + 100.0f) * 0.9f) { + printf("error: a[%d]=%.2f expected %.2f\n", i, + a[i], (i + 100.0f) * 0.9f); + } + } + + avg_time /= rep; + printf("nthreads: %d\nntasks: %d\nTime(s):%f\nCreation Time: %f\n", + nthreads, ntasks, avg_time, time[1]); + + return EXIT_SUCCESS; +} diff --git a/bolt/examples/argobots/task_nested_abt_task.c b/bolt/examples/argobots/task_nested_abt_task.c new file mode 100644 index 0000000000000..694fb5a67085a --- /dev/null +++ b/bolt/examples/argobots/task_nested_abt_task.c @@ -0,0 +1,137 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */ + +/* + * See LICENSE.txt in top-level directory. + */ + +/* This code creates all tasks from the main ES but using as many pools as + * xstreams and they are executed by all the xstreams. This code mimics + * the 1 producers all consumers system. 
+*/ + +#include +#include +#include +#include +#include +#include +#include + +#define NUM_TASKS 50000 +#define NUM_XSTREAMS 4 +#define NUM_REPS 1 + +ABT_pool *g_pools; +int num_pools; +int num_xstreams; +int pool_for_task = 0; +int o = 0; + +void vector_scal(void *arguments) +{ + float *a; + a = (float *)arguments; + *a = *a * 0.9f; +} + +void na(void *arguments) +{ + o++; +} + +void prevector_scal(void *arguments) +{ + ABT_task_create(g_pools[pool_for_task % num_pools], vector_scal, + arguments, NULL); + ABT_task_create(g_pools[pool_for_task % num_pools], na, arguments, NULL); + pool_for_task++; +} + + +int main(int argc, char *argv[]) +{ + int i, j; + int ntasks; + ABT_xstream *xstreams; + ABT_task *tasks; + struct timeval start, end, end2; + char *str, *endptr; + float *a; + + num_xstreams = argc > 1 ? atoi(argv[1]) : NUM_XSTREAMS; + if (argc > 2) { + str = argv[2]; + } + ntasks = argc > 2 ? strtoll(str, &endptr, 10) : NUM_TASKS; + if (ntasks < num_xstreams) { + ntasks = num_xstreams; + } + num_pools = argc > 3 ? 
atoi(argv[3]) : num_xstreams; + printf("# of ESs: %d Pools: %d\n", num_xstreams, num_pools); + + a = malloc(sizeof(float) * ntasks); + + for (i = 0; i < ntasks; i++) { + a[i] = i + 100.0f; + } + + xstreams = (ABT_xstream *)malloc(sizeof(ABT_xstream) * num_xstreams); + tasks = (ABT_task *)malloc(sizeof(ABT_task) * num_xstreams); + g_pools = (ABT_pool *)malloc(sizeof(ABT_pool) * num_pools); + + /* initialization */ + ABT_init(argc, argv); + + /* shared pool creation */ + for (i = 0; i < num_pools; i++) { + ABT_pool_create_basic(ABT_POOL_FIFO, ABT_POOL_ACCESS_MPMC, ABT_TRUE, + &g_pools[i]); + } + /* ES creation */ + ABT_xstream_self(&xstreams[0]); + ABT_xstream_set_main_sched_basic(xstreams[0], ABT_SCHED_DEFAULT, + 1, &g_pools[0]); + for (i = 1; i < num_xstreams; i++) { + ABT_xstream_create_basic(ABT_SCHED_DEFAULT, 1, &g_pools[i % num_pools], + ABT_SCHED_CONFIG_NULL, &xstreams[i]); + ABT_xstream_start(xstreams[i]); + } + /* Work here */ + gettimeofday(&start, NULL); + for (j = 0; j < ntasks; j++) { + ABT_task_create_on_xstream(xstreams[j % num_xstreams], prevector_scal, + (void *)&a[j], NULL); + } + + gettimeofday(&end2, NULL); + ABT_thread_yield(); + for (i = 1; i < num_xstreams; i++) { + size_t size; + while (1) { + ABT_pool_get_size(g_pools[i], &size); + if (size == 0) break; + ABT_thread_yield(); + } + } + + gettimeofday(&end, NULL); + double time = (end.tv_sec * 1000000 + end.tv_usec) + - (start.tv_sec * 1000000 + start.tv_usec); + double time2 = (end2.tv_sec * 1000000 + end2.tv_usec) + - (start.tv_sec * 1000000 + start.tv_usec); + + printf("nxstreams: %d\nntasks %d\nTotal Time(s): %f\n Creation Time (s): %f\n", + num_xstreams, ntasks, time / 1000000.0, time2 / 1000000.0); + printf("o=%d and it should be %d\n", o, ntasks); + + /* join ESs */ + for (i = 1; i < num_xstreams; i++) { + ABT_xstream_join(xstreams[i]); + ABT_xstream_free(&xstreams[i]); + } + + free(tasks); + free(xstreams); + + return EXIT_SUCCESS; +} diff --git 
a/bolt/examples/argobots/task_nested_lvl2_abt_task.c b/bolt/examples/argobots/task_nested_lvl2_abt_task.c new file mode 100644 index 0000000000000..91651eaa1cecf --- /dev/null +++ b/bolt/examples/argobots/task_nested_lvl2_abt_task.c @@ -0,0 +1,154 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */ + +/* + * See LICENSE.txt in top-level directory. + */ + +/* This code creates all tasks from the main ES but using as many pools as + * xstreams and they are executed by all the xstreams. This code mimics the 1 + * producers all consumers system. +*/ + + +#include +#include +#include +#include +#include +#include + +#define NUM_TASKS 50000 +#define NUM_XSTREAMS 4 +#define NUM_REPS 1 +#define LEVELS 2 + +int num_pools; +int num_xstreams; +int o = 0; +ABT_pool *g_pools; +int lvl; + +void vector_scal(void *arguments) +{ + float *a; + a = (float *)arguments; + *a = *a * 0.9f; +} + +void na(void *arguments) +{ + o++; +} + +void prevector_scal2(void *arguments) +{ + int rank; + ABT_xstream_self_rank(&rank); + ABT_task_create(g_pools[rank], vector_scal, arguments, NULL); + ABT_task_create(g_pools[rank], na, arguments, NULL); +} + +void prevector_scal(void *arguments) +{ + int rank; + ABT_xstream_self_rank(&rank); + ABT_task_create(g_pools[rank], prevector_scal2, arguments, NULL); + ABT_task_create(g_pools[rank], prevector_scal2, arguments, NULL); +} + + +int main(int argc, char *argv[]) +{ + int i, j; + int ntasks; + ABT_xstream *xstreams; + ABT_task *tasks; + struct timeval start, end, end2; + char *str, *endptr; + float *a; + + num_xstreams = argc > 1 ? atoi(argv[1]) : NUM_XSTREAMS; + if (argc > 2) { + str = argv[2]; + } + ntasks = argc > 2 ? strtoll(str, &endptr, 10) : NUM_TASKS; + if (ntasks < num_xstreams) { + ntasks = num_xstreams; + } + num_pools = argc > 3 ? atoi(argv[3]) : num_xstreams; + lvl = (argc > 4) ? 
atoi(argv[4]) : LEVELS; + + a = malloc(sizeof(float) * ntasks); + + for (i = 0; i < ntasks; i++) { + a[i] = i + 100.0f; + } + + xstreams = (ABT_xstream *)malloc(sizeof(ABT_xstream) * num_xstreams); + tasks = (ABT_task *)malloc(sizeof(ABT_task) * num_xstreams); + g_pools = (ABT_pool *)malloc(sizeof(ABT_pool) * num_pools); + + /* initialization */ + ABT_init(argc, argv); + + /* shared pool creation */ + for (i = 0; i < num_pools; i++) { + ABT_pool_create_basic(ABT_POOL_FIFO, ABT_POOL_ACCESS_MPMC, ABT_TRUE, + &g_pools[i]); + } + /* ES creation */ + ABT_xstream_self(&xstreams[0]); + ABT_xstream_set_main_sched_basic(xstreams[0], ABT_SCHED_DEFAULT, + 1, &g_pools[0]); + + for (i = 1; i < num_xstreams; i++) { + ABT_xstream_create_basic(ABT_SCHED_DEFAULT, 1, &g_pools[i % num_pools], + ABT_SCHED_CONFIG_NULL, &xstreams[i]); + ABT_xstream_start(xstreams[i]); + } + /* Work here */ + gettimeofday(&start, NULL); + for (j = 0; j < ntasks; j++) { + ABT_task_create_on_xstream(xstreams[j % num_xstreams], prevector_scal, + (void *)&a[j], NULL); + } + + gettimeofday(&end2, NULL); + ABT_thread_yield(); + for (i = 1; i < num_xstreams; i++) { + size_t size; + do { + ABT_pool_get_size(g_pools[i], &size); + } while (size != 0); + + } + + gettimeofday(&end, NULL); + double time = (end.tv_sec * 1000000 + end.tv_usec) + - (start.tv_sec * 1000000 + start.tv_usec); + double time2 = (end2.tv_sec * 1000000 + end2.tv_usec) + - (start.tv_sec * 1000000 + start.tv_usec); + + printf("nxstreams: %d\nntasks %d\nTime(s): %f\n", + num_xstreams, ntasks, time / 1000000.0); + printf("o=%d ans it should be %d\n", o, ntasks); + printf("Creation time= %f\n", time2 / 1000000.0); + + /* join ESs */ + for (i = 1; i < num_xstreams; i++) { + ABT_xstream_join(xstreams[i]); + ABT_xstream_free(&xstreams[i]); + } + //TODO: it only works for 1 rep + for (i = 0; i < ntasks; i++) { + if (a[i] != (i + 100.0f) * 0.9f) { + printf("error: a[%d]\n", i); + } + } + ABT_finalize(); + + free(tasks); + free(xstreams); + + return 
EXIT_SUCCESS; +} diff --git a/bolt/examples/argobots/task_nested_lvl2_omp.c b/bolt/examples/argobots/task_nested_lvl2_omp.c new file mode 100644 index 0000000000000..8d9b1dcfe9dcc --- /dev/null +++ b/bolt/examples/argobots/task_nested_lvl2_omp.c @@ -0,0 +1,121 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */ + +/* + * See LICENSE.txt in top-level directory. + */ + +#include +#include +#include +#include +#include + +#define NUM_TASKS 50000 +#define NUM_REPS 1 +#define LEVELS 2 + +int o = 0; +void sscal(float value, float *a) +{ + *a = *a * value; +} + +void na(float value) +{ + o++; +} + +void presscal(float value, float *a, int lvl, int i) +{ + if (lvl > 1) { + lvl--; + #pragma omp task + { + presscal(value, a, lvl, i); + } + #pragma omp task + { + presscal(value, a, lvl, i); + } + } + else { + #pragma omp task + { + sscal(value, a); + } + + #pragma omp task + { + na(value); + } + } +} + +int main(int argc, char *argv[]) +{ + int i, r, nthreads; + double *time, avg_time = 0.0; + char *str, *endptr; + float *a; + double time2 = 0.0; + + #pragma omp parallel + { + #pragma omp master + { + nthreads = omp_get_num_threads(); + } + } + + if (argc > 1) { + str = argv[1]; + } + + int ntasks = argc > 1 ? strtoll(str, &endptr, 10) : NUM_TASKS; + + int lvl = (argc > 2) ? atoi(argv[2]) : LEVELS; + + int rep = (argc > 3) ? 
atoi(argv[3]) : NUM_REPS; + + time = malloc(sizeof(double) * rep); + a = malloc(sizeof(float) * ntasks); + + for (i = 0; i < ntasks; i++) { + a[i] = i + 100.0f; + } + + for (r = 0; r < rep; r++) { + time[r] = omp_get_wtime(); + #pragma omp parallel + { + #pragma omp single + { + time2 = omp_get_wtime(); + for (i = 0; i < ntasks; i++) { + #pragma omp task firstprivate(i) + { + presscal(0.9f, &a[i], lvl, i); + } + } + time2 = omp_get_wtime() - time2; + } + } + time[r] = omp_get_wtime() - time[r]; + avg_time += time[r]; + + } + + // TODO: Just works with one repetition + for (i = 0; i < ntasks; i++) { + if (a[i] != (i + 100.0f) * 0.9f) { + printf("error: a[%d]=%2.f expected %2.f\n", i, + a[i], (i + 100.0f) * 0.9f); + } + } + avg_time /= rep; + printf("nthreads: %d\nntasks: %d\nTime(s):%f\nCreation Time: %f\n", + nthreads, ntasks, avg_time, time2); + printf("o=%d deberia valer %d\n", o, ntasks); + + return EXIT_SUCCESS; +} diff --git a/bolt/examples/argobots/task_nested_omp.c b/bolt/examples/argobots/task_nested_omp.c new file mode 100644 index 0000000000000..0a88b11ddf7ad --- /dev/null +++ b/bolt/examples/argobots/task_nested_omp.c @@ -0,0 +1,108 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */ + +/* + * See LICENSE.txt in top-level directory. + */ + +#include +#include +#include +#include +#include + +#define NUM_TASKS 50000 +#define NUM_REPS 1 + +int o = 0; + +void sscal(float value, float *a) +{ + *a = *a * value; +} + +void na(float value) +{ + o++; +} + +void presscal(float value, float *a) +{ + #pragma omp task + { + sscal(value, a); + } + + #pragma omp task + { + na(value); + } +} + +int main(int argc, char *argv[]) +{ + int i, r, nthreads; + double *time, avg_time = 0.0; + char *str, *endptr; + float *a; + double time2 = 0.0; + + #pragma omp parallel + { + #pragma omp master + { + nthreads = omp_get_num_threads(); + } + } + + if (argc > 1) { + str = argv[1]; + } + + int ntasks = argc > 1 ? 
strtoll(str, &endptr, 10) : NUM_TASKS; + if (ntasks < nthreads) + ntasks = nthreads; + + int rep = (argc > 2) ? atoi(argv[2]) : NUM_REPS; + + time = malloc(sizeof(double) * rep); + a = malloc(sizeof(float) * ntasks); + + + for (i = 0; i < ntasks; i++) { + a[i] = i + 100.0f; + } + + for (r = 0; r < rep; r++) { + time[r] = omp_get_wtime(); + #pragma omp parallel + { + #pragma omp single + { + time2 = omp_get_wtime(); + for (i = 0; i < ntasks; i++) { + #pragma omp task firstprivate(i) + { + presscal(0.9f, &a[i]); + } + } + time2 = omp_get_wtime() - time2; + } + } + time[r] = omp_get_wtime() - time[r]; + avg_time += time[r]; + } + + // TODO: Just works with one repetition + for (i = 0; i < ntasks; i++) { + if (a[i] != (i + 100.0f) * 0.9f) { + printf("error: a[%d]=%2.f expected %2.f\n", i, + a[i], (i + 100.0f) * 0.9f); + } + } + avg_time /= rep; + printf("nthreads: %d\nntasks: %d\nTime(s):%f\nCreation Time: %f\n", + nthreads, ntasks, avg_time, time2); + printf("o=%d deberia valer %d\n", o, ntasks); + + return EXIT_SUCCESS; +} diff --git a/bolt/examples/argobots/task_single_producer_abt_task.c b/bolt/examples/argobots/task_single_producer_abt_task.c new file mode 100644 index 0000000000000..f326108c51368 --- /dev/null +++ b/bolt/examples/argobots/task_single_producer_abt_task.c @@ -0,0 +1,119 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */ + +/* + * See LICENSE.txt in top-level directory. + */ + +/* This code creates all tasks from the main ES but using as many pools as + * xstreams and they are executed by all the xstreams. 
It mimics one producer + * all consumers system + */ + +#include +#include +#include +#include +#include +#include + +#define NUM_TASKS 5000000 +#define NUM_XSTREAMS 4 +#define NUM_REPS 1 + +void vector_scal(void *arguments) +{ + float *a; + a = (float *)arguments; + *a = *a * 0.9f; +} + +int main(int argc, char *argv[]) +{ + int i, j; + int ntasks; + int num_xstreams; + int num_pools; + ABT_xstream *xstreams; + ABT_task *tasks; + ABT_pool *pools; + struct timeval start, end; + char *str, *endptr; + float *a; + + num_xstreams = argc > 1 ? atoi(argv[1]) : NUM_XSTREAMS; + if (argc > 2) { + str = argv[2]; + } + ntasks = argc > 2 ? strtoll(str, &endptr, 10) : NUM_TASKS; + if (ntasks < num_xstreams) { + ntasks = num_xstreams; + } + num_pools = argc > 5 ? atoi(argv[5]) : num_xstreams; + + a = malloc(sizeof(float) * ntasks); + + for (i = 0; i < ntasks; i++) { + a[i] = i + 100.0f; + } + + xstreams = (ABT_xstream *)malloc(sizeof(ABT_xstream) * num_xstreams); + tasks = (ABT_task *)malloc(sizeof(ABT_task) * num_xstreams); + pools = (ABT_pool *)malloc(sizeof(ABT_pool) * num_pools); + + /* initialization */ + ABT_init(argc, argv); + + /* shared pool creation */ + for (i = 0; i < num_pools; i++) { + ABT_pool_create_basic(ABT_POOL_FIFO, ABT_POOL_ACCESS_MPMC, ABT_TRUE, + &pools[i]); + } + /* ES creation */ + ABT_xstream_self(&xstreams[0]); + ABT_xstream_set_main_sched_basic(xstreams[0], ABT_SCHED_DEFAULT, + 1, &pools[0]); + for (i = 1; i < num_xstreams; i++) { + ABT_xstream_create_basic(ABT_SCHED_DEFAULT, 1, &pools[i % num_pools], + ABT_SCHED_CONFIG_NULL, &xstreams[i]); + ABT_xstream_start(xstreams[i]); + } + /* Work here */ + gettimeofday(&start, NULL); + for (j = 0; j < ntasks; j++) { + ABT_task_create(pools[j % num_pools], vector_scal, (void *)&a[j], NULL); + } + + gettimeofday(&end, NULL); + double time2 = (end.tv_sec * 1000000 + end.tv_usec) + - (start.tv_sec * 1000000 + start.tv_usec); + ABT_thread_yield(); + for (i = 0; i < num_pools; i++) { + size_t size; + do { + 
ABT_pool_get_size(pools[i], &size); + } while (size != 0); + } + + gettimeofday(&end, NULL); + double time = (end.tv_sec * 1000000 + end.tv_usec) + - (start.tv_sec * 1000000 + start.tv_usec); + printf("nxstreams: %d\nntasks %d\nTime(s): %f Creation Time(s): %f\n", + num_xstreams, ntasks, time / 1000000.0, time2 / 1000000.0); + + for (i = 0; i < ntasks; i++) { + if (a[i] != (i + 100.0f) * 0.9f) { + printf("error: a[%d]\n", i); + } + } + + /* join ESs */ + for (i = 1; i < num_xstreams; i++) { + ABT_xstream_join(xstreams[i]); + ABT_xstream_free(&xstreams[i]); + } + + free(tasks); + free(xstreams); + + return EXIT_SUCCESS; +} diff --git a/bolt/examples/argobots/task_single_producer_abt_thread.c b/bolt/examples/argobots/task_single_producer_abt_thread.c new file mode 100644 index 0000000000000..e2d44dc652ad5 --- /dev/null +++ b/bolt/examples/argobots/task_single_producer_abt_thread.c @@ -0,0 +1,118 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */ + +/* + * See LICENSE.txt in top-level directory. + */ + +/* This code creates all tasks from the main ES but using as many pools as + * xstreams and they are executed by all the xstreams. It mimics one producer + * all consumers system + */ + +#include +#include +#include +#include +#include +#include + +#define NUM_TASKS 5000000 +#define NUM_XSTREAMS 4 +#define NUM_REPS 1 + +void vector_scal(void *arguments) +{ + float *a; + a = (float *)arguments; + *a = *a * 0.9f; +} + +int main(int argc, char *argv[]) +{ + int i, j; + int ntasks; + int num_xstreams; + int num_pools; + ABT_xstream *xstreams; + ABT_task *tasks; + ABT_pool *pools; + struct timeval start, end; + char *str, *endptr; + float *a; + + num_xstreams = argc > 1 ? atoi(argv[1]) : NUM_XSTREAMS; + if (argc > 2) { + str = argv[2]; + } + ntasks = argc > 2 ? strtoll(str, &endptr, 10) : NUM_TASKS; + if (ntasks < num_xstreams) { + ntasks = num_xstreams; + } + num_pools = argc > 5 ? 
atoi(argv[5]) : num_xstreams; + printf("# of ESs: %d Pools: %d\n", num_xstreams, num_pools); + + a = malloc(sizeof(float) * ntasks); + + for (i = 0; i < ntasks; i++) { + a[i] = i + 100.0f; + } + + xstreams = (ABT_xstream *)malloc(sizeof(ABT_xstream) * num_xstreams); + tasks = (ABT_task *)malloc(sizeof(ABT_task) * num_xstreams); + pools = (ABT_pool *)malloc(sizeof(ABT_pool) * num_pools); + + /* initialization */ + ABT_init(argc, argv); + + /* shared pool creation */ + for (i = 0; i < num_pools; i++) { + ABT_pool_create_basic(ABT_POOL_FIFO, ABT_POOL_ACCESS_MPMC, ABT_TRUE, + &pools[i]); + } + /* ES creation */ + ABT_xstream_self(&xstreams[0]); + ABT_xstream_set_main_sched_basic(xstreams[0], ABT_SCHED_DEFAULT, + 1, &pools[0]); + for (i = 1; i < num_xstreams; i++) { + ABT_xstream_create_basic(ABT_SCHED_DEFAULT, 1, &pools[i % num_pools], + ABT_SCHED_CONFIG_NULL, &xstreams[i]); + ABT_xstream_start(xstreams[i]); + } + /* Work here */ + gettimeofday(&start, NULL); + for (j = 0; j < ntasks; j++) { + ABT_thread_create(pools[j % num_pools], vector_scal, + (void *)&a[j], ABT_THREAD_ATTR_NULL, NULL); + } + + ABT_thread_yield(); + for (i = 0; i < num_pools; i++) { + size_t size; + do { + ABT_pool_get_size(pools[i], &size); + } while (size != 0); + } + + gettimeofday(&end, NULL); + double time = (end.tv_sec * 1000000 + end.tv_usec) + - (start.tv_sec * 1000000 + start.tv_usec); + printf("nxstreams: %d\nntasks %d\nTime(s): %f\n", + num_xstreams, ntasks, time / 1000000.0); + + for (i = 0; i < ntasks; i++) { + if (a[i] != (i + 100.0f) * 0.9f) { + printf("error: a[%d]\n", i); + } + } + + /* join ESs */ + for (i = 1; i < num_xstreams; i++) { + ABT_xstream_join(xstreams[i]); + ABT_xstream_free(&xstreams[i]); + } + + free(tasks); + free(xstreams); + + return EXIT_SUCCESS; +} diff --git a/bolt/examples/argobots/task_single_producer_omp.c b/bolt/examples/argobots/task_single_producer_omp.c new file mode 100644 index 0000000000000..7a64a75c480ad --- /dev/null +++ 
b/bolt/examples/argobots/task_single_producer_omp.c @@ -0,0 +1,90 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */ + +/* + * See LICENSE.txt in top-level directory. + */ + +#include +#include +#include +#include +#include + +#define NUM_TASKS 5000000 +#define NUM_REPS 1 + +void sscal(float value, float *a) +{ + *a = *a * value; +} + +int main(int argc, char *argv[]) +{ + int i, r, nthreads; + double *time, avg_time = 0.0; + char *str, *endptr; + float *a; + double time2 = 0.0; + + #pragma omp parallel + { + #pragma omp master + { + nthreads = omp_get_num_threads(); + } + } + + if (argc > 1) { + str = argv[1]; + } + + int ntasks = argc > 1 ? strtoll(str, &endptr, 10) : NUM_TASKS; + if (ntasks < nthreads) + ntasks = nthreads; + + int rep = (argc > 2) ? atoi(argv[2]) : NUM_REPS; + time = malloc(sizeof(double) * rep); + a = malloc(sizeof(float) * ntasks); + + for (i = 0; i < ntasks; i++) { + a[i] = i + 100.0f; + } + + for (r = 0; r < rep; r++) { + time[r] = omp_get_wtime(); + #pragma omp parallel + { + #pragma omp single + { + sleep(2); + printf("Thread %d\n", omp_get_thread_num()); + time2 = omp_get_wtime(); + for (i = 0; i < ntasks; i++) { + #pragma omp task firstprivate(i) + { + printf("Task %d executed by Thread %d Stolen? %s\n", + i, omp_get_thread_num(), + (i % nthreads == omp_get_thread_num()) + ? 
"NO" : "YES"); + sscal(0.9f, &a[i]); + } + } + time2 = omp_get_wtime() - time2; + } + } + time[r] = omp_get_wtime() - time[r]; + avg_time += time[r]; + + } + for (i = 0; i < ntasks; i++) { + if (a[i] != (i + 100.0f) * 0.9f) { + printf("error: a[%d]=%2.f expected %2.f\n", i, + a[i], (i + 100.0f) * 0.9f); + } + } + avg_time /= rep; + printf("nthreads: %d\nntasks: %d\nTime(s):%f\nCreation Time: %f\n", + nthreads, ntasks, avg_time, time2); + + return EXIT_SUCCESS; +} diff --git a/bolt/examples/argobots/taskwait_omp.c b/bolt/examples/argobots/taskwait_omp.c new file mode 100644 index 0000000000000..9863839da190e --- /dev/null +++ b/bolt/examples/argobots/taskwait_omp.c @@ -0,0 +1,117 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */ + +/* + * See LICENSE.txt in top-level directory. + */ + +/* + * A bunch of n tasks (1st arg) are created by a single thread. + * Each task creates two tasks more and executes a taskwait directive + */ + +#include +#include +#include +#include +#include + +#define NUM_TASKS 50000 +#define NUM_REPS 1 + +int o = 0; +int pp = 0; + +void na(float value) +{ + o++; +} + +void sscal(float value, float *a) +{ + *a = *a * value; +} + +void presscal(float value, float *a) +{ + #pragma omp task + { + sscal(value, a); + } + + #pragma omp task + { + na(value); + } + + #pragma omp taskwait +} + +int main(int argc, char *argv[]) +{ + int i, r, nthreads; + double *time, avg_time = 0.0; + char *str, *endptr; + float *a; + double time2 = 0.0; + + #pragma omp parallel + { + #pragma omp master + { + nthreads = omp_get_num_threads(); + } + } + + if (argc > 1) { + str = argv[1]; + } + + int ntasks = argc > 1 ? strtoll(str, &endptr, 10) : NUM_TASKS; + if (ntasks < nthreads) + ntasks = nthreads; + + int rep = (argc > 2) ? 
atoi(argv[2]) : NUM_REPS; + + time = malloc(sizeof(double) * rep); + + a = malloc(sizeof(float) * ntasks); + + for (i = 0; i < ntasks; i++) { + a[i] = i + 100.0f; + } + + for (r = 0; r < rep; r++) { + time[r] = omp_get_wtime(); + #pragma omp parallel + { + #pragma omp single + { + time2 = omp_get_wtime(); + for (i = 0; i < ntasks; i++) { + #pragma omp task firstprivate(i) + { + presscal(0.9f, &a[i]); + } + } + time2 = omp_get_wtime() - time2; + } + } + time[r] = omp_get_wtime() - time[r]; + avg_time += time[r]; + + } + for (i = 0; i < ntasks; i++) { + if (a[i] != (i + 100.0f) * 0.9f) { + printf("error: a[%d]=%2.f expected %2.f\n", i, + a[i], (i + 100.0f) * 0.9f); + } + } + avg_time /= rep; + + printf("nthreads: %d\nntasks: %d\nTime(s):%f\nCreation Time: %f\n", + nthreads, ntasks, avg_time, time2); + printf("o=%d and it should be %d\n", o, ntasks); + printf("pp=%d and it should be %d\n", pp, ntasks); + + return EXIT_SUCCESS; +} diff --git a/bolt/examples/argobots/taskyield_omp.c b/bolt/examples/argobots/taskyield_omp.c new file mode 100644 index 0000000000000..7436453a67459 --- /dev/null +++ b/bolt/examples/argobots/taskyield_omp.c @@ -0,0 +1,115 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */ + +/* + * See LICENSE.txt in top-level directory. + */ + +/* + * A bunch of n tasks (1st arg) are created by a single thread. 
+ * Each task creates two tasks more and inside the second one + * a taskyield directive is called + */ + +#include +#include +#include +#include +#include + +#define NUM_TASKS 50000 +#define NUM_REPS 1 + +int o = 0; +int pp = 0; + +void na(float value) +{ + o++; +} + +void sscal(float value, float *a) +{ + *a = *a * value; +} + +void presscal(float value, float *a) +{ + #pragma omp task + { + sscal(value, a); + } + + #pragma omp task + { + na(value); + #pragma omp taskyield + } +} + +int main(int argc, char *argv[]) +{ + int i, r, nthreads; + double *time, avg_time = 0.0; + char *str, *endptr; + float *a; + double time2 = 0.0; + + #pragma omp parallel + { + #pragma omp master + { + nthreads = omp_get_num_threads(); + } + } + + if (argc > 1) { + str = argv[1]; + } + + int ntasks = argc > 1 ? strtoll(str, &endptr, 10) : NUM_TASKS; + if (ntasks < nthreads) + ntasks = nthreads; + + int rep = (argc > 2) ? atoi(argv[2]) : NUM_REPS; + + time = malloc(sizeof(double) * rep); + a = malloc(sizeof(float) * ntasks); + + for (i = 0; i < ntasks; i++) { + a[i] = i + 100.0f; + } + + for (r = 0; r < rep; r++) { + time[r] = omp_get_wtime(); + #pragma omp parallel + { + #pragma omp single + { + time2 = omp_get_wtime(); + for (i = 0; i < ntasks; i++) { + #pragma omp task firstprivate(i) + { + presscal(0.9f, &a[i]); + } + } + time2 = omp_get_wtime() - time2; + } + } + time[r] = omp_get_wtime() - time[r]; + avg_time += time[r]; + } + + for (i = 0; i < ntasks; i++) { + if (a[i] != (i + 100.0f) * 0.9f) { + printf("error: a[%d]=%2.f expected %2.f\n", i, + a[i], (i + 100.0f) * 0.9f); + } + } + avg_time /= rep; + printf("nthreads: %d\nntasks: %d\nTime(s):%f\nCreation Time: %f\n", + nthreads, ntasks, avg_time, time2); + printf("o=%d and it should be %d\n", o, ntasks); + printf("pp=%d and it should be %d\n", pp, ntasks); + + return EXIT_SUCCESS; +} diff --git a/bolt/examples/sample_nested.c b/bolt/examples/sample_nested.c new file mode 100644 index 0000000000000..2feb5be2eb870 --- /dev/null 
+++ b/bolt/examples/sample_nested.c @@ -0,0 +1,59 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */ + +/* + * See LICENSE.txt in top-level directory. + */ + +#include +#include +#include + + + +int main(int argc, char * argv[]) { + + int size=(argc>1)?atoi(argv[1]):100; + int i,j,k=0; + int nthreads; + struct timeval t_start, t_end; + double time; + + double *a = (double *)malloc(sizeof(double)*size*size); + + #pragma omp parallel + { + nthreads=omp_get_num_threads(); + } + + for(i=0;i +#include +#include + +int main(int argc, char * argv[]) { + + int i,num=(argc>1)?atoi(argv[1]):100; + int nthreads; + struct timeval t_start, t_end; + double time; + double *a = (double *)malloc(sizeof(double)*num); + + #pragma omp parallel + { + nthreads=omp_get_num_threads(); + } + + + for(i=0;i +#include +#include + +int main(int argc, char * argv[]) { + + int i,num=(argc>1)?atoi(argv[1]):100; + int nthreads; + struct timeval t_start, t_end; + double time; + double *a = (double *)malloc(sizeof(double)*num); + + #pragma omp parallel + { + nthreads=omp_get_num_threads(); + } + + + for(i=0;i