Created
October 25, 2023 12:36
-
-
Save foxtran/fdc4abf8e2de127800f670b9edeeb9f2 to your computer and use it in GitHub Desktop.
The original script is taken from https://github.com/google/autofdo/blob/master/docs/OptimizeClangO3WithPropeller.md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
## This script does the following:
##   1. It checks out and builds LLVM (the release/17.x branch; called
##      "trunk" in the variable names below).
##   2. It checks out and builds the create_llvm_prof tool.
##   3. It builds multiple clang binaries towards building a
##      Propeller optimized clang binary.
##   4. It runs performance comparisons of a baseline clang
##      binary and the Propeller optimized clang binary.
##
## Usage (bash is required: the script uses [[ ]] and arrays):
##   bash propeller_optimize_clang.sh
##
## Environment:
##   BASE_PROPELLER_CLANG_DIR  Optional. Directory that receives every checkout
##                             and build tree. Defaults to
##                             <script dir>/propeller_optimize_clang.dir.
##
## The Propeller optimized clang binary will be in:
##   ${BASE_PROPELLER_CLANG_DIR}/propeller_build/bin/clang

# pipefail is deliberately NOT enabled: the `ninja -t commands | head -100`
# pipeline below closes ninja's stdout early, and with pipefail that early
# close would be reported as a failure and abort the script.
set -eu

# Honor a caller-provided BASE_PROPELLER_CLANG_DIR; otherwise fall back to a
# directory next to this script. The path is made absolute afterwards because
# the script changes directory many times.
BASE_PROPELLER_CLANG_DIR="${BASE_PROPELLER_CLANG_DIR:-$(cd "$(dirname "$0")" && pwd)/propeller_optimize_clang.dir}"
mkdir -p "${BASE_PROPELLER_CLANG_DIR}"
BASE_PROPELLER_CLANG_DIR="$(cd "${BASE_PROPELLER_CLANG_DIR}" && pwd)"

PATH_TO_LLVM_SOURCES="${BASE_PROPELLER_CLANG_DIR}/sources"
PATH_TO_TRUNK_LLVM_BUILD="${BASE_PROPELLER_CLANG_DIR}/trunk_llvm_build"
PATH_TO_TRUNK_LLVM_INSTALL="${BASE_PROPELLER_CLANG_DIR}/trunk_llvm_install"
# Source directory handed to every LLVM cmake invocation below.
LLVM_SOURCE_DIR="${PATH_TO_LLVM_SOURCES}/llvm-project/llvm"

# Build Trunk LLVM
mkdir -p "${PATH_TO_LLVM_SOURCES}" && cd "${PATH_TO_LLVM_SOURCES}"
# Skip the clone when the checkout already exists so the script can be re-run.
[[ -d llvm-project ]] ||
  git clone -b release/17.x --single-branch https://github.com/llvm/llvm-project.git
mkdir -p "${PATH_TO_TRUNK_LLVM_BUILD}" && cd "${PATH_TO_TRUNK_LLVM_BUILD}"
cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -DLLVM_TARGETS_TO_BUILD=X86 \
  -DCMAKE_INSTALL_PREFIX="${PATH_TO_TRUNK_LLVM_INSTALL}" \
  -DLLVM_ENABLE_RTTI=On -DLLVM_INCLUDE_TESTS=Off \
  -DLLVM_ENABLE_PROJECTS="clang;lld" "${LLVM_SOURCE_DIR}"
ninja install

# Build create_llvm_prof
PATH_TO_CREATE_LLVM_PROF="${BASE_PROPELLER_CLANG_DIR}/create_llvm_prof_build"
mkdir -p "${PATH_TO_CREATE_LLVM_PROF}" && cd "${PATH_TO_CREATE_LLVM_PROF}"
[[ -d autofdo ]] ||
  git clone --recursive https://github.com/google/autofdo.git
mkdir -p build && cd build
cmake -G Ninja -DCMAKE_INSTALL_PREFIX="." \
  -DCMAKE_C_COMPILER="${PATH_TO_TRUNK_LLVM_INSTALL}/bin/clang" \
  -DCMAKE_CXX_COMPILER="${PATH_TO_TRUNK_LLVM_INSTALL}/bin/clang++" \
  -DLLVM_PATH="${PATH_TO_TRUNK_LLVM_INSTALL}" ../autofdo/
ninja
# Sanity check: ls fails (and set -e aborts) if the tool was not produced.
ls create_llvm_prof

# Common CMAKE Flags
COMMON_CMAKE_FLAGS=(
  "-DLLVM_OPTIMIZED_TABLEGEN=On"
  "-DCMAKE_BUILD_TYPE=Release"
  "-DLLVM_TARGETS_TO_BUILD=X86"
  "-DLLVM_ENABLE_PROJECTS=clang"
  "-DCMAKE_C_COMPILER=${PATH_TO_TRUNK_LLVM_BUILD}/bin/clang"
  "-DCMAKE_CXX_COMPILER=${PATH_TO_TRUNK_LLVM_BUILD}/bin/clang++"
)

# Additional Baseline CMAKE flags
BASELINE_CC_LD_CMAKE_FLAGS=(
  "-DCMAKE_EXE_LINKER_FLAGS=-fuse-ld=lld"
  "-DCMAKE_SHARED_LINKER_FLAGS=-fuse-ld=lld"
  "-DCMAKE_MODULE_LINKER_FLAGS=-fuse-ld=lld"
)

# Build Baseline Clang Binary
PATH_TO_BASELINE_CLANG_BUILD="${BASE_PROPELLER_CLANG_DIR}/baseline_clang_build"
mkdir -p "${PATH_TO_BASELINE_CLANG_BUILD}" && cd "${PATH_TO_BASELINE_CLANG_BUILD}"
cmake -G Ninja "${COMMON_CMAKE_FLAGS[@]}" "${BASELINE_CC_LD_CMAKE_FLAGS[@]}" "${LLVM_SOURCE_DIR}"
ninja clang

# Labels CMAKE Flags
LABELS_CC_LD_CMAKE_FLAGS=(
  "-DCMAKE_C_FLAGS=-funique-internal-linkage-names -fbasic-block-sections=labels"
  "-DCMAKE_CXX_FLAGS=-funique-internal-linkage-names -fbasic-block-sections=labels"
  "-DCMAKE_EXE_LINKER_FLAGS=-fuse-ld=lld"
  "-DCMAKE_SHARED_LINKER_FLAGS=-fuse-ld=lld"
  "-DCMAKE_MODULE_LINKER_FLAGS=-fuse-ld=lld"
)

# Build Labels Clang binary
PATH_TO_LABELS_CLANG_BUILD="${BASE_PROPELLER_CLANG_DIR}/labels_clang_build"
mkdir -p "${PATH_TO_LABELS_CLANG_BUILD}" && cd "${PATH_TO_LABELS_CLANG_BUILD}"
cmake -G Ninja "${COMMON_CMAKE_FLAGS[@]}" "${LABELS_CC_LD_CMAKE_FLAGS[@]}" "${LLVM_SOURCE_DIR}"
ninja clang

# Set up Benchmarking and BUILD
BENCHMARKING_CLANG_BUILD="${BASE_PROPELLER_CLANG_DIR}/benchmarking_clang_build"
mkdir -p "${BENCHMARKING_CLANG_BUILD}" && cd "${BENCHMARKING_CLANG_BUILD}"
mkdir -p symlink_to_clang_binary && cd symlink_to_clang_binary
# The real clang executable is named clang-<version>; read <version> from the
# CMake cache of the first LLVM build.
CLANG_VERSION=$(sed -Ene 's!^CLANG_EXECUTABLE_VERSION:STRING=(.*)$!\1!p' "${PATH_TO_TRUNK_LLVM_BUILD}/CMakeCache.txt")
# sed exits 0 even when nothing matches, so guard against an empty version,
# which would otherwise create dangling "clang-" symlinks.
if [[ -z "${CLANG_VERSION}" ]]; then
  echo "Could not read CLANG_EXECUTABLE_VERSION from ${PATH_TO_TRUNK_LLVM_BUILD}/CMakeCache.txt" >&2
  exit 1
fi
ln -sf "${PATH_TO_LABELS_CLANG_BUILD}/bin/clang-${CLANG_VERSION}" clang
ln -sf "${PATH_TO_LABELS_CLANG_BUILD}/bin/clang-${CLANG_VERSION}" clang++

# Setup cmake for Benchmarking BUILD. The compiler is referenced through the
# symlinks so it can be swapped later without re-running cmake.
cd "${BENCHMARKING_CLANG_BUILD}"
cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -DLLVM_TARGETS_TO_BUILD=X86 -DLLVM_ENABLE_PROJECTS=clang \
  -DCMAKE_C_COMPILER="${BENCHMARKING_CLANG_BUILD}/symlink_to_clang_binary/clang" \
  -DCMAKE_CXX_COMPILER="${BENCHMARKING_CLANG_BUILD}/symlink_to_clang_binary/clang++" \
  "${LLVM_SOURCE_DIR}"

# Profile the labels binary; the first 100 build commands should do.
# Only stdout is captured so that no diagnostics end up inside the script.
ninja -t commands | head -100 > ./perf_commands.sh
chmod +x ./perf_commands.sh
perf record -e cycles:u -j any,u -- ./perf_commands.sh
ls perf.data

# Convert profiles using create_llvm_prof
cd "${BENCHMARKING_CLANG_BUILD}"
# The tool's output is intentionally discarded; report a failure explicitly so
# that set -e does not terminate the script without any message.
"${PATH_TO_CREATE_LLVM_PROF}/build/create_llvm_prof" --format=propeller \
  --binary="${PATH_TO_LABELS_CLANG_BUILD}/bin/clang-${CLANG_VERSION}" \
  --profiled_binary_name="${PATH_TO_LABELS_CLANG_BUILD}/bin/clang-${CLANG_VERSION}" \
  --profile=perf.data --out=cluster.txt --propeller_symorder=symorder.txt >/dev/null 2>&1 || {
  echo "create_llvm_prof failed; re-run it without the output redirection to see why." >&2
  exit 1
}
ls cluster.txt symorder.txt

# Set Propeller's CMAKE Flags
PROPELLER_CC_LD_CMAKE_FLAGS=(
  "-DCMAKE_C_FLAGS=-funique-internal-linkage-names -fbasic-block-sections=list=${BENCHMARKING_CLANG_BUILD}/cluster.txt"
  "-DCMAKE_CXX_FLAGS=-funique-internal-linkage-names -fbasic-block-sections=list=${BENCHMARKING_CLANG_BUILD}/cluster.txt"
  "-DCMAKE_EXE_LINKER_FLAGS=-Wl,--symbol-ordering-file=${BENCHMARKING_CLANG_BUILD}/symorder.txt -Wl,--no-warn-symbol-ordering -fuse-ld=lld"
  "-DCMAKE_SHARED_LINKER_FLAGS=-Wl,--symbol-ordering-file=${BENCHMARKING_CLANG_BUILD}/symorder.txt -Wl,--no-warn-symbol-ordering -fuse-ld=lld"
  "-DCMAKE_MODULE_LINKER_FLAGS=-Wl,--symbol-ordering-file=${BENCHMARKING_CLANG_BUILD}/symorder.txt -Wl,--no-warn-symbol-ordering -fuse-ld=lld"
)

# Build Propeller Optimized Clang
PATH_TO_PROPELLER_CLANG_BUILD="${BASE_PROPELLER_CLANG_DIR}/propeller_build"
mkdir -p "${PATH_TO_PROPELLER_CLANG_BUILD}" && cd "${PATH_TO_PROPELLER_CLANG_BUILD}"
cmake -G Ninja "${COMMON_CMAKE_FLAGS[@]}" "${PROPELLER_CC_LD_CMAKE_FLAGS[@]}" "${LLVM_SOURCE_DIR}"
ninja clang

#######################################
# Benchmarks one clang binary by rebuilding clang with it under perf stat.
# Globals:   BENCHMARKING_CLANG_BUILD, CLANG_VERSION (read)
# Arguments: $1 - build directory whose bin/clang-${CLANG_VERSION} is measured
# Outputs:   ninja and perf stat output on the terminal
#######################################
run_benchmark() {
  local clang_build_dir=$1
  # Re-point the compiler symlinks used by the benchmarking build.
  cd "${BENCHMARKING_CLANG_BUILD}/symlink_to_clang_binary"
  ln -sf "${clang_build_dir}/bin/clang-${CLANG_VERSION}" clang
  ln -sf "${clang_build_dir}/bin/clang-${CLANG_VERSION}" clang++
  cd "${BENCHMARKING_CLANG_BUILD}"
  # Start from a clean tree so every measured run performs a full build.
  ninja clean
  perf stat -r5 -e instructions,cycles,L1-icache-misses,iTLB-misses -- bash -c "ninja clang && ninja clean"
}

# Run comparison of baseline versus propeller optimized clang
run_benchmark "${PATH_TO_BASELINE_CLANG_BUILD}"
run_benchmark "${PATH_TO_PROPELLER_CLANG_BUILD}"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
BASELINE (samples=5):
Performance counter stats for 'bash -c ninja clang && ninja clean' (5 runs):
29911672560885 instructions:u # 0.73 insn per cycle ( +- 0.00% )
40762914939742 cycles:u ( +- 0.01% )
2198963872412 L1-icache-misses:u ( +- 0.01% )
16606325255 iTLB-misses:u ( +- 0.05% )
119.413 +- 0.212 seconds time elapsed ( +- 0.18% )
PROPELLER (samples=5):
Performance counter stats for 'bash -c ninja clang && ninja clean' (5 runs):
30835273549813 instructions:u # 0.63 insn per cycle ( +- 0.00% )
49008268336239 cycles:u ( +- 0.01% )
3025079343587 L1-icache-misses:u ( +- 0.02% )
16944457932 iTLB-misses:u ( +- 0.03% )
139.041 +- 0.250 seconds time elapsed ( +- 0.18% )
Tested on a RAM disk on the following machine (output of `lscpu`):
Architecture: x86_64 | |
CPU op-mode(s): 32-bit, 64-bit | |
Byte Order: Little Endian | |
CPU(s): 152 | |
On-line CPU(s) list: 0-151 | |
Thread(s) per core: 2 | |
Core(s) per socket: 38 | |
Socket(s): 2 | |
NUMA node(s): 2 | |
Vendor ID: GenuineIntel | |
CPU family: 6 | |
Model: 106 | |
Model name: Intel(R) Xeon(R) Platinum 8368 CPU @ 2.40GHz | |
Stepping: 6 | |
CPU MHz: 3400.000 | |
CPU max MHz: 3400.0000 | |
CPU min MHz: 800.0000 | |
BogoMIPS: 4800.00 | |
Virtualization: VT-x | |
L1d cache: 48K | |
L1i cache: 32K | |
L2 cache: 1280K | |
L3 cache: 58368K | |
NUMA node0 CPU(s): 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98,100,102,104,106,108,110,112,114,116,118,120,122,124,126,128,130,132,134,136,138,140,142,144,146,148,150 | |
NUMA node1 CPU(s): 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63,65,67,69,71,73,75,77,79,81,83,85,87,89,91,93,95,97,99,101,103,105,107,109,111,113,115,117,119,121,123,125,127,129,131,133,135,137,139,141,143,145,147,149,151 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment