| name | cpp-performance |
| description | Performance optimization, profiling, benchmarking, cache optimization, and system-level tuning techniques for C++. |
C++ Performance & Optimization Skill
Learn to profile, analyze, and optimize C++ code for maximum performance.
Profiling Tools
Perf (Linux)
# Record performance data
perf record ./program
# Generate report
perf report
# Flame graph
perf script | stackcollapse-perf.pl | flamegraph.pl > out.svg
Valgrind - Memory and CPU Profiling
# Cachegrind - CPU cache simulation
valgrind --tool=cachegrind ./program
cg_annotate cachegrind.out.&lt;pid&gt;
# Callgrind - Call graph
valgrind --tool=callgrind ./program
kcachegrind callgrind.out.&lt;pid&gt;
Google Benchmark Library
#include <benchmark/benchmark.h>
// Simple benchmark: cost of default-constructing a std::string.
static void BM_StringCreation(benchmark::State& state) {
for (auto _ : state) {
std::string empty_string;
// Prevent the optimizer from deleting the unused string entirely,
// which would make the benchmark measure an empty loop.
benchmark::DoNotOptimize(empty_string);
}
}
BENCHMARK(BM_StringCreation);
// Benchmark with a size argument: push_back N ints into a fresh vector.
static void BM_VectorPush(benchmark::State& state) {
for (auto _ : state) {
std::vector<int> v;
for (int i = 0; i < state.range(0); ++i) {
v.push_back(i);
}
// Force the vector's contents to be observed so the whole loop
// is not optimized away as dead code.
benchmark::DoNotOptimize(v.data());
benchmark::ClobberMemory();
}
}
// Runs with range(0) = 8, 64, 512, ..., 8192.
BENCHMARK(BM_VectorPush)->Range(8, 8 << 10);
BENCHMARK_MAIN();
Benchmarking Best Practices
#include <chrono>
// Simple timing
auto start = std::chrono::high_resolution_clock::now();
// ... code to measure ...
auto end = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(
end - start
);
std::cout << "Time: " << duration.count() << " µs" << std::endl;
// Multiple runs for averaging
double total_time = 0.0;
const int runs = 1000;
for (int i = 0; i < runs; ++i) {
auto start = std::chrono::high_resolution_clock::now();
// ... code ...
auto end = std::chrono::high_resolution_clock::now();
total_time += std::chrono::duration<double>(end - start).count();
}
double average = total_time / runs;
Cache Optimization
CPU Cache Hierarchy
L1 Cache: ~32KB per core, ~4 cycles
L2 Cache: ~256KB, ~11 cycles
L3 Cache: ~8MB, ~40 cycles
RAM: ~100+ cycles
Cache-Friendly Code
// BAD: Column-major access (bad cache locality)
int arr[1000][1000];
for (int j = 0; j < 1000; ++j) {
for (int i = 0; i < 1000; ++i) {
sum += arr[i][j]; // Jumps across cache lines
}
}
// GOOD: Row-major access (good cache locality)
for (int i = 0; i < 1000; ++i) {
for (int j = 0; j < 1000; ++j) {
sum += arr[i][j]; // Sequential memory access
}
}
Data Structure Alignment
// Inefficient layout: 24 bytes due to padding (alignof(double) == 8)
struct BadLayout {
char a; // 1 byte + 7 bytes padding so b starts on an 8-byte boundary
double b; // 8 bytes
char c; // 1 byte + 7 bytes tail padding (struct size must be a multiple of 8)
};
// Efficient layout (16 bytes): largest-alignment member first
struct GoodLayout {
double b; // 8 bytes
char a; // 1 byte
char c; // 1 byte + 6 bytes tail padding
};
std::cout << sizeof(BadLayout) << std::endl; // 24 bytes
std::cout << sizeof(GoodLayout) << std::endl; // 16 bytes
Algorithmic Optimization
Algorithm Selection
// O(n^2) reference version of the two-sum index search: for each index i,
// linearly scan the EARLIER elements for the complement (target - v[i]).
// Returns exactly the indices fastSearch below returns, just quadratically.
// (The original scanned all j including j == i, so it paired an element
// with itself and pushed duplicate indices, disagreeing with fastSearch.)
std::vector<int> slowSearch(const std::vector<int>& v, int target) {
std::vector<int> result;
for (std::size_t i = 0; i < v.size(); ++i) {
const int complement = target - v[i];
// Only j < i: each element may only pair with a previous one.
for (std::size_t j = 0; j < i; ++j) {
if (v[j] == complement) {
result.push_back(static_cast<int>(i));
break; // record index i at most once
}
}
}
return result;
}
// O(n) version: hash-set of elements seen so far. Index i is reported when
// some EARLIER element equals target - v[i]; each index appears at most once.
std::vector<int> fastSearch(const std::vector<int>& v, int target) {
std::unordered_set<int> seen;
std::vector<int> result;
for (std::size_t i = 0; i < v.size(); ++i) {  // size_t: no signed/unsigned mix
const int complement = target - v[i];
if (seen.count(complement) != 0) {
result.push_back(static_cast<int>(i));
}
seen.insert(v[i]);
}
return result;
}
Dynamic Programming
// Naive recursive Fibonacci: exponential O(2^n) time — illustration only.
int fib_naive(int n) {
return n < 2 ? n : fib_naive(n - 1) + fib_naive(n - 2);
}
// Memoized Fibonacci: O(n) time; results are cached across calls in `memo`.
std::unordered_map<int, int> memo;
int fib_dp(int n) {
if (n <= 1) {
return n;
}
auto cached = memo.find(n);
if (cached != memo.end()) {
return cached->second;
}
const int value = fib_dp(n - 1) + fib_dp(n - 2);
memo[n] = value;
return value;
}
Compiler Optimizations
Optimization Levels
# -O0: No optimization (default, debugging)
g++ -O0 -g program.cpp
# -O1: Basic optimization
g++ -O1 program.cpp
# -O2: More optimization (usually good balance)
g++ -O2 program.cpp
# -O3: Aggressive optimization (more inlining/vectorization; larger binaries, occasionally slower than -O2 — measure)
g++ -O3 program.cpp
# -Os: Optimize for size
g++ -Os program.cpp
# PGO (Profile-Guided Optimization)
g++ -fprofile-generate program.cpp -o program
./program # Run with representative input
g++ -fprofile-use program.cpp -o program
Inline Hints
// `inline` is a linkage hint; modern compilers decide inlining themselves.
inline void frequently_called() {
// ...
}
// Force inline (compiler dependent). On GCC/Clang the always_inline
// attribute must be paired with the `inline` keyword for free-function
// definitions, otherwise the compiler may warn and refuse to inline.
#ifdef _MSC_VER
#define FORCEINLINE __forceinline
#else
#define FORCEINLINE inline __attribute__((always_inline))
#endif
FORCEINLINE void critical_function() {
// ...
}
Memory Optimization
Custom Allocators
// Fixed-capacity bump allocator: hands out monotonically increasing slices
// of one pre-allocated buffer. Individual frees are no-ops; all memory is
// reclaimed at once with reset().
class PoolAllocator {
private:
std::vector<char> pool;
size_t offset = 0;
// Round n up to alignof(max_align_t) so every returned pointer is
// suitably aligned for any scalar type (the original raw bump could
// hand back misaligned pointers — UB for most element types).
static size_t align_up(size_t n) {
const size_t a = alignof(std::max_align_t);
return (n + a - 1) / a * a;
}
public:
explicit PoolAllocator(size_t size) : pool(size) {}
// Returns `size` bytes from the pool; throws std::bad_alloc on exhaustion.
void* allocate(size_t size) {
const size_t aligned = align_up(offset);
// Overflow-safe capacity check.
if (aligned > pool.size() || size > pool.size() - aligned) {
throw std::bad_alloc();
}
void* ptr = pool.data() + aligned;
offset = aligned + size;
return ptr;
}
// A bump allocator cannot free individual allocations.
void deallocate(void* /*ptr*/, size_t /*size*/) {}
// Make the whole pool reusable (invalidates all outstanding pointers).
void reset() { offset = 0; }
};
Arena Allocation
// Growable arena: carves allocations out of 64 KiB blocks, starting a new
// block whenever the current one cannot satisfy a request. Everything is
// freed at once via reset(); there is no per-allocation free.
// NOTE(review): returned pointers are only char-aligned, same as the
// original — round `size` up if storing types with stricter alignment.
class Arena {
private:
std::vector<std::unique_ptr<char[]>> blocks;
size_t current_offset = 0;
static const size_t BLOCK_SIZE = 65536;
public:
void* allocate(size_t size) {
// Oversized requests get a dedicated block. The original code wrote
// `size` bytes into a BLOCK_SIZE buffer when size > BLOCK_SIZE —
// a heap buffer overrun.
if (size > BLOCK_SIZE) {
blocks.push_back(std::make_unique<char[]>(size));
// Mark the dedicated block as full so the next small request
// opens a fresh standard block instead of appending to this one.
current_offset = BLOCK_SIZE;
return blocks.back().get();
}
if (blocks.empty() || current_offset + size > BLOCK_SIZE) {
blocks.push_back(std::make_unique<char[]>(BLOCK_SIZE));
current_offset = 0;
}
void* ptr = blocks.back().get() + current_offset;
current_offset += size;
return ptr;
}
// Release every block at once; all pointers handed out become invalid.
void reset() {
blocks.clear();
current_offset = 0;
}
};
Parallel Algorithms (C++17)
Execution Policies
#include <execution>
std::vector<int> v = {5, 2, 8, 1, 9};
// Parallel sort
std::sort(std::execution::par, v.begin(), v.end());
// Parallel for_each
std::for_each(std::execution::par_unseq, v.begin(), v.end(),
[](int& x) { x *= 2; });
// Parallel find
auto it = std::find(std::execution::par, v.begin(), v.end(), 5);
OpenMP Parallelization
#include <omp.h>
// Parallel for loop
#pragma omp parallel for
for (int i = 0; i < 1000; ++i) {
data[i] *= 2;
}
// Parallel region with threads
#pragma omp parallel num_threads(4)
{
int id = omp_get_thread_num();
// Each thread runs this code
}
// Parallel reduction
int sum = 0;
#pragma omp parallel for reduction(+:sum)
for (int i = 0; i < 1000; ++i) {
sum += data[i];
}
Performance Optimization Checklist
- Measure first - Profile before optimizing
- Focus on bottlenecks - 80/20 rule
- Use appropriate data structures - Choose by access pattern
- Minimize memory allocations - Reuse containers
- Improve cache locality - Access memory sequentially
- Parallelize - Use multi-threading when beneficial
- Compiler optimizations - Use -O2 or -O3
- SIMD - Use vectorization instructions
- Reduce branching - Branch misprediction is expensive
- Inline critical functions - Let compiler decide
When to Use This Skill
- Identifying performance bottlenecks
- Optimizing algorithm selection
- Improving cache utilization
- Parallelizing code
- Reducing memory footprint
- Fine-tuning compiler settings
- Benchmarking code changes