Claude Code Plugins

Community-maintained marketplace

Feedback

Performance optimization, profiling, benchmarking, cache optimization, and system-level tuning techniques for C++.

Install Skill

1Download skill
2Enable skills in Claude

Open claude.ai/settings/capabilities and find the "Skills" section

3Upload to Claude

Click "Upload skill" and select the downloaded ZIP file

Note: Please review the skill's instructions before using it, to verify it does what you expect.

SKILL.md

name cpp-performance
description Performance optimization, profiling, benchmarking, cache optimization, and system-level tuning techniques for C++.

C++ Performance & Optimization Skill

Learn to profile, analyze, and optimize C++ code for maximum performance.

Profiling Tools

Perf (Linux)

# Record performance data
perf record ./program

# Generate report
perf report

# Flame graph
perf script | stackcollapse-perf.pl | flamegraph.pl > out.svg

Valgrind - Memory and CPU Profiling

# Cachegrind - CPU cache simulation
valgrind --tool=cachegrind ./program
cg_annotate cachegrind.out.<pid>   # <pid> is the program's process ID

# Callgrind - Call graph
valgrind --tool=callgrind ./program
kcachegrind callgrind.out.<pid>    # <pid> is the program's process ID

Google Benchmark Library

#include <benchmark/benchmark.h>

// Simple benchmark
// Simple benchmark: measures default construction of std::string.
static void BM_StringCreation(benchmark::State& state) {
    for (auto _ : state) {
        std::string empty_string;
        // Without this the optimizer may delete the unused string
        // entirely and the benchmark measures an empty loop.
        benchmark::DoNotOptimize(empty_string);
    }
}

BENCHMARK(BM_StringCreation);

// With arguments
// Benchmark vector growth; state.range(0) supplies the element count
// (set below via ->Range(...)).
static void BM_VectorPush(benchmark::State& state) {
    for (auto _ : state) {
        std::vector<int> v;
        for (int i = 0; i < state.range(0); ++i) {
            v.push_back(i);
        }
        // Keep the vector's storage observable so the optimizer
        // cannot eliminate the push_back loop being measured.
        benchmark::DoNotOptimize(v.data());
        benchmark::ClobberMemory();
    }
}

BENCHMARK(BM_VectorPush)->Range(8, 8 << 10);

BENCHMARK_MAIN();

Benchmarking Best Practices

#include <chrono>

// Simple timing
auto start = std::chrono::high_resolution_clock::now();

// ... code to measure ...

auto end = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(
    end - start
);
std::cout << "Time: " << duration.count() << " µs" << std::endl;

// Multiple runs for averaging
double total_time = 0.0;
const int runs = 1000;
for (int i = 0; i < runs; ++i) {
    auto start = std::chrono::high_resolution_clock::now();
    // ... code ...
    auto end = std::chrono::high_resolution_clock::now();
    total_time += std::chrono::duration<double>(end - start).count();
}
double average = total_time / runs;

Cache Optimization

CPU Cache Hierarchy

L1 Cache: ~32KB per core, ~4 cycles
L2 Cache: ~256KB, ~11 cycles
L3 Cache: ~8MB, ~40 cycles
RAM: ~100+ cycles

Cache-Friendly Code

// BAD: Column-major access (bad cache locality)
int arr[1000][1000];
for (int j = 0; j < 1000; ++j) {
    for (int i = 0; i < 1000; ++i) {
        sum += arr[i][j];  // Jumps across cache lines
    }
}

// GOOD: Row-major access (good cache locality)
for (int i = 0; i < 1000; ++i) {
    for (int j = 0; j < 1000; ++j) {
        sum += arr[i][j];  // Sequential memory access
    }
}

Data Structure Alignment

// Inefficient layout (24 bytes due to padding)
struct BadLayout {
    char a;      // 1 byte + 7 bytes padding to 8-byte-align `b`
    double b;    // 8 bytes
    char c;      // 1 byte + 7 bytes tail padding (struct align = 8) => 24 total
};

// Efficient layout (16 bytes)
struct GoodLayout {
    double b;    // 8 bytes first, so no leading padding is needed
    char a;      // 1 byte
    char c;      // 1 byte + 6 bytes tail padding => 16 total
};

std::cout << sizeof(BadLayout) << std::endl;  // 24 bytes
std::cout << sizeof(GoodLayout) << std::endl; // 16 bytes

Algorithmic Optimization

Algorithm Selection

// O(n²) - Too slow for large n
std::vector<int> slowSearch(const std::vector<int>& v, int target) {
    std::vector<int> result;
    for (int i = 0; i < v.size(); ++i) {
        for (int j = 0; j < v.size(); ++j) {
            if (v[i] + v[j] == target) {
                result.push_back(i);
            }
        }
    }
    return result;
}

// O(n) - Fast
std::vector<int> fastSearch(const std::vector<int>& v, int target) {
    std::unordered_set<int> seen;
    std::vector<int> result;
    for (int i = 0; i < v.size(); ++i) {
        int complement = target - v[i];
        if (seen.count(complement)) {
            result.push_back(i);
        }
        seen.insert(v[i]);
    }
    return result;
}

Dynamic Programming

// Fibonacci: Naive O(2^n)
// Fibonacci via naive double recursion — deliberately O(2^n) to
// contrast with the memoized O(n) version below.
int fib_naive(int n) {
    return (n <= 1) ? n : fib_naive(n - 1) + fib_naive(n - 2);
}

// DP: O(n) with memoization
// Fibonacci with memoization: O(n) time instead of O(2^n).
// NOTE: int overflows past fib(46); widen to int64_t for larger n.
int fib_dp(int n) {
    // Function-local cache instead of the original global: same behavior,
    // but no mutable global state leaking into the enclosing scope.
    static std::unordered_map<int, int> memo;
    if (n <= 1) return n;
    // C++17 init-statement: one lookup instead of count() + operator[].
    if (auto it = memo.find(n); it != memo.end()) return it->second;
    return memo[n] = fib_dp(n - 1) + fib_dp(n - 2);
}

Compiler Optimizations

Optimization Levels

# -O0: No optimization (default, debugging)
g++ -O0 -g program.cpp

# -O1: Basic optimization
g++ -O1 program.cpp

# -O2: More optimization (usually good balance)
g++ -O2 program.cpp

# -O3: Aggressive optimization (larger code size; benchmark to confirm it actually helps)
g++ -O3 program.cpp

# -Os: Optimize for size
g++ -Os program.cpp

# PGO (Profile-Guided Optimization)
g++ -fprofile-generate program.cpp -o program
./program  # Run with representative input
g++ -fprofile-use program.cpp -o program

Inline Hints

// Explicit inline: only a hint — modern compilers make their own
// inlining decisions; the keyword's guaranteed effect is on linkage.
inline void frequently_called() {
    // ...
}

// Force inline (compiler dependent).
// NOTE: GCC/Clang need the `inline` keyword alongside
// __attribute__((always_inline)); without it GCC warns
// "always_inline function might not be inlinable" and may not inline.
#ifdef _MSC_VER
    #define FORCEINLINE __forceinline
#else
    #define FORCEINLINE inline __attribute__((always_inline))
#endif

FORCEINLINE void critical_function() {
    // ...
}

Memory Optimization

Custom Allocators

// Fixed-capacity bump ("pool") allocator: hands out sequential slices of
// one pre-allocated buffer. O(1) allocation, no per-allocation metadata.
class PoolAllocator {
private:
    std::vector<char> pool;   // backing storage, allocated once up front
    size_t offset = 0;        // index of the next free byte in `pool`

public:
    PoolAllocator(size_t size) : pool(size) {}

    // Returns `size` bytes suitably aligned for any standard type.
    // Throws std::bad_alloc when the pool is exhausted.
    void* allocate(size_t size) {
        // Round the cursor up to max_align_t — the original returned
        // misaligned pointers after any odd-sized allocation.
        constexpr size_t align = alignof(std::max_align_t);
        size_t aligned = (offset + align - 1) & ~(align - 1);
        if (aligned > pool.size() || size > pool.size() - aligned) {
            throw std::bad_alloc();
        }
        void* ptr = pool.data() + aligned;
        offset = aligned + size;
        return ptr;
    }

    // Bump allocators cannot free individual blocks; no-op by design.
    void deallocate(void* /*ptr*/, size_t /*size*/) {}
};

Arena Allocation

// Arena allocator: bump-allocates out of fixed-size blocks and frees
// everything at once with reset() — ideal for many short-lived objects.
class Arena {
private:
    std::vector<std::unique_ptr<char[]>> blocks;  // owned blocks, in order
    size_t current_offset = 0;                    // cursor within blocks.back()
    static constexpr size_t BLOCK_SIZE = 65536;

public:
    // Returns `size` bytes from the current block, starting a new block
    // when the current one cannot fit the request.
    // Throws std::bad_alloc for requests larger than one block — the
    // original silently returned a pointer spanning past the block's end
    // (a heap buffer overflow) for any size > BLOCK_SIZE.
    void* allocate(size_t size) {
        if (size > BLOCK_SIZE) {
            throw std::bad_alloc();
        }
        if (blocks.empty() || current_offset + size > BLOCK_SIZE) {
            blocks.push_back(std::make_unique<char[]>(BLOCK_SIZE));
            current_offset = 0;
        }
        void* ptr = blocks.back().get() + current_offset;
        current_offset += size;
        return ptr;
    }

    // Releases every block at once — the whole point of arena allocation.
    void reset() {
        blocks.clear();
        current_offset = 0;
    }
};

Parallel Algorithms (C++17)

Execution Policies

#include <execution>

std::vector<int> v = {5, 2, 8, 1, 9};

// Parallel sort
std::sort(std::execution::par, v.begin(), v.end());

// Parallel for_each
std::for_each(std::execution::par_unseq, v.begin(), v.end(),
              [](int& x) { x *= 2; });

// Parallel find
auto it = std::find(std::execution::par, v.begin(), v.end(), 5);

OpenMP Parallelization

#include <omp.h>

// Parallel for loop
#pragma omp parallel for
for (int i = 0; i < 1000; ++i) {
    data[i] *= 2;
}

// Parallel region with threads
#pragma omp parallel num_threads(4)
{
    int id = omp_get_thread_num();
    // Each thread runs this code
}

// Parallel reduction
int sum = 0;
#pragma omp parallel for reduction(+:sum)
for (int i = 0; i < 1000; ++i) {
    sum += data[i];
}

Performance Optimization Checklist

  1. Measure first - Profile before optimizing
  2. Focus on bottlenecks - 80/20 rule
  3. Use appropriate data structures - Choose by access pattern
  4. Minimize memory allocations - Reuse containers
  5. Improve cache locality - Access memory sequentially
  6. Parallelize - Use multi-threading when beneficial
  7. Compiler optimizations - Use -O2 or -O3
  8. SIMD - Use vectorization instructions
  9. Reduce branching - Branch misprediction is expensive
  10. Inline critical functions - Let compiler decide

When to Use This Skill

  • Identifying performance bottlenecks
  • Optimizing algorithm selection
  • Improving cache utilization
  • Parallelizing code
  • Reducing memory footprint
  • Fine-tuning compiler settings
  • Benchmarking code changes