Optimizing CPU Cache Usage in C++ Applications
Cache Utilization Fundamentals
Locality Principles
- Temporal Locality: Recently accessed data is likely to be accessed again soon. Values reused repeatedly within a loop therefore tend to stay resident in cache (or even in registers) between uses.
- Spatial Locality: Adjacent memory locations are loaded together into cache lines (typically 64 bytes). Sequential memory access exploits this: a single cache-line fill services many subsequent reads.
#include <iostream>
#include <vector>

int main() {
    const size_t size = 10000;
    std::vector<int> data(size);

    // Initialize vector elements sequentially
    for (size_t i = 0; i < size; ++i) {
        data[i] = static_cast<int>(i);
    }

    // Sum the elements: the sequential scan exploits spatial locality,
    // while the repeatedly updated accumulator exhibits temporal locality.
    long long total = 0;
    for (size_t i = 0; i < size; ++i) {
        total += data[i];
    }

    std::cout << "Total: " << total << std::endl;
    return 0;
}
This example demonstrates sequential memory access, which maximizes spatial locality. Because a vector stores its elements contiguously, one cache-line fill brings in several adjacent elements (sixteen 4-byte ints on a 64-byte line), so most reads during iteration are cache hits.
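To see cache-line granularity directly, one can compare a sequential sum with a strided one over the same data. The following is a rough micro-benchmark sketch, not a rigorous measurement; the timeMs helper, the 64-byte line-size assumption, and the constants are our own choices, and an optimizing compiler may narrow the gap:

#include <chrono>
#include <iostream>
#include <vector>

// Hypothetical micro-benchmark helper: time a callable in milliseconds.
template <typename F>
long long timeMs(F f) {
    auto start = std::chrono::steady_clock::now();
    f();
    auto end = std::chrono::steady_clock::now();
    return std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
}

int main() {
    const size_t n = 1 << 24;    // ~64 MB of ints, far larger than any cache
    const size_t STRIDE = 16;    // 16 ints * 4 bytes = 64 bytes, one typical cache line
    std::vector<int> data(n, 1);

    long long seqSum = 0;
    long long seqMs = timeMs([&] {
        // Sequential: one cache-line fill services 16 consecutive reads.
        for (size_t i = 0; i < n; ++i) seqSum += data[i];
    });

    long long strideSum = 0;
    long long strideMs = timeMs([&] {
        // Strided passes: the same total number of reads, but each pass touches
        // every cache line once, so every line is fetched STRIDE times overall.
        for (size_t off = 0; off < STRIDE; ++off)
            for (size_t i = off; i < n; i += STRIDE) strideSum += data[i];
    });

    std::cout << "sequential: " << seqMs << " ms (sum " << seqSum << "), "
              << "strided: " << strideMs << " ms (sum " << strideSum << ")" << std::endl;
    return 0;
}

Printing both sums keeps the compiler from eliminating the loops; on typical hardware the strided version is noticeably slower despite performing the same number of additions.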
Matrix Operations and Cache Efficiency
#include <iostream>
#include <vector>

const size_t DIM = 1000;

void multiplyMatrices(const std::vector<std::vector<int>>& A,
                      const std::vector<std::vector<int>>& B,
                      std::vector<std::vector<int>>& C) {
    for (size_t i = 0; i < DIM; ++i) {
        for (size_t j = 0; j < DIM; ++j) {
            long long sum = 0;
            for (size_t k = 0; k < DIM; ++k) {
                // A[i][k] walks along a row (sequential); B[k][j] walks down
                // a column, jumping to a different row's allocation each step.
                sum += static_cast<long long>(A[i][k]) * B[k][j];
            }
            C[i][j] = static_cast<int>(sum);
        }
    }
}

int main() {
    std::vector<std::vector<int>> matrixA(DIM, std::vector<int>(DIM, 1));
    std::vector<std::vector<int>> matrixB(DIM, std::vector<int>(DIM, 2));
    std::vector<std::vector<int>> result(DIM, std::vector<int>(DIM, 0));
    multiplyMatrices(matrixA, matrixB, result);
    return 0;
}
The nested loop structure is only partly cache-friendly. The innermost loop reads A[i][k] sequentially along a row, but B[k][j] strides down a column, landing in a different row (and, with a vector of vectors, a separate heap allocation) on every iteration. This column-wise access to B is the classic cache bottleneck of naive matrix multiplication; reordering the loops so the innermost index advances along rows of both matrices avoids it, as sketched below.
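A minimal sketch of that loop interchange, written as a drop-in replacement for multiplyMatrices in the program above (the name multiplyMatricesIKJ is our own; it assumes C starts zeroed, which main above guarantees):

// Drop-in replacement for multiplyMatrices above (same includes and DIM).
// Loop order i-k-j: the innermost loop advances j, so B[k][j] is read and
// C[i][j] is written sequentially along rows, with no column striding.
void multiplyMatricesIKJ(const std::vector<std::vector<int>>& A,
                         const std::vector<std::vector<int>>& B,
                         std::vector<std::vector<int>>& C) {
    for (size_t i = 0; i < DIM; ++i) {
        for (size_t k = 0; k < DIM; ++k) {
            const long long a = A[i][k];  // loaded once, reused for the whole j loop
            for (size_t j = 0; j < DIM; ++j) {
                C[i][j] += static_cast<int>(a * B[k][j]);
            }
        }
    }
}

Hoisting A[i][k] into a local also gives it perfect temporal locality: it is read from memory once and reused across the entire inner loop.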
Memory Layout Optimization
#include <cstdio>    // std::snprintf
#include <iostream>
#include <vector>

struct Employee {
    int id;
    char name[20];
    int age;
};

int main() {
    const size_t count = 1000;
    std::vector<Employee> employees(count);

    // Initialize employee records
    for (size_t i = 0; i < count; ++i) {
        employees[i].id = static_cast<int>(i + 1);
        std::snprintf(employees[i].name, sizeof(employees[i].name), "Emp%zu", i + 1);
        employees[i].age = 25 + static_cast<int>(i % 10);
    }

    // Calculate average age
    long long sum = 0;
    for (size_t i = 0; i < count; ++i) {
        sum += employees[i].age;
    }
    double avg = static_cast<double>(sum) / count;
    std::cout << "Average age: " << avg << std::endl;
    return 0;
}
Storing complete Employee records in one contiguous vector keeps each record's fields adjacent in memory, so reading a whole record costs at most one or two cache-line fills. The average-age loop, however, touches only the age field: of every 28-byte record pulled into cache, 24 bytes go unused. When a hot loop reads a single field, a structure-of-arrays layout uses cache lines far more efficiently, as the sketch below shows.
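A minimal structure-of-arrays variant of the same computation (the EmployeeTable type is our own invention): the ages sit in their own dense array, so every byte brought into cache is data the loop actually uses.

#include <iostream>
#include <string>
#include <vector>

// Structure of arrays: each field gets its own contiguous array.
struct EmployeeTable {
    std::vector<int> ids;
    std::vector<std::string> names;
    std::vector<int> ages;
};

int main() {
    const size_t count = 1000;
    EmployeeTable table;
    table.ids.reserve(count);
    table.names.reserve(count);
    table.ages.reserve(count);
    for (size_t i = 0; i < count; ++i) {
        table.ids.push_back(static_cast<int>(i + 1));
        table.names.push_back("Emp" + std::to_string(i + 1));
        table.ages.push_back(25 + static_cast<int>(i % 10));
    }

    // The hot loop now streams through a dense int array: a 64-byte
    // cache line holds 16 ages instead of roughly two whole records.
    long long sum = 0;
    for (int age : table.ages) {
        sum += age;
    }
    std::cout << "Average age: " << static_cast<double>(sum) / count << std::endl;
    return 0;
}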
Cache-Friendly Algorithm Design
#include <iostream>
#include <vector>

int computeFibonacci(int n) {
    if (n <= 1) return n;
    std::vector<int> dp(n + 1);
    dp[0] = 0;
    dp[1] = 1;
    for (int i = 2; i <= n; ++i) {
        dp[i] = dp[i - 1] + dp[i - 2];
    }
    return dp[n];
}

int main() {
    int result = computeFibonacci(30);
    std::cout << "Fibonacci(30): " << result << std::endl;
    return 0;
}
The dynamic programming approach stores intermediate results contiguously and fills them in order, so the reads of dp[i - 1] and dp[i - 2] always hit values written moments earlier: sequential access combined with strong temporal locality. In fact only the last two entries are ever needed, so a rolling pair of variables shrinks the working set to two integers, as sketched below.
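A minimal rolling-pair variant (the name fibonacciRolling is our own); the entire working set can be kept in registers, so the loop body needs no memory traffic at all:

#include <iostream>

// Keeps only the last two Fibonacci values; the compiler can hold
// prev and curr in registers for the whole loop.
int fibonacciRolling(int n) {
    if (n <= 1) return n;
    int prev = 0, curr = 1;
    for (int i = 2; i <= n; ++i) {
        int next = prev + curr;
        prev = curr;
        curr = next;
    }
    return curr;
}

int main() {
    std::cout << "Fibonacci(30): " << fibonacciRolling(30) << std::endl;
    return 0;
}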
Block-Based Processing for Cache Optimization
#include <algorithm>  // std::min
#include <chrono>
#include <iostream>
#include <vector>

const size_t SIZE = 1000;
const size_t BLOCK_SIZE = 32;  // 32x32 ints = 4 KB per tile

void blockMatrixMultiply(const std::vector<std::vector<int>>& A,
                         const std::vector<std::vector<int>>& B,
                         std::vector<std::vector<int>>& C) {
    for (size_t i = 0; i < SIZE; i += BLOCK_SIZE) {
        for (size_t j = 0; j < SIZE; j += BLOCK_SIZE) {
            for (size_t k = 0; k < SIZE; k += BLOCK_SIZE) {
                // Multiply one BLOCK_SIZE x BLOCK_SIZE tile of A with one
                // tile of B, accumulating into the matching tile of C.
                for (size_t ii = i; ii < std::min(i + BLOCK_SIZE, SIZE); ++ii) {
                    for (size_t jj = j; jj < std::min(j + BLOCK_SIZE, SIZE); ++jj) {
                        long long sum = 0;
                        for (size_t kk = k; kk < std::min(k + BLOCK_SIZE, SIZE); ++kk) {
                            sum += static_cast<long long>(A[ii][kk]) * B[kk][jj];
                        }
                        C[ii][jj] += static_cast<int>(sum);
                    }
                }
            }
        }
    }
}

int main() {
    std::vector<std::vector<int>> A(SIZE, std::vector<int>(SIZE, 1));
    std::vector<std::vector<int>> B(SIZE, std::vector<int>(SIZE, 2));
    std::vector<std::vector<int>> C(SIZE, std::vector<int>(SIZE, 0));

    auto start = std::chrono::high_resolution_clock::now();
    blockMatrixMultiply(A, B, C);
    auto end = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
    std::cout << "Execution time: " << duration.count() << " ms" << std::endl;
    return 0;
}
Blocking (loop tiling) reduces cache misses by confining each phase of the computation to tiles small enough to stay resident in cache. With BLOCK_SIZE = 32, a tile of ints occupies 32 x 32 x 4 bytes = 4 KB, so the three active tiles together need about 12 KB and fit comfortably in a typical 32 KB L1 data cache; each element of the B tile is then reused 32 times before the tile is evicted, instead of being refetched from memory on every use. One caveat: with std::vector<std::vector<int>>, each row is a separate allocation, so tiles are not truly contiguous. A flat one-dimensional array fixes this, as sketched below.
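A minimal sketch of the blocked multiply over a flat row-major array (the Matrix alias, the i-k-j tile order, and the hoisted local a are our own choices): element (i, j) lives at index i * N + j, so every row of every tile is genuinely contiguous within one allocation.

#include <algorithm>
#include <vector>

const size_t N = 1000;
const size_t BS = 32;

using Matrix = std::vector<int>;  // N*N ints, row-major: element (i, j) at i * N + j

void blockMultiplyFlat(const Matrix& A, const Matrix& B, Matrix& C) {
    for (size_t i = 0; i < N; i += BS) {
        for (size_t k = 0; k < N; k += BS) {
            for (size_t j = 0; j < N; j += BS) {
                // i-k-j tile order: the innermost loop streams along rows
                // of B and C within the current tiles.
                for (size_t ii = i; ii < std::min(i + BS, N); ++ii) {
                    for (size_t kk = k; kk < std::min(k + BS, N); ++kk) {
                        const long long a = A[ii * N + kk];
                        for (size_t jj = j; jj < std::min(j + BS, N); ++jj) {
                            C[ii * N + jj] += static_cast<int>(a * B[kk * N + jj]);
                        }
                    }
                }
            }
        }
    }
}

int main() {
    Matrix A(N * N, 1), B(N * N, 2), C(N * N, 0);
    blockMultiplyFlat(A, B, C);
    return 0;
}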
Sequential Access Patterns
#include <iostream>
#include <vector>

const size_t ROWS = 1000;
const size_t COLS = 1000;

void processMatrix(const std::vector<std::vector<int>>& matrix) {
    long long sum = 0;
    for (size_t row = 0; row < ROWS; ++row) {
        for (size_t col = 0; col < COLS; ++col) {
            sum += matrix[row][col];
        }
    }
    std::cout << "Sum: " << sum << std::endl;
}

int main() {
    std::vector<std::vector<int>> matrix(ROWS, std::vector<int>(COLS, 0));

    // Initialize matrix values
    for (size_t row = 0; row < ROWS; ++row) {
        for (size_t col = 0; col < COLS; ++col) {
            matrix[row][col] = static_cast<int>(row * COLS + col);
        }
    }
    processMatrix(matrix);
    return 0;
}
Row-major traversal, with the inner loop advancing the column index, walks each row sequentially, matching the in-memory layout and using every byte of each fetched cache line. Swapping the loop order so the inner loop advances the row index destroys this, as the counter-example below shows.
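For contrast, a minimal sketch of the cache-hostile ordering (the name processMatrixColumnMajor is our own). Only the loop nesting changes, yet consecutive accesses now jump between separately allocated rows:

#include <iostream>
#include <vector>

const size_t ROWS = 1000;
const size_t COLS = 1000;

// Column-major traversal of a row-major layout: consecutive iterations
// touch matrix[0][col], matrix[1][col], ..., each in a different row's
// separately allocated buffer, so nearly every access risks a cache miss.
void processMatrixColumnMajor(const std::vector<std::vector<int>>& matrix) {
    long long sum = 0;
    for (size_t col = 0; col < COLS; ++col) {
        for (size_t row = 0; row < ROWS; ++row) {
            sum += matrix[row][col];
        }
    }
    std::cout << "Sum: " << sum << std::endl;
}

int main() {
    std::vector<std::vector<int>> matrix(ROWS, std::vector<int>(COLS, 1));
    processMatrixColumnMajor(matrix);  // same result, far worse locality
    return 0;
}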
Efficient Data Structures
#include <iostream>
#include <stdexcept>
#include <vector>

class Stack {
private:
    std::vector<int> storage;  // contiguous backing store

public:
    void push(int value) {
        storage.push_back(value);
    }

    int pop() {
        if (storage.empty()) {
            throw std::runtime_error("Stack underflow");
        }
        int top = storage.back();
        storage.pop_back();
        return top;
    }

    bool empty() const {
        return storage.empty();
    }
};

int main() {
    Stack stack;
    stack.push(10);
    stack.push(20);
    stack.push(30);
    std::cout << stack.pop() << std::endl;  // Outputs 30
    std::cout << stack.pop() << std::endl;  // Outputs 20
    std::cout << stack.pop() << std::endl;  // Outputs 10
    return 0;
}
Backing the stack with a std::vector gives good cache behavior: elements are stored contiguously, pushes and pops work at the hot end of the array, and there is no per-node allocation or pointer chasing as there would be in a linked-list-based stack.
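To make that contrast concrete, a minimal sketch timing a full traversal of a contiguous std::vector against a node-based std::list (the element count and timing scaffolding are our own choices; exact numbers vary by machine):

#include <chrono>
#include <iostream>
#include <list>
#include <numeric>
#include <vector>

int main() {
    const size_t n = 1000000;
    std::vector<int> vec(n, 1);
    std::list<int> lst(n, 1);

    // Vector: elements are adjacent, so traversal streams through cache
    // lines and the hardware prefetcher can run ahead of the loop.
    auto t0 = std::chrono::steady_clock::now();
    long long vecSum = std::accumulate(vec.begin(), vec.end(), 0LL);
    auto t1 = std::chrono::steady_clock::now();

    // List: each node is a separate allocation reached via a pointer,
    // so traversal is a chain of dependent, hard-to-prefetch loads.
    long long lstSum = std::accumulate(lst.begin(), lst.end(), 0LL);
    auto t2 = std::chrono::steady_clock::now();

    auto ms = [](auto a, auto b) {
        return std::chrono::duration_cast<std::chrono::milliseconds>(b - a).count();
    };
    std::cout << "vector sum " << vecSum << " in " << ms(t0, t1) << " ms, "
              << "list sum " << lstSum << " in " << ms(t1, t2) << " ms" << std::endl;
    return 0;
}

On typical hardware the list traversal is markedly slower even though both loops perform identical arithmetic; the difference comes entirely from memory layout.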