Fading Coder

One Final Commit for the Last Sprint

Home > Tech > Content

Optimizing CPU Cache Usage in C++ Applications

Tech May 8 3

Cache Utilization Fundamentals

Locality Principles

  • Temporal Locality: Frequently accessed data tends to be reused shortly after access. This means that variables used repeatedly within loops tend to remain resident in the cache (and often in CPU registers) between uses.
  • Spatial Locality: Adjacent memory locations are often loaded together into cache lines. Sequential memory access leverages this principle effectively.
#include <iostream>
#include <vector>

int main() {
    const size_t size = 10000;
    std::vector<int> data(size);

    // Fill the vector with 0 .. size-1.
    size_t next = 0;
    for (int& element : data) {
        element = static_cast<int>(next++);
    }

    // Sum front-to-back: contiguous storage means each cache line
    // loaded brings in the next several elements as well.
    long long total = 0;
    for (const int element : data) {
        total += element;
    }

    std::cout << "Total: " << total << std::endl;
    return 0;
}

This example demonstrates sequential memory access patterns that enhance spatial locality. The vector's contiguous memory layout ensures that adjacent elements are likely to be cached together during iteration.

Matrix Operations and Cache Efficiency

#include <iostream>
#include <vector>

const size_t DIM = 1000;

// Multiply A (m x p) by B (p x n) into C, which must already be m x n.
// Dimensions are derived from the inputs rather than the file-level DIM
// constant, so the function works for any compatible matrices (backward
// compatible: DIM-sized matrices behave exactly as before).
//
// A per-row accumulator makes the innermost loop walk a ROW of B
// sequentially instead of striding down a column, which is the
// cache-friendly access pattern this article is about. Each output
// element is still accumulated in long long and cast to int exactly
// once, matching the original's overflow/cast semantics.
void multiplyMatrices(const std::vector<std::vector<int>>& A,
                      const std::vector<std::vector<int>>& B,
                      std::vector<std::vector<int>>& C) {
    const size_t rows = A.size();
    const size_t inner = B.size();
    const size_t cols = inner ? B[0].size() : 0;

    for (size_t i = 0; i < rows; ++i) {
        // Accumulate the whole output row, then write it back once.
        std::vector<long long> acc(cols, 0);
        for (size_t k = 0; k < inner; ++k) {
            const long long a = A[i][k];  // invariant over the j loop
            for (size_t j = 0; j < cols; ++j) {
                acc[j] += a * B[k][j];  // sequential scan of B's row k
            }
        }
        for (size_t j = 0; j < cols; ++j) {
            C[i][j] = static_cast<int>(acc[j]);
        }
    }
}

int main() {
    // A holds all 1s and B all 2s, so every product element is 2 * DIM.
    std::vector<std::vector<int>> lhs(DIM, std::vector<int>(DIM, 1));
    std::vector<std::vector<int>> rhs(DIM, std::vector<int>(DIM, 2));
    std::vector<std::vector<int>> product(DIM, std::vector<int>(DIM, 0));

    multiplyMatrices(lhs, rhs, product);
    return 0;
}

The nested loop structure accesses A's row sequentially in the innermost loop, but B[k][j] strides down a column of B, touching a different cache line on every iteration — which is why loop ordering and accumulating a row at a time matter so much for cache behavior in matrix multiplication.

Memory Layout Optimization

#include <cstdio>
#include <iostream>
#include <vector>

// Plain record for one employee. The three fields sit contiguously,
// so a std::vector<Employee> scans cache-efficiently.
struct Employee {
    int id;        // 1-based identifier, assigned sequentially below
    char name[20]; // NUL-terminated, e.g. "Emp42" — fixed buffer, no heap
    int age;       // age in years
};

int main() {
    const size_t count = 1000;
    std::vector<Employee> employees(count);

    // Populate records. std::snprintf (declared in <cstdio>, which this
    // snippet previously did not include — the unqualified call was not
    // guaranteed to compile) truncates safely and always NUL-terminates
    // the fixed-size name buffer.
    for (size_t i = 0; i < count; ++i) {
        employees[i].id = static_cast<int>(i + 1);
        std::snprintf(employees[i].name, sizeof(employees[i].name), "Emp%zu", i + 1);
        employees[i].age = 25 + static_cast<int>(i % 10);
    }

    // Average age. Ages cycle 25..34, so for count divisible by 10 the
    // average is exactly 29.5. count is a compile-time constant > 0, so
    // there is no division-by-zero concern here.
    long long sum = 0;
    for (size_t i = 0; i < count; ++i) {
        sum += employees[i].age;
    }
    double avg = static_cast<double>(sum) / count;

    std::cout << "Average age: " << avg << std::endl;
    return 0;
}

This implementation places related data members close together in memory, improving cache efficiency through better spatial locality.

Cache-Friendly Algorithm Design

#include <iostream>
#include <vector>

// Iterative Fibonacci via a dense DP table: table[i] = table[i-1] + table[i-2].
// Storing intermediate results contiguously keeps the two values each step
// reads in the same (or an adjacent) cache line.
// Returns n unchanged for n <= 1.
int computeFibonacci(int n) {
    if (n <= 1) {
        return n;
    }

    std::vector<int> table(static_cast<size_t>(n) + 1);
    table[0] = 0;
    table[1] = 1;

    for (int idx = 2; idx <= n; ++idx) {
        table[idx] = table[idx - 1] + table[idx - 2];
    }

    return table[n];
}

int main() {
    const int fib30 = computeFibonacci(30);
    std::cout << "Fibonacci(30): " << fib30 << std::endl;
    return 0;
}

Dynamic programming approach ensures sequential memory access, maximizing cache hit rates by storing intermediate results contiguously.

Block-Based Processing for Cache Optimization

#include <iostream>
#include <vector>
#include <chrono>

const size_t SIZE = 1000;
const size_t BLOCK_SIZE = 32;

// Blocked (tiled) matrix multiply: C += A * B.
// C must arrive zero-initialized (or hold a partial sum to accumulate into).
//
// Dimensions are derived from the inputs rather than the file-level SIZE
// constant, so the function works for any compatible matrices — including
// sizes that are not multiples of the tile edge (backward compatible for
// SIZE-sized callers). The std::min bounds, which the original recomputed
// in every loop-condition check, are hoisted out of the loop headers.
void blockMatrixMultiply(const std::vector<std::vector<int>>& A,
                         const std::vector<std::vector<int>>& B,
                         std::vector<std::vector<int>>& C) {
    // Tile edge (matches the file-level BLOCK_SIZE of 32): three 32x32
    // int tiles fit comfortably within typical L1 data caches.
    constexpr size_t kBlock = 32;

    const size_t rows = A.size();
    const size_t inner = B.size();
    const size_t cols = inner ? B[0].size() : 0;

    for (size_t i0 = 0; i0 < rows; i0 += kBlock) {
        const size_t iEnd = std::min(i0 + kBlock, rows);
        for (size_t j0 = 0; j0 < cols; j0 += kBlock) {
            const size_t jEnd = std::min(j0 + kBlock, cols);
            for (size_t k0 = 0; k0 < inner; k0 += kBlock) {
                const size_t kEnd = std::min(k0 + kBlock, inner);
                // Within one tile triple, accumulate the partial dot
                // product in long long and fold it into C once — exactly
                // the original's per-tile cast semantics.
                for (size_t i = i0; i < iEnd; ++i) {
                    for (size_t j = j0; j < jEnd; ++j) {
                        long long sum = 0;
                        for (size_t k = k0; k < kEnd; ++k) {
                            sum += static_cast<long long>(A[i][k]) * B[k][j];
                        }
                        C[i][j] += static_cast<int>(sum);
                    }
                }
            }
        }
    }
}

int main() {
    std::vector<std::vector<int>> lhs(SIZE, std::vector<int>(SIZE, 1));
    std::vector<std::vector<int>> rhs(SIZE, std::vector<int>(SIZE, 2));
    std::vector<std::vector<int>> out(SIZE, std::vector<int>(SIZE, 0));

    // Time only the multiply itself, not the allocations above.
    const auto begin = std::chrono::high_resolution_clock::now();
    blockMatrixMultiply(lhs, rhs, out);
    const auto finish = std::chrono::high_resolution_clock::now();

    const auto elapsed =
        std::chrono::duration_cast<std::chrono::milliseconds>(finish - begin);
    std::cout << "Execution time: " << elapsed.count() << " ms" << std::endl;
    return 0;
}

Block-based computation reduces cache misses by processing smaller chunks that fit within cache levels.

Sequential Access Patterns

#include <iostream>
#include <vector>

const size_t ROWS = 1000;
const size_t COLS = 1000;

// Print "Sum: <total>" for every element of the matrix, traversing in
// row-major order so each inner loop walks one contiguous row of memory
// (the cache-friendly sequential access this article demonstrates).
//
// Dimensions come from the matrix itself rather than the file-level
// ROWS/COLS constants, so the function now handles any rectangular or
// ragged matrix — including an empty one — and remains backward
// compatible for the ROWS x COLS caller below.
void processMatrix(std::vector<std::vector<int>>& matrix) {
    long long sum = 0;
    for (const auto& row : matrix) {
        for (const int value : row) {
            sum += value;
        }
    }
    std::cout << "Sum: " << sum << std::endl;
}

int main() {
    std::vector<std::vector<int>> grid(ROWS, std::vector<int>(COLS, 0));

    // Each cell gets its linear (row-major) index.
    for (size_t r = 0; r < ROWS; ++r) {
        for (size_t c = 0; c < COLS; ++c) {
            grid[r][c] = static_cast<int>(r * COLS + c);
        }
    }

    processMatrix(grid);
    return 0;
}

Row-major order traversal maximizes cache performance by ensuring sequential memory access patterns.

Efficient Data Structures

#include <iostream>
#include <vector>

// LIFO stack of ints backed by std::vector: contiguous storage keeps
// push/pop operating on the hot end of one allocation, and push_back's
// amortized growth avoids per-element allocations.
// NOTE: the original had a stray `");` after the throw statement — a
// syntax error that prevented this snippet from compiling; removed.
class Stack {
private:
    std::vector<int> storage;  // elements; back() is the top of the stack
public:
    // Push value onto the top of the stack.
    void push(int value) {
        storage.push_back(value);
    }

    // Remove and return the top element.
    // Throws std::runtime_error if the stack is empty.
    int pop() {
        if (storage.empty()) {
            throw std::runtime_error("Stack underflow");
        }
        int top = storage.back();
        storage.pop_back();
        return top;
    }

    // True when the stack holds no elements.
    bool empty() const {
        return storage.empty();
    }
};

int main() {
    Stack stack;

    // Push 10, 20, 30; LIFO order means pops return 30, 20, 10.
    for (const int value : {10, 20, 30}) {
        stack.push(value);
    }

    while (!stack.empty()) {
        std::cout << stack.pop() << std::endl;
    }

    return 0;
}

Using vectors for stack implementation provides optimal cache performance due to their contiguous memory layout.

Related Articles

Understanding Strong and Weak References in Java

Strong References Strong references are the most prevalent type of object referencing in Java. When an object has a strong reference pointing to it, the garbage collector will not reclaim its memory. F...

Comprehensive Guide to SSTI Explained with Payload Bypass Techniques

Introduction Server-Side Template Injection (SSTI) is a vulnerability in web applications where user input is improperly handled within the template engine and executed on the server. This exploit can r...

Implement Image Upload Functionality for Django Integrated TinyMCE Editor

Django’s Admin panel is highly user-friendly, and pairing it with TinyMCE, an effective rich text editor, simplifies content management significantly. Combining the two is particularly useful for bloggi...

Leave a Comment

Anonymous

◎Feel free to join the discussion and share your thoughts.