Optimizing CPU Cache Usage in C++ Applications
Cache Utilization Fundamentals
Locality Principles
- Temporal Locality: Recently accessed data is likely to be accessed again soon. Values reused repeatedly within a loop therefore tend to stay resident in cache (or even in registers) between uses.
- Spatial Locality: Adjacent memory locations are loaded together into cache lines (typically 64 bytes). Sequential memory access exploits this: a single cache-line fill services many subsequent reads.
#include <iostream>
#include <vector>

int main() {
    const size_t size = 10000;
    std::vector<int> data(size);

    // Initialize vector elements sequentially
    for (size_t i = 0; i < size; ++i) {
        data[i] = static_cast<int>(i);
    }

    // Sum the elements: the sequential scan exploits spatial locality,
    // while the repeatedly updated accumulator exhibits temporal locality.
    long long total = 0;
    for (size_t i = 0; i < size; ++i) {
        total += data[i];
    }

    std::cout << "Total: " << total << std::endl;
    return 0;
}
This example demonstrates sequential memory access, which maximizes spatial locality. Because a vector stores its elements contiguously, one cache-line fill brings in several adjacent elements (sixteen 4-byte ints on a 64-byte line), so most reads during iteration are cache hits.
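To see cache-line granularity directly, one can compare a sequential sum with a strided one over the same data. The following is a rough micro-benchmark sketch, not a rigorous measurement; the timeMs helper, the 64-byte line-size assumption, and the constants are our own choices, and an optimizing compiler may narrow the gap:

#include <chrono>
#include <iostream>
#include <vector>

// Hypothetical micro-benchmark helper: time a callable in milliseconds.
template <typename F>
long long timeMs(F f) {
    auto start = std::chrono::steady_clock::now();
    f();
    auto end = std::chrono::steady_clock::now();
    return std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
}

int main() {
    const size_t n = 1 << 24;    // ~64 MB of ints, far larger than any cache
    const size_t STRIDE = 16;    // 16 ints * 4 bytes = 64 bytes, one typical cache line
    std::vector<int> data(n, 1);

    long long seqSum = 0;
    long long seqMs = timeMs([&] {
        // Sequential: one cache-line fill services 16 consecutive reads.
        for (size_t i = 0; i < n; ++i) seqSum += data[i];
    });

    long long strideSum = 0;
    long long strideMs = timeMs([&] {
        // Strided passes: the same total number of reads, but each pass touches
        // every cache line once, so every line is fetched STRIDE times overall.
        for (size_t off = 0; off < STRIDE; ++off)
            for (size_t i = off; i < n; i += STRIDE) strideSum += data[i];
    });

    std::cout << "sequential: " << seqMs << " ms (sum " << seqSum << "), "
              << "strided: " << strideMs << " ms (sum " << strideSum << ")" << std::endl;
    return 0;
}

Printing both sums keeps the compiler from eliminating the loops; on typical hardware the strided version is noticeably slower despite performing the same number of additions.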
Matrix Operations and Cache Efficiency
#include <iostream>
#include <vector>

const size_t DIM = 1000;

void multiplyMatrices(const std::vector<std::vector<int>>& A,
                      const std::vector<std::vector<int>>& B,
                      std::vector<std::vector<int>>& C) {
    for (size_t i = 0; i < DIM; ++i) {
        for (size_t j = 0; j < DIM; ++j) {
            long long sum = 0;
            for (size_t k = 0; k < DIM; ++k) {
                // A[i][k] walks along a row (sequential); B[k][j] walks down
                // a column, jumping to a different row's allocation each step.
                sum += static_cast<long long>(A[i][k]) * B[k][j];
            }
            C[i][j] = static_cast<int>(sum);
        }
    }
}

int main() {
    std::vector<std::vector<int>> matrixA(DIM, std::vector<int>(DIM, 1));
    std::vector<std::vector<int>> matrixB(DIM, std::vector<int>(DIM, 2));
    std::vector<std::vector<int>> result(DIM, std::vector<int>(DIM, 0));
    multiplyMatrices(matrixA, matrixB, result);
    return 0;
}
The nested loop structure is only partly cache-friendly. The innermost loop reads A[i][k] sequentially along a row, but B[k][j] strides down a column, landing in a different row (and, with a vector of vectors, a separate heap allocation) on every iteration. This column-wise access to B is the classic cache bottleneck of naive matrix multiplication; reordering the loops so the innermost index advances along rows of both matrices avoids it, as sketched below.
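A minimal sketch of that loop interchange, written as a drop-in replacement for multiplyMatrices in the program above (the name multiplyMatricesIKJ is our own; it assumes C starts zeroed, which main above guarantees):

// Drop-in replacement for multiplyMatrices above (same includes and DIM).
// Loop order i-k-j: the innermost loop advances j, so B[k][j] is read and
// C[i][j] is written sequentially along rows, with no column striding.
void multiplyMatricesIKJ(const std::vector<std::vector<int>>& A,
                         const std::vector<std::vector<int>>& B,
                         std::vector<std::vector<int>>& C) {
    for (size_t i = 0; i < DIM; ++i) {
        for (size_t k = 0; k < DIM; ++k) {
            const long long a = A[i][k];  // loaded once, reused for the whole j loop
            for (size_t j = 0; j < DIM; ++j) {
                C[i][j] += static_cast<int>(a * B[k][j]);
            }
        }
    }
}

Hoisting A[i][k] into a local also gives it perfect temporal locality: it is read from memory once and reused across the entire inner loop.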
Memory Layout Optimization
#include <cstdio>    // std::snprintf
#include <iostream>
#include <vector>

struct Employee {
    int id;
    char name[20];
    int age;
};

int main() {
    const size_t count = 1000;
    std::vector<Employee> employees(count);

    // Initialize employee records
    for (size_t i = 0; i < count; ++i) {
        employees[i].id = static_cast<int>(i + 1);
        std::snprintf(employees[i].name, sizeof(employees[i].name), "Emp%zu", i + 1);
        employees[i].age = 25 + static_cast<int>(i % 10);
    }

    // Calculate average age
    long long sum = 0;
    for (size_t i = 0; i < count; ++i) {
        sum += employees[i].age;
    }
    double avg = static_cast<double>(sum) / count;
    std::cout << "Average age: " << avg << std::endl;
    return 0;
}
Storing complete Employee records in one contiguous vector keeps each record's fields adjacent in memory, so reading a whole record costs at most one or two cache-line fills. The average-age loop, however, touches only the age field: of every 28-byte record pulled into cache, 24 bytes go unused. When a hot loop reads a single field, a structure-of-arrays layout uses cache lines far more efficiently, as the sketch below shows.
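A minimal structure-of-arrays variant of the same computation (the EmployeeTable type is our own invention): the ages sit in their own dense array, so every byte brought into cache is data the loop actually uses.

#include <iostream>
#include <string>
#include <vector>

// Structure of arrays: each field gets its own contiguous array.
struct EmployeeTable {
    std::vector<int> ids;
    std::vector<std::string> names;
    std::vector<int> ages;
};

int main() {
    const size_t count = 1000;
    EmployeeTable table;
    table.ids.reserve(count);
    table.names.reserve(count);
    table.ages.reserve(count);
    for (size_t i = 0; i < count; ++i) {
        table.ids.push_back(static_cast<int>(i + 1));
        table.names.push_back("Emp" + std::to_string(i + 1));
        table.ages.push_back(25 + static_cast<int>(i % 10));
    }

    // The hot loop now streams through a dense int array: a 64-byte
    // cache line holds 16 ages instead of roughly two whole records.
    long long sum = 0;
    for (int age : table.ages) {
        sum += age;
    }
    std::cout << "Average age: " << static_cast<double>(sum) / count << std::endl;
    return 0;
}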
Cache-Friendly Algorithm Design
#include <iostream>
#include <vector>

int computeFibonacci(int n) {
    if (n <= 1) return n;
    std::vector<int> dp(n + 1);
    dp[0] = 0;
    dp[1] = 1;
    for (int i = 2; i <= n; ++i) {
        dp[i] = dp[i - 1] + dp[i - 2];
    }
    return dp[n];
}

int main() {
    int result = computeFibonacci(30);
    std::cout << "Fibonacci(30): " << result << std::endl;
    return 0;
}
The dynamic programming approach stores intermediate results contiguously and fills them in order, so the reads of dp[i - 1] and dp[i - 2] always hit values written moments earlier: sequential access combined with strong temporal locality. In fact only the last two entries are ever needed, so a rolling pair of variables shrinks the working set to two integers, as sketched below.
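A minimal rolling-pair variant (the name fibonacciRolling is our own); the entire working set can be kept in registers, so the loop body needs no memory traffic at all:

#include <iostream>

// Keeps only the last two Fibonacci values; the compiler can hold
// prev and curr in registers for the whole loop.
int fibonacciRolling(int n) {
    if (n <= 1) return n;
    int prev = 0, curr = 1;
    for (int i = 2; i <= n; ++i) {
        int next = prev + curr;
        prev = curr;
        curr = next;
    }
    return curr;
}

int main() {
    std::cout << "Fibonacci(30): " << fibonacciRolling(30) << std::endl;
    return 0;
}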
Block-Based Processing for Cache Optimization
#include <algorithm>  // std::min
#include <chrono>
#include <iostream>
#include <vector>

const size_t SIZE = 1000;
const size_t BLOCK_SIZE = 32;  // 32x32 ints = 4 KB per tile

void blockMatrixMultiply(const std::vector<std::vector<int>>& A,
                         const std::vector<std::vector<int>>& B,
                         std::vector<std::vector<int>>& C) {
    for (size_t i = 0; i < SIZE; i += BLOCK_SIZE) {
        for (size_t j = 0; j < SIZE; j += BLOCK_SIZE) {
            for (size_t k = 0; k < SIZE; k += BLOCK_SIZE) {
                // Multiply one BLOCK_SIZE x BLOCK_SIZE tile of A with one
                // tile of B, accumulating into the matching tile of C.
                for (size_t ii = i; ii < std::min(i + BLOCK_SIZE, SIZE); ++ii) {
                    for (size_t jj = j; jj < std::min(j + BLOCK_SIZE, SIZE); ++jj) {
                        long long sum = 0;
                        for (size_t kk = k; kk < std::min(k + BLOCK_SIZE, SIZE); ++kk) {
                            sum += static_cast<long long>(A[ii][kk]) * B[kk][jj];
                        }
                        C[ii][jj] += static_cast<int>(sum);
                    }
                }
            }
        }
    }
}

int main() {
    std::vector<std::vector<int>> A(SIZE, std::vector<int>(SIZE, 1));
    std::vector<std::vector<int>> B(SIZE, std::vector<int>(SIZE, 2));
    std::vector<std::vector<int>> C(SIZE, std::vector<int>(SIZE, 0));

    auto start = std::chrono::high_resolution_clock::now();
    blockMatrixMultiply(A, B, C);
    auto end = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
    std::cout << "Execution time: " << duration.count() << " ms" << std::endl;
    return 0;
}
Blocking (loop tiling) reduces cache misses by confining each phase of the computation to tiles small enough to stay resident in cache. With BLOCK_SIZE = 32, a tile of ints occupies 32 x 32 x 4 bytes = 4 KB, so the three active tiles together need about 12 KB and fit comfortably in a typical 32 KB L1 data cache; each element of the B tile is then reused 32 times before the tile is evicted, instead of being refetched from memory on every use. One caveat: with std::vector<std::vector<int>>, each row is a separate allocation, so tiles are not truly contiguous. A flat one-dimensional array fixes this, as sketched below.
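A minimal sketch of the blocked multiply over a flat row-major array (the Matrix alias, the i-k-j tile order, and the hoisted local a are our own choices): element (i, j) lives at index i * N + j, so every row of every tile is genuinely contiguous within one allocation.

#include <algorithm>
#include <vector>

const size_t N = 1000;
const size_t BS = 32;

using Matrix = std::vector<int>;  // N*N ints, row-major: element (i, j) at i * N + j

void blockMultiplyFlat(const Matrix& A, const Matrix& B, Matrix& C) {
    for (size_t i = 0; i < N; i += BS) {
        for (size_t k = 0; k < N; k += BS) {
            for (size_t j = 0; j < N; j += BS) {
                // i-k-j tile order: the innermost loop streams along rows
                // of B and C within the current tiles.
                for (size_t ii = i; ii < std::min(i + BS, N); ++ii) {
                    for (size_t kk = k; kk < std::min(k + BS, N); ++kk) {
                        const long long a = A[ii * N + kk];
                        for (size_t jj = j; jj < std::min(j + BS, N); ++jj) {
                            C[ii * N + jj] += static_cast<int>(a * B[kk * N + jj]);
                        }
                    }
                }
            }
        }
    }
}

int main() {
    Matrix A(N * N, 1), B(N * N, 2), C(N * N, 0);
    blockMultiplyFlat(A, B, C);
    return 0;
}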
Sequential Access Patterns
#include <iostream>
#include <vector>

const size_t ROWS = 1000;
const size_t COLS = 1000;

void processMatrix(const std::vector<std::vector<int>>& matrix) {
    long long sum = 0;
    for (size_t row = 0; row < ROWS; ++row) {
        for (size_t col = 0; col < COLS; ++col) {
            sum += matrix[row][col];
        }
    }
    std::cout << "Sum: " << sum << std::endl;
}

int main() {
    std::vector<std::vector<int>> matrix(ROWS, std::vector<int>(COLS, 0));

    // Initialize matrix values
    for (size_t row = 0; row < ROWS; ++row) {
        for (size_t col = 0; col < COLS; ++col) {
            matrix[row][col] = static_cast<int>(row * COLS + col);
        }
    }
    processMatrix(matrix);
    return 0;
}
Row-major traversal, with the inner loop advancing the column index, walks each row sequentially, matching the in-memory layout and using every byte of each fetched cache line. Swapping the loop order so the inner loop advances the row index destroys this, as the counter-example below shows.
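For contrast, a minimal sketch of the cache-hostile ordering (the name processMatrixColumnMajor is our own). Only the loop nesting changes, yet consecutive accesses now jump between separately allocated rows:

#include <iostream>
#include <vector>

const size_t ROWS = 1000;
const size_t COLS = 1000;

// Column-major traversal of a row-major layout: consecutive iterations
// touch matrix[0][col], matrix[1][col], ..., each in a different row's
// separately allocated buffer, so nearly every access risks a cache miss.
void processMatrixColumnMajor(const std::vector<std::vector<int>>& matrix) {
    long long sum = 0;
    for (size_t col = 0; col < COLS; ++col) {
        for (size_t row = 0; row < ROWS; ++row) {
            sum += matrix[row][col];
        }
    }
    std::cout << "Sum: " << sum << std::endl;
}

int main() {
    std::vector<std::vector<int>> matrix(ROWS, std::vector<int>(COLS, 1));
    processMatrixColumnMajor(matrix);  // same result, far worse locality
    return 0;
}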
Efficient Data Structures
#include <iostream>
#include <stdexcept>
#include <vector>

class Stack {
private:
    std::vector<int> storage;  // contiguous backing store

public:
    void push(int value) {
        storage.push_back(value);
    }

    int pop() {
        if (storage.empty()) {
            throw std::runtime_error("Stack underflow");
        }
        int top = storage.back();
        storage.pop_back();
        return top;
    }

    bool empty() const {
        return storage.empty();
    }
};

int main() {
    Stack stack;
    stack.push(10);
    stack.push(20);
    stack.push(30);
    std::cout << stack.pop() << std::endl;  // Outputs 30
    std::cout << stack.pop() << std::endl;  // Outputs 20
    std::cout << stack.pop() << std::endl;  // Outputs 10
    return 0;
}
Backing the stack with a std::vector gives good cache behavior: elements are stored contiguously, pushes and pops work at the hot end of the array, and there is no per-node allocation or pointer chasing as there would be in a linked-list-based stack.
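To make that contrast concrete, a minimal sketch timing a full traversal of a contiguous std::vector against a node-based std::list (the element count and timing scaffolding are our own choices; exact numbers vary by machine):

#include <chrono>
#include <iostream>
#include <list>
#include <numeric>
#include <vector>

int main() {
    const size_t n = 1000000;
    std::vector<int> vec(n, 1);
    std::list<int> lst(n, 1);

    // Vector: elements are adjacent, so traversal streams through cache
    // lines and the hardware prefetcher can run ahead of the loop.
    auto t0 = std::chrono::steady_clock::now();
    long long vecSum = std::accumulate(vec.begin(), vec.end(), 0LL);
    auto t1 = std::chrono::steady_clock::now();

    // List: each node is a separate allocation reached via a pointer,
    // so traversal is a chain of dependent, hard-to-prefetch loads.
    long long lstSum = std::accumulate(lst.begin(), lst.end(), 0LL);
    auto t2 = std::chrono::steady_clock::now();

    auto ms = [](auto a, auto b) {
        return std::chrono::duration_cast<std::chrono::milliseconds>(b - a).count();
    };
    std::cout << "vector sum " << vecSum << " in " << ms(t0, t1) << " ms, "
              << "list sum " << lstSum << " in " << ms(t1, t2) << " ms" << std::endl;
    return 0;
}

On typical hardware the list traversal is markedly slower even though both loops perform identical arithmetic; the difference comes entirely from memory layout.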