Chapter 6: Memory Access Patterns
Understanding and optimizing memory access patterns is crucial for writing high-performance code. This chapter explores how different memory access patterns affect performance and how to write code that maximizes cache utilization.
Memory Hierarchy and Access Patterns
Modern computer systems have a complex memory hierarchy, with roughly the following sizes and latencies (a small measurement sketch follows the list):
- Registers (fastest, smallest)
- L1 Cache (~1ns, 32-64KB)
- L2 Cache (~10ns, 256KB-1MB)
- L3 Cache (~30ns, 2-32MB)
- Main Memory (~100ns, GBs)
- Storage (slowest, largest)
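The exact figures vary between microarchitectures, but the jumps between levels are easy to observe. The following is a minimal sketch (the buffer sizes, iteration count, and use of a Sattolo shuffle are illustrative choices, not measurements from a particular machine): it chases a random chain of indices through progressively larger buffers, and the time per access rises each time the working set spills out of a cache level.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
int main(void) {
    const size_t steps = 10 * 1000 * 1000;
    for (size_t elems = 1 << 12; elems <= (size_t)1 << 24; elems <<= 2) {
        size_t* next = malloc(elems * sizeof(size_t));
        if (!next) return 1;
        // Sattolo's shuffle: builds one random cycle so the chase visits
        // every element and the hardware prefetcher cannot predict it
        for (size_t i = 0; i < elems; i++) next[i] = i;
        for (size_t i = elems - 1; i > 0; i--) {
            size_t j = rand() % i;
            size_t tmp = next[i]; next[i] = next[j]; next[j] = tmp;
        }
        volatile size_t idx = 0;
        clock_t start = clock();
        for (size_t s = 0; s < steps; s++) idx = next[idx];  // dependent loads
        double secs = (double)(clock() - start) / CLOCKS_PER_SEC;
        printf("%8zu KiB: %6.2f ns per access\n",
               elems * sizeof(size_t) / 1024, secs * 1e9 / steps);
        free(next);
    }
    return 0;
}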
Cache Line Basics
A cache line is typically 64 bytes on modern processors. When you access a memory location, the entire cache line containing that location is loaded into the cache.
// Example of cache line granularity
struct Data {
    int values[16];              // 16 * 4 bytes = 64 bytes = one cache line
};
void process_data(struct Data* data, int n) {
    for (int i = 0; i < n; i++) {
        data[i].values[0] *= 2;  // touches 4 bytes, but loads the entire 64-byte line
    }
}
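Even though the loop writes only values[0], the full 64-byte line of every struct travels through the cache hierarchy, so the loop is limited by memory traffic rather than by the multiplication.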
Common Memory Access Patterns
Sequential Access
// Good: Sequential access pattern
int sum_array(int* arr, int n) {
    int sum = 0;
    for (int i = 0; i < n; i++) {
        sum += arr[i];           // consecutive addresses; the prefetcher keeps up
    }
    return sum;
}
// Bad: Random access pattern
int sum_random(int* arr, int* indices, int n) {
    int sum = 0;
    for (int i = 0; i < n; i++) {
        sum += arr[indices[i]];  // unpredictable addresses; most loads miss once the array outgrows the cache
    }
    return sum;
}
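To see the gap, one quick harness is sketched below (the array size and the shuffle are illustrative choices; both loops read exactly the same elements, only the order differs).
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
int main(void) {
    int n = 1 << 24;                          // 16M ints = 64 MB, well past L3
    int* arr = malloc(n * sizeof(int));
    int* indices = malloc(n * sizeof(int));
    if (!arr || !indices) return 1;
    for (int i = 0; i < n; i++) { arr[i] = 1; indices[i] = i; }
    for (int i = n - 1; i > 0; i--) {         // Fisher-Yates shuffle of the indices
        // Combine two rand() calls so the shuffle works even where RAND_MAX is small
        long r = ((long)rand() << 15) | rand();
        int j = (int)(r % (i + 1));
        int tmp = indices[i]; indices[i] = indices[j]; indices[j] = tmp;
    }
    clock_t t0 = clock();
    long long s1 = 0;
    for (int i = 0; i < n; i++) s1 += arr[i];            // sequential
    clock_t t1 = clock();
    long long s2 = 0;
    for (int i = 0; i < n; i++) s2 += arr[indices[i]];   // random
    clock_t t2 = clock();
    printf("sequential: %.3f s (sum %lld)\n", (double)(t1 - t0) / CLOCKS_PER_SEC, s1);
    printf("random:     %.3f s (sum %lld)\n", (double)(t2 - t1) / CLOCKS_PER_SEC, s2);
    free(arr); free(indices);
    return 0;
}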
Strided Access
// Good: Unit stride (row-major traversal matches C's memory layout)
void process_matrix_row_major(int n, int matrix[n][n]) {
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            matrix[i][j] *= 2;   // consecutive elements: one new cache line per 16 ints
        }
    }
}
// Bad: Large stride (column-major traversal of a row-major array)
void process_matrix_col_major(int n, int matrix[n][n]) {
    for (int j = 0; j < n; j++) {
        for (int i = 0; i < n; i++) {
            matrix[i][j] *= 2;   // stride of n ints: each access lands on a different cache line
        }
    }
}
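When the algorithm genuinely needs to visit the data in column order, loop blocking (tiling) recovers much of the lost locality by working on small square tiles that fit in cache; the matrix transposition example below shows the technique.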
Complete Working Example: Matrix Transposition
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define N 1024
#define BLOCK_SIZE 32
// Naive matrix transposition
void transpose_naive(int A[N][N], int B[N][N]) {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            B[j][i] = A[i][j];
        }
    }
}
// Cache-friendly matrix transposition (assumes N is a multiple of BLOCK_SIZE)
void transpose_blocked(int A[N][N], int B[N][N]) {
    for (int i = 0; i < N; i += BLOCK_SIZE) {
        for (int j = 0; j < N; j += BLOCK_SIZE) {
            // Process one block; both tiles stay resident in cache
            for (int ii = i; ii < i + BLOCK_SIZE; ii++) {
                for (int jj = j; jj < j + BLOCK_SIZE; jj++) {
                    B[jj][ii] = A[ii][jj];
                }
            }
        }
    }
}
// 8x8 blocked transposition: the tile size matches an 8-wide SIMD register,
// the natural granularity for a hand-written SIMD transpose (intrinsics not shown here)
void transpose_blocked_8x8(int A[N][N], int B[N][N]) {
    for (int i = 0; i < N; i += 8) {
        for (int j = 0; j < N; j += 8) {
            // Process one 8x8 tile
            for (int ii = i; ii < i + 8; ii++) {
                for (int jj = j; jj < j + 8; jj++) {
                    B[jj][ii] = A[ii][jj];
                }
            }
        }
    }
}
int main() {
    int (*A)[N] = malloc(N * N * sizeof(int));
    int (*B)[N] = malloc(N * N * sizeof(int));
    if (!A || !B) return 1;
    // Initialize matrix
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            A[i][j] = i * N + j;
        }
    }
    // Time naive transposition
    clock_t start = clock();
    transpose_naive(A, B);
    clock_t end = clock();
    double naive_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    // Time blocked transposition
    start = clock();
    transpose_blocked(A, B);
    end = clock();
    double blocked_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    // Time 8x8 blocked transposition
    start = clock();
    transpose_blocked_8x8(A, B);
    end = clock();
    double blocked8_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    printf("Naive transposition: %f seconds\n", naive_time);
    printf("Blocked transposition: %f seconds\n", blocked_time);
    printf("8x8 blocked transposition: %f seconds\n", blocked8_time);
    free(A);
    free(B);
    return 0;
}
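As with all timings in this chapter, compile with optimizations enabled (for example gcc -O2) and run each variant a few times. With a 1024x1024 matrix of ints, a 32x32 tile of A plus the matching tile of B is only 8 KB, so the blocked versions keep both operands resident in L1 while a tile is copied, which is why they typically beat the naive loop whose writes to B stride through memory.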
Cache-Friendly Data Structures
Array of Structures vs Structure of Arrays
// Array of Structures (AoS)
struct Particle {
    float x, y, z;
    float vx, vy, vz;
    float mass;
};
// Structure of Arrays (SoA)
struct Particles {
    float *x, *y, *z;
    float *vx, *vy, *vz;
    float *mass;
};
Complete Working Example: Particle Simulation
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <immintrin.h>
#define N 1000000
// AoS implementation: one scalar update per particle
void update_particles_aos(struct Particle* particles, int n) {
    for (int i = 0; i < n; i++) {
        particles[i].x += particles[i].vx;
        particles[i].y += particles[i].vy;
        particles[i].z += particles[i].vz;
    }
}
// SoA implementation: 8 particles per iteration with AVX intrinsics
// (requires AVX support, e.g. compile with -mavx; assumes n is a multiple of 8)
void update_particles_soa(struct Particles* particles, int n) {
    for (int i = 0; i < n; i += 8) {
        __m256 x = _mm256_loadu_ps(&particles->x[i]);
        __m256 y = _mm256_loadu_ps(&particles->y[i]);
        __m256 z = _mm256_loadu_ps(&particles->z[i]);
        __m256 vx = _mm256_loadu_ps(&particles->vx[i]);
        __m256 vy = _mm256_loadu_ps(&particles->vy[i]);
        __m256 vz = _mm256_loadu_ps(&particles->vz[i]);
        x = _mm256_add_ps(x, vx);
        y = _mm256_add_ps(y, vy);
        z = _mm256_add_ps(z, vz);
        _mm256_storeu_ps(&particles->x[i], x);
        _mm256_storeu_ps(&particles->y[i], y);
        _mm256_storeu_ps(&particles->z[i], z);
    }
}
int main() {
    // Allocate and initialize AoS
    struct Particle* particles_aos = malloc(N * sizeof(struct Particle));
    for (int i = 0; i < N; i++) {
        particles_aos[i].x = (float)rand() / RAND_MAX;
        particles_aos[i].y = (float)rand() / RAND_MAX;
        particles_aos[i].z = (float)rand() / RAND_MAX;
        particles_aos[i].vx = (float)rand() / RAND_MAX;
        particles_aos[i].vy = (float)rand() / RAND_MAX;
        particles_aos[i].vz = (float)rand() / RAND_MAX;
        particles_aos[i].mass = 1.0f;
    }
    // Allocate and initialize SoA
    struct Particles particles_soa;
    particles_soa.x = malloc(N * sizeof(float));
    particles_soa.y = malloc(N * sizeof(float));
    particles_soa.z = malloc(N * sizeof(float));
    particles_soa.vx = malloc(N * sizeof(float));
    particles_soa.vy = malloc(N * sizeof(float));
    particles_soa.vz = malloc(N * sizeof(float));
    particles_soa.mass = malloc(N * sizeof(float));
    for (int i = 0; i < N; i++) {
        particles_soa.x[i] = (float)rand() / RAND_MAX;
        particles_soa.y[i] = (float)rand() / RAND_MAX;
        particles_soa.z[i] = (float)rand() / RAND_MAX;
        particles_soa.vx[i] = (float)rand() / RAND_MAX;
        particles_soa.vy[i] = (float)rand() / RAND_MAX;
        particles_soa.vz[i] = (float)rand() / RAND_MAX;
        particles_soa.mass[i] = 1.0f;
    }
    // Time AoS update
    clock_t start = clock();
    update_particles_aos(particles_aos, N);
    clock_t end = clock();
    double aos_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    // Time SoA update
    start = clock();
    update_particles_soa(&particles_soa, N);
    end = clock();
    double soa_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    printf("AoS update: %f seconds\n", aos_time);
    printf("SoA update: %f seconds\n", soa_time);
    // Clean up
    free(particles_aos);
    free(particles_soa.x);
    free(particles_soa.y);
    free(particles_soa.z);
    free(particles_soa.vx);
    free(particles_soa.vy);
    free(particles_soa.vz);
    free(particles_soa.mass);
    return 0;
}
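Note that this comparison mixes two effects: the SoA layout makes each field a contiguous stream, and the SoA loop additionally uses AVX intrinsics while the AoS loop is scalar. To isolate the layout effect alone, time a scalar SoA loop as well; with auto-vectorization enabled the compiler may vectorize it for you.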
Prefetching and Cache Control
Hardware Prefetching
Modern processors have hardware prefetchers that detect sequential and simple constant-stride access patterns and fetch upcoming cache lines before the program asks for them.
// Good: Sequential access with hardware prefetching
void process_array(int* arr, int n) {
    for (int i = 0; i < n; i++) {
        arr[i] *= 2;
    }
}
// Bad: Random access defeats prefetching
void process_random(int* arr, int* indices, int n) {
    for (int i = 0; i < n; i++) {
        arr[indices[i]] *= 2;
    }
}
Software Prefetching
#include <xmmintrin.h>
void process_array_with_prefetch(int* arr, int n) {
    // Assumes n is a multiple of 16 (one 64-byte cache line of ints per step)
    for (int i = 0; i < n; i += 16) {
        // Prefetch the next cache line; the distance is tunable, and
        // prefetching several lines ahead is often more effective
        _mm_prefetch((char*)&arr[i + 16], _MM_HINT_T0);
        // Process the current cache line
        for (int j = 0; j < 16; j++) {
            arr[i + j] *= 2;
        }
    }
}
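The hardware prefetcher already handles the simple sequential loop above, so explicit prefetching tends to pay off mainly when addresses are irregular but known in advance, as in an indirect gather. A sketch follows (the distance of 16 iterations is a placeholder to be tuned per machine):
#include <xmmintrin.h>
void scale_indirect_with_prefetch(int* arr, const int* indices, int n) {
    const int dist = 16;                          // prefetch distance, in iterations
    for (int i = 0; i < n; i++) {
        if (i + dist < n) {
            // Request the element we will need 'dist' iterations from now
            _mm_prefetch((const char*)&arr[indices[i + dist]], _MM_HINT_T0);
        }
        arr[indices[i]] *= 2;                     // the actual work
    }
}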
Cache Alignment and Padding
Structure Padding
// Declared without explicit padding: the compiler still inserts invisible
// padding after a and c to satisfy alignment (typically 24 bytes in total)
struct Unpadded {
    char a;
    int b;
    char c;
    double d;
};
// The same layout with the padding written out explicitly
struct Padded {
    char a;
    char padding1[3];   // align b to 4 bytes
    int b;
    char c;
    char padding2[7];   // align d to 8 bytes
    double d;
};
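An alternative to writing the padding out by hand is to order members from the most strictly aligned to the least, which removes most of the implicit padding altogether. A sketch (the 16-byte figure assumes a typical 64-bit ABI with 8-byte doubles and 4-byte ints):
#include <stdio.h>
struct Reordered {
    double d;    // most strictly aligned member first
    int b;
    char a;
    char c;      // only 2 bytes of trailing padding remain
};
int main(void) {
    printf("sizeof(struct Reordered) = %zu bytes\n", sizeof(struct Reordered));  // typically 16
    return 0;
}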
Complete Working Example: Cache Alignment
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>   // uintptr_t
#include <time.h>
#define N 1000000
// Unaligned access: start wherever malloc happened to put the buffer
void process_unaligned(char* data, int n) {
    for (int i = 0; i < n; i++) {
        data[i] *= 2;
    }
}
// Aligned access: skip ahead to the next cache line boundary first
void process_aligned(char* data, int n) {
    int offset = (int)((uintptr_t)data % 64);
    if (offset != 0) {
        data += (64 - offset);   // advance to a 64-byte boundary
        n -= (64 - offset);      // process correspondingly fewer bytes
    }
    for (int i = 0; i < n; i++) {
        data[i] *= 2;
    }
}
int main() {
    char* data = malloc(N + 64);   // extra space so the alignment fix-up never overruns
    // Initialize data
    for (int i = 0; i < N + 64; i++) {
        data[i] = rand() % 256;
    }
    // Time unaligned access
    clock_t start = clock();
    process_unaligned(data, N);
    clock_t end = clock();
    double unaligned_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    // Time aligned access
    start = clock();
    process_aligned(data, N);
    end = clock();
    double aligned_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    printf("Unaligned access: %f seconds\n", unaligned_time);
    printf("Aligned access: %f seconds\n", aligned_time);
    free(data);
    return 0;
}
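Instead of fixing up a misaligned pointer by hand as above, you can request cache-line-aligned storage directly. A sketch using C11 aligned_alloc (posix_memalign is the traditional alternative); note that the requested size is kept a multiple of the alignment, as the standard requires:
#include <stdio.h>
#include <stdlib.h>
int main(void) {
    size_t n = 1 << 20;                    // 1 MiB, a multiple of 64
    char* data = aligned_alloc(64, n);     // address is a multiple of 64 bytes
    if (!data) return 1;
    for (size_t i = 0; i < n; i++) {
        data[i] = (char)(i & 0xFF);
    }
    printf("buffer starts at %p\n", (void*)data);
    free(data);
    return 0;
}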
Summary
Optimizing memory access patterns is crucial for performance. Key points to remember:
- Understand the Memory Hierarchy
  - Cache sizes and latencies
  - Cache line size
  - Memory bandwidth
- Write Cache-Friendly Code
  - Use sequential access patterns
  - Minimize stride sizes
  - Consider data structure layout
  - Align data properly
- Leverage Hardware Features
  - Hardware prefetching
  - Cache line utilization
  - SIMD instructions
- Profile and Optimize
  - Use performance counters
  - Measure cache misses
  - Consider different data layouts
  - Balance between memory and compute
Remember that memory access patterns can have a significant impact on performance, sometimes rivaling the gains from algorithmic improvements. Always profile your code to identify memory bottlenecks and optimize accordingly.