Chapter 5: Unlocking Vectorization
Vectorization is one of the most powerful optimization techniques available in modern compilers. By processing multiple data elements simultaneously using SIMD (Single Instruction, Multiple Data) instructions, we can achieve significant performance improvements. This chapter explores how compilers vectorize code and how we can write code to maximize vectorization opportunities.
Understanding Vectorization
Vectorization transforms scalar operations into vector operations, allowing multiple data elements to be processed in parallel. Modern processors support various SIMD instruction sets:
- SSE (Streaming SIMD Extensions)
- AVX (Advanced Vector Extensions)
- AVX2
- AVX-512
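Which of these a given machine actually supports has to be checked at run time when a binary may run on different CPUs. Here is a minimal sketch using the GCC/Clang builtin __builtin_cpu_supports (the feature-name strings shown are the ones GCC accepts):

#include <stdio.h>

int main(void) {
    __builtin_cpu_init();   // populate the CPU-feature cache
    printf("SSE4.2:   %d\n", __builtin_cpu_supports("sse4.2"));
    printf("AVX:      %d\n", __builtin_cpu_supports("avx"));
    printf("AVX2:     %d\n", __builtin_cpu_supports("avx2"));
    printf("AVX-512F: %d\n", __builtin_cpu_supports("avx512f"));
    return 0;
}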
Basic Vectorization Example
// Original scalar code
void add_arrays(float* a, float* b, float* c, int n) {
    for (int i = 0; i < n; i++) {
        c[i] = a[i] + b[i];
    }
}
// Compiler-generated vectorized code (AVX2), simplified
// (a in rdi, b in rsi, c in rdx, n in ecx; the scalar remainder loop at .L4 is omitted)
add_arrays:
        mov     r8d, ecx
        and     r8d, -8              ; round n down to a multiple of 8
        je      .L4
        xor     eax, eax
.L3:
        vmovups ymm0, YMMWORD PTR [rdi+rax*4]
        vaddps  ymm0, ymm0, YMMWORD PTR [rsi+rax*4]
        vmovups YMMWORD PTR [rdx+rax*4], ymm0
        add     rax, 8
        cmp     r8d, eax
        ja      .L3
With AVX2 enabled (for example -O3 -mavx2, or -march=native, on GCC and Clang), the compiler processes 8 floats per iteration using 256-bit ymm registers and the vaddps instruction; iterations left over when n is not a multiple of 8 are handled by a scalar remainder loop.
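One common obstacle to this transformation is possible aliasing: if the compiler cannot prove that c does not overlap a or b, it must emit runtime overlap checks or keep a scalar fallback path. Here is a minimal sketch of the same loop with C99 restrict-qualified pointers (add_arrays_restrict is an illustrative name), which removes that doubt:

// The restrict qualifiers promise the compiler that the three arrays
// never overlap, so it can vectorize without runtime aliasing checks.
void add_arrays_restrict(float* restrict a, float* restrict b,
                         float* restrict c, int n) {
    for (int i = 0; i < n; i++) {
        c[i] = a[i] + b[i];
    }
}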
Complete Working Example: Image Processing
Let’s examine a real-world vectorization example:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <immintrin.h>

#define WIDTH 1920
#define HEIGHT 1080
#define CHANNELS 3

// Structure to hold image data
typedef struct {
    unsigned char* data;
    int width;
    int height;
    int channels;
} Image;

// Create a new image
Image* create_image(int width, int height, int channels) {
    Image* img = malloc(sizeof(Image));
    img->width = width;
    img->height = height;
    img->channels = channels;
    img->data = malloc(width * height * channels * sizeof(unsigned char));
    return img;
}

// Free image memory
void free_image(Image* img) {
    free(img->data);
    free(img);
}
// Convert RGB to grayscale using vectorization.
// Packed RGB uses 3 bytes per pixel, so the R, G and B channels of 8 pixels
// are first gathered into 32-bit lanes with scalar loads; the arithmetic
// itself is done 8 pixels at a time with AVX2/FMA.
void rgb_to_grayscale_vectorized(Image* src, Image* dst) {
    const __m256 wr = _mm256_set1_ps(0.299f);
    const __m256 wg = _mm256_set1_ps(0.587f);
    const __m256 wb = _mm256_set1_ps(0.114f);
    for (int y = 0; y < src->height; y++) {
        const unsigned char* row = &src->data[(size_t)y * src->width * 3];
        unsigned char* out = &dst->data[(size_t)y * dst->width];
        int x = 0;
        for (; x + 8 <= src->width; x += 8) {
            // Gather the channels of 8 pixels into 32-bit integer lanes
            int rbuf[8], gbuf[8], bbuf[8];
            for (int k = 0; k < 8; k++) {
                rbuf[k] = row[(x + k) * 3];
                gbuf[k] = row[(x + k) * 3 + 1];
                bbuf[k] = row[(x + k) * 3 + 2];
            }
            __m256 r = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)rbuf));
            __m256 g = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)gbuf));
            __m256 b = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)bbuf));
            // gray = 0.299*R + 0.587*G + 0.114*B for 8 pixels at once
            __m256 gray = _mm256_fmadd_ps(r, wr,
                          _mm256_fmadd_ps(g, wg,
                          _mm256_mul_ps(b, wb)));
            // Truncate to integers (matching the scalar cast) and store one byte per pixel
            int gout[8];
            _mm256_storeu_si256((__m256i*)gout, _mm256_cvttps_epi32(gray));
            for (int k = 0; k < 8; k++) {
                out[x + k] = (unsigned char)gout[k];
            }
        }
        // Scalar tail for widths that are not a multiple of 8
        for (; x < src->width; x++) {
            unsigned char r = row[x * 3], g = row[x * 3 + 1], b = row[x * 3 + 2];
            out[x] = (unsigned char)(0.299f * r + 0.587f * g + 0.114f * b);
        }
    }
}
// Convert RGB to grayscale (scalar version for comparison)
void rgb_to_grayscale_scalar(Image* src, Image* dst) {
    for (int y = 0; y < src->height; y++) {
        for (int x = 0; x < src->width; x++) {
            unsigned char r = src->data[y * src->width * 3 + x * 3];
            unsigned char g = src->data[y * src->width * 3 + x * 3 + 1];
            unsigned char b = src->data[y * src->width * 3 + x * 3 + 2];
            dst->data[y * dst->width + x] = (unsigned char)(0.299f * r + 0.587f * g + 0.114f * b);
        }
    }
}
int main() {
    // Create source and destination images
    Image* src = create_image(WIDTH, HEIGHT, CHANNELS);
    Image* dst = create_image(WIDTH, HEIGHT, 1);

    // Initialize source image with random data
    for (int i = 0; i < WIDTH * HEIGHT * CHANNELS; i++) {
        src->data[i] = rand() % 256;
    }

    // Time vectorized version
    clock_t start = clock();
    rgb_to_grayscale_vectorized(src, dst);
    clock_t end = clock();
    double vectorized_time = ((double)(end - start)) / CLOCKS_PER_SEC;

    // Time scalar version
    start = clock();
    rgb_to_grayscale_scalar(src, dst);
    end = clock();
    double scalar_time = ((double)(end - start)) / CLOCKS_PER_SEC;

    printf("Vectorized time: %f seconds\n", vectorized_time);
    printf("Scalar time: %f seconds\n", scalar_time);
    printf("Speedup: %fx\n", scalar_time / vectorized_time);

    // Clean up
    free_image(src);
    free_image(dst);
    return 0;
}
This example demonstrates:
- Manual vectorization using AVX2 and FMA intrinsics
- Efficient memory access patterns
- SIMD arithmetic operations
- Performance comparison between vectorized and scalar code
Note that the intrinsic version must be built with the corresponding instruction sets enabled (for example, -mavx2 -mfma on GCC or Clang) and only runs on CPUs that support them.
Data Types and Vectorization
Different data types have different vectorization characteristics:
Integer Vectorization
// Original code
void add_arrays_int(int* a, int* b, int* c, int n) {
    for (int i = 0; i < n; i++) {
        c[i] = a[i] + b[i];
    }
}
// Compiler-generated vectorized code (AVX2), simplified as before
// (a in rdi, b in rsi, c in rdx, n in ecx; remainder loop omitted)
add_arrays_int:
        mov     r8d, ecx
        and     r8d, -8
        je      .L4
        xor     eax, eax
.L3:
        vmovdqu ymm0, YMMWORD PTR [rdi+rax*4]
        vpaddd  ymm0, ymm0, YMMWORD PTR [rsi+rax*4]
        vmovdqu YMMWORD PTR [rdx+rax*4], ymm0
        add     rax, 8
        cmp     r8d, eax
        ja      .L3
Floating-Point Vectorization
// Original code
void multiply_arrays_double(double* a, double* b, double* c, int n) {
    for (int i = 0; i < n; i++) {
        c[i] = a[i] * b[i];
    }
}
// Compiler-generated vectorized code (AVX2), simplified
// (a in rdi, b in rsi, c in rdx, n in ecx; remainder loop omitted)
multiply_arrays_double:
        mov     r8d, ecx
        and     r8d, -4
        je      .L4
        xor     eax, eax
.L3:
        vmovupd ymm0, YMMWORD PTR [rdi+rax*8]
        vmulpd  ymm0, ymm0, YMMWORD PTR [rsi+rax*8]
        vmovupd YMMWORD PTR [rdx+rax*8], ymm0
        add     rax, 4
        cmp     r8d, eax
        ja      .L3
Because doubles are 8 bytes wide, a 256-bit register holds only 4 of them, so each iteration advances by 4 elements instead of 8.
Standard Algorithms and Vectorization
Many standard algorithms can be vectorized effectively:
Vectorized Sorting
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <immintrin.h>
// Vectorized quicksort partition (Lomuto scheme). The comparisons are done
// 8 elements at a time; the swaps themselves remain scalar.
int partition_vectorized(int* arr, int low, int high) {
    int pivot = arr[high];
    __m256i pivot_vec = _mm256_set1_epi32(pivot);
    int i = low - 1;
    int j = low;
    for (; j + 8 <= high; j += 8) {
        __m256i data = _mm256_loadu_si256((__m256i*)&arr[j]);
        // Lanes where arr[j + lane] < pivot
        __m256i lt = _mm256_cmpgt_epi32(pivot_vec, data);
        // One mask bit per 32-bit lane
        int mask = _mm256_movemask_ps(_mm256_castsi256_ps(lt));
        while (mask) {
            int lane = __builtin_ctz(mask);
            i++;
            int temp = arr[i];
            arr[i] = arr[j + lane];
            arr[j + lane] = temp;
            mask &= mask - 1;   // clear the lowest set bit
        }
    }
    // Scalar tail for the remaining (fewer than 8) elements
    for (; j < high; j++) {
        if (arr[j] < pivot) {
            i++;
            int temp = arr[i];
            arr[i] = arr[j];
            arr[j] = temp;
        }
    }
    int temp = arr[i + 1];
    arr[i + 1] = arr[high];
    arr[high] = temp;
    return i + 1;
}
// Vectorized quicksort
void quicksort_vectorized(int* arr, int low, int high) {
    if (low < high) {
        int pi = partition_vectorized(arr, low, high);
        quicksort_vectorized(arr, low, pi - 1);
        quicksort_vectorized(arr, pi + 1, high);
    }
}
int main() {
    const int n = 1000000;
    int* arr = malloc(n * sizeof(int));

    // Initialize array with random values
    srand(time(NULL));
    for (int i = 0; i < n; i++) {
        arr[i] = rand();
    }

    clock_t start = clock();
    quicksort_vectorized(arr, 0, n - 1);
    clock_t end = clock();
    double time_taken = ((double)(end - start)) / CLOCKS_PER_SEC;

    printf("Vectorized quicksort took %f seconds\n", time_taken);

    free(arr);
    return 0;
}
Vectorized String Operations
#include <stdio.h>
#include <string.h>
#include <immintrin.h>
// Vectorized string comparison, 32 bytes at a time.
// Caveat: like many hand-written SIMD string routines, this may read past
// the terminating '\0'; it assumes both buffers are safely readable beyond
// the end of the strings (production code must also respect page boundaries).
int strcmp_vectorized(const char* s1, const char* s2) {
    const __m256i zero = _mm256_setzero_si256();
    while (1) {
        __m256i a = _mm256_loadu_si256((const __m256i*)s1);
        __m256i b = _mm256_loadu_si256((const __m256i*)s2);
        // Bit set for every byte position that differs
        unsigned diff = ~(unsigned)_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b));
        // Bit set for every '\0' byte in s1
        unsigned nul = (unsigned)_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, zero));
        if (diff | nul) {
            // First mismatch or terminator, whichever comes first
            int idx = __builtin_ctz(diff | nul);
            return (unsigned char)s1[idx] - (unsigned char)s2[idx];
        }
        s1 += 32;
        s2 += 32;
    }
}
int main() {
    // Buffers are padded so the 32-byte loads stay within the objects
    char str1[64] = "This is a test string for vectorized comparison";
    char str2[64] = "This is a test string for vectorized comparison";

    int result = strcmp_vectorized(str1, str2);
    printf("Comparison result: %d\n", result);
    return 0;
}
Challenges of Floating-Point Vectorization
Floating-point vectorization presents unique challenges:
- Precision Issues
  - Different rounding modes
  - Order of operations matters (see the sketch after this list)
  - FMA instructions affect precision
- Special Values
  - Handling of NaN
  - Handling of infinity
  - Handling of denormals
- Performance Considerations
  - Subnormal number handling
  - Rounding mode changes
  - Exception handling
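To see why the order of operations matters, consider summing the same array strictly left to right and with eight reassociated partial sums, the way a vectorized reduction does. This is a minimal sketch (sum_scalar and sum_avx2 are illustrative names); with the values below, the two versions produce 1.0 and 2.0 respectively with IEEE-754 floats, while the exact sum is 8.0:

#include <stdio.h>
#include <immintrin.h>

// Strict left-to-right summation
float sum_scalar(const float* x, int n) {
    float s = 0.0f;
    for (int i = 0; i < n; i++)
        s += x[i];
    return s;
}

// Reassociated summation: 8 partial sums combined at the end
// (assumes n is a multiple of 8)
float sum_avx2(const float* x, int n) {
    __m256 acc = _mm256_setzero_ps();
    for (int i = 0; i < n; i += 8)
        acc = _mm256_add_ps(acc, _mm256_loadu_ps(&x[i]));
    float lanes[8];
    _mm256_storeu_ps(lanes, acc);
    float s = 0.0f;
    for (int k = 0; k < 8; k++)
        s += lanes[k];
    return s;
}

int main(void) {
    float x[16] = { 1e8f, 1.0f, -1e8f, 1.0f, 1e8f, 1.0f, -1e8f, 1.0f,
                    1e8f, 1.0f, -1e8f, 1.0f, 1e8f, 1.0f, -1e8f, 1.0f };
    printf("left-to-right sum: %f\n", sum_scalar(x, 16));
    printf("vectorized sum:    %f\n", sum_avx2(x, 16));
    return 0;
}

Because reassociation changes results like this, compilers vectorize floating-point reductions only when the source order is preserved or when reassociation is explicitly permitted (for example with -ffast-math or -fassociative-math on GCC and Clang).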
Complete Working Example: Matrix Multiplication with Floating-Point
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <immintrin.h>
#define N 1024
#define BLOCK_SIZE 32
void matrix_multiply_float(float A[N][N], float B[N][N], float C[N][N]) {
    for (int i = 0; i < N; i += BLOCK_SIZE) {
        for (int j = 0; j < N; j += BLOCK_SIZE) {
            for (int k = 0; k < N; k += BLOCK_SIZE) {
                // Process block
                for (int ii = i; ii < i + BLOCK_SIZE; ii++) {
                    for (int jj = j; jj < j + BLOCK_SIZE; jj += 8) {
                        __m256 c = _mm256_loadu_ps(&C[ii][jj]);
                        for (int kk = k; kk < k + BLOCK_SIZE; kk++) {
                            __m256 a = _mm256_broadcast_ss(&A[ii][kk]);
                            __m256 b = _mm256_loadu_ps(&B[kk][jj]);
                            c = _mm256_fmadd_ps(a, b, c);
                        }
                        _mm256_storeu_ps(&C[ii][jj], c);
                    }
                }
            }
        }
    }
}
int main() {
    float (*A)[N] = malloc(N * N * sizeof(float));
    float (*B)[N] = malloc(N * N * sizeof(float));
    float (*C)[N] = malloc(N * N * sizeof(float));

    // Initialize matrices
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            A[i][j] = (float)rand() / RAND_MAX;
            B[i][j] = (float)rand() / RAND_MAX;
            C[i][j] = 0.0f;
        }
    }

    clock_t start = clock();
    matrix_multiply_float(A, B, C);
    clock_t end = clock();
    double time_taken = ((double)(end - start)) / CLOCKS_PER_SEC;

    printf("Matrix multiplication took %f seconds\n", time_taken);

    free(A);
    free(B);
    free(C);
    return 0;
}
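The kernel above uses unaligned loads and stores (_mm256_loadu_ps / _mm256_storeu_ps), which is the safe default for memory returned by malloc. Here is a minimal sketch of obtaining 32-byte-aligned storage with C11 aligned_alloc instead (alloc_matrix is an illustrative helper); aligned data permits the aligned load/store variants and guarantees the accesses never split a cache line:

#include <stdlib.h>

// Allocate n * n floats with 32-byte alignment. aligned_alloc requires the
// total size to be a multiple of the alignment, which holds when n is a
// multiple of 8 (as with N = 1024 above).
float* alloc_matrix(size_t n) {
    return aligned_alloc(32, n * n * sizeof(float));
}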
Summary
Vectorization is a powerful optimization technique that can significantly improve performance. To maximize vectorization opportunities:
- Write Vectorization-Friendly Code
  - Use contiguous memory access
  - Avoid data dependencies
  - Keep loops simple
  - Use appropriate data types
- Understand SIMD Capabilities
  - Know your target architecture
  - Use appropriate instruction sets
  - Consider data alignment
  - Handle edge cases properly
- Use Compiler Directives (a short sketch follows this list)
  - #pragma omp simd
  - __attribute__((vector_size(N)))
  - The restrict / __restrict keyword
  - Alignment hints
- Profile and Optimize
  - Measure vectorization effectiveness
  - Identify bottlenecks
  - Consider manual vectorization
  - Balance between vectorization and other optimizations
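A minimal sketch of the first three directives in use (scale_add and add8 are illustrative names; with GCC or Clang, #pragma omp simd takes effect when the code is built with -fopenmp or -fopenmp-simd):

#include <stddef.h>

// restrict promises the compiler the arrays never overlap, and
// #pragma omp simd asks it to vectorize the loop even when its own
// cost model is undecided.
void scale_add(float* restrict c, const float* restrict a,
               const float* restrict b, float k, size_t n) {
    #pragma omp simd
    for (size_t i = 0; i < n; i++)
        c[i] = a[i] + k * b[i];
}

// A 32-byte vector type built with the vector_size attribute; arithmetic
// on it maps directly to SIMD instructions (a single vaddps with AVX).
typedef float v8sf __attribute__((vector_size(32)));

v8sf add8(v8sf x, v8sf y) {
    return x + y;
}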
Remember that while vectorization can provide significant performance improvements, it’s not always the best solution. Consider the trade-offs between:
- Development time
- Code maintainability
- Portability
- Performance gains