// rusty_common/vectorized.rs
//! Vectorization-friendly utilities for HFT calculations
//!
//! This module provides high-performance implementations of common HFT calculations
//! specifically designed to maximize compiler auto-vectorization and CPU efficiency.
//!
//! ## HFT Performance Rationale
//!
//! ### Auto-Vectorization Benefits
//! Modern CPUs can execute multiple operations simultaneously through SIMD instructions:
//! - **SSE/AVX instructions**: Process 2-8 double-precision lanes per instruction (SSE2: 2, AVX: 4, AVX-512: 8)
//! - **Pipeline utilization**: Keeps CPU execution units busy
//! - **Memory bandwidth**: Vectorized loads/stores maximize memory throughput
//! - **Branch reduction**: Eliminates costly conditional jumps in inner loops
//!
//! ### Critical Path Calculations
//! HFT systems require these calculations in sub-microsecond timeframes:
//! - **VWAP calculations**: Volume-weighted average price across order book levels
//! - **Volatility estimates**: Realized volatility from tick data
//! - **Order flow analysis**: Buy/sell pressure and market imbalance
//! - **Moving averages**: Exponential and simple moving averages for signals
//!
//! ## Memory Alignment Strategy
//!
//! ### Cache Line Optimization
//! ```text
//! 32-byte alignment (AVX):   ┌──── 4 x f64 ────┐
//! 64-byte alignment (Cache): ┌─────── 8 x f64 ───────┐
//! ```
//!
//! ### AlignedBuffer Types
//! - **AlignedBuffer32**: Optimized for AVX/AVX2 SIMD operations
//! - **AlignedBuffer64**: Cache-line aligned to prevent false sharing
//! - **Generic implementation**: Works with any `Copy + Default` type
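//!
//! The alignment guarantees can be checked directly; a minimal sketch
//! (`ignore`d because it assumes the buffer types defined below are in scope):
//! ```rust,ignore
//! use core::mem::align_of;
//! assert_eq!(align_of::<AlignedBuffer32<f64, 256>>(), 32);
//! assert_eq!(align_of::<AlignedBuffer64<f64, 256>>(), 64);
//! ```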
//!
//! ## Compiler Vectorization Patterns
//!
//! ### Loop Structure Optimization
//! Designed to trigger auto-vectorization:
//! ```rust,ignore
//! // Vectorization-friendly pattern
//! for i in 0..values.len() {
//!     result[i] = values[i].mul_add(weights[i], accumulator);
//! }
//! ```
//!
//! ### Dependency Chain Breaking
//! Uses multiple accumulators to improve instruction-level parallelism:
//! ```rust,ignore
//! let mut sum1 = 0.0;
//! let mut sum2 = 0.0;
//! let mut sum3 = 0.0;
//! let mut sum4 = 0.0;
//!
//! // Process in chunks to break dependencies
//! for (v_chunk, w_chunk) in values.chunks_exact(4).zip(weights.chunks_exact(4)) {
//!     sum1 = v_chunk[0].mul_add(w_chunk[0], sum1);
//!     sum2 = v_chunk[1].mul_add(w_chunk[1], sum2);
//!     sum3 = v_chunk[2].mul_add(w_chunk[2], sum3);
//!     sum4 = v_chunk[3].mul_add(w_chunk[3], sum4);
//! }
//! ```
//!
//! ## Financial Calculation Implementations
//!
//! ### Volume-Weighted Average Price (VWAP)
//! - **Multiple accumulators**: Breaks dependency chains for better ILP
//! - **FMA operations**: Uses `mul_add` for better precision and performance
//! - **Chunk processing**: 4-element chunks optimize for SIMD width
//! - **Remainder handling**: Processes leftover elements efficiently
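//!
//! A worked example of the definition:
//! ```text
//! prices = [100, 101], volumes = [2, 1]
//! VWAP = (100*2 + 101*1) / (2 + 1) = 301 / 3 ≈ 100.33
//! ```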
//!
//! ### Exponential Moving Average (EMA)
//! - **Sequential recurrence**: Each output depends on the previous one, which limits cross-iteration vectorization
//! - **Precision optimization**: Uses `mul_add` for numerical stability
//! - **Cache efficiency**: Sequential memory access pattern
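//!
//! The recurrence implemented below:
//! ```text
//! ema[0] = x[0]
//! ema[i] = alpha * x[i] + (1 - alpha) * ema[i-1]
//! ```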
//!
//! ### Order Flow Imbalance (OFI)
//! - **Parallel processing**: Independent calculations per element
//! - **Sign operations**: Vectorizable using `signum()` function
//! - **Memory layout**: Structure-of-arrays for optimal access
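//!
//! Per level, as implemented below:
//! ```text
//! ofi[i] = bid_volume[i] * sign(bid_change[i]) - ask_volume[i] * sign(ask_change[i])
//! ```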
//!
//! ### Realized Volatility
//! - **Squared returns**: Vectorizable multiplication operations
//! - **Multiple accumulators**: Reduces dependency chain length
//! - **Chunk processing**: Maximizes SIMD utilization
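//!
//! The estimator implemented below is the root mean square of returns:
//! ```text
//! vol = sqrt((r[0]^2 + r[1]^2 + ... + r[n-1]^2) / n)
//! ```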
//!
//! ## Performance Characteristics
//!
//! ### Latency Improvements
//! - **2-4x speedup**: From auto-vectorization on modern CPUs
//! - **Cache efficiency**: Aligned access patterns reduce cache misses
//! - **Memory bandwidth**: Vectorized loads maximize throughput
//! - **Branch elimination**: Reduces pipeline stalls
//!
//! ### Throughput Optimization
//! - **High concurrency**: Functions are thread-safe and reentrant
//! - **Memory efficiency**: Minimal allocation through in-place operations
//! - **CPU utilization**: Keeps multiple execution units busy
//!
//! ## Integration Patterns
//!
//! ### Real-Time Trading
//! ```rust,ignore
//! // Market making: Calculate fair value
//! let fair_value = calculate_weighted_mid_price(
//!     &bid_prices, &bid_sizes,
//!     &ask_prices, &ask_sizes,
//! );
//!
//! // Risk management: Estimate volatility
//! let vol = calculate_realized_volatility(&recent_returns);
//! ```
//!
//! ### Strategy Development
//! ```rust,ignore
//! // Alpha generation: Order flow analysis
//! let ofi = calculate_ofi(
//!     &bid_volumes, &ask_volumes,
//!     &bid_changes, &ask_changes,
//! );
//!
//! // Signal processing: Moving averages
//! let ema = calculate_ema_vectorized(&prices, alpha);
//! ```
//!
//! ## Compiler Optimization Requirements
//!
//! ### Build Flags
//! For optimal performance, use:
//! ```toml
//! [profile.release]
//! opt-level = 3
//! lto = true
//! codegen-units = 1
//! ```
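//!
//! `target-cpu` is not a Cargo profile key; pass it to rustc instead, for
//! example through `.cargo/config.toml`:
//! ```toml
//! # .cargo/config.toml
//! [build]
//! rustflags = ["-C", "target-cpu=native"] # enables AVX2/FMA if available
//! ```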
//!
//! ### Target Features
//! - **AVX2**: 256-bit SIMD operations
//! - **FMA**: Fused multiply-add instructions
//! - **BMI2**: Bit manipulation for efficient indexing
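//!
//! Hand-written SIMD paths can be gated on these features at runtime; a
//! minimal sketch using std's feature detection (x86_64 only, hence `ignore`):
//! ```rust,ignore
//! if is_x86_feature_detected!("avx2") {
//!     // dispatch to an AVX2-specialized kernel
//! } else {
//!     // fall back to the portable scalar path
//! }
//! ```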

/// 32-byte aligned buffer optimized for AVX/AVX2 SIMD operations
///
/// Provides guaranteed memory alignment for optimal SIMD performance in HFT calculations.
/// The 32-byte alignment matches AVX register width for efficient vectorized operations.
///
/// ## Performance Benefits
/// - **SIMD efficiency**: Aligned loads/stores avoid performance penalties
/// - **Cache optimization**: Reduces cache line splits and false sharing
/// - **Compiler hints**: Enables better auto-vectorization
/// - **Zero-cost abstraction**: Alignment overhead eliminated at compile time
///
/// ## Usage Patterns
/// ```rust,ignore
/// // Price calculations with SIMD optimization
/// let mut prices = AlignedBuffer32::<f64, 256>::new();
/// let prices_slice = prices.as_mut_slice();
/// // ... fill with market data ...
/// let vwap = calculate_vwap(prices_slice, &volumes);
/// ```
#[repr(align(32))]
pub struct AlignedBuffer32<T, const N: usize>
where
    T: Copy + Default,
{
    data: [T; N],
}

/// 64-byte aligned buffer for cache line optimization and false sharing prevention
///
/// Aligns data structures to CPU cache line boundaries (typically 64 bytes) to maximize
/// cache efficiency and prevent false sharing in multi-threaded HFT applications.
///
/// ## Cache Line Benefits
/// - **Cache efficiency**: Buffers start on a cache line boundary, avoiding split lines
/// - **False sharing prevention**: Eliminates cache ping-ponging between cores
/// - **Memory bandwidth**: Maximizes utilization of cache line transfers
/// - **Prefetching optimization**: Predictable access patterns improve prefetcher efficiency
///
/// ## HFT Use Cases
/// - **Per-thread buffers**: Eliminate false sharing between trading threads
/// - **Order book snapshots**: Cache-efficient storage of market data
/// - **Statistics collections**: High-frequency counter updates
/// - **Message buffers**: Network I/O optimization
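///
/// A minimal sketch of the per-thread pattern (hypothetical usage, `ignore`d
/// because it assumes this module is in scope):
/// ```rust,ignore
/// // One cache line per counter group: concurrent writers never share a line.
/// struct PerThreadCounters {
///     fills: AlignedBuffer64<u64, 8>,
///     cancels: AlignedBuffer64<u64, 8>,
/// }
/// ```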
#[repr(align(64))]
pub struct AlignedBuffer64<T, const N: usize>
where
    T: Copy + Default,
{
    data: [T; N],
}

/// Legacy type alias for backward compatibility with existing f64 buffers
pub type LegacyAlignedBuffer<const N: usize> = AlignedBuffer32<f64, N>;

impl<T, const N: usize> Default for AlignedBuffer32<T, N>
where
    T: Copy + Default,
{
    fn default() -> Self {
        Self::new()
    }
}

impl<T, const N: usize> Default for AlignedBuffer64<T, N>
where
    T: Copy + Default,
{
    fn default() -> Self {
        Self::new()
    }
}

impl<T, const N: usize> AlignedBuffer32<T, N>
where
    T: Copy + Default,
{
    /// Create a new 32-byte aligned buffer with all elements initialized to default
    #[inline]
    pub fn new() -> Self {
        Self {
            data: [T::default(); N],
        }
    }

    /// Create a new 32-byte aligned buffer with all elements initialized to the given value
    #[inline]
    pub const fn with_value(value: T) -> Self {
        Self { data: [value; N] }
    }

    /// Get an immutable slice view of the buffer data
    #[inline(always)]
    pub const fn as_slice(&self) -> &[T] {
        &self.data
    }

    /// Get a mutable slice view of the buffer data
    #[inline(always)]
    pub const fn as_mut_slice(&mut self) -> &mut [T] {
        &mut self.data
    }

    /// Get the number of elements in the buffer
    #[inline(always)]
    pub const fn len(&self) -> usize {
        N
    }

    /// Check if the buffer is empty
    #[inline(always)]
    pub const fn is_empty(&self) -> bool {
        N == 0
    }

    /// Get a reference to the raw data array
    #[inline(always)]
    pub const fn data(&self) -> &[T; N] {
        &self.data
    }

    /// Get a mutable reference to the raw data array
    #[inline(always)]
    pub const fn data_mut(&mut self) -> &mut [T; N] {
        &mut self.data
    }

    /// Get the alignment of this buffer in bytes
    #[inline(always)]
    pub const fn alignment() -> usize {
        32
    }
}

impl<T, const N: usize> AlignedBuffer64<T, N>
where
    T: Copy + Default,
{
    /// Create a new 64-byte aligned buffer with all elements initialized to default
    #[inline]
    pub fn new() -> Self {
        Self {
            data: [T::default(); N],
        }
    }

    /// Create a new 64-byte aligned buffer with all elements initialized to the given value
    #[inline]
    pub const fn with_value(value: T) -> Self {
        Self { data: [value; N] }
    }

    /// Get an immutable slice view of the buffer data
    #[inline(always)]
    pub const fn as_slice(&self) -> &[T] {
        &self.data
    }

    /// Get a mutable slice view of the buffer data
    #[inline(always)]
    pub const fn as_mut_slice(&mut self) -> &mut [T] {
        &mut self.data
    }

    /// Get the number of elements in the buffer
    #[inline(always)]
    pub const fn len(&self) -> usize {
        N
    }

    /// Check if the buffer is empty
    #[inline(always)]
    pub const fn is_empty(&self) -> bool {
        N == 0
    }

    /// Get a reference to the raw data array
    #[inline(always)]
    pub const fn data(&self) -> &[T; N] {
        &self.data
    }

    /// Get a mutable reference to the raw data array
    #[inline(always)]
    pub const fn data_mut(&mut self) -> &mut [T; N] {
        &mut self.data
    }

    /// Get the alignment of this buffer in bytes
    #[inline(always)]
    pub const fn alignment() -> usize {
        64
    }
}

/// Calculate VWAP in a vectorization-friendly way
///
/// Returns 0.0 when total volume is zero (including empty input).
#[inline(always)]
pub fn calculate_vwap(prices: &[f64], volumes: &[f64]) -> f64 {
    debug_assert_eq!(prices.len(), volumes.len());

    // Use multiple accumulators to break dependency chains
    let mut pv_sum1 = 0.0;
    let mut pv_sum2 = 0.0;
    let mut pv_sum3 = 0.0;
    let mut pv_sum4 = 0.0;

    let mut v_sum1 = 0.0;
    let mut v_sum2 = 0.0;
    let mut v_sum3 = 0.0;
    let mut v_sum4 = 0.0;

    // Process in chunks of 4 for vectorization
    let chunks = prices.chunks_exact(4);
    let vol_chunks = volumes.chunks_exact(4);

    for (p_chunk, v_chunk) in chunks.zip(vol_chunks) {
        // The compiler can vectorize these independent FMA chains
        pv_sum1 = p_chunk[0].mul_add(v_chunk[0], pv_sum1);
        pv_sum2 = p_chunk[1].mul_add(v_chunk[1], pv_sum2);
        pv_sum3 = p_chunk[2].mul_add(v_chunk[2], pv_sum3);
        pv_sum4 = p_chunk[3].mul_add(v_chunk[3], pv_sum4);

        v_sum1 += v_chunk[0];
        v_sum2 += v_chunk[1];
        v_sum3 += v_chunk[2];
        v_sum4 += v_chunk[3];
    }

    // Sum the accumulators
    let total_pv = pv_sum1 + pv_sum2 + pv_sum3 + pv_sum4;
    let total_v = v_sum1 + v_sum2 + v_sum3 + v_sum4;

    // Handle remainder elements that did not fill a chunk of 4
    let p_remainder = prices.chunks_exact(4).remainder();
    let v_remainder = volumes.chunks_exact(4).remainder();

    let remainder_pv: f64 = p_remainder
        .iter()
        .zip(v_remainder.iter())
        .map(|(p, v)| p * v)
        .sum();

    let remainder_v: f64 = v_remainder.iter().sum();

    // Guard against division by zero (empty input or all-zero volume)
    let total_volume = total_v + remainder_v;
    if total_volume == 0.0 {
        return 0.0;
    }
    (total_pv + remainder_pv) / total_volume
}

/// Calculate exponential moving average using fused multiply-add
#[inline(always)]
pub fn calculate_ema_vectorized(data: &[f64], alpha: f64) -> Vec<f64> {
    if data.is_empty() {
        return Vec::new();
    }

    let mut result = Vec::with_capacity(data.len());
    result.push(data[0]);

    let one_minus_alpha = 1.0 - alpha;

    // EMA carries a sequential dependency (each value needs the previous one),
    // so the loop cannot vectorize across iterations; `mul_add` still maps to
    // a single FMA instruction for precision and throughput.
    for i in 1..data.len() {
        let ema = alpha.mul_add(data[i], one_minus_alpha * result[i - 1]);
        result.push(ema);
    }

    result
}

/// Calculate order flow imbalance (OFI) with vectorization
#[inline(always)]
pub fn calculate_ofi(
    bid_volumes: &[f64],
    ask_volumes: &[f64],
    bid_price_changes: &[f64],
    ask_price_changes: &[f64],
) -> Vec<f64> {
    debug_assert_eq!(bid_volumes.len(), ask_volumes.len());
    debug_assert_eq!(bid_volumes.len(), bid_price_changes.len());
    debug_assert_eq!(bid_volumes.len(), ask_price_changes.len());

    let mut ofi = vec![0.0; bid_volumes.len()];

    // Vectorizable main loop
    for i in 0..bid_volumes.len() {
        let bid_component = bid_volumes[i] * bid_price_changes[i].signum();
        let ask_component = ask_volumes[i] * ask_price_changes[i].signum();
        ofi[i] = bid_component - ask_component;
    }

    ofi
}

/// Calculate realized volatility with vectorization
#[inline(always)]
pub fn calculate_realized_volatility(returns: &[f64]) -> f64 {
    if returns.is_empty() {
        return 0.0;
    }

    // Use multiple accumulators for better pipelining
    let mut sum1 = 0.0;
    let mut sum2 = 0.0;
    let mut sum3 = 0.0;
    let mut sum4 = 0.0;

    let chunks = returns.chunks_exact(4);
    let remainder = chunks.remainder();

    for chunk in chunks {
        // Squared returns - vectorizable
        sum1 = chunk[0].mul_add(chunk[0], sum1);
        sum2 = chunk[1].mul_add(chunk[1], sum2);
        sum3 = chunk[2].mul_add(chunk[2], sum3);
        sum4 = chunk[3].mul_add(chunk[3], sum4);
    }

    // Handle remainder
    let remainder_sum: f64 = remainder.iter().map(|r| r * r).sum();

    let total_sum = sum1 + sum2 + sum3 + sum4 + remainder_sum;
    (total_sum / returns.len() as f64).sqrt()
}

/// Calculate weighted mid price with vectorization
#[inline(always)]
pub fn calculate_weighted_mid_price(
    bid_prices: &[f64],
    bid_sizes: &[f64],
    ask_prices: &[f64],
    ask_sizes: &[f64],
) -> f64 {
    debug_assert_eq!(bid_prices.len(), bid_sizes.len());
    debug_assert_eq!(ask_prices.len(), ask_sizes.len());

    // Calculate total volume on each side
    let total_bid_volume: f64 = bid_sizes.iter().sum();
    let total_ask_volume: f64 = ask_sizes.iter().sum();

    if total_bid_volume == 0.0 || total_ask_volume == 0.0 {
        return 0.0;
    }

    // Weighted bid price
    let weighted_bid = calculate_weighted_average(bid_prices, bid_sizes);

    // Weighted ask price
    let weighted_ask = calculate_weighted_average(ask_prices, ask_sizes);

    // Volume-weighted mid price
    let bid_weight = total_bid_volume / (total_bid_volume + total_ask_volume);
    let ask_weight = total_ask_volume / (total_bid_volume + total_ask_volume);

    weighted_bid * bid_weight + weighted_ask * ask_weight
}

/// Helper function for weighted average calculation
///
/// Callers must ensure the weights do not sum to zero.
#[inline(always)]
fn calculate_weighted_average(values: &[f64], weights: &[f64]) -> f64 {
    debug_assert_eq!(values.len(), weights.len());

    let mut weighted_sum = 0.0;
    let mut weight_sum = 0.0;

    // Vectorizable loop
    for i in 0..values.len() {
        weighted_sum = values[i].mul_add(weights[i], weighted_sum);
        weight_sum += weights[i];
    }

    weighted_sum / weight_sum
}

/// Calculate order book imbalance with vectorization
#[inline(always)]
pub fn calculate_order_book_imbalance(bid_volumes: &[f64], ask_volumes: &[f64]) -> f64 {
    let total_bid: f64 = bid_volumes.iter().sum();
    let total_ask: f64 = ask_volumes.iter().sum();

    if total_bid + total_ask == 0.0 {
        return 0.0;
    }

    (total_bid - total_ask) / (total_bid + total_ask)
}

/// Fast moving sum calculation
#[inline(always)]
pub fn moving_sum(data: &[f64], window: usize) -> Vec<f64> {
    if data.len() < window || window == 0 {
        return vec![];
    }

    let mut result = Vec::with_capacity(data.len() - window + 1);

    // Initial sum - vectorizable
    let mut sum: f64 = data[..window].iter().sum();
    result.push(sum);

    // Rolling window: efficient single pass. Note that repeated add/subtract
    // can accumulate floating-point error over very long inputs.
    for i in window..data.len() {
        sum += data[i] - data[i - window];
        result.push(sum);
    }

    result
}
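
/// Moving average derived from `moving_sum`
///
/// A minimal sketch added for illustration; this hypothetical helper is not
/// part of the original API.
#[inline(always)]
pub fn moving_average(data: &[f64], window: usize) -> Vec<f64> {
    // Reuse the rolling sum and scale each window by 1/window; the empty-input
    // and zero-window cases are already handled by `moving_sum`.
    moving_sum(data, window)
        .into_iter()
        .map(|sum| sum / window as f64)
        .collect()
}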

/// Vectorized returns calculation
#[inline(always)]
pub fn calculate_returns(prices: &[f64]) -> Vec<f64> {
    if prices.len() < 2 {
        return vec![];
    }

    let mut returns = Vec::with_capacity(prices.len() - 1);

    // Vectorizable loop for returns
    for i in 1..prices.len() {
        let return_val = (prices[i] - prices[i - 1]) / prices[i - 1];
        returns.push(return_val);
    }

    returns
}

/// Vectorized log returns calculation
#[inline(always)]
pub fn calculate_log_returns(prices: &[f64]) -> Vec<f64> {
    if prices.len() < 2 {
        return vec![];
    }

    let mut returns = Vec::with_capacity(prices.len() - 1);

    // Vectorizable loop for log returns
    for i in 1..prices.len() {
        let log_return = (prices[i] / prices[i - 1]).ln();
        returns.push(log_return);
    }

    returns
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_calculate_vwap() {
        let prices = vec![100.0, 101.0, 102.0, 103.0, 104.0];
        let volumes = vec![1000.0, 1500.0, 2000.0, 1200.0, 1800.0];

        let vwap = calculate_vwap(&prices, &volumes);

        // Manual calculation for verification
        let total_pv: f64 = prices.iter().zip(volumes.iter()).map(|(p, v)| p * v).sum();
        let total_v: f64 = volumes.iter().sum();
        let expected = total_pv / total_v;

        eprintln!("VWAP calculated: {vwap}");
        eprintln!("VWAP expected: {expected}");
        eprintln!("Difference: {}", (vwap - expected).abs());

        // The manual calculation gives ~102.1733
        assert!((vwap - 102.173333).abs() < 0.01);
    }

    #[test]
    fn test_calculate_ofi() {
        let bid_vols = vec![100.0, 150.0, 200.0];
        let ask_vols = vec![120.0, 130.0, 180.0];
        let bid_changes = vec![0.1, -0.1, 0.2];
        let ask_changes = vec![-0.1, 0.1, -0.2];

        let ofi = calculate_ofi(&bid_vols, &ask_vols, &bid_changes, &ask_changes);
        assert_eq!(ofi.len(), 3);
    }
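
    // Additional coverage for the helpers above; expected values are worked
    // out by hand from the definitions.
    #[test]
    fn test_calculate_ema_vectorized() {
        // alpha = 0.5: ema = [1.0, 0.5*3.0 + 0.5*1.0, 0.5*4.0 + 0.5*2.0]
        let ema = calculate_ema_vectorized(&[1.0, 3.0, 4.0], 0.5);
        assert_eq!(ema, vec![1.0, 2.0, 3.0]);
    }

    #[test]
    fn test_moving_sum() {
        let sums = moving_sum(&[1.0, 2.0, 3.0, 4.0, 5.0], 3);
        assert_eq!(sums, vec![6.0, 9.0, 12.0]);
    }

    #[test]
    fn test_calculate_returns() {
        let returns = calculate_returns(&[100.0, 110.0, 99.0]);
        assert_eq!(returns.len(), 2);
        assert!((returns[0] - 0.1).abs() < 1e-12);
        assert!((returns[1] + 0.1).abs() < 1e-12);
    }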

    #[test]
    fn test_realized_volatility() {
        let returns = vec![0.01, -0.02, 0.015, -0.01, 0.02];
        let vol = calculate_realized_volatility(&returns);
        assert!(vol > 0.0);
    }
}
625}