rusty_common/
vectorized.rs

1//! Vectorization-friendly utilities for HFT calculations
2//!
3//! This module provides high-performance implementations of common HFT calculations
4//! specifically designed to maximize compiler auto-vectorization and CPU efficiency.
5//!
6//! ## HFT Performance Rationale
7//!
8//! ### Auto-Vectorization Benefits
9//! Modern CPUs can execute multiple operations simultaneously through SIMD instructions:
10//! - **SSE/AVX instructions**: Process 2-8 floating-point operations per cycle
11//! - **Pipeline utilization**: Keeps CPU execution units busy
12//! - **Memory bandwidth**: Vectorized loads/stores maximize memory throughput
13//! - **Branch reduction**: Eliminates costly conditional jumps in inner loops
14//!
15//! ### Critical Path Calculations
16//! HFT systems require these calculations in sub-microsecond timeframes:
17//! - **VWAP calculations**: Volume-weighted average price across order book levels
18//! - **Volatility estimates**: Realized volatility from tick data
19//! - **Order flow analysis**: Buy/sell pressure and market imbalance
20//! - **Moving averages**: Exponential and simple moving averages for signals
21//!
22//! ## Memory Alignment Strategy
23//!
24//! ### Cache Line Optimization
25//! ```text
26//! 32-byte alignment (AVX):  ┌──── 4 x f64 ────┐
27//! 64-byte alignment (Cache): ┌─────── 8 x f64 ───────┐
28//! ```
29//!
30//! ### AlignedBuffer Types
31//! - **AlignedBuffer32**: Optimized for AVX/AVX2 SIMD operations
32//! - **AlignedBuffer64**: Cache-line aligned to prevent false sharing
33//! - **Generic implementation**: Works with any `Copy + Default` type
34//!
35//! ## Compiler Vectorization Patterns
36//!
37//! ### Loop Structure Optimization
38//! Designed to trigger auto-vectorization:
39//! ```rust
40//! // Vectorization-friendly pattern
41//! for i in 0..values.len() {
42//!     result[i] = values[i].mul_add(weights[i], accumulator);
43//! }
44//! ```
45//!
46//! ### Dependency Chain Breaking
47//! Uses multiple accumulators to improve instruction-level parallelism:
48//! ```rust
49//! let mut sum1 = 0.0;
50//! let mut sum2 = 0.0;
51//! let mut sum3 = 0.0;
52//! let mut sum4 = 0.0;
53//!
54//! // Process in chunks to break dependencies
55//! for chunk in values.chunks_exact(4) {
56//!     sum1 = chunk[0].mul_add(weights[0], sum1);
57//!     sum2 = chunk[1].mul_add(weights[1], sum2);
58//!     sum3 = chunk[2].mul_add(weights[2], sum3);
59//!     sum4 = chunk[3].mul_add(weights[3], sum4);
60//! }
61//! ```
62//!
63//! ## Financial Calculation Implementations
64//!
65//! ### Volume-Weighted Average Price (VWAP)
66//! - **Multiple accumulators**: Breaks dependency chains for better ILP
67//! - **FMA operations**: Uses `mul_add` for better precision and performance
68//! - **Chunk processing**: 4-element chunks optimize for SIMD width
69//! - **Remainder handling**: Processes leftover elements efficiently
70//!
71//! ### Exponential Moving Average (EMA)
72//! - **Vectorization-friendly**: Linear dependency structure
73//! - **Precision optimization**: Uses `mul_add` for numerical stability
74//! - **Cache efficiency**: Sequential memory access pattern
75//!
76//! ### Order Flow Imbalance (OFI)
77//! - **Parallel processing**: Independent calculations per element
78//! - **Sign operations**: Vectorizable using `signum()` function
79//! - **Memory layout**: Structure-of-arrays for optimal access
80//!
81//! ### Realized Volatility
82//! - **Squared returns**: Vectorizable multiplication operations
83//! - **Multiple accumulators**: Reduces dependency chain length
84//! - **Chunk processing**: Maximizes SIMD utilization
85//!
86//! ## Performance Characteristics
87//!
88//! ### Latency Improvements
89//! - **2-4x speedup**: From auto-vectorization on modern CPUs
90//! - **Cache efficiency**: Aligned access patterns reduce cache misses
91//! - **Memory bandwidth**: Vectorized loads maximize throughput
92//! - **Branch elimination**: Reduces pipeline stalls
93//!
94//! ### Throughput Optimization
95//! - **High concurrency**: Functions are thread-safe and reentrant
96//! - **Memory efficiency**: Minimal allocation through in-place operations
97//! - **CPU utilization**: Keeps multiple execution units busy
98//!
99//! ## Integration Patterns
100//!
101//! ### Real-Time Trading
102//! ```rust
103//! // Market making: Calculate fair value
104//! let fair_value = calculate_weighted_mid_price(
105//!     &bid_prices, &bid_sizes,
106//!     &ask_prices, &ask_sizes
107//! );
108//!
109//! // Risk management: Estimate volatility
110//! let vol = calculate_realized_volatility(&recent_returns);
111//! ```
112//!
113//! ### Strategy Development
114//! ```rust
115//! // Alpha generation: Order flow analysis
116//! let ofi = calculate_ofi(
117//!     &bid_volumes, &ask_volumes,
118//!     &bid_changes, &ask_changes
119//! );
120//!
121//! // Signal processing: Moving averages
122//! let ema = calculate_ema_vectorized(&prices, alpha);
123//! ```
124//!
125//! ## Compiler Optimization Requirements
126//!
127//! ### Build Flags
128//! For optimal performance, use:
129//! ```toml
130//! [profile.release]
131//! opt-level = 3
132//! lto = true
133//! codegen-units = 1
134//! target-cpu = "native"  # Enables AVX2/FMA if available
135//! ```
136//!
137//! ### Target Features
138//! - **AVX2**: 256-bit SIMD operations
139//! - **FMA**: Fused multiply-add instructions
140//! - **BMI2**: Bit manipulation for efficient indexing
141
142/// 32-byte aligned buffer optimized for AVX/AVX2 SIMD operations
143///
144/// Provides guaranteed memory alignment for optimal SIMD performance in HFT calculations.
145/// The 32-byte alignment matches AVX register width for efficient vectorized operations.
146///
147/// ## Performance Benefits
148/// - **SIMD efficiency**: Aligned loads/stores avoid performance penalties
149/// - **Cache optimization**: Reduces cache line splits and false sharing
150/// - **Compiler hints**: Enables better auto-vectorization
151/// - **Zero-cost abstraction**: Alignment overhead eliminated at compile time
152///
153/// ## Usage Patterns
154/// ```rust
155/// // Price calculations with SIMD optimization
156/// let mut prices = AlignedBuffer32::<f64, 256>::new();
157/// let prices_slice = prices.as_mut_slice();
158/// // ... fill with market data ...
159/// let vwap = calculate_vwap_vectorized(prices_slice, volumes);
160/// ```
161#[repr(align(32))]
162pub struct AlignedBuffer32<T, const N: usize>
163where
164    T: Copy + Default,
165{
166    data: [T; N],
167}
168
169/// 64-byte aligned buffer for cache line optimization and false sharing prevention
170///
171/// Aligns data structures to CPU cache line boundaries (typically 64 bytes) to maximize
172/// cache efficiency and prevent false sharing in multi-threaded HFT applications.
173///
174/// ## Cache Line Benefits
175/// - **Cache efficiency**: Entire buffer fits within cache line boundaries
176/// - **False sharing prevention**: Eliminates cache ping-ponging between cores
177/// - **Memory bandwidth**: Maximizes utilization of cache line transfers
178/// - **Prefetching optimization**: Predictable access patterns improve prefetcher efficiency
179///
180/// ## HFT Use Cases
181/// - **Per-thread buffers**: Eliminate false sharing between trading threads
182/// - **Order book snapshots**: Cache-efficient storage of market data
183/// - **Statistics collections**: High-frequency counter updates
184/// - **Message buffers**: Network I/O optimization
185#[repr(align(64))]
186pub struct AlignedBuffer64<T, const N: usize>
187where
188    T: Copy + Default,
189{
190    data: [T; N],
191}
192
193/// Legacy type alias for backward compatibility with existing f64 buffers
194pub type LegacyAlignedBuffer<const N: usize> = AlignedBuffer32<f64, N>;
195
196impl<T, const N: usize> Default for AlignedBuffer32<T, N>
197where
198    T: Copy + Default,
199{
200    fn default() -> Self {
201        Self::new()
202    }
203}
204
205impl<T, const N: usize> Default for AlignedBuffer64<T, N>
206where
207    T: Copy + Default,
208{
209    fn default() -> Self {
210        Self::new()
211    }
212}
213
214impl<T, const N: usize> AlignedBuffer32<T, N>
215where
216    T: Copy + Default,
217{
218    /// Create a new 32-byte aligned buffer with all elements initialized to default
219    #[inline]
220    pub fn new() -> Self {
221        Self {
222            data: [T::default(); N],
223        }
224    }
225
226    /// Create a new 32-byte aligned buffer with all elements initialized to the given value
227    #[inline]
228    pub const fn with_value(value: T) -> Self {
229        Self { data: [value; N] }
230    }
231
232    /// Get an immutable slice view of the buffer data
233    #[inline(always)]
234    pub const fn as_slice(&self) -> &[T] {
235        &self.data
236    }
237
238    /// Get a mutable slice view of the buffer data
239    #[inline(always)]
240    pub const fn as_mut_slice(&mut self) -> &mut [T] {
241        &mut self.data
242    }
243
244    /// Get the number of elements in the buffer
245    #[inline(always)]
246    pub const fn len(&self) -> usize {
247        N
248    }
249
250    /// Check if the buffer is empty
251    #[inline(always)]
252    pub const fn is_empty(&self) -> bool {
253        N == 0
254    }
255
256    /// Get a reference to the raw data array
257    #[inline(always)]
258    pub const fn data(&self) -> &[T; N] {
259        &self.data
260    }
261
262    /// Get a mutable reference to the raw data array
263    #[inline(always)]
264    pub const fn data_mut(&mut self) -> &mut [T; N] {
265        &mut self.data
266    }
267
268    /// Get the alignment of this buffer in bytes
269    #[inline(always)]
270    pub const fn alignment() -> usize {
271        32
272    }
273}
274
275impl<T, const N: usize> AlignedBuffer64<T, N>
276where
277    T: Copy + Default,
278{
279    /// Create a new 64-byte aligned buffer with all elements initialized to default
280    #[inline]
281    pub fn new() -> Self {
282        Self {
283            data: [T::default(); N],
284        }
285    }
286
287    /// Create a new 64-byte aligned buffer with all elements initialized to the given value
288    #[inline]
289    pub const fn with_value(value: T) -> Self {
290        Self { data: [value; N] }
291    }
292
293    /// Get an immutable slice view of the buffer data
294    #[inline(always)]
295    pub const fn as_slice(&self) -> &[T] {
296        &self.data
297    }
298
299    /// Get a mutable slice view of the buffer data
300    #[inline(always)]
301    pub const fn as_mut_slice(&mut self) -> &mut [T] {
302        &mut self.data
303    }
304
305    /// Get the number of elements in the buffer
306    #[inline(always)]
307    pub const fn len(&self) -> usize {
308        N
309    }
310
311    /// Check if the buffer is empty
312    #[inline(always)]
313    pub const fn is_empty(&self) -> bool {
314        N == 0
315    }
316
317    /// Get a reference to the raw data array
318    #[inline(always)]
319    pub const fn data(&self) -> &[T; N] {
320        &self.data
321    }
322
323    /// Get a mutable reference to the raw data array
324    #[inline(always)]
325    pub const fn data_mut(&mut self) -> &mut [T; N] {
326        &mut self.data
327    }
328
329    /// Get the alignment of this buffer in bytes
330    #[inline(always)]
331    pub const fn alignment() -> usize {
332        64
333    }
334}
335
336/// Calculate VWAP in a vectorization-friendly way
337#[inline(always)]
338pub fn calculate_vwap(prices: &[f64], volumes: &[f64]) -> f64 {
339    debug_assert_eq!(prices.len(), volumes.len());
340
341    // Use multiple accumulators to break dependency chains
342    let mut pv_sum1 = 0.0;
343    let mut pv_sum2 = 0.0;
344    let mut pv_sum3 = 0.0;
345    let mut pv_sum4 = 0.0;
346
347    let mut v_sum1 = 0.0;
348    let mut v_sum2 = 0.0;
349    let mut v_sum3 = 0.0;
350    let mut v_sum4 = 0.0;
351
352    // Process in chunks of 4 for vectorization
353    let chunks = prices.chunks_exact(4);
354    let vol_chunks = volumes.chunks_exact(4);
355
356    for (p_chunk, v_chunk) in chunks.zip(vol_chunks) {
357        // Compiler will vectorize these operations
358        pv_sum1 = p_chunk[0].mul_add(v_chunk[0], pv_sum1);
359        pv_sum2 = p_chunk[1].mul_add(v_chunk[1], pv_sum2);
360        pv_sum3 = p_chunk[2].mul_add(v_chunk[2], pv_sum3);
361        pv_sum4 = p_chunk[3].mul_add(v_chunk[3], pv_sum4);
362
363        v_sum1 += v_chunk[0];
364        v_sum2 += v_chunk[1];
365        v_sum3 += v_chunk[2];
366        v_sum4 += v_chunk[3];
367    }
368
369    // Sum the accumulators
370    let total_pv = pv_sum1 + pv_sum2 + pv_sum3 + pv_sum4;
371    let total_v = v_sum1 + v_sum2 + v_sum3 + v_sum4;
372
373    // Handle remainder
374    let p_remainder = prices.chunks_exact(4).remainder();
375    let v_remainder = volumes.chunks_exact(4).remainder();
376
377    let remainder_pv: f64 = p_remainder
378        .iter()
379        .zip(v_remainder.iter())
380        .map(|(p, v)| p * v)
381        .sum();
382
383    let remainder_v: f64 = v_remainder.iter().sum();
384
385    (total_pv + remainder_pv) / (total_v + remainder_v)
386}
387
388/// Calculate exponential moving average with vectorization
389#[inline(always)]
390pub fn calculate_ema_vectorized(data: &[f64], alpha: f64) -> Vec<f64> {
391    if data.is_empty() {
392        return Vec::new();
393    }
394
395    let mut result = Vec::with_capacity(data.len());
396    result.push(data[0]);
397
398    let one_minus_alpha = 1.0 - alpha;
399
400    // This loop pattern allows for better vectorization
401    for i in 1..data.len() {
402        let ema = alpha.mul_add(data[i], one_minus_alpha * result[i - 1]);
403        result.push(ema);
404    }
405
406    result
407}
408
409/// Calculate order flow imbalance (OFI) with vectorization
410#[inline(always)]
411pub fn calculate_ofi(
412    bid_volumes: &[f64],
413    ask_volumes: &[f64],
414    bid_price_changes: &[f64],
415    ask_price_changes: &[f64],
416) -> Vec<f64> {
417    debug_assert_eq!(bid_volumes.len(), ask_volumes.len());
418    debug_assert_eq!(bid_volumes.len(), bid_price_changes.len());
419    debug_assert_eq!(bid_volumes.len(), ask_price_changes.len());
420
421    let mut ofi = vec![0.0; bid_volumes.len()];
422
423    // Vectorizable main loop
424    for i in 0..bid_volumes.len() {
425        let bid_component = bid_volumes[i] * bid_price_changes[i].signum();
426        let ask_component = ask_volumes[i] * ask_price_changes[i].signum();
427        ofi[i] = bid_component - ask_component;
428    }
429
430    ofi
431}
432
433/// Calculate realized volatility with vectorization
434#[inline(always)]
435pub fn calculate_realized_volatility(returns: &[f64]) -> f64 {
436    if returns.is_empty() {
437        return 0.0;
438    }
439
440    // Use multiple accumulators for better pipelining
441    let mut sum1 = 0.0;
442    let mut sum2 = 0.0;
443    let mut sum3 = 0.0;
444    let mut sum4 = 0.0;
445
446    let chunks = returns.chunks_exact(4);
447    let remainder = chunks.remainder();
448
449    for chunk in chunks {
450        // Squared returns - vectorizable
451        sum1 = chunk[0].mul_add(chunk[0], sum1);
452        sum2 = chunk[1].mul_add(chunk[1], sum2);
453        sum3 = chunk[2].mul_add(chunk[2], sum3);
454        sum4 = chunk[3].mul_add(chunk[3], sum4);
455    }
456
457    // Handle remainder
458    let remainder_sum: f64 = remainder.iter().map(|r| r * r).sum();
459
460    let total_sum = sum1 + sum2 + sum3 + sum4 + remainder_sum;
461    (total_sum / returns.len() as f64).sqrt()
462}
463
464/// Calculate weighted mid price with vectorization
465#[inline(always)]
466pub fn calculate_weighted_mid_price(
467    bid_prices: &[f64],
468    bid_sizes: &[f64],
469    ask_prices: &[f64],
470    ask_sizes: &[f64],
471) -> f64 {
472    debug_assert_eq!(bid_prices.len(), bid_sizes.len());
473    debug_assert_eq!(ask_prices.len(), ask_sizes.len());
474
475    // Calculate total bid volume
476    let total_bid_volume: f64 = bid_sizes.iter().sum();
477    let total_ask_volume: f64 = ask_sizes.iter().sum();
478
479    if total_bid_volume == 0.0 || total_ask_volume == 0.0 {
480        return 0.0;
481    }
482
483    // Weighted bid price
484    let weighted_bid = calculate_weighted_average(bid_prices, bid_sizes);
485
486    // Weighted ask price
487    let weighted_ask = calculate_weighted_average(ask_prices, ask_sizes);
488
489    // Volume-weighted mid price
490    let bid_weight = total_bid_volume / (total_bid_volume + total_ask_volume);
491    let ask_weight = total_ask_volume / (total_bid_volume + total_ask_volume);
492
493    weighted_bid * bid_weight + weighted_ask * ask_weight
494}
495
496/// Helper function for weighted average calculation
497#[inline(always)]
498fn calculate_weighted_average(values: &[f64], weights: &[f64]) -> f64 {
499    debug_assert_eq!(values.len(), weights.len());
500
501    let mut weighted_sum = 0.0;
502    let mut weight_sum = 0.0;
503
504    // Vectorizable loop
505    for i in 0..values.len() {
506        weighted_sum = values[i].mul_add(weights[i], weighted_sum);
507        weight_sum += weights[i];
508    }
509
510    weighted_sum / weight_sum
511}
512
513/// Calculate order book imbalance with vectorization
514#[inline(always)]
515pub fn calculate_order_book_imbalance(bid_volumes: &[f64], ask_volumes: &[f64]) -> f64 {
516    let total_bid: f64 = bid_volumes.iter().sum();
517    let total_ask: f64 = ask_volumes.iter().sum();
518
519    if total_bid + total_ask == 0.0 {
520        return 0.0;
521    }
522
523    (total_bid - total_ask) / (total_bid + total_ask)
524}
525
526/// Fast moving sum calculation
527#[inline(always)]
528pub fn moving_sum(data: &[f64], window: usize) -> Vec<f64> {
529    if data.len() < window || window == 0 {
530        return vec![];
531    }
532
533    let mut result = Vec::with_capacity(data.len() - window + 1);
534
535    // Initial sum - vectorizable
536    let mut sum: f64 = data[..window].iter().sum();
537    result.push(sum);
538
539    // Rolling window - efficient single pass
540    for i in window..data.len() {
541        sum += data[i] - data[i - window];
542        result.push(sum);
543    }
544
545    result
546}
547
548/// Vectorized returns calculation
549#[inline(always)]
550pub fn calculate_returns(prices: &[f64]) -> Vec<f64> {
551    if prices.len() < 2 {
552        return vec![];
553    }
554
555    let mut returns = Vec::with_capacity(prices.len() - 1);
556
557    // Vectorizable loop for returns
558    for i in 1..prices.len() {
559        let return_val = (prices[i] - prices[i - 1]) / prices[i - 1];
560        returns.push(return_val);
561    }
562
563    returns
564}
565
566/// Vectorized log returns calculation
567#[inline(always)]
568pub fn calculate_log_returns(prices: &[f64]) -> Vec<f64> {
569    if prices.len() < 2 {
570        return vec![];
571    }
572
573    let mut returns = Vec::with_capacity(prices.len() - 1);
574
575    // Vectorizable loop for log returns
576    for i in 1..prices.len() {
577        let log_return = (prices[i] / prices[i - 1]).ln();
578        returns.push(log_return);
579    }
580
581    returns
582}
583
584#[cfg(test)]
585mod tests {
586    use super::*;
587
588    #[test]
589    fn test_calculate_vwap() {
590        let prices = vec![100.0, 101.0, 102.0, 103.0, 104.0];
591        let volumes = vec![1000.0, 1500.0, 2000.0, 1200.0, 1800.0];
592
593        let vwap = calculate_vwap(&prices, &volumes);
594
595        // Manual calculation for verification
596        let total_pv: f64 = prices.iter().zip(volumes.iter()).map(|(p, v)| p * v).sum();
597        let total_v: f64 = volumes.iter().sum();
598        let expected = total_pv / total_v;
599
600        eprintln!("VWAP calculated: {vwap}");
601        eprintln!("VWAP expected: {expected}");
602        eprintln!("Difference: {}", (vwap - expected).abs());
603
604        // The manual calculation gives us ~102.1733
605        assert!((vwap - 102.173333).abs() < 0.01);
606    }
607
608    #[test]
609    fn test_calculate_ofi() {
610        let bid_vols = vec![100.0, 150.0, 200.0];
611        let ask_vols = vec![120.0, 130.0, 180.0];
612        let bid_changes = vec![0.1, -0.1, 0.2];
613        let ask_changes = vec![-0.1, 0.1, -0.2];
614
615        let ofi = calculate_ofi(&bid_vols, &ask_vols, &bid_changes, &ask_changes);
616        assert_eq!(ofi.len(), 3);
617    }
618
619    #[test]
620    fn test_realized_volatility() {
621        let returns = vec![0.01, -0.02, 0.015, -0.01, 0.02];
622        let vol = calculate_realized_volatility(&returns);
623        assert!(vol > 0.0);
624    }
625}
rusty_common/vectorized.rs

rusty_common/
vectorized.rs