rusty_backtest/simd_enhanced.rs

//! Enhanced SIMD Operations for High-Frequency Trading
//!
//! This module provides advanced SIMD optimizations specifically designed for HFT workloads:
//! - Cache-aligned memory layouts
//! - Optimized memory access patterns
//! - Branch-free algorithms where possible
//! - Prefetching for large datasets
//! - Structure-of-Arrays (SoA) optimizations

#[cfg(test)]
use std::hint::black_box;
use wide::f64x4;

/// Portable prefetch function that compiles to optimal instructions per platform
#[inline(always)]
fn prefetch_data(ptr: *const f64) {
    #[cfg(target_arch = "x86_64")]
    {
        unsafe {
            std::arch::x86_64::_mm_prefetch::<{ std::arch::x86_64::_MM_HINT_T0 }>(ptr as *const i8);
        }
    }
    #[cfg(target_arch = "aarch64")]
    {
        unsafe {
            std::arch::aarch64::_prefetch(
                ptr as *const i8,
                std::arch::aarch64::_PREFETCH_READ,
                std::arch::aarch64::_PREFETCH_LOCALITY3,
            );
        }
    }
    // On other architectures, prefetch is a no-op
    // The compiler may still optimize memory access patterns
    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
    {
        let _ = ptr; // Suppress unused variable warning
    }
}

/// Re-export the unified 64-byte aligned buffer type
///
/// This type alias maintains backward compatibility while using the
/// consolidated aligned buffer implementation from rusty-common.
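///
/// # Example
///
/// A minimal usage sketch mirroring the test below (the concrete API is provided by
/// `rusty_common::AlignedBuffer64`, so treat this as illustrative rather than normative):
///
/// ```ignore
/// let mut buffer = SimdAlignedBuffer::<f64, 8>::new();
/// buffer.as_mut_slice()[0] = 1.0;
/// assert_eq!(buffer.len(), 8);
/// assert_eq!(buffer.as_slice()[0], 1.0);
/// ```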
pub use rusty_common::AlignedBuffer64 as SimdAlignedBuffer;

/// Enhanced SIMD operations optimized for financial data processing
pub struct EnhancedSimdOps;

impl EnhancedSimdOps {
    /// Optimized SIMD sum with explicit prefetching for large arrays
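    ///
    /// # Example
    ///
    /// A minimal usage sketch (marked `ignore` so it is not compiled as a doctest):
    ///
    /// ```ignore
    /// let prices = [100.0_f64, 101.5, 99.25, 102.0, 100.75];
    /// let total = EnhancedSimdOps::sum_f64_optimized(&prices);
    /// assert!((total - 503.5).abs() < 1e-12);
    /// ```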
    #[inline]
    pub fn sum_f64_optimized(values: &[f64]) -> f64 {
        if values.is_empty() {
            return 0.0;
        }

        if values.len() == 1 {
            return values[0];
        }

        if values.len() < 4 {
            return values.iter().sum();
        }

        // For large arrays, use prefetching
        if values.len() > 1024 {
            Self::sum_f64_with_prefetch(values)
        } else {
            Self::sum_f64_fast(values)
        }
    }

    /// Fast SIMD sum for medium-sized arrays without prefetching overhead
    #[inline]
    fn sum_f64_fast(values: &[f64]) -> f64 {
        let mut i = 0;
        let len = values.len();

        // Use two accumulators to reduce dependency chains
        let mut sum1 = f64x4::ZERO;
        let mut sum2 = f64x4::ZERO;

        // Process 8 elements at a time (two f64x4 vectors)
        while i + 8 <= len {
            let v1 = f64x4::from([values[i], values[i + 1], values[i + 2], values[i + 3]]);
            let v2 = f64x4::from([values[i + 4], values[i + 5], values[i + 6], values[i + 7]]);

            sum1 += v1;
            sum2 += v2;

            i += 8;
        }

        // Process remaining 4-element chunks
        while i + 4 <= len {
            let v = f64x4::from([values[i], values[i + 1], values[i + 2], values[i + 3]]);
            sum1 += v;
            i += 4;
        }

        // Combine the two accumulators
        let combined = sum1 + sum2;
        let combined_array = combined.as_array_ref();
        let mut total =
            combined_array[0] + combined_array[1] + combined_array[2] + combined_array[3];

        // Handle remaining elements
        while i < len {
            total += values[i];
            i += 1;
        }

        total
    }

    /// SIMD sum with explicit prefetching for large arrays
    #[inline]
    fn sum_f64_with_prefetch(values: &[f64]) -> f64 {
        let mut i = 0;
        let len = values.len();

        // Use four accumulators to maximize instruction-level parallelism
        let mut sum1 = f64x4::ZERO;
        let mut sum2 = f64x4::ZERO;
        let mut sum3 = f64x4::ZERO;
        let mut sum4 = f64x4::ZERO;

        // Process 16 elements at a time with prefetching
        while i + 16 <= len {
            // Prefetch the data 16 elements (two 64-byte cache lines) ahead
            if i + 24 < len {
                unsafe {
                    prefetch_data(values.as_ptr().add(i + 16));
                }
            }

            let v1 = f64x4::from([values[i], values[i + 1], values[i + 2], values[i + 3]]);
            let v2 = f64x4::from([values[i + 4], values[i + 5], values[i + 6], values[i + 7]]);
            let v3 = f64x4::from([values[i + 8], values[i + 9], values[i + 10], values[i + 11]]);
            let v4 = f64x4::from([
                values[i + 12],
                values[i + 13],
                values[i + 14],
                values[i + 15],
            ]);

            sum1 += v1;
            sum2 += v2;
            sum3 += v3;
            sum4 += v4;

            i += 16;
        }

        // Combine all accumulators
        let combined = (sum1 + sum2) + (sum3 + sum4);
        let combined_array = combined.as_array_ref();
        let mut total =
            combined_array[0] + combined_array[1] + combined_array[2] + combined_array[3];

        // Handle remaining elements with scalar code
        while i < len {
            total += values[i];
            i += 1;
        }

        total
    }

    /// Welford's algorithm for variance calculation (scalar implementation)
    ///
    /// Single-pass, numerically stable computation; returns the population variance
    /// (`m2 / count`).
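    ///
    /// # Example
    ///
    /// Population variance of `[2, 4, 6, 8]` (mean 5.0) is `(9 + 1 + 1 + 9) / 4 = 5.0`
    /// (illustrative sketch, marked `ignore` so it is not compiled as a doctest):
    ///
    /// ```ignore
    /// let variance = EnhancedSimdOps::variance_welford_scalar(&[2.0, 4.0, 6.0, 8.0]);
    /// assert!((variance - 5.0).abs() < 1e-10);
    /// ```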
    #[inline]
    pub fn variance_welford_scalar(values: &[f64]) -> f64 {
        if values.len() < 2 {
            return 0.0;
        }

        // Initialize for Welford's algorithm
        let mut count = 0f64;
        let mut mean = 0.0;
        let mut m2 = 0.0;

        // Process elements using Welford's update formula
        for &value in values {
            count += 1.0;
            let delta = value - mean;
            mean += delta / count;
            let delta2 = value - mean;
            m2 += delta * delta2;
        }

        m2 / count
    }

    /// Two-pass variance calculation using SIMD
    ///
    /// Computes the mean in a first pass and the sum of squared deviations in a second
    /// pass, processing complete 4-element chunks with `f64x4` and the remainder with
    /// scalar code. Returns the population variance.
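    ///
    /// # Example
    ///
    /// Population variance of `[1, 2, 3, 4, 5]` (mean 3.0) is `(4 + 1 + 0 + 1 + 4) / 5 = 2.0`
    /// (illustrative sketch, marked `ignore` so it is not compiled as a doctest):
    ///
    /// ```ignore
    /// let variance = EnhancedSimdOps::variance_two_pass_simd(&[1.0, 2.0, 3.0, 4.0, 5.0]);
    /// assert!((variance - 2.0).abs() < 1e-10);
    /// ```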
    #[inline]
    pub fn variance_two_pass_simd(values: &[f64]) -> f64 {
        if values.len() < 2 {
            return 0.0;
        }

        let len = values.len();
        let len_f64 = len as f64;

        // Use simple scalar calculation for small arrays where SIMD overhead isn't worth it
        if len <= 4 {
            return Self::variance_welford_scalar(values);
        }

        // First pass: calculate the mean using SIMD over complete 4-element chunks
        let complete_chunks = len / 4;
        let mut sum = f64x4::splat(0.0);

        // Process complete 4-element chunks using SIMD
        for i in 0..complete_chunks {
            let start_idx = i * 4;
            let chunk = f64x4::from([
                values[start_idx],
                values[start_idx + 1],
                values[start_idx + 2],
                values[start_idx + 3],
            ]);
            sum += chunk;
        }

        // Sum the SIMD lanes
        let sum_array = sum.as_array_ref();
        let mut total_sum = sum_array[0] + sum_array[1] + sum_array[2] + sum_array[3];

        // Add remaining elements using scalar arithmetic
        for i in (complete_chunks * 4)..len {
            total_sum += values[i];
        }

        let mean = total_sum / len_f64;

        // Second pass: Calculate variance using SIMD for complete chunks
        let mean_vec = f64x4::splat(mean);
        let mut variance_sum = f64x4::splat(0.0);

        // Process complete 4-element chunks using SIMD
        for i in 0..complete_chunks {
            let start_idx = i * 4;
            let chunk = f64x4::from([
                values[start_idx],
                values[start_idx + 1],
                values[start_idx + 2],
                values[start_idx + 3],
            ]);
            let diff = chunk - mean_vec;
            variance_sum += diff * diff;
        }

        // Sum the SIMD lanes
        let variance_array = variance_sum.as_array_ref();
        let mut total_variance =
            variance_array[0] + variance_array[1] + variance_array[2] + variance_array[3];

        // Add remaining elements using scalar arithmetic
        for i in (complete_chunks * 4)..len {
            let diff = values[i] - mean;
            total_variance += diff * diff;
        }

        total_variance / len_f64
    }

    /// Calculate running averages for price series using SIMD
    ///
    /// The initial window sum is computed with SIMD; subsequent windows use an O(1)
    /// sliding-window update. Optimized for typical HFT window sizes (50-200 periods).
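    ///
    /// # Example
    ///
    /// Three-period averages over `[1, 2, 3, 4, 5]` are `[2.0, 3.0, 4.0]`
    /// (illustrative sketch, marked `ignore` so it is not compiled as a doctest):
    ///
    /// ```ignore
    /// let averages = EnhancedSimdOps::running_average_simd(&[1.0, 2.0, 3.0, 4.0, 5.0], 3);
    /// assert_eq!(averages, vec![2.0, 3.0, 4.0]);
    /// ```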
    #[inline]
    pub fn running_average_simd(values: &[f64], window_size: usize) -> Vec<f64> {
        if window_size == 0 || values.len() < window_size {
            return vec![];
        }

        let mut results = Vec::with_capacity(values.len() - window_size + 1);

        // Calculate first window sum
        let mut window_sum = Self::sum_f64_fast(&values[0..window_size]);
        results.push(window_sum / window_size as f64);

        // Use sliding window technique for subsequent calculations
        for i in window_size..values.len() {
            window_sum = window_sum - values[i - window_size] + values[i];
            results.push(window_sum / window_size as f64);
        }

        results
    }

    /// Optimized correlation calculation using SIMD
    ///
    /// Computes the Pearson correlation coefficient of two equal-length price series.
    /// Returns 0.0 if the lengths differ or fewer than two samples are provided.
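    ///
    /// # Example
    ///
    /// Two perfectly linearly related series have a correlation of 1.0
    /// (illustrative sketch, marked `ignore` so it is not compiled as a doctest):
    ///
    /// ```ignore
    /// let x = [1.0, 2.0, 3.0, 4.0, 5.0];
    /// let y = [2.0, 4.0, 6.0, 8.0, 10.0];
    /// let corr = EnhancedSimdOps::correlation_simd(&x, &y);
    /// assert!((corr - 1.0).abs() < 1e-10);
    /// ```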
    #[inline]
    pub fn correlation_simd(x: &[f64], y: &[f64]) -> f64 {
        if x.len() != y.len() || x.len() < 2 {
            return 0.0;
        }

        let n = x.len() as f64;
        let sum_x = Self::sum_f64_optimized(x);
        let sum_y = Self::sum_f64_optimized(y);
        let mean_x = sum_x / n;
        let mean_y = sum_y / n;

        let mut sum_xy = 0.0;
        let mut sum_x2 = 0.0;
        let mut sum_y2 = 0.0;

        let mut i = 0;
        let len = x.len();

        // Process 4 elements at a time
        while i + 4 <= len {
            let x_vec = f64x4::from([x[i], x[i + 1], x[i + 2], x[i + 3]]);
            let y_vec = f64x4::from([y[i], y[i + 1], y[i + 2], y[i + 3]]);
            let mean_x_vec = f64x4::splat(mean_x);
            let mean_y_vec = f64x4::splat(mean_y);

            let dx = x_vec - mean_x_vec;
            let dy = y_vec - mean_y_vec;

            let xy = dx * dy;
            let x2 = dx * dx;
            let y2 = dy * dy;

            let xy_array = xy.as_array_ref();
            let x2_array = x2.as_array_ref();
            let y2_array = y2.as_array_ref();

            sum_xy += xy_array[0] + xy_array[1] + xy_array[2] + xy_array[3];
            sum_x2 += x2_array[0] + x2_array[1] + x2_array[2] + x2_array[3];
            sum_y2 += y2_array[0] + y2_array[1] + y2_array[2] + y2_array[3];

            i += 4;
        }

        // Handle remaining elements
        while i < len {
            let dx = x[i] - mean_x;
            let dy = y[i] - mean_y;
            sum_xy += dx * dy;
            sum_x2 += dx * dx;
            sum_y2 += dy * dy;
            i += 1;
        }

        let denominator = (sum_x2 * sum_y2).sqrt();
        if denominator > f64::EPSILON {
            sum_xy / denominator
        } else {
            0.0
        }
    }

    /// Optimized price returns calculation with SIMD
    /// Calculates log returns: ln(price\[i\] / price\[i-1\])
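    ///
    /// # Example
    ///
    /// Returns one entry per consecutive price pair
    /// (illustrative sketch, marked `ignore` so it is not compiled as a doctest):
    ///
    /// ```ignore
    /// let prices = [100.0_f64, 101.0, 99.0];
    /// let returns = EnhancedSimdOps::log_returns_simd(&prices);
    /// assert_eq!(returns.len(), 2);
    /// assert!((returns[0] - (101.0_f64 / 100.0).ln()).abs() < 1e-12);
    /// ```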
    #[inline]
    pub fn log_returns_simd(prices: &[f64]) -> Vec<f64> {
        if prices.len() < 2 {
            return vec![];
        }

        let mut returns = Vec::with_capacity(prices.len() - 1);

        let mut i = 1;
        let len = prices.len();

        // Process 4 returns at a time
        while i + 4 <= len {
            let curr_prices = f64x4::from([prices[i], prices[i + 1], prices[i + 2], prices[i + 3]]);
            let prev_prices = f64x4::from([prices[i - 1], prices[i], prices[i + 1], prices[i + 2]]);

            // Calculate ratios and take natural log
            let ratios = curr_prices / prev_prices;
            let ratios_array = ratios.as_array_ref();

            returns.push(ratios_array[0].ln());
            returns.push(ratios_array[1].ln());
            returns.push(ratios_array[2].ln());
            returns.push(ratios_array[3].ln());

            i += 4;
        }

        // Handle remaining elements
        while i < len {
            let log_return = (prices[i] / prices[i - 1]).ln();
            returns.push(log_return);
            i += 1;
        }

        returns
    }

    /// Structure-of-Arrays optimized order book level processing
    /// Processes bid/ask levels more efficiently by separating prices and quantities
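    ///
    /// # Example
    ///
    /// Computing per-level notional value (price * quantity) from parallel slices
    /// (illustrative sketch, marked `ignore` so it is not compiled as a doctest):
    ///
    /// ```ignore
    /// let prices = [100.0, 101.0, 102.0, 103.0];
    /// let quantities = [10.0, 20.0, 30.0, 40.0];
    /// let notionals =
    ///     EnhancedSimdOps::process_order_book_levels_soa(&prices, &quantities, |p, q| p * q);
    /// assert_eq!(notionals, vec![1000.0, 2020.0, 3060.0, 4120.0]);
    /// ```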
    #[inline]
    pub fn process_order_book_levels_soa(
        prices: &[f64],
        quantities: &[f64],
        operation: impl Fn(f64, f64) -> f64,
    ) -> Vec<f64> {
        if prices.len() != quantities.len() {
            return vec![];
        }

        let mut results = Vec::with_capacity(prices.len());
        let mut i = 0;
        let len = prices.len();

        // Process 4 levels at a time
        while i + 4 <= len {
            let price_vec = f64x4::from([prices[i], prices[i + 1], prices[i + 2], prices[i + 3]]);
            let qty_vec = f64x4::from([
                quantities[i],
                quantities[i + 1],
                quantities[i + 2],
                quantities[i + 3],
            ]);

            let price_array = price_vec.as_array_ref();
            let qty_array = qty_vec.as_array_ref();

            for j in 0..4 {
                #[cfg(test)]
                let result = black_box(operation(price_array[j], qty_array[j]));
                #[cfg(not(test))]
                let result = operation(price_array[j], qty_array[j]);
                results.push(result);
            }

            i += 4;
        }

        // Handle remaining elements
        while i < len {
            results.push(operation(prices[i], quantities[i]));
            i += 1;
        }

        results
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_enhanced_sum_small_array() {
        let values = [1.0, 2.0, 3.0];
        let sum = EnhancedSimdOps::sum_f64_optimized(&values);
        assert_eq!(sum, 6.0);
    }

    #[test]
    fn test_enhanced_sum_medium_array() {
        let values: Vec<f64> = (1..=100).map(|i| i as f64).collect();
        let sum = EnhancedSimdOps::sum_f64_optimized(&values);
        assert_eq!(sum, 5050.0);
    }

    #[test]
    fn test_enhanced_sum_large_array() {
        let values: Vec<f64> = (1..=10000).map(|i| i as f64).collect();
        let sum = EnhancedSimdOps::sum_f64_optimized(&values);
        assert_eq!(sum, 50005000.0);
    }

    #[test]
    fn test_variance_welford() {
        let values = [2.0, 4.0, 6.0, 8.0];
        let variance = EnhancedSimdOps::variance_welford_scalar(&values);
        assert!((variance - 5.0).abs() < 1e-10);
    }

    #[test]
    fn test_running_average() {
        let values = [1.0, 2.0, 3.0, 4.0, 5.0];
        let averages = EnhancedSimdOps::running_average_simd(&values, 3);
        assert_eq!(averages.len(), 3);
        assert_eq!(averages[0], 2.0); // (1+2+3)/3
        assert_eq!(averages[1], 3.0); // (2+3+4)/3
        assert_eq!(averages[2], 4.0); // (3+4+5)/3
    }

    #[test]
    fn test_correlation() {
        let x = [1.0, 2.0, 3.0, 4.0, 5.0];
        let y = [2.0, 4.0, 6.0, 8.0, 10.0];
        let corr = EnhancedSimdOps::correlation_simd(&x, &y);
        assert!((corr - 1.0).abs() < 1e-10); // Perfect positive correlation
    }

    #[test]
    fn test_log_returns() {
        let prices = [100.0, 101.0, 99.0, 102.0];
        let returns = EnhancedSimdOps::log_returns_simd(&prices);
        assert_eq!(returns.len(), 3);
        assert!((returns[0] - (101.0f64 / 100.0f64).ln()).abs() < 1e-10);
        assert!((returns[1] - (99.0f64 / 101.0f64).ln()).abs() < 1e-10);
        assert!((returns[2] - (102.0f64 / 99.0f64).ln()).abs() < 1e-10);
    }

    #[test]
    fn test_soa_processing() {
        let prices = [100.0, 101.0, 102.0, 103.0];
        let quantities = [10.0, 20.0, 30.0, 40.0];

        // Calculate notional values (price * quantity)
        let notionals =
            EnhancedSimdOps::process_order_book_levels_soa(&prices, &quantities, |price, qty| {
                price * qty
            });

        assert_eq!(notionals.len(), 4);
        assert_eq!(notionals[0], 1000.0);
        assert_eq!(notionals[1], 2020.0);
        assert_eq!(notionals[2], 3060.0);
        assert_eq!(notionals[3], 4120.0);
    }

    #[test]
    fn test_simd_aligned_buffer() {
        let mut buffer = SimdAlignedBuffer::<f64, 8>::new();
        let slice = buffer.as_mut_slice();
        slice[0] = 1.0;
        slice[7] = 8.0;

        assert_eq!(buffer.len(), 8);
        assert_eq!(buffer.as_slice()[0], 1.0);
        assert_eq!(buffer.as_slice()[7], 8.0);
    }

    #[test]
    fn test_variance_two_pass_simd_correctness() {
        // Test against known variance calculation using standard two-pass algorithm
        let values = [2.0, 4.0, 6.0, 8.0];
        let result = EnhancedSimdOps::variance_two_pass_simd(&values);

        // Calculate expected variance manually:
        // Mean = (2+4+6+8)/4 = 5.0
        // Variance = ((2-5)² + (4-5)² + (6-5)² + (8-5)²) / 4 = (9+1+1+9)/4 = 5.0
        let expected = 5.0;
        assert!(
            (result - expected).abs() < 1e-10,
            "Expected {expected}, got {result}"
        );
    }

    #[test]
    fn test_variance_two_pass_simd_aligned_vs_padded() {
        // Test with array length exactly divisible by 4 (no scalar remainder)
        let aligned_values = [1.0, 2.0, 3.0, 4.0]; // Length 4 - no remainder
        let aligned_result = EnhancedSimdOps::variance_two_pass_simd(&aligned_values);

        // Test with array length that leaves a scalar remainder after the SIMD chunks
        let padded_values = [1.0, 2.0, 3.0, 4.0, 5.0]; // Length 5 - one remaining element
        let padded_result = EnhancedSimdOps::variance_two_pass_simd(&padded_values);

        // Calculate expected variances manually
        // Length 4: mean=2.5, variance=((1-2.5)²+(2-2.5)²+(3-2.5)²+(4-2.5)²)/4 = 1.25
        assert!((aligned_result - 1.25).abs() < 1e-10);

        // Length 5: mean=3.0, variance=((1-3)²+(2-3)²+(3-3)²+(4-3)²+(5-3)²)/5 = 2.0
        assert!((padded_result - 2.0).abs() < 1e-10);
    }

    #[test]
    fn test_variance_two_pass_simd_edge_cases() {
        // Empty array
        let empty: &[f64] = &[];
        assert_eq!(EnhancedSimdOps::variance_two_pass_simd(empty), 0.0);

        // Single element
        let single = [42.0];
        assert_eq!(EnhancedSimdOps::variance_two_pass_simd(&single), 0.0);

        // Two identical elements
        let identical = [5.0, 5.0];
        assert_eq!(EnhancedSimdOps::variance_two_pass_simd(&identical), 0.0);

        // Two different elements
        let two_diff = [1.0, 3.0];
        // Mean = 2.0, variance = ((1-2)² + (3-2)²) / 2 = 1.0
        assert!((EnhancedSimdOps::variance_two_pass_simd(&two_diff) - 1.0).abs() < 1e-10);
    }

    #[test]
    fn test_variance_two_pass_simd_large_arrays() {
        // Test with larger arrays to verify SIMD chunking works correctly
        let large_values: Vec<f64> = (1..=100).map(|i| i as f64).collect();
        let result = EnhancedSimdOps::variance_two_pass_simd(&large_values);

        // For the sequence 1..=100: mean = 50.5, population variance = 833.25
        let expected = 833.25;
        assert!(
            (result - expected).abs() < 1e-8,
            "Expected {expected}, got {result}"
        );

        // Test an odd-length array that leaves a scalar remainder after the SIMD chunks
        let odd_values: Vec<f64> = (1..=99).map(|i| i as f64).collect();
        let odd_result = EnhancedSimdOps::variance_two_pass_simd(&odd_values);

        // For the sequence 1..=99: mean = 50.0, population variance = 816.6667
        let expected_odd = 816.6666666666666;
        assert!(
            (odd_result - expected_odd).abs() < 1e-8,
            "Expected {expected_odd}, got {odd_result}"
        );
    }

    #[test]
    fn test_variance_two_pass_simd_precision() {
        // Test with high precision values to ensure SIMD doesn't introduce significant errors
        let precise_values = [1.000000001, 1.000000002, 1.000000003, 1.000000004];
        let result = EnhancedSimdOps::variance_two_pass_simd(&precise_values);

        // Small variance should be calculated precisely
        assert!(result > 0.0, "Should detect small variance");
        assert!(result < 1e-15, "Small variance should be very small");

        // Test with financial data ranges (typical price values)
        let price_data = [100.25, 100.50, 99.75, 101.00, 100.125];
        let price_variance = EnhancedSimdOps::variance_two_pass_simd(&price_data);
        assert!(
            price_variance > 0.0,
            "Should calculate variance for realistic price data"
        );
    }

    #[test]
    fn test_variance_two_pass_simd_vs_welford() {
        // Compare SIMD two-pass with Welford algorithm for consistency
        let test_data = [2.5, 4.7, 1.3, 8.9, 6.1, 3.4, 7.8, 5.2];

        let simd_result = EnhancedSimdOps::variance_two_pass_simd(&test_data);
        let welford_result = EnhancedSimdOps::variance_welford_scalar(&test_data);

        // Both algorithms should give very similar results (within floating point precision)
        assert!(
            (simd_result - welford_result).abs() < 1e-12,
            "SIMD two-pass ({simd_result}) and Welford ({welford_result}) results should be nearly identical"
        );
    }
}