@@ -911,6 +911,23 @@ unsafe fn memcpy_nontemporal_sse2(dst: *mut u8, src: *const u8, len: usize) {
911911 }
912912}
913913
914+ #[ cfg( all( feature = "simd" , target_arch = "x86_64" ) ) ]
915+ #[ target_feature( enable = "avx2" ) ]
916+ unsafe fn memcpy_nontemporal_avx2 ( dst : * mut u8 , src : * const u8 , len : usize ) {
917+ unsafe {
918+ let mut i = 0 ;
919+ while i + 32 <= len {
920+ let v = _mm256_loadu_si256 ( src. add ( i) as * const __m256i ) ;
921+ _mm256_stream_si256 ( dst. add ( i) as * mut __m256i , v) ;
922+ i += 32 ;
923+ }
924+ _mm_sfence ( ) ;
925+ if i < len {
926+ std:: ptr:: copy_nonoverlapping ( src. add ( i) , dst. add ( i) , len - i) ;
927+ }
928+ }
929+ }
930+
914931#[ cfg( all( feature = "simd" , target_arch = "x86_64" ) ) ]
915932#[ inline( always) ]
916933/// # Safety
@@ -921,14 +938,46 @@ unsafe fn memcpy_nontemporal_sse2(dst: *mut u8, src: *const u8, len: usize) {
921938/// - the regions do not overlap
922939pub unsafe fn memcpy_nontemporal ( dst : * mut u8 , src : * const u8 , len : usize ) {
923940 unsafe {
924- if is_simd_available ( ) && len >= 64 {
941+ if is_simd_available ( ) && len >= 64 && ( dst as usize ) . is_multiple_of ( 16 ) {
925942 memcpy_nontemporal_sse2 ( dst, src, len) ;
926943 return ;
927944 }
928945 std:: ptr:: copy_nonoverlapping ( src, dst, len) ;
929946 }
930947}
931948
949+ pub fn safe_memcpy_nontemporal ( dst : & mut [ u8 ] , src : & [ u8 ] ) -> Result < ( ) , & ' static str > {
950+ if dst. len ( ) != src. len ( ) {
951+ return Err ( "length mismatch" ) ;
952+ }
953+ let len = dst. len ( ) ;
954+ let src_ptr = src. as_ptr ( ) as usize ;
955+ let dst_ptr = dst. as_mut_ptr ( ) as usize ;
956+ let src_end = src_ptr. checked_add ( len) . ok_or ( "overflow" ) ?;
957+ let dst_end = dst_ptr. checked_add ( len) . ok_or ( "overflow" ) ?;
958+ if src_ptr < dst_end && dst_ptr < src_end {
959+ return Err ( "overlap" ) ;
960+ }
961+
962+ #[ cfg( all( feature = "simd" , target_arch = "x86_64" ) ) ]
963+ {
964+ let info = simd_info ( ) ;
965+ if info. avx2 && is_simd_available ( ) && len >= 64 && ( dst_ptr) . is_multiple_of ( 32 ) {
966+ unsafe { memcpy_nontemporal_avx2 ( dst. as_mut_ptr ( ) , src. as_ptr ( ) , len) } ;
967+ return Ok ( ( ) ) ;
968+ }
969+ if is_simd_available ( ) && len >= 64 && ( dst_ptr) . is_multiple_of ( 16 ) {
970+ unsafe { memcpy_nontemporal_sse2 ( dst. as_mut_ptr ( ) , src. as_ptr ( ) , len) } ;
971+ return Ok ( ( ) ) ;
972+ }
973+ }
974+
975+ unsafe {
976+ std:: ptr:: copy_nonoverlapping ( src. as_ptr ( ) , dst. as_mut_ptr ( ) , len) ;
977+ }
978+ Ok ( ( ) )
979+ }
980+
932981#[ cfg( not( all( feature = "simd" , target_arch = "x86_64" ) ) ) ]
933982#[ inline( always) ]
934983/// # Safety
@@ -1965,6 +2014,41 @@ mod tests {
19652014 }
19662015 }
19672016
2017+ #[ test]
2018+ fn test_memcpy_nontemporal_unaligned_large ( ) {
2019+ use std:: alloc:: { Layout , alloc, dealloc} ;
2020+ let layout = Layout :: from_size_align ( 136 , 8 ) . unwrap ( ) ;
2021+ let base_ptr = unsafe { alloc ( layout) } ;
2022+ let dst_ptr = unsafe { base_ptr. add ( 8 ) } ;
2023+ let src = [ 99 ; 128 ] ;
2024+ unsafe {
2025+ memcpy_nontemporal ( dst_ptr, src. as_ptr ( ) , 128 ) ;
2026+ for i in 0 ..128 {
2027+ assert_eq ! ( * dst_ptr. add( i) , 99 ) ;
2028+ }
2029+ dealloc ( base_ptr, layout) ;
2030+ }
2031+ }
2032+
2033+ #[ test]
2034+ fn test_safe_memcpy_nontemporal_slice ( ) {
2035+ let src = [ 7u8 ; 128 ] ;
2036+ let mut dst = [ 0u8 ; 128 ] ;
2037+ assert ! ( safe_memcpy_nontemporal( & mut dst, & src) . is_ok( ) ) ;
2038+ assert_eq ! ( dst, src) ;
2039+ }
2040+
2041+ #[ test]
2042+ fn test_safe_memcpy_nontemporal_overlap ( ) {
2043+ let mut buf = vec ! [ 0u8 ; 256 ] ;
2044+ for ( i, item) in buf. iter_mut ( ) . enumerate ( ) . take ( 256 ) {
2045+ * item = i as u8 ;
2046+ }
2047+ let src = unsafe { std:: slice:: from_raw_parts ( buf. as_ptr ( ) , 128 ) } ;
2048+ let dst = unsafe { std:: slice:: from_raw_parts_mut ( buf. as_mut_ptr ( ) . add ( 64 ) , 128 ) } ;
2049+ assert ! ( safe_memcpy_nontemporal( dst, src) . is_err( ) ) ;
2050+ }
2051+
19682052 #[ test]
19692053 fn test_simd_level_fallback ( ) {
19702054 let info = SimdInfo {
0 commit comments