Skip to content

Commit 1697eb4

Browse files
committed
enhance: memcpy_nontemporal with alignment check
1 parent 5f06825 commit 1697eb4

1 file changed

Lines changed: 85 additions & 1 deletion

File tree

src/simd.rs

Lines changed: 85 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -911,6 +911,23 @@ unsafe fn memcpy_nontemporal_sse2(dst: *mut u8, src: *const u8, len: usize) {
911911
}
912912
}
913913

914+
#[cfg(all(feature = "simd", target_arch = "x86_64"))]
915+
#[target_feature(enable = "avx2")]
916+
unsafe fn memcpy_nontemporal_avx2(dst: *mut u8, src: *const u8, len: usize) {
917+
unsafe {
918+
let mut i = 0;
919+
while i + 32 <= len {
920+
let v = _mm256_loadu_si256(src.add(i) as *const __m256i);
921+
_mm256_stream_si256(dst.add(i) as *mut __m256i, v);
922+
i += 32;
923+
}
924+
_mm_sfence();
925+
if i < len {
926+
std::ptr::copy_nonoverlapping(src.add(i), dst.add(i), len - i);
927+
}
928+
}
929+
}
930+
914931
#[cfg(all(feature = "simd", target_arch = "x86_64"))]
915932
#[inline(always)]
916933
/// # Safety
@@ -921,14 +938,46 @@ unsafe fn memcpy_nontemporal_sse2(dst: *mut u8, src: *const u8, len: usize) {
921938
/// - the regions do not overlap
922939
pub unsafe fn memcpy_nontemporal(dst: *mut u8, src: *const u8, len: usize) {
923940
unsafe {
924-
if is_simd_available() && len >= 64 {
941+
if is_simd_available() && len >= 64 && (dst as usize).is_multiple_of(16) {
925942
memcpy_nontemporal_sse2(dst, src, len);
926943
return;
927944
}
928945
std::ptr::copy_nonoverlapping(src, dst, len);
929946
}
930947
}
931948

949+
pub fn safe_memcpy_nontemporal(dst: &mut [u8], src: &[u8]) -> Result<(), &'static str> {
950+
if dst.len() != src.len() {
951+
return Err("length mismatch");
952+
}
953+
let len = dst.len();
954+
let src_ptr = src.as_ptr() as usize;
955+
let dst_ptr = dst.as_mut_ptr() as usize;
956+
let src_end = src_ptr.checked_add(len).ok_or("overflow")?;
957+
let dst_end = dst_ptr.checked_add(len).ok_or("overflow")?;
958+
if src_ptr < dst_end && dst_ptr < src_end {
959+
return Err("overlap");
960+
}
961+
962+
#[cfg(all(feature = "simd", target_arch = "x86_64"))]
963+
{
964+
let info = simd_info();
965+
if info.avx2 && is_simd_available() && len >= 64 && (dst_ptr).is_multiple_of(32) {
966+
unsafe { memcpy_nontemporal_avx2(dst.as_mut_ptr(), src.as_ptr(), len) };
967+
return Ok(());
968+
}
969+
if is_simd_available() && len >= 64 && (dst_ptr).is_multiple_of(16) {
970+
unsafe { memcpy_nontemporal_sse2(dst.as_mut_ptr(), src.as_ptr(), len) };
971+
return Ok(());
972+
}
973+
}
974+
975+
unsafe {
976+
std::ptr::copy_nonoverlapping(src.as_ptr(), dst.as_mut_ptr(), len);
977+
}
978+
Ok(())
979+
}
980+
932981
#[cfg(not(all(feature = "simd", target_arch = "x86_64")))]
933982
#[inline(always)]
934983
/// # Safety
@@ -1965,6 +2014,41 @@ mod tests {
19652014
}
19662015
}
19672016

2017+
#[test]
2018+
fn test_memcpy_nontemporal_unaligned_large() {
2019+
use std::alloc::{Layout, alloc, dealloc};
2020+
let layout = Layout::from_size_align(136, 8).unwrap();
2021+
let base_ptr = unsafe { alloc(layout) };
2022+
let dst_ptr = unsafe { base_ptr.add(8) };
2023+
let src = [99; 128];
2024+
unsafe {
2025+
memcpy_nontemporal(dst_ptr, src.as_ptr(), 128);
2026+
for i in 0..128 {
2027+
assert_eq!(*dst_ptr.add(i), 99);
2028+
}
2029+
dealloc(base_ptr, layout);
2030+
}
2031+
}
2032+
2033+
#[test]
2034+
fn test_safe_memcpy_nontemporal_slice() {
2035+
let src = [7u8; 128];
2036+
let mut dst = [0u8; 128];
2037+
assert!(safe_memcpy_nontemporal(&mut dst, &src).is_ok());
2038+
assert_eq!(dst, src);
2039+
}
2040+
2041+
#[test]
2042+
fn test_safe_memcpy_nontemporal_overlap() {
2043+
let mut buf = vec![0u8; 256];
2044+
for (i, item) in buf.iter_mut().enumerate().take(256) {
2045+
*item = i as u8;
2046+
}
2047+
let src = unsafe { std::slice::from_raw_parts(buf.as_ptr(), 128) };
2048+
let dst = unsafe { std::slice::from_raw_parts_mut(buf.as_mut_ptr().add(64), 128) };
2049+
assert!(safe_memcpy_nontemporal(dst, src).is_err());
2050+
}
2051+
19682052
#[test]
19692053
fn test_simd_level_fallback() {
19702054
let info = SimdInfo {

0 commit comments

Comments
 (0)