Convert OpenCL host code to SVM

This commit is contained in:
Andrzej Janik 2021-08-04 19:34:56 +02:00
parent 638786b0ec
commit becda31524
3 changed files with 77 additions and 189 deletions

View file

@ -4,6 +4,7 @@ use cuda::{CUdevice_attribute, CUuuid_st};
use ocl_core::{ClDeviceIdPtr, ContextProperties, DeviceType};
use std::{
cmp,
collections::HashSet,
ffi::c_void,
mem,
os::raw::{c_char, c_int, c_uint},
@ -24,175 +25,14 @@ pub struct Device {
pub ocl_base: ocl_core::DeviceId,
pub default_queue: ocl_core::CommandQueue,
pub ocl_context: ocl_core::Context,
pub(crate) ocl_ext: OpenCLExtensions,
pub primary_context: context::Context,
pub allocations: HashSet<*mut c_void>,
properties: Option<Box<l0::sys::ze_device_properties_t>>,
image_properties: Option<Box<l0::sys::ze_device_image_properties_t>>,
memory_properties: Option<Vec<l0::sys::ze_device_memory_properties_t>>,
compute_properties: Option<Box<l0::sys::ze_device_compute_properties_t>>,
}
/// Element type behind the properties pointer taken by `clDeviceMemAllocINTEL`;
/// an alias of `cl_bitfield`.
type cl_mem_properties_intel = ocl_core::ffi::cl_bitfield;

/// Signature of the `clDeviceMemAllocINTEL` entry point: context, device,
/// properties pointer, size, alignment and an out-parameter for the status
/// code; returns the allocation as a raw pointer.
type DeviceMemAllocFn = unsafe extern "system" fn(
    ocl_core::ffi::cl_context,
    ocl_core::ffi::cl_device_id,
    *const cl_mem_properties_intel,
    usize,
    ocl_core::ffi::cl_uint,
    *mut ocl_core::ffi::cl_int,
) -> *mut c_void;

/// Signature of the `clEnqueueMemcpyINTEL` entry point: queue, blocking flag,
/// destination, source, byte count, then the event wait-list triple; returns
/// a status code.
type EnqueueMemcpyFn = unsafe extern "system" fn(
    ocl_core::ffi::cl_command_queue,
    ocl_core::ffi::cl_bool,
    *mut c_void,
    *const c_void,
    usize,
    ocl_core::ffi::cl_uint,
    *const ocl_core::ffi::cl_event,
    *mut ocl_core::ffi::cl_event,
) -> ocl_core::ffi::cl_int;

/// Signature of the `clMemBlockingFreeINTEL` entry point: context and the
/// pointer to release; returns a status code.
type MemBlockingFreeFn =
    unsafe extern "system" fn(ocl_core::ffi::cl_context, *mut c_void) -> ocl_core::ffi::cl_int;

/// Signature of the `clEnqueueMemFillINTEL` entry point: queue, destination,
/// pattern pointer, pattern size, fill size, then the event wait-list triple;
/// returns a status code.
type EnqueueMemFillFn = unsafe extern "system" fn(
    ocl_core::ffi::cl_command_queue,
    *mut c_void,
    *const c_void,
    usize,
    usize,
    ocl_core::ffi::cl_uint,
    *const ocl_core::ffi::cl_event,
    *mut ocl_core::ffi::cl_event,
) -> ocl_core::ffi::cl_int;

/// Table of raw function pointers to the Intel memory extension entry points.
///
/// Each field is named after the OpenCL symbol it points to.
pub(crate) struct OpenCLExtensions {
    pub clDeviceMemAllocINTEL: DeviceMemAllocFn,
    pub clEnqueueMemcpyINTEL: EnqueueMemcpyFn,
    pub clMemBlockingFreeINTEL: MemBlockingFreeFn,
    pub clEnqueueMemFillINTEL: EnqueueMemFillFn,
}
impl OpenCLExtensions {
    /// Resolves the four Intel memory extension entry points exposed by `plat`.
    ///
    /// Every symbol is looked up by name before any of them is cast to a
    /// function pointer; the first failed lookup is returned through `?`.
    fn new(plat: &ocl_core::PlatformId) -> Result<Self, CUresult> {
        // One lookup per symbol name, always against the same platform.
        let lookup = |symbol: &str| unsafe {
            ocl_core::get_extension_function_address_for_platform(plat, symbol, None)
        };
        let mem_alloc = lookup("clDeviceMemAllocINTEL")?;
        let memcpy = lookup("clEnqueueMemcpyINTEL")?;
        let blocking_free = lookup("clMemBlockingFreeINTEL")?;
        let mem_fill = lookup("clEnqueueMemFillINTEL")?;
        // The target function-pointer type of each cast is inferred from the
        // field being initialised.
        unsafe {
            Ok(Self {
                clDeviceMemAllocINTEL: mem::transmute(mem_alloc),
                clEnqueueMemcpyINTEL: mem::transmute(memcpy),
                clMemBlockingFreeINTEL: mem::transmute(blocking_free),
                clEnqueueMemFillINTEL: mem::transmute(mem_fill),
            })
        }
    }

    /// Allocates `size` bytes with the requested `alignment` for `device` in
    /// `ctx`, passing a null properties pointer.
    ///
    /// # Errors
    ///
    /// Any non-zero status code is reported as `CUDA_ERROR_UNKNOWN`.
    ///
    /// # Safety
    ///
    /// Calls straight through a raw extension function pointer.
    pub unsafe fn device_mem_alloc(
        &self,
        ctx: &ocl_core::Context,
        device: &ocl_core::DeviceId,
        size: usize,
        alignment: ocl_core::ffi::cl_uint,
    ) -> Result<*mut c_void, CUresult> {
        let mut status = 0;
        let allocation = (self.clDeviceMemAllocINTEL)(
            ctx.as_ptr(),
            device.as_ptr(),
            ptr::null(),
            size,
            alignment,
            &mut status,
        );
        match status {
            0 => Ok(allocation),
            _ => Err(CUresult::CUDA_ERROR_UNKNOWN),
        }
    }

    /// Enqueues a copy of `size` bytes from `src` to `dst` on `queue`, with an
    /// empty wait list and no output event.
    ///
    /// # Errors
    ///
    /// Any non-zero status code is reported as `CUDA_ERROR_UNKNOWN`.
    ///
    /// # Safety
    ///
    /// Calls straight through a raw extension function pointer with the
    /// caller-supplied raw pointers.
    pub unsafe fn enqueue_memcpy(
        &self,
        queue: &ocl_core::CommandQueue,
        blocking: bool,
        dst: *mut c_void,
        src: *const c_void,
        size: usize,
    ) -> Result<(), CUresult> {
        // `true` becomes 1 and `false` becomes 0.
        let blocking_flag = blocking as ocl_core::ffi::cl_bool;
        let status = (self.clEnqueueMemcpyINTEL)(
            queue.as_ptr(),
            blocking_flag,
            dst,
            src,
            size,
            0,
            ptr::null(),
            ptr::null_mut(),
        );
        match status {
            0 => Ok(()),
            _ => Err(CUresult::CUDA_ERROR_UNKNOWN),
        }
    }

    /// Releases `mem_ptr` in `ctx` through `clMemBlockingFreeINTEL`.
    ///
    /// # Errors
    ///
    /// Any non-zero status code is reported as `CUDA_ERROR_UNKNOWN`.
    ///
    /// # Safety
    ///
    /// Calls straight through a raw extension function pointer with the
    /// caller-supplied raw pointer.
    pub unsafe fn mem_blocking_free(
        &self,
        ctx: &ocl_core::Context,
        mem_ptr: *mut c_void,
    ) -> Result<(), CUresult> {
        match (self.clMemBlockingFreeINTEL)(ctx.as_ptr(), mem_ptr) {
            0 => Ok(()),
            _ => Err(CUresult::CUDA_ERROR_UNKNOWN),
        }
    }

    /// Enqueues a fill of `size` bytes at `dst` on `queue`, repeating the
    /// `patternSize`-byte value read from `pattern`, and returns the event
    /// produced by the call.
    ///
    /// # Errors
    ///
    /// Any non-zero status code is reported as `CUDA_ERROR_UNKNOWN`; no event
    /// is wrapped in that case.
    ///
    /// # Safety
    ///
    /// Calls straight through a raw extension function pointer with the
    /// caller-supplied raw pointers.
    pub unsafe fn enqueue_memfill(
        &self,
        queue: &ocl_core::CommandQueue,
        dst: *mut c_void,
        pattern: *const c_void,
        patternSize: usize,
        size: usize,
    ) -> Result<ocl_core::Event, CUresult> {
        // Written by the call with the event that signals completion.
        let mut completion: ocl_core::ffi::cl_event = ptr::null_mut();
        let status = (self.clEnqueueMemFillINTEL)(
            queue.as_ptr(),
            dst,
            pattern,
            patternSize,
            size,
            0,
            ptr::null(),
            &mut completion,
        );
        match status {
            0 => Ok(ocl_core::Event::from_raw(completion)),
            _ => Err(CUresult::CUDA_ERROR_UNKNOWN),
        }
    }
}
// NOTE(review): `Device` stores raw pointers (`allocations: HashSet<*mut c_void>`),
// so it is not `Send` automatically; this impl asserts that moving a `Device`
// to another thread is sound. The justification is not visible here -- confirm it.
unsafe impl Send for Device {}
impl Device {
@ -202,7 +42,6 @@ impl Device {
ocl_dev: ocl_core::DeviceId,
idx: usize,
) -> Result<Self, CUresult> {
let ocl_ext = OpenCLExtensions::new(&platform)?;
let mut props = ocl_core::ContextProperties::new();
props.set_platform(platform);
let ctx = ocl_core::create_context(Some(&props), &[ocl_dev], None, None)?;
@ -210,13 +49,13 @@ impl Device {
let primary_context =
context::Context::new(context::ContextData::new(0, true, ptr::null_mut())?);
Ok(Self {
ocl_ext,
index: Index(idx as c_int),
base: l0_dev,
ocl_base: ocl_dev,
default_queue: queue,
ocl_context: ctx,
primary_context,
allocations: HashSet::new(),
properties: None,
image_properties: None,
memory_properties: None,

View file

@ -3,7 +3,7 @@ use ocl_core::DeviceId;
use super::{stream::Stream, CUresult, GlobalState, HasLivenessCookie, LiveCheck};
use crate::cuda::CUfunction_attribute;
use ::std::os::raw::{c_uint, c_void};
use std::{hint, ptr};
use std::{hint, mem, ptr};
const CU_LAUNCH_PARAM_END: *mut c_void = 0 as *mut _;
const CU_LAUNCH_PARAM_BUFFER_POINTER: *mut c_void = 1 as *mut _;
@ -101,7 +101,9 @@ pub fn launch_kernel(
{
return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
}
GlobalState::lock_enqueue(hstream, |queue| {
GlobalState::lock_stream(hstream, |stream_data| {
let dev = unsafe { &mut *(*stream_data.context).device };
let queue = stream_data.cmd_list.as_ref().unwrap();
let func: &mut FunctionData = unsafe { &mut *f }.as_result_mut()?;
if kernel_params != ptr::null_mut() {
for (i, &(arg_size, is_mem)) in func.arg_size.iter().enumerate() {
@ -162,6 +164,16 @@ pub fn launch_kernel(
)?
};
}
let buffers = dev.allocations.iter().copied().collect::<Vec<_>>();
let err = unsafe {
ocl_core::ffi::clSetKernelExecInfo(
func.base.as_ptr(),
ocl_core::ffi::CL_KERNEL_EXEC_INFO_SVM_PTRS,
buffers.len() * mem::size_of::<*mut c_void>(),
buffers.as_ptr() as *const _,
)
};
assert_eq!(err, 0);
let global_dims = [
(block_dim_x * grid_dim_x) as usize,
(block_dim_y * grid_dim_y) as usize,
@ -184,7 +196,7 @@ pub fn launch_kernel(
)?
};
Ok::<_, CUresult>(())
})
})?
}
fn round_up_to_multiple(x: usize, multiple: usize) -> usize {

View file

@ -5,27 +5,39 @@ use super::{
use std::{
ffi::c_void,
mem::{self, size_of},
ptr,
};
pub fn alloc_v2(dptr: *mut *mut c_void, bytesize: usize) -> Result<(), CUresult> {
let ptr = GlobalState::lock_stream(CU_STREAM_LEGACY, |stream_data| {
let dev = unsafe { &*(*stream_data.context).device };
let dev = unsafe { &mut *(*stream_data.context).device };
let queue = stream_data.cmd_list.as_ref().unwrap();
let ptr = unsafe {
dev.ocl_ext
.device_mem_alloc(&dev.ocl_context, &dev.ocl_base, bytesize, 0)?
ocl_core::ffi::clSVMAlloc(
dev.ocl_context.as_ptr(),
ocl_core::ffi::CL_MEM_READ_WRITE,
bytesize,
0,
)
};
// CUDA does the same thing and e.g. GeekBench relies on this behavior
let event = unsafe {
dev.ocl_ext.enqueue_memfill(
queue,
let mut event = ptr::null_mut();
let err = unsafe {
ocl_core::ffi::clEnqueueSVMMemFill(
queue.as_ptr(),
ptr,
&0u8 as *const u8 as *const c_void,
1,
bytesize,
)?
0,
ptr::null(),
&mut event,
)
};
ocl_core::wait_for_event(&event)?;
assert_eq!(err, 0);
let err = unsafe { ocl_core::ffi::clWaitForEvents(1, &mut event) };
assert_eq!(err, 0);
dev.allocations.insert(ptr);
Ok::<_, CUresult>(ptr)
})??;
unsafe { *dptr = ptr };
@ -36,10 +48,22 @@ pub fn copy_v2(dst: *mut c_void, src: *const c_void, bytesize: usize) -> Result<
GlobalState::lock_stream(stream::CU_STREAM_LEGACY, |stream_data| {
let dev = unsafe { &*(*stream_data.context).device };
let queue = stream_data.cmd_list.as_ref().unwrap();
unsafe {
dev.ocl_ext
.enqueue_memcpy(queue, true, dst, src, bytesize)?
let mut event = ptr::null_mut();
let err = unsafe {
ocl_core::ffi::clEnqueueSVMMemcpy(
queue.as_ptr(),
1,
dst,
src,
bytesize,
0,
ptr::null(),
&mut event,
)
};
assert_eq!(err, 0);
let err = unsafe { ocl_core::ffi::clWaitForEvents(1, &mut event) };
assert_eq!(err, 0);
Ok(())
})?
}
@ -47,7 +71,8 @@ pub fn copy_v2(dst: *mut c_void, src: *const c_void, bytesize: usize) -> Result<
/// Releases the allocation at `ptr` through the device of the current context
/// and stops tracking it in that device's allocation set.
pub fn free_v2(ptr: *mut c_void) -> Result<(), CUresult> {
GlobalState::lock_current_context(|ctx| {
// Mutable access is needed below to edit `dev.allocations`.
let dev = unsafe { &mut *ctx.device };
// NOTE(review): the next two lines both release the same `ptr`
// (`mem_blocking_free`, then `clSVMFree`). This looks like the before and
// after lines of a change shown together -- confirm which one should remain.
unsafe { dev.ocl_ext.mem_blocking_free(&dev.ocl_context, ptr)? };
unsafe { ocl_core::ffi::clSVMFree(dev.ocl_context.as_ptr(), ptr) };
// Forget the pointer so it is no longer listed among the device's allocations.
dev.allocations.remove(&ptr);
Ok(())
})?
}
@ -57,16 +82,22 @@ pub(crate) fn set_d32_v2(dst: *mut c_void, mut ui: u32, n: usize) -> Result<(),
let dev = unsafe { &*(*stream_data.context).device };
let queue = stream_data.cmd_list.as_ref().unwrap();
let pattern_size = mem::size_of_val(&ui);
let event = unsafe {
dev.ocl_ext.enqueue_memfill(
queue,
let mut event = ptr::null_mut();
let err = unsafe {
ocl_core::ffi::clEnqueueSVMMemFill(
queue.as_ptr(),
dst,
&ui as *const _ as *const _,
pattern_size,
pattern_size * n,
)?
0,
ptr::null(),
&mut event,
)
};
ocl_core::wait_for_event(&event)?;
assert_eq!(err, 0);
let err = unsafe { ocl_core::ffi::clWaitForEvents(1, &mut event) };
assert_eq!(err, 0);
Ok(())
})?
}
@ -76,16 +107,22 @@ pub(crate) fn set_d8_v2(dst: *mut c_void, mut uc: u8, n: usize) -> Result<(), CU
let dev = unsafe { &*(*stream_data.context).device };
let queue = stream_data.cmd_list.as_ref().unwrap();
let pattern_size = mem::size_of_val(&uc);
let event = unsafe {
dev.ocl_ext.enqueue_memfill(
queue,
let mut event = ptr::null_mut();
let err = unsafe {
ocl_core::ffi::clEnqueueSVMMemFill(
queue.as_ptr(),
dst,
&uc as *const _ as *const _,
pattern_size,
pattern_size * n,
)?
0,
ptr::null(),
&mut event,
)
};
ocl_core::wait_for_event(&event)?;
assert_eq!(err, 0);
let err = unsafe { ocl_core::ffi::clWaitForEvents(1, &mut event) };
assert_eq!(err, 0);
Ok(())
})?
}