Support more property queries

This commit is contained in:
Andrzej Janik 2020-11-14 15:48:05 +01:00
parent a6765baa3a
commit eac5fbd806
7 changed files with 103 additions and 16 deletions

View file

@ -833,6 +833,12 @@ impl<'a> Kernel<'a> {
check!(sys::zeKernelSetGroupSize(self.0, x, y, z));
Ok(())
}
/// Fetches the Level Zero properties of this kernel.
///
/// A zero-filled `ze_kernel_properties_t` is boxed, passed to
/// `zeKernelGetProperties` to be filled in, and returned to the caller.
/// The driver status goes through `check!` (defined elsewhere; presumably it
/// returns early with the error status on failure).
pub fn get_properties(&self) -> Result<Box<sys::ze_kernel_properties_t>> {
    let blank = unsafe { mem::zeroed::<sys::ze_kernel_properties_t>() };
    let mut boxed = Box::new(blank);
    let raw: *mut sys::ze_kernel_properties_t = &mut *boxed;
    check!(sys::zeKernelGetProperties(self.0, raw));
    Ok(boxed)
}
}
impl<'a> Drop for Kernel<'a> {

View file

@ -2365,7 +2365,7 @@ pub extern "C" fn cuCtxGetFlags(flags: *mut ::std::os::raw::c_uint) -> CUresult
#[cfg_attr(not(test), no_mangle)]
pub extern "C" fn cuCtxSynchronize() -> CUresult {
r#impl::unimplemented()
r#impl::context::synchronize()
}
#[cfg_attr(not(test), no_mangle)]
@ -3569,7 +3569,7 @@ pub extern "C" fn cuFuncGetAttribute(
attrib: CUfunction_attribute,
hfunc: CUfunction,
) -> CUresult {
r#impl::unimplemented()
r#impl::function::get_attribute(pi, attrib, hfunc.decuda()).encuda()
}
#[cfg_attr(not(test), no_mangle)]

View file

@ -249,6 +249,11 @@ pub fn detach(pctx: *mut Context) -> Result<(), CUresult> {
})?
}
/// Context synchronization entry point (the `cuCtxSynchronize` wrapper forwards here).
///
/// Currently performs no waiting and unconditionally reports
/// `CUresult::CUDA_SUCCESS`.
pub(crate) fn synchronize() -> CUresult {
// TODO: change the implementation once we do async stream operations
CUresult::CUDA_SUCCESS
}
#[cfg(test)]
mod test {
use super::super::test::CudaDriverFns;

View file

@ -96,6 +96,14 @@ impl Device {
pub fn late_init(&mut self) {
self.primary_context.as_option_mut().unwrap().device = self as *mut _;
}
/// Returns the largest sub-group (SIMD) width reported in the device's
/// compute properties.
///
/// Only the first `numSubGroupSizes` entries of `subGroupSizes` are
/// inspected. The count is applied with `take`, so a count larger than the
/// array cannot cause an out-of-bounds panic.
///
/// # Errors
///
/// Propagates any failure from `get_compute_properties`. If the driver
/// reports no sub-group sizes at all, returns `ZE_RESULT_ERROR_UNKNOWN`
/// instead of panicking, since this is reached from `extern "C"` entry
/// points where unwinding must be avoided.
/// NOTE(review): assumes `l0::Result<T>` uses `l0::sys::ze_result_t` as its
/// error type and that the bindings expose `ZE_RESULT_ERROR_UNKNOWN` — confirm.
fn get_max_simd(&mut self) -> l0::Result<u32> {
    let props = self.get_compute_properties()?;
    let reported = props.numSubGroupSizes as usize;
    props
        .subGroupSizes
        .iter()
        .take(reported)
        .copied()
        .max()
        .ok_or(l0::sys::ze_result_t::ZE_RESULT_ERROR_UNKNOWN)
}
}
pub fn init(driver: &l0::Driver) -> Result<Vec<Device>, CUresult> {
@ -210,14 +218,32 @@ pub fn get_attribute(
Ok::<_, l0::sys::ze_result_t>(props.maxHardwareContexts as i32)
})??
}
// Streaming Multiprocessor corresponds roughly to a sub-slice (thread group can't cross either)
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT => {
GlobalState::lock_device(dev_idx, |dev| {
let props = dev.get_properties()?;
Ok::<_, l0::sys::ze_result_t>((props.numSlices * props.numSubslicesPerSlice) as i32)
})??
}
// I honestly don't know how to answer this query
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR => {
GlobalState::lock_device(dev_idx, |dev| {
let max_simd = dev.get_max_simd()?;
let props = dev.get_properties()?;
Ok::<_, l0::sys::ze_result_t>(
(props.numSlices * props.numSubslicesPerSlice * props.numEUsPerSubslice) as i32,
(props.numEUsPerSubslice * props.numThreadsPerEU * max_simd) as i32,
)
})??
}
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK => {
GlobalState::lock_device(dev_idx, |dev| {
let props = dev.get_compute_properties()?;
Ok::<_, l0::sys::ze_result_t>(cmp::min(
i32::max_value() as u32,
props.maxTotalGroupSize,
) as i32)
})??
}
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH => {
GlobalState::lock_device(dev_idx, |dev| {
let props = dev.get_image_properties()?;
@ -230,7 +256,7 @@ pub fn get_attribute(
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X => {
GlobalState::lock_device(dev_idx, |dev| {
let props = dev.get_compute_properties()?;
Ok::<_, l0::sys::ze_result_t>(cmp::max(
Ok::<_, l0::sys::ze_result_t>(cmp::min(
i32::max_value() as u32,
props.maxGroupCountX,
) as i32)
@ -239,7 +265,7 @@ pub fn get_attribute(
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y => {
GlobalState::lock_device(dev_idx, |dev| {
let props = dev.get_compute_properties()?;
Ok::<_, l0::sys::ze_result_t>(cmp::max(
Ok::<_, l0::sys::ze_result_t>(cmp::min(
i32::max_value() as u32,
props.maxGroupCountY,
) as i32)
@ -248,7 +274,7 @@ pub fn get_attribute(
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z => {
GlobalState::lock_device(dev_idx, |dev| {
let props = dev.get_compute_properties()?;
Ok::<_, l0::sys::ze_result_t>(cmp::max(
Ok::<_, l0::sys::ze_result_t>(cmp::min(
i32::max_value() as u32,
props.maxGroupCountZ,
) as i32)
@ -258,7 +284,7 @@ pub fn get_attribute(
GlobalState::lock_device(dev_idx, |dev| {
let props = dev.get_compute_properties()?;
Ok::<_, l0::sys::ze_result_t>(
cmp::max(i32::max_value() as u32, props.maxGroupSizeX) as i32,
cmp::min(i32::max_value() as u32, props.maxGroupSizeX) as i32,
)
})??
}
@ -266,7 +292,7 @@ pub fn get_attribute(
GlobalState::lock_device(dev_idx, |dev| {
let props = dev.get_compute_properties()?;
Ok::<_, l0::sys::ze_result_t>(
cmp::max(i32::max_value() as u32, props.maxGroupSizeY) as i32,
cmp::min(i32::max_value() as u32, props.maxGroupSizeY) as i32,
)
})??
}
@ -274,19 +300,19 @@ pub fn get_attribute(
GlobalState::lock_device(dev_idx, |dev| {
let props = dev.get_compute_properties()?;
Ok::<_, l0::sys::ze_result_t>(
cmp::max(i32::max_value() as u32, props.maxGroupSizeZ) as i32,
cmp::min(i32::max_value() as u32, props.maxGroupSizeZ) as i32,
)
})??
}
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK => {
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK => {
GlobalState::lock_device(dev_idx, |dev| {
let props = dev.get_compute_properties()?;
Ok::<_, l0::sys::ze_result_t>(cmp::max(
i32::max_value() as u32,
props.maxTotalGroupSize,
) as i32)
Ok::<_, l0::sys::ze_result_t>(props.maxSharedLocalMemory as i32)
})??
}
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_WARP_SIZE => {
GlobalState::lock_device(dev_idx, |dev| Ok::<_, CUresult>(dev.get_max_simd()? as i32))??
}
_ => {
// TODO: support more attributes for CUDA runtime
/*

View file

@ -1,7 +1,9 @@
use ::std::os::raw::{c_uint, c_void};
use std::ptr;
use std::{hint, ptr};
use super::{CUresult, GlobalState, HasLivenessCookie, LiveCheck, stream::Stream};
use crate::cuda::CUfunction_attribute;
use super::{stream::Stream, CUresult, GlobalState, HasLivenessCookie, LiveCheck};
pub type Function = LiveCheck<FunctionData>;
@ -23,6 +25,19 @@ pub struct FunctionData {
pub base: l0::Kernel<'static>,
pub arg_size: Vec<usize>,
pub use_shared_mem: bool,
pub properties: Option<Box<l0::sys::ze_kernel_properties_t>>,
}
impl FunctionData {
fn get_properties(&mut self) -> Result<&l0::sys::ze_kernel_properties_t, l0::sys::ze_result_t> {
if let None = self.properties {
self.properties = Some(self.base.get_properties()?)
}
match self.properties {
Some(ref props) => Ok(props.as_ref()),
None => unsafe { hint::unreachable_unchecked() },
}
}
}
pub fn launch_kernel(
@ -74,3 +89,24 @@ pub fn launch_kernel(
Ok(())
})?
}
pub(crate) fn get_attribute(
pi: *mut i32,
attrib: CUfunction_attribute,
func: *mut Function,
) -> Result<(), CUresult> {
if pi == ptr::null_mut() || func == ptr::null_mut() {
return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
}
match attrib {
CUfunction_attribute::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK => {
let max_threads = GlobalState::lock_function(func, |func| {
let props = func.get_properties()?;
Ok::<_, CUresult>(props.maxSubgroupSize * props.maxNumSubgroups)
})??;
unsafe { *pi = max_threads as i32 };
Ok(())
}
_ => Err(CUresult::CUDA_ERROR_NOT_SUPPORTED),
}
}

View file

@ -268,6 +268,19 @@ impl GlobalState {
})?
}
}
fn lock_function<T>(
func: *mut function::Function,
f: impl FnOnce(&mut function::FunctionData) -> T,
) -> Result<T, CUresult> {
if func == ptr::null_mut() {
return Err(CUresult::CUDA_ERROR_INVALID_HANDLE);
}
Self::lock(|_| {
let func = unsafe { &mut *func }.as_result_mut()?;
Ok(f(func))
})?
}
}
// TODO: implement

View file

@ -131,6 +131,7 @@ pub fn get_function(
base: kernel,
arg_size: kernel_info.arguments_sizes.clone(),
use_shared_mem: kernel_info.uses_shared_mem,
properties: None,
})))
}
};