mirror of
https://github.com/vosen/ZLUDA.git
synced 2025-04-28 13:37:57 +03:00
Support more property queries
This commit is contained in:
parent
a6765baa3a
commit
eac5fbd806
7 changed files with 103 additions and 16 deletions
|
@ -833,6 +833,12 @@ impl<'a> Kernel<'a> {
|
|||
check!(sys::zeKernelSetGroupSize(self.0, x, y, z));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn get_properties(&self) -> Result<Box<sys::ze_kernel_properties_t>> {
|
||||
let mut props = Box::new(unsafe { mem::zeroed::<sys::ze_kernel_properties_t>() });
|
||||
check!(sys::zeKernelGetProperties(self.0, props.as_mut() as *mut _));
|
||||
Ok(props)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Drop for Kernel<'a> {
|
||||
|
|
|
@ -2365,7 +2365,7 @@ pub extern "C" fn cuCtxGetFlags(flags: *mut ::std::os::raw::c_uint) -> CUresult
|
|||
|
||||
#[cfg_attr(not(test), no_mangle)]
|
||||
pub extern "C" fn cuCtxSynchronize() -> CUresult {
|
||||
r#impl::unimplemented()
|
||||
r#impl::context::synchronize()
|
||||
}
|
||||
|
||||
#[cfg_attr(not(test), no_mangle)]
|
||||
|
@ -3569,7 +3569,7 @@ pub extern "C" fn cuFuncGetAttribute(
|
|||
attrib: CUfunction_attribute,
|
||||
hfunc: CUfunction,
|
||||
) -> CUresult {
|
||||
r#impl::unimplemented()
|
||||
r#impl::function::get_attribute(pi, attrib, hfunc.decuda()).encuda()
|
||||
}
|
||||
|
||||
#[cfg_attr(not(test), no_mangle)]
|
||||
|
|
|
@ -249,6 +249,11 @@ pub fn detach(pctx: *mut Context) -> Result<(), CUresult> {
|
|||
})?
|
||||
}
|
||||
|
||||
pub(crate) fn synchronize() -> CUresult {
|
||||
// TODO: change the implementation once we do async stream operations
|
||||
CUresult::CUDA_SUCCESS
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::super::test::CudaDriverFns;
|
||||
|
|
|
@ -96,6 +96,14 @@ impl Device {
|
|||
pub fn late_init(&mut self) {
|
||||
self.primary_context.as_option_mut().unwrap().device = self as *mut _;
|
||||
}
|
||||
|
||||
/// Returns the largest sub-group size listed in the device's compute properties.
///
/// Only the first `numSubGroupSizes` entries of `subGroupSizes` are considered.
///
/// # Errors
/// Propagates any failure from `get_compute_properties`. Returns
/// `ZE_RESULT_ERROR_UNKNOWN` when the device lists no sub-group sizes at all,
/// rather than panicking (this is reached from `extern "C"` entry points,
/// where unwinding must be avoided).
fn get_max_simd(&mut self) -> l0::Result<u32> {
    let props = self.get_compute_properties()?;
    // `take` instead of slicing: a count larger than the array cannot index
    // out of bounds, it simply stops at the end of the array.
    props
        .subGroupSizes
        .iter()
        .take(props.numSubGroupSizes as usize)
        .copied()
        .max()
        .ok_or(l0::sys::ze_result_t::ZE_RESULT_ERROR_UNKNOWN)
}
|
||||
}
|
||||
|
||||
pub fn init(driver: &l0::Driver) -> Result<Vec<Device>, CUresult> {
|
||||
|
@ -210,14 +218,32 @@ pub fn get_attribute(
|
|||
Ok::<_, l0::sys::ze_result_t>(props.maxHardwareContexts as i32)
|
||||
})??
|
||||
}
|
||||
// Streaming Multiprocessor corresponds roughly to a sub-slice (thread group can't cross either)
|
||||
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT => {
|
||||
GlobalState::lock_device(dev_idx, |dev| {
|
||||
let props = dev.get_properties()?;
|
||||
Ok::<_, l0::sys::ze_result_t>((props.numSlices * props.numSubslicesPerSlice) as i32)
|
||||
})??
|
||||
}
|
||||
// I honestly don't know how to answer this query
|
||||
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR => {
|
||||
GlobalState::lock_device(dev_idx, |dev| {
|
||||
let max_simd = dev.get_max_simd()?;
|
||||
let props = dev.get_properties()?;
|
||||
Ok::<_, l0::sys::ze_result_t>(
|
||||
(props.numSlices * props.numSubslicesPerSlice * props.numEUsPerSubslice) as i32,
|
||||
(props.numEUsPerSubslice * props.numThreadsPerEU * max_simd) as i32,
|
||||
)
|
||||
})??
|
||||
}
|
||||
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK => {
|
||||
GlobalState::lock_device(dev_idx, |dev| {
|
||||
let props = dev.get_compute_properties()?;
|
||||
Ok::<_, l0::sys::ze_result_t>(cmp::min(
|
||||
i32::max_value() as u32,
|
||||
props.maxTotalGroupSize,
|
||||
) as i32)
|
||||
})??
|
||||
}
|
||||
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH => {
|
||||
GlobalState::lock_device(dev_idx, |dev| {
|
||||
let props = dev.get_image_properties()?;
|
||||
|
@ -230,7 +256,7 @@ pub fn get_attribute(
|
|||
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X => {
|
||||
GlobalState::lock_device(dev_idx, |dev| {
|
||||
let props = dev.get_compute_properties()?;
|
||||
Ok::<_, l0::sys::ze_result_t>(cmp::max(
|
||||
Ok::<_, l0::sys::ze_result_t>(cmp::min(
|
||||
i32::max_value() as u32,
|
||||
props.maxGroupCountX,
|
||||
) as i32)
|
||||
|
@ -239,7 +265,7 @@ pub fn get_attribute(
|
|||
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y => {
|
||||
GlobalState::lock_device(dev_idx, |dev| {
|
||||
let props = dev.get_compute_properties()?;
|
||||
Ok::<_, l0::sys::ze_result_t>(cmp::max(
|
||||
Ok::<_, l0::sys::ze_result_t>(cmp::min(
|
||||
i32::max_value() as u32,
|
||||
props.maxGroupCountY,
|
||||
) as i32)
|
||||
|
@ -248,7 +274,7 @@ pub fn get_attribute(
|
|||
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z => {
|
||||
GlobalState::lock_device(dev_idx, |dev| {
|
||||
let props = dev.get_compute_properties()?;
|
||||
Ok::<_, l0::sys::ze_result_t>(cmp::max(
|
||||
Ok::<_, l0::sys::ze_result_t>(cmp::min(
|
||||
i32::max_value() as u32,
|
||||
props.maxGroupCountZ,
|
||||
) as i32)
|
||||
|
@ -258,7 +284,7 @@ pub fn get_attribute(
|
|||
GlobalState::lock_device(dev_idx, |dev| {
|
||||
let props = dev.get_compute_properties()?;
|
||||
Ok::<_, l0::sys::ze_result_t>(
|
||||
cmp::max(i32::max_value() as u32, props.maxGroupSizeX) as i32,
|
||||
cmp::min(i32::max_value() as u32, props.maxGroupSizeX) as i32,
|
||||
)
|
||||
})??
|
||||
}
|
||||
|
@ -266,7 +292,7 @@ pub fn get_attribute(
|
|||
GlobalState::lock_device(dev_idx, |dev| {
|
||||
let props = dev.get_compute_properties()?;
|
||||
Ok::<_, l0::sys::ze_result_t>(
|
||||
cmp::max(i32::max_value() as u32, props.maxGroupSizeY) as i32,
|
||||
cmp::min(i32::max_value() as u32, props.maxGroupSizeY) as i32,
|
||||
)
|
||||
})??
|
||||
}
|
||||
|
@ -274,19 +300,19 @@ pub fn get_attribute(
|
|||
GlobalState::lock_device(dev_idx, |dev| {
|
||||
let props = dev.get_compute_properties()?;
|
||||
Ok::<_, l0::sys::ze_result_t>(
|
||||
cmp::max(i32::max_value() as u32, props.maxGroupSizeZ) as i32,
|
||||
cmp::min(i32::max_value() as u32, props.maxGroupSizeZ) as i32,
|
||||
)
|
||||
})??
|
||||
}
|
||||
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK => {
|
||||
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK => {
|
||||
GlobalState::lock_device(dev_idx, |dev| {
|
||||
let props = dev.get_compute_properties()?;
|
||||
Ok::<_, l0::sys::ze_result_t>(cmp::max(
|
||||
i32::max_value() as u32,
|
||||
props.maxTotalGroupSize,
|
||||
) as i32)
|
||||
Ok::<_, l0::sys::ze_result_t>(props.maxSharedLocalMemory as i32)
|
||||
})??
|
||||
}
|
||||
CUdevice_attribute::CU_DEVICE_ATTRIBUTE_WARP_SIZE => {
|
||||
GlobalState::lock_device(dev_idx, |dev| Ok::<_, CUresult>(dev.get_max_simd()? as i32))??
|
||||
}
|
||||
_ => {
|
||||
// TODO: support more attributes for CUDA runtime
|
||||
/*
|
||||
|
|
|
@ -1,7 +1,9 @@
|
|||
use ::std::os::raw::{c_uint, c_void};
|
||||
use std::ptr;
|
||||
use std::{hint, ptr};
|
||||
|
||||
use super::{CUresult, GlobalState, HasLivenessCookie, LiveCheck, stream::Stream};
|
||||
use crate::cuda::CUfunction_attribute;
|
||||
|
||||
use super::{stream::Stream, CUresult, GlobalState, HasLivenessCookie, LiveCheck};
|
||||
|
||||
pub type Function = LiveCheck<FunctionData>;
|
||||
|
||||
|
@ -23,6 +25,19 @@ pub struct FunctionData {
|
|||
pub base: l0::Kernel<'static>,
|
||||
pub arg_size: Vec<usize>,
|
||||
pub use_shared_mem: bool,
|
||||
pub properties: Option<Box<l0::sys::ze_kernel_properties_t>>,
|
||||
}
|
||||
|
||||
impl FunctionData {
|
||||
fn get_properties(&mut self) -> Result<&l0::sys::ze_kernel_properties_t, l0::sys::ze_result_t> {
|
||||
if let None = self.properties {
|
||||
self.properties = Some(self.base.get_properties()?)
|
||||
}
|
||||
match self.properties {
|
||||
Some(ref props) => Ok(props.as_ref()),
|
||||
None => unsafe { hint::unreachable_unchecked() },
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn launch_kernel(
|
||||
|
@ -74,3 +89,24 @@ pub fn launch_kernel(
|
|||
Ok(())
|
||||
})?
|
||||
}
|
||||
|
||||
pub(crate) fn get_attribute(
|
||||
pi: *mut i32,
|
||||
attrib: CUfunction_attribute,
|
||||
func: *mut Function,
|
||||
) -> Result<(), CUresult> {
|
||||
if pi == ptr::null_mut() || func == ptr::null_mut() {
|
||||
return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
|
||||
}
|
||||
match attrib {
|
||||
CUfunction_attribute::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK => {
|
||||
let max_threads = GlobalState::lock_function(func, |func| {
|
||||
let props = func.get_properties()?;
|
||||
Ok::<_, CUresult>(props.maxSubgroupSize * props.maxNumSubgroups)
|
||||
})??;
|
||||
unsafe { *pi = max_threads as i32 };
|
||||
Ok(())
|
||||
}
|
||||
_ => Err(CUresult::CUDA_ERROR_NOT_SUPPORTED),
|
||||
}
|
||||
}
|
||||
|
|
|
@ -268,6 +268,19 @@ impl GlobalState {
|
|||
})?
|
||||
}
|
||||
}
|
||||
|
||||
fn lock_function<T>(
|
||||
func: *mut function::Function,
|
||||
f: impl FnOnce(&mut function::FunctionData) -> T,
|
||||
) -> Result<T, CUresult> {
|
||||
if func == ptr::null_mut() {
|
||||
return Err(CUresult::CUDA_ERROR_INVALID_HANDLE);
|
||||
}
|
||||
Self::lock(|_| {
|
||||
let func = unsafe { &mut *func }.as_result_mut()?;
|
||||
Ok(f(func))
|
||||
})?
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: implement
|
||||
|
|
|
@ -131,6 +131,7 @@ pub fn get_function(
|
|||
base: kernel,
|
||||
arg_size: kernel_info.arguments_sizes.clone(),
|
||||
use_shared_mem: kernel_info.uses_shared_mem,
|
||||
properties: None,
|
||||
})))
|
||||
}
|
||||
};
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue