diff --git a/Cargo.lock b/Cargo.lock index 804ef4f..5726bb3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -807,7 +807,6 @@ dependencies = [ "ptx_parser", "quick-error", "rustc-hash 2.0.0", - "smallvec", "strum", "strum_macros", "tempfile", diff --git a/ptx/Cargo.toml b/ptx/Cargo.toml index 2f645e7..2876539 100644 --- a/ptx/Cargo.toml +++ b/ptx/Cargo.toml @@ -20,7 +20,6 @@ strum_macros = "0.26" petgraph = "0.7.1" microlp = "0.2.10" int-enum = "1.1" -smallvec = "1.13" unwrap_or = "1.0.1" [dev-dependencies] diff --git a/ptx/src/pass/emit_llvm.rs b/ptx/src/pass/emit_llvm.rs index d7cbbc5..5a5dd80 100644 --- a/ptx/src/pass/emit_llvm.rs +++ b/ptx/src/pass/emit_llvm.rs @@ -452,22 +452,6 @@ impl<'a, 'input> ModuleEmitContext<'a, 'input> { } } -fn fun_name( - method: Function2, SpirvWord>, - method_emitter: &mut MethodEmitContext<'_>, -) -> Result<(), TranslateError> { - Ok(if method.is_kernel { - if method.rounding_mode_f32 != ast::RoundingMode::NearestEven - || method.rounding_mode_f16f64 != ast::RoundingMode::NearestEven - { - method_emitter.emit_set_mode(ModeRegister::Rounding { - f32: method.rounding_mode_f32, - f16f64: method.rounding_mode_f16f64, - })?; - } - }) -} - fn llvm_ftz(ftz: bool) -> &'static str { if ftz { "preserve-sign" diff --git a/ptx/src/pass/insert_ftz_control/call_with_mode.ptx b/ptx/src/pass/instruction_mode_to_global_mode/call_with_mode.ptx similarity index 100% rename from ptx/src/pass/insert_ftz_control/call_with_mode.ptx rename to ptx/src/pass/instruction_mode_to_global_mode/call_with_mode.ptx diff --git a/ptx/src/pass/insert_ftz_control/fold_denormal.ptx b/ptx/src/pass/instruction_mode_to_global_mode/fold_denormal.ptx similarity index 100% rename from ptx/src/pass/insert_ftz_control/fold_denormal.ptx rename to ptx/src/pass/instruction_mode_to_global_mode/fold_denormal.ptx diff --git a/ptx/src/pass/insert_ftz_control/mod.rs b/ptx/src/pass/instruction_mode_to_global_mode/mod.rs similarity index 77% rename from ptx/src/pass/insert_ftz_control/mod.rs 
rename to ptx/src/pass/instruction_mode_to_global_mode/mod.rs index dfaafe3..c2b9672 100644 --- a/ptx/src/pass/insert_ftz_control/mod.rs +++ b/ptx/src/pass/instruction_mode_to_global_mode/mod.rs @@ -1,6 +1,7 @@ use super::BrachCondition; use super::Directive2; use super::Function2; +use super::GlobalStringIdentResolver2; use super::ModeRegister; use super::SpirvWord; use super::Statement; @@ -16,7 +17,6 @@ use petgraph::Graph; use ptx_parser as ast; use rustc_hash::FxHashMap; use rustc_hash::FxHashSet; -use smallvec::SmallVec; use std::hash::Hash; use std::iter; use std::mem; @@ -115,108 +115,6 @@ struct ResolvedInstructionModes { rounding_f16f64: Resolved, } -/* -struct ExitInstructionModes { - denormal_f32: Resolved, - denormal_f16f64: Resolved, - rounding_f32: Resolved, - rounding_f16f64: Resolved, -} - -impl ExitInstructionModes { - fn from_node( - denormal: &TwinModeInsertions, - rounding: &TwinModeInsertions, - Node { - label: ret_block_name, - denormal_f32, - denormal_f16f64, - rounding_f32, - rounding_f16f64, - }: &Node, - ) -> Result { - let denormal_entry = &denormal.basic_blocks; - let rounding_entry = &rounding.basic_blocks; - let denormal_f32 = match denormal_f32.exit { - Some(ExtendedMode::Entry(kernel)) => Resolved::Value( - denormal_entry - .get(&kernel) - .ok_or_else(error_unreachable)? - .twin_mode - .ok_or_else(error_unreachable)? - .f32 - .to_ftz(), - ), - Some(ExtendedMode::BasicBlock(value)) => Resolved::Value(value.to_ftz()), - None => denormal_entry - .get(ret_block_name) - .ok_or_else(error_unreachable)? - .twin_mode - .map(|m| m.f32.to_ftz()), - }; - /* - let denormal_f16f64 = match denormal_f16f64.exit { - None => denormal_entry - .get(ret_block_name) - .ok_or_else(error_unreachable)? - .twin_mode - .map(|m| m.f16f64.to_ftz()), - Some(ExtendedMode::Entry(kernel)) => Some( - denormal_entry - .get(&kernel) - .ok_or_else(error_unreachable)? 
- .twin_mode - .unwrap() - .f16f64 - .to_ftz(), - ), - Some(ExtendedMode::BasicBlock(value)) => Some(value.to_ftz()), - }; - let rounding_f32 = match rounding_f32.exit { - None => rounding_entry - .get(ret_block_name) - .ok_or_else(error_unreachable)? - .twin_mode - .map(|m| m.f32.to_ast()), - Some(ExtendedMode::Entry(kernel)) => Some( - rounding_entry - .get(&kernel) - .ok_or_else(error_unreachable)? - .twin_mode - .unwrap() - .f32 - .to_ast(), - ), - Some(ExtendedMode::BasicBlock(value)) => Some(value.to_ast()), - }; - let rounding_f16f64 = match rounding_f16f64.exit { - None => rounding_entry - .get(ret_block_name) - .ok_or_else(error_unreachable)? - .twin_mode - .map(|m| m.f16f64.to_ast()), - Some(ExtendedMode::Entry(kernel)) => Some( - rounding_entry - .get(&kernel) - .ok_or_else(error_unreachable)? - .twin_mode - .unwrap() - .f16f64 - .to_ast(), - ), - Some(ExtendedMode::BasicBlock(value)) => Some(value.to_ast()), - }; - */ - Ok(Self { - denormal_f32, - denormal_f16f64, - rounding_f32, - rounding_f16f64, - }) - } -} - */ - impl InstructionModes { fn fold_into(self, entry: &mut Self, exit: &mut Self) { fn set_if_none(source: &mut Option, value: Option) { @@ -405,6 +303,15 @@ impl ControlFlowGraph { node.rounding_f16f64.exit = exit.rounding_f16f64.map(ExtendedMode::BasicBlock); } + // Our control flow graph expresses function calls as edges in the graph. + // While building the graph it's always possible to create the edge from + // caller basic block to a function, but it's impossible to construct an + // edge from the function return basic block to after-call basic block in + // caller (the function might have been just a declaration for now). 
+ // That's why we collect: + // * Which basic blocks does a function return to + // * What are the function's return basic blocks + // and then, after visiting all functions, we add the missing edges here fn fixup_function_calls(&mut self) -> Result<(), TranslateError> { for (fn_, follow_on_labels) in self.call_returns.iter() { let connecting_bb = match self.functions_rets.get(fn_) { @@ -417,34 +324,25 @@ impl ControlFlowGraph { } } Ok(()) - /* - for (function, source) in self.functions_rets.iter() { - for target in self - .call_returns - .get(function) - .iter() - .map(|vec| vec.iter()) - .flatten() - .copied() - { - self.graph.add_edge(*source, target, ()); - } - } - */ } } struct ResolvedControlFlowGraph { - entry_points: FxHashMap, basic_blocks: FxHashMap, - // map function -> return label - call_returns: FxHashMap>, // map function -> return basic block functions_rets: FxHashMap, graph: Graph, } impl ResolvedControlFlowGraph { + // This function takes the initial control flow graph. Initial control flow + // graph only has mode values for basic blocks if any instruction in the + // given basic block requires a mode. All the other basic blocks have no + // value. This pass resolves the values for all basic blocks. If a basic + // block sets no value and there are multiple incoming edges from + // basic blocks with different values then the value is set to a special + // value "Conflict". 
+ // After this pass every basic block either has a concrete value or "Conflict" fn new( cfg: ControlFlowGraph, f32_denormal_kernels: &FxHashMap, @@ -626,9 +524,7 @@ impl ResolvedControlFlowGraph { Err(error_unreachable()) } else { Ok(Self { - entry_points: cfg.entry_points, basic_blocks: cfg.basic_blocks, - call_returns: cfg.call_returns, functions_rets: cfg.functions_rets, graph, }) @@ -706,17 +602,111 @@ impl Node { } } -trait EnumTuple { - const LENGTH: usize; - - fn get(&self, x: usize) -> u8; - fn get_mut(&mut self, x: usize) -> &mut u8; +// This pass converts instruction-scoped modes (denormal, rounding) in PTX +// to globally-scoped modes as expected by AMD GPUs. +// As a simplified example this pass converts this instruction: +// add.ftz.rn.f32 %r1, %r2, %r3; +// to: +// set_ftz_mode true; +// set_rnd_mode rn; +// add.ftz.rn.f32 %r1, %r2, %r3; +pub(crate) fn run<'input>( + flat_resolver: &mut GlobalStringIdentResolver2<'input>, + directives: Vec, SpirvWord>>, +) -> Result, SpirvWord>>, TranslateError> { + let cfg = create_control_flow_graph(&directives)?; + let (denormal_f32, denormal_f16f64, rounding_f32, rounding_f16f64) = + compute_minimal_mode_insertions(&cfg); + let temp = compute_full_mode_insertions( + flat_resolver, + &directives, + cfg, + denormal_f32, + denormal_f16f64, + rounding_f32, + rounding_f16f64, + )?; + apply_global_mode_controls(directives, temp) } -pub(crate) fn run<'input>( - flat_resolver: &mut super::GlobalStringIdentResolver2<'input>, - directives: Vec, super::SpirvWord>>, -) -> Result, SpirvWord>>, TranslateError> { +// For every basic block this pass computes: +// - Name of mode prologue basic blocks. Mode prologue is a basic block which +// contains single instruction that sets mode to the desired value. It will +// be later inserted just before the basic block and all jumps that require +// mode change will go through this basic block +// - Entry mode: what is the mode for both f32 and f16f64 at the first instruction. 
+// This will be used when emitting instructions in the basic block. When we +// emit an instruction we get its modes, check if they are different and if so +// decide: do we emit new mode set statement or we fold into previous mode set. +// We don't need to compute exit mode for every basic block because this will be +// computed naturally when emitting instructions in a basic block. +// Only exception is exit mode for returning (containing instruction `ret;`) +// basic blocks for functions. +// We need this information to handle call instructions correctly. +fn compute_full_mode_insertions( + flat_resolver: &mut GlobalStringIdentResolver2, + directives: &Vec, SpirvWord>>, + cfg: ControlFlowGraph, + denormal_f32: MandatoryModeInsertions, + denormal_f16f64: MandatoryModeInsertions, + rounding_f32: MandatoryModeInsertions, + rounding_f16f64: MandatoryModeInsertions, +) -> Result { + let cfg = ResolvedControlFlowGraph::new( + cfg, + &denormal_f32.kernels, + &denormal_f16f64.kernels, + &rounding_f32.kernels, + &rounding_f16f64.kernels, + )?; + join_modes( + flat_resolver, + directives, + cfg, + denormal_f32, + denormal_f16f64, + rounding_f32, + rounding_f16f64, + ) +} + +// This function takes the control flow graph and for each global mode computes: +// * Which basic blocks have an incoming edge from at least one basic block with +// different mode. That means that we will later need to insert a mode +// "prologue": an artificial basic block which sets the mode to the desired +// value. All mode-changing edges will be redirected to that basic block +// * What is the initial value for the mode in a kernel. Note that this only +// computes the initial value if the value is observed by a basic block. 
+// For some kernels the initial value does not matter and in that case a later +// pass should use default value +fn compute_minimal_mode_insertions( + cfg: &ControlFlowGraph, +) -> ( + MandatoryModeInsertions, + MandatoryModeInsertions, + MandatoryModeInsertions, + MandatoryModeInsertions, +) { + let rounding_f32 = compute_single_mode_insertions(cfg, |node| node.rounding_f32); + let denormal_f32 = compute_single_mode_insertions(cfg, |node| node.denormal_f32); + let denormal_f16f64 = compute_single_mode_insertions(cfg, |node| node.denormal_f16f64); + let rounding_f16f64 = compute_single_mode_insertions(cfg, |node| node.rounding_f16f64); + let denormal_f32 = + optimize_mode_insertions::(denormal_f32); + let denormal_f16f64 = + optimize_mode_insertions::(denormal_f16f64); + let rounding_f32 = + optimize_mode_insertions::(rounding_f32); + let rounding_f16f64: MandatoryModeInsertions = + optimize_mode_insertions::(rounding_f16f64); + (denormal_f32, denormal_f16f64, rounding_f32, rounding_f16f64) +} + +// This function creates control flow graph for the whole module. 
This control +// flow graph expresses function calls as edges in the control flow graph +fn create_control_flow_graph( + directives: &Vec, SpirvWord>>, +) -> Result { let mut cfg = ControlFlowGraph::new(); for directive in directives.iter() { match directive { @@ -770,65 +760,11 @@ pub(crate) fn run<'input>( _ => {} } } - //println!( - // "{:?}", - // petgraph::dot::Dot::with_config(&cfg.graph, &[petgraph::dot::Config::EdgeNoLabel]) - //); cfg.fixup_function_calls()?; - //println!( - // "{:?}", - // petgraph::dot::Dot::with_config(&cfg.graph, &[petgraph::dot::Config::EdgeNoLabel]) - //); - let rounding_f32 = compute_single_mode(&cfg, |node| node.rounding_f32); - let denormal_f32 = compute_single_mode(&cfg, |node| node.denormal_f32); - let denormal_f16f64 = compute_single_mode(&cfg, |node| node.denormal_f16f64); - let rounding_f16f64 = compute_single_mode(&cfg, |node| node.rounding_f16f64); - let denormal_f32 = optimize::(denormal_f32); - let denormal_f16f64 = optimize::(denormal_f16f64); - let rounding_f32 = optimize::(rounding_f32); - let rounding_f16f64: MandatoryModeInsertions = - optimize::(rounding_f16f64); - let cfg = ResolvedControlFlowGraph::new( - cfg, - &denormal_f32.kernels, - &denormal_f16f64.kernels, - &rounding_f32.kernels, - &rounding_f16f64.kernels, - )?; - let temp = join_modes2( - flat_resolver, - &directives, - cfg, - denormal_f32, - denormal_f16f64, - rounding_f32, - rounding_f16f64, - )?; - - /* - let denormal = join_modes( - flat_resolver, - &cfg, - denormal_f32, - |node| node.denormal_f32, - denormal_f16f64, - |node| node.denormal_f16f64, - )?; - let rounding = join_modes( - flat_resolver, - &cfg, - rounding_f32, - |node| node.rounding_f32, - rounding_f16f64, - |node| node.rounding_f16f64, - )?; - let all_modes = FullModeInsertion::new(flat_resolver, denormal, rounding)?; - */ - let directives = insert_mode_control(flat_resolver, directives, temp)?; - Ok(directives) + Ok(cfg) } -fn join_modes2( +fn join_modes( flat_resolver: &mut 
super::GlobalStringIdentResolver2, directives: &Vec, super::SpirvWord>>, cfg: ResolvedControlFlowGraph, @@ -836,7 +772,7 @@ fn join_modes2( mandatory_denormal_f16f64: MandatoryModeInsertions, mandatory_rounding_f32: MandatoryModeInsertions, mandatory_rounding_f16f64: MandatoryModeInsertions, -) -> Result { +) -> Result { let basic_blocks = cfg .graph .node_weights() @@ -892,7 +828,7 @@ fn join_modes2( )) }) .collect::, _>>()?; - let temp = directives + let functions_exit_modes = directives .iter() .filter_map(|directive| match directive { Directive2::Method(Function2 { @@ -933,128 +869,15 @@ fn join_modes2( _ => None, }) .collect::, _>>()?; - let functions_exit_modes = cfg - .functions_rets - .into_iter() - .map(|(bb, node)| { - let weights = cfg.graph.node_weight(node).ok_or_else(error_unreachable)?; - let modes = ResolvedInstructionModes { - denormal_f32: weights.denormal_f32.exit.map(DenormalMode::to_ftz), - denormal_f16f64: weights.denormal_f16f64.exit.map(DenormalMode::to_ftz), - rounding_f32: weights.rounding_f32.exit.map(RoundingMode::to_ast), - rounding_f16f64: weights.rounding_f16f64.exit.map(RoundingMode::to_ast), - }; - Ok((bb, modes)) - }) - .collect::, _>>()?; - Ok(FullModeInsertion2 { + Ok(FullModeInsertion { basic_blocks, - functions_exit_modes: temp, + functions_exit_modes, }) } -// For every basic block this pass computes: -// - Name of mode prologue basic block. Mode prologue is a basic block which -// contains single instruction that sets mode to the desired value. It will -// be later inserted just before the basic block and all jumps that require -// mode change will go through this basic block -// - Entry mode: what is the mode for both f32 and f16f64 at the first instruction. -// This will be used when emiting instructions in the basic block. When we -// emit an instruction we get its modes, check if they are different and if so -// decide: do we emit new mode set statement or we fold into previous mode set. 
-// We don't need to compute exit mode because this will be computed naturally -// when emitting instructions in a basic block. We need exit mode to know if we -// jump directly to the next bb or jump to mode prologue -/* -fn join_modes<'input, T: Eq + PartialEq + Copy + Default>( - flat_resolver: &mut super::GlobalStringIdentResolver2<'input>, - cfg: &ResolvedControlFlowGraph, - f32_insertions: MandatoryModeInsertions, - mut f32_view: impl FnMut(&ResolvedNode) -> ResolvedMode, - f16f64_insertions: MandatoryModeInsertions, - mut f16f64_view: impl FnMut(&ResolvedNode) -> ResolvedMode, -) -> Result, TranslateError> { - let basic_blocks = cfg - .graph - .node_weights() - .map(|basic_block| { - let requires_prologue = f32_insertions.basic_blocks.contains(&basic_block.label) - || f16f64_insertions.basic_blocks.contains(&basic_block.label); - let prologue: Option = if requires_prologue { - Some(flat_resolver.register_unnamed(None)) - } else { - None - }; - let f32 = f32_view(basic_block); - let f16f64 = f16f64_view(basic_block); - let twin_mode = match (f32.entry, f16f64.entry) { - (Resolved::Conflict, Resolved::Conflict) => Resolved::Conflict, - (f32, f16f64) => Resolved::Value(TwinMode { - f32: f32.unwrap_of_default(), - f16f64: f16f64.unwrap_of_default(), - }), - }; - Ok(( - basic_block.label, - BasicBlockEntryState { - prologue, - twin_mode, - }, - )) - }) - .collect::, _>>()?; - Ok(TwinModeInsertions { basic_blocks }) -} - */ - -struct TwinModeInsertions { - basic_blocks: FxHashMap>, -} - -struct FullModeInsertion2 { - basic_blocks: FxHashMap, - functions_exit_modes: FxHashMap, -} - struct FullModeInsertion { basic_blocks: FxHashMap, -} - -impl FullModeInsertion { - fn new( - flat_resolver: &mut super::GlobalStringIdentResolver2, - denormal: TwinModeInsertions, - rounding: TwinModeInsertions, - ) -> Result { - if denormal.basic_blocks.len() != rounding.basic_blocks.len() { - return Err(error_unreachable()); - } - let basic_blocks = denormal - .basic_blocks - 
.into_iter() - .map(|(bb, denormal)| { - let rounding = rounding - .basic_blocks - .get(&bb) - .copied() - .ok_or_else(error_unreachable)?; - let dual_prologue = if denormal.prologue.is_some() && rounding.prologue.is_some() { - Some(flat_resolver.register_unnamed(None)) - } else { - None - }; - Ok(( - bb, - FullBasicBlockEntryState { - dual_prologue, - denormal, - rounding, - }, - )) - }) - .collect::, _>>()?; - Ok(Self { basic_blocks }) - } + functions_exit_modes: FxHashMap, } struct FullBasicBlockEntryState { @@ -1075,20 +898,21 @@ struct TwinMode { f16f64: T, } -fn insert_mode_control( - flat_resolver: &mut super::GlobalStringIdentResolver2, +// This function goes through every method, every basic block, every instruction +// and based on computed information inserts: +// * Instructions that change global mode +// * Insert additional "prelude" basic blocks that sets mode +// * Redirect some jumps to "prelude" basic blocks +fn apply_global_mode_controls( directives: Vec, SpirvWord>>, - global_modes: FullModeInsertion2, + global_modes: FullModeInsertion, ) -> Result, SpirvWord>>, TranslateError> { - let directives_len = directives.len(); directives .into_iter() .map(|directive| { - let mut new_directives = SmallVec::<[_; 4]>::new(); let (mut method, initial_mode) = match directive { Directive2::Variable(..) | Directive2::Method(Function2 { body: None, .. 
}) => { - new_directives.push(directive); - return Ok(new_directives); + return Ok(directive); } Directive2::Method( mut method @ Function2 { @@ -1114,7 +938,7 @@ fn insert_mode_control( (method, initial_mode) } }; - emit_mode_prelude(flat_resolver, &method, &global_modes, &mut new_directives)?; + check_function_prelude(&method, &global_modes)?; let old_body = method.body.take().unwrap(); let mut result = Vec::with_capacity(old_body.len()); let mut bb_state = BasicBlockControlState::new(&global_modes, initial_mode); @@ -1175,233 +999,34 @@ fn insert_mode_control( } } method.body = Some(result); - new_directives.push(Directive2::Method(method)); - Ok(new_directives) - }) - .try_fold(Vec::with_capacity(directives_len), |mut acc, d| { - acc.extend(d?); - Ok(acc) + Ok(Directive2::Method(method)) }) + .collect::, _>>() } -fn emit_mode_prelude( - flat_resolver: &mut super::GlobalStringIdentResolver2, +fn check_function_prelude( method: &Function2, SpirvWord>, - global_modes: &FullModeInsertion2, - new_directives: &mut SmallVec<[Directive2, SpirvWord>; 4]>, + global_modes: &FullModeInsertion, ) -> Result<(), TranslateError> { let fn_mode_state = global_modes .basic_blocks .get(&method.name) .ok_or_else(error_unreachable)?; - if let Some(dual_prologue) = fn_mode_state.dual_prologue { - new_directives.push(create_fn_wrapper( - flat_resolver, - method, - dual_prologue, - [ - ModeRegister::Denormal { - f32: fn_mode_state - .denormal - .twin_mode - .f32 - .unwrap_or_default() - .to_ftz(), - f16f64: fn_mode_state - .denormal - .twin_mode - .f16f64 - .unwrap_or_default() - .to_ftz(), - }, - ModeRegister::Rounding { - f32: fn_mode_state - .rounding - .twin_mode - .f32 - .unwrap_or_default() - .to_ast(), - f16f64: fn_mode_state - .rounding - .twin_mode - .f16f64 - .unwrap_or_default() - .to_ast(), - }, - ] - .into_iter(), - )); - } - if let Some(prologue) = fn_mode_state.denormal.prologue { - new_directives.push(create_fn_wrapper( - flat_resolver, - method, - prologue, - 
[ModeRegister::Denormal { - f32: fn_mode_state - .denormal - .twin_mode - .f32 - .unwrap_or_default() - .to_ftz(), - f16f64: fn_mode_state - .denormal - .twin_mode - .f16f64 - .unwrap_or_default() - .to_ftz(), - }] - .into_iter(), - )); - } - if let Some(prologue) = fn_mode_state.rounding.prologue { - new_directives.push(create_fn_wrapper( - flat_resolver, - method, - prologue, - [ModeRegister::Rounding { - f32: fn_mode_state - .rounding - .twin_mode - .f32 - .unwrap_or_default() - .to_ast(), - f16f64: fn_mode_state - .rounding - .twin_mode - .f16f64 - .unwrap_or_default() - .to_ast(), - }] - .into_iter(), - )); + // A function should never have a prelude. Preludes happen only if there + // is an edge in the control flow graph that requires a mode change. + // Since functions never have a mode setting instructions that means they + // only pass the mode from incoming edges to outgoing edges + if fn_mode_state.dual_prologue.is_some() + || fn_mode_state.denormal.prologue.is_some() + || fn_mode_state.rounding.prologue.is_some() + { + return Err(error_unreachable()); } Ok(()) } -fn create_fn_wrapper( - flat_resolver: &mut super::GlobalStringIdentResolver2, - method: &Function2, SpirvWord>, - name: SpirvWord, - modes: impl ExactSizeIterator, -) -> Directive2, SpirvWord> { - // * Label - // * return argument registers - // * input argument registers - // * Load input arguments - // * set modes - // * call - // * return with value - let return_arguments = rename_variables(flat_resolver, &method.return_arguments); - let input_arguments = rename_variables(flat_resolver, &method.input_arguments); - let mut body = Vec::with_capacity( - 1 + (input_arguments.len() * 2) + return_arguments.len() + modes.len() + 2, - ); - body.push(Statement::Label(flat_resolver.register_unnamed(None))); - let return_variables = append_variables(flat_resolver, &mut body, &return_arguments); - let input_variables = append_variables(flat_resolver, &mut body, &input_arguments); - for (index, 
input_reg) in input_variables.iter().enumerate() { - body.push(Statement::Instruction(ast::Instruction::Ld { - data: ast::LdDetails { - qualifier: ast::LdStQualifier::Weak, - state_space: input_arguments[index].state_space, - caching: ast::LdCacheOperator::Cached, - typ: input_arguments[index].v_type.clone(), - non_coherent: false, - }, - arguments: ast::LdArgs { - src: input_arguments[index].name, - dst: *input_reg, - }, - })); - } - body.extend(modes.map(|mode_set| Statement::SetMode(mode_set))); - // Out of order because we want to use return_variables before they are moved - let ret_statement = if return_arguments.is_empty() { - Statement::Instruction(ast::Instruction::Ret { - data: ast::RetData { uniform: false }, - }) - } else { - Statement::RetValue( - ast::RetData { uniform: false }, - return_variables - .iter() - .enumerate() - .map(|(index, var)| (*var, method.return_arguments[index].v_type.clone())) - .collect(), - ) - }; - body.push(Statement::Instruction(ast::Instruction::Call { - data: ast::CallDetails { - uniform: false, - return_arguments: return_arguments - .iter() - .map(|arg| (arg.v_type.clone(), arg.state_space)) - .collect(), - input_arguments: input_arguments - .iter() - .map(|arg| (arg.v_type.clone(), arg.state_space)) - .collect(), - }, - arguments: ast::CallArgs { - return_arguments: return_variables, - func: method.name, - input_arguments: input_variables, - }, - })); - body.push(ret_statement); - Directive2::Method(Function2 { - return_arguments, - name, - input_arguments, - body: Some(body), - is_kernel: false, - import_as: None, - tuning: Vec::new(), - linkage: ast::LinkingDirective::NONE, - flush_to_zero_f32: false, - flush_to_zero_f16f64: false, - rounding_mode_f32: ptx_parser::RoundingMode::NearestEven, - rounding_mode_f16f64: ptx_parser::RoundingMode::NearestEven, - }) -} - -fn rename_variables( - flat_resolver: &mut super::GlobalStringIdentResolver2, - variables: &Vec>, -) -> Vec> { - variables - .iter() - .cloned() - .map(|arg| 
ast::Variable { - name: flat_resolver.register_unnamed(Some((arg.v_type.clone(), arg.state_space))), - ..arg - }) - .collect() -} - -fn append_variables<'a, 'input: 'a>( - flat_resolver: &'a mut super::GlobalStringIdentResolver2<'input>, - body: &mut Vec, SpirvWord>>, - arguments: &'a Vec>, -) -> Vec { - let mut result = Vec::with_capacity(arguments.len()); - for arg in arguments { - let name = flat_resolver.register_unnamed(Some((arg.v_type.clone(), ast::StateSpace::Reg))); - body.push(Statement::Variable(ast::Variable { - align: None, - v_type: arg.v_type.clone(), - state_space: ast::StateSpace::Reg, - name, - array_init: Vec::new(), - })); - result.push(name); - } - result -} - struct BasicBlockControlState<'a> { - global_modes: &'a FullModeInsertion2, + global_modes: &'a FullModeInsertion, denormal_f32: RegisterState, denormal_f16f64: RegisterState, rounding_f32: RegisterState, @@ -1429,7 +1054,7 @@ impl RegisterState { } impl<'a> BasicBlockControlState<'a> { - fn new(global_modes: &'a FullModeInsertion2, initial_mode: &FullBasicBlockEntryState) -> Self { + fn new(global_modes: &'a FullModeInsertion, initial_mode: &FullBasicBlockEntryState) -> Self { let denormal_f32 = RegisterState::new(initial_mode.denormal.twin_mode.f32); let denormal_f16f64 = RegisterState::new(initial_mode.denormal.twin_mode.f16f64); let rounding_f32 = RegisterState::new(initial_mode.rounding.twin_mode.f32); @@ -1600,7 +1225,7 @@ impl<'a> BasicBlockControlState<'a> { } fn redirect_jump_impl( - global_modes: &FullModeInsertion2, + global_modes: &FullModeInsertion, current_mode: &ResolvedInstructionModes, jump_target: &mut SpirvWord, ) -> Result<(), TranslateError> { @@ -1918,7 +1543,7 @@ impl<'a> Drop for BasicBlockState<'a> { } } -fn compute_single_mode( +fn compute_single_mode_insertions( graph: &ControlFlowGraph, mut getter: impl FnMut(&Node) -> Mode, ) -> PartialModeInsertion { @@ -1988,7 +1613,7 @@ struct PartialModeInsertion { } // Only returns kernel mode insertions if a kernel is 
relevant to the optimization problem -fn optimize< +fn optimize_mode_insertions< T: Copy + Into + strum::VariantArray + std::fmt::Debug + Default, const N: usize, >( diff --git a/ptx/src/pass/insert_ftz_control/test.rs b/ptx/src/pass/instruction_mode_to_global_mode/test.rs similarity index 94% rename from ptx/src/pass/insert_ftz_control/test.rs rename to ptx/src/pass/instruction_mode_to_global_mode/test.rs index ef59495..78d1d66 100644 --- a/ptx/src/pass/insert_ftz_control/test.rs +++ b/ptx/src/pass/instruction_mode_to_global_mode/test.rs @@ -43,7 +43,7 @@ fn transitive_mixed() { graph.add_jump(empty, false2_id); let false2_ = graph.get_or_add_basic_block(false2_id); graph.set_modes(false2_, ftz(), ftz()); - let partial_result = super::compute_single_mode(&graph, |node| node.denormal_f32); + let partial_result = super::compute_single_mode_insertions(&graph, |node| node.denormal_f32); assert_eq!(partial_result.bb_must_insert_mode.len(), 0); assert_eq!(partial_result.bb_maybe_insert_mode.len(), 1); assert_eq!( @@ -51,7 +51,7 @@ fn transitive_mixed() { (DenormalMode::FlushToZero, iter::once(entry_id).collect()) ); - let result = optimize::(partial_result); + let result = optimize_mode_insertions::(partial_result); assert_eq!(result.basic_blocks.len(), 0); assert_eq!(result.kernels.len(), 1); assert_eq!(result.kernels[&entry_id], DenormalMode::FlushToZero); @@ -73,7 +73,7 @@ fn transitive_change_twice() { graph.add_jump(empty, true_id); let true_ = graph.get_or_add_basic_block(true_id); graph.set_modes(true_, preserve(), preserve()); - let partial_result = super::compute_single_mode(&graph, |node| node.denormal_f32); + let partial_result = super::compute_single_mode_insertions(&graph, |node| node.denormal_f32); assert_eq!(partial_result.bb_must_insert_mode.len(), 1); assert!(partial_result.bb_must_insert_mode.contains(&true_id)); assert_eq!(partial_result.bb_maybe_insert_mode.len(), 1); @@ -82,7 +82,7 @@ fn transitive_change_twice() { (DenormalMode::FlushToZero, 
iter::once(entry_id).collect()) ); - let result = optimize::(partial_result); + let result = optimize_mode_insertions::(partial_result); assert_eq!(result.basic_blocks, iter::once(true_id).collect()); assert_eq!(result.kernels.len(), 1); assert_eq!(result.kernels[&entry_id], DenormalMode::FlushToZero); @@ -100,7 +100,7 @@ fn transitive_change() { graph.add_jump(empty, true_id); let true_ = graph.get_or_add_basic_block(true_id); graph.set_modes(true_, preserve(), preserve()); - let partial_result = super::compute_single_mode(&graph, |node| node.denormal_f32); + let partial_result = super::compute_single_mode_insertions(&graph, |node| node.denormal_f32); assert_eq!(partial_result.bb_must_insert_mode.len(), 0); assert_eq!(partial_result.bb_maybe_insert_mode.len(), 1); assert_eq!( @@ -108,7 +108,7 @@ fn transitive_change() { (DenormalMode::Preserve, iter::once(entry_id).collect()) ); - let result = optimize::(partial_result); + let result = optimize_mode_insertions::(partial_result); assert_eq!(result.basic_blocks.len(), 0); assert_eq!(result.kernels.len(), 1); assert_eq!(result.kernels[&entry_id], DenormalMode::Preserve); @@ -143,7 +143,7 @@ fn codependency() { // "{:?}", // petgraph::dot::Dot::with_config(&graph.graph, &[petgraph::dot::Config::EdgeNoLabel]) //); - let partial_result = super::compute_single_mode(&graph, |node| node.denormal_f32); + let partial_result = super::compute_single_mode_insertions(&graph, |node| node.denormal_f32); assert_eq!(partial_result.bb_must_insert_mode.len(), 0); assert_eq!(partial_result.bb_maybe_insert_mode.len(), 2); assert_eq!( @@ -155,7 +155,7 @@ fn codependency() { (DenormalMode::FlushToZero, iter::once(entry_id).collect()) ); - let result = optimize::(partial_result); + let result = optimize_mode_insertions::(partial_result); assert_eq!(result.basic_blocks.len(), 0); assert_eq!(result.kernels.len(), 1); assert_eq!(result.kernels[&entry_id], DenormalMode::FlushToZero); diff --git a/ptx/src/pass/mod.rs b/ptx/src/pass/mod.rs index 
1e138dd..77d7e60 100644 --- a/ptx/src/pass/mod.rs +++ b/ptx/src/pass/mod.rs @@ -17,7 +17,7 @@ mod expand_operands; mod fix_special_registers2; mod hoist_globals; mod insert_explicit_load_store; -mod insert_ftz_control; +mod instruction_mode_to_global_mode; mod insert_implicit_conversions2; mod normalize_basic_blocks; mod normalize_identifiers2; @@ -54,7 +54,7 @@ pub fn to_llvm_module<'input>(ast: ast::Module<'input>) -> Result, ptr addrspace(1) %"32", align 8 - %"49" = extractelement <2 x i32> %"30", i8 0 - %"50" = extractelement <2 x i32> %"30", i8 1 - store i32 %"49", ptr addrspace(5) %"44", align 4 - store i32 %"50", ptr addrspace(5) %"45", align 4 - %"52" = load i32, ptr addrspace(5) %"44", align 4 - %"53" = load i32, ptr addrspace(5) %"45", align 4 - %"51" = add i32 %"52", %"53" - store i32 %"51", ptr addrspace(5) %"44", align 4 - %"54" = load i64, ptr addrspace(5) %"43", align 4 - %"55" = load i32, ptr addrspace(5) %"44", align 4 - %"57" = inttoptr i64 %"54" to ptr addrspace(1) - store i32 %"55", ptr addrspace(1) %"57", align 4 + %"43" = extractelement <2 x i32> %"30", i8 0 + %"44" = extractelement <2 x i32> %"30", i8 1 + store i32 %"43", ptr addrspace(5) %"38", align 4 + store i32 %"44", ptr addrspace(5) %"39", align 4 + %"46" = load i32, ptr addrspace(5) %"38", align 4 + %"47" = load i32, ptr addrspace(5) %"39", align 4 + %"45" = add i32 %"46", %"47" + store i32 %"45", ptr addrspace(5) %"38", align 4 + %"48" = load i64, ptr addrspace(5) %"37", align 4 + %"49" = load i32, ptr addrspace(5) %"38", align 4 + %"51" = inttoptr i64 %"48" to ptr addrspace(1) + store i32 %"49", ptr addrspace(1) %"51", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" 
"uniform-work-group-size"="true" } \ No newline at end of file +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/not.ll b/ptx/src/test/ll/not.ll index c34a537..efb1f95 100644 --- a/ptx/src/test/ll/not.ll +++ b/ptx/src/test/ll/not.ll @@ -1,43 +1,30 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @not(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #1 { - %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca i64, align 8, addrspace(5) - %"42" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @not(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 { + %"33" = alloca i64, align 8, addrspace(5) + %"34" = alloca i64, align 8, addrspace(5) + %"35" = alloca i64, align 8, addrspace(5) + %"36" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 br label %"30" "30": ; preds = %1 - %"43" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"43", ptr addrspace(5) %"39", align 4 - %"44" = load i64, ptr addrspace(4) %"38", align 4 - store i64 %"44", ptr addrspace(5) %"40", align 4 - %"46" = load i64, ptr addrspace(5) %"39", align 4 - %"51" = inttoptr i64 %"46" to ptr - %"45" = load i64, ptr %"51", align 4 - store i64 %"45", ptr addrspace(5) %"41", align 4 - %"48" = load i64, ptr addrspace(5) %"41", align 4 - %"52" = xor i64 %"48", -1 - store i64 %"52", ptr addrspace(5) %"42", align 4 - %"49" = load i64, ptr addrspace(5) %"40", align 4 - %"50" = load i64, ptr addrspace(5) %"42", align 4 
- %"54" = inttoptr i64 %"49" to ptr - store i64 %"50", ptr %"54", align 4 + %"37" = load i64, ptr addrspace(4) %"31", align 4 + store i64 %"37", ptr addrspace(5) %"33", align 4 + %"38" = load i64, ptr addrspace(4) %"32", align 4 + store i64 %"38", ptr addrspace(5) %"34", align 4 + %"40" = load i64, ptr addrspace(5) %"33", align 4 + %"45" = inttoptr i64 %"40" to ptr + %"39" = load i64, ptr %"45", align 4 + store i64 %"39", ptr addrspace(5) %"35", align 4 + %"42" = load i64, ptr addrspace(5) %"35", align 4 + %"46" = xor i64 %"42", -1 + store i64 %"46", ptr addrspace(5) %"36", align 4 + %"43" = load i64, ptr addrspace(5) %"34", align 4 + %"44" = load i64, ptr addrspace(5) %"36", align 4 + %"48" = inttoptr i64 %"43" to ptr + store i64 %"44", ptr %"48", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/ntid.ll b/ptx/src/test/ll/ntid.ll index 6839731..87185bc 100644 --- a/ptx/src/test/ll/ntid.ll +++ b/ptx/src/test/ll/ntid.ll @@ -1,47 +1,37 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @ntid(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #1 { - %"42" = alloca i64, align 8, 
addrspace(5) - %"43" = alloca i64, align 8, addrspace(5) - %"44" = alloca i32, align 4, addrspace(5) - %"45" = alloca i32, align 4, addrspace(5) +define amdgpu_kernel void @ntid(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #1 { + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i32, align 4, addrspace(5) + %"40" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 br label %"32" "32": ; preds = %1 - %"46" = load i64, ptr addrspace(4) %"40", align 4 - store i64 %"46", ptr addrspace(5) %"42", align 4 - %"47" = load i64, ptr addrspace(4) %"41", align 4 - store i64 %"47", ptr addrspace(5) %"43", align 4 - %"49" = load i64, ptr addrspace(5) %"42", align 4 - %"56" = inttoptr i64 %"49" to ptr - %"48" = load i32, ptr %"56", align 4 - store i32 %"48", ptr addrspace(5) %"44", align 4 + %"41" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"41", ptr addrspace(5) %"37", align 4 + %"42" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"42", ptr addrspace(5) %"38", align 4 + %"44" = load i64, ptr addrspace(5) %"37", align 4 + %"51" = inttoptr i64 %"44" to ptr + %"43" = load i32, ptr %"51", align 4 + store i32 %"43", ptr addrspace(5) %"39", align 4 %"31" = call i32 @__zluda_ptx_impl_sreg_ntid(i8 0) br label %"33" "33": ; preds = %"32" - store i32 %"31", ptr addrspace(5) %"45", align 4 - %"52" = load i32, ptr addrspace(5) %"44", align 4 - %"53" = load i32, ptr addrspace(5) %"45", align 4 - %"51" = add i32 %"52", %"53" - store i32 %"51", ptr addrspace(5) %"44", align 4 - %"54" = load i64, ptr addrspace(5) %"43", align 4 - %"55" = load i32, ptr addrspace(5) %"44", align 4 - %"57" = inttoptr i64 %"54" to ptr - store i32 %"55", ptr %"57", align 4 + store i32 %"31", ptr addrspace(5) %"40", align 4 + %"47" = load i32, ptr addrspace(5) %"39", align 4 + %"48" = load i32, ptr addrspace(5) %"40", align 4 + %"46" = add i32 %"47", %"48" + store i32 %"46", ptr addrspace(5) %"39", align 4 
+ %"49" = load i64, ptr addrspace(5) %"38", align 4 + %"50" = load i32, ptr addrspace(5) %"39", align 4 + %"52" = inttoptr i64 %"49" to ptr + store i32 %"50", ptr %"52", align 4 ret void } diff --git a/ptx/src/test/ll/or.ll b/ptx/src/test/ll/or.ll index 1402f8d..e773120 100644 --- a/ptx/src/test/ll/or.ll +++ b/ptx/src/test/ll/or.ll @@ -1,49 +1,36 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @or(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #1 { - %"41" = alloca i64, align 8, addrspace(5) - %"42" = alloca i64, align 8, addrspace(5) - %"43" = alloca i64, align 8, addrspace(5) - %"44" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @or(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 { + %"35" = alloca i64, align 8, addrspace(5) + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 br label %"32" "32": ; preds = %1 - %"45" = load i64, ptr addrspace(4) %"39", align 4 - store i64 %"45", ptr addrspace(5) %"41", align 4 - %"46" = load i64, ptr addrspace(4) %"40", align 4 - store i64 %"46", ptr addrspace(5) %"42", align 4 - %"48" = load i64, ptr addrspace(5) %"41", align 4 - %"56" = inttoptr i64 %"48" to ptr - %"47" = load i64, ptr %"56", align 4 - store i64 %"47", ptr addrspace(5) %"43", align 4 - %"49" = load i64, ptr addrspace(5) %"41", align 4 - %"57" = inttoptr i64 %"49" to ptr - %"31" = getelementptr inbounds i8, ptr %"57", i64 8 - %"50" = load i64, ptr %"31", align 4 - store i64 %"50", ptr addrspace(5) %"44", align 4 - %"52" = load i64, ptr addrspace(5) %"43", align 4 - %"53" = load i64, ptr addrspace(5) %"44", 
align 4 - %"58" = or i64 %"52", %"53" - store i64 %"58", ptr addrspace(5) %"43", align 4 - %"54" = load i64, ptr addrspace(5) %"42", align 4 - %"55" = load i64, ptr addrspace(5) %"43", align 4 - %"61" = inttoptr i64 %"54" to ptr - store i64 %"55", ptr %"61", align 4 + %"39" = load i64, ptr addrspace(4) %"33", align 4 + store i64 %"39", ptr addrspace(5) %"35", align 4 + %"40" = load i64, ptr addrspace(4) %"34", align 4 + store i64 %"40", ptr addrspace(5) %"36", align 4 + %"42" = load i64, ptr addrspace(5) %"35", align 4 + %"50" = inttoptr i64 %"42" to ptr + %"41" = load i64, ptr %"50", align 4 + store i64 %"41", ptr addrspace(5) %"37", align 4 + %"43" = load i64, ptr addrspace(5) %"35", align 4 + %"51" = inttoptr i64 %"43" to ptr + %"31" = getelementptr inbounds i8, ptr %"51", i64 8 + %"44" = load i64, ptr %"31", align 4 + store i64 %"44", ptr addrspace(5) %"38", align 4 + %"46" = load i64, ptr addrspace(5) %"37", align 4 + %"47" = load i64, ptr addrspace(5) %"38", align 4 + %"52" = or i64 %"46", %"47" + store i64 %"52", ptr addrspace(5) %"37", align 4 + %"48" = load i64, ptr addrspace(5) %"36", align 4 + %"49" = load i64, ptr addrspace(5) %"37", align 4 + %"55" = inttoptr i64 %"48" to ptr + store i64 %"49", ptr %"55", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/popc.ll b/ptx/src/test/ll/popc.ll index 23c798a..0b379c5 100644 --- a/ptx/src/test/ll/popc.ll +++ 
b/ptx/src/test/ll/popc.ll @@ -1,46 +1,33 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @popc(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #1 { - %"38" = alloca i64, align 8, addrspace(5) - %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca i32, align 4, addrspace(5) +define amdgpu_kernel void @popc(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 { + %"32" = alloca i64, align 8, addrspace(5) + %"33" = alloca i64, align 8, addrspace(5) + %"34" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 br label %"29" "29": ; preds = %1 - %"41" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"41", ptr addrspace(5) %"38", align 4 - %"42" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"42", ptr addrspace(5) %"39", align 4 - %"44" = load i64, ptr addrspace(5) %"38", align 4 - %"49" = inttoptr i64 %"44" to ptr - %"43" = load i32, ptr %"49", align 4 - store i32 %"43", ptr addrspace(5) %"40", align 4 - %"46" = load i32, ptr addrspace(5) %"40", align 4 - %"50" = call i32 @llvm.ctpop.i32(i32 %"46") - store i32 %"50", ptr addrspace(5) %"40", align 4 - %"47" = load i64, ptr addrspace(5) %"39", align 4 - %"48" = load i32, ptr addrspace(5) %"40", align 4 - %"51" = inttoptr i64 %"47" to ptr - store i32 %"48", ptr %"51", align 4 + %"35" = load i64, ptr addrspace(4) %"30", align 4 + store i64 %"35", ptr addrspace(5) %"32", align 4 + %"36" = load i64, ptr addrspace(4) %"31", align 4 + store i64 %"36", ptr addrspace(5) %"33", align 4 + %"38" = load i64, ptr addrspace(5) %"32", align 4 + %"43" = inttoptr i64 %"38" to ptr + %"37" = load i32, ptr %"43", align 4 + store i32 %"37", ptr addrspace(5) %"34", align 4 + %"40" = 
load i32, ptr addrspace(5) %"34", align 4 + %"44" = call i32 @llvm.ctpop.i32(i32 %"40") + store i32 %"44", ptr addrspace(5) %"34", align 4 + %"41" = load i64, ptr addrspace(5) %"33", align 4 + %"42" = load i32, ptr addrspace(5) %"34", align 4 + %"45" = inttoptr i64 %"41" to ptr + store i32 %"42", ptr %"45", align 4 ret void } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare i32 @llvm.ctpop.i32(i32) #2 +declare i32 @llvm.ctpop.i32(i32) #1 -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/pred_not.ll b/ptx/src/test/ll/pred_not.ll index 90b142a..65cc659 100644 --- a/ptx/src/test/ll/pred_not.ll +++ b/ptx/src/test/ll/pred_not.ll @@ -1,70 +1,57 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @pred_not(ptr addrspace(4) byref(i64) %"47", ptr addrspace(4) byref(i64) %"48") #1 { - %"49" = alloca i64, align 8, addrspace(5) - %"50" = alloca i64, align 8, addrspace(5) - %"51" = alloca i64, 
align 8, addrspace(5) - %"52" = alloca i64, align 8, addrspace(5) - %"53" = alloca i64, align 8, addrspace(5) - %"54" = alloca i1, align 1, addrspace(5) +define amdgpu_kernel void @pred_not(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #0 { + %"43" = alloca i64, align 8, addrspace(5) + %"44" = alloca i64, align 8, addrspace(5) + %"45" = alloca i64, align 8, addrspace(5) + %"46" = alloca i64, align 8, addrspace(5) + %"47" = alloca i64, align 8, addrspace(5) + %"48" = alloca i1, align 1, addrspace(5) br label %1 1: ; preds = %0 br label %"40" "40": ; preds = %1 - %"55" = load i64, ptr addrspace(4) %"47", align 4 - store i64 %"55", ptr addrspace(5) %"49", align 4 - %"56" = load i64, ptr addrspace(4) %"48", align 4 - store i64 %"56", ptr addrspace(5) %"50", align 4 - %"58" = load i64, ptr addrspace(5) %"49", align 4 - %"72" = inttoptr i64 %"58" to ptr - %"57" = load i64, ptr %"72", align 4 - store i64 %"57", ptr addrspace(5) %"51", align 4 - %"59" = load i64, ptr addrspace(5) %"49", align 4 - %"73" = inttoptr i64 %"59" to ptr - %"37" = getelementptr inbounds i8, ptr %"73", i64 8 - %"60" = load i64, ptr %"37", align 4 - store i64 %"60", ptr addrspace(5) %"52", align 4 - %"62" = load i64, ptr addrspace(5) %"51", align 4 - %"63" = load i64, ptr addrspace(5) %"52", align 4 - %"61" = icmp ult i64 %"62", %"63" - store i1 %"61", ptr addrspace(5) %"54", align 1 - %"65" = load i1, ptr addrspace(5) %"54", align 1 - %"64" = xor i1 %"65", true - store i1 %"64", ptr addrspace(5) %"54", align 1 - %"66" = load i1, ptr addrspace(5) %"54", align 1 - br i1 %"66", label %"16", label %"17" + %"49" = load i64, ptr addrspace(4) %"41", align 4 + store i64 %"49", ptr addrspace(5) %"43", align 4 + %"50" = load i64, ptr addrspace(4) %"42", align 4 + store i64 %"50", ptr addrspace(5) %"44", align 4 + %"52" = load i64, ptr addrspace(5) %"43", align 4 + %"66" = inttoptr i64 %"52" to ptr + %"51" = load i64, ptr %"66", align 4 + store i64 %"51", ptr addrspace(5) %"45", align 
4 + %"53" = load i64, ptr addrspace(5) %"43", align 4 + %"67" = inttoptr i64 %"53" to ptr + %"37" = getelementptr inbounds i8, ptr %"67", i64 8 + %"54" = load i64, ptr %"37", align 4 + store i64 %"54", ptr addrspace(5) %"46", align 4 + %"56" = load i64, ptr addrspace(5) %"45", align 4 + %"57" = load i64, ptr addrspace(5) %"46", align 4 + %"55" = icmp ult i64 %"56", %"57" + store i1 %"55", ptr addrspace(5) %"48", align 1 + %"59" = load i1, ptr addrspace(5) %"48", align 1 + %"58" = xor i1 %"59", true + store i1 %"58", ptr addrspace(5) %"48", align 1 + %"60" = load i1, ptr addrspace(5) %"48", align 1 + br i1 %"60", label %"16", label %"17" "16": ; preds = %"40" - store i64 1, ptr addrspace(5) %"53", align 4 + store i64 1, ptr addrspace(5) %"47", align 4 br label %"17" "17": ; preds = %"16", %"40" - %"68" = load i1, ptr addrspace(5) %"54", align 1 - br i1 %"68", label %"19", label %"18" + %"62" = load i1, ptr addrspace(5) %"48", align 1 + br i1 %"62", label %"19", label %"18" "18": ; preds = %"17" - store i64 2, ptr addrspace(5) %"53", align 4 + store i64 2, ptr addrspace(5) %"47", align 4 br label %"19" "19": ; preds = %"18", %"17" - %"70" = load i64, ptr addrspace(5) %"50", align 4 - %"71" = load i64, ptr addrspace(5) %"53", align 4 - %"74" = inttoptr i64 %"70" to ptr - store i64 %"71", ptr %"74", align 4 + %"64" = load i64, ptr addrspace(5) %"44", align 4 + %"65" = load i64, ptr addrspace(5) %"47", align 4 + %"68" = inttoptr i64 %"64" to ptr + store i64 %"65", ptr %"68", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" 
"denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/prmt.ll b/ptx/src/test/ll/prmt.ll index 6f6cebb..85f144e 100644 --- a/ptx/src/test/ll/prmt.ll +++ b/ptx/src/test/ll/prmt.ll @@ -1,51 +1,38 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @prmt(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #1 { - %"41" = alloca i64, align 8, addrspace(5) - %"42" = alloca i64, align 8, addrspace(5) - %"43" = alloca i32, align 4, addrspace(5) - %"44" = alloca i32, align 4, addrspace(5) +define amdgpu_kernel void @prmt(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 { + %"35" = alloca i64, align 8, addrspace(5) + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i32, align 4, addrspace(5) + %"38" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 br label %"32" "32": ; preds = %1 - %"45" = load i64, ptr addrspace(4) %"39", align 4 - store i64 %"45", ptr addrspace(5) %"41", align 4 - %"46" = load i64, ptr addrspace(4) %"40", align 4 - store i64 %"46", ptr addrspace(5) %"42", align 4 - %"48" = load i64, ptr addrspace(5) %"41", align 4 - %"56" = inttoptr i64 %"48" to ptr - %"47" = load i32, ptr %"56", align 4 - store i32 %"47", ptr addrspace(5) %"43", align 4 - %"49" = load i64, ptr addrspace(5) %"41", align 4 - %"57" = inttoptr i64 %"49" to ptr - %"31" = getelementptr inbounds i8, ptr %"57", i64 4 - %"50" = load i32, ptr %"31", align 4 - store i32 %"50", ptr addrspace(5) %"44", align 4 - %"52" = load i32, ptr addrspace(5) %"43", align 4 - %"53" = load i32, ptr addrspace(5) %"44", align 4 - %2 = bitcast i32 %"52" to <4 x i8> - %3 = 
bitcast i32 %"53" to <4 x i8> - %"58" = shufflevector <4 x i8> %2, <4 x i8> %3, <4 x i32> - store <4 x i8> %"58", ptr addrspace(5) %"44", align 4 - %"54" = load i64, ptr addrspace(5) %"42", align 4 - %"55" = load i32, ptr addrspace(5) %"44", align 4 - %"61" = inttoptr i64 %"54" to ptr - store i32 %"55", ptr %"61", align 4 + %"39" = load i64, ptr addrspace(4) %"33", align 4 + store i64 %"39", ptr addrspace(5) %"35", align 4 + %"40" = load i64, ptr addrspace(4) %"34", align 4 + store i64 %"40", ptr addrspace(5) %"36", align 4 + %"42" = load i64, ptr addrspace(5) %"35", align 4 + %"50" = inttoptr i64 %"42" to ptr + %"41" = load i32, ptr %"50", align 4 + store i32 %"41", ptr addrspace(5) %"37", align 4 + %"43" = load i64, ptr addrspace(5) %"35", align 4 + %"51" = inttoptr i64 %"43" to ptr + %"31" = getelementptr inbounds i8, ptr %"51", i64 4 + %"44" = load i32, ptr %"31", align 4 + store i32 %"44", ptr addrspace(5) %"38", align 4 + %"46" = load i32, ptr addrspace(5) %"37", align 4 + %"47" = load i32, ptr addrspace(5) %"38", align 4 + %2 = bitcast i32 %"46" to <4 x i8> + %3 = bitcast i32 %"47" to <4 x i8> + %"52" = shufflevector <4 x i8> %2, <4 x i8> %3, <4 x i32> + store <4 x i8> %"52", ptr addrspace(5) %"38", align 4 + %"48" = load i64, ptr addrspace(5) %"36", align 4 + %"49" = load i32, ptr addrspace(5) %"38", align 4 + %"55" = inttoptr i64 %"48" to ptr + store i32 %"49", ptr %"55", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" 
"uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/rcp.ll b/ptx/src/test/ll/rcp.ll index e2bce27..0995cc0 100644 --- a/ptx/src/test/ll/rcp.ll +++ b/ptx/src/test/ll/rcp.ll @@ -1,46 +1,33 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @rcp(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #1 { - %"38" = alloca i64, align 8, addrspace(5) - %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca float, align 4, addrspace(5) +define amdgpu_kernel void @rcp(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 { + %"32" = alloca i64, align 8, addrspace(5) + %"33" = alloca i64, align 8, addrspace(5) + %"34" = alloca float, align 4, addrspace(5) br label %1 1: ; preds = %0 br label %"29" "29": ; preds = %1 - %"41" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"41", ptr addrspace(5) %"38", align 4 - %"42" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"42", ptr addrspace(5) %"39", align 4 - %"44" = load i64, ptr addrspace(5) %"38", align 4 - %"49" = inttoptr i64 %"44" to ptr - %"43" = load float, ptr %"49", align 4 - store float %"43", ptr addrspace(5) %"40", align 4 - %"46" = load float, ptr addrspace(5) %"40", align 4 - %"45" = call float @llvm.amdgcn.rcp.f32(float %"46") - store float %"45", ptr addrspace(5) %"40", align 4 - %"47" = load i64, ptr addrspace(5) %"39", align 4 - %"48" = load float, ptr addrspace(5) %"40", align 4 - %"50" = inttoptr i64 %"47" to ptr - store float %"48", ptr %"50", align 4 + %"35" = load i64, ptr addrspace(4) %"30", align 4 + store i64 %"35", ptr addrspace(5) %"32", align 4 + %"36" = load i64, ptr addrspace(4) %"31", align 4 + store i64 %"36", ptr 
addrspace(5) %"33", align 4 + %"38" = load i64, ptr addrspace(5) %"32", align 4 + %"43" = inttoptr i64 %"38" to ptr + %"37" = load float, ptr %"43", align 4 + store float %"37", ptr addrspace(5) %"34", align 4 + %"40" = load float, ptr addrspace(5) %"34", align 4 + %"39" = call float @llvm.amdgcn.rcp.f32(float %"40") + store float %"39", ptr addrspace(5) %"34", align 4 + %"41" = load i64, ptr addrspace(5) %"33", align 4 + %"42" = load float, ptr addrspace(5) %"34", align 4 + %"44" = inttoptr i64 %"41" to ptr + store float %"42", ptr %"44", align 4 ret void } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare float @llvm.amdgcn.rcp.f32(float) #2 +declare float @llvm.amdgcn.rcp.f32(float) #1 -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/reg_local.ll b/ptx/src/test/ll/reg_local.ll index 7df82ac..a1b6bf2 100644 --- a/ptx/src/test/ll/reg_local.ll +++ b/ptx/src/test/ll/reg_local.ll @@ -1,49 +1,36 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 
@__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @reg_local(ptr addrspace(4) byref(i64) %"43", ptr addrspace(4) byref(i64) %"44") #1 { +define amdgpu_kernel void @reg_local(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { %"10" = alloca [8 x i8], align 8, addrspace(5) - %"45" = alloca i64, align 8, addrspace(5) - %"46" = alloca i64, align 8, addrspace(5) - %"47" = alloca i64, align 8, addrspace(5) + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 br label %"36" "36": ; preds = %1 - %"48" = load i64, ptr addrspace(4) %"43", align 4 - store i64 %"48", ptr addrspace(5) %"45", align 4 - %"49" = load i64, ptr addrspace(4) %"44", align 4 - store i64 %"49", ptr addrspace(5) %"46", align 4 - %"51" = load i64, ptr addrspace(5) %"45", align 4 - %"57" = inttoptr i64 %"51" to ptr addrspace(1) - %"56" = load i64, ptr addrspace(1) %"57", align 4 - store i64 %"56", ptr addrspace(5) %"47", align 4 - %"52" = load i64, ptr addrspace(5) %"47", align 4 - %"31" = add i64 %"52", 1 - %"58" = addrspacecast ptr addrspace(5) %"10" to ptr - store i64 %"31", ptr %"58", align 4 - %"60" = addrspacecast ptr addrspace(5) %"10" to ptr - %"33" = getelementptr inbounds i8, ptr %"60", i64 0 - %"61" = load i64, ptr %"33", align 4 - store i64 %"61", ptr addrspace(5) %"47", align 4 - %"54" = load i64, ptr addrspace(5) %"46", align 4 - %"62" = inttoptr i64 %"54" to ptr addrspace(1) - %"35" = getelementptr inbounds i8, ptr addrspace(1) %"62", i64 0 - %"55" = load i64, ptr addrspace(5) %"47", align 4 - store i64 %"55", ptr addrspace(1) %"35", align 4 + %"42" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"42", ptr addrspace(5) %"39", align 4 + %"43" = load i64, ptr addrspace(4) %"38", align 4 + store i64 %"43", ptr addrspace(5) %"40", align 4 + %"45" = load i64, ptr addrspace(5) %"39", align 4 + %"51" = inttoptr i64 %"45" to ptr addrspace(1) + 
%"50" = load i64, ptr addrspace(1) %"51", align 4 + store i64 %"50", ptr addrspace(5) %"41", align 4 + %"46" = load i64, ptr addrspace(5) %"41", align 4 + %"31" = add i64 %"46", 1 + %"52" = addrspacecast ptr addrspace(5) %"10" to ptr + store i64 %"31", ptr %"52", align 4 + %"54" = addrspacecast ptr addrspace(5) %"10" to ptr + %"33" = getelementptr inbounds i8, ptr %"54", i64 0 + %"55" = load i64, ptr %"33", align 4 + store i64 %"55", ptr addrspace(5) %"41", align 4 + %"48" = load i64, ptr addrspace(5) %"40", align 4 + %"56" = inttoptr i64 %"48" to ptr addrspace(1) + %"35" = getelementptr inbounds i8, ptr addrspace(1) %"56", i64 0 + %"49" = load i64, ptr addrspace(5) %"41", align 4 + store i64 %"49", ptr addrspace(1) %"35", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/rem.ll b/ptx/src/test/ll/rem.ll index 5bc1a67..dd33785 100644 --- a/ptx/src/test/ll/rem.ll +++ b/ptx/src/test/ll/rem.ll @@ -1,49 +1,36 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @rem(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #1 { - %"41" = alloca i64, align 8, addrspace(5) - %"42" = 
alloca i64, align 8, addrspace(5) - %"43" = alloca i32, align 4, addrspace(5) - %"44" = alloca i32, align 4, addrspace(5) +define amdgpu_kernel void @rem(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 { + %"35" = alloca i64, align 8, addrspace(5) + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i32, align 4, addrspace(5) + %"38" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 br label %"32" "32": ; preds = %1 - %"45" = load i64, ptr addrspace(4) %"39", align 4 - store i64 %"45", ptr addrspace(5) %"41", align 4 - %"46" = load i64, ptr addrspace(4) %"40", align 4 - store i64 %"46", ptr addrspace(5) %"42", align 4 - %"48" = load i64, ptr addrspace(5) %"41", align 4 - %"56" = inttoptr i64 %"48" to ptr - %"47" = load i32, ptr %"56", align 4 - store i32 %"47", ptr addrspace(5) %"43", align 4 - %"49" = load i64, ptr addrspace(5) %"41", align 4 - %"57" = inttoptr i64 %"49" to ptr - %"31" = getelementptr inbounds i8, ptr %"57", i64 4 - %"50" = load i32, ptr %"31", align 4 - store i32 %"50", ptr addrspace(5) %"44", align 4 - %"52" = load i32, ptr addrspace(5) %"43", align 4 - %"53" = load i32, ptr addrspace(5) %"44", align 4 - %"51" = srem i32 %"52", %"53" - store i32 %"51", ptr addrspace(5) %"43", align 4 - %"54" = load i64, ptr addrspace(5) %"42", align 4 - %"55" = load i32, ptr addrspace(5) %"43", align 4 - %"58" = inttoptr i64 %"54" to ptr - store i32 %"55", ptr %"58", align 4 + %"39" = load i64, ptr addrspace(4) %"33", align 4 + store i64 %"39", ptr addrspace(5) %"35", align 4 + %"40" = load i64, ptr addrspace(4) %"34", align 4 + store i64 %"40", ptr addrspace(5) %"36", align 4 + %"42" = load i64, ptr addrspace(5) %"35", align 4 + %"50" = inttoptr i64 %"42" to ptr + %"41" = load i32, ptr %"50", align 4 + store i32 %"41", ptr addrspace(5) %"37", align 4 + %"43" = load i64, ptr addrspace(5) %"35", align 4 + %"51" = inttoptr i64 %"43" to ptr + %"31" = getelementptr inbounds i8, ptr %"51", i64 4 + %"44" = load i32, ptr 
%"31", align 4 + store i32 %"44", ptr addrspace(5) %"38", align 4 + %"46" = load i32, ptr addrspace(5) %"37", align 4 + %"47" = load i32, ptr addrspace(5) %"38", align 4 + %"45" = srem i32 %"46", %"47" + store i32 %"45", ptr addrspace(5) %"37", align 4 + %"48" = load i64, ptr addrspace(5) %"36", align 4 + %"49" = load i32, ptr addrspace(5) %"37", align 4 + %"52" = inttoptr i64 %"48" to ptr + store i32 %"49", ptr %"52", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/rsqrt.ll b/ptx/src/test/ll/rsqrt.ll index cd79a60..04ca3e5 100644 --- a/ptx/src/test/ll/rsqrt.ll +++ b/ptx/src/test/ll/rsqrt.ll @@ -1,46 +1,33 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @rsqrt(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #1 { - %"38" = alloca i64, align 8, addrspace(5) - %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca double, align 8, addrspace(5) +define amdgpu_kernel void @rsqrt(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 { + %"32" = alloca i64, align 8, addrspace(5) + %"33" = alloca i64, align 8, addrspace(5) + %"34" = alloca double, 
align 8, addrspace(5) br label %1 1: ; preds = %0 br label %"29" "29": ; preds = %1 - %"41" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"41", ptr addrspace(5) %"38", align 4 - %"42" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"42", ptr addrspace(5) %"39", align 4 - %"44" = load i64, ptr addrspace(5) %"38", align 4 - %"49" = inttoptr i64 %"44" to ptr - %"43" = load double, ptr %"49", align 8 - store double %"43", ptr addrspace(5) %"40", align 8 - %"46" = load double, ptr addrspace(5) %"40", align 8 - %"45" = call double @llvm.amdgcn.rsq.f64(double %"46") - store double %"45", ptr addrspace(5) %"40", align 8 - %"47" = load i64, ptr addrspace(5) %"39", align 4 - %"48" = load double, ptr addrspace(5) %"40", align 8 - %"50" = inttoptr i64 %"47" to ptr - store double %"48", ptr %"50", align 8 + %"35" = load i64, ptr addrspace(4) %"30", align 4 + store i64 %"35", ptr addrspace(5) %"32", align 4 + %"36" = load i64, ptr addrspace(4) %"31", align 4 + store i64 %"36", ptr addrspace(5) %"33", align 4 + %"38" = load i64, ptr addrspace(5) %"32", align 4 + %"43" = inttoptr i64 %"38" to ptr + %"37" = load double, ptr %"43", align 8 + store double %"37", ptr addrspace(5) %"34", align 8 + %"40" = load double, ptr addrspace(5) %"34", align 8 + %"39" = call double @llvm.amdgcn.rsq.f64(double %"40") + store double %"39", ptr addrspace(5) %"34", align 8 + %"41" = load i64, ptr addrspace(5) %"33", align 4 + %"42" = load double, ptr addrspace(5) %"34", align 8 + %"44" = inttoptr i64 %"41" to ptr + store double %"42", ptr %"44", align 8 ret void } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare double @llvm.amdgcn.rsq.f64(double) #2 +declare double @llvm.amdgcn.rsq.f64(double) #1 -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" 
"denormal-fp-math"="ieee" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/selp.ll b/ptx/src/test/ll/selp.ll index 3788d04..918c4df 100644 --- a/ptx/src/test/ll/selp.ll +++ b/ptx/src/test/ll/selp.ll @@ -1,49 +1,36 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @selp(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #1 { - %"42" = alloca i64, align 8, addrspace(5) - %"43" = alloca i64, align 8, addrspace(5) - %"44" = alloca i16, align 2, addrspace(5) - %"45" = alloca i16, align 2, addrspace(5) +define amdgpu_kernel void @selp(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i16, align 2, addrspace(5) + %"39" = alloca i16, align 2, addrspace(5) br label %1 1: ; preds = %0 br label %"33" "33": ; preds = %1 - %"46" = load i64, ptr addrspace(4) %"40", align 4 - store i64 %"46", ptr addrspace(5) %"42", align 4 - %"47" = load i64, ptr addrspace(4) %"41", align 4 - store i64 %"47", ptr addrspace(5) %"43", align 4 - %"49" = load i64, ptr addrspace(5) %"42", align 4 - %"57" = inttoptr i64 %"49" to ptr - %"48" = load i16, ptr %"57", align 2 - store i16 
%"48", ptr addrspace(5) %"44", align 2 - %"50" = load i64, ptr addrspace(5) %"42", align 4 - %"58" = inttoptr i64 %"50" to ptr - %"31" = getelementptr inbounds i8, ptr %"58", i64 2 - %"51" = load i16, ptr %"31", align 2 - store i16 %"51", ptr addrspace(5) %"45", align 2 - %"53" = load i16, ptr addrspace(5) %"44", align 2 - %"54" = load i16, ptr addrspace(5) %"45", align 2 - %"52" = select i1 false, i16 %"53", i16 %"54" - store i16 %"52", ptr addrspace(5) %"44", align 2 - %"55" = load i64, ptr addrspace(5) %"43", align 4 - %"56" = load i16, ptr addrspace(5) %"44", align 2 - %"59" = inttoptr i64 %"55" to ptr - store i16 %"56", ptr %"59", align 2 + %"40" = load i64, ptr addrspace(4) %"34", align 4 + store i64 %"40", ptr addrspace(5) %"36", align 4 + %"41" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"41", ptr addrspace(5) %"37", align 4 + %"43" = load i64, ptr addrspace(5) %"36", align 4 + %"51" = inttoptr i64 %"43" to ptr + %"42" = load i16, ptr %"51", align 2 + store i16 %"42", ptr addrspace(5) %"38", align 2 + %"44" = load i64, ptr addrspace(5) %"36", align 4 + %"52" = inttoptr i64 %"44" to ptr + %"31" = getelementptr inbounds i8, ptr %"52", i64 2 + %"45" = load i16, ptr %"31", align 2 + store i16 %"45", ptr addrspace(5) %"39", align 2 + %"47" = load i16, ptr addrspace(5) %"38", align 2 + %"48" = load i16, ptr addrspace(5) %"39", align 2 + %"46" = select i1 false, i16 %"47", i16 %"48" + store i16 %"46", ptr addrspace(5) %"38", align 2 + %"49" = load i64, ptr addrspace(5) %"37", align 4 + %"50" = load i16, ptr addrspace(5) %"38", align 2 + %"53" = inttoptr i64 %"49" to ptr + store i16 %"50", ptr %"53", align 2 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" 
"uniform-work-group-size"="true" } \ No newline at end of file +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/selp_true.ll b/ptx/src/test/ll/selp_true.ll index 6a3a18f..a422f89 100644 --- a/ptx/src/test/ll/selp_true.ll +++ b/ptx/src/test/ll/selp_true.ll @@ -1,49 +1,36 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @selp_true(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #1 { - %"42" = alloca i64, align 8, addrspace(5) - %"43" = alloca i64, align 8, addrspace(5) - %"44" = alloca i16, align 2, addrspace(5) - %"45" = alloca i16, align 2, addrspace(5) +define amdgpu_kernel void @selp_true(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i16, align 2, addrspace(5) + %"39" = alloca i16, align 2, addrspace(5) br label %1 1: ; preds = %0 br label %"33" "33": ; preds = %1 - %"46" = load i64, ptr addrspace(4) %"40", align 4 - store i64 %"46", ptr addrspace(5) %"42", align 4 - %"47" = load i64, ptr addrspace(4) %"41", align 4 - store i64 %"47", ptr addrspace(5) %"43", align 4 - %"49" = load i64, ptr addrspace(5) %"42", align 4 - %"57" = inttoptr i64 %"49" to ptr - %"48" = load i16, ptr %"57", align 2 - store i16 %"48", ptr addrspace(5) %"44", align 2 - %"50" = load i64, ptr addrspace(5) %"42", align 4 - %"58" = inttoptr i64 %"50" to ptr - %"31" = getelementptr inbounds i8, ptr %"58", i64 2 - %"51" = load i16, ptr %"31", align 2 - store i16 %"51", 
ptr addrspace(5) %"45", align 2 - %"53" = load i16, ptr addrspace(5) %"44", align 2 - %"54" = load i16, ptr addrspace(5) %"45", align 2 - %"52" = select i1 true, i16 %"53", i16 %"54" - store i16 %"52", ptr addrspace(5) %"44", align 2 - %"55" = load i64, ptr addrspace(5) %"43", align 4 - %"56" = load i16, ptr addrspace(5) %"44", align 2 - %"59" = inttoptr i64 %"55" to ptr - store i16 %"56", ptr %"59", align 2 + %"40" = load i64, ptr addrspace(4) %"34", align 4 + store i64 %"40", ptr addrspace(5) %"36", align 4 + %"41" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"41", ptr addrspace(5) %"37", align 4 + %"43" = load i64, ptr addrspace(5) %"36", align 4 + %"51" = inttoptr i64 %"43" to ptr + %"42" = load i16, ptr %"51", align 2 + store i16 %"42", ptr addrspace(5) %"38", align 2 + %"44" = load i64, ptr addrspace(5) %"36", align 4 + %"52" = inttoptr i64 %"44" to ptr + %"31" = getelementptr inbounds i8, ptr %"52", i64 2 + %"45" = load i16, ptr %"31", align 2 + store i16 %"45", ptr addrspace(5) %"39", align 2 + %"47" = load i16, ptr addrspace(5) %"38", align 2 + %"48" = load i16, ptr addrspace(5) %"39", align 2 + %"46" = select i1 true, i16 %"47", i16 %"48" + store i16 %"46", ptr addrspace(5) %"38", align 2 + %"49" = load i64, ptr addrspace(5) %"37", align 4 + %"50" = load i16, ptr addrspace(5) %"38", align 2 + %"53" = inttoptr i64 %"49" to ptr + store i16 %"50", ptr %"53", align 2 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } 
\ No newline at end of file diff --git a/ptx/src/test/ll/setp.ll b/ptx/src/test/ll/setp.ll index 1d7b52a..d0617b8 100644 --- a/ptx/src/test/ll/setp.ll +++ b/ptx/src/test/ll/setp.ll @@ -1,67 +1,54 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @setp(ptr addrspace(4) byref(i64) %"47", ptr addrspace(4) byref(i64) %"48") #1 { - %"49" = alloca i64, align 8, addrspace(5) - %"50" = alloca i64, align 8, addrspace(5) - %"51" = alloca i64, align 8, addrspace(5) - %"52" = alloca i64, align 8, addrspace(5) - %"53" = alloca i64, align 8, addrspace(5) - %"54" = alloca i1, align 1, addrspace(5) +define amdgpu_kernel void @setp(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #0 { + %"43" = alloca i64, align 8, addrspace(5) + %"44" = alloca i64, align 8, addrspace(5) + %"45" = alloca i64, align 8, addrspace(5) + %"46" = alloca i64, align 8, addrspace(5) + %"47" = alloca i64, align 8, addrspace(5) + %"48" = alloca i1, align 1, addrspace(5) br label %1 1: ; preds = %0 br label %"40" "40": ; preds = %1 - %"55" = load i64, ptr addrspace(4) %"47", align 4 - store i64 %"55", ptr addrspace(5) %"49", align 4 - %"56" = load i64, ptr addrspace(4) %"48", align 4 - store i64 %"56", ptr addrspace(5) %"50", align 4 - %"58" = load i64, ptr addrspace(5) %"49", align 4 - %"70" = inttoptr i64 %"58" to ptr - %"57" = load i64, ptr %"70", align 4 - store i64 %"57", ptr addrspace(5) %"51", align 4 - %"59" = load i64, ptr addrspace(5) %"49", align 4 - %"71" = inttoptr i64 %"59" to ptr - %"37" = getelementptr inbounds i8, ptr %"71", i64 8 - %"60" = load i64, ptr %"37", align 4 - store i64 %"60", ptr addrspace(5) %"52", align 4 - %"62" = load i64, ptr addrspace(5) %"51", align 4 - %"63" = load i64, 
ptr addrspace(5) %"52", align 4 - %"61" = icmp ult i64 %"62", %"63" - store i1 %"61", ptr addrspace(5) %"54", align 1 - %"64" = load i1, ptr addrspace(5) %"54", align 1 - br i1 %"64", label %"16", label %"17" + %"49" = load i64, ptr addrspace(4) %"41", align 4 + store i64 %"49", ptr addrspace(5) %"43", align 4 + %"50" = load i64, ptr addrspace(4) %"42", align 4 + store i64 %"50", ptr addrspace(5) %"44", align 4 + %"52" = load i64, ptr addrspace(5) %"43", align 4 + %"64" = inttoptr i64 %"52" to ptr + %"51" = load i64, ptr %"64", align 4 + store i64 %"51", ptr addrspace(5) %"45", align 4 + %"53" = load i64, ptr addrspace(5) %"43", align 4 + %"65" = inttoptr i64 %"53" to ptr + %"37" = getelementptr inbounds i8, ptr %"65", i64 8 + %"54" = load i64, ptr %"37", align 4 + store i64 %"54", ptr addrspace(5) %"46", align 4 + %"56" = load i64, ptr addrspace(5) %"45", align 4 + %"57" = load i64, ptr addrspace(5) %"46", align 4 + %"55" = icmp ult i64 %"56", %"57" + store i1 %"55", ptr addrspace(5) %"48", align 1 + %"58" = load i1, ptr addrspace(5) %"48", align 1 + br i1 %"58", label %"16", label %"17" "16": ; preds = %"40" - store i64 1, ptr addrspace(5) %"53", align 4 + store i64 1, ptr addrspace(5) %"47", align 4 br label %"17" "17": ; preds = %"16", %"40" - %"66" = load i1, ptr addrspace(5) %"54", align 1 - br i1 %"66", label %"19", label %"18" + %"60" = load i1, ptr addrspace(5) %"48", align 1 + br i1 %"60", label %"19", label %"18" "18": ; preds = %"17" - store i64 2, ptr addrspace(5) %"53", align 4 + store i64 2, ptr addrspace(5) %"47", align 4 br label %"19" "19": ; preds = %"18", %"17" - %"68" = load i64, ptr addrspace(5) %"50", align 4 - %"69" = load i64, ptr addrspace(5) %"53", align 4 - %"72" = inttoptr i64 %"68" to ptr - store i64 %"69", ptr %"72", align 4 + %"62" = load i64, ptr addrspace(5) %"44", align 4 + %"63" = load i64, ptr addrspace(5) %"47", align 4 + %"66" = inttoptr i64 %"62" to ptr + store i64 %"63", ptr %"66", align 4 ret void } -attributes #0 = { 
"amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/setp_gt.ll b/ptx/src/test/ll/setp_gt.ll index 1b09246..c02b59e 100644 --- a/ptx/src/test/ll/setp_gt.ll +++ b/ptx/src/test/ll/setp_gt.ll @@ -1,69 +1,56 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @setp_gt(ptr addrspace(4) byref(i64) %"45", ptr addrspace(4) byref(i64) %"46") #1 { - %"47" = alloca i64, align 8, addrspace(5) - %"48" = alloca i64, align 8, addrspace(5) - %"49" = alloca float, align 4, addrspace(5) - %"50" = alloca float, align 4, addrspace(5) - %"51" = alloca float, align 4, addrspace(5) - %"52" = alloca i1, align 1, addrspace(5) +define amdgpu_kernel void @setp_gt(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #0 { + %"41" = alloca i64, align 8, addrspace(5) + %"42" = alloca i64, align 8, addrspace(5) + %"43" = alloca float, align 4, addrspace(5) + %"44" = alloca float, align 4, addrspace(5) + %"45" = alloca float, align 4, addrspace(5) + %"46" = alloca i1, align 1, addrspace(5) br label %1 1: ; preds = %0 br label %"38" "38": ; preds = %1 - %"53" = load i64, ptr addrspace(4) %"45", align 4 - store i64 %"53", ptr addrspace(5) %"47", 
align 4 - %"54" = load i64, ptr addrspace(4) %"46", align 4 - store i64 %"54", ptr addrspace(5) %"48", align 4 - %"56" = load i64, ptr addrspace(5) %"47", align 4 - %"70" = inttoptr i64 %"56" to ptr - %"55" = load float, ptr %"70", align 4 - store float %"55", ptr addrspace(5) %"49", align 4 - %"57" = load i64, ptr addrspace(5) %"47", align 4 - %"71" = inttoptr i64 %"57" to ptr - %"37" = getelementptr inbounds i8, ptr %"71", i64 4 - %"58" = load float, ptr %"37", align 4 - store float %"58", ptr addrspace(5) %"50", align 4 - %"60" = load float, ptr addrspace(5) %"49", align 4 - %"61" = load float, ptr addrspace(5) %"50", align 4 - %"59" = fcmp ogt float %"60", %"61" - store i1 %"59", ptr addrspace(5) %"52", align 1 - %"62" = load i1, ptr addrspace(5) %"52", align 1 - br i1 %"62", label %"16", label %"17" + %"47" = load i64, ptr addrspace(4) %"39", align 4 + store i64 %"47", ptr addrspace(5) %"41", align 4 + %"48" = load i64, ptr addrspace(4) %"40", align 4 + store i64 %"48", ptr addrspace(5) %"42", align 4 + %"50" = load i64, ptr addrspace(5) %"41", align 4 + %"64" = inttoptr i64 %"50" to ptr + %"49" = load float, ptr %"64", align 4 + store float %"49", ptr addrspace(5) %"43", align 4 + %"51" = load i64, ptr addrspace(5) %"41", align 4 + %"65" = inttoptr i64 %"51" to ptr + %"37" = getelementptr inbounds i8, ptr %"65", i64 4 + %"52" = load float, ptr %"37", align 4 + store float %"52", ptr addrspace(5) %"44", align 4 + %"54" = load float, ptr addrspace(5) %"43", align 4 + %"55" = load float, ptr addrspace(5) %"44", align 4 + %"53" = fcmp ogt float %"54", %"55" + store i1 %"53", ptr addrspace(5) %"46", align 1 + %"56" = load i1, ptr addrspace(5) %"46", align 1 + br i1 %"56", label %"16", label %"17" "16": ; preds = %"38" - %"64" = load float, ptr addrspace(5) %"49", align 4 - store float %"64", ptr addrspace(5) %"51", align 4 + %"58" = load float, ptr addrspace(5) %"43", align 4 + store float %"58", ptr addrspace(5) %"45", align 4 br label %"17" "17": ; preds = 
%"16", %"38" - %"65" = load i1, ptr addrspace(5) %"52", align 1 - br i1 %"65", label %"19", label %"18" + %"59" = load i1, ptr addrspace(5) %"46", align 1 + br i1 %"59", label %"19", label %"18" "18": ; preds = %"17" - %"67" = load float, ptr addrspace(5) %"50", align 4 - store float %"67", ptr addrspace(5) %"51", align 4 + %"61" = load float, ptr addrspace(5) %"44", align 4 + store float %"61", ptr addrspace(5) %"45", align 4 br label %"19" "19": ; preds = %"18", %"17" - %"68" = load i64, ptr addrspace(5) %"48", align 4 - %"69" = load float, ptr addrspace(5) %"51", align 4 - %"72" = inttoptr i64 %"68" to ptr - store float %"69", ptr %"72", align 4 + %"62" = load i64, ptr addrspace(5) %"42", align 4 + %"63" = load float, ptr addrspace(5) %"45", align 4 + %"66" = inttoptr i64 %"62" to ptr + store float %"63", ptr %"66", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/setp_leu.ll b/ptx/src/test/ll/setp_leu.ll index 4f39849..5d19314 100644 --- a/ptx/src/test/ll/setp_leu.ll +++ b/ptx/src/test/ll/setp_leu.ll @@ -1,69 +1,56 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @setp_leu(ptr 
addrspace(4) byref(i64) %"45", ptr addrspace(4) byref(i64) %"46") #1 { - %"47" = alloca i64, align 8, addrspace(5) - %"48" = alloca i64, align 8, addrspace(5) - %"49" = alloca float, align 4, addrspace(5) - %"50" = alloca float, align 4, addrspace(5) - %"51" = alloca float, align 4, addrspace(5) - %"52" = alloca i1, align 1, addrspace(5) +define amdgpu_kernel void @setp_leu(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #0 { + %"41" = alloca i64, align 8, addrspace(5) + %"42" = alloca i64, align 8, addrspace(5) + %"43" = alloca float, align 4, addrspace(5) + %"44" = alloca float, align 4, addrspace(5) + %"45" = alloca float, align 4, addrspace(5) + %"46" = alloca i1, align 1, addrspace(5) br label %1 1: ; preds = %0 br label %"38" "38": ; preds = %1 - %"53" = load i64, ptr addrspace(4) %"45", align 4 - store i64 %"53", ptr addrspace(5) %"47", align 4 - %"54" = load i64, ptr addrspace(4) %"46", align 4 - store i64 %"54", ptr addrspace(5) %"48", align 4 - %"56" = load i64, ptr addrspace(5) %"47", align 4 - %"70" = inttoptr i64 %"56" to ptr - %"55" = load float, ptr %"70", align 4 - store float %"55", ptr addrspace(5) %"49", align 4 - %"57" = load i64, ptr addrspace(5) %"47", align 4 - %"71" = inttoptr i64 %"57" to ptr - %"37" = getelementptr inbounds i8, ptr %"71", i64 4 - %"58" = load float, ptr %"37", align 4 - store float %"58", ptr addrspace(5) %"50", align 4 - %"60" = load float, ptr addrspace(5) %"49", align 4 - %"61" = load float, ptr addrspace(5) %"50", align 4 - %"59" = fcmp ule float %"60", %"61" - store i1 %"59", ptr addrspace(5) %"52", align 1 - %"62" = load i1, ptr addrspace(5) %"52", align 1 - br i1 %"62", label %"16", label %"17" + %"47" = load i64, ptr addrspace(4) %"39", align 4 + store i64 %"47", ptr addrspace(5) %"41", align 4 + %"48" = load i64, ptr addrspace(4) %"40", align 4 + store i64 %"48", ptr addrspace(5) %"42", align 4 + %"50" = load i64, ptr addrspace(5) %"41", align 4 + %"64" = inttoptr i64 %"50" to ptr + %"49" = 
load float, ptr %"64", align 4 + store float %"49", ptr addrspace(5) %"43", align 4 + %"51" = load i64, ptr addrspace(5) %"41", align 4 + %"65" = inttoptr i64 %"51" to ptr + %"37" = getelementptr inbounds i8, ptr %"65", i64 4 + %"52" = load float, ptr %"37", align 4 + store float %"52", ptr addrspace(5) %"44", align 4 + %"54" = load float, ptr addrspace(5) %"43", align 4 + %"55" = load float, ptr addrspace(5) %"44", align 4 + %"53" = fcmp ule float %"54", %"55" + store i1 %"53", ptr addrspace(5) %"46", align 1 + %"56" = load i1, ptr addrspace(5) %"46", align 1 + br i1 %"56", label %"16", label %"17" "16": ; preds = %"38" - %"64" = load float, ptr addrspace(5) %"49", align 4 - store float %"64", ptr addrspace(5) %"51", align 4 + %"58" = load float, ptr addrspace(5) %"43", align 4 + store float %"58", ptr addrspace(5) %"45", align 4 br label %"17" "17": ; preds = %"16", %"38" - %"65" = load i1, ptr addrspace(5) %"52", align 1 - br i1 %"65", label %"19", label %"18" + %"59" = load i1, ptr addrspace(5) %"46", align 1 + br i1 %"59", label %"19", label %"18" "18": ; preds = %"17" - %"67" = load float, ptr addrspace(5) %"50", align 4 - store float %"67", ptr addrspace(5) %"51", align 4 + %"61" = load float, ptr addrspace(5) %"44", align 4 + store float %"61", ptr addrspace(5) %"45", align 4 br label %"19" "19": ; preds = %"18", %"17" - %"68" = load i64, ptr addrspace(5) %"48", align 4 - %"69" = load float, ptr addrspace(5) %"51", align 4 - %"72" = inttoptr i64 %"68" to ptr - store float %"69", ptr %"72", align 4 + %"62" = load i64, ptr addrspace(5) %"42", align 4 + %"63" = load float, ptr addrspace(5) %"45", align 4 + %"66" = inttoptr i64 %"62" to ptr + store float %"63", ptr %"66", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" 
"denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/setp_nan.ll b/ptx/src/test/ll/setp_nan.ll index 0a11f0d..ca1e98b 100644 --- a/ptx/src/test/ll/setp_nan.ll +++ b/ptx/src/test/ll/setp_nan.ll @@ -1,178 +1,165 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @setp_nan(ptr addrspace(4) byref(i64) %"89", ptr addrspace(4) byref(i64) %"90") #1 { - %"91" = alloca i64, align 8, addrspace(5) - %"92" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @setp_nan(ptr addrspace(4) byref(i64) %"83", ptr addrspace(4) byref(i64) %"84") #0 { + %"85" = alloca i64, align 8, addrspace(5) + %"86" = alloca i64, align 8, addrspace(5) + %"87" = alloca float, align 4, addrspace(5) + %"88" = alloca float, align 4, addrspace(5) + %"89" = alloca float, align 4, addrspace(5) + %"90" = alloca float, align 4, addrspace(5) + %"91" = alloca float, align 4, addrspace(5) + %"92" = alloca float, align 4, addrspace(5) %"93" = alloca float, align 4, addrspace(5) %"94" = alloca float, align 4, addrspace(5) - %"95" = alloca float, align 4, addrspace(5) - %"96" = alloca float, align 4, addrspace(5) - %"97" = alloca float, align 4, addrspace(5) - %"98" = alloca float, align 4, addrspace(5) - %"99" = alloca float, align 4, addrspace(5) - %"100" = alloca float, align 4, addrspace(5) - %"101" = alloca i32, align 4, addrspace(5) - %"102" = alloca i1, align 1, addrspace(5) + %"95" = alloca i32, align 4, 
addrspace(5) + %"96" = alloca i1, align 1, addrspace(5) br label %1 1: ; preds = %0 br label %"82" "82": ; preds = %1 - %"103" = load i64, ptr addrspace(4) %"89", align 4 - store i64 %"103", ptr addrspace(5) %"91", align 4 - %"104" = load i64, ptr addrspace(4) %"90", align 4 - store i64 %"104", ptr addrspace(5) %"92", align 4 - %"106" = load i64, ptr addrspace(5) %"91", align 4 - %"157" = inttoptr i64 %"106" to ptr - %"105" = load float, ptr %"157", align 4 - store float %"105", ptr addrspace(5) %"93", align 4 - %"107" = load i64, ptr addrspace(5) %"91", align 4 - %"158" = inttoptr i64 %"107" to ptr - %"55" = getelementptr inbounds i8, ptr %"158", i64 4 - %"108" = load float, ptr %"55", align 4 - store float %"108", ptr addrspace(5) %"94", align 4 - %"109" = load i64, ptr addrspace(5) %"91", align 4 - %"159" = inttoptr i64 %"109" to ptr - %"57" = getelementptr inbounds i8, ptr %"159", i64 8 - %"110" = load float, ptr %"57", align 4 - store float %"110", ptr addrspace(5) %"95", align 4 - %"111" = load i64, ptr addrspace(5) %"91", align 4 - %"160" = inttoptr i64 %"111" to ptr - %"59" = getelementptr inbounds i8, ptr %"160", i64 12 - %"112" = load float, ptr %"59", align 4 - store float %"112", ptr addrspace(5) %"96", align 4 - %"113" = load i64, ptr addrspace(5) %"91", align 4 - %"161" = inttoptr i64 %"113" to ptr - %"61" = getelementptr inbounds i8, ptr %"161", i64 16 - %"114" = load float, ptr %"61", align 4 - store float %"114", ptr addrspace(5) %"97", align 4 - %"115" = load i64, ptr addrspace(5) %"91", align 4 - %"162" = inttoptr i64 %"115" to ptr - %"63" = getelementptr inbounds i8, ptr %"162", i64 20 - %"116" = load float, ptr %"63", align 4 - store float %"116", ptr addrspace(5) %"98", align 4 - %"117" = load i64, ptr addrspace(5) %"91", align 4 - %"163" = inttoptr i64 %"117" to ptr - %"65" = getelementptr inbounds i8, ptr %"163", i64 24 - %"118" = load float, ptr %"65", align 4 - store float %"118", ptr addrspace(5) %"99", align 4 - %"119" = load i64, ptr 
addrspace(5) %"91", align 4 - %"164" = inttoptr i64 %"119" to ptr - %"67" = getelementptr inbounds i8, ptr %"164", i64 28 - %"120" = load float, ptr %"67", align 4 - store float %"120", ptr addrspace(5) %"100", align 4 - %"122" = load float, ptr addrspace(5) %"93", align 4 - %"123" = load float, ptr addrspace(5) %"94", align 4 - %"121" = fcmp uno float %"122", %"123" - store i1 %"121", ptr addrspace(5) %"102", align 1 - %"124" = load i1, ptr addrspace(5) %"102", align 1 - br i1 %"124", label %"22", label %"23" + %"97" = load i64, ptr addrspace(4) %"83", align 4 + store i64 %"97", ptr addrspace(5) %"85", align 4 + %"98" = load i64, ptr addrspace(4) %"84", align 4 + store i64 %"98", ptr addrspace(5) %"86", align 4 + %"100" = load i64, ptr addrspace(5) %"85", align 4 + %"151" = inttoptr i64 %"100" to ptr + %"99" = load float, ptr %"151", align 4 + store float %"99", ptr addrspace(5) %"87", align 4 + %"101" = load i64, ptr addrspace(5) %"85", align 4 + %"152" = inttoptr i64 %"101" to ptr + %"55" = getelementptr inbounds i8, ptr %"152", i64 4 + %"102" = load float, ptr %"55", align 4 + store float %"102", ptr addrspace(5) %"88", align 4 + %"103" = load i64, ptr addrspace(5) %"85", align 4 + %"153" = inttoptr i64 %"103" to ptr + %"57" = getelementptr inbounds i8, ptr %"153", i64 8 + %"104" = load float, ptr %"57", align 4 + store float %"104", ptr addrspace(5) %"89", align 4 + %"105" = load i64, ptr addrspace(5) %"85", align 4 + %"154" = inttoptr i64 %"105" to ptr + %"59" = getelementptr inbounds i8, ptr %"154", i64 12 + %"106" = load float, ptr %"59", align 4 + store float %"106", ptr addrspace(5) %"90", align 4 + %"107" = load i64, ptr addrspace(5) %"85", align 4 + %"155" = inttoptr i64 %"107" to ptr + %"61" = getelementptr inbounds i8, ptr %"155", i64 16 + %"108" = load float, ptr %"61", align 4 + store float %"108", ptr addrspace(5) %"91", align 4 + %"109" = load i64, ptr addrspace(5) %"85", align 4 + %"156" = inttoptr i64 %"109" to ptr + %"63" = getelementptr 
inbounds i8, ptr %"156", i64 20 + %"110" = load float, ptr %"63", align 4 + store float %"110", ptr addrspace(5) %"92", align 4 + %"111" = load i64, ptr addrspace(5) %"85", align 4 + %"157" = inttoptr i64 %"111" to ptr + %"65" = getelementptr inbounds i8, ptr %"157", i64 24 + %"112" = load float, ptr %"65", align 4 + store float %"112", ptr addrspace(5) %"93", align 4 + %"113" = load i64, ptr addrspace(5) %"85", align 4 + %"158" = inttoptr i64 %"113" to ptr + %"67" = getelementptr inbounds i8, ptr %"158", i64 28 + %"114" = load float, ptr %"67", align 4 + store float %"114", ptr addrspace(5) %"94", align 4 + %"116" = load float, ptr addrspace(5) %"87", align 4 + %"117" = load float, ptr addrspace(5) %"88", align 4 + %"115" = fcmp uno float %"116", %"117" + store i1 %"115", ptr addrspace(5) %"96", align 1 + %"118" = load i1, ptr addrspace(5) %"96", align 1 + br i1 %"118", label %"22", label %"23" "22": ; preds = %"82" - store i32 1, ptr addrspace(5) %"101", align 4 + store i32 1, ptr addrspace(5) %"95", align 4 br label %"23" "23": ; preds = %"22", %"82" - %"126" = load i1, ptr addrspace(5) %"102", align 1 - br i1 %"126", label %"25", label %"24" + %"120" = load i1, ptr addrspace(5) %"96", align 1 + br i1 %"120", label %"25", label %"24" "24": ; preds = %"23" - store i32 0, ptr addrspace(5) %"101", align 4 + store i32 0, ptr addrspace(5) %"95", align 4 br label %"25" "25": ; preds = %"24", %"23" - %"128" = load i64, ptr addrspace(5) %"92", align 4 - %"129" = load i32, ptr addrspace(5) %"101", align 4 - %"165" = inttoptr i64 %"128" to ptr - store i32 %"129", ptr %"165", align 4 - %"131" = load float, ptr addrspace(5) %"95", align 4 - %"132" = load float, ptr addrspace(5) %"96", align 4 - %"130" = fcmp uno float %"131", %"132" - store i1 %"130", ptr addrspace(5) %"102", align 1 - %"133" = load i1, ptr addrspace(5) %"102", align 1 - br i1 %"133", label %"26", label %"27" + %"122" = load i64, ptr addrspace(5) %"86", align 4 + %"123" = load i32, ptr addrspace(5) %"95", 
align 4 + %"159" = inttoptr i64 %"122" to ptr + store i32 %"123", ptr %"159", align 4 + %"125" = load float, ptr addrspace(5) %"89", align 4 + %"126" = load float, ptr addrspace(5) %"90", align 4 + %"124" = fcmp uno float %"125", %"126" + store i1 %"124", ptr addrspace(5) %"96", align 1 + %"127" = load i1, ptr addrspace(5) %"96", align 1 + br i1 %"127", label %"26", label %"27" "26": ; preds = %"25" - store i32 1, ptr addrspace(5) %"101", align 4 + store i32 1, ptr addrspace(5) %"95", align 4 br label %"27" "27": ; preds = %"26", %"25" - %"135" = load i1, ptr addrspace(5) %"102", align 1 - br i1 %"135", label %"29", label %"28" + %"129" = load i1, ptr addrspace(5) %"96", align 1 + br i1 %"129", label %"29", label %"28" "28": ; preds = %"27" - store i32 0, ptr addrspace(5) %"101", align 4 + store i32 0, ptr addrspace(5) %"95", align 4 br label %"29" "29": ; preds = %"28", %"27" - %"137" = load i64, ptr addrspace(5) %"92", align 4 - %"166" = inttoptr i64 %"137" to ptr - %"73" = getelementptr inbounds i8, ptr %"166", i64 4 - %"138" = load i32, ptr addrspace(5) %"101", align 4 - store i32 %"138", ptr %"73", align 4 - %"140" = load float, ptr addrspace(5) %"97", align 4 - %"141" = load float, ptr addrspace(5) %"98", align 4 - %"139" = fcmp uno float %"140", %"141" - store i1 %"139", ptr addrspace(5) %"102", align 1 - %"142" = load i1, ptr addrspace(5) %"102", align 1 - br i1 %"142", label %"30", label %"31" + %"131" = load i64, ptr addrspace(5) %"86", align 4 + %"160" = inttoptr i64 %"131" to ptr + %"73" = getelementptr inbounds i8, ptr %"160", i64 4 + %"132" = load i32, ptr addrspace(5) %"95", align 4 + store i32 %"132", ptr %"73", align 4 + %"134" = load float, ptr addrspace(5) %"91", align 4 + %"135" = load float, ptr addrspace(5) %"92", align 4 + %"133" = fcmp uno float %"134", %"135" + store i1 %"133", ptr addrspace(5) %"96", align 1 + %"136" = load i1, ptr addrspace(5) %"96", align 1 + br i1 %"136", label %"30", label %"31" "30": ; preds = %"29" - store i32 1, ptr 
addrspace(5) %"101", align 4 + store i32 1, ptr addrspace(5) %"95", align 4 br label %"31" "31": ; preds = %"30", %"29" - %"144" = load i1, ptr addrspace(5) %"102", align 1 - br i1 %"144", label %"33", label %"32" + %"138" = load i1, ptr addrspace(5) %"96", align 1 + br i1 %"138", label %"33", label %"32" "32": ; preds = %"31" - store i32 0, ptr addrspace(5) %"101", align 4 + store i32 0, ptr addrspace(5) %"95", align 4 br label %"33" "33": ; preds = %"32", %"31" - %"146" = load i64, ptr addrspace(5) %"92", align 4 - %"167" = inttoptr i64 %"146" to ptr - %"77" = getelementptr inbounds i8, ptr %"167", i64 8 - %"147" = load i32, ptr addrspace(5) %"101", align 4 - store i32 %"147", ptr %"77", align 4 - %"149" = load float, ptr addrspace(5) %"99", align 4 - %"150" = load float, ptr addrspace(5) %"100", align 4 - %"148" = fcmp uno float %"149", %"150" - store i1 %"148", ptr addrspace(5) %"102", align 1 - %"151" = load i1, ptr addrspace(5) %"102", align 1 - br i1 %"151", label %"34", label %"35" + %"140" = load i64, ptr addrspace(5) %"86", align 4 + %"161" = inttoptr i64 %"140" to ptr + %"77" = getelementptr inbounds i8, ptr %"161", i64 8 + %"141" = load i32, ptr addrspace(5) %"95", align 4 + store i32 %"141", ptr %"77", align 4 + %"143" = load float, ptr addrspace(5) %"93", align 4 + %"144" = load float, ptr addrspace(5) %"94", align 4 + %"142" = fcmp uno float %"143", %"144" + store i1 %"142", ptr addrspace(5) %"96", align 1 + %"145" = load i1, ptr addrspace(5) %"96", align 1 + br i1 %"145", label %"34", label %"35" "34": ; preds = %"33" - store i32 1, ptr addrspace(5) %"101", align 4 + store i32 1, ptr addrspace(5) %"95", align 4 br label %"35" "35": ; preds = %"34", %"33" - %"153" = load i1, ptr addrspace(5) %"102", align 1 - br i1 %"153", label %"37", label %"36" + %"147" = load i1, ptr addrspace(5) %"96", align 1 + br i1 %"147", label %"37", label %"36" "36": ; preds = %"35" - store i32 0, ptr addrspace(5) %"101", align 4 + store i32 0, ptr addrspace(5) %"95", 
align 4 br label %"37" "37": ; preds = %"36", %"35" - %"155" = load i64, ptr addrspace(5) %"92", align 4 - %"168" = inttoptr i64 %"155" to ptr - %"81" = getelementptr inbounds i8, ptr %"168", i64 12 - %"156" = load i32, ptr addrspace(5) %"101", align 4 - store i32 %"156", ptr %"81", align 4 + %"149" = load i64, ptr addrspace(5) %"86", align 4 + %"162" = inttoptr i64 %"149" to ptr + %"81" = getelementptr inbounds i8, ptr %"162", i64 12 + %"150" = load i32, ptr addrspace(5) %"95", align 4 + store i32 %"150", ptr %"81", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/setp_num.ll b/ptx/src/test/ll/setp_num.ll index 8fadbaa..4a6d56f 100644 --- a/ptx/src/test/ll/setp_num.ll +++ b/ptx/src/test/ll/setp_num.ll @@ -1,178 +1,165 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @setp_num(ptr addrspace(4) byref(i64) %"89", ptr addrspace(4) byref(i64) %"90") #1 { - %"91" = alloca i64, align 8, addrspace(5) - %"92" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @setp_num(ptr addrspace(4) byref(i64) %"83", ptr addrspace(4) byref(i64) %"84") #0 { + %"85" = alloca i64, align 8, addrspace(5) + %"86" = alloca 
i64, align 8, addrspace(5) + %"87" = alloca float, align 4, addrspace(5) + %"88" = alloca float, align 4, addrspace(5) + %"89" = alloca float, align 4, addrspace(5) + %"90" = alloca float, align 4, addrspace(5) + %"91" = alloca float, align 4, addrspace(5) + %"92" = alloca float, align 4, addrspace(5) %"93" = alloca float, align 4, addrspace(5) %"94" = alloca float, align 4, addrspace(5) - %"95" = alloca float, align 4, addrspace(5) - %"96" = alloca float, align 4, addrspace(5) - %"97" = alloca float, align 4, addrspace(5) - %"98" = alloca float, align 4, addrspace(5) - %"99" = alloca float, align 4, addrspace(5) - %"100" = alloca float, align 4, addrspace(5) - %"101" = alloca i32, align 4, addrspace(5) - %"102" = alloca i1, align 1, addrspace(5) + %"95" = alloca i32, align 4, addrspace(5) + %"96" = alloca i1, align 1, addrspace(5) br label %1 1: ; preds = %0 br label %"82" "82": ; preds = %1 - %"103" = load i64, ptr addrspace(4) %"89", align 4 - store i64 %"103", ptr addrspace(5) %"91", align 4 - %"104" = load i64, ptr addrspace(4) %"90", align 4 - store i64 %"104", ptr addrspace(5) %"92", align 4 - %"106" = load i64, ptr addrspace(5) %"91", align 4 - %"157" = inttoptr i64 %"106" to ptr - %"105" = load float, ptr %"157", align 4 - store float %"105", ptr addrspace(5) %"93", align 4 - %"107" = load i64, ptr addrspace(5) %"91", align 4 - %"158" = inttoptr i64 %"107" to ptr - %"55" = getelementptr inbounds i8, ptr %"158", i64 4 - %"108" = load float, ptr %"55", align 4 - store float %"108", ptr addrspace(5) %"94", align 4 - %"109" = load i64, ptr addrspace(5) %"91", align 4 - %"159" = inttoptr i64 %"109" to ptr - %"57" = getelementptr inbounds i8, ptr %"159", i64 8 - %"110" = load float, ptr %"57", align 4 - store float %"110", ptr addrspace(5) %"95", align 4 - %"111" = load i64, ptr addrspace(5) %"91", align 4 - %"160" = inttoptr i64 %"111" to ptr - %"59" = getelementptr inbounds i8, ptr %"160", i64 12 - %"112" = load float, ptr %"59", align 4 - store float %"112", 
ptr addrspace(5) %"96", align 4 - %"113" = load i64, ptr addrspace(5) %"91", align 4 - %"161" = inttoptr i64 %"113" to ptr - %"61" = getelementptr inbounds i8, ptr %"161", i64 16 - %"114" = load float, ptr %"61", align 4 - store float %"114", ptr addrspace(5) %"97", align 4 - %"115" = load i64, ptr addrspace(5) %"91", align 4 - %"162" = inttoptr i64 %"115" to ptr - %"63" = getelementptr inbounds i8, ptr %"162", i64 20 - %"116" = load float, ptr %"63", align 4 - store float %"116", ptr addrspace(5) %"98", align 4 - %"117" = load i64, ptr addrspace(5) %"91", align 4 - %"163" = inttoptr i64 %"117" to ptr - %"65" = getelementptr inbounds i8, ptr %"163", i64 24 - %"118" = load float, ptr %"65", align 4 - store float %"118", ptr addrspace(5) %"99", align 4 - %"119" = load i64, ptr addrspace(5) %"91", align 4 - %"164" = inttoptr i64 %"119" to ptr - %"67" = getelementptr inbounds i8, ptr %"164", i64 28 - %"120" = load float, ptr %"67", align 4 - store float %"120", ptr addrspace(5) %"100", align 4 - %"122" = load float, ptr addrspace(5) %"93", align 4 - %"123" = load float, ptr addrspace(5) %"94", align 4 - %"121" = fcmp ord float %"122", %"123" - store i1 %"121", ptr addrspace(5) %"102", align 1 - %"124" = load i1, ptr addrspace(5) %"102", align 1 - br i1 %"124", label %"22", label %"23" + %"97" = load i64, ptr addrspace(4) %"83", align 4 + store i64 %"97", ptr addrspace(5) %"85", align 4 + %"98" = load i64, ptr addrspace(4) %"84", align 4 + store i64 %"98", ptr addrspace(5) %"86", align 4 + %"100" = load i64, ptr addrspace(5) %"85", align 4 + %"151" = inttoptr i64 %"100" to ptr + %"99" = load float, ptr %"151", align 4 + store float %"99", ptr addrspace(5) %"87", align 4 + %"101" = load i64, ptr addrspace(5) %"85", align 4 + %"152" = inttoptr i64 %"101" to ptr + %"55" = getelementptr inbounds i8, ptr %"152", i64 4 + %"102" = load float, ptr %"55", align 4 + store float %"102", ptr addrspace(5) %"88", align 4 + %"103" = load i64, ptr addrspace(5) %"85", align 4 + %"153" = 
inttoptr i64 %"103" to ptr + %"57" = getelementptr inbounds i8, ptr %"153", i64 8 + %"104" = load float, ptr %"57", align 4 + store float %"104", ptr addrspace(5) %"89", align 4 + %"105" = load i64, ptr addrspace(5) %"85", align 4 + %"154" = inttoptr i64 %"105" to ptr + %"59" = getelementptr inbounds i8, ptr %"154", i64 12 + %"106" = load float, ptr %"59", align 4 + store float %"106", ptr addrspace(5) %"90", align 4 + %"107" = load i64, ptr addrspace(5) %"85", align 4 + %"155" = inttoptr i64 %"107" to ptr + %"61" = getelementptr inbounds i8, ptr %"155", i64 16 + %"108" = load float, ptr %"61", align 4 + store float %"108", ptr addrspace(5) %"91", align 4 + %"109" = load i64, ptr addrspace(5) %"85", align 4 + %"156" = inttoptr i64 %"109" to ptr + %"63" = getelementptr inbounds i8, ptr %"156", i64 20 + %"110" = load float, ptr %"63", align 4 + store float %"110", ptr addrspace(5) %"92", align 4 + %"111" = load i64, ptr addrspace(5) %"85", align 4 + %"157" = inttoptr i64 %"111" to ptr + %"65" = getelementptr inbounds i8, ptr %"157", i64 24 + %"112" = load float, ptr %"65", align 4 + store float %"112", ptr addrspace(5) %"93", align 4 + %"113" = load i64, ptr addrspace(5) %"85", align 4 + %"158" = inttoptr i64 %"113" to ptr + %"67" = getelementptr inbounds i8, ptr %"158", i64 28 + %"114" = load float, ptr %"67", align 4 + store float %"114", ptr addrspace(5) %"94", align 4 + %"116" = load float, ptr addrspace(5) %"87", align 4 + %"117" = load float, ptr addrspace(5) %"88", align 4 + %"115" = fcmp ord float %"116", %"117" + store i1 %"115", ptr addrspace(5) %"96", align 1 + %"118" = load i1, ptr addrspace(5) %"96", align 1 + br i1 %"118", label %"22", label %"23" "22": ; preds = %"82" - store i32 2, ptr addrspace(5) %"101", align 4 + store i32 2, ptr addrspace(5) %"95", align 4 br label %"23" "23": ; preds = %"22", %"82" - %"126" = load i1, ptr addrspace(5) %"102", align 1 - br i1 %"126", label %"25", label %"24" + %"120" = load i1, ptr addrspace(5) %"96", align 1 + br 
i1 %"120", label %"25", label %"24" "24": ; preds = %"23" - store i32 0, ptr addrspace(5) %"101", align 4 + store i32 0, ptr addrspace(5) %"95", align 4 br label %"25" "25": ; preds = %"24", %"23" - %"128" = load i64, ptr addrspace(5) %"92", align 4 - %"129" = load i32, ptr addrspace(5) %"101", align 4 - %"165" = inttoptr i64 %"128" to ptr - store i32 %"129", ptr %"165", align 4 - %"131" = load float, ptr addrspace(5) %"95", align 4 - %"132" = load float, ptr addrspace(5) %"96", align 4 - %"130" = fcmp ord float %"131", %"132" - store i1 %"130", ptr addrspace(5) %"102", align 1 - %"133" = load i1, ptr addrspace(5) %"102", align 1 - br i1 %"133", label %"26", label %"27" + %"122" = load i64, ptr addrspace(5) %"86", align 4 + %"123" = load i32, ptr addrspace(5) %"95", align 4 + %"159" = inttoptr i64 %"122" to ptr + store i32 %"123", ptr %"159", align 4 + %"125" = load float, ptr addrspace(5) %"89", align 4 + %"126" = load float, ptr addrspace(5) %"90", align 4 + %"124" = fcmp ord float %"125", %"126" + store i1 %"124", ptr addrspace(5) %"96", align 1 + %"127" = load i1, ptr addrspace(5) %"96", align 1 + br i1 %"127", label %"26", label %"27" "26": ; preds = %"25" - store i32 2, ptr addrspace(5) %"101", align 4 + store i32 2, ptr addrspace(5) %"95", align 4 br label %"27" "27": ; preds = %"26", %"25" - %"135" = load i1, ptr addrspace(5) %"102", align 1 - br i1 %"135", label %"29", label %"28" + %"129" = load i1, ptr addrspace(5) %"96", align 1 + br i1 %"129", label %"29", label %"28" "28": ; preds = %"27" - store i32 0, ptr addrspace(5) %"101", align 4 + store i32 0, ptr addrspace(5) %"95", align 4 br label %"29" "29": ; preds = %"28", %"27" - %"137" = load i64, ptr addrspace(5) %"92", align 4 - %"166" = inttoptr i64 %"137" to ptr - %"73" = getelementptr inbounds i8, ptr %"166", i64 4 - %"138" = load i32, ptr addrspace(5) %"101", align 4 - store i32 %"138", ptr %"73", align 4 - %"140" = load float, ptr addrspace(5) %"97", align 4 - %"141" = load float, ptr 
addrspace(5) %"98", align 4 - %"139" = fcmp ord float %"140", %"141" - store i1 %"139", ptr addrspace(5) %"102", align 1 - %"142" = load i1, ptr addrspace(5) %"102", align 1 - br i1 %"142", label %"30", label %"31" + %"131" = load i64, ptr addrspace(5) %"86", align 4 + %"160" = inttoptr i64 %"131" to ptr + %"73" = getelementptr inbounds i8, ptr %"160", i64 4 + %"132" = load i32, ptr addrspace(5) %"95", align 4 + store i32 %"132", ptr %"73", align 4 + %"134" = load float, ptr addrspace(5) %"91", align 4 + %"135" = load float, ptr addrspace(5) %"92", align 4 + %"133" = fcmp ord float %"134", %"135" + store i1 %"133", ptr addrspace(5) %"96", align 1 + %"136" = load i1, ptr addrspace(5) %"96", align 1 + br i1 %"136", label %"30", label %"31" "30": ; preds = %"29" - store i32 2, ptr addrspace(5) %"101", align 4 + store i32 2, ptr addrspace(5) %"95", align 4 br label %"31" "31": ; preds = %"30", %"29" - %"144" = load i1, ptr addrspace(5) %"102", align 1 - br i1 %"144", label %"33", label %"32" + %"138" = load i1, ptr addrspace(5) %"96", align 1 + br i1 %"138", label %"33", label %"32" "32": ; preds = %"31" - store i32 0, ptr addrspace(5) %"101", align 4 + store i32 0, ptr addrspace(5) %"95", align 4 br label %"33" "33": ; preds = %"32", %"31" - %"146" = load i64, ptr addrspace(5) %"92", align 4 - %"167" = inttoptr i64 %"146" to ptr - %"77" = getelementptr inbounds i8, ptr %"167", i64 8 - %"147" = load i32, ptr addrspace(5) %"101", align 4 - store i32 %"147", ptr %"77", align 4 - %"149" = load float, ptr addrspace(5) %"99", align 4 - %"150" = load float, ptr addrspace(5) %"100", align 4 - %"148" = fcmp ord float %"149", %"150" - store i1 %"148", ptr addrspace(5) %"102", align 1 - %"151" = load i1, ptr addrspace(5) %"102", align 1 - br i1 %"151", label %"34", label %"35" + %"140" = load i64, ptr addrspace(5) %"86", align 4 + %"161" = inttoptr i64 %"140" to ptr + %"77" = getelementptr inbounds i8, ptr %"161", i64 8 + %"141" = load i32, ptr addrspace(5) %"95", align 4 + 
store i32 %"141", ptr %"77", align 4 + %"143" = load float, ptr addrspace(5) %"93", align 4 + %"144" = load float, ptr addrspace(5) %"94", align 4 + %"142" = fcmp ord float %"143", %"144" + store i1 %"142", ptr addrspace(5) %"96", align 1 + %"145" = load i1, ptr addrspace(5) %"96", align 1 + br i1 %"145", label %"34", label %"35" "34": ; preds = %"33" - store i32 2, ptr addrspace(5) %"101", align 4 + store i32 2, ptr addrspace(5) %"95", align 4 br label %"35" "35": ; preds = %"34", %"33" - %"153" = load i1, ptr addrspace(5) %"102", align 1 - br i1 %"153", label %"37", label %"36" + %"147" = load i1, ptr addrspace(5) %"96", align 1 + br i1 %"147", label %"37", label %"36" "36": ; preds = %"35" - store i32 0, ptr addrspace(5) %"101", align 4 + store i32 0, ptr addrspace(5) %"95", align 4 br label %"37" "37": ; preds = %"36", %"35" - %"155" = load i64, ptr addrspace(5) %"92", align 4 - %"168" = inttoptr i64 %"155" to ptr - %"81" = getelementptr inbounds i8, ptr %"168", i64 12 - %"156" = load i32, ptr addrspace(5) %"101", align 4 - store i32 %"156", ptr %"81", align 4 + %"149" = load i64, ptr addrspace(5) %"86", align 4 + %"162" = inttoptr i64 %"149" to ptr + %"81" = getelementptr inbounds i8, ptr %"162", i64 12 + %"150" = load i32, ptr addrspace(5) %"95", align 4 + store i32 %"150", ptr %"81", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/shared_ptr_32.ll 
b/ptx/src/test/ll/shared_ptr_32.ll index 4501f4a..5a6f55f 100644 --- a/ptx/src/test/ll/shared_ptr_32.ll +++ b/ptx/src/test/ll/shared_ptr_32.ll @@ -1,53 +1,40 @@ @shared_mem1 = external addrspace(3) global [128 x i8], align 4 -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @shared_ptr_32(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #1 { - %"43" = alloca i64, align 8, addrspace(5) - %"44" = alloca i64, align 8, addrspace(5) - %"45" = alloca i32, align 4, addrspace(5) - %"46" = alloca i64, align 8, addrspace(5) - %"47" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @shared_ptr_32(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i32, align 4, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 br label %"34" "34": ; preds = %1 - %"48" = load i64, ptr addrspace(4) %"41", align 4 - store i64 %"48", ptr addrspace(5) %"43", align 4 - %"49" = load i64, ptr addrspace(4) %"42", align 4 - store i64 %"49", ptr addrspace(5) %"44", align 4 - store i32 ptrtoint (ptr addrspace(3) @shared_mem1 to i32), ptr addrspace(5) %"45", align 4 - %"52" = load i64, ptr addrspace(5) %"43", align 4 - %"60" = inttoptr i64 %"52" to ptr addrspace(1) - %"51" = load i64, ptr addrspace(1) %"60", align 4 - store i64 %"51", ptr addrspace(5) %"46", align 4 - %"53" = load i32, ptr addrspace(5) %"45", align 4 - %"54" = load i64, ptr addrspace(5) %"46", align 4 - %"61" = inttoptr i32 %"53" to ptr addrspace(3) - store i64 %"54", ptr addrspace(3) %"61", align 4 - %"55" = load i32, ptr 
addrspace(5) %"45", align 4 - %"62" = inttoptr i32 %"55" to ptr addrspace(3) - %"33" = getelementptr inbounds i8, ptr addrspace(3) %"62", i64 0 - %"56" = load i64, ptr addrspace(3) %"33", align 4 - store i64 %"56", ptr addrspace(5) %"47", align 4 - %"57" = load i64, ptr addrspace(5) %"44", align 4 - %"58" = load i64, ptr addrspace(5) %"47", align 4 - %"63" = inttoptr i64 %"57" to ptr addrspace(1) - store i64 %"58", ptr addrspace(1) %"63", align 4 + %"42" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"42", ptr addrspace(5) %"37", align 4 + %"43" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"43", ptr addrspace(5) %"38", align 4 + store i32 ptrtoint (ptr addrspace(3) @shared_mem1 to i32), ptr addrspace(5) %"39", align 4 + %"46" = load i64, ptr addrspace(5) %"37", align 4 + %"54" = inttoptr i64 %"46" to ptr addrspace(1) + %"45" = load i64, ptr addrspace(1) %"54", align 4 + store i64 %"45", ptr addrspace(5) %"40", align 4 + %"47" = load i32, ptr addrspace(5) %"39", align 4 + %"48" = load i64, ptr addrspace(5) %"40", align 4 + %"55" = inttoptr i32 %"47" to ptr addrspace(3) + store i64 %"48", ptr addrspace(3) %"55", align 4 + %"49" = load i32, ptr addrspace(5) %"39", align 4 + %"56" = inttoptr i32 %"49" to ptr addrspace(3) + %"33" = getelementptr inbounds i8, ptr addrspace(3) %"56", i64 0 + %"50" = load i64, ptr addrspace(3) %"33", align 4 + store i64 %"50", ptr addrspace(5) %"41", align 4 + %"51" = load i64, ptr addrspace(5) %"38", align 4 + %"52" = load i64, ptr addrspace(5) %"41", align 4 + %"57" = inttoptr i64 %"51" to ptr addrspace(1) + store i64 %"52", ptr addrspace(1) %"57", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" 
"uniform-work-group-size"="true" } \ No newline at end of file +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/shared_ptr_take_address.ll b/ptx/src/test/ll/shared_ptr_take_address.ll index ff817aa..b075ccb 100644 --- a/ptx/src/test/ll/shared_ptr_take_address.ll +++ b/ptx/src/test/ll/shared_ptr_take_address.ll @@ -1,52 +1,39 @@ @shared_mem = external addrspace(3) global [0 x i8], align 4 -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @shared_ptr_take_address(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #1 { - %"41" = alloca i64, align 8, addrspace(5) - %"42" = alloca i64, align 8, addrspace(5) - %"43" = alloca i64, align 8, addrspace(5) - %"44" = alloca i64, align 8, addrspace(5) - %"45" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @shared_ptr_take_address(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 { + %"35" = alloca i64, align 8, addrspace(5) + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 br label %"32" "32": ; preds = %1 - %"46" = load i64, ptr addrspace(4) %"39", align 4 - store i64 %"46", ptr addrspace(5) %"41", align 4 - %"47" = load i64, ptr addrspace(4) %"40", align 4 - store i64 %"47", ptr addrspace(5) %"42", align 4 - store i64 ptrtoint (ptr addrspace(3) @shared_mem to i64), ptr addrspace(5) %"43", align 4 - %"50" = load i64, ptr addrspace(5) %"41", align 4 
- %"58" = inttoptr i64 %"50" to ptr addrspace(1) - %"49" = load i64, ptr addrspace(1) %"58", align 4 - store i64 %"49", ptr addrspace(5) %"44", align 4 - %"51" = load i64, ptr addrspace(5) %"43", align 4 - %"52" = load i64, ptr addrspace(5) %"44", align 4 - %"59" = inttoptr i64 %"51" to ptr addrspace(3) - store i64 %"52", ptr addrspace(3) %"59", align 4 - %"54" = load i64, ptr addrspace(5) %"43", align 4 - %"60" = inttoptr i64 %"54" to ptr addrspace(3) - %"53" = load i64, ptr addrspace(3) %"60", align 4 - store i64 %"53", ptr addrspace(5) %"45", align 4 - %"55" = load i64, ptr addrspace(5) %"42", align 4 - %"56" = load i64, ptr addrspace(5) %"45", align 4 - %"61" = inttoptr i64 %"55" to ptr addrspace(1) - store i64 %"56", ptr addrspace(1) %"61", align 4 + %"40" = load i64, ptr addrspace(4) %"33", align 4 + store i64 %"40", ptr addrspace(5) %"35", align 4 + %"41" = load i64, ptr addrspace(4) %"34", align 4 + store i64 %"41", ptr addrspace(5) %"36", align 4 + store i64 ptrtoint (ptr addrspace(3) @shared_mem to i64), ptr addrspace(5) %"37", align 4 + %"44" = load i64, ptr addrspace(5) %"35", align 4 + %"52" = inttoptr i64 %"44" to ptr addrspace(1) + %"43" = load i64, ptr addrspace(1) %"52", align 4 + store i64 %"43", ptr addrspace(5) %"38", align 4 + %"45" = load i64, ptr addrspace(5) %"37", align 4 + %"46" = load i64, ptr addrspace(5) %"38", align 4 + %"53" = inttoptr i64 %"45" to ptr addrspace(3) + store i64 %"46", ptr addrspace(3) %"53", align 4 + %"48" = load i64, ptr addrspace(5) %"37", align 4 + %"54" = inttoptr i64 %"48" to ptr addrspace(3) + %"47" = load i64, ptr addrspace(3) %"54", align 4 + store i64 %"47", ptr addrspace(5) %"39", align 4 + %"49" = load i64, ptr addrspace(5) %"36", align 4 + %"50" = load i64, ptr addrspace(5) %"39", align 4 + %"55" = inttoptr i64 %"49" to ptr addrspace(1) + store i64 %"50", ptr addrspace(1) %"55", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" 
"denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/shared_unify_extern.ll b/ptx/src/test/ll/shared_unify_extern.ll index acf8cae..4020f92 100644 --- a/ptx/src/test/ll/shared_unify_extern.ll +++ b/ptx/src/test/ll/shared_unify_extern.ll @@ -1,42 +1,30 @@ @shared_ex = external addrspace(3) global [0 x i32] @shared_mod = external addrspace(3) global [4 x i32] -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - define i64 @add() #0 { - %"52" = alloca i64, align 8, addrspace(5) - %"53" = alloca i64, align 8, addrspace(5) - %"54" = alloca i64, align 8, addrspace(5) + %"46" = alloca i64, align 8, addrspace(5) + %"47" = alloca i64, align 8, addrspace(5) + %"48" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 br label %"41" "41": ; preds = %1 - %"55" = load i64, ptr addrspace(3) @shared_mod, align 4 - store i64 %"55", ptr addrspace(5) %"53", align 4 - %"56" = load i64, ptr addrspace(3) @shared_ex, align 4 - store i64 %"56", ptr addrspace(5) %"54", align 4 - %"58" = load i64, ptr addrspace(5) %"54", align 4 - %"59" = load i64, ptr addrspace(5) %"53", align 4 - %"81" = add i64 %"58", %"59" - store i64 %"81", ptr addrspace(5) %"52", align 4 - %2 = load i64, ptr addrspace(5) %"52", align 4 + %"49" = load i64, ptr 
addrspace(3) @shared_mod, align 4 + store i64 %"49", ptr addrspace(5) %"47", align 4 + %"50" = load i64, ptr addrspace(3) @shared_ex, align 4 + store i64 %"50", ptr addrspace(5) %"48", align 4 + %"52" = load i64, ptr addrspace(5) %"48", align 4 + %"53" = load i64, ptr addrspace(5) %"47", align 4 + %"75" = add i64 %"52", %"53" + store i64 %"75", ptr addrspace(5) %"46", align 4 + %2 = load i64, ptr addrspace(5) %"46", align 4 ret i64 %2 } define i64 @set_shared_temp1(i64 %"15") #0 { - %"60" = alloca i64, align 8, addrspace(5) + %"54" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 @@ -44,51 +32,51 @@ define i64 @set_shared_temp1(i64 %"15") #0 { "42": ; preds = %1 store i64 %"15", ptr addrspace(3) @shared_ex, align 4 - %"61" = call i64 @add() - store i64 %"61", ptr addrspace(5) %"60", align 4 + %"55" = call i64 @add() + store i64 %"55", ptr addrspace(5) %"54", align 4 br label %"43" "43": ; preds = %"42" - %2 = load i64, ptr addrspace(5) %"60", align 4 + %2 = load i64, ptr addrspace(5) %"54", align 4 ret i64 %2 } -define amdgpu_kernel void @shared_unify_extern(ptr addrspace(4) byref(i64) %"62", ptr addrspace(4) byref(i64) %"63") #1 { - %"64" = alloca i64, align 8, addrspace(5) - %"65" = alloca i64, align 8, addrspace(5) - %"66" = alloca i64, align 8, addrspace(5) - %"67" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @shared_unify_extern(ptr addrspace(4) byref(i64) %"56", ptr addrspace(4) byref(i64) %"57") #1 { + %"58" = alloca i64, align 8, addrspace(5) + %"59" = alloca i64, align 8, addrspace(5) + %"60" = alloca i64, align 8, addrspace(5) + %"61" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 br label %"44" "44": ; preds = %1 - %"68" = load i64, ptr addrspace(4) %"62", align 4 - store i64 %"68", ptr addrspace(5) %"64", align 4 - %"69" = load i64, ptr addrspace(4) %"63", align 4 - store i64 %"69", ptr addrspace(5) %"65", align 4 - %"71" = load i64, ptr addrspace(5) %"64", align 4 - %"84" = inttoptr i64 %"71" to ptr 
addrspace(1) - %"70" = load i64, ptr addrspace(1) %"84", align 4 - store i64 %"70", ptr addrspace(5) %"66", align 4 - %"72" = load i64, ptr addrspace(5) %"64", align 4 - %"85" = inttoptr i64 %"72" to ptr addrspace(1) - %"40" = getelementptr inbounds i8, ptr addrspace(1) %"85", i64 8 - %"73" = load i64, ptr addrspace(1) %"40", align 4 - store i64 %"73", ptr addrspace(5) %"67", align 4 - %"74" = load i64, ptr addrspace(5) %"67", align 4 - store i64 %"74", ptr addrspace(3) @shared_mod, align 4 - %"76" = load i64, ptr addrspace(5) %"66", align 4 - %"87" = call i64 @set_shared_temp1(i64 %"76") - store i64 %"87", ptr addrspace(5) %"67", align 4 + %"62" = load i64, ptr addrspace(4) %"56", align 4 + store i64 %"62", ptr addrspace(5) %"58", align 4 + %"63" = load i64, ptr addrspace(4) %"57", align 4 + store i64 %"63", ptr addrspace(5) %"59", align 4 + %"65" = load i64, ptr addrspace(5) %"58", align 4 + %"78" = inttoptr i64 %"65" to ptr addrspace(1) + %"64" = load i64, ptr addrspace(1) %"78", align 4 + store i64 %"64", ptr addrspace(5) %"60", align 4 + %"66" = load i64, ptr addrspace(5) %"58", align 4 + %"79" = inttoptr i64 %"66" to ptr addrspace(1) + %"40" = getelementptr inbounds i8, ptr addrspace(1) %"79", i64 8 + %"67" = load i64, ptr addrspace(1) %"40", align 4 + store i64 %"67", ptr addrspace(5) %"61", align 4 + %"68" = load i64, ptr addrspace(5) %"61", align 4 + store i64 %"68", ptr addrspace(3) @shared_mod, align 4 + %"70" = load i64, ptr addrspace(5) %"60", align 4 + %"81" = call i64 @set_shared_temp1(i64 %"70") + store i64 %"81", ptr addrspace(5) %"61", align 4 br label %"45" "45": ; preds = %"44" - %"77" = load i64, ptr addrspace(5) %"65", align 4 - %"78" = load i64, ptr addrspace(5) %"67", align 4 - %"89" = inttoptr i64 %"77" to ptr - store i64 %"78", ptr %"89", align 4 + %"71" = load i64, ptr addrspace(5) %"59", align 4 + %"72" = load i64, ptr addrspace(5) %"61", align 4 + %"83" = inttoptr i64 %"71" to ptr + store i64 %"72", ptr %"83", align 4 ret void } diff 
--git a/ptx/src/test/ll/shared_unify_local.ll b/ptx/src/test/ll/shared_unify_local.ll index ec373ee..ef4b605 100644 --- a/ptx/src/test/ll/shared_unify_local.ll +++ b/ptx/src/test/ll/shared_unify_local.ll @@ -1,21 +1,9 @@ @shared_ex = external addrspace(3) global [0 x i32] @shared_mod = external addrspace(3) global i64, align 4 -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - define i64 @add(i64 %"10") #0 { - %"53" = alloca i64, align 8, addrspace(5) - %"54" = alloca i64, align 8, addrspace(5) + %"47" = alloca i64, align 8, addrspace(5) + %"48" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 @@ -23,18 +11,18 @@ define i64 @add(i64 %"10") #0 { "42": ; preds = %1 store i64 %"10", ptr addrspace(3) @shared_mod, align 4 - %"55" = load i64, ptr addrspace(3) @shared_mod, align 4 - store i64 %"55", ptr addrspace(5) %"54", align 4 + %"49" = load i64, ptr addrspace(3) @shared_mod, align 4 + store i64 %"49", ptr addrspace(5) %"48", align 4 %"101" = load i64, ptr addrspace(3) @shared_ex, align 4 - %"57" = load i64, ptr addrspace(5) %"54", align 4 - %"78" = add i64 %"101", %"57" - store i64 %"78", ptr addrspace(5) %"53", align 4 - %2 = load i64, ptr addrspace(5) %"53", align 4 + %"51" = load i64, ptr addrspace(5) %"48", align 4 + %"72" = add i64 %"101", %"51" + store i64 %"72", ptr addrspace(5) %"47", align 4 + %2 = load i64, ptr addrspace(5) %"47", align 4 ret i64 %2 } define i64 @set_shared_temp1(i64 %"15", i64 %"16") #0 { - %"58" = alloca i64, align 8, addrspace(5) + %"52" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 @@ -42,50 +30,50 @@ define i64 @set_shared_temp1(i64 %"15", i64 %"16") #0 { "43": ; preds = %1 store i64 %"15", ptr addrspace(3) @shared_ex, align 4 - %"59" = call i64 
@add(i64 %"16") - store i64 %"59", ptr addrspace(5) %"58", align 4 + %"53" = call i64 @add(i64 %"16") + store i64 %"53", ptr addrspace(5) %"52", align 4 br label %"44" "44": ; preds = %"43" - %2 = load i64, ptr addrspace(5) %"58", align 4 + %2 = load i64, ptr addrspace(5) %"52", align 4 ret i64 %2 } -define amdgpu_kernel void @shared_unify_local(ptr addrspace(4) byref(i64) %"60", ptr addrspace(4) byref(i64) %"61") #1 { - %"62" = alloca i64, align 8, addrspace(5) - %"63" = alloca i64, align 8, addrspace(5) - %"64" = alloca i64, align 8, addrspace(5) - %"65" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @shared_unify_local(ptr addrspace(4) byref(i64) %"54", ptr addrspace(4) byref(i64) %"55") #1 { + %"56" = alloca i64, align 8, addrspace(5) + %"57" = alloca i64, align 8, addrspace(5) + %"58" = alloca i64, align 8, addrspace(5) + %"59" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 br label %"45" "45": ; preds = %1 - %"66" = load i64, ptr addrspace(4) %"60", align 4 - store i64 %"66", ptr addrspace(5) %"62", align 4 - %"67" = load i64, ptr addrspace(4) %"61", align 4 - store i64 %"67", ptr addrspace(5) %"63", align 4 - %"69" = load i64, ptr addrspace(5) %"62", align 4 - %"81" = inttoptr i64 %"69" to ptr addrspace(1) - %"68" = load i64, ptr addrspace(1) %"81", align 4 - store i64 %"68", ptr addrspace(5) %"64", align 4 - %"70" = load i64, ptr addrspace(5) %"62", align 4 - %"82" = inttoptr i64 %"70" to ptr addrspace(1) - %"41" = getelementptr inbounds i8, ptr addrspace(1) %"82", i64 8 - %"71" = load i64, ptr addrspace(1) %"41", align 4 - store i64 %"71", ptr addrspace(5) %"65", align 4 - %"73" = load i64, ptr addrspace(5) %"64", align 4 - %"74" = load i64, ptr addrspace(5) %"65", align 4 - %"83" = call i64 @set_shared_temp1(i64 %"73", i64 %"74") - store i64 %"83", ptr addrspace(5) %"65", align 4 + %"60" = load i64, ptr addrspace(4) %"54", align 4 + store i64 %"60", ptr addrspace(5) %"56", align 4 + %"61" = load i64, ptr addrspace(4) 
%"55", align 4 + store i64 %"61", ptr addrspace(5) %"57", align 4 + %"63" = load i64, ptr addrspace(5) %"56", align 4 + %"75" = inttoptr i64 %"63" to ptr addrspace(1) + %"62" = load i64, ptr addrspace(1) %"75", align 4 + store i64 %"62", ptr addrspace(5) %"58", align 4 + %"64" = load i64, ptr addrspace(5) %"56", align 4 + %"76" = inttoptr i64 %"64" to ptr addrspace(1) + %"41" = getelementptr inbounds i8, ptr addrspace(1) %"76", i64 8 + %"65" = load i64, ptr addrspace(1) %"41", align 4 + store i64 %"65", ptr addrspace(5) %"59", align 4 + %"67" = load i64, ptr addrspace(5) %"58", align 4 + %"68" = load i64, ptr addrspace(5) %"59", align 4 + %"77" = call i64 @set_shared_temp1(i64 %"67", i64 %"68") + store i64 %"77", ptr addrspace(5) %"59", align 4 br label %"46" "46": ; preds = %"45" - %"75" = load i64, ptr addrspace(5) %"63", align 4 - %"76" = load i64, ptr addrspace(5) %"65", align 4 - %"85" = inttoptr i64 %"75" to ptr - store i64 %"76", ptr %"85", align 4 + %"69" = load i64, ptr addrspace(5) %"57", align 4 + %"70" = load i64, ptr addrspace(5) %"59", align 4 + %"79" = inttoptr i64 %"69" to ptr + store i64 %"70", ptr %"79", align 4 ret void } diff --git a/ptx/src/test/ll/shared_variable.ll b/ptx/src/test/ll/shared_variable.ll index 9e8efbd..821ac7e 100644 --- a/ptx/src/test/ll/shared_variable.ll +++ b/ptx/src/test/ll/shared_variable.ll @@ -1,46 +1,33 @@ @shared_mem1 = external addrspace(3) global [128 x i8], align 4 -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @shared_variable(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #1 { - %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca i64, align 8, addrspace(5) - %"42" = alloca i64, align 8, addrspace(5) - 
%"43" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @shared_variable(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 { + %"34" = alloca i64, align 8, addrspace(5) + %"35" = alloca i64, align 8, addrspace(5) + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 br label %"31" "31": ; preds = %1 - %"44" = load i64, ptr addrspace(4) %"38", align 4 - store i64 %"44", ptr addrspace(5) %"40", align 4 - %"45" = load i64, ptr addrspace(4) %"39", align 4 - store i64 %"45", ptr addrspace(5) %"41", align 4 - %"47" = load i64, ptr addrspace(5) %"40", align 4 - %"52" = inttoptr i64 %"47" to ptr addrspace(1) - %"46" = load i64, ptr addrspace(1) %"52", align 4 - store i64 %"46", ptr addrspace(5) %"42", align 4 - %"48" = load i64, ptr addrspace(5) %"42", align 4 - store i64 %"48", ptr addrspace(3) @shared_mem1, align 4 - %"49" = load i64, ptr addrspace(3) @shared_mem1, align 4 - store i64 %"49", ptr addrspace(5) %"43", align 4 - %"50" = load i64, ptr addrspace(5) %"41", align 4 - %"51" = load i64, ptr addrspace(5) %"43", align 4 - %"55" = inttoptr i64 %"50" to ptr addrspace(1) - store i64 %"51", ptr addrspace(1) %"55", align 4 + %"38" = load i64, ptr addrspace(4) %"32", align 4 + store i64 %"38", ptr addrspace(5) %"34", align 4 + %"39" = load i64, ptr addrspace(4) %"33", align 4 + store i64 %"39", ptr addrspace(5) %"35", align 4 + %"41" = load i64, ptr addrspace(5) %"34", align 4 + %"46" = inttoptr i64 %"41" to ptr addrspace(1) + %"40" = load i64, ptr addrspace(1) %"46", align 4 + store i64 %"40", ptr addrspace(5) %"36", align 4 + %"42" = load i64, ptr addrspace(5) %"36", align 4 + store i64 %"42", ptr addrspace(3) @shared_mem1, align 4 + %"43" = load i64, ptr addrspace(3) @shared_mem1, align 4 + store i64 %"43", ptr addrspace(5) %"37", align 4 + %"44" = load i64, ptr addrspace(5) %"35", align 4 + %"45" = load i64, ptr addrspace(5) %"37", align 4 + %"49" = inttoptr i64 %"44" to 
ptr addrspace(1) + store i64 %"45", ptr addrspace(1) %"49", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/shl.ll b/ptx/src/test/ll/shl.ll index 66ae707..d1e8022 100644 --- a/ptx/src/test/ll/shl.ll +++ b/ptx/src/test/ll/shl.ll @@ -1,44 +1,31 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @shl(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #1 { - %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca i64, align 8, addrspace(5) - %"42" = alloca i64, align 8, addrspace(5) - %"43" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @shl(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 { + %"34" = alloca i64, align 8, addrspace(5) + %"35" = alloca i64, align 8, addrspace(5) + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 br label %"31" "31": ; preds = %1 - %"44" = load i64, ptr addrspace(4) %"38", align 4 - store i64 %"44", ptr addrspace(5) %"40", align 4 - %"45" = load i64, ptr addrspace(4) %"39", align 4 - store i64 %"45", ptr addrspace(5) %"41", align 4 - 
%"47" = load i64, ptr addrspace(5) %"40", align 4 - %"52" = inttoptr i64 %"47" to ptr - %"46" = load i64, ptr %"52", align 4 - store i64 %"46", ptr addrspace(5) %"42", align 4 - %"49" = load i64, ptr addrspace(5) %"42", align 4 - %2 = shl i64 %"49", 2 - %"53" = select i1 false, i64 0, i64 %2 - store i64 %"53", ptr addrspace(5) %"43", align 4 - %"50" = load i64, ptr addrspace(5) %"41", align 4 - %"51" = load i64, ptr addrspace(5) %"43", align 4 - %"55" = inttoptr i64 %"50" to ptr - store i64 %"51", ptr %"55", align 4 + %"38" = load i64, ptr addrspace(4) %"32", align 4 + store i64 %"38", ptr addrspace(5) %"34", align 4 + %"39" = load i64, ptr addrspace(4) %"33", align 4 + store i64 %"39", ptr addrspace(5) %"35", align 4 + %"41" = load i64, ptr addrspace(5) %"34", align 4 + %"46" = inttoptr i64 %"41" to ptr + %"40" = load i64, ptr %"46", align 4 + store i64 %"40", ptr addrspace(5) %"36", align 4 + %"43" = load i64, ptr addrspace(5) %"36", align 4 + %2 = shl i64 %"43", 2 + %"47" = select i1 false, i64 0, i64 %2 + store i64 %"47", ptr addrspace(5) %"37", align 4 + %"44" = load i64, ptr addrspace(5) %"35", align 4 + %"45" = load i64, ptr addrspace(5) %"37", align 4 + %"49" = inttoptr i64 %"44" to ptr + store i64 %"45", ptr %"49", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/shr.ll b/ptx/src/test/ll/shr.ll index c98db19..bbb8f9c 100644 --- a/ptx/src/test/ll/shr.ll +++ 
b/ptx/src/test/ll/shr.ll @@ -1,43 +1,30 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @shr(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #1 { - %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca i32, align 4, addrspace(5) +define amdgpu_kernel void @shr(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 { + %"33" = alloca i64, align 8, addrspace(5) + %"34" = alloca i64, align 8, addrspace(5) + %"35" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 br label %"30" "30": ; preds = %1 - %"42" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"42", ptr addrspace(5) %"39", align 4 - %"43" = load i64, ptr addrspace(4) %"38", align 4 - store i64 %"43", ptr addrspace(5) %"40", align 4 - %"45" = load i64, ptr addrspace(5) %"39", align 4 - %"50" = inttoptr i64 %"45" to ptr - %"44" = load i32, ptr %"50", align 4 - store i32 %"44", ptr addrspace(5) %"41", align 4 - %"47" = load i32, ptr addrspace(5) %"41", align 4 - %2 = ashr i32 %"47", 1 - %"46" = select i1 false, i32 0, i32 %2 - store i32 %"46", ptr addrspace(5) %"41", align 4 - %"48" = load i64, ptr addrspace(5) %"40", align 4 - %"49" = load i32, ptr addrspace(5) %"41", align 4 - %"51" = inttoptr i64 %"48" to ptr - store i32 %"49", ptr %"51", align 4 + %"36" = load i64, ptr addrspace(4) %"31", align 4 + store i64 %"36", ptr addrspace(5) %"33", align 4 + %"37" = load i64, ptr addrspace(4) %"32", align 4 + store i64 %"37", ptr addrspace(5) %"34", align 4 + %"39" = load i64, ptr addrspace(5) %"33", align 4 + %"44" = inttoptr i64 %"39" to ptr + %"38" = load i32, ptr %"44", align 4 + store i32 %"38", ptr addrspace(5) %"35", 
align 4 + %"41" = load i32, ptr addrspace(5) %"35", align 4 + %2 = ashr i32 %"41", 1 + %"40" = select i1 false, i32 0, i32 %2 + store i32 %"40", ptr addrspace(5) %"35", align 4 + %"42" = load i64, ptr addrspace(5) %"34", align 4 + %"43" = load i32, ptr addrspace(5) %"35", align 4 + %"45" = inttoptr i64 %"42" to ptr + store i32 %"43", ptr %"45", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/sign_extend.ll b/ptx/src/test/ll/sign_extend.ll index 5ef2627..1d8ed20 100644 --- a/ptx/src/test/ll/sign_extend.ll +++ b/ptx/src/test/ll/sign_extend.ll @@ -1,40 +1,27 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @sign_extend(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #1 { - %"38" = alloca i64, align 8, addrspace(5) - %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca i32, align 4, addrspace(5) +define amdgpu_kernel void @sign_extend(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 { + %"32" = alloca i64, align 8, addrspace(5) + %"33" = alloca i64, align 8, addrspace(5) + %"34" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = 
%0 br label %"29" "29": ; preds = %1 - %"41" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"41", ptr addrspace(5) %"38", align 4 - %"42" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"42", ptr addrspace(5) %"39", align 4 - %"44" = load i64, ptr addrspace(5) %"38", align 4 - %"48" = inttoptr i64 %"44" to ptr - %"47" = load i16, ptr %"48", align 2 - %"43" = sext i16 %"47" to i32 - store i32 %"43", ptr addrspace(5) %"40", align 4 - %"45" = load i64, ptr addrspace(5) %"39", align 4 - %"46" = load i32, ptr addrspace(5) %"40", align 4 - %"49" = inttoptr i64 %"45" to ptr - store i32 %"46", ptr %"49", align 4 + %"35" = load i64, ptr addrspace(4) %"30", align 4 + store i64 %"35", ptr addrspace(5) %"32", align 4 + %"36" = load i64, ptr addrspace(4) %"31", align 4 + store i64 %"36", ptr addrspace(5) %"33", align 4 + %"38" = load i64, ptr addrspace(5) %"32", align 4 + %"42" = inttoptr i64 %"38" to ptr + %"41" = load i16, ptr %"42", align 2 + %"37" = sext i16 %"41" to i32 + store i32 %"37", ptr addrspace(5) %"34", align 4 + %"39" = load i64, ptr addrspace(5) %"33", align 4 + %"40" = load i32, ptr addrspace(5) %"34", align 4 + %"43" = inttoptr i64 %"39" to ptr + store i32 %"40", ptr %"43", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/sin.ll b/ptx/src/test/ll/sin.ll index 520bc97..922256b 100644 --- a/ptx/src/test/ll/sin.ll +++ b/ptx/src/test/ll/sin.ll @@ 
-1,46 +1,33 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @sin(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #1 { - %"38" = alloca i64, align 8, addrspace(5) - %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca float, align 4, addrspace(5) +define amdgpu_kernel void @sin(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 { + %"32" = alloca i64, align 8, addrspace(5) + %"33" = alloca i64, align 8, addrspace(5) + %"34" = alloca float, align 4, addrspace(5) br label %1 1: ; preds = %0 br label %"29" "29": ; preds = %1 - %"41" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"41", ptr addrspace(5) %"38", align 4 - %"42" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"42", ptr addrspace(5) %"39", align 4 - %"44" = load i64, ptr addrspace(5) %"38", align 4 - %"49" = inttoptr i64 %"44" to ptr - %"43" = load float, ptr %"49", align 4 - store float %"43", ptr addrspace(5) %"40", align 4 - %"46" = load float, ptr addrspace(5) %"40", align 4 - %"45" = call afn float @llvm.sin.f32(float %"46") - store float %"45", ptr addrspace(5) %"40", align 4 - %"47" = load i64, ptr addrspace(5) %"39", align 4 - %"48" = load float, ptr addrspace(5) %"40", align 4 - %"50" = inttoptr i64 %"47" to ptr - store float %"48", ptr %"50", align 4 + %"35" = load i64, ptr addrspace(4) %"30", align 4 + store i64 %"35", ptr addrspace(5) %"32", align 4 + %"36" = load i64, ptr addrspace(4) %"31", align 4 + store i64 %"36", ptr addrspace(5) %"33", align 4 + %"38" = load i64, ptr addrspace(5) %"32", align 4 + %"43" = inttoptr i64 %"38" to ptr + %"37" = load float, ptr %"43", align 4 + store float %"37", ptr addrspace(5) %"34", align 4 + %"40" = load 
float, ptr addrspace(5) %"34", align 4 + %"39" = call afn float @llvm.sin.f32(float %"40") + store float %"39", ptr addrspace(5) %"34", align 4 + %"41" = load i64, ptr addrspace(5) %"33", align 4 + %"42" = load float, ptr addrspace(5) %"34", align 4 + %"44" = inttoptr i64 %"41" to ptr + store float %"42", ptr %"44", align 4 ret void } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare float @llvm.sin.f32(float) #2 +declare float @llvm.sin.f32(float) #1 -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/sqrt.ll b/ptx/src/test/ll/sqrt.ll index db929f4..2497375 100644 --- a/ptx/src/test/ll/sqrt.ll +++ b/ptx/src/test/ll/sqrt.ll @@ -1,46 +1,33 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @sqrt(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #1 { - %"38" = alloca i64, align 8, addrspace(5) - %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca float, align 4, addrspace(5) 
+define amdgpu_kernel void @sqrt(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 { + %"32" = alloca i64, align 8, addrspace(5) + %"33" = alloca i64, align 8, addrspace(5) + %"34" = alloca float, align 4, addrspace(5) br label %1 1: ; preds = %0 br label %"29" "29": ; preds = %1 - %"41" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"41", ptr addrspace(5) %"38", align 4 - %"42" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"42", ptr addrspace(5) %"39", align 4 - %"44" = load i64, ptr addrspace(5) %"38", align 4 - %"49" = inttoptr i64 %"44" to ptr - %"43" = load float, ptr %"49", align 4 - store float %"43", ptr addrspace(5) %"40", align 4 - %"46" = load float, ptr addrspace(5) %"40", align 4 - %"45" = call float @llvm.amdgcn.sqrt.f32(float %"46") - store float %"45", ptr addrspace(5) %"40", align 4 - %"47" = load i64, ptr addrspace(5) %"39", align 4 - %"48" = load float, ptr addrspace(5) %"40", align 4 - %"50" = inttoptr i64 %"47" to ptr - store float %"48", ptr %"50", align 4 + %"35" = load i64, ptr addrspace(4) %"30", align 4 + store i64 %"35", ptr addrspace(5) %"32", align 4 + %"36" = load i64, ptr addrspace(4) %"31", align 4 + store i64 %"36", ptr addrspace(5) %"33", align 4 + %"38" = load i64, ptr addrspace(5) %"32", align 4 + %"43" = inttoptr i64 %"38" to ptr + %"37" = load float, ptr %"43", align 4 + store float %"37", ptr addrspace(5) %"34", align 4 + %"40" = load float, ptr addrspace(5) %"34", align 4 + %"39" = call float @llvm.amdgcn.sqrt.f32(float %"40") + store float %"39", ptr addrspace(5) %"34", align 4 + %"41" = load i64, ptr addrspace(5) %"33", align 4 + %"42" = load float, ptr addrspace(5) %"34", align 4 + %"44" = inttoptr i64 %"41" to ptr + store float %"42", ptr %"44", align 4 ret void } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare float @llvm.amdgcn.sqrt.f32(float) #2 +declare float @llvm.amdgcn.sqrt.f32(float) #1 -attributes #0 = { 
"amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/stateful_ld_st_ntid.ll b/ptx/src/test/ll/stateful_ld_st_ntid.ll index 0e44e53..c100da6 100644 --- a/ptx/src/test/ll/stateful_ld_st_ntid.ll +++ b/ptx/src/test/ll/stateful_ld_st_ntid.ll @@ -1,63 +1,53 @@ declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @stateful_ld_st_ntid(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #1 { - %"43" = alloca i64, align 8, addrspace(5) - %"44" = alloca i64, align 8, addrspace(5) - %"45" = alloca i32, align 4, addrspace(5) - %"46" = alloca i64, align 8, addrspace(5) - %"47" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @stateful_ld_st_ntid(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #1 { + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i32, align 4, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) + %"42" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = 
%0 br label %"33" "33": ; preds = %1 - %"67" = load i64, ptr addrspace(4) %"41", align 4 - store i64 %"67", ptr addrspace(5) %"43", align 4 - %"68" = load i64, ptr addrspace(4) %"42", align 4 - store i64 %"68", ptr addrspace(5) %"44", align 4 - %"51" = load i64, ptr addrspace(5) %"43", align 4 - %2 = inttoptr i64 %"51" to ptr - %"50" = addrspacecast ptr %2 to ptr addrspace(1) - store ptr addrspace(1) %"50", ptr addrspace(5) %"43", align 8 - %"53" = load i64, ptr addrspace(5) %"44", align 4 - %3 = inttoptr i64 %"53" to ptr - %"52" = addrspacecast ptr %3 to ptr addrspace(1) - store ptr addrspace(1) %"52", ptr addrspace(5) %"44", align 8 + %"62" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"62", ptr addrspace(5) %"38", align 4 + %"63" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"63", ptr addrspace(5) %"39", align 4 + %"46" = load i64, ptr addrspace(5) %"38", align 4 + %2 = inttoptr i64 %"46" to ptr + %"45" = addrspacecast ptr %2 to ptr addrspace(1) + store ptr addrspace(1) %"45", ptr addrspace(5) %"38", align 8 + %"48" = load i64, ptr addrspace(5) %"39", align 4 + %3 = inttoptr i64 %"48" to ptr + %"47" = addrspacecast ptr %3 to ptr addrspace(1) + store ptr addrspace(1) %"47", ptr addrspace(5) %"39", align 8 %"32" = call i32 @__zluda_ptx_impl_sreg_tid(i8 0) br label %"34" "34": ; preds = %"33" - store i32 %"32", ptr addrspace(5) %"45", align 4 - %"56" = load i32, ptr addrspace(5) %"45", align 4 - %"55" = zext i32 %"56" to i64 - store i64 %"55", ptr addrspace(5) %"46", align 4 - %"58" = load i64, ptr addrspace(5) %"43", align 4 - %"59" = load i64, ptr addrspace(5) %"46", align 4 - %"69" = add i64 %"58", %"59" - store i64 %"69", ptr addrspace(5) %"43", align 4 - %"61" = load i64, ptr addrspace(5) %"44", align 4 - %"62" = load i64, ptr addrspace(5) %"46", align 4 - %"71" = add i64 %"61", %"62" - store i64 %"71", ptr addrspace(5) %"44", align 4 - %"64" = load i64, ptr addrspace(5) %"43", align 4 - %"73" = inttoptr i64 %"64" to ptr addrspace(1) - 
%"63" = load i64, ptr addrspace(1) %"73", align 4 - store i64 %"63", ptr addrspace(5) %"47", align 4 - %"65" = load i64, ptr addrspace(5) %"44", align 4 - %"66" = load i64, ptr addrspace(5) %"47", align 4 - %"74" = inttoptr i64 %"65" to ptr addrspace(1) - store i64 %"66", ptr addrspace(1) %"74", align 4 + store i32 %"32", ptr addrspace(5) %"40", align 4 + %"51" = load i32, ptr addrspace(5) %"40", align 4 + %"50" = zext i32 %"51" to i64 + store i64 %"50", ptr addrspace(5) %"41", align 4 + %"53" = load i64, ptr addrspace(5) %"38", align 4 + %"54" = load i64, ptr addrspace(5) %"41", align 4 + %"64" = add i64 %"53", %"54" + store i64 %"64", ptr addrspace(5) %"38", align 4 + %"56" = load i64, ptr addrspace(5) %"39", align 4 + %"57" = load i64, ptr addrspace(5) %"41", align 4 + %"66" = add i64 %"56", %"57" + store i64 %"66", ptr addrspace(5) %"39", align 4 + %"59" = load i64, ptr addrspace(5) %"38", align 4 + %"68" = inttoptr i64 %"59" to ptr addrspace(1) + %"58" = load i64, ptr addrspace(1) %"68", align 4 + store i64 %"58", ptr addrspace(5) %"42", align 4 + %"60" = load i64, ptr addrspace(5) %"39", align 4 + %"61" = load i64, ptr addrspace(5) %"42", align 4 + %"69" = inttoptr i64 %"60" to ptr addrspace(1) + store i64 %"61", ptr addrspace(1) %"69", align 4 ret void } diff --git a/ptx/src/test/ll/stateful_ld_st_ntid_chain.ll b/ptx/src/test/ll/stateful_ld_st_ntid_chain.ll index 700a828..c1a59c6 100644 --- a/ptx/src/test/ll/stateful_ld_st_ntid_chain.ll +++ b/ptx/src/test/ll/stateful_ld_st_ntid_chain.ll @@ -1,67 +1,57 @@ declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @stateful_ld_st_ntid_chain(ptr addrspace(4) byref(i64) %"45", ptr addrspace(4) byref(i64) %"46") #1 { +define amdgpu_kernel void 
@stateful_ld_st_ntid_chain(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #1 { + %"42" = alloca i64, align 8, addrspace(5) + %"43" = alloca i64, align 8, addrspace(5) + %"44" = alloca i64, align 8, addrspace(5) + %"45" = alloca i64, align 8, addrspace(5) + %"46" = alloca i64, align 8, addrspace(5) %"47" = alloca i64, align 8, addrspace(5) - %"48" = alloca i64, align 8, addrspace(5) + %"48" = alloca i32, align 4, addrspace(5) %"49" = alloca i64, align 8, addrspace(5) %"50" = alloca i64, align 8, addrspace(5) - %"51" = alloca i64, align 8, addrspace(5) - %"52" = alloca i64, align 8, addrspace(5) - %"53" = alloca i32, align 4, addrspace(5) - %"54" = alloca i64, align 8, addrspace(5) - %"55" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 br label %"37" "37": ; preds = %1 - %"75" = load i64, ptr addrspace(4) %"45", align 4 - store i64 %"75", ptr addrspace(5) %"47", align 4 - %"76" = load i64, ptr addrspace(4) %"46", align 4 - store i64 %"76", ptr addrspace(5) %"50", align 4 - %"59" = load i64, ptr addrspace(5) %"47", align 4 - %2 = inttoptr i64 %"59" to ptr - %"58" = addrspacecast ptr %2 to ptr addrspace(1) - store ptr addrspace(1) %"58", ptr addrspace(5) %"48", align 8 - %"61" = load i64, ptr addrspace(5) %"50", align 4 - %3 = inttoptr i64 %"61" to ptr - %"60" = addrspacecast ptr %3 to ptr addrspace(1) - store ptr addrspace(1) %"60", ptr addrspace(5) %"51", align 8 + %"70" = load i64, ptr addrspace(4) %"40", align 4 + store i64 %"70", ptr addrspace(5) %"42", align 4 + %"71" = load i64, ptr addrspace(4) %"41", align 4 + store i64 %"71", ptr addrspace(5) %"45", align 4 + %"54" = load i64, ptr addrspace(5) %"42", align 4 + %2 = inttoptr i64 %"54" to ptr + %"53" = addrspacecast ptr %2 to ptr addrspace(1) + store ptr addrspace(1) %"53", ptr addrspace(5) %"43", align 8 + %"56" = load i64, ptr addrspace(5) %"45", align 4 + %3 = inttoptr i64 %"56" to ptr + %"55" = addrspacecast ptr %3 to ptr addrspace(1) + store ptr addrspace(1) %"55", 
ptr addrspace(5) %"46", align 8 %"36" = call i32 @__zluda_ptx_impl_sreg_tid(i8 0) br label %"38" "38": ; preds = %"37" - store i32 %"36", ptr addrspace(5) %"53", align 4 - %"64" = load i32, ptr addrspace(5) %"53", align 4 - %"63" = zext i32 %"64" to i64 - store i64 %"63", ptr addrspace(5) %"54", align 4 - %"66" = load i64, ptr addrspace(5) %"48", align 4 - %"67" = load i64, ptr addrspace(5) %"54", align 4 - %"77" = add i64 %"66", %"67" - store i64 %"77", ptr addrspace(5) %"49", align 4 - %"69" = load i64, ptr addrspace(5) %"51", align 4 - %"70" = load i64, ptr addrspace(5) %"54", align 4 - %"79" = add i64 %"69", %"70" - store i64 %"79", ptr addrspace(5) %"52", align 4 - %"72" = load i64, ptr addrspace(5) %"49", align 4 - %"81" = inttoptr i64 %"72" to ptr addrspace(1) - %"71" = load i64, ptr addrspace(1) %"81", align 4 - store i64 %"71", ptr addrspace(5) %"55", align 4 - %"73" = load i64, ptr addrspace(5) %"52", align 4 - %"74" = load i64, ptr addrspace(5) %"55", align 4 - %"82" = inttoptr i64 %"73" to ptr addrspace(1) - store i64 %"74", ptr addrspace(1) %"82", align 4 + store i32 %"36", ptr addrspace(5) %"48", align 4 + %"59" = load i32, ptr addrspace(5) %"48", align 4 + %"58" = zext i32 %"59" to i64 + store i64 %"58", ptr addrspace(5) %"49", align 4 + %"61" = load i64, ptr addrspace(5) %"43", align 4 + %"62" = load i64, ptr addrspace(5) %"49", align 4 + %"72" = add i64 %"61", %"62" + store i64 %"72", ptr addrspace(5) %"44", align 4 + %"64" = load i64, ptr addrspace(5) %"46", align 4 + %"65" = load i64, ptr addrspace(5) %"49", align 4 + %"74" = add i64 %"64", %"65" + store i64 %"74", ptr addrspace(5) %"47", align 4 + %"67" = load i64, ptr addrspace(5) %"44", align 4 + %"76" = inttoptr i64 %"67" to ptr addrspace(1) + %"66" = load i64, ptr addrspace(1) %"76", align 4 + store i64 %"66", ptr addrspace(5) %"50", align 4 + %"68" = load i64, ptr addrspace(5) %"47", align 4 + %"69" = load i64, ptr addrspace(5) %"50", align 4 + %"77" = inttoptr i64 %"68" to ptr addrspace(1) 
+ store i64 %"69", ptr addrspace(1) %"77", align 4 ret void } diff --git a/ptx/src/test/ll/stateful_ld_st_ntid_sub.ll b/ptx/src/test/ll/stateful_ld_st_ntid_sub.ll index 1a88793..dd54c84 100644 --- a/ptx/src/test/ll/stateful_ld_st_ntid_sub.ll +++ b/ptx/src/test/ll/stateful_ld_st_ntid_sub.ll @@ -1,69 +1,59 @@ declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @stateful_ld_st_ntid_sub(ptr addrspace(4) byref(i64) %"49", ptr addrspace(4) byref(i64) %"50") #1 { +define amdgpu_kernel void @stateful_ld_st_ntid_sub(ptr addrspace(4) byref(i64) %"44", ptr addrspace(4) byref(i64) %"45") #1 { + %"46" = alloca i64, align 8, addrspace(5) + %"47" = alloca i64, align 8, addrspace(5) + %"48" = alloca i64, align 8, addrspace(5) + %"49" = alloca i64, align 8, addrspace(5) + %"50" = alloca i64, align 8, addrspace(5) %"51" = alloca i64, align 8, addrspace(5) - %"52" = alloca i64, align 8, addrspace(5) + %"52" = alloca i32, align 4, addrspace(5) %"53" = alloca i64, align 8, addrspace(5) %"54" = alloca i64, align 8, addrspace(5) - %"55" = alloca i64, align 8, addrspace(5) - %"56" = alloca i64, align 8, addrspace(5) - %"57" = alloca i32, align 4, addrspace(5) - %"58" = alloca i64, align 8, addrspace(5) - %"59" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 br label %"41" "41": ; preds = %1 - %"79" = load i64, ptr addrspace(4) %"49", align 4 - store i64 %"79", ptr addrspace(5) %"51", align 4 - %"80" = load i64, ptr addrspace(4) %"50", align 4 - store i64 %"80", ptr addrspace(5) %"54", align 4 - %"63" = load i64, ptr addrspace(5) %"51", align 4 - %2 = inttoptr i64 %"63" to ptr - %"62" = addrspacecast ptr %2 to ptr addrspace(1) - store ptr addrspace(1) %"62", ptr addrspace(5) %"52", align 8 - %"65" = 
load i64, ptr addrspace(5) %"54", align 4 - %3 = inttoptr i64 %"65" to ptr - %"64" = addrspacecast ptr %3 to ptr addrspace(1) - store ptr addrspace(1) %"64", ptr addrspace(5) %"55", align 8 + %"74" = load i64, ptr addrspace(4) %"44", align 4 + store i64 %"74", ptr addrspace(5) %"46", align 4 + %"75" = load i64, ptr addrspace(4) %"45", align 4 + store i64 %"75", ptr addrspace(5) %"49", align 4 + %"58" = load i64, ptr addrspace(5) %"46", align 4 + %2 = inttoptr i64 %"58" to ptr + %"57" = addrspacecast ptr %2 to ptr addrspace(1) + store ptr addrspace(1) %"57", ptr addrspace(5) %"47", align 8 + %"60" = load i64, ptr addrspace(5) %"49", align 4 + %3 = inttoptr i64 %"60" to ptr + %"59" = addrspacecast ptr %3 to ptr addrspace(1) + store ptr addrspace(1) %"59", ptr addrspace(5) %"50", align 8 %"36" = call i32 @__zluda_ptx_impl_sreg_tid(i8 0) br label %"42" "42": ; preds = %"41" - store i32 %"36", ptr addrspace(5) %"57", align 4 - %"68" = load i32, ptr addrspace(5) %"57", align 4 - %"67" = zext i32 %"68" to i64 - store i64 %"67", ptr addrspace(5) %"58", align 4 - %"70" = load i64, ptr addrspace(5) %"52", align 4 - %"71" = load i64, ptr addrspace(5) %"58", align 4 - %"81" = sub i64 %"70", %"71" - store i64 %"81", ptr addrspace(5) %"53", align 4 - %"73" = load i64, ptr addrspace(5) %"55", align 4 - %"74" = load i64, ptr addrspace(5) %"58", align 4 - %"84" = sub i64 %"73", %"74" - store i64 %"84", ptr addrspace(5) %"56", align 4 - %"75" = load i64, ptr addrspace(5) %"53", align 4 - %"87" = inttoptr i64 %"75" to ptr addrspace(1) - %"38" = getelementptr inbounds i8, ptr addrspace(1) %"87", i64 0 - %"76" = load i64, ptr addrspace(1) %"38", align 4 - store i64 %"76", ptr addrspace(5) %"59", align 4 - %"77" = load i64, ptr addrspace(5) %"56", align 4 - %"88" = inttoptr i64 %"77" to ptr addrspace(1) - %"40" = getelementptr inbounds i8, ptr addrspace(1) %"88", i64 0 - %"78" = load i64, ptr addrspace(5) %"59", align 4 - store i64 %"78", ptr addrspace(1) %"40", align 4 + store i32 
%"36", ptr addrspace(5) %"52", align 4 + %"63" = load i32, ptr addrspace(5) %"52", align 4 + %"62" = zext i32 %"63" to i64 + store i64 %"62", ptr addrspace(5) %"53", align 4 + %"65" = load i64, ptr addrspace(5) %"47", align 4 + %"66" = load i64, ptr addrspace(5) %"53", align 4 + %"76" = sub i64 %"65", %"66" + store i64 %"76", ptr addrspace(5) %"48", align 4 + %"68" = load i64, ptr addrspace(5) %"50", align 4 + %"69" = load i64, ptr addrspace(5) %"53", align 4 + %"79" = sub i64 %"68", %"69" + store i64 %"79", ptr addrspace(5) %"51", align 4 + %"70" = load i64, ptr addrspace(5) %"48", align 4 + %"82" = inttoptr i64 %"70" to ptr addrspace(1) + %"38" = getelementptr inbounds i8, ptr addrspace(1) %"82", i64 0 + %"71" = load i64, ptr addrspace(1) %"38", align 4 + store i64 %"71", ptr addrspace(5) %"54", align 4 + %"72" = load i64, ptr addrspace(5) %"51", align 4 + %"83" = inttoptr i64 %"72" to ptr addrspace(1) + %"40" = getelementptr inbounds i8, ptr addrspace(1) %"83", i64 0 + %"73" = load i64, ptr addrspace(5) %"54", align 4 + store i64 %"73", ptr addrspace(1) %"40", align 4 ret void } diff --git a/ptx/src/test/ll/stateful_ld_st_simple.ll b/ptx/src/test/ll/stateful_ld_st_simple.ll index 0669a23..f945ee2 100644 --- a/ptx/src/test/ll/stateful_ld_st_simple.ll +++ b/ptx/src/test/ll/stateful_ld_st_simple.ll @@ -1,49 +1,36 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @stateful_ld_st_simple(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #1 { - %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca i64, align 8, addrspace(5) - %"42" = alloca i64, align 8, addrspace(5) - %"43" = alloca i64, align 8, addrspace(5) - %"44" = alloca i64, align 8, addrspace(5) +define 
amdgpu_kernel void @stateful_ld_st_simple(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 { + %"34" = alloca i64, align 8, addrspace(5) + %"35" = alloca i64, align 8, addrspace(5) + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 br label %"31" "31": ; preds = %1 - %"45" = load i64, ptr addrspace(4) %"38", align 4 - store i64 %"45", ptr addrspace(5) %"40", align 4 - %"46" = load i64, ptr addrspace(4) %"39", align 4 - store i64 %"46", ptr addrspace(5) %"41", align 4 - %"48" = load i64, ptr addrspace(5) %"40", align 4 - %2 = inttoptr i64 %"48" to ptr - %"55" = addrspacecast ptr %2 to ptr addrspace(1) - store ptr addrspace(1) %"55", ptr addrspace(5) %"42", align 8 - %"50" = load i64, ptr addrspace(5) %"41", align 4 - %3 = inttoptr i64 %"50" to ptr - %"57" = addrspacecast ptr %3 to ptr addrspace(1) - store ptr addrspace(1) %"57", ptr addrspace(5) %"43", align 8 - %"52" = load i64, ptr addrspace(5) %"42", align 4 - %"59" = inttoptr i64 %"52" to ptr addrspace(1) - %"51" = load i64, ptr addrspace(1) %"59", align 4 - store i64 %"51", ptr addrspace(5) %"44", align 4 - %"53" = load i64, ptr addrspace(5) %"43", align 4 - %"54" = load i64, ptr addrspace(5) %"44", align 4 - %"60" = inttoptr i64 %"53" to ptr addrspace(1) - store i64 %"54", ptr addrspace(1) %"60", align 4 + %"39" = load i64, ptr addrspace(4) %"32", align 4 + store i64 %"39", ptr addrspace(5) %"34", align 4 + %"40" = load i64, ptr addrspace(4) %"33", align 4 + store i64 %"40", ptr addrspace(5) %"35", align 4 + %"42" = load i64, ptr addrspace(5) %"34", align 4 + %2 = inttoptr i64 %"42" to ptr + %"49" = addrspacecast ptr %2 to ptr addrspace(1) + store ptr addrspace(1) %"49", ptr addrspace(5) %"36", align 8 + %"44" = load i64, ptr addrspace(5) %"35", align 4 + %3 = inttoptr i64 %"44" to ptr + %"51" = addrspacecast ptr %3 to ptr addrspace(1) + store ptr addrspace(1) %"51", ptr 
addrspace(5) %"37", align 8 + %"46" = load i64, ptr addrspace(5) %"36", align 4 + %"53" = inttoptr i64 %"46" to ptr addrspace(1) + %"45" = load i64, ptr addrspace(1) %"53", align 4 + store i64 %"45", ptr addrspace(5) %"38", align 4 + %"47" = load i64, ptr addrspace(5) %"37", align 4 + %"48" = load i64, ptr addrspace(5) %"38", align 4 + %"54" = inttoptr i64 %"47" to ptr addrspace(1) + store i64 %"48", ptr addrspace(1) %"54", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/stateful_neg_offset.ll b/ptx/src/test/ll/stateful_neg_offset.ll index 616e2f1..d51943d 100644 --- a/ptx/src/test/ll/stateful_neg_offset.ll +++ b/ptx/src/test/ll/stateful_neg_offset.ll @@ -1,58 +1,45 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @stateful_neg_offset(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #1 { - %"41" = alloca i64, align 8, addrspace(5) - %"42" = alloca i64, align 8, addrspace(5) - %"43" = alloca i64, align 8, addrspace(5) - %"44" = alloca i64, align 8, addrspace(5) - %"45" = alloca i64, align 8, addrspace(5) - %"46" = alloca i64, align 8, addrspace(5) +define 
amdgpu_kernel void @stateful_neg_offset(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 { + %"35" = alloca i64, align 8, addrspace(5) + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 br label %"32" "32": ; preds = %1 - %"47" = load i64, ptr addrspace(4) %"39", align 4 - store i64 %"47", ptr addrspace(5) %"41", align 4 - %"48" = load i64, ptr addrspace(4) %"40", align 4 - store i64 %"48", ptr addrspace(5) %"42", align 4 - %"50" = load i64, ptr addrspace(5) %"41", align 4 - %2 = inttoptr i64 %"50" to ptr - %"63" = addrspacecast ptr %2 to ptr addrspace(1) - store ptr addrspace(1) %"63", ptr addrspace(5) %"43", align 8 - %"52" = load i64, ptr addrspace(5) %"42", align 4 - %3 = inttoptr i64 %"52" to ptr - %"65" = addrspacecast ptr %3 to ptr addrspace(1) - store ptr addrspace(1) %"65", ptr addrspace(5) %"44", align 8 - %"54" = load i64, ptr addrspace(5) %"43", align 4 - %"55" = load i64, ptr addrspace(5) %"44", align 4 - %"53" = add i64 %"54", %"55" - store i64 %"53", ptr addrspace(5) %"45", align 4 - %"57" = load i64, ptr addrspace(5) %"43", align 4 - %"58" = load i64, ptr addrspace(5) %"44", align 4 - %"56" = sub i64 %"57", %"58" - store i64 %"56", ptr addrspace(5) %"45", align 4 - %"60" = load i64, ptr addrspace(5) %"43", align 4 - %"67" = inttoptr i64 %"60" to ptr addrspace(1) - %"59" = load i64, ptr addrspace(1) %"67", align 4 - store i64 %"59", ptr addrspace(5) %"46", align 4 - %"61" = load i64, ptr addrspace(5) %"44", align 4 - %"62" = load i64, ptr addrspace(5) %"46", align 4 - %"68" = inttoptr i64 %"61" to ptr addrspace(1) - store i64 %"62", ptr addrspace(1) %"68", align 4 + %"41" = load i64, ptr addrspace(4) %"33", align 4 + store i64 %"41", ptr addrspace(5) %"35", align 4 + %"42" = load i64, ptr addrspace(4) %"34", align 4 + store i64 
%"42", ptr addrspace(5) %"36", align 4 + %"44" = load i64, ptr addrspace(5) %"35", align 4 + %2 = inttoptr i64 %"44" to ptr + %"57" = addrspacecast ptr %2 to ptr addrspace(1) + store ptr addrspace(1) %"57", ptr addrspace(5) %"37", align 8 + %"46" = load i64, ptr addrspace(5) %"36", align 4 + %3 = inttoptr i64 %"46" to ptr + %"59" = addrspacecast ptr %3 to ptr addrspace(1) + store ptr addrspace(1) %"59", ptr addrspace(5) %"38", align 8 + %"48" = load i64, ptr addrspace(5) %"37", align 4 + %"49" = load i64, ptr addrspace(5) %"38", align 4 + %"47" = add i64 %"48", %"49" + store i64 %"47", ptr addrspace(5) %"39", align 4 + %"51" = load i64, ptr addrspace(5) %"37", align 4 + %"52" = load i64, ptr addrspace(5) %"38", align 4 + %"50" = sub i64 %"51", %"52" + store i64 %"50", ptr addrspace(5) %"39", align 4 + %"54" = load i64, ptr addrspace(5) %"37", align 4 + %"61" = inttoptr i64 %"54" to ptr addrspace(1) + %"53" = load i64, ptr addrspace(1) %"61", align 4 + store i64 %"53", ptr addrspace(5) %"40", align 4 + %"55" = load i64, ptr addrspace(5) %"38", align 4 + %"56" = load i64, ptr addrspace(5) %"40", align 4 + %"62" = inttoptr i64 %"55" to ptr addrspace(1) + store i64 %"56", ptr addrspace(1) %"62", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/sub.ll b/ptx/src/test/ll/sub.ll index 2e9eb9b..eafd223 100644 --- a/ptx/src/test/ll/sub.ll +++ b/ptx/src/test/ll/sub.ll @@ -1,43 
+1,30 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @sub(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #1 { - %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca i64, align 8, addrspace(5) - %"42" = alloca i64, align 8, addrspace(5) - %"43" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @sub(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 { + %"34" = alloca i64, align 8, addrspace(5) + %"35" = alloca i64, align 8, addrspace(5) + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 br label %"31" "31": ; preds = %1 - %"44" = load i64, ptr addrspace(4) %"38", align 4 - store i64 %"44", ptr addrspace(5) %"40", align 4 - %"45" = load i64, ptr addrspace(4) %"39", align 4 - store i64 %"45", ptr addrspace(5) %"41", align 4 - %"47" = load i64, ptr addrspace(5) %"40", align 4 - %"52" = inttoptr i64 %"47" to ptr - %"46" = load i64, ptr %"52", align 4 - store i64 %"46", ptr addrspace(5) %"42", align 4 - %"49" = load i64, ptr addrspace(5) %"42", align 4 - %"48" = sub i64 %"49", 1 - store i64 %"48", ptr addrspace(5) %"43", align 4 - %"50" = load i64, ptr addrspace(5) %"41", align 4 - %"51" = load i64, ptr addrspace(5) %"43", align 4 - %"53" = inttoptr i64 %"50" to ptr - store i64 %"51", ptr %"53", align 4 + %"38" = load i64, ptr addrspace(4) %"32", align 4 + store i64 %"38", ptr addrspace(5) %"34", align 4 + %"39" = load i64, ptr addrspace(4) %"33", align 4 + store i64 %"39", ptr addrspace(5) %"35", align 4 + %"41" = load i64, ptr addrspace(5) %"34", align 4 + %"46" = inttoptr i64 %"41" to ptr + %"40" = load i64, ptr %"46", align 4 + store i64 %"40", ptr 
addrspace(5) %"36", align 4 + %"43" = load i64, ptr addrspace(5) %"36", align 4 + %"42" = sub i64 %"43", 1 + store i64 %"42", ptr addrspace(5) %"37", align 4 + %"44" = load i64, ptr addrspace(5) %"35", align 4 + %"45" = load i64, ptr addrspace(5) %"37", align 4 + %"47" = inttoptr i64 %"44" to ptr + store i64 %"45", ptr %"47", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/vector.ll b/ptx/src/test/ll/vector.ll index e2c680f..95cb569 100644 --- a/ptx/src/test/ll/vector.ll +++ b/ptx/src/test/ll/vector.ll @@ -1,20 +1,8 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - define <2 x i32> @impl(<2 x i32> %"9") #0 { - %"53" = alloca <2 x i32>, align 8, addrspace(5) - %"54" = alloca <2 x i32>, align 8, addrspace(5) - %"55" = alloca i32, align 4, addrspace(5) - %"56" = alloca i32, align 4, addrspace(5) + %"47" = alloca <2 x i32>, align 8, addrspace(5) + %"48" = alloca <2 x i32>, align 8, addrspace(5) + %"49" = alloca i32, align 4, addrspace(5) + %"50" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 @@ -22,66 +10,66 @@ define <2 x i32> @impl(<2 x i32> %"9") #0 { "44": ; preds = %1 %"38" = extractelement <2 x 
i32> %"9", i8 0 - store i32 %"38", ptr addrspace(5) %"55", align 4 + store i32 %"38", ptr addrspace(5) %"49", align 4 %"39" = extractelement <2 x i32> %"9", i8 1 - store i32 %"39", ptr addrspace(5) %"56", align 4 - %"60" = load i32, ptr addrspace(5) %"55", align 4 - %"61" = load i32, ptr addrspace(5) %"56", align 4 - %"59" = add i32 %"60", %"61" - store i32 %"59", ptr addrspace(5) %"56", align 4 - %"62" = load i32, ptr addrspace(5) %"56", align 4 - %"64" = load <2 x i32>, ptr addrspace(5) %"54", align 8 - %"63" = insertelement <2 x i32> %"64", i32 %"62", i8 0 - store <2 x i32> %"63", ptr addrspace(5) %"54", align 8 - %"65" = load i32, ptr addrspace(5) %"56", align 4 - %"67" = load <2 x i32>, ptr addrspace(5) %"54", align 8 - %"66" = insertelement <2 x i32> %"67", i32 %"65", i8 1 - store <2 x i32> %"66", ptr addrspace(5) %"54", align 8 - %"68" = load <2 x i32>, ptr addrspace(5) %"54", align 8 - %"43" = extractelement <2 x i32> %"68", i8 1 - %"70" = load <2 x i32>, ptr addrspace(5) %"54", align 8 - %"69" = insertelement <2 x i32> %"70", i32 %"43", i8 0 - store <2 x i32> %"69", ptr addrspace(5) %"54", align 8 - %"72" = load <2 x i32>, ptr addrspace(5) %"54", align 8 - store <2 x i32> %"72", ptr addrspace(5) %"53", align 8 - %2 = load <2 x i32>, ptr addrspace(5) %"53", align 8 + store i32 %"39", ptr addrspace(5) %"50", align 4 + %"54" = load i32, ptr addrspace(5) %"49", align 4 + %"55" = load i32, ptr addrspace(5) %"50", align 4 + %"53" = add i32 %"54", %"55" + store i32 %"53", ptr addrspace(5) %"50", align 4 + %"56" = load i32, ptr addrspace(5) %"50", align 4 + %"58" = load <2 x i32>, ptr addrspace(5) %"48", align 8 + %"57" = insertelement <2 x i32> %"58", i32 %"56", i8 0 + store <2 x i32> %"57", ptr addrspace(5) %"48", align 8 + %"59" = load i32, ptr addrspace(5) %"50", align 4 + %"61" = load <2 x i32>, ptr addrspace(5) %"48", align 8 + %"60" = insertelement <2 x i32> %"61", i32 %"59", i8 1 + store <2 x i32> %"60", ptr addrspace(5) %"48", align 8 + %"62" = load <2 x 
i32>, ptr addrspace(5) %"48", align 8 + %"43" = extractelement <2 x i32> %"62", i8 1 + %"64" = load <2 x i32>, ptr addrspace(5) %"48", align 8 + %"63" = insertelement <2 x i32> %"64", i32 %"43", i8 0 + store <2 x i32> %"63", ptr addrspace(5) %"48", align 8 + %"66" = load <2 x i32>, ptr addrspace(5) %"48", align 8 + store <2 x i32> %"66", ptr addrspace(5) %"47", align 8 + %2 = load <2 x i32>, ptr addrspace(5) %"47", align 8 ret <2 x i32> %2 } -define amdgpu_kernel void @vector(ptr addrspace(4) byref(i64) %"73", ptr addrspace(4) byref(i64) %"74") #1 { - %"75" = alloca i64, align 8, addrspace(5) - %"76" = alloca i64, align 8, addrspace(5) - %"77" = alloca <2 x i32>, align 8, addrspace(5) - %"78" = alloca i32, align 4, addrspace(5) - %"79" = alloca i32, align 4, addrspace(5) - %"80" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @vector(ptr addrspace(4) byref(i64) %"67", ptr addrspace(4) byref(i64) %"68") #1 { + %"69" = alloca i64, align 8, addrspace(5) + %"70" = alloca i64, align 8, addrspace(5) + %"71" = alloca <2 x i32>, align 8, addrspace(5) + %"72" = alloca i32, align 4, addrspace(5) + %"73" = alloca i32, align 4, addrspace(5) + %"74" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 br label %"45" "45": ; preds = %1 - %"81" = load i64, ptr addrspace(4) %"73", align 4 - store i64 %"81", ptr addrspace(5) %"75", align 4 - %"82" = load i64, ptr addrspace(4) %"74", align 4 - store i64 %"82", ptr addrspace(5) %"76", align 4 - %"84" = load i64, ptr addrspace(5) %"75", align 4 - %"91" = inttoptr i64 %"84" to ptr - %"83" = load <2 x i32>, ptr %"91", align 8 - store <2 x i32> %"83", ptr addrspace(5) %"77", align 8 - %"86" = load <2 x i32>, ptr addrspace(5) %"77", align 8 - %"85" = call <2 x i32> @impl(<2 x i32> %"86") - store <2 x i32> %"85", ptr addrspace(5) %"77", align 8 + %"75" = load i64, ptr addrspace(4) %"67", align 4 + store i64 %"75", ptr addrspace(5) %"69", align 4 + %"76" = load i64, ptr addrspace(4) %"68", align 4 + store i64 
%"76", ptr addrspace(5) %"70", align 4 + %"78" = load i64, ptr addrspace(5) %"69", align 4 + %"85" = inttoptr i64 %"78" to ptr + %"77" = load <2 x i32>, ptr %"85", align 8 + store <2 x i32> %"77", ptr addrspace(5) %"71", align 8 + %"80" = load <2 x i32>, ptr addrspace(5) %"71", align 8 + %"79" = call <2 x i32> @impl(<2 x i32> %"80") + store <2 x i32> %"79", ptr addrspace(5) %"71", align 8 br label %"46" "46": ; preds = %"45" - %"88" = load <2 x i32>, ptr addrspace(5) %"77", align 8 - %"92" = bitcast <2 x i32> %"88" to i64 - store i64 %"92", ptr addrspace(5) %"80", align 4 - %"89" = load i64, ptr addrspace(5) %"76", align 4 - %"90" = load <2 x i32>, ptr addrspace(5) %"77", align 8 - %"93" = inttoptr i64 %"89" to ptr - store <2 x i32> %"90", ptr %"93", align 8 + %"82" = load <2 x i32>, ptr addrspace(5) %"71", align 8 + %"86" = bitcast <2 x i32> %"82" to i64 + store i64 %"86", ptr addrspace(5) %"74", align 4 + %"83" = load i64, ptr addrspace(5) %"70", align 4 + %"84" = load <2 x i32>, ptr addrspace(5) %"71", align 8 + %"87" = inttoptr i64 %"83" to ptr + store <2 x i32> %"84", ptr %"87", align 8 ret void } diff --git a/ptx/src/test/ll/vector4.ll b/ptx/src/test/ll/vector4.ll index ceeedf7..cf32621 100644 --- a/ptx/src/test/ll/vector4.ll +++ b/ptx/src/test/ll/vector4.ll @@ -1,43 +1,30 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @vector4(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #1 { - %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca i64, align 8, addrspace(5) - %"42" = alloca <4 x i32>, align 16, addrspace(5) - %"43" = alloca i32, align 4, addrspace(5) +define amdgpu_kernel void @vector4(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) 
byref(i64) %"33") #0 { + %"34" = alloca i64, align 8, addrspace(5) + %"35" = alloca i64, align 8, addrspace(5) + %"36" = alloca <4 x i32>, align 16, addrspace(5) + %"37" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 br label %"31" "31": ; preds = %1 - %"44" = load i64, ptr addrspace(4) %"38", align 4 - store i64 %"44", ptr addrspace(5) %"40", align 4 - %"45" = load i64, ptr addrspace(4) %"39", align 4 - store i64 %"45", ptr addrspace(5) %"41", align 4 - %"47" = load i64, ptr addrspace(5) %"40", align 4 - %"52" = inttoptr i64 %"47" to ptr - %"46" = load <4 x i32>, ptr %"52", align 16 - store <4 x i32> %"46", ptr addrspace(5) %"42", align 16 - %"48" = load <4 x i32>, ptr addrspace(5) %"42", align 16 - %"30" = extractelement <4 x i32> %"48", i8 3 - store i32 %"30", ptr addrspace(5) %"43", align 4 - %"50" = load i64, ptr addrspace(5) %"41", align 4 - %"51" = load i32, ptr addrspace(5) %"43", align 4 - %"55" = inttoptr i64 %"50" to ptr - store i32 %"51", ptr %"55", align 4 + %"38" = load i64, ptr addrspace(4) %"32", align 4 + store i64 %"38", ptr addrspace(5) %"34", align 4 + %"39" = load i64, ptr addrspace(4) %"33", align 4 + store i64 %"39", ptr addrspace(5) %"35", align 4 + %"41" = load i64, ptr addrspace(5) %"34", align 4 + %"46" = inttoptr i64 %"41" to ptr + %"40" = load <4 x i32>, ptr %"46", align 16 + store <4 x i32> %"40", ptr addrspace(5) %"36", align 16 + %"42" = load <4 x i32>, ptr addrspace(5) %"36", align 16 + %"30" = extractelement <4 x i32> %"42", i8 3 + store i32 %"30", ptr addrspace(5) %"37", align 4 + %"44" = load i64, ptr addrspace(5) %"35", align 4 + %"45" = load i32, ptr addrspace(5) %"37", align 4 + %"49" = inttoptr i64 %"44" to ptr + store i32 %"45", ptr %"49", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" 
"denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/vector_extract.ll b/ptx/src/test/ll/vector_extract.ll index 694e00c..9c615ca 100644 --- a/ptx/src/test/ll/vector_extract.ll +++ b/ptx/src/test/ll/vector_extract.ll @@ -1,99 +1,86 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @vector_extract(ptr addrspace(4) byref(i64) %"46", ptr addrspace(4) byref(i64) %"47") #1 { - %"48" = alloca i64, align 8, addrspace(5) - %"49" = alloca i64, align 8, addrspace(5) - %"50" = alloca i16, align 2, addrspace(5) - %"51" = alloca i16, align 2, addrspace(5) - %"52" = alloca i16, align 2, addrspace(5) - %"53" = alloca i16, align 2, addrspace(5) - %"54" = alloca <4 x i16>, align 8, addrspace(5) +define amdgpu_kernel void @vector_extract(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #0 { + %"42" = alloca i64, align 8, addrspace(5) + %"43" = alloca i64, align 8, addrspace(5) + %"44" = alloca i16, align 2, addrspace(5) + %"45" = alloca i16, align 2, addrspace(5) + %"46" = alloca i16, align 2, addrspace(5) + %"47" = alloca i16, align 2, addrspace(5) + %"48" = alloca <4 x i16>, align 8, addrspace(5) br label %1 1: ; preds = %0 br label %"39" "39": ; preds = %1 - %"55" = load i64, ptr addrspace(4) %"46", align 4 - store i64 %"55", ptr addrspace(5) %"48", align 4 - %"56" = load i64, ptr addrspace(4) %"47", align 4 - store i64 %"56", ptr 
addrspace(5) %"49", align 4 - %"57" = load i64, ptr addrspace(5) %"48", align 4 - %"85" = inttoptr i64 %"57" to ptr addrspace(1) - %"33" = load <4 x i8>, ptr addrspace(1) %"85", align 4 - %"86" = extractelement <4 x i8> %"33", i8 0 - %"87" = extractelement <4 x i8> %"33", i8 1 - %"88" = extractelement <4 x i8> %"33", i8 2 - %"89" = extractelement <4 x i8> %"33", i8 3 - %"58" = zext i8 %"86" to i16 - %"59" = zext i8 %"87" to i16 - %"60" = zext i8 %"88" to i16 - %"61" = zext i8 %"89" to i16 - store i16 %"58", ptr addrspace(5) %"50", align 2 - store i16 %"59", ptr addrspace(5) %"51", align 2 - store i16 %"60", ptr addrspace(5) %"52", align 2 - store i16 %"61", ptr addrspace(5) %"53", align 2 - %"62" = load i16, ptr addrspace(5) %"51", align 2 - %"63" = load i16, ptr addrspace(5) %"52", align 2 - %"64" = load i16, ptr addrspace(5) %"53", align 2 - %"65" = load i16, ptr addrspace(5) %"50", align 2 - %2 = insertelement <4 x i16> undef, i16 %"62", i8 0 - %3 = insertelement <4 x i16> %2, i16 %"63", i8 1 - %4 = insertelement <4 x i16> %3, i16 %"64", i8 2 - %"34" = insertelement <4 x i16> %4, i16 %"65", i8 3 - store <4 x i16> %"34", ptr addrspace(5) %"54", align 8 - %"67" = load <4 x i16>, ptr addrspace(5) %"54", align 8 - %"68" = extractelement <4 x i16> %"67", i8 0 - %"69" = extractelement <4 x i16> %"67", i8 1 - %"70" = extractelement <4 x i16> %"67", i8 2 - %"71" = extractelement <4 x i16> %"67", i8 3 - store i16 %"68", ptr addrspace(5) %"52", align 2 - store i16 %"69", ptr addrspace(5) %"53", align 2 - store i16 %"70", ptr addrspace(5) %"50", align 2 - store i16 %"71", ptr addrspace(5) %"51", align 2 - %"72" = load i16, ptr addrspace(5) %"52", align 2 - %"73" = load i16, ptr addrspace(5) %"53", align 2 - %"74" = load i16, ptr addrspace(5) %"50", align 2 - %"75" = load i16, ptr addrspace(5) %"51", align 2 - %5 = insertelement <4 x i16> undef, i16 %"72", i8 0 - %6 = insertelement <4 x i16> %5, i16 %"73", i8 1 - %7 = insertelement <4 x i16> %6, i16 %"74", i8 2 - %"37" = 
insertelement <4 x i16> %7, i16 %"75", i8 3 - %"76" = extractelement <4 x i16> %"37", i8 0 - %"77" = extractelement <4 x i16> %"37", i8 1 - %"78" = extractelement <4 x i16> %"37", i8 2 - %"79" = extractelement <4 x i16> %"37", i8 3 - store i16 %"76", ptr addrspace(5) %"53", align 2 - store i16 %"77", ptr addrspace(5) %"50", align 2 - store i16 %"78", ptr addrspace(5) %"51", align 2 - store i16 %"79", ptr addrspace(5) %"52", align 2 - %"80" = load i16, ptr addrspace(5) %"50", align 2 - %"81" = load i16, ptr addrspace(5) %"51", align 2 - %"82" = load i16, ptr addrspace(5) %"52", align 2 - %"83" = load i16, ptr addrspace(5) %"53", align 2 - %"90" = trunc i16 %"80" to i8 - %"91" = trunc i16 %"81" to i8 - %"92" = trunc i16 %"82" to i8 - %"93" = trunc i16 %"83" to i8 - %8 = insertelement <4 x i8> undef, i8 %"90", i8 0 - %9 = insertelement <4 x i8> %8, i8 %"91", i8 1 - %10 = insertelement <4 x i8> %9, i8 %"92", i8 2 - %"38" = insertelement <4 x i8> %10, i8 %"93", i8 3 - %"84" = load i64, ptr addrspace(5) %"49", align 4 - %"94" = inttoptr i64 %"84" to ptr addrspace(1) - store <4 x i8> %"38", ptr addrspace(1) %"94", align 4 + %"49" = load i64, ptr addrspace(4) %"40", align 4 + store i64 %"49", ptr addrspace(5) %"42", align 4 + %"50" = load i64, ptr addrspace(4) %"41", align 4 + store i64 %"50", ptr addrspace(5) %"43", align 4 + %"51" = load i64, ptr addrspace(5) %"42", align 4 + %"79" = inttoptr i64 %"51" to ptr addrspace(1) + %"33" = load <4 x i8>, ptr addrspace(1) %"79", align 4 + %"80" = extractelement <4 x i8> %"33", i8 0 + %"81" = extractelement <4 x i8> %"33", i8 1 + %"82" = extractelement <4 x i8> %"33", i8 2 + %"83" = extractelement <4 x i8> %"33", i8 3 + %"52" = zext i8 %"80" to i16 + %"53" = zext i8 %"81" to i16 + %"54" = zext i8 %"82" to i16 + %"55" = zext i8 %"83" to i16 + store i16 %"52", ptr addrspace(5) %"44", align 2 + store i16 %"53", ptr addrspace(5) %"45", align 2 + store i16 %"54", ptr addrspace(5) %"46", align 2 + store i16 %"55", ptr addrspace(5) 
%"47", align 2 + %"56" = load i16, ptr addrspace(5) %"45", align 2 + %"57" = load i16, ptr addrspace(5) %"46", align 2 + %"58" = load i16, ptr addrspace(5) %"47", align 2 + %"59" = load i16, ptr addrspace(5) %"44", align 2 + %2 = insertelement <4 x i16> undef, i16 %"56", i8 0 + %3 = insertelement <4 x i16> %2, i16 %"57", i8 1 + %4 = insertelement <4 x i16> %3, i16 %"58", i8 2 + %"34" = insertelement <4 x i16> %4, i16 %"59", i8 3 + store <4 x i16> %"34", ptr addrspace(5) %"48", align 8 + %"61" = load <4 x i16>, ptr addrspace(5) %"48", align 8 + %"62" = extractelement <4 x i16> %"61", i8 0 + %"63" = extractelement <4 x i16> %"61", i8 1 + %"64" = extractelement <4 x i16> %"61", i8 2 + %"65" = extractelement <4 x i16> %"61", i8 3 + store i16 %"62", ptr addrspace(5) %"46", align 2 + store i16 %"63", ptr addrspace(5) %"47", align 2 + store i16 %"64", ptr addrspace(5) %"44", align 2 + store i16 %"65", ptr addrspace(5) %"45", align 2 + %"66" = load i16, ptr addrspace(5) %"46", align 2 + %"67" = load i16, ptr addrspace(5) %"47", align 2 + %"68" = load i16, ptr addrspace(5) %"44", align 2 + %"69" = load i16, ptr addrspace(5) %"45", align 2 + %5 = insertelement <4 x i16> undef, i16 %"66", i8 0 + %6 = insertelement <4 x i16> %5, i16 %"67", i8 1 + %7 = insertelement <4 x i16> %6, i16 %"68", i8 2 + %"37" = insertelement <4 x i16> %7, i16 %"69", i8 3 + %"70" = extractelement <4 x i16> %"37", i8 0 + %"71" = extractelement <4 x i16> %"37", i8 1 + %"72" = extractelement <4 x i16> %"37", i8 2 + %"73" = extractelement <4 x i16> %"37", i8 3 + store i16 %"70", ptr addrspace(5) %"47", align 2 + store i16 %"71", ptr addrspace(5) %"44", align 2 + store i16 %"72", ptr addrspace(5) %"45", align 2 + store i16 %"73", ptr addrspace(5) %"46", align 2 + %"74" = load i16, ptr addrspace(5) %"44", align 2 + %"75" = load i16, ptr addrspace(5) %"45", align 2 + %"76" = load i16, ptr addrspace(5) %"46", align 2 + %"77" = load i16, ptr addrspace(5) %"47", align 2 + %"84" = trunc i16 %"74" to i8 + %"85" = 
trunc i16 %"75" to i8 + %"86" = trunc i16 %"76" to i8 + %"87" = trunc i16 %"77" to i8 + %8 = insertelement <4 x i8> undef, i8 %"84", i8 0 + %9 = insertelement <4 x i8> %8, i8 %"85", i8 1 + %10 = insertelement <4 x i8> %9, i8 %"86", i8 2 + %"38" = insertelement <4 x i8> %10, i8 %"87", i8 3 + %"78" = load i64, ptr addrspace(5) %"43", align 4 + %"88" = inttoptr i64 %"78" to ptr addrspace(1) + store <4 x i8> %"38", ptr addrspace(1) %"88", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/xor.ll b/ptx/src/test/ll/xor.ll index 7cb37d3..6f9633d 100644 --- a/ptx/src/test/ll/xor.ll +++ b/ptx/src/test/ll/xor.ll @@ -1,49 +1,36 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @xor(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #1 { - %"41" = alloca i64, align 8, addrspace(5) - %"42" = alloca i64, align 8, addrspace(5) - %"43" = alloca i32, align 4, addrspace(5) - %"44" = alloca i32, align 4, addrspace(5) +define amdgpu_kernel void @xor(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 { + %"35" = alloca i64, align 8, addrspace(5) + %"36" = alloca i64, 
align 8, addrspace(5) + %"37" = alloca i32, align 4, addrspace(5) + %"38" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 br label %"32" "32": ; preds = %1 - %"45" = load i64, ptr addrspace(4) %"39", align 4 - store i64 %"45", ptr addrspace(5) %"41", align 4 - %"46" = load i64, ptr addrspace(4) %"40", align 4 - store i64 %"46", ptr addrspace(5) %"42", align 4 - %"48" = load i64, ptr addrspace(5) %"41", align 4 - %"56" = inttoptr i64 %"48" to ptr - %"47" = load i32, ptr %"56", align 4 - store i32 %"47", ptr addrspace(5) %"43", align 4 - %"49" = load i64, ptr addrspace(5) %"41", align 4 - %"57" = inttoptr i64 %"49" to ptr - %"31" = getelementptr inbounds i8, ptr %"57", i64 4 - %"50" = load i32, ptr %"31", align 4 - store i32 %"50", ptr addrspace(5) %"44", align 4 - %"52" = load i32, ptr addrspace(5) %"43", align 4 - %"53" = load i32, ptr addrspace(5) %"44", align 4 - %"51" = xor i32 %"52", %"53" - store i32 %"51", ptr addrspace(5) %"43", align 4 - %"54" = load i64, ptr addrspace(5) %"42", align 4 - %"55" = load i32, ptr addrspace(5) %"43", align 4 - %"58" = inttoptr i64 %"54" to ptr - store i32 %"55", ptr %"58", align 4 + %"39" = load i64, ptr addrspace(4) %"33", align 4 + store i64 %"39", ptr addrspace(5) %"35", align 4 + %"40" = load i64, ptr addrspace(4) %"34", align 4 + store i64 %"40", ptr addrspace(5) %"36", align 4 + %"42" = load i64, ptr addrspace(5) %"35", align 4 + %"50" = inttoptr i64 %"42" to ptr + %"41" = load i32, ptr %"50", align 4 + store i32 %"41", ptr addrspace(5) %"37", align 4 + %"43" = load i64, ptr addrspace(5) %"35", align 4 + %"51" = inttoptr i64 %"43" to ptr + %"31" = getelementptr inbounds i8, ptr %"51", i64 4 + %"44" = load i32, ptr %"31", align 4 + store i32 %"44", ptr addrspace(5) %"38", align 4 + %"46" = load i32, ptr addrspace(5) %"37", align 4 + %"47" = load i32, ptr addrspace(5) %"38", align 4 + %"45" = xor i32 %"46", %"47" + store i32 %"45", ptr addrspace(5) %"37", align 4 + %"48" = load i64, ptr addrspace(5) %"36", 
align 4 + %"49" = load i32, ptr addrspace(5) %"37", align 4 + %"52" = inttoptr i64 %"48" to ptr + store i32 %"49", ptr %"52", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/spirv_run/mod.rs b/ptx/src/test/spirv_run/mod.rs index 6f8593c..cafa480 100644 --- a/ptx/src/test/spirv_run/mod.rs +++ b/ptx/src/test/spirv_run/mod.rs @@ -195,6 +195,10 @@ test_ptx!(activemask, [0u32], [1u32]); test_ptx!(membar, [152731u32], [152731u32]); test_ptx!(shared_unify_extern, [7681u64, 7682u64], [15363u64]); test_ptx!(shared_unify_local, [16752u64, 714u64], [17466u64]); +// This test currently fails for reasons outside of ZLUDA's control. +// One of the LLVM passes does not understand that setreg instruction changes +// global floating point state and assumes that both floating point +// additions are the exact same expressions and optimizes second addition away. test_ptx!( add_ftz, [f32::from_bits(0x800000), f32::from_bits(0x007FFFFF)], @@ -272,7 +276,7 @@ fn test_llvm_assert<'a>( let mut output_file = File::create(output_file).unwrap(); output_file.write_all(actual_ll.as_bytes()).unwrap(); } - let comparison = pretty_assertions::StrComparison::new(actual_ll, expected_ll); + let comparison = pretty_assertions::StrComparison::new(expected_ll, actual_ll); panic!("assertion failed: `(left == right)`\n\n{}", comparison); } Ok(())