Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

s390x: Add support for inline probestack #9423

Merged
merged 1 commit into from
Oct 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 48 additions & 4 deletions cranelift/codegen/src/isa/s390x/abi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -648,12 +648,56 @@ impl ABIMachineSpec for S390xMachineDeps {
}

fn gen_inline_probestack(
_insts: &mut SmallInstVec<Self::I>,
insts: &mut SmallInstVec<Self::I>,
_call_conv: isa::CallConv,
_frame_size: u32,
_guard_size: u32,
frame_size: u32,
guard_size: u32,
) {
unimplemented!("Inline stack probing is unimplemented on S390x");
// Number of probes that we need to perform
let probe_count = align_to(frame_size, guard_size) / guard_size;

// The stack probe loop currently takes 4 instructions and each unrolled
// probe takes 2. Set this to 2 to keep the max size to 4 instructions.
const PROBE_MAX_UNROLL: u32 = 2;

if probe_count <= PROBE_MAX_UNROLL {
// Unrolled probe loop.
for _ in 0..probe_count {
insts.extend(Self::gen_sp_reg_adjust(-(guard_size as i32)));

insts.push(Inst::StoreImm8 {
imm: 0,
mem: MemArg::reg(stack_reg(), MemFlags::trusted()),
});
}
} else {
// Explicit probe loop.

// Load the number of probes into a register used as loop counter.
// `gen_inline_probestack` is called after regalloc2, so we can
// use the nonallocatable spilltmp register for this purpose.
let probe_count_reg = writable_spilltmp_reg();
if let Ok(probe_count) = i16::try_from(probe_count) {
insts.push(Inst::Mov32SImm16 {
rd: probe_count_reg,
imm: probe_count,
});
} else {
insts.push(Inst::Mov32Imm {
rd: probe_count_reg,
imm: probe_count,
});
}

// Emit probe loop. The guard size is assumed to fit in 16 bits.
insts.push(Inst::StackProbeLoop {
probe_count: probe_count_reg,
guard_size: i16::try_from(guard_size).unwrap(),
});
}

// Restore the stack pointer to its original position.
insts.extend(Self::gen_sp_reg_adjust((probe_count * guard_size) as i32));
}

fn gen_clobber_save(
Expand Down
5 changes: 5 additions & 0 deletions cranelift/codegen/src/isa/s390x/inst.isle
Original file line number Diff line number Diff line change
Expand Up @@ -977,6 +977,11 @@
(ridx Reg)
(targets BoxVecMachLabel))

;; Stack probe loop sequence, as one compound instruction.
(StackProbeLoop
(probe_count WritableReg)
(guard_size i16))

;; Load an inline symbol reference with relocation.
(LoadSymbolReloc
(rd WritableReg)
Expand Down
30 changes: 30 additions & 0 deletions cranelift/codegen/src/isa/s390x/inst/emit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3383,6 +3383,36 @@ impl Inst {
start_off = sink.cur_offset();
}

Inst::StackProbeLoop {
probe_count,
guard_size,
} => {
// Emit the loop start label
let loop_start = sink.get_label();
sink.bind_label(loop_start, state.ctrl_plane_mut());

// aghi %r15, -GUARD_SIZE
let inst = Inst::AluRSImm16 {
alu_op: ALUOp::Add64,
rd: writable_stack_reg(),
ri: stack_reg(),
imm: -guard_size,
};
inst.emit(sink, emit_info, state);

// mvi 0(%r15), 0
let inst = Inst::StoreImm8 {
imm: 0,
mem: MemArg::reg(stack_reg(), MemFlags::trusted()),
};
inst.emit(sink, emit_info, state);

// brct PROBE_COUNT, LOOP_START
let opcode = 0xa76; // BRCT
sink.use_label_at_offset(sink.cur_offset(), loop_start, LabelUse::BranchRI);
put(sink, &enc_ri_b(opcode, probe_count.to_reg(), 0));
}

&Inst::Unwind { ref inst } => {
sink.add_unwind(inst.clone());
}
Expand Down
9 changes: 9 additions & 0 deletions cranelift/codegen/src/isa/s390x/inst/emit_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7032,6 +7032,15 @@ fn test_s390x_binemit() {
"0: cr %r2, %r3 ; jgnh 1f ; cs %r4, %r5, 0(%r6) ; jglh 0b ; 1:",
));

insns.push((
Inst::StackProbeLoop {
probe_count: writable_gpr(1),
guard_size: 4096,
},
"A7FBF0009200F000A716FFFC",
"0: aghi %r15, -4096 ; mvi 0(%r15), 0 ; brct %r1, 0b",
));

insns.push((
Inst::FpuMove32 {
rd: writable_vr(8),
Expand Down
12 changes: 12 additions & 0 deletions cranelift/codegen/src/isa/s390x/inst/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,7 @@ impl Inst {
| Inst::Debugtrap
| Inst::Trap { .. }
| Inst::JTSequence { .. }
| Inst::StackProbeLoop { .. }
| Inst::LoadSymbolReloc { .. }
| Inst::LoadAddr { .. }
| Inst::Loop { .. }
Expand Down Expand Up @@ -969,6 +970,9 @@ fn s390x_get_operands(inst: &mut Inst, collector: &mut DenyReuseVisitor<impl Ope
collector.reg_def(rd);
memarg_operands(mem, collector);
}
Inst::StackProbeLoop { probe_count, .. } => {
collector.reg_early_def(probe_count);
}
Inst::Loop { body, .. } => {
// `reuse_def` constraints can't be permitted in a Loop instruction because the operand
// index will always be relative to the Loop instruction, not the individual
Expand Down Expand Up @@ -3286,6 +3290,14 @@ impl Inst {

format!("{mem_str}{op} {rd}, {mem}")
}
&Inst::StackProbeLoop {
probe_count,
guard_size,
} => {
let probe_count = pretty_print_reg(probe_count.to_reg());
let stack_reg = pretty_print_reg(stack_reg());
format!("0: aghi {stack_reg}, -{guard_size} ; mvi 0({stack_reg}), 0 ; brct {probe_count}, 0b")
}
&Inst::Loop { ref body, cond } => {
let body = body
.into_iter()
Expand Down
96 changes: 96 additions & 0 deletions cranelift/filetests/filetests/isa/s390x/inline-probestack.clif
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
test compile precise-output
set enable_probestack=true
set probestack_strategy=inline
; This is the default and is equivalent to a page size of 4096
set probestack_size_log2=12
target s390x


; If the stack size is just one page, we can avoid the stack probe entirely
function %single_page() -> i64 tail {
ss0 = explicit_slot 2048

block0:
v1 = stack_addr.i64 ss0
return v1
}

; VCode:
; aghi %r15, -2048
; block0:
; la %r2, 0(%r15)
; aghi %r15, 2048
; br %r14
;
; Disassembled:
; block0: ; offset 0x0
; aghi %r15, -0x800
; block1: ; offset 0x4
; la %r2, 0(%r15)
; aghi %r15, 0x800
; br %r14

function %unrolled() -> i64 tail {
ss0 = explicit_slot 8192

block0:
v1 = stack_addr.i64 ss0
return v1
}

; VCode:
; aghi %r15, -4096
; mvi 0(%r15), 0
; aghi %r15, -4096
; mvi 0(%r15), 0
; aghi %r15, 8192
; aghi %r15, -8192
; block0:
; la %r2, 0(%r15)
; aghi %r15, 8192
; br %r14
;
; Disassembled:
; block0: ; offset 0x0
; aghi %r15, -0x1000
; mvi 0(%r15), 0
; aghi %r15, -0x1000
; mvi 0(%r15), 0
; aghi %r15, 0x2000
; aghi %r15, -0x2000
; block1: ; offset 0x18
; la %r2, 0(%r15)
; aghi %r15, 0x2000
; br %r14

function %large() -> i64 tail {
ss0 = explicit_slot 100000

block0:
v1 = stack_addr.i64 ss0
return v1
}

; VCode:
; lhi %r1, 25
; 0: aghi %r15, -4096 ; mvi 0(%r15), 0 ; brct %r1, 0b
; agfi %r15, 102400
; agfi %r15, -100000
; block0:
; la %r2, 0(%r15)
; agfi %r15, 100000
; br %r14
;
; Disassembled:
; block0: ; offset 0x0
; lhi %r1, 0x19
; aghi %r15, -0x1000
; mvi 0(%r15), 0
; brct %r1, 4
; agfi %r15, 0x19000
; agfi %r15, -0x186a0
; block1: ; offset 0x1c
; la %r2, 0(%r15)
; agfi %r15, 0x186a0
; br %r14

18 changes: 4 additions & 14 deletions crates/wasmtime/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ use core::str::FromStr;
use serde_derive::{Deserialize, Serialize};
#[cfg(any(feature = "cache", feature = "cranelift", feature = "winch"))]
use std::path::Path;
use target_lexicon::Architecture;
use wasmparser::WasmFeatures;
#[cfg(feature = "cache")]
use wasmtime_cache::CacheConfig;
Expand Down Expand Up @@ -2068,16 +2067,14 @@ impl Config {

let target = self.compiler_target();

// On supported targets, we enable stack probing by default.
// We enable stack probing by default on all targets.
// This is required on Windows because of the way Windows
// commits its stacks, but it's also a good idea on other
// platforms to ensure guard pages are hit for large frame
// sizes.
if probestack_supported(target.architecture) {
self.compiler_config
.flags
.insert("enable_probestack".into());
}
self.compiler_config
.flags
.insert("enable_probestack".into());

if let Some(unwind_requested) = self.native_unwind_info {
if !self
Expand Down Expand Up @@ -3021,13 +3018,6 @@ impl PoolingAllocationConfig {
}
}

pub(crate) fn probestack_supported(arch: Architecture) -> bool {
matches!(
arch,
Architecture::X86_64 | Architecture::Aarch64(_) | Architecture::Riscv64(_)
)
}

#[cfg(feature = "std")]
fn detect_host_feature(feature: &str) -> Option<bool> {
#[cfg(target_arch = "aarch64")]
Expand Down
2 changes: 1 addition & 1 deletion crates/wasmtime/src/engine.rs
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,7 @@ impl Engine {
// runtime.
"libcall_call_conv" => *value == FlagValue::Enum("isa_default".into()),
"preserve_frame_pointers" => *value == FlagValue::Bool(true),
"enable_probestack" => *value == FlagValue::Bool(crate::config::probestack_supported(target.architecture)),
"enable_probestack" => *value == FlagValue::Bool(true),
"probestack_strategy" => *value == FlagValue::Enum("inline".into()),

// Features wasmtime doesn't use should all be disabled, since
Expand Down