From 2d0db85574bcf43dee440e97550d1381cd484937 Mon Sep 17 00:00:00 2001 From: Masato Imai Date: Fri, 22 Aug 2025 14:32:21 +0000 Subject: [PATCH] add XSETBV, CLAC, STAC, initrd --- .gitignore | 1 + nel_os_bootloader/run-qemu.sh | 2 +- nel_os_kernel/src/vmm/x86_64/common/linux.rs | 11 + .../src/vmm/x86_64/intel/controls.rs | 8 +- nel_os_kernel/src/vmm/x86_64/intel/fpu.rs | 67 ++++++ nel_os_kernel/src/vmm/x86_64/intel/io.rs | 29 +++ nel_os_kernel/src/vmm/x86_64/intel/mod.rs | 1 + nel_os_kernel/src/vmm/x86_64/intel/vcpu.rs | 222 ++++++++++++++++-- 8 files changed, 315 insertions(+), 26 deletions(-) create mode 100644 nel_os_kernel/src/vmm/x86_64/intel/fpu.rs diff --git a/.gitignore b/.gitignore index 74edcc1..9f94315 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ nel_os_bootloader/fat.img nel_os_bootloader/myOSimage.img nel_os_bootloader/iso/ nel_os_bootloader/nel_os.iso +nel_os_bootloader/vmlinux diff --git a/nel_os_bootloader/run-qemu.sh b/nel_os_bootloader/run-qemu.sh index 6510e85..a047cc8 100755 --- a/nel_os_bootloader/run-qemu.sh +++ b/nel_os_bootloader/run-qemu.sh @@ -6,7 +6,7 @@ EFI_BINARY="$1" ./create-iso.sh "$EFI_BINARY" qemu-system-x86_64 -enable-kvm \ - -m 512M \ + -m 2G \ -serial mon:stdio \ -nographic \ -no-reboot \ diff --git a/nel_os_kernel/src/vmm/x86_64/common/linux.rs b/nel_os_kernel/src/vmm/x86_64/common/linux.rs index 28cc70d..ebc1119 100644 --- a/nel_os_kernel/src/vmm/x86_64/common/linux.rs +++ b/nel_os_kernel/src/vmm/x86_64/common/linux.rs @@ -10,6 +10,12 @@ pub fn load_kernel(vcpu: &mut dyn VCpu) -> Result<(), &'static str> { let kernel = unsafe { core::slice::from_raw_parts(*kernel_addr as *const u8, *kernel_size as usize) }; + let initrd_addr = crate::ROOTFS_ADDR.get().unwrap(); + let initrd_size = crate::ROOTFS_SIZE.get().unwrap(); + + let initrd = + unsafe { core::slice::from_raw_parts(*initrd_addr as *const u8, *initrd_size as usize) }; + info!("Creating boot parameters"); let guest_mem_size = vcpu.get_guest_memory_size(); let mut bp = BootParams::from_bytes(kernel)?; @@ -23,6 +29,8 @@ pub fn load_kernel(vcpu: &mut dyn VCpu) -> Result<(), &'static str> { bp.hdr.loadflags.set_keep_segments(true); bp.hdr.cmd_line_ptr = LAYOUT_CMDLINE as u32; bp.hdr.vid_mode = 0xFFFF; + bp.hdr.ramdisk_image = LAYOUT_INITRD as u32; + bp.hdr.ramdisk_size = initrd.len() as u32; bp.add_e820_entry(0, LAYOUT_KERNEL_BASE, E820Type::Ram); bp.add_e820_entry( @@ -65,6 +73,9 @@ pub fn load_kernel(vcpu: &mut dyn VCpu) -> Result<(), &'static str> { LAYOUT_KERNEL_BASE as usize, )?; + info!("Loading initrd image into guest memory"); + load_image(vcpu, initrd, LAYOUT_INITRD as usize)?; + Ok(()) } diff --git a/nel_os_kernel/src/vmm/x86_64/intel/controls.rs b/nel_os_kernel/src/vmm/x86_64/intel/controls.rs index d5806a6..c410e12 100644 --- a/nel_os_kernel/src/vmm/x86_64/intel/controls.rs +++ b/nel_os_kernel/src/vmm/x86_64/intel/controls.rs @@ -109,13 +109,7 @@ pub fn setup_exit_controls() -> Result<(), &'static str> { exit_ctrl.write()?; - /*vmwrite( - 0x4004, - 1u64 << x86::irq::DOUBLE_FAULT_VECTOR - | 1u64 << x86::irq::GENERAL_PROTECTION_FAULT_VECTOR - | 1u64 << x86::irq::PAGE_FAULT_VECTOR - | 1u64 << x86::irq::X87_FPU_VECTOR, - )?;*/ + vmwrite(0x4004, 1u64 << x86::irq::INVALID_OPCODE_VECTOR)?; Ok(()) } diff --git a/nel_os_kernel/src/vmm/x86_64/intel/fpu.rs b/nel_os_kernel/src/vmm/x86_64/intel/fpu.rs new file mode 100644 index 0000000..dbcd4b1 --- /dev/null +++ b/nel_os_kernel/src/vmm/x86_64/intel/fpu.rs @@ -0,0 +1,67 @@ +use modular_bitfield::{bitfield, prelude::B44}; + +use crate::vmm::x86_64::intel::vcpu::IntelVCpu; + +#[bitfield] +#[repr(u64)] +#[derive(Debug, Clone, Copy)] +pub struct XCR0 { + pub x87: bool, + pub sse: bool, + pub avx: bool, + pub bndreg: bool, + pub bndcsr: bool, + pub opmask: bool, + pub zmm_hi256: bool, + pub hi16_zmm: bool, + pub pt: bool, + pub pkru: bool, + pub pasid: bool, + pub cet_u: bool, + pub cet_s: bool, + pub hdc: bool, + pub intr: bool, + pub lbr: bool, + pub hwp: bool, + pub xtilecfg: bool, + pub xtiledata: bool, + pub apx: bool, + #[skip] + _reserved: B44, +} + +pub fn set_xcr(vcpu: &mut IntelVCpu, index: u32, xcr: u64) -> Result<(), &'static str> { + if index != 0 { + return Err("Invalid XCR index"); + } + + if !(xcr & 0b1 != 0) { + return Err("X87 is not enabled"); + } + + if (xcr & 0b100 != 0) && !(xcr & 0b10 != 0) { + return Err("SSE is not enabled"); + } + + if !(xcr & 0b1000) != (!(xcr & 0b10000)) { + return Err("BNDREGS and BNDCSR are not both enabled"); + } + + if xcr & 0b11100000 != 0 { + if !(xcr & 0b100 != 0) { + return Err("YMM bits are not enabled"); + } + + if (xcr & 0b11100000) != 0b11100000 { + return Err("Invalid bits set in XCR0"); + } + } + + if (xcr & 0b1000000000000 != 0) && (xcr & 0b1000000000000 != 0b1000000000000) { + return Err("xtile bits are not both enabled"); + } + + vcpu.guest_xcr0 = XCR0::from(xcr); + + Ok(()) +} diff --git a/nel_os_kernel/src/vmm/x86_64/intel/io.rs b/nel_os_kernel/src/vmm/x86_64/intel/io.rs index fbb85a7..c184f4e 100644 --- a/nel_os_kernel/src/vmm/x86_64/intel/io.rs +++ b/nel_os_kernel/src/vmm/x86_64/intel/io.rs @@ -155,6 +155,35 @@ impl PIC { Ok(false) } + pub fn inject_exception( + &mut self, + vector: u32, + error_code: Option, + ) -> Result<(), &'static str> { + let has_error_code = match vector { + 8 | 10..=14 | 17 | 21 => true, + _ => false, + }; + + let interrupt_info = EntryIntrInfo::new() + .with_vector(vector as u8) + .with_typ(3) + .with_ec_available(has_error_code) + .with_valid(true); + + vmwrite( + vmx::vmcs::control::VMENTRY_INTERRUPTION_INFO_FIELD, + u32::from(interrupt_info) as u64, + )?; + + if has_error_code { + let ec = error_code.unwrap_or(0); + vmwrite(vmx::vmcs::control::VMENTRY_EXCEPTION_ERR_CODE, ec as u64)?; + } + + Ok(()) + } + fn handle_io_in(&self, regs: &mut GuestRegisters, qual: QualIo) { match qual.port() { 0x0CF8..=0x0CFF => regs.rax = 0, diff --git a/nel_os_kernel/src/vmm/x86_64/intel/mod.rs b/nel_os_kernel/src/vmm/x86_64/intel/mod.rs index eaa021e..a163241 100644 --- a/nel_os_kernel/src/vmm/x86_64/intel/mod.rs +++ b/nel_os_kernel/src/vmm/x86_64/intel/mod.rs @@ -4,6 +4,7 @@ mod controls; mod cpuid; mod cr; mod ept; +mod fpu; mod io; mod msr; mod qual; diff --git a/nel_os_kernel/src/vmm/x86_64/intel/vcpu.rs b/nel_os_kernel/src/vmm/x86_64/intel/vcpu.rs index d0246c3..52fe847 100644 --- a/nel_os_kernel/src/vmm/x86_64/intel/vcpu.rs +++ b/nel_os_kernel/src/vmm/x86_64/intel/vcpu.rs @@ -1,6 +1,10 @@ -use core::arch::asm; +use core::arch::{ + asm, + x86_64::{_xgetbv, _xsetbv}, +}; use raw_cpuid::cpuid; +use x86::controlregs::cr4; use x86_64::{ registers::control::Cr4Flags, structures::paging::{FrameAllocator, Size4KiB}, @@ -14,6 +18,7 @@ use crate::{ common::{self, read_msr}, intel::{ auditor, controls, cpuid, ept, + fpu::{self, XCR0}, io::{vmm_interrupt_subscriber, IOBitmap}, msr::{self, ShadowMsr}, qual::{QualCr, QualIo}, @@ -50,6 +55,8 @@ pub struct IntelVCpu { pic: super::io::PIC, io_bitmap: IOBitmap, pub pending_irq: u16, + pub host_xcr0: u64, + pub guest_xcr0: XCR0, } impl IntelVCpu { @@ -118,6 +125,15 @@ impl IntelVCpu { self.step_next_inst()?; } + VmxExitReason::XSETBV => { + fpu::set_xcr( + self, + self.guest_registers.rcx as u32, + self.guest_registers.rax, + )?; + + self.step_next_inst()?; + } VmxExitReason::IO_INSTRUCTION => { let qual = vmread(vmcs::ro::EXIT_QUALIFICATION)?; let qual_io = QualIo::from(qual); @@ -147,26 +163,77 @@ impl IntelVCpu { return Err("Triple fault"); } VmxExitReason::EXCEPTION => { - let vmexit_intr_info = vmread(vmcs::ro::VMEXIT_INTERRUPTION_INFO)?; - let vector = (vmexit_intr_info & 0xFF) as u8; - let error_code = (vmexit_intr_info >> 8) & 0b111; - let error_code_valid = (vmexit_intr_info >> 11) & 0b1 != 0; + let vmexit_intr_info = vmread(vmcs::ro::VMEXIT_INTERRUPTION_INFO).unwrap(); + let vector = (vmexit_intr_info & 0xFF) as u32; + let has_error_code = (vmexit_intr_info & (1 << 11)) != 0; - let idt_vectoring_info = vmread(vmcs::ro::IDT_VECTORING_INFO)?; - info!("idt valid: {}", idt_vectoring_info >> 31 & 0b1 != 0); - - let rip = vmread(vmcs::guest::RIP)?; - let hpa = self.ept.get_phys_addr(rip).unwrap(); - - if error_code_valid { - info!( - "VM exit due to exception: vector {}, error code {}, at RIP {:#x} (hpa: {:#x})", - vector, error_code, rip, hpa - ); + let error_code = if has_error_code { + Some(vmread(vmcs::ro::VMEXIT_INTERRUPTION_ERR_CODE).unwrap() as u32) } else { - info!("VM exit due to exception: vector {}", vector); + None + }; + + let rip = vmread(vmcs::guest::RIP).unwrap(); + + let mut instruction_bytes = [0u8; 16]; + let mut valid_bytes = 0; + + match self.translate_guest_address(rip) { + Ok(guest_phys_addr) => { + for i in 0..16 { + match self.ept.get(guest_phys_addr + i) { + Ok(byte) => { + instruction_bytes[i as usize] = byte; + valid_bytes = i + 1; + } + Err(_) => break, + } + } + } + Err(e) => { + info!( + "Failed to get physical address for RIP: {:#x}, {:?}", + rip, e + ); + return Err("Failed to get physical address for RIP"); + } + } + + if valid_bytes > 0 { + match instruction_bytes[0] { + 0x0F => { + if valid_bytes > 1 { + match instruction_bytes[1] { + 0x01 => match instruction_bytes[2] { + 0xCA => { + let rflags = vmread(vmcs::guest::RFLAGS).unwrap(); + vmwrite(vmcs::guest::RFLAGS, rflags & !(1 << 18)) + .unwrap(); + self.step_next_inst().unwrap(); + } + 0xCB => { + let rflags = vmread(vmcs::guest::RFLAGS).unwrap(); + vmwrite(vmcs::guest::RFLAGS, rflags | (1 << 18)) + .unwrap(); + self.step_next_inst().unwrap(); + } + _ => { + self.pic + .inject_exception(vector, error_code) + .unwrap(); + } + }, + _ => { + self.pic.inject_exception(vector, error_code).unwrap(); + } + } + } + } + _ => { + self.pic.inject_exception(vector, error_code).unwrap(); + } + } } - return Err("VM exit due to exception"); } _ => { info!("VM exit reason: {:?}", exit_reason); @@ -178,6 +245,48 @@ impl IntelVCpu { Ok(()) } + fn load_guest_xcr0(&mut self) -> Result<(), &'static str> { + let host_cr4 = unsafe { cr4() }; + if (host_cr4.bits() & Cr4Flags::OSXSAVE.bits() as usize) == 0 { + return Ok(()); + } + + if self.host_xcr0 == 0 { + self.host_xcr0 = unsafe { _xgetbv(0) }; + } + + let guest_cr4 = vmread(x86::vmx::vmcs::guest::CR4)?; + + if guest_cr4 & Cr4Flags::OSXSAVE.bits() != 0 && u64::from(self.guest_xcr0) != self.host_xcr0 + { + unsafe { + _xsetbv(0, u64::from(self.guest_xcr0)); + } + } + + Ok(()) + } + + fn load_host_xcr0(&mut self) -> Result<(), &'static str> { + let host_cr4 = unsafe { cr4() }; + if (host_cr4.bits() & Cr4Flags::OSXSAVE.bits() as usize) == 0 { + return Ok(()); + } + + let guest_cr4 = vmread(x86::vmx::vmcs::guest::CR4)?; + + if guest_cr4 & Cr4Flags::OSXSAVE.bits() != 0 { + let current_xcr0 = unsafe { _xgetbv(0) }; + if current_xcr0 != self.host_xcr0 { + unsafe { + _xsetbv(0, self.host_xcr0); + } + } + } + + Ok(()) + } + fn step_next_inst(&mut self) -> Result<(), &'static str> { use x86::vmx::vmcs; let rip = vmread(vmcs::guest::RIP)?; @@ -193,9 +302,13 @@ impl IntelVCpu { let success = { let result: u16; + + self.load_guest_xcr0().unwrap(); unsafe { result = crate::vmm::x86_64::intel::asm::asm_vm_entry(self as *mut _); }; + self.load_host_xcr0().unwrap(); + result == 0 }; @@ -424,6 +537,77 @@ impl IntelVCpu { Ok(()) } + fn translate_guest_address(&mut self, vaddr: u64) -> Result { + let cr3 = vmread(x86::vmx::vmcs::guest::CR3).map_err(|_| "Failed to read guest CR3")?; + let pml4_base = cr3 & !0xFFF; // Clear lower 12 bits to get page table base + + let efer = vmread(x86::vmx::vmcs::guest::IA32_EFER_FULL).unwrap_or(0); + let is_long_mode = (efer & (1 << 8)) != 0; // LME bit + + if !is_long_mode { + return Ok(vaddr & 0xFFFFFFFF); + } + + let pml4_idx = ((vaddr >> 39) & 0x1FF) as u64; + let pdpt_idx = ((vaddr >> 30) & 0x1FF) as u64; + let pd_idx = ((vaddr >> 21) & 0x1FF) as u64; + let pt_idx = ((vaddr >> 12) & 0x1FF) as u64; + let page_offset = (vaddr & 0xFFF) as u64; + + let pml4_entry_addr = pml4_base + (pml4_idx * 8); + let pml4_entry = self.read_guest_phys_u64(pml4_entry_addr)?; + if (pml4_entry & 1) == 0 { + return Err("PML4 entry not present"); + } + let pdpt_base = pml4_entry & 0x000FFFFFFFFFF000; + + let pdpt_entry_addr = pdpt_base + (pdpt_idx * 8); + let pdpt_entry = self.read_guest_phys_u64(pdpt_entry_addr)?; + if (pdpt_entry & 1) == 0 { + return Err("PDPT entry not present"); + } + + if (pdpt_entry & (1 << 7)) != 0 { + let page_base = pdpt_entry & 0x000FFFFFC0000000; + return Ok(page_base | (vaddr & 0x3FFFFFFF)); + } + let pd_base = pdpt_entry & 0x000FFFFFFFFFF000; + + let pd_entry_addr = pd_base + (pd_idx * 8); + let pd_entry = self.read_guest_phys_u64(pd_entry_addr)?; + if (pd_entry & 1) == 0 { + return Err("PD entry not present"); + } + + if (pd_entry & (1 << 7)) != 0 { + let page_base = pd_entry & 0x000FFFFFFFE00000; + return Ok(page_base | (vaddr & 0x1FFFFF)); + } + let pt_base = pd_entry & 0x000FFFFFFFFFF000; + + let pt_entry_addr = pt_base + (pt_idx * 8); + let pt_entry = self.read_guest_phys_u64(pt_entry_addr)?; + if (pt_entry & 1) == 0 { + return Err("PT entry not present"); + } + let page_base = pt_entry & 0x000FFFFFFFFFF000; + + Ok(page_base | page_offset) + } + + fn read_guest_phys_u64(&mut self, gpa: u64) -> Result { + let mut result_bytes = [0u8; 8]; + + for i in 0..8 { + match self.ept.get(gpa + i) { + Ok(byte) => result_bytes[i as usize] = byte, + Err(_) => return Err("Failed to read from EPT"), + } + } + + Ok(u64::from_le_bytes(result_bytes)) + } + fn dump_vmcs_settings(&self) -> Result<(), &'static str> { info!("=== VMCS Control Fields ==="); @@ -725,6 +909,8 @@ impl VCpu for IntelVCpu { pic: super::io::PIC::new(), io_bitmap: IOBitmap::new(frame_allocator), pending_irq: 0, + host_xcr0: 0, + guest_xcr0: XCR0::new(), }) }