date:20241204

The HPET device (Rust device) needs to define a bit-type property.

Add a variant of the define_property macro to define bit-type properties.

Signed-off-by: Zhao Liu 
---
 rust/qemu-api/src/qdev.rs | 12 
 1 file changed, 12 insertions(+)

diff --git a/rust/qemu-api/src/qdev.rs b/rust/qemu-api/src/qdev.rs
index 5e6580b6f261..66810803ec9d 100644
--- a/rust/qemu-api/src/qdev.rs
+++ b/rust/qemu-api/src/qdev.rs
@@ -103,6 +103,18 @@ fn class_init(dc: &mut DeviceClass) {
 
 #[macro_export]
 macro_rules! define_property {
+($name:expr, $state:ty, $field:ident, $prop:expr, $type:expr, bit = 
$bitnr:expr, default = $defval:expr$(,)*) => {
+$crate::bindings::Property {
+// use associated function syntax for type checking
+name: ::std::ffi::CStr::as_ptr($name),
+info: $prop,
+offset: $crate::offset_of!($state, $field) as isize,
+bitnr: $bitnr,
+set_default: true,
+defval: $crate::bindings::Property__bindgen_ty_1 { u: $defval as 
u64 },
+..$crate::zeroable::Zeroable::ZERO
+}
+};
 ($name:expr, $state:ty, $field:ident, $prop:expr, $type:expr, default = 
$defval:expr$(,)*) => {
 $crate::bindings::Property {
 // use associated function syntax for type checking
-- 
2.34.1

[RFC 02/13] rust: cell: add BQL-enforcing RefCell variant

From: Paolo Bonzini 

Similar to the existing BqlCell, introduce a custom interior mutability
primitive that resembles RefCell but accounts for QEMU's threading model.
Borrowing the RefCell requires proving that the BQL is held, and
attempting to access without the BQL is a runtime panic.

Almost all of the code was taken from Rust's standard library, while
removing unstable features and probably-unnecessary functionality that
amounts to 60% of the original code.  A lot of what's left is documentation,
as well as unit tests in the form of doctests.  These are not yet integrated
in "make check" but can be run with "cargo test --doc".

Signed-off-by: Paolo Bonzini 
Signed-off-by: Zhao Liu 
---
Changes Before RFC v1:
 * Changed debug_assert to assert like what Paolo did for BqlCell.
 * Added #[derive(Debug)] for BqlRefCell since it looks like BqlRefCell
   will often be embedded in the structure with Debug, e.g., HPETState.
---
 rust/qemu-api/Cargo.toml  |   3 +-
 rust/qemu-api/meson.build |   3 +
 rust/qemu-api/src/cell.rs | 546 +-
 3 files changed, 541 insertions(+), 11 deletions(-)

diff --git a/rust/qemu-api/Cargo.toml b/rust/qemu-api/Cargo.toml
index 669f288d1cb5..4aa22f319860 100644
--- a/rust/qemu-api/Cargo.toml
+++ b/rust/qemu-api/Cargo.toml
@@ -20,8 +20,9 @@ qemu_api_macros = { path = "../qemu-api-macros" }
 version_check = "~0.9"
 
 [features]
-default = []
+default = ["debug_cell"]
 allocator = []
+debug_cell = []
 
 [lints]
 workspace = true
diff --git a/rust/qemu-api/meson.build b/rust/qemu-api/meson.build
index d727ccf18354..3ac69cbc76c4 100644
--- a/rust/qemu-api/meson.build
+++ b/rust/qemu-api/meson.build
@@ -6,6 +6,9 @@ _qemu_api_cfg = run_command(rustc_args,
 if rustc.version().version_compare('>=1.77.0')
   _qemu_api_cfg += ['--cfg', 'has_offset_of']
 endif
+if get_option('debug_mutex')
+  _qemu_api_cfg += ['--feature', 'debug_cell']
+endif
 
 _qemu_api_rs = static_library(
   'qemu_api',
diff --git a/rust/qemu-api/src/cell.rs b/rust/qemu-api/src/cell.rs
index 2e4ea8d590d5..07b636f26266 100644
--- a/rust/qemu-api/src/cell.rs
+++ b/rust/qemu-api/src/cell.rs
@@ -46,20 +46,30 @@
 //! parts of a  device must be made mutable in a controlled manner through the
 //! use of cell types.
 //!
-//! This module provides a way to do so via the Big QEMU Lock.  While
-//! [`BqlCell`] is essentially the same single-threaded primitive that is
-//! available in `std::cell`, the BQL allows it to be used from a 
multi-threaded
-//! context and to share references across threads, while maintaining Rust's
-//! safety guarantees.  For this reason, unlike its `std::cell` counterpart,
-//! `BqlCell` implements the `Sync` trait.
+//! [`BqlCell`] and [`BqlRefCell`] allow doing this via the Big QEMU 
Lock.
+//! While they are essentially the same single-threaded primitives that are
+//! available in `std::cell`, the BQL allows them to be used from a
+//! multi-threaded context and to share references across threads, while
+//! maintaining Rust's safety guarantees.  For this reason, unlike
+//! their `std::cell` counterparts, `BqlCell` and `BqlRefCell` implement the
+//! `Sync` trait.
 //!
 //! BQL checks are performed in debug builds but can be optimized away in
 //! release builds, providing runtime safety during development with no 
overhead
 //! in production.
 //!
-//! Warning: While `BqlCell` is similar to its `std::cell` counterpart, the two
-//! are not interchangeable. Using `std::cell` types in QEMU device
-//! implementations is usually incorrect and can lead to thread-safety issues.
+//! The two provide different ways of handling interior mutability.
+//! `BqlRefCell` is best suited for data that is primarily accessed by the
+//! device's own methods, where multiple reads and writes can be grouped within
+//! a single borrow and a mutable reference can be passed around.  Instead,
+//! [`BqlCell`] is a better choice when sharing small pieces of data with
+//! external code (especially C code), because it provides simple get/set
+//! operations that can be used one at a time.
+//!
+//! Warning: While `BqlCell` and `BqlRefCell` are similar to their `std::cell`
+//! counterparts, they are not interchangeable. Using `std::cell` types in
+//! QEMU device implementations is usually incorrect and can lead to
+//! thread-safety issues.
 //!
 //! ## `BqlCell`
 //!
@@ -80,8 +90,37 @@
 //!  returns the replaced value.
 //!- [`set`](BqlCell::set): this method replaces the interior value,
 //!  dropping the replaced value.
+//!
+//! ## `BqlRefCell`
+//!
+//! [`BqlRefCell`] uses Rust's lifetimes to implement "dynamic borrowing", a
+//! process whereby one can claim temporary, exclusive, mutable access to the
+//! inner value:
+//!
+//! ```ignore
+//! fn clear_interrupts(&self, val: u32) {
+//! // A mutable borrow gives read-write access to the registers
+//! let mut regs = self.registers.borrow_mut();
+//! let old = regs.interrupt_status();
+//! regs.u

[RFC 03/13] rust/cell: add get_mut() method for BqlCell

get_mut() is useful for compound assignment operations, e.g.,
*c.get_mut() += 1.

Implement get_mut() for BqlCell, modeled on Cell::get_mut().

Signed-off-by: Zhao Liu 
---
 rust/qemu-api/src/cell.rs | 25 +
 1 file changed, 25 insertions(+)

diff --git a/rust/qemu-api/src/cell.rs b/rust/qemu-api/src/cell.rs
index 07b636f26266..95f1cc0b3eb5 100644
--- a/rust/qemu-api/src/cell.rs
+++ b/rust/qemu-api/src/cell.rs
@@ -324,6 +324,31 @@ impl BqlCell {
 pub const fn as_ptr(&self) -> *mut T {
 self.value.get()
 }
+
+/// Returns a mutable reference to the underlying data.
+///
+/// This call borrows `BqlCell` mutably (at compile-time) which guarantees
+/// that we possess the only reference.
+///
+/// However be cautious: this method expects `self` to be mutable, which is
+/// generally not the case when using a `BqlCell`. If you require interior
+/// mutability by reference, consider using `BqlRefCell` which provides
+/// run-time checked mutable borrows through its [`borrow_mut`] method.
+///
+/// [`borrow_mut`]: BqlRefCell::borrow_mut()
+///
+/// # Examples
+///
+/// ```
+/// use qemu_api::cell::BqlCell;
+///
+/// let mut c = BqlCell::new(5);
+/// *c.get_mut() += 1;
+///
+/// assert_eq!(c.get(), 6);
+pub fn get_mut(&mut self) -> &mut T {
+self.value.get_mut()
+}
 }
 
 impl BqlCell {
-- 
2.34.1

[RFC 11/13] rust/timer/hpet: add basic HPET timer & state

Add the HPETTimer and HPETState (HPET timer block), along with their
basic methods and register definitions.

This is in preparation for supporting the QAPI interfaces.

Note: all items in HPETState that may be changed by callbacks invoked
from C code are wrapped in BqlCell/BqlRefCell.

Signed-off-by: Zhao Liu 
---
 rust/hw/timer/hpet/src/hpet.rs | 638 +
 rust/hw/timer/hpet/src/lib.rs  |   1 +
 rust/wrapper.h |   1 +
 3 files changed, 640 insertions(+)
 create mode 100644 rust/hw/timer/hpet/src/hpet.rs

diff --git a/rust/hw/timer/hpet/src/hpet.rs b/rust/hw/timer/hpet/src/hpet.rs
new file mode 100644
index 000000000000..9550d8fe438a
--- /dev/null
+++ b/rust/hw/timer/hpet/src/hpet.rs
@@ -0,0 +1,638 @@
+// Copyright (C) 2024 Intel Corporation.
+// Author(s): Zhao Liu 
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#![allow(dead_code)]
+
+use core::ptr::{null_mut, NonNull};
+use std::os::raw::c_int;
+
+use qemu_api::{
+bindings::*,
+bitops::deposit64,
+cell::{BqlCell, BqlRefCell},
+irq::InterruptSource,
+memattrs::MEMTXATTRS_UNSPECIFIED,
+timer::{qemu_clock_get_virtual_ns, QEMUTimerImpl},
+};
+
+// Register space for each timer block. (HPET_BASE isn't defined here.)
+const HPET_REG_SPACE_LEN: u64 = 0x400; // 1024 bytes
+
+pub(crate) const HPET_MIN_TIMERS: usize = 3; // Minimum recommended hardware 
implementation.
+const HPET_MAX_TIMERS: usize = 32; // Maximum timers in each timer block.
+
+// Flags that HPETState.flags supports.
+pub(crate) const HPET_FLAG_MSI_SUPPORT_SHIFT: usize = 0;
+
+const HPET_NUM_IRQ_ROUTES: usize = 32;
+const HPET_LEGACY_PIT_INT: u32 = 0; // HPET_LEGACY_RTC_INT isn't defined here.
+const RTC_ISA_IRQ: usize = 8;
+
+const HPET_CLK_PERIOD: u64 = 10; // 10 ns
+const FS_PER_NS: u64 = 1000000; // 1000000 femtoseconds == 1 ns
+
+// General Capabilities and ID Register
+const HPET_CAP_REG: u64 = 0x000;
+// Revision ID (bits 0:7)
+const HPET_CAP_REV_ID_VALUE: u64 = 0x1; // Revision 1 is implemented (refer to 
v1.0a spec).
+const HPET_CAP_REV_ID_SHIFT: usize = 0;
+// Number of Timers (bits 8:12)
+const HPET_CAP_NUM_TIM_SHIFT: usize = 8;
+// Counter Size (bit 13)
+const HPET_CAP_COUNT_SIZE_CAP_SHIFT: usize = 13;
+// LegacyReplacement Route Capable (bit 15)
+const HPET_CAP_LEG_RT_CAP_SHIFT: usize = 15;
+// Vendor ID (bits 16:31)
+const HPET_CAP_VENDER_ID_VALUE: u64 = 0x8086;
+const HPET_CAP_VENDER_ID_SHIFT: usize = 16;
+// Main Counter Tick Period (bits 32:63)
+const HPET_CAP_CNT_CLK_PERIOD_SHIFT: usize = 32;
+
+// General Configuration Register
+const HPET_CFG_REG: u64 = 0x010;
+// Overall Enable (bit 0)
+const HPET_CFG_ENABLE_SHIFT: usize = 0;
+// LegacyReplacement Route (bit 1)
+const HPET_CFG_LEG_RT_SHIFT: usize = 1;
+// Other bits are reserved.
+const HPET_CFG_WRITE_MASK: u64 = 0x003;
+
+// General Interrupt Status Register
+const HPET_INT_STATUS_REG: u64 = 0x020;
+
+// Main Counter Value Register
+const HPET_COUNTER_REG: u64 = 0x0f0;
+
+// Timer N Configuration and Capability Register (masked by 0x18)
+const HPET_TN_CFG_REG: u64 = 0x000;
+// bit 0, 7, and bits 16:31 are reserved.
+// bit 4, 5, 15, and bits 32:63 are read-only.
+const HPET_TN_CFG_WRITE_MASK: u64 = 0x7f4e;
+// Timer N Interrupt Type (bit 1)
+const HPET_TN_CFG_INT_TYPE_SHIFT: usize = 1;
+// Timer N Interrupt Enable (bit 2)
+const HPET_TN_CFG_INT_ENABLE_SHIFT: usize = 2;
+// Timer N Type (Periodic enabled or not, bit 3)
+const HPET_TN_CFG_PERIODIC_SHIFT: usize = 3;
+// Timer N Periodic Interrupt Capable (support Periodic or not, bit 4)
+const HPET_TN_CFG_PERIODIC_CAP_SHIFT: usize = 4;
+// Timer N Size (timer size is 64-bits or 32 bits, bit 5)
+const HPET_TN_CFG_SIZE_CAP_SHIFT: usize = 5;
+// Timer N Value Set (bit 6)
+const HPET_TN_CFG_SETVAL_SHIFT: usize = 6;
+// Timer N 32-bit Mode (bit 8)
+const HPET_TN_CFG_32BIT_SHIFT: usize = 8;
+// Timer N Interrupt Route (bits 9:13)
+const HPET_TN_CFG_INT_ROUTE_MASK: u64 = 0x3e00;
+const HPET_TN_CFG_INT_ROUTE_SHIFT: usize = 9;
+// Timer N FSB Interrupt Enable (bit 14)
+const HPET_TN_CFG_FSB_ENABLE_SHIFT: usize = 14;
+// Timer N FSB Interrupt Delivery (bit 15)
+const HPET_TN_CFG_FSB_CAP_SHIFT: usize = 15;
+// Timer N Interrupt Routing Capability (bits 32:63)
+const HPET_TN_CFG_INT_ROUTE_CAP_SHIFT: usize = 32;
+
+// Timer N Comparator Value Register (masked by 0x18)
+const HPET_TN_CMP_REG: u64 = 0x008;
+
+// Timer N FSB Interrupt Route Register (masked by 0x18)
+const HPET_TN_FSB_ROUTE_REG: u64 = 0x010;
+
+fn hpet_next_wrap(cur_tick: u64) -> u64 {
+(cur_tick | 0xffffffff) + 1
+}
+
+fn hpet_time_after(a: u64, b: u64) -> bool {
+((b - a) as i64) < 0
+}
+
+fn ticks_to_ns(value: u64) -> u64 {
+value * HPET_CLK_PERIOD
+}
+
+fn ns_to_ticks(value: u64) -> u64 {
+value / HPET_CLK_PERIOD
+}
+
+// Avoid touching the bits that cannot be written.
+fn hpet_fixup_reg(new: u64, old: u64, mask: u64) -> u64 {
+(new & mask) | (old & !mask)
+}
+
+fn activating_bit(old: u64, new: u64, shift: usize) -> bool {
+let

[RFC 06/13] rust: add bindings for memattrs

The MemTxAttrs structure is composed of bitfield members, and bindgen is
unable to generate an equivalent macro definition for
MEMTXATTRS_UNSPECIFIED.

Therefore, we have to manually define a global constant variable
MEMTXATTRS_UNSPECIFIED to support calls from Rust code.

However, the binding methods of MemTxAttrs are non-const, so we cannot
directly use them when defining MEMTXATTRS_UNSPECIFIED. As a result,
add the third-party crate once_cell to use its Lazy to help define
MEMTXATTRS_UNSPECIFIED.

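Note: lazy_static has been deprecated, and the lazy types in std (LazyCell
and its thread-safe counterpart LazyLock) have been stable since v1.80.
When the minimum supported rustc version is bumped to v1.80 in the future,
they can be used to replace the current once_cell (once_cell::sync::Lazy
corresponds to std::sync::LazyLock).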

Signed-off-by: Zhao Liu 
---
 rust/Cargo.lock   |  7 ++
 rust/qemu-api/Cargo.toml  |  1 +
 rust/qemu-api/meson.build |  9 ++--
 rust/qemu-api/src/lib.rs  |  1 +
 rust/qemu-api/src/memattrs.rs | 21 +
 rust/wrapper.h|  1 +
 scripts/archive-source.sh |  2 +-
 scripts/make-release  |  2 +-
 subprojects/.gitignore|  1 +
 subprojects/once_cell-1.20-rs.wrap|  7 ++
 .../once_cell-1.20-rs/meson.build | 23 +++
 11 files changed, 71 insertions(+), 4 deletions(-)
 create mode 100644 rust/qemu-api/src/memattrs.rs
 create mode 100644 subprojects/once_cell-1.20-rs.wrap
 create mode 100644 subprojects/packagefiles/once_cell-1.20-rs/meson.build

diff --git a/rust/Cargo.lock b/rust/Cargo.lock
index c0c6069247a8..6b19553b6d10 100644
--- a/rust/Cargo.lock
+++ b/rust/Cargo.lock
@@ -46,6 +46,12 @@ dependencies = [
  "either",
 ]
 
+[[package]]
+name = "once_cell"
+version = "1.20.2"
+source = "registry+https://github.com/rust-lang/crates.io-index";
+checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775"
+
 [[package]]
 name = "pl011"
 version = "0.1.0"
@@ -92,6 +98,7 @@ dependencies = [
 name = "qemu_api"
 version = "0.1.0"
 dependencies = [
+ "once_cell",
  "qemu_api_macros",
  "version_check",
 ]
diff --git a/rust/qemu-api/Cargo.toml b/rust/qemu-api/Cargo.toml
index 4aa22f319860..265e00f97176 100644
--- a/rust/qemu-api/Cargo.toml
+++ b/rust/qemu-api/Cargo.toml
@@ -14,6 +14,7 @@ keywords = []
 categories = []
 
 [dependencies]
+once_cell = { version = "1.20.2" }
 qemu_api_macros = { path = "../qemu-api-macros" }
 
 [build-dependencies]
diff --git a/rust/qemu-api/meson.build b/rust/qemu-api/meson.build
index 00e86a679d8a..508986948883 100644
--- a/rust/qemu-api/meson.build
+++ b/rust/qemu-api/meson.build
@@ -10,6 +10,9 @@ if get_option('debug_mutex')
   _qemu_api_cfg += ['--feature', 'debug_cell']
 endif
 
+subproject('once_cell-1.20-rs', required: true)
+once_cell_dep = dependency('once_cell-1.20-rs')
+
 _qemu_api_rs = static_library(
   'qemu_api',
   structured_sources(
@@ -20,6 +23,7 @@ _qemu_api_rs = static_library(
   'src/cell.rs',
   'src/c_str.rs',
   'src/irq.rs',
+  'src/memattrs.rs',
   'src/module.rs',
   'src/offset_of.rs',
   'src/qdev.rs',
@@ -33,6 +37,7 @@ _qemu_api_rs = static_library(
   override_options: ['rust_std=2021', 'build.rust_std=2021'],
   rust_abi: 'rust',
   rust_args: _qemu_api_cfg,
+  dependencies: once_cell_dep,
 )
 
 rust.test('rust-qemu-api-tests', _qemu_api_rs,
@@ -40,7 +45,7 @@ rust.test('rust-qemu-api-tests', _qemu_api_rs,
 
 qemu_api = declare_dependency(
   link_with: _qemu_api_rs,
-  dependencies: qemu_api_macros,
+  dependencies: [qemu_api_macros, once_cell_dep],
 )
 
 # Rust executables do not support objects, so add an intermediate step.
@@ -56,7 +61,7 @@ test('rust-qemu-api-integration',
 override_options: ['rust_std=2021', 'build.rust_std=2021'],
 rust_args: ['--test'],
 install: false,
-dependencies: [qemu_api, qemu_api_macros],
+dependencies: [qemu_api, qemu_api_macros, once_cell_dep],
 link_whole: [rust_qemu_api_objs, libqemuutil]),
 args: [
 '--test',
diff --git a/rust/qemu-api/src/lib.rs b/rust/qemu-api/src/lib.rs
index 009906c907e7..e60c9ac16409 100644
--- a/rust/qemu-api/src/lib.rs
+++ b/rust/qemu-api/src/lib.rs
@@ -11,6 +11,7 @@
 pub mod c_str;
 pub mod cell;
 pub mod irq;
+pub mod memattrs;
 pub mod module;
 pub mod offset_of;
 pub mod qdev;
diff --git a/rust/qemu-api/src/memattrs.rs b/rust/qemu-api/src/memattrs.rs
new file mode 100644
index 000000000000..7cc8aea4b7b7
--- /dev/null
+++ b/rust/qemu-api/src/memattrs.rs
@@ -0,0 +1,21 @@
+// Copyright (C) 2024 Intel Corporation.
+// Author(s): Zhao Liu 
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+use once_cell::sync::Lazy;
+
+use crate::bindings::MemTxAttrs;
+
+impl MemTxAttrs {
+fn memtxattrs_unspecified() -> Self {
+let mut attrs = MemTxAttrs::default();
+attrs.set_unspecified(1);
+attrs
+}
+}
+
+/// Bu

[RFC 07/13] rust: add bindings for timer

bindgen has supported binding `static inline` functions since v0.64.0 as
an experimental feature (`--wrap-static-fns`), and stabilized it after
v0.70.0.

But the oldest version of bindgen supported by QEMU is v0.60.1, so
there's no way to generate the bindings for timer_new() and its variants
which are `static inline` (in include/qemu/timer.h).

Manually implement bindings to help create new timers in Rust.
Additionally, wrap timer_mod(), timer_del() and
qemu_clock_get_virtual_ns() as safe functions to make timer interfaces
more Rust-idiomatic.

In addition, for timer_new() and its variants, introduce a trait
QEMUTimerImpl to convert the idiomatic Rust callback into a C-style
callback (QEMUTimerCB). Any object that needs to initialize a new timer
must implement the QEMUTimerImpl trait and define a handler.

Signed-off-by: Zhao Liu 
---
 rust/qemu-api/meson.build  |   1 +
 rust/qemu-api/src/lib.rs   |   1 +
 rust/qemu-api/src/timer.rs | 123 +
 rust/wrapper.h |   1 +
 4 files changed, 126 insertions(+)
 create mode 100644 rust/qemu-api/src/timer.rs

diff --git a/rust/qemu-api/meson.build b/rust/qemu-api/meson.build
index 508986948883..5bf3c3dfab67 100644
--- a/rust/qemu-api/meson.build
+++ b/rust/qemu-api/meson.build
@@ -29,6 +29,7 @@ _qemu_api_rs = static_library(
   'src/qdev.rs',
   'src/qom.rs',
   'src/sysbus.rs',
+  'src/timer.rs',
   'src/vmstate.rs',
   'src/zeroable.rs',
 ],
diff --git a/rust/qemu-api/src/lib.rs b/rust/qemu-api/src/lib.rs
index e60c9ac16409..495261976dbc 100644
--- a/rust/qemu-api/src/lib.rs
+++ b/rust/qemu-api/src/lib.rs
@@ -17,6 +17,7 @@
 pub mod qdev;
 pub mod qom;
 pub mod sysbus;
+pub mod timer;
 pub mod vmstate;
 pub mod zeroable;
 
diff --git a/rust/qemu-api/src/timer.rs b/rust/qemu-api/src/timer.rs
new file mode 100644
index 000000000000..4f9e8c9277c6
--- /dev/null
+++ b/rust/qemu-api/src/timer.rs
@@ -0,0 +1,123 @@
+// Copyright (C) 2024 Intel Corporation.
+// Author(s): Zhao Liu 
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+use std::{
+borrow::BorrowMut,
+boxed::Box,
+os::raw::{c_int, c_void},
+ptr::NonNull,
+};
+
+pub use bindings::QEMUTimer;
+
+use crate::bindings::{self, *};
+
+impl QEMUTimer {
+fn new() -> Self {
+QEMUTimer {
+expire_time: 0,
+timer_list: ::core::ptr::null_mut(),
+cb: None,
+opaque: ::core::ptr::null_mut(),
+next: ::core::ptr::null_mut(),
+attributes: 0,
+scale: 0,
+}
+}
+
+// TODO: Consider how to avoid passing in C style callbacks directly.
+fn timer_new_full<T: QEMUTimerImpl>(
+timer_list_group: Option<&mut QEMUTimerListGroup>,
+clk_type: QEMUClockType,
+scale: u32,
+attributes: u32,
+opaque: &mut T::Opaque,
+) -> Self {
+let mut ts: Box<QEMUTimer> = Box::new(QEMUTimer::new());
+let group_ptr = if let Some(g) = timer_list_group {
+g
+} else {
+::core::ptr::null_mut()
+};
+
+// Safety:
+// ts is a valid Box object which can borrow a valid mutable
+// pointer, and opaque is converted from a reference so it's
+// also valid.
+unsafe {
+timer_init_full(
+ts.borrow_mut(),
+group_ptr,
+clk_type,
+scale as c_int,
+attributes as c_int,
+Some(rust_timer_handler::<T>),
+(opaque as *mut T::Opaque).cast::<c_void>(),
+)
+};
+
+*ts
+}
+
+pub fn timer_mod(&mut self, expire_time: u64) {
+unsafe { timer_mod(self as *mut QEMUTimer, expire_time as i64) }
+}
+
+pub fn timer_del(&mut self) {
+unsafe { timer_del(self as *mut QEMUTimer) };
+}
+}
+
+/// timer expiration callback
+unsafe extern "C" fn rust_timer_handler<T: QEMUTimerImpl>(opaque: *mut c_void) 
{
+// SAFETY:
+// the pointer is convertible to a reference
+let para = unsafe { 
NonNull::new(opaque.cast::<T::Opaque>()).unwrap().as_mut() };
+
+T::QEMU_TIMER_CB.unwrap()(para);
+}
+
+pub trait QEMUTimerImpl {
+type Opaque;
+
+// To be more general, opaque is mutable here. But it still should
+// be protected by BqlCell/BqlRefCell.
+//
+// FIXME: limit opaque to immutable?
+const QEMU_TIMER_CB: Option<fn(opaque: &mut Self::Opaque)> = None;
+
+fn timer_new(clk_type: QEMUClockType, scale: u32, opaque: &mut 
Self::Opaque) -> QEMUTimer
+where
+Self: Sized,
+{
+QEMUTimer::timer_new_full::<Self>(None, clk_type, scale, 0, opaque)
+}
+
+fn timer_new_ns(clk_type: QEMUClockType, opaque: &mut Self::Opaque) -> 
QEMUTimer
+where
+Self: Sized,
+{
+Self::timer_new(clk_type, SCALE_NS, opaque)
+}
+
+fn timer_new_us(clk_type: QEMUClockType, opaque: &mut Self::Opaque) -> 
QEMUTimer
+where
+Self: Sized,
+{
+Self::timer_new(clk_type, SCALE_US, opaque)
+}
+
+fn timer_new_ms(cl

[RFC 13/13] i386: enable rust hpet for pc when rust is enabled

Add HPET configuration in PC's Kconfig options, and select HPET device
(Rust version) if Rust is supported.

Signed-off-by: Zhao Liu 
---
 hw/i386/Kconfig   | 2 ++
 hw/timer/Kconfig  | 1 -
 rust/hw/Kconfig   | 1 +
 rust/hw/timer/Kconfig | 2 ++
 4 files changed, 5 insertions(+), 1 deletion(-)
 create mode 100644 rust/hw/timer/Kconfig

diff --git a/hw/i386/Kconfig b/hw/i386/Kconfig
index 32818480d263..83ab3222c4f0 100644
--- a/hw/i386/Kconfig
+++ b/hw/i386/Kconfig
@@ -39,6 +39,8 @@ config PC
 select PCSPK
 select I8257
 select MC146818RTC
+select HPET if !HAVE_RUST
+select X_HPET_RUST if HAVE_RUST
 # For ACPI builder:
 select SERIAL_ISA
 select ACPI_PCI
diff --git a/hw/timer/Kconfig b/hw/timer/Kconfig
index c96fd5d97ae8..645d7531f40e 100644
--- a/hw/timer/Kconfig
+++ b/hw/timer/Kconfig
@@ -11,7 +11,6 @@ config A9_GTIMER
 
 config HPET
 bool
-default y if PC
 
 config I8254
 bool
diff --git a/rust/hw/Kconfig b/rust/hw/Kconfig
index 4d934f30afe1..36f92ec02874 100644
--- a/rust/hw/Kconfig
+++ b/rust/hw/Kconfig
@@ -1,2 +1,3 @@
 # devices Kconfig
 source char/Kconfig
+source timer/Kconfig
diff --git a/rust/hw/timer/Kconfig b/rust/hw/timer/Kconfig
new file mode 100644
index 000000000000..afd980335037
--- /dev/null
+++ b/rust/hw/timer/Kconfig
@@ -0,0 +1,2 @@
+config X_HPET_RUST
+bool
-- 
2.34.1

[RFC 12/13] rust/timer/hpet: add qdev APIs support

Implement QAPI support for HPET device in qdev.rs.

Additionally, wrap the handling of HPET internal details as traits to be
specifically implemented in hpet.rs.

Signed-off-by: Zhao Liu 
---
 rust/hw/timer/hpet/src/fw_cfg.rs |   2 -
 rust/hw/timer/hpet/src/hpet.rs   | 232 ++-
 rust/hw/timer/hpet/src/lib.rs|   5 +
 rust/hw/timer/hpet/src/qdev.rs   | 133 ++
 4 files changed, 365 insertions(+), 7 deletions(-)
 create mode 100644 rust/hw/timer/hpet/src/qdev.rs

diff --git a/rust/hw/timer/hpet/src/fw_cfg.rs b/rust/hw/timer/hpet/src/fw_cfg.rs
index a057c2778be4..6515a428cebb 100644
--- a/rust/hw/timer/hpet/src/fw_cfg.rs
+++ b/rust/hw/timer/hpet/src/fw_cfg.rs
@@ -2,8 +2,6 @@
 // Author(s): Zhao Liu 
 // SPDX-License-Identifier: GPL-2.0-or-later
 
-#![allow(dead_code)]
-
 use qemu_api::{cell::BqlCell, zeroable::Zeroable};
 
 // Each HPETState represents a Event Timer Block. The v1 spec supports
diff --git a/rust/hw/timer/hpet/src/hpet.rs b/rust/hw/timer/hpet/src/hpet.rs
index 9550d8fe438a..9480633a77dd 100644
--- a/rust/hw/timer/hpet/src/hpet.rs
+++ b/rust/hw/timer/hpet/src/hpet.rs
@@ -2,10 +2,8 @@
 // Author(s): Zhao Liu 
 // SPDX-License-Identifier: GPL-2.0-or-later
 
-#![allow(dead_code)]
-
-use core::ptr::{null_mut, NonNull};
-use std::os::raw::c_int;
+use core::ptr::{addr_of_mut, null_mut, NonNull};
+use std::os::raw::{c_uint, c_void};
 
 use qemu_api::{
 bindings::*,
@@ -13,9 +11,14 @@
 cell::{BqlCell, BqlRefCell},
 irq::InterruptSource,
 memattrs::MEMTXATTRS_UNSPECIFIED,
+qdev::DeviceGPIOImpl,
+qom::ObjectType,
 timer::{qemu_clock_get_virtual_ns, QEMUTimerImpl},
+zeroable::Zeroable,
 };
 
+use crate::{fw_cfg::*, qdev::*};
+
 // Register space for each timer block. (HPET_BASE isn't defined here.)
 const HPET_REG_SPACE_LEN: u64 = 0x400; // 1024 bytes
 
@@ -453,6 +456,38 @@ fn callback(&mut self) {
 }
 }
 
+impl RamOps for HPETTimer {
+fn read(&mut self, addr: hwaddr, _size: c_uint) -> u64 {
+let shift: u64 = (addr & 4) * 8;
+
+match addr & 0x18 {
+HPET_TN_CFG_REG => self.config >> shift, // including interrupt 
capabilities
+HPET_TN_CMP_REG => self.cmp >> shift,// comparator register
+HPET_TN_FSB_ROUTE_REG => self.fsb >> shift,
+_ => {
+// TODO: Add trace point - trace_hpet_ram_read_invalid()
+// Reserved.
+0
+}
+}
+}
+
+fn write(&mut self, addr: hwaddr, value: u64, size: u64) {
+let shift = ((addr & 4) * 8) as usize;
+let len = std::cmp::min(size * 8, 64 - shift as u64) as usize;
+
+match addr & 0x18 {
+HPET_TN_CFG_REG => self.set_tn_cfg_reg(shift, len, value),
+HPET_TN_CMP_REG => self.set_tn_cmp_reg(shift, len, value),
+HPET_TN_FSB_ROUTE_REG => self.set_tn_fsb_route_reg(shift, len, 
value),
+_ => {
+// TODO: Add trace point - trace_hpet_ram_write_invalid()
+// Reserved.
+}
+}
+}
+}
+
 #[derive(Debug)]
 pub struct HPETTimerInstance(BqlRefCell);
 
@@ -466,7 +501,7 @@ fn timer_handler(timer: &mut HPETTimerInstance) {
 /// Note: Wrap all items that may be changed in the callback called by C
 /// into the BqlCell/BqlRefCell.
 #[repr(C)]
-#[derive(Debug, qemu_api_macros::offsets)]
+#[derive(Debug, qemu_api_macros::Object, qemu_api_macros::offsets)]
 pub struct HPETState {
 parent_obj: SysBusDevice,
 iomem: MemoryRegion,
@@ -636,3 +671,190 @@ impl QEMUTimerImpl for HPETState {
 const QEMU_TIMER_CB: Option =
 Some(HPETTimerInstance::timer_handler);
 }
+
+impl ObjOps for HPETState {
+// TODO: Add binding to register idiomatic Rust callback.
+const HPET_RAM_OPS: MemoryRegionOps = MemoryRegionOps {
+read: Some(hpet_ram_read),
+write: Some(hpet_ram_write),
+read_with_attrs: None,
+write_with_attrs: None,
+valid: MemoryRegionOps__bindgen_ty_1 {
+min_access_size: 4,
+max_access_size: 8,
+..Zeroable::ZERO
+},
+impl_: MemoryRegionOps__bindgen_ty_2 {
+min_access_size: 4,
+max_access_size: 8,
+..Zeroable::ZERO
+},
+endianness: device_endian::DEVICE_NATIVE_ENDIAN,
+};
+
+unsafe fn init(&mut self) {
+// SAFETY:
+// self and self.iomem are guaranteed to be valid at this point since 
callers
+// must make sure the `self` reference is valid.
+unsafe {
+memory_region_init_io(
+addr_of_mut!(self.iomem),
+addr_of_mut!(*self).cast::<Object>(),
+&Self::HPET_RAM_OPS,
+addr_of_mut!(*self).cast::<c_void>(),
+Self::TYPE_NAME.as_ptr(),
+HPET_REG_SPACE_LEN,
+);
+let sbd = addr_of_mut!(*self).cast::<SysBusDevice>();
+sysbus_init_mmio(sbd, addr_of_mut!(self.iomem));
+

[RFC 05/13] rust: add a bit operation binding for deposit64

bindgen has supported binding `static inline` functions since v0.64.0 as
an experimental feature (`--wrap-static-fns`), and stabilized it after
v0.70.0.

But the oldest version of bindgen supported by QEMU is v0.60.1, so
there's no way to generate the binding for deposit64() which is `static
inline` (in include/qemu/bitops.h).

Manually implement a binding. Since it only involves bit operations,
fortunately, the Rust version of deposit64() is almost identical to the
original C version.

Signed-off-by: Zhao Liu 
---
 rust/qemu-api/meson.build   |  1 +
 rust/qemu-api/src/bitops.rs | 11 +++
 rust/qemu-api/src/lib.rs|  1 +
 3 files changed, 13 insertions(+)
 create mode 100644 rust/qemu-api/src/bitops.rs

diff --git a/rust/qemu-api/meson.build b/rust/qemu-api/meson.build
index 3ac69cbc76c4..00e86a679d8a 100644
--- a/rust/qemu-api/meson.build
+++ b/rust/qemu-api/meson.build
@@ -16,6 +16,7 @@ _qemu_api_rs = static_library(
 [
   'src/lib.rs',
   'src/bindings.rs',
+  'src/bitops.rs',
   'src/cell.rs',
   'src/c_str.rs',
   'src/irq.rs',
diff --git a/rust/qemu-api/src/bitops.rs b/rust/qemu-api/src/bitops.rs
new file mode 100644
index 000000000000..a11a07fb8830
--- /dev/null
+++ b/rust/qemu-api/src/bitops.rs
@@ -0,0 +1,11 @@
+// Copyright (C) 2024 Intel Corporation.
+// Author(s): Zhao Liu 
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+pub fn deposit64(value: u64, start: usize, length: usize, fieldval: u64) -> 
u64 {
+/* FIXME: Implement a more elegant check with error handling support? */
+assert!(length > 0 && length <= 64 - start);
+
+let mask = (u64::MAX >> (64 - length)) << start;
+(value & !mask) | ((fieldval << start) & mask)
+}
diff --git a/rust/qemu-api/src/lib.rs b/rust/qemu-api/src/lib.rs
index 0d46b372c6bb..009906c907e7 100644
--- a/rust/qemu-api/src/lib.rs
+++ b/rust/qemu-api/src/lib.rs
@@ -7,6 +7,7 @@
 #[rustfmt::skip]
 pub mod bindings;
 
+pub mod bitops;
 pub mod c_str;
 pub mod cell;
 pub mod irq;
-- 
2.34.1

[RFC 01/13] bql: check that the BQL is not dropped within marked sections

From: Paolo Bonzini 

The Big QEMU Lock (BQL) is used to provide interior mutability to Rust
code.  While BqlCell performs indivisible accesses, an equivalent of
RefCell will allow the borrower to hold on to the interior content for a
long time.  If the BQL is dropped, another thread could come and mutate
the data from C code (Rust code would panic on borrow_mut() instead).
In order to prevent this, add a new BQL primitive that can mark
BQL-atomic sections and abort if the BQL is dropped within them.

Signed-off-by: Paolo Bonzini 
Signed-off-by: Zhao Liu 
---
 include/qemu/main-loop.h | 15 +++
 stubs/iothread-lock.c| 15 +++
 system/cpus.c| 15 +++
 3 files changed, 45 insertions(+)

diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h
index 5764db157c97..646306c272f7 100644
--- a/include/qemu/main-loop.h
+++ b/include/qemu/main-loop.h
@@ -262,6 +262,21 @@ AioContext *iohandler_get_aio_context(void);
  */
 bool bql_locked(void);
 
+/**
+ * bql_block_unlock: Allow/deny releasing the BQL
+ *
+ * The Big QEMU Lock (BQL) is used to provide interior mutability to
+ * Rust code, but this only works if other threads cannot run while
+ * the Rust code has an active borrow.  This is because C code in
+ * other threads could come in and mutate data under the Rust code's
+ * feet.
+ *
+ * @increase: Whether to increase or decrease the blocking counter.
+ *Releasing the BQL while the counter is nonzero triggers
+ *an assertion failure.
+ */
+void bql_block_unlock(bool increase);
+
 /**
  * qemu_in_main_thread: return whether it's possible to safely access
  * the global state of the block layer.
diff --git a/stubs/iothread-lock.c b/stubs/iothread-lock.c
index d7890e5581c5..54676598950f 100644
--- a/stubs/iothread-lock.c
+++ b/stubs/iothread-lock.c
@@ -1,6 +1,8 @@
 #include "qemu/osdep.h"
 #include "qemu/main-loop.h"
 
+static uint32_t bql_unlock_blocked;
+
 bool bql_locked(void)
 {
 return false;
@@ -12,4 +14,17 @@ void bql_lock_impl(const char *file, int line)
 
 void bql_unlock(void)
 {
+assert(!bql_unlock_blocked);
+}
+
+void bql_block_unlock(bool increase)
+{
+uint32_t new_value;
+
+assert(bql_locked());
+
+/* check for overflow! */
+new_value = bql_unlock_blocked + increase - !increase;
+assert((new_value > bql_unlock_blocked) == increase);
+bql_unlock_blocked = new_value;
 }
diff --git a/system/cpus.c b/system/cpus.c
index a1b46f68476a..793c4698c7ad 100644
--- a/system/cpus.c
+++ b/system/cpus.c
@@ -514,6 +514,20 @@ bool qemu_in_vcpu_thread(void)
 
 QEMU_DEFINE_STATIC_CO_TLS(bool, bql_locked)
 
+static uint32_t bql_unlock_blocked;
+
+void bql_block_unlock(bool increase)
+{
+uint32_t new_value;
+
+assert(bql_locked());
+
+/* check for overflow! */
+new_value = bql_unlock_blocked + increase - !increase;
+assert((new_value > bql_unlock_blocked) == increase);
+bql_unlock_blocked = new_value;
+}
+
 bool bql_locked(void)
 {
 return get_bql_locked();
@@ -540,6 +554,7 @@ void bql_lock_impl(const char *file, int line)
 void bql_unlock(void)
 {
 g_assert(bql_locked());
+g_assert(!bql_unlock_blocked);
 set_bql_locked(false);
 qemu_mutex_unlock(&bql);
 }
-- 
2.34.1

[RFC 09/13] i386/fw_cfg: move hpet_cfg definition to hpet.c

HPET device needs to access and update hpet_cfg variable, but now it is
defined in hw/i386/fw_cfg.c and Rust code can't access it.

Move the hpet_cfg definition to hpet.c (and rename it to hpet_fw_cfg). This
allows the Rust HPET device to implement its own global hpet_fw_cfg variable,
and will further reduce the use of unsafe C code access and calls in the
Rust HPET implementation.

Signed-off-by: Zhao Liu 
---
 hw/i386/fw_cfg.c|  4 +---
 hw/timer/hpet.c | 16 +---
 include/hw/timer/hpet.h |  2 +-
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/hw/i386/fw_cfg.c b/hw/i386/fw_cfg.c
index 0e4494627c21..965e6306838a 100644
--- a/hw/i386/fw_cfg.c
+++ b/hw/i386/fw_cfg.c
@@ -26,8 +26,6 @@
 #include CONFIG_DEVICES
 #include "target/i386/cpu.h"
 
-struct hpet_fw_config hpet_cfg = {.count = UINT8_MAX};
-
 const char *fw_cfg_arch_key_name(uint16_t key)
 {
 static const struct {
@@ -149,7 +147,7 @@ FWCfgState *fw_cfg_arch_create(MachineState *ms,
 #endif
 fw_cfg_add_i32(fw_cfg, FW_CFG_IRQ0_OVERRIDE, 1);
 
-fw_cfg_add_bytes(fw_cfg, FW_CFG_HPET, &hpet_cfg, sizeof(hpet_cfg));
+fw_cfg_add_bytes(fw_cfg, FW_CFG_HPET, &hpet_fw_cfg, sizeof(hpet_fw_cfg));
 /* allocate memory for the NUMA channel: one (64bit) word for the number
  * of nodes, one word for each VCPU->node and one word for each node to
  * hold the amount of memory.
diff --git a/hw/timer/hpet.c b/hw/timer/hpet.c
index 5399f1b2a3f7..d8bd51b7e202 100644
--- a/hw/timer/hpet.c
+++ b/hw/timer/hpet.c
@@ -40,6 +40,8 @@
 #include "qom/object.h"
 #include "trace.h"
 
+struct hpet_fw_config hpet_fw_cfg = {.count = UINT8_MAX};
+
 #define HPET_MSI_SUPPORT0
 
 OBJECT_DECLARE_SIMPLE_TYPE(HPETState, HPET)
@@ -278,7 +280,7 @@ static int hpet_post_load(void *opaque, int version_id)
 /* Push number of timers into capability returned via HPET_ID */
 s->capability &= ~HPET_ID_NUM_TIM_MASK;
 s->capability |= (s->num_timers - 1) << HPET_ID_NUM_TIM_SHIFT;
-hpet_cfg.hpet[s->hpet_id].event_timer_block_id = (uint32_t)s->capability;
+hpet_fw_cfg.hpet[s->hpet_id].event_timer_block_id = 
(uint32_t)s->capability;
 
 /* Derive HPET_MSI_SUPPORT from the capability of the first timer. */
 s->flags &= ~(1 << HPET_MSI_SUPPORT);
@@ -665,8 +667,8 @@ static void hpet_reset(DeviceState *d)
 s->hpet_counter = 0ULL;
 s->hpet_offset = 0ULL;
 s->config = 0ULL;
-hpet_cfg.hpet[s->hpet_id].event_timer_block_id = (uint32_t)s->capability;
-hpet_cfg.hpet[s->hpet_id].address = sbd->mmio[0].addr;
+hpet_fw_cfg.hpet[s->hpet_id].event_timer_block_id = 
(uint32_t)s->capability;
+hpet_fw_cfg.hpet[s->hpet_id].address = sbd->mmio[0].addr;
 
 /* to document that the RTC lowers its output on reset as well */
 s->rtc_irq_level = 0;
@@ -708,17 +710,17 @@ static void hpet_realize(DeviceState *dev, Error **errp)
 if (!s->intcap) {
 warn_report("Hpet's intcap not initialized");
 }
-if (hpet_cfg.count == UINT8_MAX) {
+if (hpet_fw_cfg.count == UINT8_MAX) {
 /* first instance */
-hpet_cfg.count = 0;
+hpet_fw_cfg.count = 0;
 }
 
-if (hpet_cfg.count == 8) {
+if (hpet_fw_cfg.count == 8) {
 error_setg(errp, "Only 8 instances of HPET is allowed");
 return;
 }
 
-s->hpet_id = hpet_cfg.count++;
+s->hpet_id = hpet_fw_cfg.count++;
 
 for (i = 0; i < HPET_NUM_IRQ_ROUTES; i++) {
 sysbus_init_irq(sbd, &s->irqs[i]);
diff --git a/include/hw/timer/hpet.h b/include/hw/timer/hpet.h
index d17a8d43199e..dbf709251a8f 100644
--- a/include/hw/timer/hpet.h
+++ b/include/hw/timer/hpet.h
@@ -74,7 +74,7 @@ struct hpet_fw_config
 struct hpet_fw_entry hpet[8];
 } QEMU_PACKED;
 
-extern struct hpet_fw_config hpet_cfg;
+extern struct hpet_fw_config hpet_fw_cfg;
 
 #define TYPE_HPET "hpet"
 
-- 
2.34.1

[RFC 04/13] rust: add bindings for gpio_{in|out} initialization

The qdev_init_gpio_{in|out} functions are qdev interfaces, so it's natural
to wrap them as DeviceState methods in the Rust API, which could eliminate
unsafe cases in the device lib.

Wrap qdev_init_gpio_{in|out} as methods in a new trait DeviceGPIOImpl.

In addition, for qdev_init_gpio_in(), to convert the idiomatic Rust
callback into a C-style callback qemu_irq_handler, add a handler pointer
member in DeviceGPIOImpl. Any device that needs to initialize GPIO in
must define a handler, while a device that only wants to initialize
GPIO out can leave GPIO_IRQ_HANDLER as None.

Then device could use init_gpio_in() and init_gpio_out() to initialize
GPIO in and out, like C code.

Note, for qemu_irq_handler, assume the opaque parameter refers to the
device's own DeviceState; this is enough for now, as it's the most common
case in QEMU.

Signed-off-by: Zhao Liu 
---
 rust/qemu-api/src/qdev.rs | 55 +--
 1 file changed, 53 insertions(+), 2 deletions(-)

diff --git a/rust/qemu-api/src/qdev.rs b/rust/qemu-api/src/qdev.rs
index 23a06b377b2c..5e6580b6f261 100644
--- a/rust/qemu-api/src/qdev.rs
+++ b/rust/qemu-api/src/qdev.rs
@@ -4,12 +4,17 @@
 
 //! Bindings to create devices and access device functionality from Rust.
 
-use std::ffi::CStr;
+use std::{
+ffi::CStr,
+os::raw::{c_int, c_void},
+ptr::{addr_of, NonNull},
+};
 
 pub use bindings::{DeviceClass, DeviceState, Property};
 
 use crate::{
-bindings::{self, Error},
+bindings::{self, qdev_init_gpio_in, qdev_init_gpio_out, Error},
+irq::InterruptSource,
 qom::{ClassInitImpl, Object, ObjectClass, ObjectType},
 qom_isa,
 vmstate::VMStateDescription,
@@ -144,3 +149,49 @@ unsafe impl ObjectType for DeviceState {
 unsafe { CStr::from_bytes_with_nul_unchecked(bindings::TYPE_DEVICE) };
 }
 qom_isa!(DeviceState: Object);
+
+/// # Safety
+///
+/// We expect the FFI user of this function to pass a valid pointer that
+/// can be downcasted to type `T`. We also expect the device is
+/// readable/writeable from one thread at any time.
+///
+/// Note: Always assume opaque is referred to the self DeviceState, and
+/// this is also the most common case in QEMU.
+unsafe extern "C" fn rust_irq_handler(
+opaque: *mut c_void,
+lines_num: c_int,
+level: c_int,
+) {
+// SAFETY:
+// the pointer is convertible to a reference
+let state = unsafe { NonNull::new(opaque.cast::()).unwrap().as_mut() };
+
+T::GPIO_IRQ_HANDLER.unwrap()(state, lines_num as u32, level as u32);
+}
+
+/// Trait that defines the irq handler for GPIO in.
+pub trait DeviceGPIOImpl {
+const GPIO_IRQ_HANDLER: Option 
= None;
+
+fn init_gpio_in(&self, lines_num: u32)
+where
+Self: Sized,
+{
+assert!(Self::GPIO_IRQ_HANDLER.is_some());
+
+unsafe {
+qdev_init_gpio_in(
+addr_of!(*self) as *mut _,
+Some(rust_irq_handler::),
+lines_num as c_int,
+);
+}
+}
+
+fn init_gpio_out(&self, pins: &InterruptSource, lines_num: u32) {
+unsafe {
+qdev_init_gpio_out(addr_of!(*self) as *mut _, pins.as_ptr(), 
lines_num as c_int);
+}
+}
+}
-- 
2.34.1

[RFC 00/13] rust: Reinvent the wheel for HPET timer in Rust

Hi,

After making empty promises for many months, I have finally written the
Rust version of HPET :-) I'm also very grateful for the help from Paolo,
Manos, and Junjie!

Overall, HPET in Rust maintains the same logic as the original C
version, adhering to the IA-HPET spec v1.0a [1]. While keeping the logic
unchanged, it attempts to keep up with the current development progress
of Rust for QEMU, leveraging the latest and ongoing Rust binding updates
as much as possible, such as BqlCell / BqlRefCell, qom & qdev
enhancements, irq binding, and more. Additionally, it introduces new
bindings, including gpio_{in|out}, bitops, memattrs, and timer. Finally,
based on Paolo's suggestion, the vmstate part is temporarily on hold.

Welcome your comments and feedback!


(Next, I will introduce the structure of the code, the current gaps, and
share my verbose personal experience of writing a QEMU device in Rust.)


Introduction


.
│ 
...
└── timer
├── hpet
│   ├── Cargo.toml
│   ├── meson.build
│   └── src
│   ├── fw_cfg.rs
│   ├── hpet.rs
│   ├── lib.rs
│   └── qdev.rs
├── Kconfig
└── meson.build


HPET emulation contains 2 parts:
 * HPET device emulation:
   - hpet.rs:
 It includes basic operations for the HPET timer and HPET state
 (which actually represents the HPET timer block).

 Here, similar to the C implementation, it directly defines the
 registers and bit shifts as const variables, without a complete
 register space structure.

 My goal is to reduce unsafe code in this file as much as possible,
 specifically, to try to eliminate the unsafe code brought by FFI.

   - qdev.rs:
 Here, it implements the various traits required by QEMU qdev/qom for
 the HPET state and tries to leave the detailed HPET state operations
 to the hpet.rs file above.

 * Global HPET firmware configuration:
   - fw_cfg.rs
 In the C code, there is a variable hpet_fw_cfg (old name: hpet_cfg)
 used to configure the number of HPET timer blocks and the basic
 HPET firmware configuration. It is defined in .c file and is
 referenced as extern in the .h file.

 For the Rust HPET, fw_cfg.rs also implements hpet_fw_cfg so that
 the .h file can still reference it.

 Specifically, I wrapped it in BqlCell, which ensures the safety of
 Rust device access. Additionally, because BqlCell does not change
 the memory layout, it does not disrupt access from C code.


Current Gaps


* Proper bindings for MemoryRegionOps, which needs to wrap the ram
  read/write callbacks.
  - I think it shouldn't be complicated because qom/qdev already
provides good examples.

* vmstate support.
  - vmstate code for HPET is actually ready, but it will be pending (and
maybe re-writing) until the vmstate infra gets cleaned up.

* Error handling.
  - Now HPET just uses panic and println to replace error_setg and
warn_report.

* Trace support.
  - No trace for now.


Experience and Considerations in Rust Device


BqlCell/BqlRefCell
--

Paolo provided a very useful Cell wrapper to operate the device under
the protection of BQL. So I tried to wrap as many fields of HPETState
as possible into BqlCell/BqlRefCell, and it works well :-).

Anything that needs to be modified within a callback should be protected
by BqlCell/BqlRefCell.

Based on this, I am also considering how the opaque parameter in certain
callbacks should interact with BQL cells. In the timer binding (patch 
7), I think the opaque parameter accepted by the timer callback should
be placed in a BQL cell. However, considering generality, I did not make
further changes and only passed BqlRefCell as the opaque
parameter in the HPET code.

Furthermore, is it possible in the future to wrap the entire state
within a BQL cell? This could save the effort of wrapping many state
members individually when state becomes very huge and complex.


QDEV Property
-

To support bit type property, I added another macro variant (in patch 8)
to allow bitnr parameter. However, I think this lacks scalability.

In qdev-properties.h, it is clear that the PropertyInfo of a property is
bound to its type. Previously, Junjie and I attempted to do the same in
Rust by binding PropertyInfo to the type, thereby avoiding the need to
specify too many parameters in the macro definitions:

https://lore.kernel.org/qemu-devel/20241017143245.1248589-1-zhao1@intel.com/

However, unfortunately, it was missed. I am not sure if this is the
right direction, but perhaps I can pick it up again?


MEMTXATTRS_UNSPECIFIED
--

MEMTXATTRS_UNSPECIFIED is another global variable. Since it is
immutable, BQL cell is not needed.

But MemTxAttrs is a structure with bitfields, and the bindings generated
by bindgen can only be modified through methods. Therefore, it is
necessary to introduce lazy to initialize MEMTXATTRS_UNSPECI

[RFC 10/13] rust/timer/hpet: define hpet_fw_cfg

Define HPETFwEntry structure with the same memory layout as
hpet_fw_entry in C.

Further, define the global hpet_fw_cfg variable in Rust which is the
same as the C version. This hpet_fw_cfg variable in Rust will replace
the C version one and allows both Rust code and C code to access it.

The Rust version of hpet_fw_cfg is self-contained, avoiding unsafe
access to C code.

Signed-off-by: Zhao Liu 
---
 rust/Cargo.lock  |  8 +++
 rust/Cargo.toml  |  1 +
 rust/hw/meson.build  |  1 +
 rust/hw/timer/hpet/Cargo.toml| 14 +
 rust/hw/timer/hpet/meson.build   | 18 +++
 rust/hw/timer/hpet/src/fw_cfg.rs | 88 
 rust/hw/timer/hpet/src/lib.rs| 15 ++
 rust/hw/timer/meson.build|  1 +
 8 files changed, 146 insertions(+)
 create mode 100644 rust/hw/timer/hpet/Cargo.toml
 create mode 100644 rust/hw/timer/hpet/meson.build
 create mode 100644 rust/hw/timer/hpet/src/fw_cfg.rs
 create mode 100644 rust/hw/timer/hpet/src/lib.rs
 create mode 100644 rust/hw/timer/meson.build

diff --git a/rust/Cargo.lock b/rust/Cargo.lock
index 6b19553b6d10..996454af03cf 100644
--- a/rust/Cargo.lock
+++ b/rust/Cargo.lock
@@ -37,6 +37,14 @@ version = "1.12.0"
 source = "registry+https://github.com/rust-lang/crates.io-index";
 checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b"
 
+[[package]]
+name = "hpet"
+version = "0.1.0"
+dependencies = [
+ "qemu_api",
+ "qemu_api_macros",
+]
+
 [[package]]
 name = "itertools"
 version = "0.11.0"
diff --git a/rust/Cargo.toml b/rust/Cargo.toml
index de0835bf5b5c..fc620bcaac00 100644
--- a/rust/Cargo.toml
+++ b/rust/Cargo.toml
@@ -4,6 +4,7 @@ members = [
 "qemu-api-macros",
 "qemu-api",
 "hw/char/pl011",
+"hw/timer/hpet",
 ]
 
 [workspace.lints.rust]
diff --git a/rust/hw/meson.build b/rust/hw/meson.build
index 860196645e71..9749d4adfc96 100644
--- a/rust/hw/meson.build
+++ b/rust/hw/meson.build
@@ -1 +1,2 @@
 subdir('char')
+subdir('timer')
diff --git a/rust/hw/timer/hpet/Cargo.toml b/rust/hw/timer/hpet/Cargo.toml
new file mode 100644
index ..db2ef4642b4f
--- /dev/null
+++ b/rust/hw/timer/hpet/Cargo.toml
@@ -0,0 +1,14 @@
+[package]
+name = "hpet"
+version = "0.1.0"
+edition = "2021"
+authors = ["Zhao Liu "]
+license = "GPL-2.0-or-later"
+description = "IA-PC High Precision Event Timer emulation in Rust"
+
+[lib]
+crate-type = ["staticlib"]
+
+[dependencies]
+qemu_api = { path = "../../../qemu-api" }
+qemu_api_macros = { path = "../../../qemu-api-macros" }
diff --git a/rust/hw/timer/hpet/meson.build b/rust/hw/timer/hpet/meson.build
new file mode 100644
index ..c2d7c0532ca4
--- /dev/null
+++ b/rust/hw/timer/hpet/meson.build
@@ -0,0 +1,18 @@
+_libhpet_rs = static_library(
+  'hpet',
+  files('src/lib.rs'),
+  override_options: ['rust_std=2021', 'build.rust_std=2021'],
+  rust_abi: 'rust',
+  dependencies: [
+qemu_api,
+qemu_api_macros,
+  ],
+)
+
+rust_devices_ss.add(when: 'CONFIG_X_HPET_RUST', if_true: [declare_dependency(
+  link_whole: [_libhpet_rs],
+  # Putting proc macro crates in `dependencies` is necessary for Meson to find
+  # them when compiling the root per-target static rust lib.
+  dependencies: [qemu_api_macros],
+  variables: {'crate': 'hpet'},
+)])
diff --git a/rust/hw/timer/hpet/src/fw_cfg.rs b/rust/hw/timer/hpet/src/fw_cfg.rs
new file mode 100644
index ..a057c2778be4
--- /dev/null
+++ b/rust/hw/timer/hpet/src/fw_cfg.rs
@@ -0,0 +1,88 @@
+// Copyright (C) 2024 Intel Corporation.
+// Author(s): Zhao Liu 
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#![allow(dead_code)]
+
+use qemu_api::{cell::BqlCell, zeroable::Zeroable};
+
+// Each HPETState represents a Event Timer Block. The v1 spec supports
+// up to 8 blocks. QEMU only uses 1 block (in PC machine).
+const HPET_MAX_NUM_EVENT_TIMER_BLOCK: usize = 8;
+
+#[repr(C, packed)]
+#[derive(Copy, Clone, Default)]
+pub struct HPETFwEntry {
+pub event_timer_block_id: u32,
+pub address: u64,
+pub min_tick: u16,
+pub page_prot: u8,
+}
+
+unsafe impl Zeroable for HPETFwEntry {
+const ZERO: Self = Self {
+event_timer_block_id: 0,
+address: 0,
+min_tick: 0,
+page_prot: 0,
+};
+}
+
+#[repr(C, packed)]
+#[derive(Copy, Clone, Default)]
+pub struct HPETFwConfig {
+pub count: u8,
+pub hpet: [HPETFwEntry; HPET_MAX_NUM_EVENT_TIMER_BLOCK],
+}
+
+unsafe impl Zeroable for HPETFwConfig {
+const ZERO: Self = Self {
+count: 0,
+hpet: [Zeroable::ZERO; HPET_MAX_NUM_EVENT_TIMER_BLOCK],
+};
+}
+
+// Expose to C code to configure firmware.
+// BqlCell is picked since it has the same memory layout
+// as HPETFwConfig (just like Cell/UnsafeCell/T).
+pub struct HPETFwConfigCell(BqlCell);
+
+#[allow(non_upper_case_globals)]
+#[no_mangle]
+pub static mut hpet_fw_cfg: HPETFwConfigCell = 
HPETFwConfigCell(BqlCell::new(HPETFwConfig {
+count: u8::MAX,
+..Zeroable::ZERO
+}));
+
+impl HPETFwConfigCell {
+p

Re: [RFC 00/13] rust: Reinvent the wheel for HPET timer in Rust

> After making empty promises for many months, I have finally written the
> Rust version of HPET :-) I'm also very grateful for the help from Paolo,
> Manos, and Junjie!
> 
> Overall, HPET in Rust maintains the same logic as the original C
> version, adhering to the IA-HPET spec v1.0a [1]. While keeping the logic
> unchanged, it attempts to keep up with the current development progress
> of Rust for QEMU, leveraging the latest and ongoing Rust binding updates
> as much as possible, such as BqlCell / BqlRefCell, qom & qdev
> enhancements, irq binding, and more. Additionally, it introduces new
> bindings, including gpio_{in|out}, bitops, memattrs, and timer. Finally,
> based on Paolo's suggestion, the vmstate part is temporarily on hold.
> 
> Welcome your comments and feedback!

Based on Paolo's rust-next branch of https://gitlab.com/bonzini/qemu at
the commit 05de50008121 ("rust: qom: move device_id to PL011 class side").
 
[snip]

> Public and Private in QOM State
> ---
> 
> I recently asked on the mailing list [4] about the reason for using
> "<public>"/"<private>" comments in QOM structures. Peter, Junjie, and
> Balaton provided some explanations and feedback (thank you all).

And thanks Daniel!!

...

> [4]: https://lore.kernel.org/qemu-devel/zxpz5oudrcvro...@intel.com/

Regards,
Zhao

[PATCH v1] target/riscv: add support for RV64 Xiangshan Nanhu CPU

2024-12-04 Thread MollyChen

Add a CPU entry for the RV64 XiangShan NANHU CPU which
supports single-core and dual-core configurations. More
details can be found at
https://docs.xiangshan.cc/zh-cn/latest/integration/overview

Signed-off-by: MollyChen 
---
 target/riscv/cpu-qom.h |  1 +
 target/riscv/cpu.c | 29 +
 2 files changed, 30 insertions(+)

diff --git a/target/riscv/cpu-qom.h b/target/riscv/cpu-qom.h
index 6547642287..d56b067bf2 100644
--- a/target/riscv/cpu-qom.h
+++ b/target/riscv/cpu-qom.h
@@ -50,6 +50,7 @@
 #define TYPE_RISCV_CPU_THEAD_C906   RISCV_CPU_TYPE_NAME("thead-c906")
 #define TYPE_RISCV_CPU_VEYRON_V1RISCV_CPU_TYPE_NAME("veyron-v1")
 #define TYPE_RISCV_CPU_TT_ASCALON   RISCV_CPU_TYPE_NAME("tt-ascalon")
+#define TYPE_RISCV_CPU_XIANGSHAN_NANHU  RISCV_CPU_TYPE_NAME("xiangshan-nanhu")
 #define TYPE_RISCV_CPU_HOST RISCV_CPU_TYPE_NAME("host")
 
 OBJECT_DECLARE_CPU_TYPE(RISCVCPU, RISCVCPUClass, RISCV_CPU)
diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index 8447ad0dfb..38baaa39f8 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -645,6 +645,34 @@ static void rv64_tt_ascalon_cpu_init(Object *obj)
 #endif
 }
 
+static void rv64_xiangshan_nanhu_cpu_init(Object *obj)
+{
+CPURISCVState *env = &RISCV_CPU(obj)->env;
+RISCVCPU *cpu = RISCV_CPU(obj);
+
+riscv_cpu_set_misa_ext(env, RVG | RVC | RVB | RVS | RVU);
+env->priv_ver = PRIV_VERSION_1_12_0;
+
+/* Enable ISA extensions */
+cpu->cfg.ext_zbc = true;
+cpu->cfg.ext_zbkb = true;
+cpu->cfg.ext_zbkc = true;
+cpu->cfg.ext_zbkx = true;
+cpu->cfg.ext_zknd = true;
+cpu->cfg.ext_zkne = true;
+cpu->cfg.ext_zknh = true;
+cpu->cfg.ext_zksed = true;
+cpu->cfg.ext_zksh = true;
+cpu->cfg.ext_svinval = true;
+
+cpu->cfg.mmu = true;
+cpu->cfg.pmp = true;
+
+#ifndef CONFIG_USER_ONLY
+set_satp_mode_max_supported(cpu, VM_1_10_SV39);
+#endif
+}
+
 #ifdef CONFIG_TCG
 static void rv128_base_cpu_init(Object *obj)
 {
@@ -3050,6 +3078,7 @@ static const TypeInfo riscv_cpu_type_infos[] = {
 DEFINE_VENDOR_CPU(TYPE_RISCV_CPU_THEAD_C906, MXL_RV64,  
rv64_thead_c906_cpu_init),
 DEFINE_VENDOR_CPU(TYPE_RISCV_CPU_TT_ASCALON, MXL_RV64,  
rv64_tt_ascalon_cpu_init),
 DEFINE_VENDOR_CPU(TYPE_RISCV_CPU_VEYRON_V1,  MXL_RV64,  
rv64_veyron_v1_cpu_init),
+DEFINE_VENDOR_CPU(TYPE_RISCV_CPU_XIANGSHAN_NANHU, MXL_RV64, 
rv64_xiangshan_nanhu_cpu_init),
 #ifdef CONFIG_TCG
 DEFINE_DYNAMIC_CPU(TYPE_RISCV_CPU_BASE128,   MXL_RV128, 
rv128_base_cpu_init),
 #endif /* CONFIG_TCG */
-- 
2.34.1

Re: [PATCH v1 2/4] i386/cpu: Set up CPUID_HT in x86_cpu_expand_features() instead of cpu_x86_cpuid()

Hi Xiaoyao,

Sorry for late reply.

> @@ -7490,6 +7489,7 @@ static void x86_cpu_enable_xsave_components(X86CPU *cpu)
>  void x86_cpu_expand_features(X86CPU *cpu, Error **errp)
>  {
>  CPUX86State *env = &cpu->env;
> +CPUState *cs = CPU(cpu);
>  FeatureWord w;
>  int i;
>  GList *l;
> @@ -7531,6 +7531,10 @@ void x86_cpu_expand_features(X86CPU *cpu, Error **errp)
>  }
>  }
>  
> +if (cs->nr_cores * cs->nr_threads > 1) {
> +env->features[FEAT_1_EDX] |= CPUID_HT;
> +}
> +

We shouldn't place any CLI-configurable features here,
especially after expanding plus_features and minus_features.

HT has been made configurable since the commit 83629b1 ("target/i386/
cpu: Fix CPUID_HT exposure"), so if you want to place HT here, you
should make it un-configurable first.

Regarding commit 83629b1, in what cases do we need to actively set HT?

That commit even introduces more issues. Ideally, the hardware being
emulated by setting or masking feature bits should be feature-consistent.

However, "-cpu *,-ht -smp 2" does not remove the HT flag (which is
unexpected), and "-cpu *,+ht -smp 1" forcibly sets HT (which results in
buggy emulation). :(

In fact, HT should not be freely configurable in hardware emulation;
users should configure it in the BIOS.


>  for (i = 0; i < ARRAY_SIZE(feature_dependencies); i++) {
>  FeatureDep *d = &feature_dependencies[i];
>  if (!(env->features[d->from.index] & d->from.mask)) {
> -- 
> 2.34.1
> 
>

Re: [PATCH v1 0/4] Initialize nr_cores and nr_threads early and related clearup

I'm also very sorry, but I have a slightly different opinion...

>  accel/tcg/user-exec-stub.c |  4 +++
>  hw/core/cpu-common.c   |  2 +-
>  include/hw/core/cpu.h  |  8 +
>  system/cpus.c  |  6 +++-
>  target/alpha/cpu.c |  2 ++
>  target/arm/cpu.c   |  2 ++
>  target/avr/cpu.c   |  2 ++
>  target/hexagon/cpu.c   |  2 ++
>  target/hppa/cpu.c  |  2 ++
>  target/i386/cpu.c  | 61 +++---
>  target/loongarch/cpu.c |  2 ++
>  target/m68k/cpu.c  |  2 ++
>  target/microblaze/cpu.c|  2 ++
>  target/mips/cpu.c  |  2 ++
>  target/openrisc/cpu.c  |  2 ++
>  target/ppc/cpu_init.c  |  2 ++
>  target/riscv/cpu.c |  2 ++
>  target/rx/cpu.c|  2 ++
>  target/s390x/cpu.c |  2 ++
>  target/sh4/cpu.c   |  2 ++
>  target/sparc/cpu.c |  2 ++
>  target/tricore/cpu.c   |  2 ++
>  target/xtensa/cpu.c|  2 ++
>  23 files changed, 85 insertions(+), 32 deletions(-)
> 

I have some doubts about the necessity of changing the initialization of
nr_cores/nr_threads, because you can access the machine's topology info
via machine_topo_get_threads_per_socket(), which gives the same result as
`nr_cores * nr_threads`.

Especially, the TDX feature check hook is also within the context of
`current_machine`, so why not check if TDX's HT is consistent with QEMU's
emulation in the TDX hook?

For this reason, and based on my comment on patch 2, I think checking HT
in the TDX hook or even ignoring HT, would be a more straightforward and
less impactful solution.

-Zhao

RE: [PATCH v3 3/7] hw:sdhci: Introduce a new "capareg" class member to set the different Capability Registers

2024-12-04 Thread Jamin Lin

Hi Cedric, 

> Subject: Re: [PATCH v3 3/7] hw:sdhci: Introduce a new "capareg" class member
> to set the different Capability Registers
> 
> On 12/4/24 09:05, Jamin Lin wrote:
> > Currently, it set the hardcode value of capability registers to all
> > ASPEED SOCs However, the value of capability registers should be
> > different for all ASPEED SOCs. For example: the bit 28 of the
> > Capability Register 1 should be 1 for 64-bits System Bus support for 
> > AST2700.
> >
> > Introduce a new "capareg" class member whose data type is uint_64 to
> > set the different Capability Registers to all ASPEED SOCs.
> >
> > The value of Capability Register is "0x01e80080" for AST2400
> > and AST2500. The value of Capability Register is "0x000701f80080" for
> AST2600.
> >
> > Signed-off-by: Jamin Lin 
> > ---
> >   hw/arm/aspeed_ast2400.c  |  3 ++-
> >   hw/arm/aspeed_ast2600.c  |  7 +++--
> >   hw/sd/aspeed_sdhci.c | 52
> +---
> >   include/hw/sd/aspeed_sdhci.h | 12 +++--
> >   4 files changed, 63 insertions(+), 11 deletions(-)
> >
> > diff --git a/hw/arm/aspeed_ast2400.c b/hw/arm/aspeed_ast2400.c index
> > ecc81ecc79..3c1b419945 100644
> > --- a/hw/arm/aspeed_ast2400.c
> > +++ b/hw/arm/aspeed_ast2400.c
> > @@ -224,7 +224,8 @@ static void aspeed_ast2400_soc_init(Object *obj)
> >   snprintf(typename, sizeof(typename), "aspeed.gpio-%s", socname);
> >   object_initialize_child(obj, "gpio", &s->gpio, typename);
> >
> > -object_initialize_child(obj, "sdc", &s->sdhci, TYPE_ASPEED_SDHCI);
> > +snprintf(typename, sizeof(typename), "aspeed.sdhci-%s", socname);
> > +object_initialize_child(obj, "sdc", &s->sdhci, typename);
> >
> >   object_property_set_int(OBJECT(&s->sdhci), "num-slots", 2,
> > &error_abort);
> >
> > diff --git a/hw/arm/aspeed_ast2600.c b/hw/arm/aspeed_ast2600.c index
> > c40d3d8443..b5703bd064 100644
> > --- a/hw/arm/aspeed_ast2600.c
> > +++ b/hw/arm/aspeed_ast2600.c
> > @@ -236,8 +236,8 @@ static void aspeed_soc_ast2600_init(Object *obj)
> >   snprintf(typename, sizeof(typename), "aspeed.gpio-%s-1_8v",
> socname);
> >   object_initialize_child(obj, "gpio_1_8v", &s->gpio_1_8v,
> > typename);
> >
> > -object_initialize_child(obj, "sd-controller", &s->sdhci,
> > -TYPE_ASPEED_SDHCI);
> > +snprintf(typename, sizeof(typename), "aspeed.sdhci-%s", socname);
> > +object_initialize_child(obj, "sd-controller", &s->sdhci,
> > + typename);
> >
> >   object_property_set_int(OBJECT(&s->sdhci), "num-slots", 2,
> > &error_abort);
> >
> > @@ -247,8 +247,7 @@ static void aspeed_soc_ast2600_init(Object *obj)
> >   &s->sdhci.slots[i],
> TYPE_SYSBUS_SDHCI);
> >   }
> >
> > -object_initialize_child(obj, "emmc-controller", &s->emmc,
> > -TYPE_ASPEED_SDHCI);
> > +object_initialize_child(obj, "emmc-controller", &s->emmc,
> > + typename);
> >
> >   object_property_set_int(OBJECT(&s->emmc), "num-slots", 1,
> > &error_abort);
> >
> > diff --git a/hw/sd/aspeed_sdhci.c b/hw/sd/aspeed_sdhci.c index
> > acd6538261..ccaeefa75b 100644
> > --- a/hw/sd/aspeed_sdhci.c
> > +++ b/hw/sd/aspeed_sdhci.c
> > @@ -148,6 +148,7 @@ static void aspeed_sdhci_realize(DeviceState *dev,
> Error **errp)
> >   {
> >   SysBusDevice *sbd = SYS_BUS_DEVICE(dev);
> >   AspeedSDHCIState *sdhci = ASPEED_SDHCI(dev);
> > +AspeedSDHCIClass *asc = ASPEED_SDHCI_GET_CLASS(sdhci);
> >
> >   /* Create input irqs for the slots */
> >   qdev_init_gpio_in_named_with_opaque(DEVICE(sbd),
> > aspeed_sdhci_set_irq, @@ -166,10 +167,7 @@ static void
> aspeed_sdhci_realize(DeviceState *dev, Error **errp)
> >   return;
> >   }
> >
> > -if (!object_property_set_uint(sdhci_slot, "capareg",
> > -  ASPEED_SDHCI_CAPABILITIES,
> errp)) {
> > -return;
> > -}
> > +sdhci->slots[i].capareg = asc->capareg;
> 
> I think we want to keep :
> 
>  if (!object_property_set_uint(sdhci_slot, "capareg",
>asc->capareg, errp)) {
>  return;
>  }
> 
> 
Got it. Thanks

> Thanks,
> 
> C.
> 
> 
> >
> >   if (!sysbus_realize(sbd_slot, errp)) {
> >   return;
> > @@ -218,13 +216,59 @@ static void aspeed_sdhci_class_init(ObjectClass
> *classp, void *data)
> >   device_class_set_props(dc, aspeed_sdhci_properties);
> >   }
> >
> > +static void aspeed_2400_sdhci_class_init(ObjectClass *klass, void
> > +*data) {
> > +DeviceClass *dc = DEVICE_CLASS(klass);
> > +AspeedSDHCIClass *asc = ASPEED_SDHCI_CLASS(klass);
> > +
> > +dc->desc = "ASPEED 2400 SDHCI Controller";
> > +asc->capareg = 0x01e80080; }
> > +
> > +static void aspeed_2500_sdhci_class_init(ObjectClass *klass, void
> > +*data) {
> > +DeviceClass *dc = DEVICE_CLASS(klass);
> > +AspeedSDHCIClass *asc = ASPEED_SDHCI_CLASS(klass

[PATCH v4 2/6] hw/arm/aspeed: Fix coding style

Fix coding style issues from checkpatch.pl.

Signed-off-by: Jamin Lin 
Reviewed-by: Cédric Le Goater 
---
 hw/arm/aspeed_ast2600.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/hw/arm/aspeed_ast2600.c b/hw/arm/aspeed_ast2600.c
index be3eb70cdd..c40d3d8443 100644
--- a/hw/arm/aspeed_ast2600.c
+++ b/hw/arm/aspeed_ast2600.c
@@ -541,7 +541,8 @@ static void aspeed_soc_ast2600_realize(DeviceState *dev, 
Error **errp)
 if (!sysbus_realize(SYS_BUS_DEVICE(&s->gpio), errp)) {
 return;
 }
-aspeed_mmio_map(s, SYS_BUS_DEVICE(&s->gpio), 0, 
sc->memmap[ASPEED_DEV_GPIO]);
+aspeed_mmio_map(s, SYS_BUS_DEVICE(&s->gpio), 0,
+sc->memmap[ASPEED_DEV_GPIO]);
 sysbus_connect_irq(SYS_BUS_DEVICE(&s->gpio), 0,
aspeed_soc_get_irq(s, ASPEED_DEV_GPIO));
 
-- 
2.34.1

[PATCH v4 0/6] Support SDHCI and eMMC for ast2700

change from v1:
This patch series does not support boot from an eMMC.
Only support eMMC and SD Slot 0 as storages.

change from v2:
- Add hw/sd/aspeed_sdhci: Fix coding style patch

change from v3:
- Directly set capareg and sd_spec_version instead of property
- Keep DEFINE_TYPES

change from v4:
- Keep to set capareg and sd_spec_version by property

Jamin Lin (6):
  hw/sd/aspeed_sdhci: Fix coding style
  hw/arm/aspeed: Fix coding style
  hw:sdhci: Introduce a new "capareg" class member to set the different
Capability Registers
  hw/sd/aspeed_sdhci: Add AST2700 Support
  aspeed/soc: Support SDHCI for AST2700
  aspeed/soc: Support eMMC for AST2700

 hw/arm/aspeed_ast2400.c  |  3 +-
 hw/arm/aspeed_ast2600.c  | 10 +++---
 hw/arm/aspeed_ast27x0.c  | 35 +++
 hw/sd/aspeed_sdhci.c | 67 ++--
 include/hw/sd/aspeed_sdhci.h | 13 +--
 5 files changed, 117 insertions(+), 11 deletions(-)

-- 
2.34.1

[PATCH v4 4/6] hw/sd/aspeed_sdhci: Add AST2700 Support

Introduce a new ast2700 class to support AST2700. Add a new ast2700 SDHCI class
init function and set the value of capability register to "0x000719f80080".

Signed-off-by: Jamin Lin 
Reviewed-by: Cédric Le Goater 
---
 hw/sd/aspeed_sdhci.c | 14 ++
 include/hw/sd/aspeed_sdhci.h |  1 +
 2 files changed, 15 insertions(+)

diff --git a/hw/sd/aspeed_sdhci.c b/hw/sd/aspeed_sdhci.c
index ae2ec4a916..f82b05397e 100644
--- a/hw/sd/aspeed_sdhci.c
+++ b/hw/sd/aspeed_sdhci.c
@@ -246,6 +246,15 @@ static void aspeed_2600_sdhci_class_init(ObjectClass 
*klass, void *data)
 asc->capareg = 0x000701f80080;
 }
 
+static void aspeed_2700_sdhci_class_init(ObjectClass *klass, void *data)
+{
+DeviceClass *dc = DEVICE_CLASS(klass);
+AspeedSDHCIClass *asc = ASPEED_SDHCI_CLASS(klass);
+
+dc->desc = "ASPEED 2700 SDHCI Controller";
+asc->capareg = 0x000719f80080;
+}
+
 static const TypeInfo aspeed_sdhci_types[] = {
 {
 .name   = TYPE_ASPEED_SDHCI,
@@ -270,6 +279,11 @@ static const TypeInfo aspeed_sdhci_types[] = {
 .parent = TYPE_ASPEED_SDHCI,
 .class_init = aspeed_2600_sdhci_class_init,
 },
+{
+.name = TYPE_ASPEED_2700_SDHCI,
+.parent = TYPE_ASPEED_SDHCI,
+.class_init = aspeed_2700_sdhci_class_init,
+},
 };
 
 DEFINE_TYPES(aspeed_sdhci_types)
diff --git a/include/hw/sd/aspeed_sdhci.h b/include/hw/sd/aspeed_sdhci.h
index 8083797e25..4ef1770471 100644
--- a/include/hw/sd/aspeed_sdhci.h
+++ b/include/hw/sd/aspeed_sdhci.h
@@ -16,6 +16,7 @@
 #define TYPE_ASPEED_2400_SDHCI TYPE_ASPEED_SDHCI "-ast2400"
 #define TYPE_ASPEED_2500_SDHCI TYPE_ASPEED_SDHCI "-ast2500"
 #define TYPE_ASPEED_2600_SDHCI TYPE_ASPEED_SDHCI "-ast2600"
+#define TYPE_ASPEED_2700_SDHCI TYPE_ASPEED_SDHCI "-ast2700"
 OBJECT_DECLARE_TYPE(AspeedSDHCIState, AspeedSDHCIClass, ASPEED_SDHCI)
 
 #define ASPEED_SDHCI_NUM_SLOTS2
-- 
2.34.1

Re: [PATCH] MAINTAINERS: Cover the tests/functional/test_sh4eb_r2d.py file

2024-12-04 Thread Philippe Mathieu-Daudé


On 4/12/24 08:11, Thomas Huth wrote:

This file should belong to the R2D machine in the MAINTAINERS file.

Signed-off-by: Thomas Huth 
---
  MAINTAINERS | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)


Reviewed-by: Philippe Mathieu-Daudé

[PATCH v4 5/6] aspeed/soc: Support SDHCI for AST2700

Add SDHCI model for AST2700 SDHCI support. The SDHCI controller only supports 1
slot, its register base address starts at 0x1408_0000, and its interrupt is
connected to GICINT133_INTC at bit 1.

Signed-off-by: Jamin Lin 
Reviewed-by: Cédric Le Goater 
---
 hw/arm/aspeed_ast27x0.c | 20 
 1 file changed, 20 insertions(+)

diff --git a/hw/arm/aspeed_ast27x0.c b/hw/arm/aspeed_ast27x0.c
index 63d1fcb086..baddd35ecf 100644
--- a/hw/arm/aspeed_ast27x0.c
+++ b/hw/arm/aspeed_ast27x0.c
@@ -65,6 +65,7 @@ static const hwaddr aspeed_soc_ast2700_memmap[] = {
 [ASPEED_DEV_I2C]   =  0x14C0F000,
 [ASPEED_DEV_GPIO]  =  0x14C0B000,
 [ASPEED_DEV_RTC]   =  0x12C0F000,
+[ASPEED_DEV_SDHCI] =  0x1408,
 };
 
 #define AST2700_MAX_IRQ 256
@@ -113,6 +114,7 @@ static const int aspeed_soc_ast2700_irqmap[] = {
 [ASPEED_DEV_KCS]   = 128,
 [ASPEED_DEV_DP]= 28,
 [ASPEED_DEV_I3C]   = 131,
+[ASPEED_DEV_SDHCI] = 133,
 };
 
 /* GICINT 128 */
@@ -158,6 +160,7 @@ static const int aspeed_soc_ast2700_gic132_intcmap[] = {
 
 /* GICINT 133 */
 static const int aspeed_soc_ast2700_gic133_intcmap[] = {
+[ASPEED_DEV_SDHCI] = 1,
 [ASPEED_DEV_PECI]  = 4,
 };
 
@@ -380,6 +383,14 @@ static void aspeed_soc_ast2700_init(Object *obj)
 object_initialize_child(obj, "gpio", &s->gpio, typename);
 
 object_initialize_child(obj, "rtc", &s->rtc, TYPE_ASPEED_RTC);
+
+snprintf(typename, sizeof(typename), "aspeed.sdhci-%s", socname);
+object_initialize_child(obj, "sd-controller", &s->sdhci, typename);
+object_property_set_int(OBJECT(&s->sdhci), "num-slots", 1, &error_abort);
+
+/* Init sd card slot class here so that they're under the correct parent */
+object_initialize_child(obj, "sd-controller.sdhci",
+&s->sdhci.slots[0], TYPE_SYSBUS_SDHCI);
 }
 
 /*
@@ -681,6 +692,15 @@ static void aspeed_soc_ast2700_realize(DeviceState *dev, 
Error **errp)
 sysbus_connect_irq(SYS_BUS_DEVICE(&s->rtc), 0,
aspeed_soc_get_irq(s, ASPEED_DEV_RTC));
 
+/* SDHCI */
+if (!sysbus_realize(SYS_BUS_DEVICE(&s->sdhci), errp)) {
+return;
+}
+aspeed_mmio_map(s, SYS_BUS_DEVICE(&s->sdhci), 0,
+sc->memmap[ASPEED_DEV_SDHCI]);
+sysbus_connect_irq(SYS_BUS_DEVICE(&s->sdhci), 0,
+   aspeed_soc_get_irq(s, ASPEED_DEV_SDHCI));
+
 create_unimplemented_device("ast2700.dpmcu", 0x1100, 0x4);
 create_unimplemented_device("ast2700.iomem0", 0x1200, 0x0100);
 create_unimplemented_device("ast2700.iomem1", 0x1400, 0x0100);
-- 
2.34.1

[PATCH v4 1/6] hw/sd/aspeed_sdhci: Fix coding style

Fix coding style issues from checkpatch.pl.

Signed-off-by: Jamin Lin 
Reviewed-by: Cédric Le Goater 
---
 hw/sd/aspeed_sdhci.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/hw/sd/aspeed_sdhci.c b/hw/sd/aspeed_sdhci.c
index 98d5460905..acd6538261 100644
--- a/hw/sd/aspeed_sdhci.c
+++ b/hw/sd/aspeed_sdhci.c
@@ -87,10 +87,12 @@ static void aspeed_sdhci_write(void *opaque, hwaddr addr, 
uint64_t val,
 sdhci->regs[TO_REG(addr)] = (uint32_t)val & ~ASPEED_SDHCI_INFO_RESET;
 break;
 case ASPEED_SDHCI_SDIO_140:
-sdhci->slots[0].capareg = deposit64(sdhci->slots[0].capareg, 0, 32, 
val);
+sdhci->slots[0].capareg = deposit64(sdhci->slots[0].capareg,
+0, 32, val);
 break;
 case ASPEED_SDHCI_SDIO_144:
-sdhci->slots[0].capareg = deposit64(sdhci->slots[0].capareg, 32, 32, 
val);
+sdhci->slots[0].capareg = deposit64(sdhci->slots[0].capareg,
+32, 32, val);
 break;
 case ASPEED_SDHCI_SDIO_148:
 sdhci->slots[0].maxcurr = deposit64(sdhci->slots[0].maxcurr,
-- 
2.34.1

[PATCH v4 3/6] hw:sdhci: Introduce a new "capareg" class member to set the different Capability Registers

Currently, the value of the capability registers is hardcoded to the same value
for all ASPEED SoCs. However, the value of the capability registers should differ
between ASPEED SoCs. For example, bit 28 of Capability Register 1 should be 1 for
64-bit System Bus support on AST2700.

Introduce a new "capareg" class member of type uint64_t to set the
Capability Registers individually for each ASPEED SoC.

The value of Capability Register is "0x01e80080" for AST2400 and
AST2500. The value of Capability Register is "0x000701f80080" for AST2600.

Signed-off-by: Jamin Lin 
---
 hw/arm/aspeed_ast2400.c  |  3 ++-
 hw/arm/aspeed_ast2600.c  |  7 +++---
 hw/sd/aspeed_sdhci.c | 47 +++-
 include/hw/sd/aspeed_sdhci.h | 12 +++--
 4 files changed, 61 insertions(+), 8 deletions(-)

diff --git a/hw/arm/aspeed_ast2400.c b/hw/arm/aspeed_ast2400.c
index ecc81ecc79..3c1b419945 100644
--- a/hw/arm/aspeed_ast2400.c
+++ b/hw/arm/aspeed_ast2400.c
@@ -224,7 +224,8 @@ static void aspeed_ast2400_soc_init(Object *obj)
 snprintf(typename, sizeof(typename), "aspeed.gpio-%s", socname);
 object_initialize_child(obj, "gpio", &s->gpio, typename);
 
-object_initialize_child(obj, "sdc", &s->sdhci, TYPE_ASPEED_SDHCI);
+snprintf(typename, sizeof(typename), "aspeed.sdhci-%s", socname);
+object_initialize_child(obj, "sdc", &s->sdhci, typename);
 
 object_property_set_int(OBJECT(&s->sdhci), "num-slots", 2, &error_abort);
 
diff --git a/hw/arm/aspeed_ast2600.c b/hw/arm/aspeed_ast2600.c
index c40d3d8443..b5703bd064 100644
--- a/hw/arm/aspeed_ast2600.c
+++ b/hw/arm/aspeed_ast2600.c
@@ -236,8 +236,8 @@ static void aspeed_soc_ast2600_init(Object *obj)
 snprintf(typename, sizeof(typename), "aspeed.gpio-%s-1_8v", socname);
 object_initialize_child(obj, "gpio_1_8v", &s->gpio_1_8v, typename);
 
-object_initialize_child(obj, "sd-controller", &s->sdhci,
-TYPE_ASPEED_SDHCI);
+snprintf(typename, sizeof(typename), "aspeed.sdhci-%s", socname);
+object_initialize_child(obj, "sd-controller", &s->sdhci, typename);
 
 object_property_set_int(OBJECT(&s->sdhci), "num-slots", 2, &error_abort);
 
@@ -247,8 +247,7 @@ static void aspeed_soc_ast2600_init(Object *obj)
 &s->sdhci.slots[i], TYPE_SYSBUS_SDHCI);
 }
 
-object_initialize_child(obj, "emmc-controller", &s->emmc,
-TYPE_ASPEED_SDHCI);
+object_initialize_child(obj, "emmc-controller", &s->emmc, typename);
 
 object_property_set_int(OBJECT(&s->emmc), "num-slots", 1, &error_abort);
 
diff --git a/hw/sd/aspeed_sdhci.c b/hw/sd/aspeed_sdhci.c
index acd6538261..ae2ec4a916 100644
--- a/hw/sd/aspeed_sdhci.c
+++ b/hw/sd/aspeed_sdhci.c
@@ -148,6 +148,7 @@ static void aspeed_sdhci_realize(DeviceState *dev, Error 
**errp)
 {
 SysBusDevice *sbd = SYS_BUS_DEVICE(dev);
 AspeedSDHCIState *sdhci = ASPEED_SDHCI(dev);
+AspeedSDHCIClass *asc = ASPEED_SDHCI_GET_CLASS(sdhci);
 
 /* Create input irqs for the slots */
 qdev_init_gpio_in_named_with_opaque(DEVICE(sbd), aspeed_sdhci_set_irq,
@@ -167,7 +168,7 @@ static void aspeed_sdhci_realize(DeviceState *dev, Error 
**errp)
 }
 
 if (!object_property_set_uint(sdhci_slot, "capareg",
-  ASPEED_SDHCI_CAPABILITIES, errp)) {
+  asc->capareg, errp)) {
 return;
 }
 
@@ -218,12 +219,56 @@ static void aspeed_sdhci_class_init(ObjectClass *classp, 
void *data)
 device_class_set_props(dc, aspeed_sdhci_properties);
 }
 
+static void aspeed_2400_sdhci_class_init(ObjectClass *klass, void *data)
+{
+DeviceClass *dc = DEVICE_CLASS(klass);
+AspeedSDHCIClass *asc = ASPEED_SDHCI_CLASS(klass);
+
+dc->desc = "ASPEED 2400 SDHCI Controller";
+asc->capareg = 0x01e80080;
+}
+
+static void aspeed_2500_sdhci_class_init(ObjectClass *klass, void *data)
+{
+DeviceClass *dc = DEVICE_CLASS(klass);
+AspeedSDHCIClass *asc = ASPEED_SDHCI_CLASS(klass);
+
+dc->desc = "ASPEED 2500 SDHCI Controller";
+asc->capareg = 0x01e80080;
+}
+
+static void aspeed_2600_sdhci_class_init(ObjectClass *klass, void *data)
+{
+DeviceClass *dc = DEVICE_CLASS(klass);
+AspeedSDHCIClass *asc = ASPEED_SDHCI_CLASS(klass);
+
+dc->desc = "ASPEED 2600 SDHCI Controller";
+asc->capareg = 0x000701f80080;
+}
+
 static const TypeInfo aspeed_sdhci_types[] = {
 {
 .name   = TYPE_ASPEED_SDHCI,
 .parent = TYPE_SYS_BUS_DEVICE,
 .instance_size  = sizeof(AspeedSDHCIState),
 .class_init = aspeed_sdhci_class_init,
+.class_size = sizeof(AspeedSDHCIClass),
+.abstract = true,
+},
+{
+.name = TYPE_ASPEED_2400_SDHCI,
+.parent = TYPE_ASPEED_SDHCI,
+.class_init = aspeed_2400_sdhci_class_init,
+},
+{
+.name = TYPE_ASPEED_2500_SDHCI,
+.parent = TYPE_ASP

[PATCH v3 2/8] i386: Add init and realize functionality for RDT device.

From: Hendrik Wüthrich

Add code to initialize all necessary state for the RDT device.

Signed-off-by: Hendrik Wüthrich 
---
 hw/i386/rdt.c | 30 ++
 1 file changed, 30 insertions(+)

diff --git a/hw/i386/rdt.c b/hw/i386/rdt.c
index d0afbd04fb..2fb9fb476b 100644
--- a/hw/i386/rdt.c
+++ b/hw/i386/rdt.c
@@ -19,6 +19,7 @@
 #include "hw/isa/isa.h"
 #include "hw/qdev-properties.h"
 #include "qom/object.h"
+#include "target/i386/cpu.h"
 
 /* Max counts for allocation masks or CBMs. In other words, the size of 
respective MSRs*/
 #define RDT_MAX_L3_MASK_COUNT  127
@@ -79,8 +80,36 @@ static void rdt_init(Object *obj)
 {
 }
 
+static void rdt_realize(DeviceState *dev, Error **errp)
+{
+CPUState *cs = first_cpu;
+RDTState *rdtDev = RDT(dev);
+
+rdtDev->rdtInstances = g_malloc(sizeof(RDTStatePerCore) * cs->nr_cores);
+CPU_FOREACH(cs) {
+RDTStatePerCore *rdt = &rdtDev->rdtInstances[cs->cpu_index];
+X86CPU *cpu = X86_CPU(cs);
+
+rdt->rdtstate = rdtDev;
+cpu->rdt = rdt;
+
+rdt->monitors = g_malloc(sizeof(RDTMonitor) * rdtDev->rmids);
+rdt->rdtstate->allocations = g_malloc(sizeof(RDTAllocation) * 
rdtDev->rmids);
+}
+}
+
 static void rdt_finalize(Object *obj)
 {
+CPUState *cs;
+RDTState *rdt = RDT(obj);
+
+CPU_FOREACH(cs) {
+RDTStatePerCore *rdtInstance = &rdt->rdtInstances[cs->cpu_index];
+g_free(rdtInstance->monitors);
+g_free(rdtInstance->rdtstate->allocations);
+}
+
+g_free(rdt->rdtInstances);
 }
 
 static void rdt_class_init(ObjectClass *klass, void *data)
@@ -90,6 +119,7 @@ static void rdt_class_init(ObjectClass *klass, void *data)
 dc->hotpluggable = false;
 dc->desc = "RDT";
 dc->user_creatable = true;
+dc->realize = rdt_realize;
 
 device_class_set_props(dc, rdt_properties);
 }
-- 
2.47.0.338.g60cca15819-goog

[PATCH v3 0/8] The aim of this patch series is to emulate Intel RDT features in order to make testing of the linux Resctrl subsystem possible with Qemu.

From: Hendrik Wüthrich 

A branch with the patches applied can be found at:
https://github.com/Gray-Colors/Intel_RDT_patches_applied

The changes made introduce the following features:

* Feature enumeration for Intel RDT allocation.
* Feature enumeration for Intel RDT monitoring.
* Intel RDT monitoring system interface.
* Intel RDT allocation system interface.

By adding these features, a barebones implementation of most of the RDT
state and MSRs is introduced, which can be enabled through qemu
command line flags.
The features missing for a faithful recreation of RDT are CDP and
non-linear MBA throttle, as well as the possibility to configure
various values through the command line, as some properties can be
different across different machines. For increased ease of use, the
correct features should be automatically enabled on machines that
support RDT functionality.
The missing features mentioned above will be implemented in the
following order:

* Expand feature set for RDT allocation to include CDP and non-linear
 MBA throttle
* Allow for command line configuration of some values, such as the L3
 CBM length
* Automatically enable RDT on machines that officially support it.

Will NOT be implemented
* Tests to simulate interaction with the host by the guest

Command line examples assuming entire patch series is applied (This
requires a kernel with Resctrl enabled):

To emulate Intel RDT features:

Currently, it is necessary to force the RDT options on in qemu, as it is
not automatically enabled for any machines. An example would be the
following:
-cpu Skylake-Server,+l3-cmt,+rdt-m,+rdt-a,+mba,+l3-cat,+l2-cat
and
-device rdt

Just enabling RDT in qemu won't really help, though. The following
option allows resctrl in the kernel:
- Kernel options: rdt=mbmlocal,mbmtotal,cmt,mba,l2cat,l3cat

To use Resctrl in QEMU, please refer to:
https://docs.kernel.org/arch/x86/resctrl.html

V2 -> V3
- Fix up command line examples in cover letter
- Fix CBM sizes which were wrong before
- Warn instead of crash when -device rdt is not set, while RDT options
  are forced on.

‪Hendrik Wüthrich (8):
  i386: Add Intel RDT device and State to config.
  i386: Add init and realize functionality for RDT device.
  i386: Add RDT functionality
  i386: Add RDT device interface through MSRs
  i386: Add CPUID enumeration for RDT
  i386: Add RDT feature flags.
  i386/cpu: Adjust CPUID level for RDT features
  i386/cpu: Adjust level for RDT on full_cpuid_auto_level

 hw/i386/Kconfig  |   4 +
 hw/i386/meson.build  |   1 +
 hw/i386/rdt.c| 281 +++
 include/hw/i386/rdt.h|  76 
 target/i386/cpu.c| 112 ++-
 target/i386/cpu.h|  24 +++
 target/i386/tcg/sysemu/misc_helper.c |  84 
 7 files changed, 580 insertions(+), 2 deletions(-)
 create mode 100644 hw/i386/rdt.c
 create mode 100644 include/hw/i386/rdt.h

-- 
2.47.0.338.g60cca15819-goog

[PATCH v3 4/8] i386: Add RDT device interface through MSRs

From: Hendrik Wüthrich

Implement rdmsr and wrmsr for the following MSRs:
* MSR_IA32_PQR_ASSOC
* MSR_IA32_QM_EVTSEL
* MSR_IA32_QM_CTR
* IA32_L3_QOS_Mask_n
* IA32_L2_QOS_Mask_n
* IA32_L2_QoS_Ext_BW_Thrtl_n

This allows for the guest to call RDT-internal functions to
associate an RMID with a CLOSID / set an active RMID for
monitoring, read monitoring data, and set classes of service.

Signed-off-by: Hendrik Wüthrich 
---
 hw/i386/rdt.c| 22 
 include/hw/i386/rdt.h|  6 +-
 target/i386/cpu.h| 14 +
 target/i386/tcg/sysemu/misc_helper.c | 84 
 4 files changed, 113 insertions(+), 13 deletions(-)

diff --git a/hw/i386/rdt.c b/hw/i386/rdt.c
index 19fea8999a..f295273aec 100644
--- a/hw/i386/rdt.c
+++ b/hw/i386/rdt.c
@@ -77,6 +77,10 @@ struct RDTState {
 struct RDTStateClass {
 };
 
+uint32_t rdt_get_cpuid_10_1_edx_cos_max(void) { return RDT_MAX_L3_MASK_COUNT; }
+uint32_t rdt_get_cpuid_10_2_edx_cos_max(void) { return RDT_MAX_L2_MASK_COUNT; }
+uint32_t rdt_get_cpuid_10_3_edx_cos_max(void) { return 
RDT_MAX_MBA_THRTL_COUNT; }
+
 bool rdt_associate_rmid_cos(uint64_t msr_ia32_pqr_assoc) {
 X86CPU *cpu = X86_CPU(current_cpu);
 RDTStatePerCore *rdt = cpu->rdt;
@@ -86,7 +90,7 @@ bool rdt_associate_rmid_cos(uint64_t msr_ia32_pqr_assoc) {
 uint32_t rmid = msr_ia32_pqr_assoc & 0x;
 
 if (cos_id > RDT_MAX_L3_MASK_COUNT || cos_id > RDT_MAX_L2_MASK_COUNT ||
-cos_id > RDT_MAX_MBA_THRTL_COUNT || rmid > rdt_max_rmid(rdt)) {
+cos_id > RDT_MAX_MBA_THRTL_COUNT || rmid > rdt_max_rmid(rdt)) {
 return false;
 }
 
@@ -104,8 +108,7 @@ uint32_t rdt_read_l3_mask(uint32_t pos)
 X86CPU *cpu = X86_CPU(current_cpu);
 RDTStatePerCore *rdt = cpu->rdt;
 
-uint32_t val = rdt->rdtstate->msr_L3_ia32_mask_n[pos];
-return val;
+return rdt->rdtstate->msr_L3_ia32_mask_n[pos];
 }
 
 uint32_t rdt_read_l2_mask(uint32_t pos)
@@ -113,8 +116,7 @@ uint32_t rdt_read_l2_mask(uint32_t pos)
 X86CPU *cpu = X86_CPU(current_cpu);
 RDTStatePerCore *rdt = cpu->rdt;
 
-uint32_t val = rdt->rdtstate->msr_L2_ia32_mask_n[pos];
-return val;
+return rdt->rdtstate->msr_L2_ia32_mask_n[pos];
 }
 
 uint32_t rdt_read_mba_thrtl(uint32_t pos)
@@ -122,8 +124,7 @@ uint32_t rdt_read_mba_thrtl(uint32_t pos)
 X86CPU *cpu = X86_CPU(current_cpu);
 RDTStatePerCore *rdt = cpu->rdt;
 
-uint32_t val = rdt->rdtstate->ia32_L2_qos_ext_bw_thrtl_n[pos];
-return val;
+return rdt->rdtstate->ia32_L2_qos_ext_bw_thrtl_n[pos];
 }
 
 void rdt_write_msr_l3_mask(uint32_t pos, uint32_t val) {
@@ -153,7 +154,8 @@ uint32_t rdt_max_rmid(RDTStatePerCore *rdt)
 return rdtdev->rmids - 1;
 }
 
-uint64_t rdt_read_event_count(RDTStatePerCore *rdtInstance, uint32_t rmid, 
uint32_t event_id)
+uint64_t rdt_read_event_count(RDTStatePerCore *rdtInstance,
+  uint32_t rmid, uint32_t event_id)
 {
 CPUState *cs;
 RDTMonitor *mon;
@@ -181,13 +183,10 @@ uint64_t rdt_read_event_count(RDTStatePerCore 
*rdtInstance, uint32_t rmid, uint3
 switch (event_id) {
 case RDT_EVENT_L3_OCCUPANCY:
 return count_l3 == 0 ? QM_CTR_UNAVAILABLE : count_l3;
-break;
 case RDT_EVENT_L3_REMOTE_BW:
 return count_remote == 0 ? QM_CTR_UNAVAILABLE : count_remote;
-break;
 case RDT_EVENT_L3_LOCAL_BW:
 return count_local == 0 ? QM_CTR_UNAVAILABLE : count_local;
-break;
 default:
 return QM_CTR_ERROR;
 }
@@ -247,4 +246,3 @@ static void rdt_class_init(ObjectClass *klass, void *data)
 
 device_class_set_props(dc, rdt_properties);
 }
-
diff --git a/include/hw/i386/rdt.h b/include/hw/i386/rdt.h
index 875142bad8..ec82a149f2 100644
--- a/include/hw/i386/rdt.h
+++ b/include/hw/i386/rdt.h
@@ -25,7 +25,10 @@ typedef struct RDTStatePerCore RDTStatePerCore;
 typedef struct RDTMonitor RDTMonitor;
 typedef struct RDTAllocation RDTAllocation;
 
-#endif
+uint32_t rdt_get_cpuid_10_1_edx_cos_max(void);
+uint32_t rdt_get_cpuid_10_2_edx_cos_max(void);
+uint32_t rdt_get_cpuid_10_3_edx_cos_max(void);
+
 bool rdt_associate_rmid_cos(uint64_t msr_ia32_pqr_assoc);
 
 void rdt_write_msr_l3_mask(uint32_t pos, uint32_t val);
@@ -39,3 +42,4 @@ uint32_t rdt_read_mba_thrtl(uint32_t pos);
 uint64_t rdt_read_event_count(RDTStatePerCore *rdt, uint32_t rmid, uint32_t 
event_id);
 uint32_t rdt_max_rmid(RDTStatePerCore *rdt);
 
+#endif
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index a2941f98eb..d7d5ad37fd 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -574,6 +574,17 @@ typedef enum X86Seg {
 #define MSR_IA32_VMX_TRUE_ENTRY_CTLS 0x0490
 #define MSR_IA32_VMX_VMFUNC 0x0491
 
+#define MSR_IA32_QM_EVTSEL  0x0c8d
+#define MSR_IA32_QM_CTR 0x0c8e
+#define MSR_IA32_PQR_ASSOC  0x0c8f
+
+#define MSR_IA32_L3_CBM_BASE0x0c90
+#define

[PATCH v3 8/8] i386/cpu: Adjust level for RDT on full_cpuid_auto_level

From: Hendrik Wüthrich

Make sure that RDT monitoring and allocation features are included in
full_cpuid_auto_level.

Signed-off-by: Hendrik Wüthrich 
---
 target/i386/cpu.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index a400839216..787bb5ba92 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -875,6 +875,7 @@ void x86_cpu_vendor_words2str(char *dst, uint32_t vendor1,
 #else
 #define TCG_7_0_ECX_RDPID 0
 #endif
+
 #define TCG_7_0_ECX_FEATURES (CPUID_7_0_ECX_UMIP | CPUID_7_0_ECX_PKU | \
   /* CPUID_7_0_ECX_OSPKE is dynamic */ \
   CPUID_7_0_ECX_LA57 | CPUID_7_0_ECX_PKS | CPUID_7_0_ECX_VAES | \
@@ -7526,6 +7527,8 @@ void x86_cpu_expand_features(X86CPU *cpu, Error **errp)
 x86_cpu_adjust_feat_level(cpu, FEAT_C000_0001_EDX);
 x86_cpu_adjust_feat_level(cpu, FEAT_SVM);
 x86_cpu_adjust_feat_level(cpu, FEAT_XSAVE);
+x86_cpu_adjust_feat_level(cpu, FEAT_RDT_15_0_EDX);
+x86_cpu_adjust_feat_level(cpu, FEAT_RDT_10_0_EBX);
 
 /* Intel Processor Trace requires CPUID[0x14] */
 if ((env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_INTEL_PT)) {
-- 
2.47.0.338.g60cca15819-goog

[PATCH v3 3/8] i386: Add RDT functionality

From: Hendrik Wüthrich

Add RDT code to associate a CLOSID with an RMID / set the RMID for monitoring,
write COS, and read monitoring data. This patch does not add code for
the guest to reach these operations through MSRs, only the actual
ability for the RDT device to perform them.

Signed-off-by: Hendrik Wüthrich 
---
 hw/i386/rdt.c | 124 ++
 include/hw/i386/rdt.h |  16 ++
 2 files changed, 140 insertions(+)

diff --git a/hw/i386/rdt.c b/hw/i386/rdt.c
index 2fb9fb476b..19fea8999a 100644
--- a/hw/i386/rdt.c
+++ b/hw/i386/rdt.c
@@ -21,6 +21,11 @@
 #include "qom/object.h"
 #include "target/i386/cpu.h"
 
+/* RDT Monitoring Event Codes */
+#define RDT_EVENT_L3_OCCUPANCY 1
+#define RDT_EVENT_L3_REMOTE_BW 2
+#define RDT_EVENT_L3_LOCAL_BW 3
+
 /* Max counts for allocation masks or CBMs. In other words, the size of 
respective MSRs*/
 #define RDT_MAX_L3_MASK_COUNT  127
 #define RDT_MAX_L2_MASK_COUNT  63
@@ -29,6 +34,9 @@
 #define TYPE_RDT "rdt"
 #define RDT_NUM_RMID_PROP "rmids"
 
+#define QM_CTR_ERROR(1ULL << 63)
+#define QM_CTR_UNAVAILABLE  (1ULL << 62)
+
 OBJECT_DECLARE_TYPE(RDTState, RDTStateClass, RDT);
 
 struct RDTMonitor {
@@ -69,6 +77,122 @@ struct RDTState {
 struct RDTStateClass {
 };
 
+bool rdt_associate_rmid_cos(uint64_t msr_ia32_pqr_assoc) {
+X86CPU *cpu = X86_CPU(current_cpu);
+RDTStatePerCore *rdt = cpu->rdt;
+RDTAllocation *alloc;
+
+uint32_t cos_id = (msr_ia32_pqr_assoc & 0x) >> 16;
+uint32_t rmid = msr_ia32_pqr_assoc & 0x;
+
+if (cos_id > RDT_MAX_L3_MASK_COUNT || cos_id > RDT_MAX_L2_MASK_COUNT ||
+cos_id > RDT_MAX_MBA_THRTL_COUNT || rmid > rdt_max_rmid(rdt)) {
+return false;
+}
+
+rdt->active_rmid = rmid;
+
+alloc = &rdt->rdtstate->allocations[rmid];
+
+alloc->active_cos = cos_id;
+
+return true;
+}
+
+uint32_t rdt_read_l3_mask(uint32_t pos)
+{
+X86CPU *cpu = X86_CPU(current_cpu);
+RDTStatePerCore *rdt = cpu->rdt;
+
+uint32_t val = rdt->rdtstate->msr_L3_ia32_mask_n[pos];
+return val;
+}
+
+uint32_t rdt_read_l2_mask(uint32_t pos)
+{
+X86CPU *cpu = X86_CPU(current_cpu);
+RDTStatePerCore *rdt = cpu->rdt;
+
+uint32_t val = rdt->rdtstate->msr_L2_ia32_mask_n[pos];
+return val;
+}
+
+uint32_t rdt_read_mba_thrtl(uint32_t pos)
+{
+X86CPU *cpu = X86_CPU(current_cpu);
+RDTStatePerCore *rdt = cpu->rdt;
+
+uint32_t val = rdt->rdtstate->ia32_L2_qos_ext_bw_thrtl_n[pos];
+return val;
+}
+
+void rdt_write_msr_l3_mask(uint32_t pos, uint32_t val) {
+X86CPU *cpu = X86_CPU(current_cpu);
+RDTStatePerCore *rdt = cpu->rdt;
+
+rdt->rdtstate->msr_L3_ia32_mask_n[pos] = val;
+}
+
+void rdt_write_msr_l2_mask(uint32_t pos, uint32_t val) {
+X86CPU *cpu = X86_CPU(current_cpu);
+RDTStatePerCore *rdt = cpu->rdt;
+
+rdt->rdtstate->msr_L2_ia32_mask_n[pos] = val;
+}
+
+void rdt_write_mba_thrtl(uint32_t pos, uint32_t val) {
+X86CPU *cpu = X86_CPU(current_cpu);
+RDTStatePerCore *rdt = cpu->rdt;
+
+rdt->rdtstate->ia32_L2_qos_ext_bw_thrtl_n[pos] = val;
+}
+
+uint32_t rdt_max_rmid(RDTStatePerCore *rdt)
+{
+RDTState *rdtdev = rdt->rdtstate;
+return rdtdev->rmids - 1;
+}
+
+uint64_t rdt_read_event_count(RDTStatePerCore *rdtInstance, uint32_t rmid, 
uint32_t event_id)
+{
+CPUState *cs;
+RDTMonitor *mon;
+RDTState *rdt = rdtInstance->rdtstate;
+
+uint32_t count_l3 = 0;
+uint32_t count_local= 0;
+uint32_t count_remote = 0;
+
+if (!rdt) {
+return 0;
+}
+
+CPU_FOREACH(cs) {
+rdtInstance = &rdt->rdtInstances[cs->cpu_index];
+if (rmid >= rdtInstance->monitors->len) {
+return QM_CTR_ERROR;
+}
+mon = &g_array_index(rdtInstance->monitors, RDTMonitor, rmid);
+count_l3 += mon->count_l3;
+count_local += mon->count_local;
+count_remote += mon->count_remote;
+}
+
+switch (event_id) {
+case RDT_EVENT_L3_OCCUPANCY:
+return count_l3 == 0 ? QM_CTR_UNAVAILABLE : count_l3;
+break;
+case RDT_EVENT_L3_REMOTE_BW:
+return count_remote == 0 ? QM_CTR_UNAVAILABLE : count_remote;
+break;
+case RDT_EVENT_L3_LOCAL_BW:
+return count_local == 0 ? QM_CTR_UNAVAILABLE : count_local;
+break;
+default:
+return QM_CTR_ERROR;
+}
+}
+
 OBJECT_DEFINE_TYPE(RDTState, rdt, RDT, ISA_DEVICE);
 
 static Property rdt_properties[] = {
diff --git a/include/hw/i386/rdt.h b/include/hw/i386/rdt.h
index a21d95b265..875142bad8 100644
--- a/include/hw/i386/rdt.h
+++ b/include/hw/i386/rdt.h
@@ -17,9 +17,25 @@
 #ifndef HW_RDT_H
 #define HW_RDT_H
 
+#include 
+#include 
+
 typedef struct RDTState RDTState;
 typedef struct RDTStatePerCore RDTStatePerCore;
 typedef struct RDTMonitor RDTMonitor;
 typedef struct RDTAllocation RDTAllocation;
 
 #endif
+bool rdt_associate_rmid_cos(uint64_t msr_ia32_pqr_assoc);
+
+void rdt_write_msr_l3_mask(uint32_t po

Re: [PATCH] 9pfs: improve v9fs_walk() tracing

2024-12-04 Thread Greg Kurz

On Tue, 3 Dec 2024 10:14:28 +0100
Christian Schoenebeck  wrote:

> 'Twalk' is the most important request type in the 9p protocol to look out
> for when debugging 9p communication. That's because it is the only part
> of the 9p protocol which actually deals with human-readable path names,
> whereas all other 9p request types work on numeric file IDs (FIDs) only.
> 
> Improve tracing of 'Twalk' requests, e.g. let's say client wanted to walk
> to "/home/bob/src", then improve trace output from:
> 
>   v9fs_walk tag 0 id 110 fid 0 newfid 1 nwnames=3
> 
> to:
> 
>   v9fs_walk tag=0 id=110 fid=0 newfid=1 nwnames=3 wnames={home, bob, src}
> 
> To achieve this, add a new helper function trace_v9fs_walk_wnames() which
> converts the received V9fsString array of individual path elements into a
> comma-separated string presentation for being passed to the tracing system.
> As this conversion is somewhat expensive, this new helper function returns
> immediately if tracing of event 'v9fs_walk' is currently not enabled.
> 
> Signed-off-by: Christian Schoenebeck 
> ---

Reviewed-by: Greg Kurz 

>  CCing tracing maintainers in case they have better ideas how to do this.
> 
>  hw/9pfs/9p.c | 42 +-
>  hw/9pfs/trace-events |  2 +-
>  2 files changed, 38 insertions(+), 6 deletions(-)
> 
> diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
> index 578517739a..c08e7e492b 100644
> --- a/hw/9pfs/9p.c
> +++ b/hw/9pfs/9p.c
> @@ -1774,6 +1774,30 @@ static bool same_stat_id(const struct stat *a, const 
> struct stat *b)
>  return a->st_dev == b->st_dev && a->st_ino == b->st_ino;
>  }
>  
> +/*
> + * Returns a (newly allocated) comma-separated string presentation of the
> + * passed array for logging (tracing) purpose for trace event "v9fs_walk" 
> only.
> + * If tracing for that event is disabled, it immediately returns NULL 
> instead.
> + *
> + * It is caller's responsibility to free the returned string.
> + */
> +static char *trace_v9fs_walk_wnames(V9fsString *wnames, size_t nwnames)
> +{
> +g_autofree char **arr = NULL;
> +
> +if (trace_event_get_state(TRACE_V9FS_WALK) &&
> +qemu_loglevel_mask(LOG_TRACE))
> +{
> +arr = g_malloc0_n(nwnames + 1, sizeof(char *));
> +for (size_t i = 0; i < nwnames; ++i) {
> +arr[i] = wnames[i].data;
> +}
> +return g_strjoinv(", ", arr);
> +}
> +
> +return NULL;
> +}
> +
>  static void coroutine_fn v9fs_walk(void *opaque)
>  {
>  int name_idx, nwalked;
> @@ -1787,6 +1811,7 @@ static void coroutine_fn v9fs_walk(void *opaque)
>  size_t offset = 7;
>  int32_t fid, newfid;
>  P9ARRAY_REF(V9fsString) wnames = NULL;
> +g_autofree char *trace_wnames = NULL;
>  V9fsFidState *fidp;
>  V9fsFidState *newfidp = NULL;
>  V9fsPDU *pdu = opaque;
> @@ -1800,11 +1825,9 @@ static void coroutine_fn v9fs_walk(void *opaque)
>  }
>  offset += err;
>  
> -trace_v9fs_walk(pdu->tag, pdu->id, fid, newfid, nwnames);
> -
>  if (nwnames > P9_MAXWELEM) {
>  err = -EINVAL;
> -goto out_nofid;
> +goto out_nofid_nownames;
>  }
>  if (nwnames) {
>  P9ARRAY_NEW(V9fsString, wnames, nwnames);
> @@ -1814,15 +1837,20 @@ static void coroutine_fn v9fs_walk(void *opaque)
>  for (i = 0; i < nwnames; i++) {
>  err = pdu_unmarshal(pdu, offset, "s", &wnames[i]);
>  if (err < 0) {
> -goto out_nofid;
> +goto out_nofid_nownames;
>  }
>  if (name_is_illegal(wnames[i].data)) {
>  err = -ENOENT;
> -goto out_nofid;
> +goto out_nofid_nownames;
>  }
>  offset += err;
>  }
> +trace_wnames = trace_v9fs_walk_wnames(wnames, nwnames);
> +trace_v9fs_walk(pdu->tag, pdu->id, fid, newfid, nwnames, 
> trace_wnames);
> +} else {
> +trace_v9fs_walk(pdu->tag, pdu->id, fid, newfid, nwnames, "");
>  }
> +
>  fidp = get_fid(pdu, fid);
>  if (fidp == NULL) {
>  err = -ENOENT;
> @@ -1957,7 +1985,11 @@ out:
>  }
>  v9fs_path_free(&dpath);
>  v9fs_path_free(&path);
> +goto out_pdu_complete;
> +out_nofid_nownames:
> +trace_v9fs_walk(pdu->tag, pdu->id, fid, newfid, nwnames, "");
>  out_nofid:
> +out_pdu_complete:
>  pdu_complete(pdu, err);
>  }
>  
> diff --git a/hw/9pfs/trace-events b/hw/9pfs/trace-events
> index a12e55c165..ed9f4e7209 100644
> --- a/hw/9pfs/trace-events
> +++ b/hw/9pfs/trace-events
> @@ -11,7 +11,7 @@ v9fs_stat(uint16_t tag, uint8_t id, int32_t fid) "tag %d id 
> %d fid %d"
>  v9fs_stat_return(uint16_t tag, uint8_t id, int32_t mode, int32_t atime, 
> int32_t mtime, int64_t length) "tag %d id %d stat={mode %d atime %d mtime %d 
> length %"PRId64"}"
>  v9fs_getattr(uint16_t tag, uint8_t id, int32_t fid, uint64_t request_mask) 
> "tag %d id %d fid %d request_mask %"PRIu64
>  v9fs_getattr_return(uint16_t tag, uint8_t id, uint64_t

[PATCH v3 7/8] i386/cpu: Adjust CPUID level for RDT features

From: Hendrik Wüthrich

Adjust minimum CPUID level if RDT monitoring or allocation features are
enabled to ensure that CPUID will return them.

Signed-off-by: Hendrik Wüthrich 
---
 target/i386/cpu.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index af1da35985..a400839216 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -7568,6 +7568,16 @@ void x86_cpu_expand_features(X86CPU *cpu, Error **errp)
 if (env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_SGX) {
 x86_cpu_adjust_level(cpu, &env->cpuid_min_level, 0x12);
 }
+
+/* RDT monitoring requires CPUID[0xF] */
+if (env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_PQM) {
+x86_cpu_adjust_level(cpu, &env->cpuid_min_level, 0xF);
+}
+
+/* RDT allocation requires CPUID[0x10] */
+if (env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_PQE) {
+x86_cpu_adjust_level(cpu, &env->cpuid_min_level, 0x10);
+}
 }
 
 /* Set cpuid_*level* based on cpuid_min_*level, if not explicitly set */
-- 
2.47.0.338.g60cca15819-goog

[PATCH v4 6/6] aspeed/soc: Support eMMC for AST2700

Add SDHCI model for AST2700 eMMC support. The eMMC controller only supports 1
slot, its register base address starts at 0x1209_0000, and its interrupt is
connected to GICINT 15.

Signed-off-by: Jamin Lin 
Reviewed-by: Cédric Le Goater 
---
 hw/arm/aspeed_ast27x0.c | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/hw/arm/aspeed_ast27x0.c b/hw/arm/aspeed_ast27x0.c
index baddd35ecf..23571584b2 100644
--- a/hw/arm/aspeed_ast27x0.c
+++ b/hw/arm/aspeed_ast27x0.c
@@ -391,6 +391,12 @@ static void aspeed_soc_ast2700_init(Object *obj)
 /* Init sd card slot class here so that they're under the correct parent */
 object_initialize_child(obj, "sd-controller.sdhci",
 &s->sdhci.slots[0], TYPE_SYSBUS_SDHCI);
+
+object_initialize_child(obj, "emmc-controller", &s->emmc, typename);
+object_property_set_int(OBJECT(&s->emmc), "num-slots", 1, &error_abort);
+
+object_initialize_child(obj, "emmc-controller.sdhci", &s->emmc.slots[0],
+TYPE_SYSBUS_SDHCI);
 }
 
 /*
@@ -701,6 +707,15 @@ static void aspeed_soc_ast2700_realize(DeviceState *dev, 
Error **errp)
 sysbus_connect_irq(SYS_BUS_DEVICE(&s->sdhci), 0,
aspeed_soc_get_irq(s, ASPEED_DEV_SDHCI));
 
+/* eMMC */
+if (!sysbus_realize(SYS_BUS_DEVICE(&s->emmc), errp)) {
+return;
+}
+aspeed_mmio_map(s, SYS_BUS_DEVICE(&s->emmc), 0,
+sc->memmap[ASPEED_DEV_EMMC]);
+sysbus_connect_irq(SYS_BUS_DEVICE(&s->emmc), 0,
+   aspeed_soc_get_irq(s, ASPEED_DEV_EMMC));
+
 create_unimplemented_device("ast2700.dpmcu", 0x1100, 0x4);
 create_unimplemented_device("ast2700.iomem0", 0x1200, 0x0100);
 create_unimplemented_device("ast2700.iomem1", 0x1400, 0x0100);
-- 
2.34.1

[PATCH v3 1/8] i386: Add Intel RDT device and State to config.

From: Hendrik Wüthrich

Change config to show RDT, add minimal code to the rdt.c module to make
sure things still compile.

Signed-off-by: Hendrik Wüthrich 
---
 hw/i386/Kconfig   |  4 ++
 hw/i386/meson.build   |  1 +
 hw/i386/rdt.c | 96 +++
 include/hw/i386/rdt.h | 25 +++
 target/i386/cpu.h |  3 ++
 5 files changed, 129 insertions(+)
 create mode 100644 hw/i386/rdt.c
 create mode 100644 include/hw/i386/rdt.h

diff --git a/hw/i386/Kconfig b/hw/i386/Kconfig
index f4a33b6c08..4dd05ed6f2 100644
--- a/hw/i386/Kconfig
+++ b/hw/i386/Kconfig
@@ -10,6 +10,9 @@ config SGX
 bool
 depends on KVM
 
+config RDT
+bool
+
 config PC
 bool
 imply APPLESMC
@@ -26,6 +29,7 @@ config PC
 imply QXL
 imply SEV
 imply SGX
+imply RDT
 imply TEST_DEVICES
 imply TPM_CRB
 imply TPM_TIS_ISA
diff --git a/hw/i386/meson.build b/hw/i386/meson.build
index 03aad10df7..fdbf5962b5 100644
--- a/hw/i386/meson.build
+++ b/hw/i386/meson.build
@@ -21,6 +21,7 @@ i386_ss.add(when: 'CONFIG_VMPORT', if_true: files('vmport.c'))
 i386_ss.add(when: 'CONFIG_VTD', if_true: files('intel_iommu.c'))
 i386_ss.add(when: 'CONFIG_SGX', if_true: files('sgx-epc.c','sgx.c'),
 if_false: files('sgx-stub.c'))
+i386_ss.add(when: 'CONFIG_RDT', if_true: files('rdt.c'))
 
 i386_ss.add(when: 'CONFIG_ACPI', if_true: files('acpi-common.c'))
 i386_ss.add(when: 'CONFIG_PC', if_true: files(
diff --git a/hw/i386/rdt.c b/hw/i386/rdt.c
new file mode 100644
index 00..d0afbd04fb
--- /dev/null
+++ b/hw/i386/rdt.c
@@ -0,0 +1,96 @@
+/*
+ * Intel Resource Director Technology (RDT).
+ *
+ * Copyright 2024 Google LLC
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ */
+
+#include "hw/i386/rdt.h"
+#include "qemu/osdep.h" /* Needs to be included before isa.h */
+#include "hw/isa/isa.h"
+#include "hw/qdev-properties.h"
+#include "qom/object.h"
+
+/* Max counts for allocation masks or CBMs. In other words, the size of 
respective MSRs*/
+#define RDT_MAX_L3_MASK_COUNT  127
+#define RDT_MAX_L2_MASK_COUNT  63
+#define RDT_MAX_MBA_THRTL_COUNT63
+
+#define TYPE_RDT "rdt"
+#define RDT_NUM_RMID_PROP "rmids"
+
+OBJECT_DECLARE_TYPE(RDTState, RDTStateClass, RDT);
+
+struct RDTMonitor {
+uint64_t count_local;
+uint64_t count_remote;
+uint64_t count_l3;
+};
+
+struct RDTAllocation {
+uint32_t active_cos;
+};
+
+struct RDTStatePerCore {
+uint32_t active_rmid;
+GArray *monitors;
+
+/*Parent RDTState*/
+RDTState *rdtstate;
+};
+
+/*One instance of RDT-internal state to be shared by all cores*/
+struct RDTState {
+ISADevice parent;
+
+/*Max amount of RMIDs*/
+uint32_t rmids;
+
+/*Per core state*/
+RDTStatePerCore *rdtInstances;
+RDTAllocation *allocations;
+
+/*RDT Allocation bitmask MSRs*/
+uint32_t msr_L3_ia32_mask_n[RDT_MAX_L3_MASK_COUNT];
+uint32_t msr_L2_ia32_mask_n[RDT_MAX_L2_MASK_COUNT];
+uint32_t ia32_L2_qos_ext_bw_thrtl_n[RDT_MAX_MBA_THRTL_COUNT];
+};
+
+struct RDTStateClass {
+};
+
+OBJECT_DEFINE_TYPE(RDTState, rdt, RDT, ISA_DEVICE);
+
+static Property rdt_properties[] = {
+DEFINE_PROP_UINT32(RDT_NUM_RMID_PROP, RDTState, rmids, 256),
+DEFINE_PROP_END_OF_LIST(),
+};
+
+static void rdt_init(Object *obj)
+{
+}
+
+static void rdt_finalize(Object *obj)
+{
+}
+
+static void rdt_class_init(ObjectClass *klass, void *data)
+{
+DeviceClass *dc = DEVICE_CLASS(klass);
+
+dc->hotpluggable = false;
+dc->desc = "RDT";
+dc->user_creatable = true;
+
+device_class_set_props(dc, rdt_properties);
+}
+
diff --git a/include/hw/i386/rdt.h b/include/hw/i386/rdt.h
new file mode 100644
index 00..a21d95b265
--- /dev/null
+++ b/include/hw/i386/rdt.h
@@ -0,0 +1,25 @@
+/*
+ * Intel Resource Director Technology (RDT).
+ *
+ * Copyright 2024 Google LLC
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ */
+
+#ifndef HW_RDT_H
+#define HW_RDT_H
+
+typedef struct RDTState RDTState;
+typedef struct RDTStatePerCore RDTStatePerCore;
+typedef struct RDTMoni

Re: [PATCH v4 13/15] acpi/ghes: move offset calculus to a separate function

Em Wed, 4 Dec 2024 08:54:40 +0100
Igor Mammedov  escreveu:

> On Tue, 3 Dec 2024 14:47:30 +0100
> Mauro Carvalho Chehab  wrote:
> 
> > Em Tue, 3 Dec 2024 12:51:43 +0100
> > Igor Mammedov  escreveu:
> >   
> > > On Fri, 22 Nov 2024 10:11:30 +0100
> > > Mauro Carvalho Chehab  wrote:
> > > 
> > > > Currently, CPER address location is calculated as an offset of
> > > > the hardware_errors table. It is also badly named, as the
> > > > offset actually used is the address where the CPER data starts,
> > > > and not the beginning of the error source.
> > > > 
> > > > Move the logic which calculates such offset to a separate
> > > > function, in preparation for a patch that will be changing the
> > > > logic to calculate it from the HEST table.
> > > > 
> > > > While here, properly name the variable which stores the cper
> > > > address.
> > > > 
> > > > Signed-off-by: Mauro Carvalho Chehab 
> > > > Reviewed-by: Jonathan Cameron 
> > > > ---
> > > >  hw/acpi/ghes.c | 41 -
> > > >  1 file changed, 32 insertions(+), 9 deletions(-)
> > > > 
> > > > diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c
> > > > index 87fd3feedd2a..d99697b20164 100644
> > > > --- a/hw/acpi/ghes.c
> > > > +++ b/hw/acpi/ghes.c
> > > > @@ -364,10 +364,37 @@ void acpi_ghes_add_fw_cfg(AcpiGhesState *ags, 
> > > > FWCfgState *s,
> > > >  ags->present = true;
> > > >  }
> > > >  
> > > > +static void get_hw_error_offsets(uint64_t ghes_addr,
> > > > + uint64_t *cper_addr,
> > > > + uint64_t *read_ack_register_addr)
> > > > +{  
> > > 
> > > 
> > > > +if (!ghes_addr) {
> > > > +return;
> > > > +}  
> > > 
> > > why do we need this check?
> > 
> > It is a safeguard measure to avoid crashes and OOM access. If fw_cfg 
> > callback doesn't fill it properly, this will be zero.  
> 
> shouldn't happen, but yeah it's the firmware's job to write back addr
> which might happen for whatever reason (a bug for example).
>

The main reason I added it is that, after the second series, it could 
also happen if there's something wrong with the backward compat logic.

So, both here and after switching to HEST-based offsets, I opted
to explicitly test.

> Perhaps push this up to the stack, so we don't have to deal
> with scattered checks in ghes code.
> 
> kvm_arch_on_sigbus_vcpu() looks like a good candidate for check
> and warn_once if that ever happens.
> It already calls acpi_ghes_present() which resolves GED device
> and then later we duplicate this job in ghes_record_cper_errors()
> 
> so maybe rename acpi_ghes_present to something like AcpiGhesState* 
> acpi_ghes_get_state()
> and call it instead. And then move ghes_addr check/warn_once there.
> This way the rest of ghes code won't have to deal with handling practically
> impossible error conditions that cause reader to wonder why it might happen.

I'll look into it. Yet, if that's OK with you, I would prefer dealing with this
once we have a bigger picture, e.g. once we merge those three series:

- cleanup series (this one);
- HEST offset (I'll be sending a new version today);
- error_inject.

Thanks,
Mauro

Re: [PATCH 5/7] docs: add a codebase section

On Tue, Dec 03, 2024 at 05:22:50PM +, Alex Bennée wrote:
> Peter Maydell  writes:
> 
> > On Mon, 18 Nov 2024 at 17:24, Pierrick Bouvier
> >  wrote:
> >>
> >> Present the various parts of QEMU and organization of codebase.
> >>
> >> Signed-off-by: Pierrick Bouvier 
> >
> > I like this; it's something I've thought for a while would
> > be good to have, but which I never got round to trying to
> > put together. Thanks for doing this!
> >
> > Mostly my comments below are spelling/typo nits and
> > other minor stuff.
> >
> >> ---
> >>  docs/about/emulation.rst   |   2 +
> >>  docs/codebase/index.rst| 211 +
> >>  docs/devel/decodetree.rst  |   2 +
> >>  docs/devel/ebpf_rss.rst|   2 +
> >>  docs/devel/index-internals.rst |   2 +
> >>  docs/devel/migration/main.rst  |   2 +
> >>  docs/devel/qapi-code-gen.rst   |   1 +
> >>  docs/devel/testing/main.rst|   9 +-
> >>  docs/devel/testing/qtest.rst   |   2 +
> >>  docs/index.rst |   3 +
> >>  docs/interop/qemu-ga.rst   |   2 +
> >>  docs/system/qemu-block-drivers.rst.inc |   2 +
> >>  docs/tools/qemu-storage-daemon.rst |   2 +
> >>  docs/user/main.rst |   6 +
> >>  14 files changed, 247 insertions(+), 1 deletion(-)
> >>  create mode 100644 docs/codebase/index.rst
> >>
> 
> >> +  Block devices and `image formats` implementation.
> >> +* `bsd-user 
> >> `_:
> >> +  `BSD User mode`.
> >> +* build: Where the code built goes!
> >
> > The built code doesn't have to be in 'build'. We could say:
> >
> >  * build: You can tell the QEMU build system to put the built code
> >anywhere you like. By default it will go into a directory named
> >``build``. Sometimes documentation will assume this default
> >for convenience when describing command lines; you can always
> >replace it with the path to your build tree.
> >
> > ?
> 
> I always recommend creating a builds directory and having multiple build
> trees under it:

I can understand why you do that, but I'm doubtful the need to have
many parallel build directories is the common case. IOW, I expect
that for the majority of contributors the default single 'build'
directory is sufficient.

With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|

Re: [PATCH] tests/functional: Bump the timeout of the sh4_tuxrun test

On Wed, Dec 04, 2024 at 08:07:57AM +0100, Thomas Huth wrote:
> When running "make -j$(nproc) check SPEED=thorough", the sh4_tuxrun
> test is timing out for me, and using TIMEOUT_MULTIPLIER I can see
> that it clearly takes more than 100 seconds to finish. Thus increase
> the timeout setting of this test to avoid the problem.
> 
> Signed-off-by: Thomas Huth 
> ---
>  tests/functional/meson.build | 1 +
>  1 file changed, 1 insertion(+)

Reviewed-by: Daniel P. Berrangé 


With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|

[PATCH v3 5/8] i386: Add CPUID enumeration for RDT

From: ‪Hendrik Wüthrich 

Add CPUID enumeration for intel RDT monitoring and allocation, as well
as the flags used in the enumeration code.

Signed-off-by: Hendrik Wüthrich 
---
 hw/i386/rdt.c | 33 +
 include/hw/i386/rdt.h | 31 +++
 target/i386/cpu.c | 69 +++
 target/i386/cpu.h |  5 
 4 files changed, 138 insertions(+)

diff --git a/hw/i386/rdt.c b/hw/i386/rdt.c
index f295273aec..a3c748c127 100644
--- a/hw/i386/rdt.c
+++ b/hw/i386/rdt.c
@@ -31,6 +31,20 @@
 #define RDT_MAX_L2_MASK_COUNT  63
 #define RDT_MAX_MBA_THRTL_COUNT63
 
+/* RDT L3 Allocation features */
+#define CPUID_10_1_EAX_CBM_LENGTH   0xf
+#define CPUID_10_1_EBX_CBM  0x0
+#define CPUID_10_1_ECX_CDP  0x0 // to enable, it would be (1U << 2)
+#define CPUID_10_1_EDX_COS_MAX  MAX_L3_MASK_COUNT
+/* RDT L2 Allocation features*/
+#define CPUID_10_2_EAX_CBM_LENGTH   0xf
+#define CPUID_10_2_EBX_CBM  0x0
+#define CPUID_10_2_EDX_COS_MAX  MAX_L2_MASK_COUNT
+/* RDT MBA features */
+#define CPUID_10_3_EAX_THRTL_MAX89
+#define CPUID_10_3_ECX_LINEAR_RESPONSE (1U << 2)
+#define CPUID_10_3_EDX_COS_MAX  MAX_MBA_THRTL_COUNT
+
 #define TYPE_RDT "rdt"
 #define RDT_NUM_RMID_PROP "rmids"
 
@@ -77,8 +91,27 @@ struct RDTState {
 struct RDTStateClass {
 };
 
+uint32_t rdt_get_cpuid_15_0_edx_l3(void) { return CPUID_15_1_EDX_L3_OCCUPANCY 
| CPUID_15_1_EDX_L3_TOTAL_BW | CPUID_15_1_EDX_L3_LOCAL_BW; }
+
+uint32_t rdt_cpuid_15_1_edx_l3_total_bw_enabled(void) { return 
CPUID_15_1_EDX_L3_TOTAL_BW; }
+uint32_t rdt_cpuid_15_1_edx_l3_local_bw_enabled(void) { return 
CPUID_15_1_EDX_L3_LOCAL_BW; }
+uint32_t rdt_cpuid_15_1_edx_l3_occupancy_enabled(void) { return 
CPUID_15_1_EDX_L3_OCCUPANCY; }
+
+uint32_t rdt_cpuid_10_0_ebx_l3_cat_enabled(void) { return 
CPUID_10_0_EBX_L3_CAT; }
+uint32_t rdt_cpuid_10_0_ebx_l2_cat_enabled(void) { return 
CPUID_10_0_EBX_L2_CAT; }
+uint32_t rdt_cpuid_10_0_ebx_l2_mba_enabled(void) { return CPUID_10_0_EBX_MBA; }
+
+uint32_t rdt_get_cpuid_10_1_eax_cbm_length(void) { return 
CPUID_10_1_EAX_CBM_LENGTH; }
+uint32_t rdt_cpuid_10_1_ebx_cbm_enabled(void) { return CPUID_10_1_EBX_CBM; }
+uint32_t rdt_cpuid_10_1_ecx_cdp_enabled(void) { return CPUID_10_1_ECX_CDP; }
 uint32_t rdt_get_cpuid_10_1_edx_cos_max(void) { return RDT_MAX_L3_MASK_COUNT; }
+
+uint32_t rdt_get_cpuid_10_2_eax_cbm_length(void) { return 
CPUID_10_2_EAX_CBM_LENGTH; }
+uint32_t rdt_cpuid_10_2_ebx_cbm_enabled(void) { return CPUID_10_2_EBX_CBM; }
 uint32_t rdt_get_cpuid_10_2_edx_cos_max(void) { return RDT_MAX_L2_MASK_COUNT; }
+
+uint32_t rdt_get_cpuid_10_3_eax_thrtl_max(void) { return 
CPUID_10_3_EAX_THRTL_MAX; }
+uint32_t rdt_cpuid_10_3_eax_linear_response_enabled(void) { return 
CPUID_10_3_ECX_LINEAR_RESPONSE; }
 uint32_t rdt_get_cpuid_10_3_edx_cos_max(void) { return 
RDT_MAX_MBA_THRTL_COUNT; }
 
 bool rdt_associate_rmid_cos(uint64_t msr_ia32_pqr_assoc) {
diff --git a/include/hw/i386/rdt.h b/include/hw/i386/rdt.h
index ec82a149f2..57d2fa5b77 100644
--- a/include/hw/i386/rdt.h
+++ b/include/hw/i386/rdt.h
@@ -20,13 +20,44 @@
 #include 
 #include 
 
+/* RDT L3 Cache Monitoring Technology */
+#define CPUID_15_0_EDX_L3   (1U << 1)
+#define CPUID_15_1_EDX_L3_OCCUPANCY (1U << 0)
+#define CPUID_15_1_EDX_L3_TOTAL_BW  (1U << 1)
+#define CPUID_15_1_EDX_L3_LOCAL_BW  (1U << 2)
+
+/* RDT Cache Allocation Technology */
+#define CPUID_10_0_EBX_L3_CAT   (1U << 1)
+#define CPUID_10_0_EBX_L2_CAT   (1U << 2)
+#define CPUID_10_0_EBX_MBA  (1U << 3)
+#define CPUID_10_0_EDX CPUID_10_0_EBX_L3_CAT | CPUID_10_0_EBX_L2_CAT | 
CPUID_10_0_EBX_MBA
+
 typedef struct RDTState RDTState;
 typedef struct RDTStatePerCore RDTStatePerCore;
 typedef struct RDTMonitor RDTMonitor;
 typedef struct RDTAllocation RDTAllocation;
 
+uint32_t rdt_get_cpuid_15_0_edx_l3(void);
+
+uint32_t rdt_cpuid_15_1_edx_l3_total_bw_enabled(void);
+uint32_t rdt_cpuid_15_1_edx_l3_local_bw_enabled(void);
+uint32_t rdt_cpuid_15_1_edx_l3_occupancy_enabled(void);
+
+uint32_t rdt_cpuid_10_0_ebx_l3_cat_enabled(void);
+uint32_t rdt_cpuid_10_0_ebx_l2_cat_enabled(void);
+uint32_t rdt_cpuid_10_0_ebx_l2_mba_enabled(void);
+
+uint32_t rdt_get_cpuid_10_1_eax_cbm_length(void);
+uint32_t rdt_cpuid_10_1_ebx_cbm_enabled(void);
+uint32_t rdt_cpuid_10_1_ecx_cdp_enabled(void);
 uint32_t rdt_get_cpuid_10_1_edx_cos_max(void);
+
+uint32_t rdt_get_cpuid_10_2_eax_cbm_length(void);
+uint32_t rdt_cpuid_10_2_ebx_cbm_enabled(void);
 uint32_t rdt_get_cpuid_10_2_edx_cos_max(void);
+
+uint32_t rdt_get_cpuid_10_3_eax_thrtl_max(void);
+uint32_t rdt_cpuid_10_3_eax_linear_response_enabled(void);
 uint32_t rdt_get_cpuid_10_3_edx_cos_max(void);
 
 bool rdt_associate_rmid_cos(uint64_t msr_ia32_pqr_assoc);
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 4688d140c2..a8198fe5a7 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -42,6 +42,7 @@
 #include "hw/boards.

[PATCH v3 6/8] i386: Add RDT feature flags.

From: ‪Hendrik Wüthrich 

Add RDT features to feature word / TCG.

Signed-off-by: Hendrik Wüthrich 
---
 target/i386/cpu.c | 30 --
 target/i386/cpu.h |  2 ++
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index a8198fe5a7..af1da35985 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -864,7 +864,8 @@ void x86_cpu_vendor_words2str(char *dst, uint32_t vendor1,
   CPUID_7_0_EBX_CLFLUSHOPT |\
   CPUID_7_0_EBX_CLWB | CPUID_7_0_EBX_MPX | CPUID_7_0_EBX_FSGSBASE | \
   CPUID_7_0_EBX_ERMS | CPUID_7_0_EBX_AVX2 | CPUID_7_0_EBX_RDSEED | \
-  CPUID_7_0_EBX_SHA_NI | CPUID_7_0_EBX_KERNEL_FEATURES)
+  CPUID_7_0_EBX_SHA_NI | CPUID_7_0_EBX_KERNEL_FEATURES | \
+  CPUID_7_0_EBX_PQM | CPUID_7_0_EBX_PQE)
   /* missing:
   CPUID_7_0_EBX_HLE
   CPUID_7_0_EBX_INVPCID, CPUID_7_0_EBX_RTM */
@@ -900,6 +901,7 @@ void x86_cpu_vendor_words2str(char *dst, uint32_t vendor1,
 #define TCG_SGX_12_0_EAX_FEATURES 0
 #define TCG_SGX_12_0_EBX_FEATURES 0
 #define TCG_SGX_12_1_EAX_FEATURES 0
+#define TCG_RDT_15_0_EDX_FEATURES CPUID_15_0_EDX_L3
 
 #if defined CONFIG_USER_ONLY
 #define CPUID_8000_0008_EBX_KERNEL_FEATURES (CPUID_8000_0008_EBX_IBPB | \
@@ -1057,7 +1059,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = {
 "fsgsbase", "tsc-adjust", "sgx", "bmi1",
 "hle", "avx2", NULL, "smep",
 "bmi2", "erms", "invpcid", "rtm",
-NULL, NULL, "mpx", NULL,
+"rdt-m", NULL, "mpx", "rdt-a",
 "avx512f", "avx512dq", "rdseed", "adx",
 "smap", "avx512ifma", "pcommit", "clflushopt",
 "clwb", "intel-pt", "avx512pf", "avx512er",
@@ -1607,6 +1609,30 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = {
 },
 .tcg_features = TCG_SGX_12_1_EAX_FEATURES,
 },
+
+[FEAT_RDT_10_0_EBX] = {
+.type = CPUID_FEATURE_WORD,
+.feat_names = {
+NULL, "l3-cat", "l2-cat", "mba"
+},
+.cpuid = {
+.eax = 0x10,
+.needs_ecx = true, .ecx = 0,
+.reg = R_EBX,
+}
+},
+[FEAT_RDT_15_0_EDX] = {
+.type = CPUID_FEATURE_WORD,
+.feat_names = {
+[1] = "l3-cmt"
+},
+.cpuid = {
+.eax = 0xf,
+.needs_ecx = true, .ecx = 0,
+.reg = R_EDX,
+},
+.tcg_features = TCG_RDT_15_0_EDX_FEATURES,
+},
 };
 
 typedef struct FeatureMask {
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 1520a93927..b9d78f4d4e 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -673,7 +673,9 @@ typedef enum FeatureWord {
 FEAT_XSAVE_XSS_HI, /* CPUID[EAX=0xd,ECX=1].EDX */
 FEAT_7_1_EDX,   /* CPUID[EAX=7,ECX=1].EDX */
 FEAT_7_2_EDX,   /* CPUID[EAX=7,ECX=2].EDX */
+FEAT_RDT_15_0_EBX,  /* CPUID[EAX=0xf,ECX=0].EBX (RDT CMT/MBM) */
 FEAT_RDT_15_0_EDX,  /* CPUID[EAX=0xf,ECX=0].EDX (RDT CMT/MBM) */
+FEAT_RDT_10_0_EBX,  /* CPUID[EAX=0x10,ECX=0].EBX (RDT CAT/MBA) */
 FEATURE_WORDS,
 } FeatureWord;
 
-- 
2.47.0.338.g60cca15819-goog

Re: [PATCH] qga: implement a 'guest-get-load' command

Reviewed-by: Konstantin Kostiuk 

On Mon, Dec 2, 2024 at 2:19 PM Daniel P. Berrangé 
wrote:

> Provide a way to report the process load average, via a new
> 'guest-get-load' command.
>
> This is only implemented for POSIX platforms providing 'getloadavg'.
>
> Example illustrated with qmp-shell:
>
> (QEMU) guest-get-load
> {
> "return": {
> "load15m": 1.546875,
> "load1m": 1.669921875,
> "load5m": 1.9306640625
> }
> }
>
> Windows has no native equivalent API, but it would be possible to
> simulate it as illustrated here (BSD-3-Clause):
>
>   https://github.com/giampaolo/psutil/pull/1485
>
> This is left as an exercise for future contributors.
>
> Signed-off-by: Daniel P. Berrangé 
> ---
>  meson.build  |  1 +
>  qga/commands-posix.c | 20 
>  qga/qapi-schema.json | 37 +
>  3 files changed, 58 insertions(+)
>
> diff --git a/meson.build b/meson.build
> index a290dbfa33..9c65e56fff 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -2619,6 +2619,7 @@ config_host_data.set('CONFIG_SETNS',
> cc.has_function('setns') and cc.has_functio
>  config_host_data.set('CONFIG_SYNCFS', cc.has_function('syncfs'))
>  config_host_data.set('CONFIG_SYNC_FILE_RANGE',
> cc.has_function('sync_file_range'))
>  config_host_data.set('CONFIG_TIMERFD', cc.has_function('timerfd_create'))
> +config_host_data.set('CONFIG_GETLOADAVG', cc.has_function('getloadavg'))
>  config_host_data.set('HAVE_COPY_FILE_RANGE',
> cc.has_function('copy_file_range'))
>  config_host_data.set('HAVE_GETIFADDRS', cc.has_function('getifaddrs'))
>  config_host_data.set('HAVE_GLIB_WITH_SLICE_ALLOCATOR', glib_has_gslice)
> diff --git a/qga/commands-posix.c b/qga/commands-posix.c
> index 636307bedf..6e3c15f539 100644
> --- a/qga/commands-posix.c
> +++ b/qga/commands-posix.c
> @@ -1368,3 +1368,23 @@ char *qga_get_host_name(Error **errp)
>
>  return g_steal_pointer(&hostname);
>  }
> +
> +#ifdef CONFIG_GETLOADAVG
> +GuestLoadAverage *qmp_guest_get_load(Error **errp)
> +{
> +double loadavg[3];
> +GuestLoadAverage *ret = NULL;
> +
> +if (getloadavg(loadavg, G_N_ELEMENTS(loadavg)) < 0) {
> +error_setg_errno(errp, errno,
> + "cannot query load average");
> +return NULL;
> +}
> +
> +ret = g_new0(GuestLoadAverage, 1);
> +ret->load1m = loadavg[0];
> +ret->load5m = loadavg[1];
> +ret->load15m = loadavg[2];
> +return ret;
> +}
> +#endif
> diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json
> index 0537bb7886..995594aaf4 100644
> --- a/qga/qapi-schema.json
> +++ b/qga/qapi-schema.json
> @@ -1843,6 +1843,43 @@
>'if': 'CONFIG_LINUX'
>  }
>
> +
> +##
> +# @GuestLoadAverage:
> +#
> +# Statistics about process load information
> +#
> +# @load1m: 1-minute load average
> +#
> +# @load5m: 5-minute load average
> +#
> +# @load15m: 15-minute load average
> +#
> +# Since: 10.0
> +##
> +{ 'struct': 'GuestLoadAverage',
> +  'data': {
> +  'load1m': 'number',
> +  'load5m': 'number',
> +  'load15m': 'number'
> +  },
> +  'if': 'CONFIG_GETLOADAVG'
> +}
> +
> +##
> +# @guest-get-load:
> +#
> +# Retrieve CPU process load information
> +#
> +# Returns: load information
> +#
> +# Since: 10.0
> +##
> +{ 'command': 'guest-get-load',
> +  'returns': 'GuestLoadAverage',
> +  'if': 'CONFIG_GETLOADAVG'
> +}
> +
>  ##
>  # @GuestNetworkRoute:
>  #
> --
> 2.46.0
>
>

Re: [PATCH 1/4] qga: Don't access global variable in run_agent_once()

Reviewed-by: Konstantin Kostiuk 

On Mon, Nov 4, 2024 at 11:54 AM Michal Privoznik 
wrote:

> The run_agent_once() function is already given GAState via an
> argument. There's no need to access the global ga_state variable
> which points to the argument anyways (thanks to
> initialize_agent()). Worse, some parts of the function use the
> argument and the other use the global variable.  Stick with the
> function argument.
>
> Signed-off-by: Michal Privoznik 
> ---
>  qga/main.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/qga/main.c b/qga/main.c
> index 50186760bf..4a695235f0 100644
> --- a/qga/main.c
> +++ b/qga/main.c
> @@ -1519,7 +1519,7 @@ static int run_agent_once(GAState *s)
>  return EXIT_FAILURE;
>  }
>
> -g_main_loop_run(ga_state->main_loop);
> +g_main_loop_run(s->main_loop);
>
>  if (s->channel) {
>  ga_channel_free(s->channel);
> --
> 2.45.2
>
>

Re: [PATCH 2/4] qga: Invert logic on return value in main()

Hi Michal,

Please fix the issue that Jan mentioned.
Every commit should be logically correct on its own, even if it is part of a series.
Applying this one as-is will cause a regression.

Best Regards,
Konstantin Kostiuk.


On Wed, Nov 6, 2024 at 6:07 PM Ján Tomko  wrote:

> On a Monday in 2024, Michal Privoznik wrote:
> >Current logic on return value ('ret' variable) in main() is error
> >prone. The variable is initialized to EXIT_SUCCESS and then set
> >to EXIT_FAILURE on error paths. This makes it very easy to forget
> >to set the variable to indicate error when adding new error path,
> >as is demonstrated by handling of initialize_agent() failure.
> >It's simply lacking setting of the variable.
> >
> >There's just one case where success should be indicated: when
> >dumping the config ('-D' cmd line argument).
> >
> >To resolve this, initialize the variable to failure value and set
> >it explicitly to success value in that one specific case.
> >
> >Signed-off-by: Michal Privoznik 
> >---
> > qga/main.c | 6 ++
> > 1 file changed, 2 insertions(+), 4 deletions(-)
> >
> >diff --git a/qga/main.c b/qga/main.c
> >index 4a695235f0..c003aacbe0 100644
> >--- a/qga/main.c
> >+++ b/qga/main.c
> >@@ -1579,7 +1579,7 @@ static void stop_agent(GAState *s, bool requested)
> >
> > int main(int argc, char **argv)
> > {
> >-int ret = EXIT_SUCCESS;
> >+int ret = EXIT_FAILURE;
> > GAState *s;
> > GAConfig *config = g_new0(GAConfig, 1);
> > int socket_activation;
> >@@ -1607,7 +1607,6 @@ int main(int argc, char **argv)
> > socket_activation = check_socket_activation();
> > if (socket_activation > 1) {
> > g_critical("qemu-ga only supports listening on one socket");
> >-ret = EXIT_FAILURE;
> > goto end;
> > }
> > if (socket_activation) {
> >@@ -1631,7 +1630,6 @@ int main(int argc, char **argv)
> >
> > if (!config->method) {
> > g_critical("unsupported listen fd type");
> >-ret = EXIT_FAILURE;
> > goto end;
> > }
> > } else if (config->channel_path == NULL) {
> >@@ -1643,13 +1641,13 @@ int main(int argc, char **argv)
> > config->channel_path = g_strdup(QGA_SERIAL_PATH_DEFAULT);
> > } else {
> > g_critical("must specify a path for this channel");
> >-ret = EXIT_FAILURE;
> > goto end;
> > }
> > }
> >
> > if (config->dumpconf) {
> > config_dump(config);
> >+ret = EXIT_SUCCESS;
> > goto end;
> > }
> >
>
> Below this there's another place that misses an EXIT_SUCCESS, on _WIN32
> when config->daemonize is set:
>
>   #ifdef _WIN32
>   if (config->daemonize) {
>   SERVICE_TABLE_ENTRY service_table[] = {
>   { (char *)QGA_SERVICE_NAME, service_main }, { NULL, NULL } };
>   StartServiceCtrlDispatcher(service_table);
>   } else {
>   ret = run_agent(s);
>   }
>   #else
>   ret = run_agent(s);
>   #endif
>
> But after patch 4/4 ret is set to EXIT_SUCCESS in all the cases.
>
> Jano
>

Re: [PATCH v4] tests/functional/aarch64: add tests for FEAT_RME

2024-12-04 Thread Alex Bennée

Pierrick Bouvier  writes:

> This boots an OP-TEE environment, and launches a nested guest VM inside it
> using the Realms feature. We do it for virt and sbsa-ref platforms.
>
> Signed-off-by: Pierrick Bouvier 

Queued to testing/next, thanks.

-- 
Alex Bennée
Virtualisation Tech Lead @ Linaro

Re: [PATCH 3/4] qga: Don't daemonize before channel is initialized

On Mon, Nov 4, 2024 at 11:54 AM Michal Privoznik 
wrote:

> If the agent is set to daemonize but for whatever reason fails to
> init the channel, the error message is lost. Worse, the agent
> daemonizes needlessly and returns success. For instance:
>
>   # qemu-ga -m virtio-serial \
> -p /dev/nonexistent_device \
> -f /run/qemu-ga.pid \
> -t /run \
> -d
>   # echo $?
>   0
>
> This makes it needlessly hard for init scripts to detect a
> failure in qemu-ga startup. Though, they shouldn't pass '-d' in
> the first place.
>
> Let's open the channel first and only after that become a daemon.
>
> Related bug: https://bugs.gentoo.org/810628
>
> Signed-off-by: Michal Privoznik 
> ---
>  qga/main.c | 24 ++--
>  1 file changed, 14 insertions(+), 10 deletions(-)
>
> diff --git a/qga/main.c b/qga/main.c
> index c003aacbe0..6240845f39 100644
> --- a/qga/main.c
> +++ b/qga/main.c
> @@ -1430,7 +1430,6 @@ static GAState *initialize_agent(GAConfig *config,
> int socket_activation)
>  if (config->daemonize) {
>  /* delay opening/locking of pidfile till filesystems are
> unfrozen */
>  s->deferred_options.pid_filepath = config->pid_filepath;
> -become_daemon(NULL);
>  }
>  if (config->log_filepath) {
>  /* delay opening the log file till filesystems are unfrozen */
> @@ -1438,9 +1437,6 @@ static GAState *initialize_agent(GAConfig *config,
> int socket_activation)
>  }
>  ga_disable_logging(s);
>  } else {
> -if (config->daemonize) {
> -become_daemon(config->pid_filepath);
> -}
>  if (config->log_filepath) {
>  FILE *log_file = ga_open_logfile(config->log_filepath);
>  if (!log_file) {
> @@ -1487,6 +1483,20 @@ static GAState *initialize_agent(GAConfig *config,
> int socket_activation)
>
>  ga_apply_command_filters(s);
>
> +if (!channel_init(s, s->config->method, s->config->channel_path,
> +  s->socket_activation ? FIRST_SOCKET_ACTIVATION_FD :
> -1)) {
> +g_critical("failed to initialize guest agent channel");
> +return NULL;
> +}
> +
> +if (config->daemonize) {
> +if (ga_is_frozen(s)) {
> +become_daemon(NULL);
> +} else {
> +become_daemon(config->pid_filepath);
> +}
> +}
> +
>  ga_state = s;
>  return s;
>  }
> @@ -1513,12 +1523,6 @@ static void cleanup_agent(GAState *s)
>
>  static int run_agent_once(GAState *s)
>  {
> -if (!channel_init(s, s->config->method, s->config->channel_path,
> -  s->socket_activation ? FIRST_SOCKET_ACTIVATION_FD :
> -1)) {
> -g_critical("failed to initialize guest agent channel");
> -return EXIT_FAILURE;
> -}
> -


The old flow:
run_agent() calls run_agent_once() in a loop, and
run_agent_once() initializes the channel on every run.

After your changes, the channel is expected to be initialized only once.
This can cause issues on Windows during a driver update:
the default configuration on Windows is QGA + VirtioSerial, with
--retry-path on the command line.
During a driver update (which can happen via Windows Update) the channel will
be closed, so QGA must reinitialize the channel.

Theoretically, the same can happen on Linux with a UNIX socket



>  g_main_loop_run(s->main_loop);
>
>  if (s->channel) {
> --
> 2.45.2
>
>

Re: [PATCH] binfmt: Don't consider riscv{32,64} part of the same family

2024-12-04 Thread Laurent Vivier


Le 03/12/2024 à 10:47, Andrea Bolognani a écrit :

Currently the script won't generate a configuration file that
sets up qemu-user-riscv32 on riscv64, likely under the
assumption that 64-bit RISC-V machines can natively run 32-bit
RISC-V code.

However this functionality, while theoretically possible, in
practice is missing from most commonly available RISC-V hardware
and not enabled at the distro level. So qemu-user-riscv32 really
is the only option to run riscv32 binaries on riscv64.

Make riscv32 and riscv64 each its own family, so that the
configuration file we need to make 32-on-64 userspace emulation
work gets generated.

Link: https://src.fedoraproject.org/rpms/qemu/pull-request/72
Thanks: David Abdurachmanov 
Thanks: Daniel P. Berrangé 
Signed-off-by: Andrea Bolognani 
---
  scripts/qemu-binfmt-conf.sh | 7 ++-
  1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/scripts/qemu-binfmt-conf.sh b/scripts/qemu-binfmt-conf.sh
index 6ef9f118d9..e38b767c24 100755
--- a/scripts/qemu-binfmt-conf.sh
+++ b/scripts/qemu-binfmt-conf.sh
@@ -110,11 +110,11 @@ hppa_family=hppa
  
  riscv32_magic='\x7fELF\x01\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\xf3\x00'

  
riscv32_mask='\xff\xff\xff\xff\xff\xff\xff\x00\xff\xff\xff\xff\xff\xff\xff\xff\xfe\xff\xff\xff'
-riscv32_family=riscv
+riscv32_family=riscv32
  
  riscv64_magic='\x7fELF\x02\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\xf3\x00'

  
riscv64_mask='\xff\xff\xff\xff\xff\xff\xff\x00\xff\xff\xff\xff\xff\xff\xff\xff\xfe\xff\xff\xff'
-riscv64_family=riscv
+riscv64_family=riscv64
  
  xtensa_magic='\x7fELF\x01\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x5e\x00'

  
xtensa_mask='\xff\xff\xff\xff\xff\xff\xff\x00\xff\xff\xff\xff\xff\xff\xff\xff\xfe\xff\xff\xff'
@@ -168,9 +168,6 @@ qemu_get_family() {
  sparc*)
  echo "sparc"
  ;;
-riscv*)
-echo "riscv"
-;;
  loongarch*)
  echo "loongarch"
  ;;


Reviewed-by: Laurent Vivier

Re: [RFC PATCH 0/5] support unaligned access to xHCI Capability

2024-12-04 Thread Tomoyuki HIROSE


On 2024/12/02 23:17, Peter Maydell wrote:

On Fri, 29 Nov 2024 at 03:33, Tomoyuki HIROSE
 wrote:

OK, thanks! I understand now. I thought MemoryRegionOps had to be
'static const'.
I will try to improve code so that it does not require the use of
memaccess-testdev.h.inc .

Great. The other thing I thought of this weekend is that
we should document the behaviour in docs/devel/memory.rst.
We could have a new section there that describes how the
core memory code synthesizes accesses that are permitted
by the .valid settings but not handled by the .impl
settings. That way device model authors can know what
happens without having to read the source code.

OK, I will also write the doc as best I can.

thanks
-- PMM

thanks,
Tomoyuki HIROSE

Re: [PATCH] binfmt: Don't consider riscv{32,64} part of the same family

On Tue, Dec 03, 2024 at 07:57:14AM -0600, Richard Henderson wrote:
> On 12/3/24 04:35, Peter Maydell wrote:
> > On Tue, 3 Dec 2024 at 10:19, Daniel P. Berrangé  wrote:
> > > Separately from this patch, we should also consider whether
> > > it is time to do the same for aarch64/arm7.
> > > 
> > > If I look at this page:
> > > 
> > >https://gpages.juszkiewicz.com.pl/arm-socs-table/arm-socs.html
> > > 
> > > and sort by 'announced' to see the most recent CPUs first, then
> > > almost all of them have "NO" in the "aarch32 support" column.
> > > 
> > > IOW, on modern aarch64 CPUs, qemu-arm is the only viable way
> > > to run 32-bit usermode binaries AFAICT, and suggests we ought
> > > to be creating a binfmt rule for that on aarch64 hosts.
> > 
> > What happens if you have a host CPU that *does* support 32-bit
> > natively and you also register the binfmt rule? Does the
> > host kernel prefer to execute natively or does it invoke
> > QEMU? I don't think we want to roll out something that
> > silently downgrades native execution to emulation...
> 
> The registered rule applies and the kernel invokes qemu.

This is all quite difficult from a distro POV, but not QEMU's fault.

We want to install the binfmt rules in a way that we "do the right thing"
regardless of hardware out of the box.

The systemd logic for loading binfmt rules is unconditional, loading
everything from /usr/lib/binfmt.d, but we need a way to make things
conditional on the lack of support for aarch32 on the currently running
platform.

With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|

Re: [PATCH 2/5] tests/functional: Extract the find_free_ports() function into a helper file

On Wed, Dec 04, 2024 at 08:19:08AM +0100, Thomas Huth wrote:
> We'll need this functionality in other functional tests, too, so
> let's extract it into the qemu_test module.
> Also add  an __enter__ and __exit__ function that can be used for
> using this functionality in a locked context, so that tests that
> are running in parallel don't try to compete for the same ports
> later.
> 
> Signed-off-by: Thomas Huth 
> ---
>  tests/functional/qemu_test/ports.py | 53 +
>  tests/functional/test_vnc.py| 36 +---
>  2 files changed, 61 insertions(+), 28 deletions(-)
>  create mode 100644 tests/functional/qemu_test/ports.py
> 
> diff --git a/tests/functional/qemu_test/ports.py 
> b/tests/functional/qemu_test/ports.py
> new file mode 100644
> index 00..d235d3432b
> --- /dev/null
> +++ b/tests/functional/qemu_test/ports.py
> @@ -0,0 +1,53 @@
> +#!/usr/bin/env python3
> +#
> +# Simple functional tests for VNC functionality
> +#
> +# Copyright 2018, 2024 Red Hat, Inc.
> +#
> +# This work is licensed under the terms of the GNU GPL, version 2 or
> +# later.  See the COPYING file in the top-level directory.
> +
> +import fcntl
> +import os
> +import socket
> +import sys
> +import tempfile
> +from typing import List
> +
> +class Ports():
> +
> +PORTS_ADDR = '127.0.0.1'
> +PORTS_START = 32768
> +PORTS_END = PORTS_START + 1024
> +
> +def __enter__(self):
> +lock_file = os.path.join(tempfile.gettempdir(), "qemu_port_lock")
> +self.lock_fh = os.open(lock_file, os.O_CREAT)
> +fcntl.flock(self.lock_fh, fcntl.LOCK_EX)
> +return self
> +
> +def __exit__(self, exc_type, exc_value, traceback):
> +fcntl.flock(self.lock_fh, fcntl.LOCK_UN)
> +os.close(self.lock_fh)
> +
> +def check_bind(self, port: int) -> bool:
> +with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
> +try:
> +sock.bind((self.PORTS_ADDR, port))
> +except OSError:
> +return False
> +
> +return True
> +
> +def find_free_ports(self, count: int) -> List[int]:
> +result = []
> +for port in range(self.PORTS_START, self.PORTS_END):
> +if self.check_bind(port):
> +result.append(port)
> +if len(result) >= count:
> +break
> +assert len(result) == count
> +return result
> +
> +def find_free_port(self) -> int:
> +return self.find_free_ports(1)[0]
> diff --git a/tests/functional/test_vnc.py b/tests/functional/test_vnc.py
> index b769d3b268..32a81259e4 100755
> --- a/tests/functional/test_vnc.py
> +++ b/tests/functional/test_vnc.py
> @@ -14,22 +14,9 @@
>  from typing import List
>  
>  from qemu_test import QemuSystemTest
> -
> +from qemu_test.ports import Ports
>  
>  VNC_ADDR = '127.0.0.1'
> -VNC_PORT_START = 32768
> -VNC_PORT_END = VNC_PORT_START + 1024
> -
> -
> -def check_bind(port: int) -> bool:
> -with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
> -try:
> -sock.bind((VNC_ADDR, port))
> -except OSError:
> -return False
> -
> -return True
> -
>  
>  def check_connect(port: int) -> bool:
>  with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
> @@ -40,18 +27,6 @@ def check_connect(port: int) -> bool:
>  
>  return True
>  
> -
> -def find_free_ports(count: int) -> List[int]:
> -result = []
> -for port in range(VNC_PORT_START, VNC_PORT_END):
> -if check_bind(port):
> -result.append(port)
> -if len(result) >= count:
> -break
> -assert len(result) == count
> -return result
> -
> -
>  class Vnc(QemuSystemTest):
>  
>  def test_no_vnc(self):
> @@ -90,8 +65,7 @@ def test_change_password(self):
>  self.vm.cmd('change-vnc-password',
>  password='new_password')
>  
> -def test_change_listen(self):
> -a, b, c = find_free_ports(3)
> +def do_test_change_listen(self, a, b, c):
>  self.assertFalse(check_connect(a))
>  self.assertFalse(check_connect(b))
>  self.assertFalse(check_connect(c))
> @@ -113,5 +87,11 @@ def test_change_listen(self):
>  self.assertTrue(check_connect(b))
>  self.assertTrue(check_connect(c))
>  
> +def test_change_listen(self):
> +with Ports() as ports:
> +a, b, c = ports.find_free_ports(3)
> +self.do_test_change_listen(a, b, c)

I think it would be possible to implement a decorator using "Ports"
such that we don't need to manually wrap the methods, and just receive
the list of port numbers as arguments.

e.g. to enable this pattern:

@findFreePorts(3)
def test_change_listen(self, ports):
 use 'ports' list 

Fully untested, but I think something approximately like this would
work:

   def findFreePorts(num)
 def findFreePortsDeco(func):
   def wrapper(*

Re: [PATCH v4 13/15] acpi/ghes: move offset calculus to a separate function

On Wed, 4 Dec 2024 09:56:35 +0100
Mauro Carvalho Chehab  wrote:

> Em Wed, 4 Dec 2024 08:54:40 +0100
> Igor Mammedov  escreveu:
> 
> > On Tue, 3 Dec 2024 14:47:30 +0100
> > Mauro Carvalho Chehab  wrote:
> >   
> > > Em Tue, 3 Dec 2024 12:51:43 +0100
> > > Igor Mammedov  escreveu:
> > > 
> > > > On Fri, 22 Nov 2024 10:11:30 +0100
> > > > Mauro Carvalho Chehab  wrote:
> > > >   
> > > > > Currently, CPER address location is calculated as an offset of
> > > > > the hardware_errors table. It is also badly named, as the
> > > > > offset actually used is the address where the CPER data starts,
> > > > > and not the beginning of the error source.
> > > > > 
> > > > > Move the logic which calculates such offset to a separate
> > > > > function, in preparation for a patch that will be changing the
> > > > > logic to calculate it from the HEST table.
> > > > > 
> > > > > While here, properly name the variable which stores the cper
> > > > > address.
> > > > > 
> > > > > Signed-off-by: Mauro Carvalho Chehab 
> > > > > Reviewed-by: Jonathan Cameron 
> > > > > ---
> > > > >  hw/acpi/ghes.c | 41 -
> > > > >  1 file changed, 32 insertions(+), 9 deletions(-)
> > > > > 
> > > > > diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c
> > > > > index 87fd3feedd2a..d99697b20164 100644
> > > > > --- a/hw/acpi/ghes.c
> > > > > +++ b/hw/acpi/ghes.c
> > > > > @@ -364,10 +364,37 @@ void acpi_ghes_add_fw_cfg(AcpiGhesState *ags, 
> > > > > FWCfgState *s,
> > > > >  ags->present = true;
> > > > >  }
> > > > >  
> > > > > +static void get_hw_error_offsets(uint64_t ghes_addr,
> > > > > + uint64_t *cper_addr,
> > > > > + uint64_t *read_ack_register_addr)
> > > > > +{
> > > > 
> > > >   
> > > > > +if (!ghes_addr) {
> > > > > +return;
> > > > > +}
> > > > 
> > > > why do we need this check?  
> > > 
> > > It is a safeguard measure to avoid crashes and OOM access. If fw_cfg 
> > > callback doesn't fill it properly, this will be zero.
> > 
> > shouldn't happen, but yeah, it is the firmware's job to write back the addr,
> > so it might happen for whatever reason (a bug, for example).
> >  
> 
> The main reason I added it is that, after the second series, it could 
> also happen if there's something wrong with the backward compat logic.
> 
> So, both here and after switching to HEST-based offsets, I opted
> to explicitly test.
> 
> > Perhaps push this up to the stack, so we don't have to deal
> > with scattered checks in ghes code.
> > 
> > kvm_arch_on_sigbus_vcpu() looks like a good candidate for the check
> > and warn_once if that ever happens.
> > It already calls acpi_ghes_present() which resolves GED device
> > and then later we duplicate this job in ghes_record_cper_errors()
> > 
> > so maybe rename acpi_ghes_present to something like AcpiGhesState* 
> > acpi_ghes_get_state()
> > and call it instead. And then move ghes_addr check/warn_once there.
> > This way the rest of the ghes code won't have to deal with handling practically
> > impossible error conditions that cause the reader to wonder why they might
> > happen.
> 
> I'll look on it. Yet, if ok for you, I would prefer dealing with this
> once we have a bigger picture, e.g. once we merge those three series:
> 
>   - cleanup series (this one);
>   - HEST offset (I'll be sending a new version today);
OK, let's revisit this point after this series.
Since at this point we should have a clean picture of how new code
works.

>   - error_inject.
> 
> Thanks,
> Mauro
>

Re: [PATCH 4/5] tests/functional/test_vnc: Remove the test_no_vnc test

On Wed, Dec 04, 2024 at 08:19:10AM +0100, Thomas Huth wrote:
> This test matches exactly the first three lines of the following
> test_no_vnc_change_password test, so there is exactly zero additional
> test coverage in here.
> 
> Signed-off-by: Thomas Huth 
> ---
>  tests/functional/test_vnc.py | 5 -
>  1 file changed, 5 deletions(-)

Reviewed-by: Daniel P. Berrangé 


With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|

Re: [PATCH 2/5] tests/functional: Extract the find_free_ports() function into a helper file

On Wed, Dec 04, 2024 at 08:19:08AM +0100, Thomas Huth wrote:
> We'll need this functionality in other functional tests, too, so
> let's extract it into the qemu_test module.
> Also add  an __enter__ and __exit__ function that can be used for
> using this functionality in a locked context, so that tests that
> are running in parallel don't try to compete for the same ports
> later.
> 
> Signed-off-by: Thomas Huth 
> ---
>  tests/functional/qemu_test/ports.py | 53 +
>  tests/functional/test_vnc.py| 36 +---
>  2 files changed, 61 insertions(+), 28 deletions(-)
>  create mode 100644 tests/functional/qemu_test/ports.py
> 
> diff --git a/tests/functional/qemu_test/ports.py 
> b/tests/functional/qemu_test/ports.py
> new file mode 100644
> index 00..d235d3432b
> --- /dev/null
> +++ b/tests/functional/qemu_test/ports.py
> @@ -0,0 +1,53 @@
> +#!/usr/bin/env python3
> +#
> +# Simple functional tests for VNC functionality
> +#
> +# Copyright 2018, 2024 Red Hat, Inc.
> +#
> +# This work is licensed under the terms of the GNU GPL, version 2 or
> +# later.  See the COPYING file in the top-level directory.
> +
> +import fcntl
> +import os
> +import socket
> +import sys
> +import tempfile
> +from typing import List
> +
> +class Ports():
> +
> +PORTS_ADDR = '127.0.0.1'
> +PORTS_START = 32768
> +PORTS_END = PORTS_START + 1024
> +
> +def __enter__(self):
> +lock_file = os.path.join(tempfile.gettempdir(), "qemu_port_lock")
> +self.lock_fh = os.open(lock_file, os.O_CREAT)
> +fcntl.flock(self.lock_fh, fcntl.LOCK_EX)
> +return self
> +
> +def __exit__(self, exc_type, exc_value, traceback):
> +fcntl.flock(self.lock_fh, fcntl.LOCK_UN)
> +os.close(self.lock_fh)

This code will leave '/tmp/qemu_port_lock' existing forever which is
correct, because if you try to unlink it after closing, you'll introduce
a race because the 2nd __enter__ will now be locking an unlinked file,
and a 3rd __enter__ that comes along will create & lock an entirely new
file.

There are ways to make this safe by using stat + fstat either side of
LOCK_EX, in a loop, to detect locking of an unlinked file. That is
overkill though.  It is simpler to just put the lock file in the build
directory IMHO, and thus avoid needing to care about unlinking - that'll
be done when the user purges their build dir.

> +
> +def check_bind(self, port: int) -> bool:
> +with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
> +try:
> +sock.bind((self.PORTS_ADDR, port))
> +except OSError:
> +return False
> +
> +return True
> +
> +def find_free_ports(self, count: int) -> List[int]:
> +result = []
> +for port in range(self.PORTS_START, self.PORTS_END):
> +if self.check_bind(port):
> +result.append(port)
> +if len(result) >= count:
> +break
> +assert len(result) == count
> +return result
> +
> +def find_free_port(self) -> int:
> +return self.find_free_ports(1)[0]
> diff --git a/tests/functional/test_vnc.py b/tests/functional/test_vnc.py
> index b769d3b268..32a81259e4 100755
> --- a/tests/functional/test_vnc.py
> +++ b/tests/functional/test_vnc.py
> @@ -14,22 +14,9 @@
>  from typing import List
>  
>  from qemu_test import QemuSystemTest
> -
> +from qemu_test.ports import Ports
>  
>  VNC_ADDR = '127.0.0.1'
> -VNC_PORT_START = 32768
> -VNC_PORT_END = VNC_PORT_START + 1024
> -
> -
> -def check_bind(port: int) -> bool:
> -with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
> -try:
> -sock.bind((VNC_ADDR, port))
> -except OSError:
> -return False
> -
> -return True
> -
>  
>  def check_connect(port: int) -> bool:
>  with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
> @@ -40,18 +27,6 @@ def check_connect(port: int) -> bool:
>  
>  return True
>  
> -
> -def find_free_ports(count: int) -> List[int]:
> -result = []
> -for port in range(VNC_PORT_START, VNC_PORT_END):
> -if check_bind(port):
> -result.append(port)
> -if len(result) >= count:
> -break
> -assert len(result) == count
> -return result
> -
> -
>  class Vnc(QemuSystemTest):
>  
>  def test_no_vnc(self):
> @@ -90,8 +65,7 @@ def test_change_password(self):
>  self.vm.cmd('change-vnc-password',
>  password='new_password')
>  
> -def test_change_listen(self):
> -a, b, c = find_free_ports(3)
> +def do_test_change_listen(self, a, b, c):
>  self.assertFalse(check_connect(a))
>  self.assertFalse(check_connect(b))
>  self.assertFalse(check_connect(c))
> @@ -113,5 +87,11 @@ def test_change_listen(self):
>  self.assertTrue(check_connect(b))
>  self.assertTrue(check_connect(c))
>  
> +def t

Re: [PATCH v3 0/9] Require error handling for dynamically created objects

2024-12-04 Thread Markus Armbruster

Daniel P. Berrangé  writes:

> NB, this series is targeting 10.0, NOT for 9.2 freeze.
>
> With code like
>
> Object *obj = object_new(TYPE_BLAH)
>
> the caller can be pretty confident that they will successfully create
> an object instance of TYPE_BLAH. They know exactly what type has been
> requested, so if they pass an abstract type, for example, it is a clear
> programmer error for which they'll get an assertion failure.
>
> Conversely with code like
>
>void somefunc(const char *typename) {
>   Object * obj = object_new(typename)
>   ...
>}
>
> all bets are off, because the call of object_new() knows nothing
> about what 'typename' resolves to.

We know nothing *locally*.

Commonly, a non-local argument can demonstrate safety.  Only when the
type name comes from the user, we truly know nothing.

>It could easily be an abstract
> type.

It could also be no type at all.

>   As a result, many code paths have added a manual check ahead
> of time
>
>if (object_class_is_abstract(typename)) {
>   error_setg(errp, )
>}

Actually, object_class_is_abstract() takes an ObjectClass, not a type
name string.

The actual guards we use are variations of

klass = object_class_by_name(typename);
if (!klass) {
error_setg(errp, "invalid object type: %s", typename);
return NULL;
}

if (object_class_is_abstract(klass)) {
error_setg(errp, "object type '%s' is abstract", typename);
return NULL;
}

which covers "no type at all", too.

Sometimes, we use module_object_class_by_name() instead, which I believe
additionally loads the module providing the type, if any.  Which of the
two should be used where is a mystery to me, and I suspect we're getting
it wrong in places.  But this is turning into a digression.  To
hopefully maintain focus, I'm pretending modules don't exist until later
in this message.

Sometimes, we throw in an object_class_dynamic_cast(klass, T) to check
@typename resolves to a subtype of some T.

> ...except for where we forget to do this, such as qdev_new().

We did not forget it there!  It's by design a thin wrapper around
object_new(), with preconditions just like object_new().

> Overall 'object_new' is a bad design because it is inherently
> unsafe to call with unvalidated typenames.

To be fair, object_new() was not designed for use with user-provided
type names.  When it chokes on type names not provided by the user, it's
clearly a programming error, and assert() is a perfectly fine way to
catch programming errors.  Same for qdev_new().

However, we do in fact use these functions with user-provided type
names, if rarely.  When we do, we need to validate the type name before
we pass it to them.

Trouble is the validation code is a bit involved, and reimplementing it
everywhere it's needed is asking for bugs.

Creating and using more interfaces that are more convenient for this
purpose would avoid that.

> This problem is made worse by the proposal to introduce the idea
> of 'singleton' classes[1].
>
> Thus, this series suggests a way to improve safety at build
> time. The core idea is to allow 'object_new' to continue to be
> used *if-and-only-if* given a static, const string, because that
> scenario indicates the caller is aware of what type they are
> creating at build time.
>
> A new 'object_new_dynamic' method is proposed for cases where
> the typename is dynamically chosen at runtime. This method has
> an "Error **errp" parameter, which can report when an abstract
> type is created, leaving the assert()s only for scenarios which
> are unambiguous programmer errors.
>
> With a little macro magic, we guarantee a compile error is
> generated if 'object_new' is called with a dynamic type, forcing
> all potentially unsafe code over to object_new_dynamic.

Three cases:

1. Type name is literal string.  No change.  This is the most common
   case.

2. It's not.

2a. Type name is user-provided.  This is rare.  We replace

if (... guard ...) {
... return failure ...
}
obj = object_new(...);

by

obj = object_new_dynamic(..., errp);
if (!obj) {
... return failure ...
}

This is an improvement.

2b. It's not.  We should replace

obj = object_new(...);

by

obj = object_new_dynamic(..., &error_abort);

Exact same behavior, just wordier, to placate the compiler.
Tolerable as long as it's relatively rare.

But I'm not sure it's worthwhile.  All it really does is helping
some towards not getting case 2a wrong.  But 2a is rare.

Same for similar interfaces, e.g. qdev_new().

> This is more tractable than adding 'Error **errp' to 'object_new'
> as only a handful of places use a dynamic type name.

True!

Alright, enter modules.

Modules break a fundamental design assumption: object_new() on a
compiled-in type name is safe, i.e. the failure modes are all
programming errors.

Modules ad

Re: [PATCH 25/67] target/arm: Remove helper_sqrt_f16

2024-12-04 Thread Philippe Mathieu-Daudé


On 1/12/24 16:05, Richard Henderson wrote:

This function is identical with helper_vfp_sqrth.
Replace all uses.

Signed-off-by: Richard Henderson 
---
  target/arm/tcg/helper-a64.h|  1 -
  target/arm/tcg/helper-a64.c| 11 ---
  target/arm/tcg/translate-a64.c |  4 ++--
  3 files changed, 2 insertions(+), 14 deletions(-)


Reviewed-by: Philippe Mathieu-Daudé

Re: [PATCH 1/1] hw/arm/sbsa-ref: bump default memory size to 2GB

2024-12-04 Thread Leif Lindholm


On 2024-12-02 10:53, Marcin Juszkiewicz wrote:

W dniu 26.11.2024 o 14:14, Peter Maydell pisze:

On Tue, 26 Nov 2024 at 08:49, Marcin Juszkiewicz
 wrote:


We are working on adding RME support to SBSA Reference Platform.
When RME is enabled then RMM (Realm Management Monitor) takes 1072MB of
memory for its own use, which ends with a firmware panic on a 1GB machine.


Reasonable change, but isn't it also a bug in the RMM that it
grabs 1GB of RAM regardless of how much RAM the machine
actually has?


I think that the goal is "get it working" first and then optimize.


I agree on a different platform this could feel quite hacky, but in 
reality even 2GB falls within "ridiculously low for an SBSA platform".


If we're worried about overhead for CI jobs that do not require the 
feature, we could always conditionalize it on RME being enabled. But I'd 
be happy to wait and see.


Reviewed-by: Leif Lindholm

Re: [PATCH] hw/timer/nrf51_timer: prevent integer overflow

2024-12-04 Thread Anastasia Belova




On 12/3/24 7:46 PM, Peter Maydell wrote:

On Tue, 3 Dec 2024 at 16:25, Anastasia Belova  wrote:

Both counter and tick are uint32_t and the result
of their addition may not fit this type. Add
explicit casting to uint64_t.

Found by Linux Verification Center (linuxtesting.org) with SVACE.

Fixes: c5a4829c08 ("hw/timer/nrf51_timer: Add nRF51 Timer peripheral")
Signed-off-by: Anastasia Belova 
---
  hw/timer/nrf51_timer.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/timer/nrf51_timer.c b/hw/timer/nrf51_timer.c
index 35b0e62d5b..b5ff235eb8 100644
--- a/hw/timer/nrf51_timer.c
+++ b/hw/timer/nrf51_timer.c
@@ -44,7 +44,7 @@ static uint32_t update_counter(NRF51TimerState *s, int64_t 
now)
  {
  uint32_t ticks = ns_to_ticks(s, now - s->update_counter_ns);

-s->counter = (s->counter + ticks) % BIT(bitwidths[s->bitmode]);
+s->counter = ((uint64_t)s->counter + ticks) % BIT(bitwidths[s->bitmode]);

Can you explain when adding the cast makes a difference?
Since s->counter is 32 bits and ticks is 32 bits and
the RHS of the % is a power of 2, it's not clear to
me that keeping the top 32 bits in the addition and then
discarding it after the % is any different from only
taking the bottom 32 bits of the addition.


You're right. I was sure this situation invokes UB.

Thanks for your patience,
Anastasia Belova

Re: [PATCH v4 08/11] target/riscv: Add counter delegation/configuration support

2024-12-04 Thread Daniel Henrique Barboza





On 12/3/24 8:14 PM, Atish Patra wrote:

From: Kaiwen Xue 

The Smcdeleg/Ssccfg adds the support for counter delegation via
S*indcsr and Ssccfg.

It also adds a new shadow CSR scountinhibit and menvcfg enable bit (CDE)
to enable this extension and scountovf virtualization.

Signed-off-by: Kaiwen Xue 
Co-developed-by: Atish Patra 
Signed-off-by: Atish Patra 
---


Reviewed-by: Daniel Henrique Barboza 


  target/riscv/csr.c | 304 ++---
  1 file changed, 292 insertions(+), 12 deletions(-)

diff --git a/target/riscv/csr.c b/target/riscv/csr.c
index 0985dbdca76d..a77b6ed4c9f3 100644
--- a/target/riscv/csr.c
+++ b/target/riscv/csr.c
@@ -383,6 +383,21 @@ static RISCVException aia_smode32(CPURISCVState *env, int 
csrno)
  return smode32(env, csrno);
  }
  
+static RISCVException scountinhibit_pred(CPURISCVState *env, int csrno)

+{
+RISCVCPU *cpu = env_archcpu(env);
+
+if (!cpu->cfg.ext_ssccfg || !cpu->cfg.ext_smcdeleg) {
+return RISCV_EXCP_ILLEGAL_INST;
+}
+
+if (env->virt_enabled) {
+return RISCV_EXCP_VIRT_INSTRUCTION_FAULT;
+}
+
+return smode(env, csrno);
+}
+
  static bool csrind_extensions_present(CPURISCVState *env)
  {
  return riscv_cpu_cfg(env)->ext_smcsrind || 
riscv_cpu_cfg(env)->ext_sscsrind;
@@ -1220,10 +1235,9 @@ done:
  return result;
  }
  
-static RISCVException write_mhpmcounter(CPURISCVState *env, int csrno,

-target_ulong val)
+static RISCVException riscv_pmu_write_ctr(CPURISCVState *env, target_ulong val,
+  uint32_t ctr_idx)
  {
-int ctr_idx = csrno - CSR_MCYCLE;
  PMUCTRState *counter = &env->pmu_ctrs[ctr_idx];
  uint64_t mhpmctr_val = val;
  
@@ -1248,10 +1262,9 @@ static RISCVException write_mhpmcounter(CPURISCVState *env, int csrno,

  return RISCV_EXCP_NONE;
  }
  
-static RISCVException write_mhpmcounterh(CPURISCVState *env, int csrno,

- target_ulong val)
+static RISCVException riscv_pmu_write_ctrh(CPURISCVState *env, target_ulong 
val,
+  uint32_t ctr_idx)
  {
-int ctr_idx = csrno - CSR_MCYCLEH;
  PMUCTRState *counter = &env->pmu_ctrs[ctr_idx];
  uint64_t mhpmctr_val = counter->mhpmcounter_val;
  uint64_t mhpmctrh_val = val;
@@ -1273,6 +1286,20 @@ static RISCVException write_mhpmcounterh(CPURISCVState 
*env, int csrno,
  return RISCV_EXCP_NONE;
  }
  
+static int write_mhpmcounter(CPURISCVState *env, int csrno, target_ulong val)

+{
+int ctr_idx = csrno - CSR_MCYCLE;
+
+return riscv_pmu_write_ctr(env, val, ctr_idx);
+}
+
+static int write_mhpmcounterh(CPURISCVState *env, int csrno, target_ulong val)
+{
+int ctr_idx = csrno - CSR_MCYCLEH;
+
+return riscv_pmu_write_ctrh(env, val, ctr_idx);
+}
+
  RISCVException riscv_pmu_read_ctr(CPURISCVState *env, target_ulong *val,
   bool upper_half, uint32_t ctr_idx)
  {
@@ -1338,6 +1365,167 @@ static RISCVException read_hpmcounterh(CPURISCVState 
*env, int csrno,
  return riscv_pmu_read_ctr(env, val, true, ctr_index);
  }
  
+static int rmw_cd_mhpmcounter(CPURISCVState *env, int ctr_idx,

+  target_ulong *val, target_ulong new_val,
+  target_ulong wr_mask)
+{
+if (wr_mask != 0 && wr_mask != -1) {
+return -EINVAL;
+}
+
+if (!wr_mask && val) {
+riscv_pmu_read_ctr(env, val, false, ctr_idx);
+} else if (wr_mask) {
+riscv_pmu_write_ctr(env, new_val, ctr_idx);
+} else {
+return -EINVAL;
+}
+
+return 0;
+}
+
+static int rmw_cd_mhpmcounterh(CPURISCVState *env, int ctr_idx,
+   target_ulong *val, target_ulong new_val,
+   target_ulong wr_mask)
+{
+if (wr_mask != 0 && wr_mask != -1) {
+return -EINVAL;
+}
+
+if (!wr_mask && val) {
+riscv_pmu_read_ctr(env, val, true, ctr_idx);
+} else if (wr_mask) {
+riscv_pmu_write_ctrh(env, new_val, ctr_idx);
+} else {
+return -EINVAL;
+}
+
+return 0;
+}
+
+static int rmw_cd_mhpmevent(CPURISCVState *env, int evt_index,
+target_ulong *val, target_ulong new_val,
+target_ulong wr_mask)
+{
+uint64_t mhpmevt_val = new_val;
+
+if (wr_mask != 0 && wr_mask != -1) {
+return -EINVAL;
+}
+
+if (!wr_mask && val) {
+*val = env->mhpmevent_val[evt_index];
+if (riscv_cpu_cfg(env)->ext_sscofpmf) {
+*val &= ~MHPMEVENT_BIT_MINH;
+}
+} else if (wr_mask) {
+wr_mask &= ~MHPMEVENT_BIT_MINH;
+mhpmevt_val = (new_val & wr_mask) |
+  (env->mhpmevent_val[evt_index] & ~wr_mask);
+if (riscv_cpu_mxl(env) == MXL_RV32) {
+mhpmevt_val = mhpmevt_val |
+  ((uint64_t)env->mhpmeventh_

Re: [PATCH v4 04/11] target/riscv: Support generic CSR indirect access

2024-12-04 Thread Daniel Henrique Barboza





On 12/3/24 8:14 PM, Atish Patra wrote:

From: Kaiwen Xue 

This adds the indirect access registers required by sscsrind/smcsrind
and the operations on them. Note that xiselect and xireg are used for
both AIA and sxcsrind, and the behavior of accessing them depends on
whether each extension is enabled and the value stored in xiselect.

Co-developed-by: Atish Patra 
Signed-off-by: Atish Patra 
Signed-off-by: Kaiwen Xue 
---


Reviewed-by: Daniel Henrique Barboza 


  target/riscv/cpu_bits.h |  28 +-
  target/riscv/csr.c  | 144 ++--
  2 files changed, 166 insertions(+), 6 deletions(-)

diff --git a/target/riscv/cpu_bits.h b/target/riscv/cpu_bits.h
index 385a2c67c24b..e13c5420a251 100644
--- a/target/riscv/cpu_bits.h
+++ b/target/riscv/cpu_bits.h
@@ -173,6 +173,13 @@
  #define CSR_MISELECT0x350
  #define CSR_MIREG   0x351
  
+/* Machine Indirect Register Alias */

+#define CSR_MIREG2  0x352
+#define CSR_MIREG3  0x353
+#define CSR_MIREG4  0x355
+#define CSR_MIREG5  0x356
+#define CSR_MIREG6  0x357
+
  /* Machine-Level Interrupts (AIA) */
  #define CSR_MTOPEI  0x35c
  #define CSR_MTOPI   0xfb0
@@ -222,6 +229,13 @@
  #define CSR_SISELECT0x150
  #define CSR_SIREG   0x151
  
+/* Supervisor Indirect Register Alias */

+#define CSR_SIREG2  0x152
+#define CSR_SIREG3  0x153
+#define CSR_SIREG4  0x155
+#define CSR_SIREG5  0x156
+#define CSR_SIREG6  0x157
+
  /* Supervisor-Level Interrupts (AIA) */
  #define CSR_STOPEI  0x15c
  #define CSR_STOPI   0xdb0
@@ -288,6 +302,13 @@
  #define CSR_VSISELECT   0x250
  #define CSR_VSIREG  0x251
  
+/* Virtual Supervisor Indirect Alias */

+#define CSR_VSIREG2 0x252
+#define CSR_VSIREG3 0x253
+#define CSR_VSIREG4 0x255
+#define CSR_VSIREG5 0x256
+#define CSR_VSIREG6 0x257
+
  /* VS-Level Interrupts (H-extension with AIA) */
  #define CSR_VSTOPEI 0x25c
  #define CSR_VSTOPI  0xeb0
@@ -863,10 +884,13 @@ typedef enum RISCVException {
  #define ISELECT_IMSIC_EIE630xff
  #define ISELECT_IMSIC_FIRSTISELECT_IMSIC_EIDELIVERY
  #define ISELECT_IMSIC_LAST ISELECT_IMSIC_EIE63
-#define ISELECT_MASK   0x1ff
+#define ISELECT_MASK_AIA   0x1ff
+
+/* MISELECT, SISELECT, and VSISELECT bits (AIA) */
+#define ISELECT_MASK_SXCSRIND  0xfff
  
  /* Dummy [M|S|VS]ISELECT value for emulating [M|S|VS]TOPEI CSRs */

-#define ISELECT_IMSIC_TOPEI(ISELECT_MASK + 1)
+#define ISELECT_IMSIC_TOPEI(ISELECT_MASK_AIA + 1)
  
  /* IMSIC bits (AIA) */

  #define IMSIC_TOPEI_IID_SHIFT  16
diff --git a/target/riscv/csr.c b/target/riscv/csr.c
index c91a26a52ef6..424e9dbbd4ff 100644
--- a/target/riscv/csr.c
+++ b/target/riscv/csr.c
@@ -306,6 +306,15 @@ static RISCVException aia_any32(CPURISCVState *env, int 
csrno)
  return any32(env, csrno);
  }
  
+static RISCVException csrind_any(CPURISCVState *env, int csrno)

+{
+if (!riscv_cpu_cfg(env)->ext_smcsrind) {
+return RISCV_EXCP_ILLEGAL_INST;
+}
+
+return RISCV_EXCP_NONE;
+}
+
  static RISCVException csrind_or_aia_any(CPURISCVState *env, int csrno)
  {
  if (!riscv_cpu_cfg(env)->ext_smaia && !riscv_cpu_cfg(env)->ext_smcsrind) {
@@ -389,6 +398,15 @@ static bool csrind_or_aia_extensions_present(CPURISCVState 
*env)
  return csrind_extensions_present(env) || aia_extensions_present(env);
  }
  
+static RISCVException csrind_smode(CPURISCVState *env, int csrno)

+{
+if (!csrind_extensions_present(env)) {
+return RISCV_EXCP_ILLEGAL_INST;
+}
+
+return smode(env, csrno);
+}
+
  static RISCVException csrind_or_aia_smode(CPURISCVState *env, int csrno)
  {
  if (!csrind_or_aia_extensions_present(env)) {
@@ -417,6 +435,15 @@ static RISCVException hmode32(CPURISCVState *env, int 
csrno)
  
  }
  
+static RISCVException csrind_hmode(CPURISCVState *env, int csrno)

+{
+if (!csrind_extensions_present(env)) {
+return RISCV_EXCP_ILLEGAL_INST;
+}
+
+return hmode(env, csrno);
+}
+
  static RISCVException csrind_or_aia_hmode(CPURISCVState *env, int csrno)
  {
  if (!csrind_or_aia_extensions_present(env)) {
@@ -2065,7 +2092,12 @@ static int csrind_xlate_vs_csrno(CPURISCVState *env, int 
csrno)
  case CSR_SISELECT:
  return CSR_VSISELECT;
  case CSR_SIREG:
-return CSR_VSIREG;
+case CSR_SIREG2:
+case CSR_SIREG3:
+case CSR_SIREG4:
+case CSR_SIREG5:
+case CSR_SIREG6:
+return CSR_VSIREG + (csrno - CSR_SIREG);
  default:
  return csrno;
  };
@@ -2105,7 +2137,12 @@ static RISCVException rmw_xiselect(CPURISCVState *env, 
int csrno,
  *val = *iselect;
  }
  
-wr_mask &= ISELECT_MASK;

+if (riscv_cpu_cfg(env)->ext_smcsrind || riscv_cpu_cfg(e

[PATCH v4 1/7] target/riscv: Remove obsolete sfence.vm instruction

Signed-off-by: Rajnesh Kanwal 
Reviewed-by: Alistair Francis 
Reviewed-by: Jason Chien 
---
 target/riscv/insn32.decode | 1 -
 target/riscv/insn_trans/trans_privileged.c.inc | 5 -
 2 files changed, 6 deletions(-)

diff --git a/target/riscv/insn32.decode b/target/riscv/insn32.decode
index 
e9139ec1b9cfdb2dc5029dd28430933a2b4e1442..a2b4c0ddd47ad9464b4b180fb19e6a3b64dbe4e5
 100644
--- a/target/riscv/insn32.decode
+++ b/target/riscv/insn32.decode
@@ -119,7 +119,6 @@ sret000100000010 0 000 0 1110011
 mret001100000010 0 000 0 1110011
 wfi 000100000101 0 000 0 1110011
 sfence_vma  0001001. . 000 0 1110011 @sfence_vma
-sfence_vm   000100000100 . 000 0 1110011 @sfence_vm
 
 # *** RV32I Base Instruction Set ***
 lui     . 0110111 @u
diff --git a/target/riscv/insn_trans/trans_privileged.c.inc 
b/target/riscv/insn_trans/trans_privileged.c.inc
index 
ecd3b8b2c9e6c698f63c9bd3b3e5758426fcfe63..0bdfa9a0ed3313223ce9032fb24484c3887cddf9
 100644
--- a/target/riscv/insn_trans/trans_privileged.c.inc
+++ b/target/riscv/insn_trans/trans_privileged.c.inc
@@ -127,8 +127,3 @@ static bool trans_sfence_vma(DisasContext *ctx, 
arg_sfence_vma *a)
 #endif
 return false;
 }
-
-static bool trans_sfence_vm(DisasContext *ctx, arg_sfence_vm *a)
-{
-return false;
-}

-- 
2.34.1

[PATCH v4 2/7] target/riscv: Add Control Transfer Records CSR definitions.

The Control Transfer Records (CTR) extension provides a method to
record a limited branch history in register-accessible internal chip
storage.

This extension is similar to Arch LBR in x86 and BRBE in ARM.
The Extension has been stable and the latest release can be found here
https://github.com/riscv/riscv-control-transfer-records/releases/tag/v1.0_rc5

Signed-off-by: Rajnesh Kanwal 
Acked-by: Alistair Francis 
---
 target/riscv/cpu_bits.h | 94 +
 1 file changed, 94 insertions(+)

diff --git a/target/riscv/cpu_bits.h b/target/riscv/cpu_bits.h
index 
4ac065ac5e5a688d5ec9bbb8288c3deb82f05314..0cf6ef133ce9565f4a19e99f3cfd1d73da77f47a
 100644
--- a/target/riscv/cpu_bits.h
+++ b/target/riscv/cpu_bits.h
@@ -247,6 +247,17 @@
 #define CSR_SIEH0x114
 #define CSR_SIPH0x154
 
+/* Machine-Level Control transfer records CSRs */
+#define CSR_MCTRCTL 0x34e
+
+/* Supervisor-Level Control transfer records CSRs */
+#define CSR_SCTRCTL 0x14e
+#define CSR_SCTRSTATUS  0x14f
+#define CSR_SCTRDEPTH   0x15f
+
+/* VS-Level Control transfer records CSRs */
+#define CSR_VSCTRCTL0x24e
+
 /* Hpervisor CSRs */
 #define CSR_HSTATUS 0x600
 #define CSR_HEDELEG 0x602
@@ -344,6 +355,7 @@
 #define SMSTATEEN0_CS   (1ULL << 0)
 #define SMSTATEEN0_FCSR (1ULL << 1)
 #define SMSTATEEN0_JVT  (1ULL << 2)
+#define SMSTATEEN0_CTR  (1ULL << 54)
 #define SMSTATEEN0_P1P13(1ULL << 56)
 #define SMSTATEEN0_HSCONTXT (1ULL << 57)
 #define SMSTATEEN0_IMSIC(1ULL << 58)
@@ -877,6 +889,88 @@ typedef enum RISCVException {
 #define UMTE_U_PM_INSN  U_PM_INSN
 #define UMTE_MASK (UMTE_U_PM_ENABLE | MMTE_U_PM_CURRENT | UMTE_U_PM_INSN)
 
+/* CTR control register common fields */
+#define XCTRCTL_U  BIT_ULL(0)
+#define XCTRCTL_S  BIT_ULL(1)
+#define XCTRCTL_RASEMU BIT_ULL(7)
+#define XCTRCTL_STEBIT_ULL(8)
+#define XCTRCTL_BPFRZ  BIT_ULL(11)
+#define XCTRCTL_LCOFIFRZ   BIT_ULL(12)
+#define XCTRCTL_EXCINH BIT_ULL(33)
+#define XCTRCTL_INTRINHBIT_ULL(34)
+#define XCTRCTL_TRETINHBIT_ULL(35)
+#define XCTRCTL_NTBREN BIT_ULL(36)
+#define XCTRCTL_TKBRINHBIT_ULL(37)
+#define XCTRCTL_INDCALLINH BIT_ULL(40)
+#define XCTRCTL_DIRCALLINH BIT_ULL(41)
+#define XCTRCTL_INDJMPINH  BIT_ULL(42)
+#define XCTRCTL_DIRJMPINH  BIT_ULL(43)
+#define XCTRCTL_CORSWAPINH BIT_ULL(44)
+#define XCTRCTL_RETINH BIT_ULL(45)
+#define XCTRCTL_INDLJMPINH BIT_ULL(46)
+#define XCTRCTL_DIRLJMPINH BIT_ULL(47)
+
+#define XCTRCTL_MASK (XCTRCTL_U | XCTRCTL_S | XCTRCTL_RASEMU |\
+  XCTRCTL_STE | XCTRCTL_BPFRZ | XCTRCTL_LCOFIFRZ |\
+  XCTRCTL_EXCINH | XCTRCTL_INTRINH | XCTRCTL_TRETINH |\
+  XCTRCTL_NTBREN | XCTRCTL_TKBRINH | XCTRCTL_INDCALLINH | \
+  XCTRCTL_DIRCALLINH | XCTRCTL_INDJMPINH |\
+  XCTRCTL_DIRJMPINH | XCTRCTL_CORSWAPINH |\
+  XCTRCTL_RETINH | XCTRCTL_INDLJMPINH | XCTRCTL_DIRLJMPINH)
+
+#define XCTRCTL_INH_START 32U
+
+/* CTR mctrctl bits */
+#define MCTRCTL_M BIT_ULL(2)
+#define MCTRCTL_MTE   BIT_ULL(9)
+
+#define MCTRCTL_MASK  (XCTRCTL_MASK | MCTRCTL_M | MCTRCTL_MTE)
+#define SCTRCTL_MASK  XCTRCTL_MASK
+#define VSCTRCTL_MASK XCTRCTL_MASK
+
+/* sctrstatus CSR bits. */
+#define SCTRSTATUS_WRPTR_MASK   0xFF
+#define SCTRSTATUS_FROZEN   BIT(31)
+#define SCTRSTATUS_MASK (SCTRSTATUS_WRPTR_MASK | SCTRSTATUS_FROZEN)
+
+/* sctrdepth CSR bits. */
+#define SCTRDEPTH_MASK  0x7
+#define SCTRDEPTH_MIN   0U  /* 16 Entries. */
+#define SCTRDEPTH_MAX   4U  /* 256 Entries. */
+
+#define CTR_ENTRIES_FIRST   0x200
+#define CTR_ENTRIES_LAST0x2ff
+
+#define CTRSOURCE_VALID BIT(0)
+#define CTRTARGET_MISP  BIT(0)
+
+#define CTRDATA_TYPE_MASK   0xF
+#define CTRDATA_CCV BIT(15)
+#define CTRDATA_CCM_MASK0xFFF
+#define CTRDATA_CCE_MASK0xF000
+
+#define CTRDATA_MASK(CTRDATA_TYPE_MASK | CTRDATA_CCV |  \
+ CTRDATA_CCM_MASK | CTRDATA_CCE_MASK)
+
+typedef enum CTRType {
+CTRDATA_TYPE_NONE   = 0,
+CTRDATA_TYPE_EXCEPTION  = 1,
+CTRDATA_TYPE_INTERRUPT  = 2,
+CTRDATA_TYPE_EXCEP_INT_RET  = 3,
+CTRDATA_TYPE_NONTAKEN_BRANCH= 4,
+CTRDATA_TYPE_TAKEN_BRANCH   = 5,
+CTRDATA_TYPE_RESERVED_0 = 6,
+CTRDATA_TYPE_RESERVED_1 = 7,
+CTRDATA_TYPE_INDIRECT_CALL  = 8,
+CTRDATA_TYPE_DIRECT_CALL= 9,
+CTRDATA_TYPE_INDIRECT_JUMP  = 10,
+CTRDATA_TYPE_DIRECT_JUMP=

Re: [PATCH RESEND v1] target/riscv: add support for RV64 Xiangshan Nanhu CPU

2024-12-04 Thread Daniel Henrique Barboza


Hi,

Can you please re-send the patch rebased on top of:

https://github.com/alistair23/qemu/tree/riscv-to-apply.next

This is the branch we use to queue RISC-V patches that are pending upstreaming.


Thanks,

Daniel

On 12/4/24 12:15 AM, MollyChen wrote:

Add a CPU entry for the RV64 XiangShan NANHU CPU which
supports single-core and dual-core configurations. More
details can be found at
https://docs.xiangshan.cc/zh-cn/latest/integration/overview

Signed-off-by: MollyChen 
---
  target/riscv/cpu-qom.h |  1 +
  target/riscv/cpu.c | 29 +
  2 files changed, 30 insertions(+)

diff --git a/target/riscv/cpu-qom.h b/target/riscv/cpu-qom.h
index 62115375cd..8f6fac463c 100644
--- a/target/riscv/cpu-qom.h
+++ b/target/riscv/cpu-qom.h
@@ -49,6 +49,7 @@
  #define TYPE_RISCV_CPU_SIFIVE_U54   RISCV_CPU_TYPE_NAME("sifive-u54")
  #define TYPE_RISCV_CPU_THEAD_C906   RISCV_CPU_TYPE_NAME("thead-c906")
  #define TYPE_RISCV_CPU_VEYRON_V1RISCV_CPU_TYPE_NAME("veyron-v1")
+#define TYPE_RISCV_CPU_XIANGSHAN_NANHU  RISCV_CPU_TYPE_NAME("xiangshan-nanhu")
  #define TYPE_RISCV_CPU_HOST RISCV_CPU_TYPE_NAME("host")
  
  OBJECT_DECLARE_CPU_TYPE(RISCVCPU, RISCVCPUClass, RISCV_CPU)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index f219f0c3b5..738d833115 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -579,6 +579,34 @@ static void rv64_veyron_v1_cpu_init(Object *obj)
  #endif
  }
  
+static void rv64_xiangshan_nanhu_cpu_init(Object *obj)

+{
+CPURISCVState *env = &RISCV_CPU(obj)->env;
+RISCVCPU *cpu = RISCV_CPU(obj);
+
+riscv_cpu_set_misa_ext(env, RVG | RVC | RVB | RVS | RVU);
+env->priv_ver = PRIV_VERSION_1_12_0;
+
+/* Enable ISA extensions */
+cpu->cfg.ext_zbc = true;
+cpu->cfg.ext_zbkb = true;
+cpu->cfg.ext_zbkc = true;
+cpu->cfg.ext_zbkx = true;
+cpu->cfg.ext_zknd = true;
+cpu->cfg.ext_zkne = true;
+cpu->cfg.ext_zknh = true;
+cpu->cfg.ext_zksed = true;
+cpu->cfg.ext_zksh = true;
+cpu->cfg.ext_svinval = true;
+
+cpu->cfg.mmu = true;
+cpu->cfg.pmp = true;
+
+#ifndef CONFIG_USER_ONLY
+set_satp_mode_max_supported(cpu, VM_1_10_SV39);
+#endif
+}
+
  #ifdef CONFIG_TCG
  static void rv128_base_cpu_init(Object *obj)
  {
@@ -2983,6 +3011,7 @@ static const TypeInfo riscv_cpu_type_infos[] = {
  DEFINE_VENDOR_CPU(TYPE_RISCV_CPU_SHAKTI_C,   MXL_RV64,  
rv64_sifive_u_cpu_init),
  DEFINE_VENDOR_CPU(TYPE_RISCV_CPU_THEAD_C906, MXL_RV64,  
rv64_thead_c906_cpu_init),
  DEFINE_VENDOR_CPU(TYPE_RISCV_CPU_VEYRON_V1,  MXL_RV64,  
rv64_veyron_v1_cpu_init),
+DEFINE_VENDOR_CPU(TYPE_RISCV_CPU_XIANGSHAN_NANHU, MXL_RV64, 
rv64_xiangshan_nanhu_cpu_init),
  #ifdef CONFIG_TCG
  DEFINE_DYNAMIC_CPU(TYPE_RISCV_CPU_BASE128,   MXL_RV128, 
rv128_base_cpu_init),
  #endif /* CONFIG_TCG */

[PATCH v4 3/7] target/riscv: Add support for Control Transfer Records extension CSRs.

This commit adds support for handling the [m|s|vs]ctrctl, sctrstatus and
sctrdepth CSRs.

Signed-off-by: Rajnesh Kanwal 
---
 target/riscv/cpu.h |   5 ++
 target/riscv/cpu_cfg.h |   2 +
 target/riscv/csr.c | 144 +
 3 files changed, 151 insertions(+)

diff --git a/target/riscv/cpu.h b/target/riscv/cpu.h
index 
903268626374474306f0e0259f37128326b354d4..da14ac2f874b81d3f01bc31b0064d020f2dbdf61
 100644
--- a/target/riscv/cpu.h
+++ b/target/riscv/cpu.h
@@ -305,6 +305,11 @@ struct CPUArchState {
 target_ulong mcause;
 target_ulong mtval;  /* since: priv-1.10.0 */
 
+uint64_t mctrctl;
+uint32_t sctrdepth;
+uint32_t sctrstatus;
+uint64_t vsctrctl;
+
 /* Machine and Supervisor interrupt priorities */
 uint8_t miprio[64];
 uint8_t siprio[64];
diff --git a/target/riscv/cpu_cfg.h b/target/riscv/cpu_cfg.h
index 
ae2b019703fe4849eb7f264b4d90743d4c013b86..e365a368d71a695b1b99c3b6ae330347143d3422
 100644
--- a/target/riscv/cpu_cfg.h
+++ b/target/riscv/cpu_cfg.h
@@ -130,6 +130,8 @@ struct RISCVCPUConfig {
 bool ext_zvfhmin;
 bool ext_smaia;
 bool ext_ssaia;
+bool ext_smctr;
+bool ext_ssctr;
 bool ext_sscofpmf;
 bool ext_smepmp;
 bool rvv_ta_all_1s;
diff --git a/target/riscv/csr.c b/target/riscv/csr.c
index 
31ea8b8ec20db5a5af23e829757cccaafc02e2da..7e03065d3dcd8713e2cadae3017ed355c9f9bf10
 100644
--- a/target/riscv/csr.c
+++ b/target/riscv/csr.c
@@ -651,6 +651,48 @@ static RISCVException pointer_masking(CPURISCVState *env, 
int csrno)
 return RISCV_EXCP_ILLEGAL_INST;
 }
 
+/*
+ * M-mode:
+ * Without ext_smctr raise illegal inst excep.
+ * Otherwise everything is accessible to m-mode.
+ *
+ * S-mode:
+ * Without ext_ssctr or mstateen.ctr raise illegal inst excep.
+ * Otherwise everything other than mctrctl is accessible.
+ *
+ * VS-mode:
+ * Without ext_ssctr or mstateen.ctr raise illegal inst excep.
+ * Without hstateen.ctr raise virtual illegal inst excep.
+ * Otherwise allow sctrctl (vsctrctl), sctrstatus, 0x200-0x2ff entry range.
+ * Always raise illegal instruction exception for sctrdepth.
+ */
+static RISCVException ctr_mmode(CPURISCVState *env, int csrno)
+{
+/* Check if smctr-ext is present */
+if (riscv_cpu_cfg(env)->ext_smctr) {
+return RISCV_EXCP_NONE;
+}
+
+return RISCV_EXCP_ILLEGAL_INST;
+}
+
+static RISCVException ctr_smode(CPURISCVState *env, int csrno)
+{
+const RISCVCPUConfig *cfg = riscv_cpu_cfg(env);
+
+if (!cfg->ext_smctr && !cfg->ext_ssctr) {
+return RISCV_EXCP_ILLEGAL_INST;
+}
+
+RISCVException ret = smstateen_acc_ok(env, 0, SMSTATEEN0_CTR);
+if (ret == RISCV_EXCP_NONE && csrno == CSR_SCTRDEPTH &&
+env->virt_enabled) {
+return RISCV_EXCP_VIRT_INSTRUCTION_FAULT;
+}
+
+return ret;
+}
+
 static RISCVException aia_hmode(CPURISCVState *env, int csrno)
 {
 int ret;
@@ -3160,6 +3202,10 @@ static RISCVException write_mstateen0(CPURISCVState 
*env, int csrno,
 wr_mask |= (SMSTATEEN0_AIA | SMSTATEEN0_IMSIC);
 }
 
+if (riscv_cpu_cfg(env)->ext_ssctr) {
+wr_mask |= SMSTATEEN0_CTR;
+}
+
 return write_mstateen(env, csrno, wr_mask, new_val);
 }
 
@@ -3199,6 +3245,10 @@ static RISCVException write_mstateen0h(CPURISCVState 
*env, int csrno,
 wr_mask |= SMSTATEEN0_P1P13;
 }
 
+if (riscv_cpu_cfg(env)->ext_ssctr) {
+wr_mask |= SMSTATEEN0_CTR;
+}
+
 return write_mstateenh(env, csrno, wr_mask, new_val);
 }
 
@@ -3253,6 +3303,10 @@ static RISCVException write_hstateen0(CPURISCVState 
*env, int csrno,
 wr_mask |= (SMSTATEEN0_AIA | SMSTATEEN0_IMSIC);
 }
 
+if (riscv_cpu_cfg(env)->ext_ssctr) {
+wr_mask |= SMSTATEEN0_CTR;
+}
+
 return write_hstateen(env, csrno, wr_mask, new_val);
 }
 
@@ -3292,6 +3346,10 @@ static RISCVException write_hstateen0h(CPURISCVState 
*env, int csrno,
 {
 uint64_t wr_mask = SMSTATEEN_STATEEN | SMSTATEEN0_HSENVCFG;
 
+if (riscv_cpu_cfg(env)->ext_ssctr) {
+wr_mask |= SMSTATEEN0_CTR;
+}
+
 return write_hstateenh(env, csrno, wr_mask, new_val);
 }
 
@@ -4005,6 +4063,86 @@ static RISCVException write_satp(CPURISCVState *env, int 
csrno,
 return RISCV_EXCP_NONE;
 }
 
+static RISCVException rmw_sctrdepth(CPURISCVState *env, int csrno,
+target_ulong *ret_val,
+target_ulong new_val, target_ulong wr_mask)
+{
+uint64_t mask = wr_mask & SCTRDEPTH_MASK;
+
+if (ret_val) {
+*ret_val = env->sctrdepth;
+}
+
+env->sctrdepth = (env->sctrdepth & ~mask) | (new_val & mask);
+
+/* Correct depth. */
+if (mask) {
+uint64_t depth = get_field(env->sctrdepth, SCTRDEPTH_MASK);
+
+if (depth > SCTRDEPTH_MAX) {
+depth = SCTRDEPTH_MAX;
+env->sctrdepth = set_field(env->sctrdepth, SCTRDEPTH_MASK, depth);
+}
+
+/* Update sctrstatus.WRPTR with a legal value *

[PATCH v4 6/7] target/riscv: Add support to access ctrsource, ctrtarget, ctrdata regs.

CTR entries are accessed through the ctrsource, ctrtarget and ctrdata
registers using the smcsrind/sscsrind extension. This commit extends
the csrind extension to support CTR registers.

ctrsource is accessible through the xireg CSR, ctrtarget is accessible
through the xireg2 CSR and ctrdata is accessible through the xireg3 CSR.

CTR supports maximum depth of 256 entries which are accessed using
xiselect range 0x200 to 0x2ff.

This commit also adds properties to enable the CTR extension. CTR can now
be enabled using smctr=true and ssctr=true.

Signed-off-by: Rajnesh Kanwal 
---
 target/riscv/cpu.c |  26 +++-
 target/riscv/csr.c | 150 -
 target/riscv/tcg/tcg-cpu.c |  11 
 3 files changed, 185 insertions(+), 2 deletions(-)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index 
2a4f285a974ffc62e7f3e938691dbffe376a7e46..751029e924d4690aaa5de65456fd5a5ec25b916a
 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -199,6 +199,8 @@ const RISCVIsaExtData isa_edata_arr[] = {
 ISA_EXT_DATA_ENTRY(sstvala, PRIV_VERSION_1_12_0, has_priv_1_12),
 ISA_EXT_DATA_ENTRY(sstvecd, PRIV_VERSION_1_12_0, has_priv_1_12),
 ISA_EXT_DATA_ENTRY(svade, PRIV_VERSION_1_11_0, ext_svade),
+ISA_EXT_DATA_ENTRY(smctr, PRIV_VERSION_1_12_0, ext_smctr),
+ISA_EXT_DATA_ENTRY(ssctr, PRIV_VERSION_1_12_0, ext_ssctr),
 ISA_EXT_DATA_ENTRY(svadu, PRIV_VERSION_1_12_0, ext_svadu),
 ISA_EXT_DATA_ENTRY(svinval, PRIV_VERSION_1_12_0, ext_svinval),
 ISA_EXT_DATA_ENTRY(svnapot, PRIV_VERSION_1_12_0, ext_svnapot),
@@ -1481,6 +1483,8 @@ const RISCVCPUMultiExtConfig riscv_cpu_extensions[] = {
 MULTI_EXT_CFG_BOOL("smcdeleg", ext_smcdeleg, false),
 MULTI_EXT_CFG_BOOL("sscsrind", ext_sscsrind, false),
 MULTI_EXT_CFG_BOOL("ssccfg", ext_ssccfg, false),
+MULTI_EXT_CFG_BOOL("smctr", ext_smctr, false),
+MULTI_EXT_CFG_BOOL("ssctr", ext_ssctr, false),
 MULTI_EXT_CFG_BOOL("zifencei", ext_zifencei, true),
 MULTI_EXT_CFG_BOOL("zicfilp", ext_zicfilp, false),
 MULTI_EXT_CFG_BOOL("zicfiss", ext_zicfiss, false),
@@ -2656,6 +2660,26 @@ static RISCVCPUImpliedExtsRule SSCFG_IMPLIED = {
 },
 };
 
+static RISCVCPUImpliedExtsRule SMCTR_IMPLIED = {
+.ext = CPU_CFG_OFFSET(ext_smctr),
+.implied_misa_exts = RVS,
+.implied_multi_exts = {
+CPU_CFG_OFFSET(ext_sscsrind),
+
+RISCV_IMPLIED_EXTS_RULE_END
+},
+};
+
+static RISCVCPUImpliedExtsRule SSCTR_IMPLIED = {
+.ext = CPU_CFG_OFFSET(ext_ssctr),
+.implied_misa_exts = RVS,
+.implied_multi_exts = {
+CPU_CFG_OFFSET(ext_sscsrind),
+
+RISCV_IMPLIED_EXTS_RULE_END
+},
+};
+
 RISCVCPUImpliedExtsRule *riscv_misa_ext_implied_rules[] = {
 &RVA_IMPLIED, &RVD_IMPLIED, &RVF_IMPLIED,
 &RVM_IMPLIED, &RVV_IMPLIED, NULL
@@ -2674,7 +2698,7 @@ RISCVCPUImpliedExtsRule *riscv_multi_ext_implied_rules[] 
= {
 &ZVFH_IMPLIED, &ZVFHMIN_IMPLIED, &ZVKN_IMPLIED,
 &ZVKNC_IMPLIED, &ZVKNG_IMPLIED, &ZVKNHB_IMPLIED,
 &ZVKS_IMPLIED,  &ZVKSC_IMPLIED, &ZVKSG_IMPLIED, &SSCFG_IMPLIED,
-NULL
+&SMCTR_IMPLIED, &SSCTR_IMPLIED, NULL
 };
 
 static Property riscv_cpu_properties[] = {
diff --git a/target/riscv/csr.c b/target/riscv/csr.c
index 
7e03065d3dcd8713e2cadae3017ed355c9f9bf10..d80684a708891e062393deebe880650fb4df44ab
 100644
--- a/target/riscv/csr.c
+++ b/target/riscv/csr.c
@@ -2401,6 +2401,13 @@ static bool xiselect_cd_range(target_ulong isel)
 return (ISELECT_CD_FIRST <= isel && isel <= ISELECT_CD_LAST);
 }
 
+static bool xiselect_ctr_range(int csrno, target_ulong isel)
+{
+/* MIREG-MIREG6 for the range 0x200-0x2ff are not used by CTR. */
+return CTR_ENTRIES_FIRST <= isel && isel <= CTR_ENTRIES_LAST &&
+   csrno < CSR_MIREG;
+}
+
 static int rmw_iprio(target_ulong xlen,
  target_ulong iselect, uint8_t *iprio,
  target_ulong *val, target_ulong new_val,
@@ -2446,6 +2453,124 @@ static int rmw_iprio(target_ulong xlen,
 return 0;
 }
 
+static int rmw_ctrsource(CPURISCVState *env, int isel, target_ulong *val,
+  target_ulong new_val, target_ulong wr_mask)
+{
+/*
+ * CTR arrays are treated as circular buffers and TOS always points to next
+ * empty slot, keeping TOS - 1 always pointing to latest entry. Given entry
+ * 0 is always the latest one, traversal is a bit different here. See the
+ * below example.
+ *
+ * Depth = 16.
+ *
+ * idx[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [A] [B] [C] [D] [E] [F]
+ * TOS H
+ * entry   6   5   4   3   2   1   0   F   E   D   C   B   A   9   8   7
+ */
+const uint64_t entry = isel - CTR_ENTRIES_FIRST;
+const uint64_t depth = 16 << get_field(env->sctrdepth, SCTRDEPTH_MASK);
+uint64_t idx;
+
+/* Entry greater than depth-1 is read-only zero */
+if (entry >= depth) {
+if (val) {
+*val = 0;
+}
+return 0;
+}
+
+idx = get_

[PATCH v4 7/7] target/riscv: machine: Add Control Transfer Record state description

Add a subsection to machine.c to migrate CTR CSR state

Signed-off-by: Rajnesh Kanwal 
---
 target/riscv/machine.c | 25 +
 1 file changed, 25 insertions(+)

diff --git a/target/riscv/machine.c b/target/riscv/machine.c
index 
e1bdc31c7c53a8a4f539113d501c8e46f7a914e9..b67e660ef03b6053fa00d5a79e2ab20ecf3331b8
 100644
--- a/target/riscv/machine.c
+++ b/target/riscv/machine.c
@@ -311,6 +311,30 @@ static const VMStateDescription vmstate_envcfg = {
 }
 };
 
+static bool ctr_needed(void *opaque)
+{
+RISCVCPU *cpu = opaque;
+
+return cpu->cfg.ext_smctr || cpu->cfg.ext_ssctr;
+}
+
+static const VMStateDescription vmstate_ctr = {
+.name = "cpu/ctr",
+.version_id = 1,
+.minimum_version_id = 1,
+.needed = ctr_needed,
+.fields = (const VMStateField[]) {
+VMSTATE_UINT64(env.mctrctl, RISCVCPU),
+VMSTATE_UINT32(env.sctrdepth, RISCVCPU),
+VMSTATE_UINT32(env.sctrstatus, RISCVCPU),
+VMSTATE_UINT64(env.vsctrctl, RISCVCPU),
+VMSTATE_UINT64_ARRAY(env.ctr_src, RISCVCPU, 16 << SCTRDEPTH_MAX),
+VMSTATE_UINT64_ARRAY(env.ctr_dst, RISCVCPU, 16 << SCTRDEPTH_MAX),
+VMSTATE_UINT64_ARRAY(env.ctr_data, RISCVCPU, 16 << SCTRDEPTH_MAX),
+VMSTATE_END_OF_LIST()
+}
+};
+
 static bool pmu_needed(void *opaque)
 {
 RISCVCPU *cpu = opaque;
@@ -461,6 +485,7 @@ const VMStateDescription vmstate_riscv_cpu = {
 &vmstate_jvt,
 &vmstate_elp,
 &vmstate_ssp,
+&vmstate_ctr,
 NULL
 }
 };

-- 
2.34.1

Re: [PATCH v3 0/6] target/riscv: Add support for Control Transfer Records Ext.

On Tue, Nov 5, 2024 at 3:58 PM Richard Henderson
 wrote:
>
> On 11/4/24 21:51, Rajnesh Kanwal wrote:
> >   target/riscv/cpu.c |  26 ++-
> >   target/riscv/cpu.h |  13 ++
> >   target/riscv/cpu_bits.h|  94 
> >   target/riscv/cpu_cfg.h |   2 +
> >   target/riscv/cpu_helper.c  | 266 
> > ++
> >   target/riscv/csr.c | 294 
> > -
> >   target/riscv/helper.h  |   9 +-
> >   target/riscv/insn32.decode |   2 +-
> >   target/riscv/insn_trans/trans_privileged.c.inc |  22 +-
> >   target/riscv/insn_trans/trans_rvi.c.inc|  31 +++
> >   target/riscv/insn_trans/trans_rvzce.c.inc  |  20 ++
> >   target/riscv/op_helper.c   | 155 -
> >   target/riscv/tcg/tcg-cpu.c |  11 +
> >   target/riscv/translate.c   |  10 +
> >   14 files changed, 941 insertions(+), 14 deletions(-)
>
> You're missing code in machine.c to migrate the new state.
>

Nice catch. Thanks for your feedback Richard. I have fixed
all your feedback in v4.

- Rajnesh

>
> r~

Re: [PATCH v5 08/16] acpi/ghes: don't check if physical_address is not zero

On Wed,  4 Dec 2024 08:41:16 +0100
Mauro Carvalho Chehab  wrote:

> The 'physical_address' value is a faulty page. As such, 0 is
> as valid as any other value.
> 
> Suggested-by: Igor Mammedov 
> Signed-off-by: Mauro Carvalho Chehab 

Reviewed-by: Igor Mammedov 

> ---
>  hw/acpi/ghes.c | 4 
>  1 file changed, 4 deletions(-)
> 
> diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c
> index edc74c38bf8a..a3dffd78b012 100644
> --- a/hw/acpi/ghes.c
> +++ b/hw/acpi/ghes.c
> @@ -400,10 +400,6 @@ int acpi_ghes_record_errors(uint16_t source_id, uint64_t 
> physical_address)
>  
>  start_addr = le64_to_cpu(ags->ghes_addr_le);
>  
> -if (!physical_address) {
> -return -1;
> -}
> -
>  start_addr += source_id * sizeof(uint64_t);
>  
>  cpu_physical_memory_read(start_addr, &error_block_addr,

[PATCH v4 0/7] target/riscv: Add support for Control Transfer Records Ext.

This series enables Control Transfer Records extension support on riscv
platform. This extension is similar to Arch LBR in x86 and BRBE in ARM.
The Extension has been stable and this series is based on v1.0_rc6 [0]

The CTR extension depends on both the implementation of S-mode and the Sscsrind
extension v1.0.0 [1]. CTR accesses the ctrsource, ctrtarget and ctrdata CSRs
using the Sscsrind extension.

The series is based on Smcdeleg/Ssccfg counter delegation extension [2]
patches [3]. CTR itself doesn't depend on counter delegation support. This
rebase is basically to include the Smcsrind patches.

Here is the link to a quick start guide [4] to setup and run a basic perf demo
on Linux to use CTR Ext.

Qemu patches can be found here:
https://github.com/rajnesh-kanwal/qemu/tree/b4/ctr_upstream_v4

Opensbi patch can be found here:
https://github.com/rajnesh-kanwal/opensbi/tree/ctr_upstream_v2

Linux kernel patches can be found here:
https://github.com/rajnesh-kanwal/linux/tree/b4/ctr_upstream_v2

[0]:
https://github.com/riscv/riscv-control-transfer-records/releases/tag/v1.0_rc6
[1]:
https://github.com/riscvarchive/riscv-indirect-csr-access/releases/tag/v1.0.0
[2]: https://github.com/riscvarchive/riscv-smcdeleg-ssccfg/releases/tag/v1.0.0
[3]: https://lore.kernel.org/all/20240217000134.3634191-1-ati...@rivosinc.com/
[4]:
https://github.com/rajnesh-kanwal/linux/wiki/Running-CTR-basic-demo-on-QEMU-RISC%E2%80%90V-Virt-machine

Signed-off-by: Rajnesh Kanwal
---
Changelog:

v4: Improvements based on Richard Henderson's feedback.
- Refactored CTR related code generation to move more code into
translation side and avoid unnecessary code execution in generated
code.
- Added missing code in machine.c to migrate the new state.

v3: Improvements based on Jason Chien and Frank Chang's feedback.
- Created a single set of macros for CTR CSRs in cpu_bits.h
- Some fixes in riscv_ctr_add_entry.
- Return zero for vs/sireg4-6 for CTR 0x200 to 0x2ff range.
- Improved extension dependency check.
- Fixed invalid ctrctl csr selection bug in riscv_ctr_freeze.
- Added implied rules for Smctr and Ssctr.
- Added missing SMSTATEEN0_CTR bit in mstateen0 and hstateen0 write ops.
- Some more cosmetic changes.
-
https://lore.kernel.org/qemu-riscv/20241104-b4-ctr_upstream_v3-v3-0-32fd3c482...@rivosinc.com/

v2: Lots of improvements based on Jason Chien's feedback including:
- Added CTR recording for cm.jalt, cm.jt, cm.popret, cm.popretz.
- Fixed and added more CTR extension enable checks.
- Fixed CTR CSR predicate functions.
- Fixed external trap xTE bit checks.
- One fix in freeze function for VS-mode.
- Lots of minor code improvements.
- Added checks in sctrclr instruction helper.
-
https://lore.kernel.org/qemu-riscv/20240619152708.135991-1-rkan...@rivosinc.com/

v1:
-
https://lore.kernel.org/qemu-riscv/20240529160950.132754-1-rkan...@rivosinc.com/

---
Rajnesh Kanwal (7):
target/riscv: Remove obsolete sfence.vm instruction
target/riscv: Add Control Transfer Records CSR definitions.
target/riscv: Add support for Control Transfer Records extension CSRs.
target/riscv: Add support to record CTR entries.
target/riscv: Add CTR sctrclr instruction.
target/riscv: Add support to access ctrsource, ctrtarget, ctrdata regs.
target/riscv: machine: Add Control Transfer Record state description

target/riscv/cpu.c | 26 ++-
target/riscv/cpu.h | 13 ++
target/riscv/cpu_bits.h| 94
target/riscv/cpu_cfg.h | 2 +
target/riscv/cpu_helper.c | 266 ++
target/riscv/csr.c | 294 -
target/riscv/helper.h | 6 +-
target/riscv/insn32.decode | 2 +-
target/riscv/insn_trans/trans_privileged.c.inc | 22 +-
target/riscv/insn_trans/trans_rvi.c.inc| 70 ++
target/riscv/insn_trans/trans_rvzce.c.inc | 22 ++
target/riscv/machine.c | 25 +++
target/riscv/op_helper.c | 52 -
target/riscv/tcg/tcg-cpu.c | 11 +
target/riscv/translate.c | 44
15 files changed, 935 insertions(+), 14 deletions(-)
---
base-commit: 2564f23eaefca2fc77d574c8eaf5c34138ce627f
change-id: 20241029-b4-ctr_upstream_v3-7ab764c68bf1
--
Best regards,
Rajnesh Kanwal

Re: [PATCH v6 1/1] target/riscv: rvv: reduce the overhead for simple RISC-V vector unit-stride loads and stores

2024-12-04 Thread Max Chou


Hi Craig,

I think that the unexpected vstart issue persists in this patchset.
This version is unable to update the vstart CSR to the correct index when
grouping load/store elements.

For instance, if an exception is raised by an element following the first
one, and the optimization attempts to group multiple elements, the vstart
value remains the index of the first element, which is not the actual
element index that raised the exception.

Max

On 2024/12/4 8:29 PM, Craig Blackmore wrote:

This patch improves the performance of the emulation of the RVV unit-stride
loads and stores in the following cases:

- when the data being loaded/stored per iteration amounts to 8 bytes or less.
- when the vector length is 16 bytes (VLEN=128) and there's no grouping of the
   vector registers (LMUL=1).

The optimization consists of avoiding the overhead of probing the RAM of the
host machine and doing a loop load/store on the input data grouped in chunks
of as many bytes as possible (8,4,2,1 bytes).

Co-authored-by: Helene CHELIN 
Co-authored-by: Paolo Savini 
Co-authored-by: Craig Blackmore 

Signed-off-by: Helene CHELIN 
Signed-off-by: Paolo Savini 
Signed-off-by: Craig Blackmore 
---
  target/riscv/vector_helper.c | 54 
  1 file changed, 54 insertions(+)

diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
index a85dd1d200..068010ccd2 100644
--- a/target/riscv/vector_helper.c
+++ b/target/riscv/vector_helper.c
@@ -393,6 +393,60 @@ vext_ldst_us(void *vd, target_ulong base, CPURISCVState 
*env, uint32_t desc,
  return;
  }
  
+#if defined(CONFIG_USER_ONLY) && !HOST_BIG_ENDIAN

+/*
+ * For data sizes <= 64 bits and for LMUL=1 with VLEN=128 bits we get a
+ * better performance by doing a simple simulation of the load/store
+ * without the overhead of probing the host RAM
+ */
+if ((nf == 1) && ((evl << log2_esz) <= 8 ||
+((vext_lmul(desc) == 0) && (simd_maxsz(desc) == 16)))) {
+
+uint32_t evl_b = evl << log2_esz;
+
+for (uint32_t j = env->vstart; j < evl_b;) {
+addr = base + j;
+if ((evl_b - j) >= 8) {
+if (is_load) {
+lde_d_tlb(env, adjust_addr(env, addr), j, vd, ra);
+} else {
+ste_d_tlb(env, adjust_addr(env, addr), j, vd, ra);
+}
+env->vstart += (8 >> log2_esz);
+j += 8;
+} else if ((evl_b - j) >= 4) {
+if (is_load) {
+lde_w_tlb(env, adjust_addr(env, addr), j, vd, ra);
+} else {
+ste_w_tlb(env, adjust_addr(env, addr), j, vd, ra);
+}
+env->vstart += (4 >> log2_esz);
+j += 4;
+} else if ((evl_b - j) >= 2) {
+if (is_load) {
+lde_h_tlb(env, adjust_addr(env, addr), j, vd, ra);
+} else {
+ste_h_tlb(env, adjust_addr(env, addr), j, vd, ra);
+}
+env->vstart += (2 >> log2_esz);
+j += 2;
+} else {
+if (is_load) {
+lde_b_tlb(env, adjust_addr(env, addr), j, vd, ra);
+} else {
+ste_b_tlb(env, adjust_addr(env, addr), j, vd, ra);
+}
+env->vstart++;
+j += 1;
+}
+}
+
+env->vstart = 0;
+vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
+return;
+}
+#endif
+
  /* Calculate the page range of first page */
  addr = base + ((env->vstart * nf) << log2_esz);
  page_split = -(addr | TARGET_PAGE_MASK);

[PATCH v4 5/7] target/riscv: Add CTR sctrclr instruction.

CTR extension adds a new instruction sctrclr to quickly
clear the recorded entries buffer.

Signed-off-by: Rajnesh Kanwal 
---
 target/riscv/cpu.h |  1 +
 target/riscv/cpu_helper.c  |  7 +++
 target/riscv/helper.h  |  1 +
 target/riscv/insn32.decode |  1 +
 target/riscv/insn_trans/trans_privileged.c.inc | 11 ++
 target/riscv/op_helper.c   | 29 ++
 6 files changed, 50 insertions(+)

diff --git a/target/riscv/cpu.h b/target/riscv/cpu.h
index 
f39ca48d37332c4e5907ca87040de420f78df2e4..85ca2bfe435d0c9d245f2690fe3bde3e076d3b2f
 100644
--- a/target/riscv/cpu.h
+++ b/target/riscv/cpu.h
@@ -613,6 +613,7 @@ void riscv_cpu_set_mode(CPURISCVState *env, target_ulong 
newpriv, bool virt_en);
 
 void riscv_ctr_add_entry(CPURISCVState *env, target_long src, target_long dst,
 enum CTRType type, target_ulong prev_priv, bool prev_virt);
+void riscv_ctr_clear(CPURISCVState *env);
 
 void riscv_translate_init(void);
 G_NORETURN void riscv_raise_exception(CPURISCVState *env,
diff --git a/target/riscv/cpu_helper.c b/target/riscv/cpu_helper.c
index 
dbdad4e29d7de0713f7530c46e9fab03d3c459a4..b1130180710b0e01e8ebe33f0974edd8d5abe56d
 100644
--- a/target/riscv/cpu_helper.c
+++ b/target/riscv/cpu_helper.c
@@ -783,6 +783,13 @@ static void riscv_ctr_freeze(CPURISCVState *env, uint64_t 
freeze_mask,
 }
 }
 
+void riscv_ctr_clear(CPURISCVState *env)
+{
+memset(env->ctr_src, 0x0, sizeof(env->ctr_src));
+memset(env->ctr_dst, 0x0, sizeof(env->ctr_dst));
+memset(env->ctr_data, 0x0, sizeof(env->ctr_data));
+}
+
 static uint64_t riscv_ctr_priv_to_mask(target_ulong priv, bool virt)
 {
 switch (priv) {
diff --git a/target/riscv/helper.h b/target/riscv/helper.h
index 
065d82d3997b1df46a0ed1b96d33bee13c049fad..79899b9cebd6a6731370097e56cea3f5e3ee6a5e
 100644
--- a/target/riscv/helper.h
+++ b/target/riscv/helper.h
@@ -131,6 +131,7 @@ DEF_HELPER_6(csrrw_i128, tl, env, int, tl, tl, tl, tl)
 #ifndef CONFIG_USER_ONLY
 DEF_HELPER_2(sret, tl, env, tl)
 DEF_HELPER_2(mret, tl, env, tl)
+DEF_HELPER_1(ctr_clear, void, env)
 DEF_HELPER_1(wfi, void, env)
 DEF_HELPER_1(wrs_nto, void, env)
 DEF_HELPER_1(tlb_flush, void, env)
diff --git a/target/riscv/insn32.decode b/target/riscv/insn32.decode
index 
a2b4c0ddd47ad9464b4b180fb19e6a3b64dbe4e5..8188113bcc90482733f676227858829bac5c5462
 100644
--- a/target/riscv/insn32.decode
+++ b/target/riscv/insn32.decode
@@ -114,6 +114,7 @@
 # *** Privileged Instructions ***
 ecall    0 000 0 1110011
 ebreak  0001 0 000 0 1110011
+sctrclr 00010100 0 000 0 1110011
 uret00000010 0 000 0 1110011
 sret000100000010 0 000 0 1110011
 mret001100000010 0 000 0 1110011
diff --git a/target/riscv/insn_trans/trans_privileged.c.inc 
b/target/riscv/insn_trans/trans_privileged.c.inc
index 
a5c2410cfa0779b1a928e7b89bd2ee5bb24216e4..1d7a17373e06a9f3226c1c14a54beb1a56e17b83
 100644
--- a/target/riscv/insn_trans/trans_privileged.c.inc
+++ b/target/riscv/insn_trans/trans_privileged.c.inc
@@ -69,6 +69,17 @@ static bool trans_ebreak(DisasContext *ctx, arg_ebreak *a)
 return true;
 }
 
+static bool trans_sctrclr(DisasContext *ctx, arg_sctrclr *a)
+{
+#ifndef CONFIG_USER_ONLY
+if (ctx->cfg_ptr->ext_smctr || ctx->cfg_ptr->ext_ssctr) {
+gen_helper_ctr_clear(tcg_env);
+return true;
+}
+#endif
+return false;
+}
+
 static bool trans_uret(DisasContext *ctx, arg_uret *a)
 {
 return false;
diff --git a/target/riscv/op_helper.c b/target/riscv/op_helper.c
index 
b55b7f3ac3d209d39b16075e79c2342b89bdf805..d22609347ee63be183ab253e7a0158a19ff0bf52
 100644
--- a/target/riscv/op_helper.c
+++ b/target/riscv/op_helper.c
@@ -389,6 +389,35 @@ void helper_ctr_add_entry(CPURISCVState *env, target_ulong 
src,
 env->priv, env->virt_enabled);
 }
 
+void helper_ctr_clear(CPURISCVState *env)
+{
+/*
+ * It's safe to call smstateen_acc_ok() for umode access regardless of the
+ * state of bit 54 (CTR bit in case of m/hstateen) of sstateen. If the bit
+ * is zero, smstateen_acc_ok() will return the correct exception code and
+ * if it's one, smstateen_acc_ok() will return RISCV_EXCP_NONE. In that
+ * scenario the U-mode check below will handle that case.
+ */
+RISCVException ret = smstateen_acc_ok(env, 0, SMSTATEEN0_CTR);
+if (ret != RISCV_EXCP_NONE) {
+riscv_raise_exception(env, ret, GETPC());
+}
+
+if (env->priv == PRV_U) {
+/*
+ * One corner case is when sctrclr is executed from VU-mode and
+ * mstateen.CTR = 0, in which case we are supposed to raise
+ * RISCV_EXCP_ILLEGAL_INST. This case is already handled in
+ * smstateen_acc_ok().
+ */
+uint32_t excep = env->virt_enabled ? RISCV_EXCP_VIRT_INSTRUCTION_FAULT

[PATCH v4 4/7] target/riscv: Add support to record CTR entries.

This commit adds logic to record CTR entries of different types
and adds required hooks in TCG and interrupt/Exception logic to
record events.

This commit also adds support to invoke freeze CTR logic for breakpoint
exceptions and counter overflow interrupts.

Signed-off-by: Rajnesh Kanwal 
---
 target/riscv/cpu.h |   7 +
 target/riscv/cpu_helper.c  | 259 +
 target/riscv/helper.h  |   5 +-
 target/riscv/insn_trans/trans_privileged.c.inc |   6 +-
 target/riscv/insn_trans/trans_rvi.c.inc|  70 +++
 target/riscv/insn_trans/trans_rvzce.c.inc  |  22 +++
 target/riscv/op_helper.c   |  23 ++-
 target/riscv/translate.c   |  44 +
 8 files changed, 430 insertions(+), 6 deletions(-)

diff --git a/target/riscv/cpu.h b/target/riscv/cpu.h
index 
da14ac2f874b81d3f01bc31b0064d020f2dbdf61..f39ca48d37332c4e5907ca87040de420f78df2e4
 100644
--- a/target/riscv/cpu.h
+++ b/target/riscv/cpu.h
@@ -310,6 +310,10 @@ struct CPUArchState {
 uint32_t sctrstatus;
 uint64_t vsctrctl;
 
+uint64_t ctr_src[16 << SCTRDEPTH_MAX];
+uint64_t ctr_dst[16 << SCTRDEPTH_MAX];
+uint64_t ctr_data[16 << SCTRDEPTH_MAX];
+
 /* Machine and Supervisor interrupt priorities */
 uint8_t miprio[64];
 uint8_t siprio[64];
@@ -607,6 +611,9 @@ RISCVException smstateen_acc_ok(CPURISCVState *env, int 
index, uint64_t bit);
 
 void riscv_cpu_set_mode(CPURISCVState *env, target_ulong newpriv, bool 
virt_en);
 
+void riscv_ctr_add_entry(CPURISCVState *env, target_long src, target_long dst,
+enum CTRType type, target_ulong prev_priv, bool prev_virt);
+
 void riscv_translate_init(void);
 G_NORETURN void riscv_raise_exception(CPURISCVState *env,
   uint32_t exception, uintptr_t pc);
diff --git a/target/riscv/cpu_helper.c b/target/riscv/cpu_helper.c
index 
0a3ead69eabaf0e395fc7c78868640a4216573ee..dbdad4e29d7de0713f7530c46e9fab03d3c459a4
 100644
--- a/target/riscv/cpu_helper.c
+++ b/target/riscv/cpu_helper.c
@@ -771,6 +771,247 @@ void riscv_cpu_set_aia_ireg_rmw_fn(CPURISCVState *env, 
uint32_t priv,
 }
 }
 
+static void riscv_ctr_freeze(CPURISCVState *env, uint64_t freeze_mask,
+ bool virt)
+{
+uint64_t ctl = virt ? env->vsctrctl : env->mctrctl;
+
+assert((freeze_mask & (~(XCTRCTL_BPFRZ | XCTRCTL_LCOFIFRZ))) == 0);
+
+if (ctl & freeze_mask) {
+env->sctrstatus |= SCTRSTATUS_FROZEN;
+}
+}
+
+static uint64_t riscv_ctr_priv_to_mask(target_ulong priv, bool virt)
+{
+switch (priv) {
+case PRV_M:
+return MCTRCTL_M;
+case PRV_S:
+if (virt) {
+return XCTRCTL_S;
+}
+return XCTRCTL_S;
+case PRV_U:
+if (virt) {
+return XCTRCTL_U;
+}
+return XCTRCTL_U;
+}
+
+g_assert_not_reached();
+}
+
+static uint64_t riscv_ctr_get_control(CPURISCVState *env, target_long priv,
+  bool virt)
+{
+switch (priv) {
+case PRV_M:
+return env->mctrctl;
+case PRV_S:
+case PRV_U:
+if (virt) {
+return env->vsctrctl;
+}
+return env->mctrctl;
+}
+
+g_assert_not_reached();
+}
+
+/*
+ * This function assumes that src privilege and target privilege are not same
+ * and src privilege is less than target privilege. This includes the virtual
+ * state as well.
+ */
+static bool riscv_ctr_check_xte(CPURISCVState *env, target_long src_prv,
+bool src_virt)
+{
+target_long tgt_prv = env->priv;
+bool res = true;
+
+/*
+ * VS and U mode are same in terms of xTE bits required to record an
+ * external trap. See 6.1.2. External Traps, table 8 External Trap Enable
+ * Requirements. This changes VS to U to simplify the logic a bit.
+ */
+if (src_virt && src_prv == PRV_S) {
+src_prv = PRV_U;
+} else if (env->virt_enabled && tgt_prv == PRV_S) {
+tgt_prv = PRV_U;
+}
+
+/* VU mode is an outlier here. */
+if (src_virt && src_prv == PRV_U) {
+res &= !!(env->vsctrctl & XCTRCTL_STE);
+}
+
+switch (src_prv) {
+case PRV_U:
+if (tgt_prv == PRV_U) {
+break;
+}
+res &= !!(env->mctrctl & XCTRCTL_STE);
+/* fall-through */
+case PRV_S:
+if (tgt_prv == PRV_S) {
+break;
+}
+res &= !!(env->mctrctl & MCTRCTL_MTE);
+/* fall-through */
+case PRV_M:
+break;
+}
+
+return res;
+}
+
+/*
+ * Special cases for traps and trap returns:
+ *
+ * 1- Traps, and trap returns, between enabled modes are recorded as normal.
+ * 2- Traps from an inhibited mode to an enabled mode, and trap returns from an
+ * enabled mode back to an inhibited mode, are partially recorded.  In such
+ * cases, the PC from the inhibited mode (source PC for traps, and target PC
+ *

Re: [PATCH v4 4/7] target/riscv: Add support to record CTR entries.

2024-12-04 Thread Richard Henderson


On 12/4/24 06:56, Rajnesh Kanwal wrote:

diff --git a/target/riscv/insn_trans/trans_privileged.c.inc 
b/target/riscv/insn_trans/trans_privileged.c.inc
index 
0bdfa9a0ed3313223ce9032fb24484c3887cddf9..a5c2410cfa0779b1a928e7b89bd2ee5bb24216e4
 100644
--- a/target/riscv/insn_trans/trans_privileged.c.inc
+++ b/target/riscv/insn_trans/trans_privileged.c.inc
@@ -78,9 +78,10 @@ static bool trans_sret(DisasContext *ctx, arg_sret *a)
  {
  #ifndef CONFIG_USER_ONLY
  if (has_ext(ctx, RVS)) {
+TCGv src = tcg_constant_tl(ctx->base.pc_next);


This is incorrect.
You need to use gen_pc_plus_diff(src, ctx, 0).

Alternately, for here in sret and mret, instead of adding an extra parameter, use 
gen_update_pc(ctx, 0) to update env->pc





@@ -95,9 +96,10 @@ static bool trans_sret(DisasContext *ctx, arg_sret *a)
  static bool trans_mret(DisasContext *ctx, arg_mret *a)
  {
  #ifndef CONFIG_USER_ONLY
+TCGv src = tcg_constant_tl(ctx->base.pc_next);


Likewise.



diff --git a/target/riscv/insn_trans/trans_rvi.c.inc 
b/target/riscv/insn_trans/trans_rvi.c.inc
index 
96c218a9d7875c6419287ac3aa9746251be3f442..fc182e7b18a289e13ad212f10a3233aca25fae41
 100644
--- a/target/riscv/insn_trans/trans_rvi.c.inc
+++ b/target/riscv/insn_trans/trans_rvi.c.inc
@@ -93,6 +93,50 @@ static bool trans_jal(DisasContext *ctx, arg_jal *a)
  return true;
  }
  
+#ifndef CONFIG_USER_ONLY

+/*
+ * Indirect calls
+ * - jalr x1, rs where rs != x5;
+ * - jalr x5, rs where rs != x1;
+ * - c.jalr rs1 where rs1 != x5;
+ *
+ * Indirect jumps
+ * - jalr x0, rs where rs != x1 and rs != x5;
+ * - c.jr rs1 where rs1 != x1 and rs1 != x5.
+ *
+ * Returns
+ * - jalr rd, rs where (rs == x1 or rs == x5) and rd != x1 and rd != x5;
+ * - c.jr rs1 where rs1 == x1 or rs1 == x5.
+ *
+ * Co-routine swap
+ * - jalr x1, x5;
+ * - jalr x5, x1;
+ * - c.jalr x5.
+ *
+ * Other indirect jumps
+ * - jalr rd, rs where rs != x1, rs != x5, rd != x0, rd != x1 and rd != x5.
+ */
+static void helper_ctr_jalr(DisasContext *ctx, arg_jalr *a)


Generally "helper_*" are out-of-line functions, whereas this is generating inline code. 
Better as "gen_ctr_jalr".



+{
+TCGv src = tcg_constant_tl(ctx->base.pc_next);


gen_pc_plus_diff


@@ -219,6 +269,9 @@ static bool gen_branch(DisasContext *ctx, arg_b *a, TCGCond 
cond)
  TCGv src1 = get_gpr(ctx, a->rs1, EXT_SIGN);
  TCGv src2 = get_gpr(ctx, a->rs2, EXT_SIGN);
  target_ulong orig_pc_save = ctx->pc_save;
+#ifndef CONFIG_USER_ONLY
+TCGv src = tcg_constant_tl(ctx->base.pc_next);
+#endif


gen_pc_plus_diff, though perhaps delay until used.

  
  if (get_xl(ctx) == MXL_RV128) {

  TCGv src1h = get_gprh(ctx, a->rs1);
@@ -231,6 +284,15 @@ static bool gen_branch(DisasContext *ctx, arg_b *a, 
TCGCond cond)
  } else {
  tcg_gen_brcond_tl(cond, src1, src2, l);
  }
+
+#ifndef CONFIG_USER_ONLY
+if (ctx->cfg_ptr->ext_smctr || ctx->cfg_ptr->ext_ssctr) {
+TCGv type = tcg_constant_tl(CTRDATA_TYPE_NONTAKEN_BRANCH);
+TCGv dest = tcg_constant_tl(ctx->base.pc_next + ctx->cur_insn_len);


gen_pc_plus_diff


+gen_helper_ctr_add_entry(tcg_env, src, dest, type);
+}
+#endif
+
  gen_goto_tb(ctx, 1, ctx->cur_insn_len);
  ctx->pc_save = orig_pc_save;
  
@@ -243,6 +305,14 @@ static bool gen_branch(DisasContext *ctx, arg_b *a, TCGCond cond)

  gen_pc_plus_diff(target_pc, ctx, a->imm);
  gen_exception_inst_addr_mis(ctx, target_pc);
  } else {
+#ifndef CONFIG_USER_ONLY
+if (ctx->cfg_ptr->ext_smctr || ctx->cfg_ptr->ext_ssctr) {
+TCGv type = tcg_constant_tl(CTRDATA_TYPE_TAKEN_BRANCH);
+TCGv dest = tcg_constant_tl(ctx->base.pc_next + a->imm);


gen_pc_plus_diff.


diff --git a/target/riscv/insn_trans/trans_rvzce.c.inc 
b/target/riscv/insn_trans/trans_rvzce.c.inc
index 
cd234ad960724c936b92afb6fd1f3c7c2a37cb80..07b51d9f4d847c4411165b422a843fea65c86d45
 100644
--- a/target/riscv/insn_trans/trans_rvzce.c.inc
+++ b/target/riscv/insn_trans/trans_rvzce.c.inc
@@ -204,6 +204,13 @@ static bool gen_pop(DisasContext *ctx, arg_cmpp *a, bool 
ret, bool ret_val)
  if (ret) {
  TCGv ret_addr = get_gpr(ctx, xRA, EXT_SIGN);
  tcg_gen_mov_tl(cpu_pc, ret_addr);
+#ifndef CONFIG_USER_ONLY
+if (ctx->cfg_ptr->ext_smctr || ctx->cfg_ptr->ext_ssctr) {
+TCGv src = tcg_constant_tl(ctx->base.pc_next);


gen_pc_plus_diff, and it will need to be done *before* the assignment to cpu_pc.


@@ -309,6 +316,21 @@ static bool trans_cm_jalt(DisasContext *ctx, arg_cm_jalt 
*a)
  gen_set_gpr(ctx, xRA, succ_pc);
  }
  
+#ifndef CONFIG_USER_ONLY

+if (ctx->cfg_ptr->ext_smctr || ctx->cfg_ptr->ext_ssctr) {
+TCGv src = tcg_constant_tl(ctx->base.pc_next);


Here, we have updated cpu_pc to current (see the start of the function), so you can just 
use that instead of src.



+void helper_ctr_add_entry(CPURISCVState *env, target_ulong src,
+  target_ulong dest, target_ulong type)
+{
+

Re: [PATCH RFC 06/11] migration/block: Extend the migration_block_* API to dest side

On Tue, Dec 03, 2024 at 07:51:33PM -0500, Peter Xu wrote:
> diff --git a/migration/migration.c b/migration/migration.c
> index 8f7d09ca84..e01264168f 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -701,6 +701,12 @@ migration_incoming_state_setup(MigrationIncomingState 
> *mis, Error **errp)
>  return false;
>  }
>  
> +/*
> + * Newly setup QEMU, prepared for incoming migration.  Mark the block
> + * active state to reflect that the src currently owns the disks.
> + */
> +migrate_get_current()->block_active = false;
> +
>  migrate_set_state(&mis->state, current, MIGRATION_STATUS_SETUP);
>  return true;
>  }

I plan to move this over to the end of qmp_migrate_incoming(), which might
be easier to follow (where we updated "once"..), to say that we initialize
the value to FALSE once and for all for an incoming instance.

I don't think there's any functional change on the fixup.  The only
difference is when qemu_start_incoming_migration() can fail after
migration_incoming_state_setup().  I don't think it matters hugely for the
block_active flag itself.. as if that failure happens it means the next
qmp_migrate_incoming will overwrite the flag again with nobody being able
to touch it..  But still, if I post a new version I'll squash the below into
this patch:

===8<===
diff --git a/migration/migration.c b/migration/migration.c
index 1a1c570c5b..bc3a29482a 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -701,12 +701,6 @@ migration_incoming_state_setup(MigrationIncomingState 
*mis, Error **errp)
 return false;
 }

-/*
- * Newly setup QEMU, prepared for incoming migration.  Mark the block
- * active state to reflect that the src currently owns the disks.
- */
-migrate_get_current()->block_active = false;
-
 migrate_set_state(&mis->state, current, MIGRATION_STATUS_SETUP);
 return true;
 }
@@ -1892,6 +1886,12 @@ void qmp_migrate_incoming(const char *uri, bool 
has_channels,
 return;
 }

+/*
+ * Newly setup incoming QEMU.  Mark the block active state to reflect
+ * that the src currently owns the disks.
+ */
+migrate_get_current()->block_active = false;
+
 once = false;
 }

-- 
Peter Xu

Re: [PATCH v2 0/9] vfio/igd: Enable legacy mode on more devices

2024-12-04 Thread Tomita Moeko

On 12/4/24 04:12, Alex Williamson wrote:
> On Tue,  3 Dec 2024 21:35:39 +0800
> Tomita Moeko  wrote:
> 
>> This patchset extends the support of legacy mode igd passthrough to
>> all Intel Gen 11 and 12 devices (including Ice Lake, Jasper Lake,
>> Rocket Lake, Alder Lake and Raptor Lake), and emulates GGC register
>> in MMIO BAR0 for better compatibility (testing shows that Windows and the
>> GOP driver will read this MMIO register).
>>
>> It also replaces magic numbers with macros to improve readability,
>> and aligns behavior (BDSM register mirroring and GGMS calculation for
>> gen7) with i915 driver to avoid possible issues.
>>
>> The x-igd-gms option removed in 971ca22f041b ("vfio/igd: don't set
>> stolen memory size to zero") is also added back so that data stolen
>> memory size can be specified for the guest. Testing shows that GMS may be
>> related to framebuffer size; a small GMS value may cause display issues
>> like a black screen. It can be changed by the DVMT Pre-allocated option in
>> host BIOS, but not all BIOSes come with this option. Having it in QEMU
>> helps resolve such issues.
>>
>> This patchset was verified on Intel i9-12900K CPU(UHD 770, 8086:4680)
>> with custom OVMF firmware [1] and IntelGopDriver extracted from host
>> bios. IGD device works well in both Windows and Linux guests, and
>> scored 726 in 3DMark Time Spy Graphics on Windows guest.
>>
>> [1] https://github.com/tomitamoeko/edk2/commits/igd-pt-adl/
>>
>> Btw, IO BAR4 seems never to be used by the guest, and the IO BAR itself
>> is not working on Gen11+ devices in my experiments. There are no hints
>> about that in old commit messages or on the mailing list. It would be greatly
>> appreciated if someone could share the background.
> 
> The quirks related to BAR4 access are generally for the vBIOS, we
> wouldn't expect guest OS level drivers to use them.  IIRC this is
> handling moving the stolen memory from the HPA to the GPA when the
> vBIOS is writing the GTT.

Got it. I'm wondering why vBIOS still writes HPA instead of GPA when
it's in virtual machine, maybe the address is hardcoded?

> Maybe that brings up an interesting topic.  Traditionally "legacy mode"
> IGD assignment has been only for 440fx machines with SeaBIOS and last I
> was aware edk2 wasn't willing to accept the same hack for the BDSM as
> we had put into SeaBIOS, instead indicating that it should be
> implemented in the device ROM.  Your branch in [1] above seems to
> indicate edk2 does now have assigned IGD specific code.
>
> Are these patches developing full stack support of these new devices,
> from BIOS hand-off, through pre-boot environments, and through to guest
> OS drivers, or are we only concerned that the guest OS level driver
> lights up a display?

Yes these patches provide a complete legacy mode passthrough solution,
from EFI DXE phase to guest OS, but the EFI part requires specific
changes in edk2.

> If you're using q35 and OVMF then you must be operating in the realm of
> the mythical "Universal Pass-through" mode that I thought Intel had
> abandoned.  It seems like we need an update to docs/igd-assign.txt as
> it's likely very out of date based on recent improvements here and by
> Corvin.

Actually the only machine that supports legacy mode is i440fx: the Windows
driver checks the vendor and device ID of the LPC bridge device at 00:1f.0,
and if it doesn't match, the display driver won't work [1]. On the q35
machine, there is already an emulated ICH9 LPC at 00:1f.0. Previously there
was an attempt at modifying the ID, but it broke functionality [2].

[1] 
https://github.com/projectacrn/acrn-hypervisor/blob/master/devicemodel/hw/pci/lpc.c#L519
[2] 
https://lore.kernel.org/all/1457080913-30018-1-git-send-email-kra...@redhat.com/

I tried the "Universal Pass-through" mode described in igd-assign.txt with my
UHD 770. On a Linux guest, it works just like a normal GPU, except there
is no display output before the i915 driver is loaded. Even though the
device's vBDF is not 00:02.0, and the i915 log shows the DSM is 0M, it works
perfectly. intel_gpu_top shows the iGPU is working when I am playing a
YouTube video.

I also tried setting primary gpu in bios to discrete gpu, which makes
the pci class code of igpu changed from 0x03 (VGA compatible 
controller) to 0x308000 (Display controller), it can still output to
monitor connected to it on Linux guest. All with a simple
-device vfio-pci,host=00:02.0,id=hostdev0

But for a Windows guest, I never had any luck. I attached a virtio-gpu to
it, with the Intel graphics drivers installed in the guest. If the IGD is not
at 00:02.0 or the GOP driver is not provided, Windows BSODs immediately on boot.

> Also, are you proposing the noted edk2 change upstream?  It seems like
> edk2 would need some sort of device version detection to know whether
> to use a 32 or 64-bit BDSM value.  Thanks,
> 
> Alex

I'm afraid the answer is no, these edk2 changes are not fully open
source as they were taken from Intel directly [3], except the last
patch. (It seems intel uses a modified qemu as "etc/igd-dsm-base"
used in patch 5 doesn't exist in qem

[PATCH v3 5/5] acpi/generic_event_device: add logic to detect if HEST addr is available

Create a new property (x-has-hest-addr) and use it to detect if
the GHES table offsets can be calculated from the HEST address
(QEMU 9.2 and newer) or in the legacy way, via an offset obtained
from the hardware_errors firmware file.

Signed-off-by: Mauro Carvalho Chehab 
Reviewed-by: Jonathan Cameron 
---
 hw/acpi/generic_event_device.c |  1 +
 hw/acpi/ghes.c | 24 +---
 hw/arm/virt-acpi-build.c   | 30 ++
 hw/core/machine.c  |  2 ++
 include/hw/acpi/ghes.h |  1 +
 5 files changed, 47 insertions(+), 11 deletions(-)

diff --git a/hw/acpi/generic_event_device.c b/hw/acpi/generic_event_device.c
index c1116dd8d7ae..df6b4fab2d30 100644
--- a/hw/acpi/generic_event_device.c
+++ b/hw/acpi/generic_event_device.c
@@ -318,6 +318,7 @@ static void acpi_ged_send_event(AcpiDeviceIf *adev, 
AcpiEventStatusBits ev)
 
 static Property acpi_ged_properties[] = {
 DEFINE_PROP_UINT32("ged-event", AcpiGedState, ged_event_bitmap, 0),
+DEFINE_PROP_BOOL("x-has-hest-addr", AcpiGedState, ghes_state.hest_lookup, 
true),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c
index af55bfe106bf..c9295a3b0db7 100644
--- a/hw/acpi/ghes.c
+++ b/hw/acpi/ghes.c
@@ -361,6 +361,8 @@ void acpi_build_hest(GArray *table_data, GArray 
*hardware_errors,
 {
 AcpiTable table = { .sig = "HEST", .rev = 1,
 .oem_id = oem_id, .oem_table_id = oem_table_id };
+AcpiGedState *acpi_ged_state;
+AcpiGhesState *ags;
 int i;
 
 build_ghes_error_table(hardware_errors, linker, num_sources);
@@ -381,10 +383,16 @@ void acpi_build_hest(GArray *table_data, GArray 
*hardware_errors,
  * tell firmware to write into GPA the address of HEST via fw_cfg,
  * once initialized.
  */
-bios_linker_loader_write_pointer(linker,
- ACPI_HEST_ADDR_FW_CFG_FILE, 0,
- sizeof(uint64_t),
- ACPI_BUILD_TABLE_FILE, hest_offset);
+
+acpi_ged_state = ACPI_GED(object_resolve_path_type("", TYPE_ACPI_GED,
+   NULL));
+ags = &acpi_ged_state->ghes_state;
+if (ags->hest_lookup) {
+bios_linker_loader_write_pointer(linker,
+ ACPI_HEST_ADDR_FW_CFG_FILE, 0,
+ sizeof(uint64_t),
+ ACPI_BUILD_TABLE_FILE, hest_offset);
+}
 }
 
 void acpi_ghes_add_fw_cfg(AcpiGhesState *ags, FWCfgState *s,
@@ -398,8 +406,10 @@ void acpi_ghes_add_fw_cfg(AcpiGhesState *ags, FWCfgState 
*s,
 fw_cfg_add_file_callback(s, ACPI_HW_ERROR_ADDR_FW_CFG_FILE, NULL, NULL,
 NULL, &(ags->hw_error_le), sizeof(ags->hw_error_le), false);
 
-fw_cfg_add_file_callback(s, ACPI_HEST_ADDR_FW_CFG_FILE, NULL, NULL,
-NULL, &(ags->hest_addr_le), sizeof(ags->hest_addr_le), false);
+if (ags->hest_lookup) {
+fw_cfg_add_file_callback(s, ACPI_HEST_ADDR_FW_CFG_FILE, NULL, NULL,
+NULL, &(ags->hest_addr_le), sizeof(ags->hest_addr_le), false);
+}
 
 ags->present = true;
 }
@@ -520,7 +530,7 @@ void ghes_record_cper_errors(const void *cper, size_t len,
 }
 ags = &acpi_ged_state->ghes_state;
 
-if (!ags->hest_addr_le) {
+if (!ags->hest_lookup) {
 get_hw_error_offsets(le64_to_cpu(ags->hw_error_le),
  &cper_addr, &read_ack_register_addr);
 } else {
diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
index bd5582bc75f8..46ce3f3bb07a 100644
--- a/hw/arm/virt-acpi-build.c
+++ b/hw/arm/virt-acpi-build.c
@@ -893,6 +893,10 @@ static const AcpiNotificationSourceId hest_ghes_notify[] = 
{
 { ACPI_HEST_SRC_ID_SYNC, ACPI_GHES_NOTIFY_SEA },
 };
 
+static const AcpiNotificationSourceId hest_ghes_notify_9_1[] = {
+{ ACPI_HEST_SRC_ID_SYNC, ACPI_GHES_NOTIFY_SEA },
+};
+
 static
 void virt_acpi_build(VirtMachineState *vms, AcpiBuildTables *tables)
 {
@@ -946,10 +950,28 @@ void virt_acpi_build(VirtMachineState *vms, 
AcpiBuildTables *tables)
 build_dbg2(tables_blob, tables->linker, vms);
 
 if (vms->ras) {
-acpi_add_table(table_offsets, tables_blob);
-acpi_build_hest(tables_blob, tables->hardware_errors, tables->linker,
-hest_ghes_notify, ARRAY_SIZE(hest_ghes_notify),
-vms->oem_id, vms->oem_table_id);
+AcpiGhesState *ags;
+AcpiGedState *acpi_ged_state;
+
+acpi_ged_state = ACPI_GED(object_resolve_path_type("", TYPE_ACPI_GED,
+   NULL));
+if (acpi_ged_state) {
+ags = &acpi_ged_state->ghes_state;
+
+acpi_add_table(table_offsets, tables_blob);
+
+if (!ags->hest_lookup) {
+acpi_build_hest(tables_blob, tables->hardware_errors,
+tables->linker, hes

[PATCH v3 1/5] acpi/ghes: Prepare to support multiple sources on ghes

The current code is actually dependent on having just one error
structure with a single source.

As the number of sources should be arch-dependent, as it will depend on
what kind of synchronous/asynchronous notifications will exist, change
the logic to dynamically build the table.

Yet, for a proper support, we need to get the number of sources by
reading the number from the HEST table. However, bios currently doesn't
store a pointer to it.

For now just change the logic at table build time, while enforcing that
it will behave like before with a single source ID.

A future patch will add a HEST table bios pointer and change the logic
at acpi_ghes_record_errors() to dynamically use the new size.

Signed-off-by: Mauro Carvalho Chehab 
Reviewed-by: Jonathan Cameron 
---
 hw/acpi/ghes.c   | 43 ++--
 hw/arm/virt-acpi-build.c |  5 +
 include/hw/acpi/ghes.h   | 21 +---
 3 files changed, 47 insertions(+), 22 deletions(-)

diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c
index 5efa50413af3..b77e5c9d1b19 100644
--- a/hw/acpi/ghes.c
+++ b/hw/acpi/ghes.c
@@ -206,17 +206,26 @@ ghes_gen_err_data_uncorrectable_recoverable(GArray *block,
  * Initialize "etc/hardware_errors" and "etc/hardware_errors_addr" fw_cfg 
blobs.
  * See docs/specs/acpi_hest_ghes.rst for blobs format.
  */
-static void build_ghes_error_table(GArray *hardware_errors, BIOSLinker *linker)
+static void build_ghes_error_table(GArray *hardware_errors, BIOSLinker *linker,
+   int num_sources)
 {
 int i, error_status_block_offset;
 
+/*
+ * TODO: Current version supports only one source.
+ * A further patch will drop this check, after adding a proper migration
+ * code, as, for the code to work, we need to store a bios pointer to the
+ * HEST table.
+ */
+assert(num_sources == 1);
+
 /* Build error_block_address */
-for (i = 0; i < ACPI_GHES_ERROR_SOURCE_COUNT; i++) {
+for (i = 0; i < num_sources; i++) {
 build_append_int_noprefix(hardware_errors, 0, sizeof(uint64_t));
 }
 
 /* Build read_ack_register */
-for (i = 0; i < ACPI_GHES_ERROR_SOURCE_COUNT; i++) {
+for (i = 0; i < num_sources; i++) {
 /*
  * Initialize the value of read_ack_register to 1, so GHES can be
  * writable after (re)boot.
@@ -231,13 +240,13 @@ static void build_ghes_error_table(GArray 
*hardware_errors, BIOSLinker *linker)
 
 /* Reserve space for Error Status Data Block */
 acpi_data_push(hardware_errors,
-ACPI_GHES_MAX_RAW_DATA_LENGTH * ACPI_GHES_ERROR_SOURCE_COUNT);
+ACPI_GHES_MAX_RAW_DATA_LENGTH * num_sources);
 
 /* Tell guest firmware to place hardware_errors blob into RAM */
 bios_linker_loader_alloc(linker, ACPI_HW_ERROR_FW_CFG_FILE,
  hardware_errors, sizeof(uint64_t), false);
 
-for (i = 0; i < ACPI_GHES_ERROR_SOURCE_COUNT; i++) {
+for (i = 0; i < num_sources; i++) {
 /*
  * Tell firmware to patch error_block_address entries to point to
  * corresponding "Generic Error Status Block"
@@ -263,10 +272,12 @@ static void build_ghes_error_table(GArray 
*hardware_errors, BIOSLinker *linker)
 /* Build Generic Hardware Error Source version 2 (GHESv2) */
 static void build_ghes_v2(GArray *table_data,
   BIOSLinker *linker,
-  enum AcpiGhesNotifyType notify,
-  uint16_t source_id)
+  const AcpiNotificationSourceId *notif_src,
+  uint16_t index, int num_sources)
 {
 uint64_t address_offset;
+const uint16_t notify = notif_src->notify;
+const uint16_t source_id = notif_src->source_id;
 
 /*
  * Type:
@@ -297,7 +308,7 @@ static void build_ghes_v2(GArray *table_data,
address_offset + GAS_ADDR_OFFSET,
sizeof(uint64_t),
ACPI_HW_ERROR_FW_CFG_FILE,
-   source_id * sizeof(uint64_t));
+   index * sizeof(uint64_t));
 
 /* Notification Structure */
 build_ghes_hw_error_notification(table_data, notify);
@@ -317,8 +328,7 @@ static void build_ghes_v2(GArray *table_data,
address_offset + GAS_ADDR_OFFSET,
sizeof(uint64_t),
ACPI_HW_ERROR_FW_CFG_FILE,
-   (ACPI_GHES_ERROR_SOURCE_COUNT + source_id)
-   * sizeof(uint64_t));
+   (num_sources + index) * sizeof(uint64_t));
 
 /*
  * Read Ack Preserve field
@@ -333,19 +343,23 @@ static void build_ghes_v2(GArray *table_data,
 /* Build Hardware Error Source Table */
 void acpi_build_hest(GArray *table_data, GArray *hardware_errors,
  BIOSLinker *linker,

[PATCH v3 4/5] acpi/generic_event_device: Update GHES migration to cover hest addr

The GHES migration logic at GED should now support HEST table
location too.

Signed-off-by: Mauro Carvalho Chehab 
Reviewed-by: Jonathan Cameron 
---
 hw/acpi/generic_event_device.c | 29 +
 1 file changed, 29 insertions(+)

diff --git a/hw/acpi/generic_event_device.c b/hw/acpi/generic_event_device.c
index 17baf36132a8..c1116dd8d7ae 100644
--- a/hw/acpi/generic_event_device.c
+++ b/hw/acpi/generic_event_device.c
@@ -387,6 +387,34 @@ static const VMStateDescription vmstate_ghes_state = {
 }
 };
 
+static const VMStateDescription vmstate_hest = {
+.name = "acpi-hest",
+.version_id = 1,
+.minimum_version_id = 1,
+.fields = (const VMStateField[]) {
+VMSTATE_UINT64(hest_addr_le, AcpiGhesState),
+VMSTATE_END_OF_LIST()
+},
+};
+
+static bool hest_needed(void *opaque)
+{
+AcpiGedState *s = opaque;
+return s->ghes_state.hest_addr_le;
+}
+
+static const VMStateDescription vmstate_hest_state = {
+.name = "acpi-ged/hest",
+.version_id = 1,
+.minimum_version_id = 1,
+.needed = hest_needed,
+.fields = (const VMStateField[]) {
+VMSTATE_STRUCT(ghes_state, AcpiGedState, 1,
+   vmstate_hest, AcpiGhesState),
+VMSTATE_END_OF_LIST()
+}
+};
+
 static const VMStateDescription vmstate_acpi_ged = {
 .name = "acpi-ged",
 .version_id = 1,
@@ -399,6 +427,7 @@ static const VMStateDescription vmstate_acpi_ged = {
 &vmstate_memhp_state,
 &vmstate_cpuhp_state,
 &vmstate_ghes_state,
+&vmstate_hest_state,
 NULL
 }
 };
-- 
2.47.1

[PATCH v3 2/5] acpi/ghes: add a firmware file with HEST address

Store HEST table address at GPA, placing its content at
hest_addr_le variable.

Signed-off-by: Mauro Carvalho Chehab 
Reviewed-by: Jonathan Cameron 

---

Change from v8:
- hest_addr_le is now pointing to the error source size and data.

Signed-off-by: Mauro Carvalho Chehab 
---
 hw/acpi/ghes.c | 17 -
 include/hw/acpi/ghes.h |  1 +
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c
index b77e5c9d1b19..4a826c8ca6d4 100644
--- a/hw/acpi/ghes.c
+++ b/hw/acpi/ghes.c
@@ -30,6 +30,7 @@
 
 #define ACPI_HW_ERROR_FW_CFG_FILE   "etc/hardware_errors"
 #define ACPI_HW_ERROR_ADDR_FW_CFG_FILE  "etc/hardware_errors_addr"
+#define ACPI_HEST_ADDR_FW_CFG_FILE  "etc/acpi_table_hest_addr"
 
 /* The max size in bytes for one error block */
 #define ACPI_GHES_MAX_RAW_DATA_LENGTH   (1 * KiB)
@@ -261,7 +262,7 @@ static void build_ghes_error_table(GArray *hardware_errors, 
BIOSLinker *linker,
 }
 
 /*
- * tell firmware to write hardware_errors GPA into
+ * Tell firmware to write hardware_errors GPA into
  * hardware_errors_addr fw_cfg, once the former has been initialized.
  */
 bios_linker_loader_write_pointer(linker, ACPI_HW_ERROR_ADDR_FW_CFG_FILE, 0,
@@ -355,6 +356,8 @@ void acpi_build_hest(GArray *table_data, GArray 
*hardware_errors,
 
 acpi_table_begin(&table, table_data);
 
+int hest_offset = table_data->len;
+
 /* Error Source Count */
 build_append_int_noprefix(table_data, num_sources, 4);
 for (i = 0; i < num_sources; i++) {
@@ -362,6 +365,15 @@ void acpi_build_hest(GArray *table_data, GArray 
*hardware_errors,
 }
 
 acpi_table_end(linker, &table);
+
+/*
+ * tell firmware to write into GPA the address of HEST via fw_cfg,
+ * once initialized.
+ */
+bios_linker_loader_write_pointer(linker,
+ ACPI_HEST_ADDR_FW_CFG_FILE, 0,
+ sizeof(uint64_t),
+ ACPI_BUILD_TABLE_FILE, hest_offset);
 }
 
 void acpi_ghes_add_fw_cfg(AcpiGhesState *ags, FWCfgState *s,
@@ -375,6 +387,9 @@ void acpi_ghes_add_fw_cfg(AcpiGhesState *ags, FWCfgState *s,
 fw_cfg_add_file_callback(s, ACPI_HW_ERROR_ADDR_FW_CFG_FILE, NULL, NULL,
 NULL, &(ags->hw_error_le), sizeof(ags->hw_error_le), false);
 
+fw_cfg_add_file_callback(s, ACPI_HEST_ADDR_FW_CFG_FILE, NULL, NULL,
+NULL, &(ags->hest_addr_le), sizeof(ags->hest_addr_le), false);
+
 ags->present = true;
 }
 
diff --git a/include/hw/acpi/ghes.h b/include/hw/acpi/ghes.h
index 9f0120d0d596..237721fec0a2 100644
--- a/include/hw/acpi/ghes.h
+++ b/include/hw/acpi/ghes.h
@@ -58,6 +58,7 @@ enum AcpiGhesNotifyType {
 };
 
 typedef struct AcpiGhesState {
+uint64_t hest_addr_le;
 uint64_t hw_error_le;
 bool present; /* True if GHES is present at all on this board */
 } AcpiGhesState;
-- 
2.47.1

[PATCH v3 3/5] acpi/ghes: Use HEST table offsets when preparing GHES records

There are two pointers that are needed during error injection:

1. The start address of the CPER block to be stored;
2. The address of the ack, which needs a reset before next error.

It is preferable to calculate them from the HEST table.  This allows
checking the source ID, the size of the table and the type of the
HEST error block structures.

Yet, keep the old code, as this is needed for migration purposes.

Signed-off-by: Mauro Carvalho Chehab 
Reviewed-by: Jonathan Cameron 
---
 hw/acpi/ghes.c | 106 -
 1 file changed, 96 insertions(+), 10 deletions(-)

diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c
index 4a826c8ca6d4..af55bfe106bf 100644
--- a/hw/acpi/ghes.c
+++ b/hw/acpi/ghes.c
@@ -61,6 +61,25 @@
  */
 #define ACPI_GHES_GESB_SIZE 20
 
+/*
+ * Offsets with regards to the start of the HEST table stored at
+ * ags->hest_addr_le, according with the memory layout map at
+ * docs/specs/acpi_hest_ghes.rst.
+ */
+
+/*
+ * ACPI 6.2: 18.3.2.8 Generic Hardware Error Source version 2
+ * Table 18-382 Generic Hardware Error Source version 2 (GHESv2) Structure
+ */
+#define HEST_GHES_V2_TABLE_SIZE  92
+#define GHES_ACK_OFFSET  (64 + GAS_ADDR_OFFSET)
+
+/*
+ * ACPI 6.2: 18.3.2.7: Generic Hardware Error Source
+ * Table 18-380: 'Error Status Address' field
+ */
+#define GHES_ERR_ST_ADDR_OFFSET  (20 + GAS_ADDR_OFFSET)
+
 /*
  * Values for error_severity field
  */
@@ -212,14 +231,6 @@ static void build_ghes_error_table(GArray 
*hardware_errors, BIOSLinker *linker,
 {
 int i, error_status_block_offset;
 
-/*
- * TODO: Current version supports only one source.
- * A further patch will drop this check, after adding a proper migration
- * code, as, for the code to work, we need to store a bios pointer to the
- * HEST table.
- */
-assert(num_sources == 1);
-
 /* Build error_block_address */
 for (i = 0; i < num_sources; i++) {
 build_append_int_noprefix(hardware_errors, 0, sizeof(uint64_t));
@@ -419,6 +430,76 @@ static void get_hw_error_offsets(uint64_t ghes_addr,
 *read_ack_register_addr = ghes_addr + sizeof(uint64_t);
 }
 
+static void get_ghes_source_offsets(uint16_t source_id, uint64_t hest_addr,
+uint64_t *cper_addr,
+uint64_t *read_ack_start_addr,
+Error **errp)
+{
+uint64_t hest_err_block_addr, hest_read_ack_addr;
+uint64_t err_source_struct, error_block_addr;
+uint32_t num_sources, i;
+
+if (!hest_addr) {
+return;
+}
+
+cpu_physical_memory_read(hest_addr, &num_sources, sizeof(num_sources));
+num_sources = le32_to_cpu(num_sources);
+
+err_source_struct = hest_addr + sizeof(num_sources);
+
+/*
+ * Currently, HEST Error source navigates only for GHESv2 tables
+ */
+
+for (i = 0; i < num_sources; i++) {
+uint64_t addr = err_source_struct;
+uint16_t type, src_id;
+
+cpu_physical_memory_read(addr, &type, sizeof(type));
+type = le16_to_cpu(type);
+
+/* For now, we only know the size of GHESv2 table */
+if (type != ACPI_GHES_SOURCE_GENERIC_ERROR_V2) {
+error_setg(errp, "HEST: type %d not supported.", type);
+return;
+}
+
+/* Compare CPER source address at the GHESv2 structure */
+addr += sizeof(type);
+cpu_physical_memory_read(addr, &src_id, sizeof(src_id));
+
+if (src_id == source_id) {
+break;
+}
+
+err_source_struct += HEST_GHES_V2_TABLE_SIZE;
+}
+if (i == num_sources) {
+error_setg(errp, "HEST: Source %d not found.", source_id);
+return;
+}
+
+/* Navigate through table address pointers */
+hest_err_block_addr = err_source_struct + GHES_ERR_ST_ADDR_OFFSET;
+hest_read_ack_addr = err_source_struct + GHES_ACK_OFFSET;
+
+cpu_physical_memory_read(hest_err_block_addr, &error_block_addr,
+ sizeof(error_block_addr));
+
+error_block_addr =  le64_to_cpu(error_block_addr);
+
+cpu_physical_memory_read(error_block_addr, cper_addr,
+ sizeof(*cper_addr));
+
+*cper_addr = le64_to_cpu(*cper_addr);
+
+cpu_physical_memory_read(hest_read_ack_addr, read_ack_start_addr,
+ sizeof(*read_ack_start_addr));
+
+*read_ack_start_addr = le64_to_cpu(*read_ack_start_addr);
+}
+
 void ghes_record_cper_errors(const void *cper, size_t len,
  uint16_t source_id, Error **errp)
 {
@@ -439,8 +520,13 @@ void ghes_record_cper_errors(const void *cper, size_t len,
 }
 ags = &acpi_ged_state->ghes_state;
 
-get_hw_error_offsets(le64_to_cpu(ags->hw_error_le),
- &cper_addr, &read_ack_register_addr);
+if (!ags->hest_addr_le) {
+get_hw_error_offsets(le64_to_cpu(ags->hw_error_le),
+ &cper_addr, &read_ack

Re: [PATCH v2] riscv/gdb: add virt mode debug interface

2024-12-04 Thread Mario Fleischmann


Hi everyone,

I'd like to chime in here because we are sitting on a similar patch 
which I wanted to send to the mailing list as soon as riscv-debug-spec 
v1.0.0 becomes ratified.


For hypervisor support, `(qemu) info registers` isn't enough. We need to 
have both read and write access to the V-bit.


On 04.12.2024 14:43, Yanfeng Liu wrote:

On Fri, 2024-11-29 at 09:59 +, Alex Bennée wrote:

Yanfeng  writes:


On Thu, 2024-11-28 at 14:21 +, Alex Bennée wrote:

Yanfeng Liu  writes:


This adds `virt` virtual register on debug interface so that users
can access current virtualization mode for debugging purposes.

Signed-off-by: Yanfeng Liu 
---
  gdb-xml/riscv-32bit-virtual.xml |  1 +
  gdb-xml/riscv-64bit-virtual.xml |  1 +
  target/riscv/gdbstub.c  | 18 --
  3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/gdb-xml/riscv-32bit-virtual.xml b/gdb-xml/riscv-32bit-
virtual.xml
index 905f1c555d..d44b6ca2dc 100644
--- a/gdb-xml/riscv-32bit-virtual.xml
+++ b/gdb-xml/riscv-32bit-virtual.xml
@@ -8,4 +8,5 @@
  
  
    
+  
  
diff --git a/gdb-xml/riscv-64bit-virtual.xml b/gdb-xml/riscv-64bit-
virtual.xml
index 62d86c237b..7c9b63d5b6 100644
--- a/gdb-xml/riscv-64bit-virtual.xml
+++ b/gdb-xml/riscv-64bit-virtual.xml
@@ -8,4 +8,5 @@
  
  
    
+  
  


I assume these are mirrored in gdb not a QEMU only extension?


So far I think it is a QEMU extension and the `gdb-multiarch` doesn't treat
it specially. My tests show it basically works:

```
(gdb) ir virt
priv   0x3  prv:3 [Machine]
virt   0x0  0
(gdb) set $priv = 2
(gdb) ir virt
priv   0x1  prv:1 [Supervisor]
virt   0x0  0
(gdb) set $virt = 1
(gdb) ir virt
priv   0x1  prv:1 [Supervisor]
virt   0x1  1
(gdb) set $virt = 0
(gdb) ir virt
priv   0x1  prv:1 [Supervisor]
virt   0x0  0
(gdb) set $virt = 1
(gdb) ir virt
priv   0x1  prv:1 [Supervisor]
virt   0x1  1
(gdb) set $priv = 3
(gdb) ir virt
priv   0x3  prv:3 [Machine]
virt   0x0  0
```


A gdbstub test case would be useful for this although I don't know if
the RiscV check-tcg tests switch mode at all.



As I am rather new to QEMU, please teach how we can add it as a QEMU only
extension.


You don't need to extend the XML from GDB, you can build a specific one
for QEMU extensions. For example:

     gdb_feature_builder_init(&param.builder,
  &cpu->dyn_sysreg_feature.desc,
  "org.qemu.gdb.arm.sys.regs",
  "system-registers.xml",
  base_reg);

This exports all the system registers QEMU knows about and GDB can
access generically. Note the id is org.qemu..., indicating its our
schema not gdbs.

Thanks for teaching, I need time to digest. I guess more feature builder APIs
are needed (like append_reg) and the getter/setter callbacks might be at a
different place.

BTW, compared to adding virtual register `virt`, how do you think if we share
the V bit as part of existing `priv` register?


IMHO this is a very good idea since the latest release candidate of 
riscv-debug-spec also includes the V bit in priv:2.



Or maybe we shall talk to GDB community to get their opinions? If they agree to
add a few words about V bit here
https://sourceware.org/gdb/current/onlinedocs/gdb.html/RISC_002dV-Features.html,
then it saves us a lot.


Apart from currently not being supported by GDB

(gdb) info register $priv
priv   0x5  prv:5 [INVALID]

are there any reasons from QEMU's side that would speak against 
including V in priv?

[PATCH v3 0/5] Change ghes driver to use HEST-based offsets

This series was part of the previous PR to add generic error injection
support on GHES. It depends on a cleanup patch series sent earlier
today:


https://lore.kernel.org/qemu-devel/cover.1733297707.git.mchehab+hua...@kernel.org/T/#t

It contains the changes to the math used to calculate offsets in the HEST table
and the hardware_error firmware file. It prepares for the addition of GHES
error injection.

The first patch was previously in the cleanup series. It prepares
the logic to support multiple sources.

The second patch adds a new firmware file to store HEST address.

The third patch uses the new firmware file to calculate offsets using
the HEST table.

Patches 4 and 5 add migration support. They assume that this
series will be merged for qemu 9.2 (maybe it is too late for that,
as QEMU is now on soft freeze). 

I tested migration using both virt-9.1 and virt-9.2 machines
on qemu 9.2.

I also tested migration with:

qemu-9.1 -M virt-9.1 -cpu cortex-a57 => qemu-9.2 -M virt-9.1 -cpu 
cortex-a57
qemu-9.2 -M virt-9.1 -cpu cortex-a57 => qemu-9.1 -M virt-9.1 -cpu 
cortex-a57 

The full qemu command used to test backward compatibility when running virt-9.1 is:


~/qemu/build/qemu-system-aarch64 \
-m 4g,maxmem=8G,slots=8 -monitor stdio -no-reboot -bios 
~/emulator/QEMU_EFI-silent.fd 
-kernel ~/kernel/arm64_build/arch/arm64/boot/Image.gz -device 
pcie-root-port,id=root_port1 -device virtio-blk-pci,drive=hd -device 
virtio-net-pci,netdev=mynet,id=bob -drive 
if=none,file=~/emulator/debian.qcow2,format=qcow2,id=hd -object 
memory-backend-ram,size=4G,id=mem0 -netdev 
type=user,id=mynet,hostfwd=tcp::-:22 -qmp 
tcp:localhost:4445,server=on,wait=off -M 
virt-9.1,nvdimm=on,gic-version=3,ras=on -cpu max -smp 4 -numa 
node,nodeid=0,cpus=0-3,memdev=mem0 -append 'earlycon nomodeset root=/dev/vda1 
fsck.mode=skip tp_printk maxcpus=4'

(I actually call it from two different directories, one with qemu-9.1 and the 
other one with qemu-9.2.)

For tests on qemu-9.2 with virt-9.2, I used a similar command:

~/qemu/build/qemu-system-aarch64 -m 4g,maxmem=8G,slots=8 -monitor stdio 
-no-reboot -bios ~/emulator/QEMU_EFI-silent.fd -kernel 
~/kernel/arm64_build/arch/arm64/boot/Image.gz -device 
pcie-root-port,id=root_port1 -device virtio-blk-pci,drive=hd -device 
virtio-net-pci,netdev=mynet,id=bob -drive 
if=none,file=~/emulator/debian.qcow2,format=qcow2,id=hd -object 
memory-backend-ram,size=4G,id=mem0 -netdev 
type=user,id=mynet,hostfwd=tcp::-:22 -qmp 
tcp:localhost:4445,server=on,wait=off -M 
virt-9.2,nvdimm=on,gic-version=3,ras=on -cpu max -smp 4 -numa 
node,nodeid=0,cpus=0-3,memdev=mem0 -append 'earlycon nomodeset root=/dev/vda1 
fsck.mode=skip tp_printk maxcpus=4'

---

v3: did some minor cleanups at the code, as suggested by Jonathan Cameron.

v2:
  - some whitespace and comment changes
  - patch 3/6 (acpi/ghes: rename the function which gets hw error offsets)
was merged on the cleanup series.

Mauro Carvalho Chehab (5):
  acpi/ghes: Prepare to support multiple sources on ghes
  acpi/ghes: add a firmware file with HEST address
  acpi/ghes: Use HEST table offsets when preparing GHES records
  acpi/generic_event_device: Update GHES migration to cover hest addr
  acpi/generic_event_device: add logic to detect if HEST addr is
available

 hw/acpi/generic_event_device.c |  30 +++
 hw/acpi/ghes.c | 160 +
 hw/arm/virt-acpi-build.c   |  33 ++-
 hw/core/machine.c  |   2 +
 include/hw/acpi/ghes.h |  23 +++--
 5 files changed, 220 insertions(+), 28 deletions(-)

-- 
2.47.1

Re: [PATCH V4 14/19] migration: cpr-transfer mode

2024-12-04 Thread Steven Sistare


On 12/2/2024 8:20 AM, Steve Sistare wrote:
[...]

+
+/*
+ * If qmp_migrate_finish has not been called, then there is no path that
+ * will complete the cancellation.  Do it now.
+ */
+if (setup && !s->to_dst_file) {
+migrate_set_state(&s->state, s->state, MIGRATION_STATUS_CANCELLED);
+cpr_state_close();
+migrate_hup_delete(s);
+vm_resume(s->vm_old_state);
+}


I forgot to make changes here as we discussed in
  
https://lore.kernel.org/qemu-devel/2dc614cb-8754-423f-8c31-e5425075a...@oracle.com/

I will move vm_resume to the patch "stop vm earlier for cpr" in a later series, 
and
verify the current state:

+if (setup && !s->to_dst_file) {
+migrate_set_state(&s->state, MIGRATION_STATUS_CANCELLING, 
MIGRATION_STATUS_CANCELLED);
+cpr_state_close();
+migrate_hup_delete(s);
+}

- Steve

[PATCH v5 06/16] acpi/ghes: Remove a duplicated out of bounds check

acpi_ghes_record_errors() has an assert() at the beginning
to ensure that source_id will be lower than
ACPI_GHES_ERROR_SOURCE_COUNT. Remove a duplicated check.

Signed-off-by: Mauro Carvalho Chehab 
Reviewed-by: Igor Mammedov 
Reviewed-by: Jonathan Cameron 
---
 hw/acpi/ghes.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c
index 29cd7e4d8171..5f67322bf0f2 100644
--- a/hw/acpi/ghes.c
+++ b/hw/acpi/ghes.c
@@ -404,9 +404,7 @@ int acpi_ghes_record_errors(uint8_t source_id, uint64_t 
physical_address)
 return -1;
 }
 
-if (source_id < ACPI_GHES_ERROR_SOURCE_COUNT) {
-start_addr += source_id * sizeof(uint64_t);
-}
+start_addr += source_id * sizeof(uint64_t);
 
 cpu_physical_memory_read(start_addr, &error_block_addr,
  sizeof(error_block_addr));
-- 
2.47.1

Re: [PATCH v5 13/16] acpi/ghes: better name the offset of the hardware error firmware

On Wed,  4 Dec 2024 08:41:21 +0100
Mauro Carvalho Chehab  wrote:

> The hardware error firmware is where HEST error structures are
  ^^^ I can't parse this, suspect you've meant 
something else here

> stored. Those can be GHESv2, but they can also be other types.
> 
> Better name the location of the hardware error.
> 
> No functional changes.
> 
> Signed-off-by: Mauro Carvalho Chehab 
> Reviewed-by: Jonathan Cameron 
> Reviewed-by: Igor Mammedov 
> ---
>  hw/acpi/generic_event_device.c | 4 ++--
>  hw/acpi/ghes.c | 4 ++--
>  include/hw/acpi/ghes.h | 2 +-
>  3 files changed, 5 insertions(+), 5 deletions(-)
> 
> diff --git a/hw/acpi/generic_event_device.c b/hw/acpi/generic_event_device.c
> index 663d9cb09380..17baf36132a8 100644
> --- a/hw/acpi/generic_event_device.c
> +++ b/hw/acpi/generic_event_device.c
> @@ -364,7 +364,7 @@ static const VMStateDescription vmstate_ghes = {
>  .version_id = 1,
>  .minimum_version_id = 1,
>  .fields = (const VMStateField[]) {
> -VMSTATE_UINT64(ghes_addr_le, AcpiGhesState),
> +VMSTATE_UINT64(hw_error_le, AcpiGhesState),
>  VMSTATE_END_OF_LIST()
>  },
>  };
> @@ -372,7 +372,7 @@ static const VMStateDescription vmstate_ghes = {
>  static bool ghes_needed(void *opaque)
>  {
>  AcpiGedState *s = opaque;
> -return s->ghes_state.ghes_addr_le;
> +return s->ghes_state.hw_error_le;
>  }
>  
>  static const VMStateDescription vmstate_ghes_state = {
> diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c
> index 52c2b69d3664..90d76b9c2d8c 100644
> --- a/hw/acpi/ghes.c
> +++ b/hw/acpi/ghes.c
> @@ -359,7 +359,7 @@ void acpi_ghes_add_fw_cfg(AcpiGhesState *ags, FWCfgState 
> *s,
>  
>  /* Create a read-write fw_cfg file for Address */
>  fw_cfg_add_file_callback(s, ACPI_HW_ERROR_ADDR_FW_CFG_FILE, NULL, NULL,
> -NULL, &(ags->ghes_addr_le), sizeof(ags->ghes_addr_le), false);
> +NULL, &(ags->hw_error_le), sizeof(ags->hw_error_le), false);
>  
>  ags->present = true;
>  }
> @@ -385,7 +385,7 @@ void ghes_record_cper_errors(const void *cper, size_t len,
>  }
>  ags = &acpi_ged_state->ghes_state;
>  
> -start_addr = le64_to_cpu(ags->ghes_addr_le);
> +start_addr = le64_to_cpu(ags->hw_error_le);
>  
>  start_addr += source_id * sizeof(uint64_t);
>  
> diff --git a/include/hw/acpi/ghes.h b/include/hw/acpi/ghes.h
> index 21666a4bcc8b..39619a2457cb 100644
> --- a/include/hw/acpi/ghes.h
> +++ b/include/hw/acpi/ghes.h
> @@ -65,7 +65,7 @@ enum {
>  };
>  
>  typedef struct AcpiGhesState {
> -uint64_t ghes_addr_le;
> +uint64_t hw_error_le;
>  bool present; /* True if GHES is present at all on this board */
>  } AcpiGhesState;
>

Re: [PATCH v5 09/16] acpi/ghes: make the GHES record generation more generic

On Wed,  4 Dec 2024 08:41:17 +0100
Mauro Carvalho Chehab  wrote:

> Split the code into separate functions to allow using the
> common CPER filling code by different error sources.
> 
> The generic code was moved to ghes_record_cper_errors(),
> and ghes_gen_err_data_uncorrectable_recoverable() now contains
> only a logic to fill the Generic Error Data part of the record,
> as described at:
> 
>   ACPI 6.2: 18.3.2.7.1 Generic Error Data
> 
> The remaining code to generate a memory error now belongs to
> acpi_ghes_record_errors() function.
> 
> A further patch will give it a better name.
> 
> Signed-off-by: Mauro Carvalho Chehab 

Reviewed-by: Igor Mammedov 

> 
> # Conflicts:
> # roms/edk2
> ---
>  hw/acpi/ghes.c | 121 -
>  include/hw/acpi/ghes.h |   3 +
>  2 files changed, 73 insertions(+), 51 deletions(-)
> 
> diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c
> index a3dffd78b012..4b5332f8c667 100644
> --- a/hw/acpi/ghes.c
> +++ b/hw/acpi/ghes.c
> @@ -181,51 +181,24 @@ static void acpi_ghes_build_append_mem_cper(GArray 
> *table,
>  build_append_int_noprefix(table, 0, 7);
>  }
>  
> -static int acpi_ghes_record_mem_error(uint64_t error_block_address,
> -  uint64_t error_physical_addr)
> +static void
> +ghes_gen_err_data_uncorrectable_recoverable(GArray *block,
> +const uint8_t *section_type,
> +int data_length)
>  {
> -GArray *block;
> -
> -/* Memory Error Section Type */
> -const uint8_t uefi_cper_mem_sec[] =
> -  UUID_LE(0xA5BC1114, 0x6F64, 0x4EDE, 0xB8, 0x63, 0x3E, 0x83, \
> -  0xED, 0x7C, 0x83, 0xB1);
> -
>  /* invalid fru id: ACPI 4.0: 17.3.2.6.1 Generic Error Data,
>   * Table 17-13 Generic Error Data Entry
>   */
>  QemuUUID fru_id = {};
> -uint32_t data_length;
> -
> -block = g_array_new(false, true /* clear */, 1);
> -
> -/* This is the length if adding a new generic error data entry*/
> -data_length = ACPI_GHES_DATA_LENGTH + ACPI_GHES_MEM_CPER_LENGTH;
> -/*
> - * It should not run out of the preallocated memory if adding a new 
> generic
> - * error data entry
> - */
> -assert((data_length + ACPI_GHES_GESB_SIZE) <=
> -ACPI_GHES_MAX_RAW_DATA_LENGTH);
>  
>  /* Build the new generic error status block header */
>  acpi_ghes_generic_error_status(block, ACPI_GEBS_UNCORRECTABLE,
>  0, 0, data_length, ACPI_CPER_SEV_RECOVERABLE);
>  
>  /* Build this new generic error data entry header */
> -acpi_ghes_generic_error_data(block, uefi_cper_mem_sec,
> +acpi_ghes_generic_error_data(block, section_type,
>  ACPI_CPER_SEV_RECOVERABLE, 0, 0,
>  ACPI_GHES_MEM_CPER_LENGTH, fru_id, 0);
> -
> -/* Build the memory section CPER for above new generic error data entry 
> */
> -acpi_ghes_build_append_mem_cper(block, error_physical_addr);
> -
> -/* Write the generic error data entry into guest memory */
> -cpu_physical_memory_write(error_block_address, block->data, block->len);
> -
> -g_array_free(block, true);
> -
> -return 0;
>  }
>  
>  /*
> @@ -383,15 +356,18 @@ void acpi_ghes_add_fw_cfg(AcpiGhesState *ags, 
> FWCfgState *s,
>  ags->present = true;
>  }
>  
> -int acpi_ghes_record_errors(uint16_t source_id, uint64_t physical_address)
> +void ghes_record_cper_errors(const void *cper, size_t len,
> + uint16_t source_id, Error **errp)
>  {
>  uint64_t error_block_addr, read_ack_register_addr, read_ack_register = 0;
>  uint64_t start_addr;
> -bool ret = -1;
>  AcpiGedState *acpi_ged_state;
>  AcpiGhesState *ags;
>  
> -assert(source_id < ACPI_GHES_ERROR_SOURCE_COUNT);
> +if (len > ACPI_GHES_MAX_RAW_DATA_LENGTH) {
> +error_setg(errp, "GHES CPER record is too big: %ld", len);
> +return;
> +}
>  
>  acpi_ged_state = ACPI_GED(object_resolve_path_type("", TYPE_ACPI_GED,
> NULL));
> @@ -406,6 +382,10 @@ int acpi_ghes_record_errors(uint16_t source_id, uint64_t 
> physical_address)
>   sizeof(error_block_addr));
>  
>  error_block_addr = le64_to_cpu(error_block_addr);
> +if (!error_block_addr) {
> +error_setg(errp, "can not find Generic Error Status Block");
> +return;
> +}
>  
>  read_ack_register_addr = start_addr +
>   ACPI_GHES_ERROR_SOURCE_COUNT * sizeof(uint64_t);
> @@ -415,24 +395,63 @@ int acpi_ghes_record_errors(uint16_t source_id, 
> uint64_t physical_address)
>  
>  /* zero means OSPM does not acknowledge the error */
>  if (!read_ack_register) {
> -error_report("OSPM does not acknowledge previous error,"
> - " so can not record CPER for current error anymore");
> -} else if (error_block_addr) {
> -read_ack

Re: [PATCH v5 10/16] acpi/ghes: better name GHES memory error function

On Wed,  4 Dec 2024 08:41:18 +0100
Mauro Carvalho Chehab  wrote:

> The current function used to generate GHES data is specific for
> memory errors. Give a better name for it, as we now have a generic
> function as well.
> 
> Reviewed-by: Igor Mammedov 
> Reviewed-by: Jonathan Cameron 
> Signed-off-by: Mauro Carvalho Chehab 

not that it matters but for FYI
Sign off of author goes 1st and then after it other tags
that were added later

> ---
>  hw/acpi/ghes-stub.c| 2 +-
>  hw/acpi/ghes.c | 2 +-
>  include/hw/acpi/ghes.h | 4 ++--
>  target/arm/kvm.c   | 2 +-
>  4 files changed, 5 insertions(+), 5 deletions(-)
> 
> diff --git a/hw/acpi/ghes-stub.c b/hw/acpi/ghes-stub.c
> index 2b64cbd2819a..7cec1812dad9 100644
> --- a/hw/acpi/ghes-stub.c
> +++ b/hw/acpi/ghes-stub.c
> @@ -11,7 +11,7 @@
>  #include "qemu/osdep.h"
>  #include "hw/acpi/ghes.h"
>  
> -int acpi_ghes_record_errors(uint16_t source_id, uint64_t physical_address)
> +int acpi_ghes_memory_errors(uint16_t source_id, uint64_t physical_address)
>  {
>  return -1;
>  }
> diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c
> index 4b5332f8c667..414a4a1ee00e 100644
> --- a/hw/acpi/ghes.c
> +++ b/hw/acpi/ghes.c
> @@ -415,7 +415,7 @@ void ghes_record_cper_errors(const void *cper, size_t len,
>  return;
>  }
>  
> -int acpi_ghes_record_errors(uint16_t source_id, uint64_t physical_address)
> +int acpi_ghes_memory_errors(uint16_t source_id, uint64_t physical_address)
>  {
>  /* Memory Error Section Type */
>  const uint8_t guid[] =
> diff --git a/include/hw/acpi/ghes.h b/include/hw/acpi/ghes.h
> index 8859346af51a..21666a4bcc8b 100644
> --- a/include/hw/acpi/ghes.h
> +++ b/include/hw/acpi/ghes.h
> @@ -74,15 +74,15 @@ void acpi_build_hest(GArray *table_data, GArray 
> *hardware_errors,
>   const char *oem_id, const char *oem_table_id);
>  void acpi_ghes_add_fw_cfg(AcpiGhesState *vms, FWCfgState *s,
>GArray *hardware_errors);
> +int acpi_ghes_memory_errors(uint16_t source_id, uint64_t 
> error_physical_addr);
>  void ghes_record_cper_errors(const void *cper, size_t len,
>   uint16_t source_id, Error **errp);
> -int acpi_ghes_record_errors(uint16_t source_id, uint64_t 
> error_physical_addr);
>  
>  /**
>   * acpi_ghes_present: Report whether ACPI GHES table is present
>   *
>   * Returns: true if the system has an ACPI GHES table and it is
> - * safe to call acpi_ghes_record_errors() to record a memory error.
> + * safe to call acpi_ghes_memory_errors() to record a memory error.
>   */
>  bool acpi_ghes_present(void);
>  #endif
> diff --git a/target/arm/kvm.c b/target/arm/kvm.c
> index 7b6812c0de2e..b4260467f8b9 100644
> --- a/target/arm/kvm.c
> +++ b/target/arm/kvm.c
> @@ -2387,7 +2387,7 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, 
> void *addr)
>   */
>  if (code == BUS_MCEERR_AR) {
>  kvm_cpu_synchronize_state(c);
> -if (!acpi_ghes_record_errors(ACPI_HEST_SRC_ID_SEA, paddr)) {
> +if (!acpi_ghes_memory_errors(ACPI_HEST_SRC_ID_SEA, paddr)) {
>  kvm_inject_arm_sea(c);
>  } else {
>  error_report("failed to record the error");

Re: [PATCH v5 14/16] acpi/ghes: move offset calculus to a separate function

On Wed,  4 Dec 2024 08:41:22 +0100
Mauro Carvalho Chehab  wrote:

> Currently, CPER address location is calculated as an offset of
> the hardware_errors table. It is also badly named, as the
> offset actually used is the address where the CPER data starts,
> and not the beginning of the error source.
> 
> Move the logic which calculates such offset to a separate
> function, in preparation for a patch that will be changing the
> logic to calculate it from the HEST table.
> 
> While here, properly name the variable which stores the cper
> address.
> 
> Signed-off-by: Mauro Carvalho Chehab 
> Reviewed-by: Jonathan Cameron 

Reviewed-by: Igor Mammedov 

> ---
>  hw/acpi/ghes.c | 40 +++-
>  1 file changed, 31 insertions(+), 9 deletions(-)
> 
> diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c
> index 90d76b9c2d8c..a4453ee357bc 100644
> --- a/hw/acpi/ghes.c
> +++ b/hw/acpi/ghes.c
> @@ -364,10 +364,37 @@ void acpi_ghes_add_fw_cfg(AcpiGhesState *ags, 
> FWCfgState *s,
>  ags->present = true;
>  }
>  
> +static void get_hw_error_offsets(uint64_t ghes_addr,
> + uint64_t *cper_addr,
> + uint64_t *read_ack_register_addr)
> +{
> +if (!ghes_addr) {
> +return;
> +}
> +
> +/*
> + * non-HEST version supports only one source, so no need to change
> + * the start offset based on the source ID. Also, we can't validate
> + * the source ID, as it is stored inside the HEST table.
> + */
> +
> +cpu_physical_memory_read(ghes_addr, cper_addr,
> + sizeof(*cper_addr));
> +
> +*cper_addr = le64_to_cpu(*cper_addr);
> +
> +/*
> + * As the current version supports only one source, the ack offset is
> + * just sizeof(uint64_t).
> + */
> +*read_ack_register_addr = ghes_addr +
> +   ACPI_GHES_ERROR_SOURCE_COUNT * sizeof(uint64_t);
> +}
> +
>  void ghes_record_cper_errors(const void *cper, size_t len,
>   uint16_t source_id, Error **errp)
>  {
> -uint64_t error_block_addr, read_ack_register_addr, read_ack_register = 0;
> +uint64_t cper_addr = 0, read_ack_register_addr = 0, read_ack_register;
>  uint64_t start_addr;
>  AcpiGedState *acpi_ged_state;
>  AcpiGhesState *ags;
> @@ -389,18 +416,13 @@ void ghes_record_cper_errors(const void *cper, size_t 
> len,
>  
>  start_addr += source_id * sizeof(uint64_t);
>  
> -cpu_physical_memory_read(start_addr, &error_block_addr,
> - sizeof(error_block_addr));
> +get_hw_error_offsets(start_addr, &cper_addr, &read_ack_register_addr);
>  
> -error_block_addr = le64_to_cpu(error_block_addr);
> -if (!error_block_addr) {
> +if (!cper_addr) {
>  error_setg(errp, "can not find Generic Error Status Block");
>  return;
>  }
>  
> -read_ack_register_addr = start_addr +
> - ACPI_GHES_ERROR_SOURCE_COUNT * sizeof(uint64_t);
> -
>  cpu_physical_memory_read(read_ack_register_addr,
>   &read_ack_register, sizeof(read_ack_register));
>  
> @@ -421,7 +443,7 @@ void ghes_record_cper_errors(const void *cper, size_t len,
>  &read_ack_register, sizeof(uint64_t));
>  
>  /* Write the generic error data entry into guest memory */
> -cpu_physical_memory_write(error_block_addr, cper, len);
> +cpu_physical_memory_write(cper_addr, cper, len);
>  
>  return;
>  }

[PATCH] hostmem-file: add the 'hmem' option

2024-12-04 Thread Zhigang Luo

This boolean option 'hmem' allows users to set a memory region from
memory-backend-file as heterogeneous memory. If 'hmem=on', QEMU will
set the flag RAM_HMEM in the RAM block of the corresponding memory
region and set the e820 type to E820_SOFT_RESERVED for this region.

Signed-off-by: Zhigang Luo 
---
 backends/hostmem-file.c  | 23 +++
 hw/i386/e820_memory_layout.h |  1 +
 hw/i386/pc.c | 16 
 include/exec/cpu-common.h|  1 +
 include/exec/memory.h|  3 +++
 qapi/qom.json|  4 
 system/physmem.c |  7 ++-
 7 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/backends/hostmem-file.c b/backends/hostmem-file.c
index 7e5072e33e..5ddfdbaf86 100644
--- a/backends/hostmem-file.c
+++ b/backends/hostmem-file.c
@@ -32,6 +32,7 @@ struct HostMemoryBackendFile {
 uint64_t offset;
 bool discard_data;
 bool is_pmem;
+bool is_hmem;
 bool readonly;
 OnOffAuto rom;
 };
@@ -88,6 +89,7 @@ file_backend_memory_alloc(HostMemoryBackend *backend, Error 
**errp)
 ram_flags |= backend->reserve ? 0 : RAM_NORESERVE;
 ram_flags |= backend->guest_memfd ? RAM_GUEST_MEMFD : 0;
 ram_flags |= fb->is_pmem ? RAM_PMEM : 0;
+ram_flags |= fb->is_hmem ? RAM_HMEM : 0;
 ram_flags |= RAM_NAMED_FILE;
 return memory_region_init_ram_from_file(&backend->mr, OBJECT(backend), 
name,
 backend->size, fb->align, 
ram_flags,
@@ -256,6 +258,25 @@ static void file_memory_backend_set_rom(Object *obj, 
Visitor *v,
 visit_type_OnOffAuto(v, name, &fb->rom, errp);
 }
 
+static bool file_memory_backend_get_hmem(Object *o, Error **errp)
+{
+return MEMORY_BACKEND_FILE(o)->is_hmem;
+}
+
+static void file_memory_backend_set_hmem(Object *o, bool value, Error **errp)
+{
+HostMemoryBackend *backend = MEMORY_BACKEND(o);
+HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(o);
+
+if (host_memory_backend_mr_inited(backend)) {
+error_setg(errp, "cannot change property 'hmem' of %s.",
+   object_get_typename(o));
+return;
+}
+
+fb->is_hmem = value;
+}
+
 static void file_backend_unparent(Object *obj)
 {
 HostMemoryBackend *backend = MEMORY_BACKEND(obj);
@@ -295,6 +316,8 @@ file_backend_class_init(ObjectClass *oc, void *data)
 object_class_property_add_bool(oc, "pmem",
 file_memory_backend_get_pmem, file_memory_backend_set_pmem);
 #endif
+object_class_property_add_bool(oc, "hmem",
+file_memory_backend_get_hmem, file_memory_backend_set_hmem);
 object_class_property_add_bool(oc, "readonly",
 file_memory_backend_get_readonly,
 file_memory_backend_set_readonly);
diff --git a/hw/i386/e820_memory_layout.h b/hw/i386/e820_memory_layout.h
index b50acfa201..8af6a9cfac 100644
--- a/hw/i386/e820_memory_layout.h
+++ b/hw/i386/e820_memory_layout.h
@@ -15,6 +15,7 @@
 #define E820_ACPI   3
 #define E820_NVS4
 #define E820_UNUSABLE   5
+#define E820_SOFT_RESERVED  0xEFFF
 
 struct e820_entry {
 uint64_t address;
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 317aaca25a..41e9cc276c 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -785,6 +785,21 @@ static hwaddr pc_max_used_gpa(PCMachineState *pcms, 
uint64_t pci_hole64_size)
 return pc_above_4g_end(pcms) - 1;
 }
 
+static int pc_update_hmem_memory(RAMBlock *rb, void *opaque)
+{
+X86MachineState *x86ms = opaque;
+ram_addr_t offset;
+ram_addr_t length;
+
+if (qemu_ram_is_hmem(rb)) {
+offset = qemu_ram_get_offset(rb) + (0x1ULL - 
x86ms->below_4g_mem_size);
+length = qemu_ram_get_used_length(rb);
+e820_add_entry(offset, length, E820_SOFT_RESERVED);
+}
+
+return 0;
+}
+
 /*
  * AMD systems with an IOMMU have an additional hole close to the
  * 1Tb, which are special GPAs that cannot be DMA mapped. Depending
@@ -895,6 +910,7 @@ void pc_memory_init(PCMachineState *pcms,
 e820_add_entry(x86ms->above_4g_mem_start, x86ms->above_4g_mem_size,
E820_RAM);
 }
+qemu_ram_foreach_block(pc_update_hmem_memory, x86ms);
 
 if (pcms->sgx_epc.size != 0) {
 e820_add_entry(pcms->sgx_epc.base, pcms->sgx_epc.size, E820_RESERVED);
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index 638dc806a5..1b2dfb31e8 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -98,6 +98,7 @@ ram_addr_t qemu_ram_get_offset(RAMBlock *rb);
 ram_addr_t qemu_ram_get_used_length(RAMBlock *rb);
 ram_addr_t qemu_ram_get_max_length(RAMBlock *rb);
 bool qemu_ram_is_shared(RAMBlock *rb);
+bool qemu_ram_is_hmem(RAMBlock *rb);
 bool qemu_ram_is_noreserve(RAMBlock *rb);
 bool qemu_ram_is_uf_zeroable(RAMBlock *rb);
 void qemu_ram_set_uf_zeroable(RAMBlock *rb);
diff --git a/include/exec/memory.h b/include/exec/memory.h
index 9458e2801d..18c593a00b 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -246,6 +246,9 @@ typedef stru

Re: [PATCH RFC 00/11] migration/block: disk activation rewrite

On Tue, Dec 03, 2024 at 07:51:27PM -0500, Peter Xu wrote:
>   migration/block: Merge block reactivations for fail/cancel
>   migration/block: Extend the migration_block_* API to dest side
>   migration/block: Apply the migration_block_* API to postcopy

I just noticed these three patches cannot be kept separate: right after we
introduce a flag to cache the disk activation status, it needs to apply to
all of the code, otherwise inconsistency can happen if, for example, someone
applies only one of the patches.

I also overlooked that we must take qmp_cont() into the picture too, so
that will also need to use the API so the flag will be consistent.

So in the next version I'll squash the three patches into one, and also use
the new API in qmp_cont(), so the status flag should always be consistent.

Again, to block layer developers: please help if any of you know how to
make bdrv_inactivate_all() safe to be called on top of inactivated disks,
or any reasoning on why it mustn't.  It could be very helpful.

-- 
Peter Xu

Re: [PATCH 2/6] migration: Kick postcopy threads on cancel

On Mon, Dec 02, 2024 at 07:01:33PM -0300, Fabiano Rosas wrote:
> Make sure postcopy threads are released when migrate_cancel is
> issued. Kick the postcopy_pause semaphore and have the fault thread
> read 'fault_thread_quit' when joining.
> 
> While here fix the comment mentioning userfault_event_fd.
> 
> Signed-off-by: Fabiano Rosas 

I remember that when working on postcopy, I thought about failing
migrate-cancel for postcopy in general, i.e. rejecting such a request.  And
when working on the recover feature, I had no concern about it being
cancelled, because the user really shouldn't do that.

The problem is that migrate-cancel means crashing the VM on both sides once
QEMU has already entered the postcopy stage.

If the user wants to crash the VM anyway, an easier way to do that is to
kill QEMU on both sides.

If the user wishes to cancel, we should tell them "postcopy cannot be
cancelled, until complete".  That's probably the major reason why people
think postcopy is dangerous to use.

Or do we have any use case where this could be a valid scenario?

-- 
Peter Xu

Re: [PATCH 5/6] migration: Fix hang after error in destination setup phase

On Mon, Dec 02, 2024 at 07:01:36PM -0300, Fabiano Rosas wrote:
> If the destination side fails at migration_ioc_process_incoming()
> before starting the coroutine, it will report the error but QEMU will
> not exit.
> 
> Set the migration state to FAILED and exit the process if
> exit-on-error allows.
> 
> CC: Thomas Huth 
> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2633
> Reported-by: Daniel P. Berrangé 
> Signed-off-by: Fabiano Rosas 

(I skipped the postcopy patches for now, until we finish the discussion
 in patch 2)

> ---
>  migration/channel.c   | 11 ++++++-----
>  migration/migration.c | 31 ++++++++++++++++++-------------
>  migration/migration.h |  2 +-
>  3 files changed, 25 insertions(+), 19 deletions(-)
> 
> diff --git a/migration/channel.c b/migration/channel.c
> index f9de064f3b..6d7f9172d8 100644
> --- a/migration/channel.c
> +++ b/migration/channel.c
> @@ -40,13 +40,14 @@ void migration_channel_process_incoming(QIOChannel *ioc)
>  
>  if (migrate_channel_requires_tls_upgrade(ioc)) {
>  migration_tls_channel_process_incoming(s, ioc, &local_err);
> +
> +if (local_err) {
> +error_report_err(local_err);
> +}

What if TLS processing fails here?  Do we have a similar issue where QEMU
will stall?  Do we want to cover that too?

> +
>  } else {
>  migration_ioc_register_yank(ioc);
> -migration_ioc_process_incoming(ioc, &local_err);
> -}
> -
> -if (local_err) {
> -error_report_err(local_err);
> +migration_ioc_process_incoming(ioc);
>  }
>  }
>  
> diff --git a/migration/migration.c b/migration/migration.c
> index 8a61cc26d7..cd88ebc875 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -943,7 +943,7 @@ static bool migration_should_start_incoming(bool 
> main_channel)
>  return true;
>  }
>  
> -void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
> +void migration_ioc_process_incoming(QIOChannel *ioc)
>  {
>  MigrationIncomingState *mis = migration_incoming_get_current();
>  Error *local_err = NULL;
> @@ -966,10 +966,9 @@ void migration_ioc_process_incoming(QIOChannel *ioc, 
> Error **errp)
>   * issue is not possible.
>   */
>  ret = migration_channel_read_peek(ioc, (void *)&channel_magic,
> -  sizeof(channel_magic), errp);
> -
> +  sizeof(channel_magic), &local_err);
>  if (ret != 0) {
> -return;
> +goto err;
>  }
>  
>  default_channel = (channel_magic == cpu_to_be32(QEMU_VM_FILE_MAGIC));
> @@ -977,8 +976,8 @@ void migration_ioc_process_incoming(QIOChannel *ioc, 
> Error **errp)
>  default_channel = !mis->from_src_file;
>  }
>  
> -if (multifd_recv_setup(errp) != 0) {
> -return;
> +if (multifd_recv_setup(&local_err) != 0) {
> +goto err;
>  }
>  
>  if (default_channel) {
> @@ -995,18 +994,24 @@ void migration_ioc_process_incoming(QIOChannel *ioc, 
> Error **errp)
>  postcopy_preempt_new_channel(mis, f);
>  }
>  if (local_err) {
> -error_propagate(errp, local_err);
> -return;
> +goto err;
>  }
>  }
>  
> -if (migration_should_start_incoming(default_channel)) {
> -/* If it's a recovery, we're done */
> -if (postcopy_try_recover()) {
> -return;
> -}
> +if (migration_should_start_incoming(default_channel) &&
> +!postcopy_try_recover()) {
>  migration_incoming_process();
>  }
> +
> +return;
> +
> +err:
> +error_report_err(local_err);
> +migrate_set_state(&mis->state, MIGRATION_STATUS_SETUP,
> +  MIGRATION_STATUS_FAILED);
> +if (mis->exit_on_error) {
> +exit(EXIT_FAILURE);
> +}
>  }
>  
>  /**
> diff --git a/migration/migration.h b/migration/migration.h
> index 0956e9274b..c367e5ea40 100644
> --- a/migration/migration.h
> +++ b/migration/migration.h
> @@ -477,7 +477,7 @@ void migrate_set_state(MigrationStatus *state, 
> MigrationStatus old_state,
> MigrationStatus new_state);
>  
>  void migration_fd_process_incoming(QEMUFile *f);
> -void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp);
> +void migration_ioc_process_incoming(QIOChannel *ioc);
>  void migration_incoming_process(void);
>  
>  bool  migration_has_all_channels(void);
> -- 
> 2.35.3
> 

-- 
Peter Xu

Re: [PATCH 1/6] tests/qtest/migration: Introduce migration_test_add_suffix