[Qemu-devel] [PATCH 4/4] scripts/analyse-9p-simpletrace.py: Add symbolic names for 9p operations.

2011-12-21 Thread Aneesh Kumar K.V
From: Harsh Prateek Bora 

Currently, we just print the numerical value of 9p operation identifier in
case of RERROR which is less meaningful for readability. Mapping 9p
operation ids to symbolic names provides a better tracelog:

RERROR (tag = 1 , id = TWALK , err = " No such file or directory ")
RERROR (tag = 1 , id = TUNLINKAT , err = " Directory not empty ")

This patch provides a dictionary of all possible 9p operation symbols mapped
to their numerical identifiers which are likely to be used in future at
various places in this script.

Signed-off-by: Harsh Prateek Bora 
Signed-off-by: Aneesh Kumar K.V 
---
 scripts/analyse-9p-simpletrace.py |   75 -
 1 files changed, 73 insertions(+), 2 deletions(-)

diff --git a/scripts/analyse-9p-simpletrace.py 
b/scripts/analyse-9p-simpletrace.py
index b6d58fd..3c3dee4 100755
--- a/scripts/analyse-9p-simpletrace.py
+++ b/scripts/analyse-9p-simpletrace.py
@@ -3,15 +3,86 @@
 # Usage: ./analyse-9p-simpletrace  
 #
 # Author: Harsh Prateek Bora
-
+import os
 import simpletrace
 
+symbol_9p = {
+6   : 'TLERROR',
+7   : 'RLERROR',
+8   : 'TSTATFS',
+9   : 'RSTATFS',
+12  : 'TLOPEN',
+13  : 'RLOPEN',
+14  : 'TLCREATE',
+15  : 'RLCREATE',
+16  : 'TSYMLINK',
+17  : 'RSYMLINK',
+18  : 'TMKNOD',
+19  : 'RMKNOD',
+20  : 'TRENAME',
+21  : 'RRENAME',
+22  : 'TREADLINK',
+23  : 'RREADLINK',
+24  : 'TGETATTR',
+25  : 'RGETATTR',
+26  : 'TSETATTR',
+27  : 'RSETATTR',
+30  : 'TXATTRWALK',
+31  : 'RXATTRWALK',
+32  : 'TXATTRCREATE',
+33  : 'RXATTRCREATE',
+40  : 'TREADDIR',
+41  : 'RREADDIR',
+50  : 'TFSYNC',
+51  : 'RFSYNC',
+52  : 'TLOCK',
+53  : 'RLOCK',
+54  : 'TGETLOCK',
+55  : 'RGETLOCK',
+70  : 'TLINK',
+71  : 'RLINK',
+72  : 'TMKDIR',
+73  : 'RMKDIR',
+74  : 'TRENAMEAT',
+75  : 'RRENAMEAT',
+76  : 'TUNLINKAT',
+77  : 'RUNLINKAT',
+100 : 'TVERSION',
+101 : 'RVERSION',
+102 : 'TAUTH',
+103 : 'RAUTH',
+104 : 'TATTACH',
+105 : 'RATTACH',
+106 : 'TERROR',
+107 : 'RERROR',
+108 : 'TFLUSH',
+109 : 'RFLUSH',
+110 : 'TWALK',
+111 : 'RWALK',
+112 : 'TOPEN',
+113 : 'ROPEN',
+114 : 'TCREATE',
+115 : 'RCREATE',
+116 : 'TREAD',
+117 : 'RREAD',
+118 : 'TWRITE',
+119 : 'RWRITE',
+120 : 'TCLUNK',
+121 : 'RCLUNK',
+122 : 'TREMOVE',
+123 : 'RREMOVE',
+124 : 'TSTAT',
+125 : 'RSTAT',
+126 : 'TWSTAT',
+127 : 'RWSTAT'
+}
+
 class VirtFSRequestTracker(simpletrace.Analyzer):
 def begin(self):
 print "Pretty printing 9p simpletrace log ..."
 
 def v9fs_rerror(self, tag, id, err):
-print "RERROR (tag =", tag, ", id =", id, ",err =", err, ")"
+print "RERROR (tag =", tag, ", id =", symbol_9p[id], ", err = 
\"", os.strerror(err), "\")"
 
 def v9fs_version(self, tag, id, msize, version):
 print "TVERSION (tag =", tag, ", msize =", msize, ", version 
=", version, ")"
-- 
1.7.8.247.g10f4e.dirty




[Qemu-devel] [PATCH 3/4] hw/9pfs: iattr_valid flags are kernel internal flags map them to 9p values.

2011-12-21 Thread Aneesh Kumar K.V
From: "Aneesh Kumar K.V" 

Kernel internal values can change, add protocol values for these constant and
use them.

Signed-off-by: Aneesh Kumar K.V 
---
 hw/9pfs/virtio-9p.c |   47 ---
 1 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/hw/9pfs/virtio-9p.c b/hw/9pfs/virtio-9p.c
index 7f1301b..df0a8e7 100644
--- a/hw/9pfs/virtio-9p.c
+++ b/hw/9pfs/virtio-9p.c
@@ -1293,17 +1293,18 @@ out_nofid:
 complete_pdu(s, pdu, retval);
 }
 
-/* From Linux kernel code */
-#define ATTR_MODE(1 << 0)
-#define ATTR_UID (1 << 1)
-#define ATTR_GID (1 << 2)
-#define ATTR_SIZE(1 << 3)
-#define ATTR_ATIME   (1 << 4)
-#define ATTR_MTIME   (1 << 5)
-#define ATTR_CTIME   (1 << 6)
-#define ATTR_MASK127
-#define ATTR_ATIME_SET  (1 << 7)
-#define ATTR_MTIME_SET  (1 << 8)
+/* Attribute flags */
+#define P9_ATTR_MODE   (1 << 0)
+#define P9_ATTR_UID(1 << 1)
+#define P9_ATTR_GID(1 << 2)
+#define P9_ATTR_SIZE   (1 << 3)
+#define P9_ATTR_ATIME  (1 << 4)
+#define P9_ATTR_MTIME  (1 << 5)
+#define P9_ATTR_CTIME  (1 << 6)
+#define P9_ATTR_ATIME_SET  (1 << 7)
+#define P9_ATTR_MTIME_SET  (1 << 8)
+
+#define P9_ATTR_MASK127
 
 static void v9fs_setattr(void *opaque)
 {
@@ -1322,16 +1323,16 @@ static void v9fs_setattr(void *opaque)
 err = -EINVAL;
 goto out_nofid;
 }
-if (v9iattr.valid & ATTR_MODE) {
+if (v9iattr.valid & P9_ATTR_MODE) {
 err = v9fs_co_chmod(pdu, &fidp->path, v9iattr.mode);
 if (err < 0) {
 goto out;
 }
 }
-if (v9iattr.valid & (ATTR_ATIME | ATTR_MTIME)) {
+if (v9iattr.valid & (P9_ATTR_ATIME | P9_ATTR_MTIME)) {
 struct timespec times[2];
-if (v9iattr.valid & ATTR_ATIME) {
-if (v9iattr.valid & ATTR_ATIME_SET) {
+if (v9iattr.valid & P9_ATTR_ATIME) {
+if (v9iattr.valid & P9_ATTR_ATIME_SET) {
 times[0].tv_sec = v9iattr.atime_sec;
 times[0].tv_nsec = v9iattr.atime_nsec;
 } else {
@@ -1340,8 +1341,8 @@ static void v9fs_setattr(void *opaque)
 } else {
 times[0].tv_nsec = UTIME_OMIT;
 }
-if (v9iattr.valid & ATTR_MTIME) {
-if (v9iattr.valid & ATTR_MTIME_SET) {
+if (v9iattr.valid & P9_ATTR_MTIME) {
+if (v9iattr.valid & P9_ATTR_MTIME_SET) {
 times[1].tv_sec = v9iattr.mtime_sec;
 times[1].tv_nsec = v9iattr.mtime_nsec;
 } else {
@@ -1359,13 +1360,13 @@ static void v9fs_setattr(void *opaque)
  * If the only valid entry in iattr is ctime we can call
  * chown(-1,-1) to update the ctime of the file
  */
-if ((v9iattr.valid & (ATTR_UID | ATTR_GID)) ||
-((v9iattr.valid & ATTR_CTIME)
- && !((v9iattr.valid & ATTR_MASK) & ~ATTR_CTIME))) {
-if (!(v9iattr.valid & ATTR_UID)) {
+if ((v9iattr.valid & (P9_ATTR_UID | P9_ATTR_GID)) ||
+((v9iattr.valid & P9_ATTR_CTIME)
+ && !((v9iattr.valid & P9_ATTR_MASK) & ~P9_ATTR_CTIME))) {
+if (!(v9iattr.valid & P9_ATTR_UID)) {
 v9iattr.uid = -1;
 }
-if (!(v9iattr.valid & ATTR_GID)) {
+if (!(v9iattr.valid & P9_ATTR_GID)) {
 v9iattr.gid = -1;
 }
 err = v9fs_co_chown(pdu, &fidp->path, v9iattr.uid,
@@ -1374,7 +1375,7 @@ static void v9fs_setattr(void *opaque)
 goto out;
 }
 }
-if (v9iattr.valid & (ATTR_SIZE)) {
+if (v9iattr.valid & (P9_ATTR_SIZE)) {
 err = v9fs_co_truncate(pdu, &fidp->path, v9iattr.size);
 if (err < 0) {
 goto out;
-- 
1.7.8.247.g10f4e.dirty




[Qemu-devel] [PATCH 1/4] hw/9pfs: replace iovec manipulation with QEMUIOVector

2011-12-21 Thread Aneesh Kumar K.V
From: Stefan Hajnoczi 

The v9fs_read() and v9fs_write() functions rely on iovec[] manipulation
code should be replaced with QEMUIOVector to avoid duplicating code.
In the future it may be possible to make the code even more concise by
using QEMUIOVector consistently across virtio and 9pfs.

The "v" format specifier for pdu_marshal() and pdu_unmarshal() is
dropped since it does not actually pack/unpack anything.  The specifier
was also not implemented to update the offset variable and could only be
used at the end of a format string, another sign that this shouldn't
really be a format specifier.  Instead, see the new
v9fs_init_qiov_from_pdu() function.

This change avoids a possible iovec[] buffer overflow when indirect
vrings are used since the number of vectors is now limited by the
underlying VirtQueueElement and cannot be out-of-bounds.

Signed-off-by: Stefan Hajnoczi 
Signed-off-by: Aneesh Kumar K.V 
---
 hw/9pfs/virtio-9p.c |  162 +++
 1 files changed, 60 insertions(+), 102 deletions(-)

diff --git a/hw/9pfs/virtio-9p.c b/hw/9pfs/virtio-9p.c
index 36a862f..46dc9f7 100644
--- a/hw/9pfs/virtio-9p.c
+++ b/hw/9pfs/virtio-9p.c
@@ -674,40 +674,6 @@ static size_t pdu_pack(V9fsPDU *pdu, size_t offset, const 
void *src,
  offset, size, 1);
 }
 
-static int pdu_copy_sg(V9fsPDU *pdu, size_t offset, int rx, struct iovec *sg)
-{
-size_t pos = 0;
-int i, j;
-struct iovec *src_sg;
-unsigned int num;
-
-if (rx) {
-src_sg = pdu->elem.in_sg;
-num = pdu->elem.in_num;
-} else {
-src_sg = pdu->elem.out_sg;
-num = pdu->elem.out_num;
-}
-
-j = 0;
-for (i = 0; i < num; i++) {
-if (offset <= pos) {
-sg[j].iov_base = src_sg[i].iov_base;
-sg[j].iov_len = src_sg[i].iov_len;
-j++;
-} else if (offset < (src_sg[i].iov_len + pos)) {
-sg[j].iov_base = src_sg[i].iov_base;
-sg[j].iov_len = src_sg[i].iov_len;
-sg[j].iov_base += (offset - pos);
-sg[j].iov_len -= (offset - pos);
-j++;
-}
-pos += src_sg[i].iov_len;
-}
-
-return j;
-}
-
 static size_t pdu_unmarshal(V9fsPDU *pdu, size_t offset, const char *fmt, ...)
 {
 size_t old_offset = offset;
@@ -743,12 +709,6 @@ static size_t pdu_unmarshal(V9fsPDU *pdu, size_t offset, 
const char *fmt, ...)
 *valp = le64_to_cpu(val);
 break;
 }
-case 'v': {
-struct iovec *iov = va_arg(ap, struct iovec *);
-int *iovcnt = va_arg(ap, int *);
-*iovcnt = pdu_copy_sg(pdu, offset, 0, iov);
-break;
-}
 case 's': {
 V9fsString *str = va_arg(ap, V9fsString *);
 offset += pdu_unmarshal(pdu, offset, "w", &str->size);
@@ -827,12 +787,6 @@ static size_t pdu_marshal(V9fsPDU *pdu, size_t offset, 
const char *fmt, ...)
 offset += pdu_pack(pdu, offset, &val, sizeof(val));
 break;
 }
-case 'v': {
-struct iovec *iov = va_arg(ap, struct iovec *);
-int *iovcnt = va_arg(ap, int *);
-*iovcnt = pdu_copy_sg(pdu, offset, 1, iov);
-break;
-}
 case 's': {
 V9fsString *str = va_arg(ap, V9fsString *);
 offset += pdu_marshal(pdu, offset, "w", str->size);
@@ -1143,42 +1097,6 @@ static void stat_to_v9stat_dotl(V9fsState *s, const 
struct stat *stbuf,
 stat_to_qid(stbuf, &v9lstat->qid);
 }
 
-static struct iovec *adjust_sg(struct iovec *sg, int len, int *iovcnt)
-{
-while (len && *iovcnt) {
-if (len < sg->iov_len) {
-sg->iov_len -= len;
-sg->iov_base += len;
-len = 0;
-} else {
-len -= sg->iov_len;
-sg++;
-*iovcnt -= 1;
-}
-}
-
-return sg;
-}
-
-static struct iovec *cap_sg(struct iovec *sg, int cap, int *cnt)
-{
-int i;
-int total = 0;
-
-for (i = 0; i < *cnt; i++) {
-if ((total + sg[i].iov_len) > cap) {
-sg[i].iov_len -= ((total + sg[i].iov_len) - cap);
-i++;
-break;
-}
-total += sg[i].iov_len;
-}
-
-*cnt = i;
-
-return sg;
-}
-
 static void print_sg(struct iovec *sg, int cnt)
 {
 int i;
@@ -1861,6 +1779,38 @@ out:
 return count;
 }
 
+/*
+ * Create a QEMUIOVector for a sub-region of PDU iovecs
+ *
+ * @qiov:   uninitialized QEMUIOVector
+ * @skip:   number of bytes to skip from beginning of PDU
+ * @size:   number of bytes to include
+ * @is_write:   true - write, false - read
+ *
+ * The resulting QEMUIOVector has heap-allocated iovecs and must be cleaned up
+ * with qemu_iovec_destroy().
+ */
+static void v9fs_init_qiov_from_pdu(QEMUIOVector *qiov, V9fsPDU *pdu,
+uint64_t skip, size_t size,
+bool is_write)
+{
+QEMUI

Re: [Qemu-devel] [PATCH [0/9] various ARM fixes

2011-12-21 Thread Markus Armbruster
Please thread together your patches so that the parts appear as replies
to the cover letter.  git-send-email should do that by default.

Why?  Disconnected parts can easily get separated in the list.  When
that happens, your reviewers need to hunt for parts.  They may choose to
review something else instead.  Can't blame them; reviewing is more
productive than hunting for parts.



[Qemu-devel] the principle of the vm snapshot based on qemu-dm

2011-12-21 Thread ¤終於aware
Hi everyone,
  
  
 I'm not clear about the principle of the vm snapshot based on qemu-dm, so 
the following below maybe a joke. Sorry for that...
 register_savevm() included in ioemu-qemu-xen/hw/*.c covers various kinds 
of hardware in order to save the state of current running vm. In this way,  the 
file saved by the cmd of 'xm save domain snapshot.save' is of large size and 
the operation costs much time. Is there anyelse way to qucikly create a 
snapshot, such as only saving the memory state(similary with Hibernation) or 
something else?!
 If someone who understands the principle or have the relevant statistics, 
thanks for sharing.
  
  
 cheers
 bruce

Re: [Qemu-devel] [Bug 907063] [NEW] Error reading VMDK4 with footer instead of header

2011-12-21 Thread Stefan Hajnoczi
On Tue, Dec 20, 2011 at 8:53 PM, bbgordonn <907...@bugs.launchpad.net> wrote:
> Public bug reported:
>
> VMDK4 files can have a footer in the last block, which is the same 
> datastructure as the header but must be used instead if present. In this 
> case, the gd_offset in the usual header at the beginning of the file is the 
> special flag -1 (VMDK 1.1 spec, page 17, "GD_AT_END
> "). qemu-img doesn't know about this flag so it goes on to try to read 
> extents with a bogus l1_table from the wrong location in the file.
>
> I have regression-tested this with various OVAs exported from
> VSphere/ESXi 3 and 4. Current master and all previous QEMU versions were
> unable to import any compressed VMDKs with a footer. It now works on all
> the ones I have.
>
> bb45ded93115ad4303471c9a492579dc36716547 changed the order of gd_offset
> and rgd_offset in the VMDK4Header struct. Page 8 of the VMDK 1.1 spec
> from VMWare shows the structure as rgd_ then gd_, while QEMU now has gd_
> *before* rgd_offset. I was only able to get VMDK conversion to work by
> switching the order back to that specified by VMWare and previously used
> by QEMU. I don't know what VMDK this commit is referring to, so I can't
> test to see if I've broken it. :(
>
> I will submit this patch to the mailing list if I get a chance, but I'm
> also uploading it here so I don't lose it.
>
> ** Affects: qemu
>     Importance: Undecided
>         Status: New
>
> --
> You received this bug notification because you are a member of qemu-
> devel-ml, which is subscribed to QEMU.
> https://bugs.launchpad.net/bugs/907063

Thanks for reporting this.  I have CCed Fam who worked on VMDK this
summer.

Please submit patches to the mailing list according to the guidelines
here:

http://wiki.qemu.org/Contribute/SubmitAPatch

Stefan

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/907063

Title:
  Error reading VMDK4 with footer instead of header

Status in QEMU:
  New

Bug description:
  VMDK4 files can have a footer in the last block, which is the same 
datastructure as the header but must be used instead if present. In this case, 
the gd_offset in the usual header at the beginning of the file is the special 
flag -1 (VMDK 1.1 spec, page 17, "GD_AT_END
  "). qemu-img doesn't know about this flag so it goes on to try to read 
extents with a bogus l1_table from the wrong location in the file.

  I have regression-tested this with various OVAs exported from
  VSphere/ESXi 3 and 4. Current master and all previous QEMU versions
  were unable to import any compressed VMDKs with a footer. It now works
  on all the ones I have.

  bb45ded93115ad4303471c9a492579dc36716547 changed the order of
  gd_offset and rgd_offset in the VMDK4Header struct. Page 8 of the VMDK
  1.1 spec from VMWare shows the structure as rgd_ then gd_, while QEMU
  now has gd_ *before* rgd_offset. I was only able to get VMDK
  conversion to work by switching the order back to that specified by
  VMWare and previously used by QEMU. I don't know what VMDK this commit
  is referring to, so I can't test to see if I've broken it. :(

  I will submit this patch to the mailing list if I get a chance, but
  I'm also uploading it here so I don't lose it.

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/907063/+subscriptions



Re: [Qemu-devel] [RFC] Migration convergence - a suggestion

2011-12-21 Thread Ronen Hod

On 12/20/2011 03:39 PM, Anthony Liguori wrote:

On 12/20/2011 01:06 AM, Ronen Hod wrote:
Well the issue is not new, anyhow, following a conversation with Orit 
...


Since we want the migration to finish, I believe that the "migration 
speed"

parameter alone cannot do the job.
I suggest using two distinct parameters:
1. Migration speed - will be used to limit the network resources 
utilization
2. aggressionLevel - A number between 0.0 and 1.0, where low values 
imply

minimal interruption to the guest, and 1.0 mean that the guest will be
completely stalled.

In any case the migration will have to do its work and finish given 
any actual
migration-speed, so even low aggressionLevel values will sometimes 
imply that

the guest will be throttled substantially.

The algorithm:
The aggressionLevel should determine the targetGuest%CPU (how much 
CPU time we

want to allocate to the guest)


QEMU has no way to limit the guest CPU time.


Wouldn't any "yield" (sleep / whatever) limit the guest's CPU time, be 
it in qemu or in KVM.
My intention is to suggest an algorithm that is based on guest 
throttling. Looking at the relevant BZs, I do not see how we can avoid 
it. I certainly have no claims regarding the architecture.
Avi and mst, believe that it is better to continuously control the 
guest's CPU from the outside (libvirt) using cgroups. Although less 
responsive to changes, it should still work.
In the meantime, I also discovered that everybody has a different point 
of view regarding the requirements. Regardless, I believe that the same 
basic mechanics (once decided), can do the work

Some relevant configuration "requirements" are:
1. Max bandwidth
2. Min CPU per guest
3. Max guest stall time
4. Max migration time
These requirements will often conflict, and may imply changes in 
behavior over time.


I would also suggest that the management GUI will let the user select 
the aggression-level (or whatever), and display the implication on all 
the other parameters (total-time, %CPU) based on the current behavior of 
the guest and network.


Regards, Ronen



Regards,

Anthony Liguori





[Qemu-devel] [PATCH] scsi virtio-blk usb-msd: Clean up device init error messages

2011-12-21 Thread Markus Armbruster
Replace

error_report("DEVICE-NAME: MESSAGE");

by just

error_report("MESSAGE");

in block device init functions.

DEVICE-NAME is bogus in some cases: it's "scsi-disk" for device
scsi-hd and scsi-cd, "virtio-blk-pci" for virtio-blk-s390, and
"usb-msd" for usb-storage.

There is no real need to put a device name in the message, because
error_report() points to the offending command line option already:

$ qemu-system-x86_64 --nodefaults --enable-kvm -vnc :0 -S -monitor stdio -usb 
-device virtio-blk-pci
upstream-qemu: -device virtio-blk-pci: virtio-blk-pci: drive property not set
upstream-qemu: -device virtio-blk-pci: Device 'virtio-blk-pci' could not be 
initialized

And for a monitor command, it's obvious anyway:

$ qemu-system-x86_64 --nodefaults --enable-kvm -vnc :0 -S -monitor stdio -usb
(qemu) device_add virtio-blk-pci
virtio-blk-pci: drive property not set
Device 'virtio-blk-pci' could not be initialized

Reported-by: Amit Shah 
Signed-off-by: Markus Armbruster 
---
 hw/scsi-disk.c|4 ++--
 hw/scsi-generic.c |8 
 hw/usb-msd.c  |2 +-
 hw/virtio-blk.c   |2 +-
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c
index 505accd..5d8bf53 100644
--- a/hw/scsi-disk.c
+++ b/hw/scsi-disk.c
@@ -1515,7 +1515,7 @@ static int scsi_initfn(SCSIDevice *dev)
 DriveInfo *dinfo;
 
 if (!s->qdev.conf.bs) {
-error_report("scsi-disk: drive property not set");
+error_report("drive property not set");
 return -1;
 }
 
@@ -1537,7 +1537,7 @@ static int scsi_initfn(SCSIDevice *dev)
 }
 
 if (bdrv_is_sg(s->qdev.conf.bs)) {
-error_report("scsi-disk: unwanted /dev/sg*");
+error_report("unwanted /dev/sg*");
 return -1;
 }
 
diff --git a/hw/scsi-generic.c b/hw/scsi-generic.c
index 6f7d3db..0aebcdd 100644
--- a/hw/scsi-generic.c
+++ b/hw/scsi-generic.c
@@ -374,13 +374,13 @@ static int scsi_generic_initfn(SCSIDevice *s)
 struct sg_scsi_id scsiid;
 
 if (!s->conf.bs) {
-error_report("scsi-generic: drive property not set");
+error_report("drive property not set");
 return -1;
 }
 
 /* check we are really using a /dev/sg* file */
 if (!bdrv_is_sg(s->conf.bs)) {
-error_report("scsi-generic: not /dev/sg*");
+error_report("not /dev/sg*");
 return -1;
 }
 
@@ -396,13 +396,13 @@ static int scsi_generic_initfn(SCSIDevice *s)
 /* check we are using a driver managing SG_IO (version 3 and after */
 if (bdrv_ioctl(s->conf.bs, SG_GET_VERSION_NUM, &sg_version) < 0 ||
 sg_version < 3) {
-error_report("scsi-generic: scsi generic interface too old");
+error_report("scsi generic interface too old");
 return -1;
 }
 
 /* get LUN of the /dev/sg? */
 if (bdrv_ioctl(s->conf.bs, SG_GET_SCSI_ID, &scsiid)) {
-error_report("scsi-generic: SG_GET_SCSI_ID ioctl failed");
+error_report("SG_GET_SCSI_ID ioctl failed");
 return -1;
 }
 
diff --git a/hw/usb-msd.c b/hw/usb-msd.c
index 4c06950..b87b929 100644
--- a/hw/usb-msd.c
+++ b/hw/usb-msd.c
@@ -517,7 +517,7 @@ static int usb_msd_initfn(USBDevice *dev)
 DriveInfo *dinfo;
 
 if (!bs) {
-error_report("usb-msd: drive property not set");
+error_report("drive property not set");
 return -1;
 }
 
diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c
index ef27421..5e81f53 100644
--- a/hw/virtio-blk.c
+++ b/hw/virtio-blk.c
@@ -569,7 +569,7 @@ VirtIODevice *virtio_blk_init(DeviceState *dev, BlockConf 
*conf,
 DriveInfo *dinfo;
 
 if (!conf->bs) {
-error_report("virtio-blk-pci: drive property not set");
+error_report("drive property not set");
 return NULL;
 }
 if (!bdrv_is_inserted(conf->bs)) {
-- 
1.7.6.4




Re: [Qemu-devel] [PATCH] qemu-options.hx: fix tls-channel help text

2011-12-21 Thread Stefan Hajnoczi
On Tue, Dec 20, 2011 at 01:05:18PM +0200, Alon Levy wrote:
> Remove the default compiled out tunnel channel, add the always available
> cursor channel. Optimally the man page would depend on compiled in
> options, but that's harder to do.
> 
> RHBZ: 688586
> 
> Signed-off-by: Alon Levy 
> ---
>  qemu-options.hx |4 ++--
>  1 files changed, 2 insertions(+), 2 deletions(-)

Looks fine but giving Gerd a chance to review and respond.

Stefan



Re: [Qemu-devel] [PATCH] qemu-options.hx: fix tls-channel help text

2011-12-21 Thread Stefan Hajnoczi
On Tue, Dec 20, 2011 at 01:05:18PM +0200, Alon Levy wrote:
> Remove the default compiled out tunnel channel, add the always available
> cursor channel. Optimally the man page would depend on compiled in
> options, but that's harder to do.
> 
> RHBZ: 688586
> 
> Signed-off-by: Alon Levy 
> ---
>  qemu-options.hx |4 ++--
>  1 files changed, 2 insertions(+), 2 deletions(-)

Thanks, applied to the trivial patches tree:
https://github.com/stefanha/qemu/commits/trivial-patches

Stefan



[Qemu-devel] Regression: cold plug with device_add fails assertion

2011-12-21 Thread Markus Armbruster
Test case:

$ echo -e 'device_add usb-mouse\nq' | qemu-system-x86_64 -vnc :0 -monitor stdio 
-usb

Works in v1.0, doesn't work in master's commit ab0115e1:

qemu-system-x86_64: /work/armbru/qemu/hw/qdev.c:97: qdev_create_from_info: 
Assertion `bus->allow_hotplug' failed.

Same for virtio-blk-pci, so it's not specific to USB.

git-bisect:

8eb02831af6c5534d8712cb8db1104852cac26e9 is the first bad commit
commit 8eb02831af6c5534d8712cb8db1104852cac26e9
Author: Anthony Liguori 
Date:   Mon Dec 12 14:29:37 2011 -0600

dev: add an anonymous peripheral container

Signed-off-by: Anthony Liguori 



Re: [Qemu-devel] [Qemu-trivial] [PATCH] Strip trailing '\n' from error_report()'s first argument (again)

2011-12-21 Thread Stefan Hajnoczi
On Tue, Dec 20, 2011 at 06:13:08PM +0100, Markus Armbruster wrote:
> Commit 6daf194d got rid of them, but Hans and Gerd added some more
> lately.  Tracked down with this Coccinelle semantic patch:
> 
> @r@
> expression fmt;
> position p;
> @@
> error_report(fmt, ...)@p
> @script:python@
> fmt << r.fmt;
> p << r.p;
> @@
> if "\\n" in str(fmt):
> print "%s:%s:%s:%s" % (p[0].file, p[0].line, p[0].column, fmt)
> 
> Signed-off-by: Markus Armbruster 
> ---
>  hw/usb-bus.c |   12 ++--
>  usb-redir.c  |4 ++--
>  2 files changed, 8 insertions(+), 8 deletions(-)

Thanks, applied to the trivial patches tree:
https://github.com/stefanha/qemu/commits/trivial-patches

Stefan



Re: [Qemu-devel] [PATCH] scsi virtio-blk usb-msd: Clean up device init error messages

2011-12-21 Thread Stefan Hajnoczi
On Wed, Dec 21, 2011 at 11:37:57AM +0100, Markus Armbruster wrote:
> Replace
> 
> error_report("DEVICE-NAME: MESSAGE");
> 
> by just
> 
> error_report("MESSAGE");
> 
> in block device init functions.
> 
> DEVICE-NAME is bogus in some cases: it's "scsi-disk" for device
> scsi-hd and scsi-cd, "virtio-blk-pci" for virtio-blk-s390, and
> "usb-msd" for usb-storage.
> 
> There is no real need to put a device name in the message, because
> error_report() points to the offending command line option already:
> 
> $ qemu-system-x86_64 --nodefaults --enable-kvm -vnc :0 -S -monitor stdio -usb 
> -device virtio-blk-pci
> upstream-qemu: -device virtio-blk-pci: virtio-blk-pci: drive property not set
> upstream-qemu: -device virtio-blk-pci: Device 'virtio-blk-pci' could not be 
> initialized
> 
> And for a monitor command, it's obvious anyway:
> 
> $ qemu-system-x86_64 --nodefaults --enable-kvm -vnc :0 -S -monitor stdio -usb
> (qemu) device_add virtio-blk-pci
> virtio-blk-pci: drive property not set
> Device 'virtio-blk-pci' could not be initialized
> 
> Reported-by: Amit Shah 
> Signed-off-by: Markus Armbruster 
> ---
>  hw/scsi-disk.c|4 ++--
>  hw/scsi-generic.c |8 
>  hw/usb-msd.c  |2 +-
>  hw/virtio-blk.c   |2 +-
>  4 files changed, 8 insertions(+), 8 deletions(-)

Thanks, applied to the trivial patches tree:
https://github.com/stefanha/qemu/commits/trivial-patches

Stefan



Re: [Qemu-devel] [PATCH 5/6] hw/omap1.c: Separate clkm from omap_mpu_state

2011-12-21 Thread Peter Maydell
On 21 December 2011 01:38, andrzej zaborowski  wrote:
> Since this sub device uses parts of the rest of mpu state, it's
> (apparently) not a separate device, can we thus skip this change?  I
> don't see much value in it and it doesn't simplify code.

If you like; I don't have a very strong feeling about it, I'm mostly
just trying to get patches out of my tree, so "drop patch" is as
good as "push to master" in that sense :-)

I do suspect that nested anonymous structs are going to be a pain
for VMState if we ever get to adding save/load support to omap3.

> (note the parens in *(s->clkm) are redundant.  Also would be great if
> your patches could maintain the indentation of the rest of the file
> where it's not specified by the new coding style, this would reduce
> inconsistency).

I mostly try not to make spurious indentation changes, but there
are a lot in the patches I inherited from the qemu-meego tree so
sometimes things slip through.

-- PMM



Re: [Qemu-devel] [PATCH 00/10] hw/sd.c: Fix various status related bugs

2011-12-21 Thread Peter Maydell
On 21 December 2011 03:54, andrzej zaborowski  wrote:
> On 18 December 2011 21:37, Peter Maydell  wrote:
>> This patchset fixes a number of bugs in our SD card emulation

> Thanks, I pushed the series.  Some good catches here.  Also thanks to
> bug reporter.
>
> I replaced "card" with "command" in the commit message.
> I added resetting of .expecting_acmd in a separate patch.

Thanks for catching these mistakes.

-- PMM



[Qemu-devel] [Bug 597641] Re: SD card state change from ident to stby

2011-12-21 Thread Peter Maydell
These two issues should now be fixed in qemu git master by commits
25881d3..3799ce4.


** Changed in: qemu
   Status: New => Fix Committed

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/597641

Title:
  SD card state change from ident to stby

Status in QEMU:
  Fix Committed

Bug description:
  My environment
  host : Linux PC(2.6.29.4)
  Target: ARM-CortexA8
  I am not running any OS on the target.

  Below is my command line:
  ../src/arm-softmmu/qemu-system-arm  -M FPGA_NOOS -kernel elfImage -nographic 
-sd sd.img

  Problem scenario
  1. CMD13 issued immediately after identication phase reports SD card state as 
"ident" instead of "stby". 
  2. class 0(basic) commands are reported as "ILLEGAL_COMMAND" after the card 
is locked.

  Observation
  1. In function sd_do_command(), the status of *previous* command is reflected 
only after the call to sd_set_status(sd). The variable "last_status" should 
contain the updated value of sd->card_status in order to correctly indicate 
*last command's* card status. 
  2. A "not" condition is missing in the if statement in the function 
sd_do_command(), thereby falsely exiting the function for class0 commands 
issued after card is locked.

  Below is the patch file that seems to be taking care of the above two 
problems.
  *** sd-0.12.4.c 2010-06-23 13:56:05.0 +0530
  --- sd-0.12.4.rudra.c   2010-06-23 14:08:10.0 +0530
  ***
  *** 1265,1278 

sd->card_status &= ~CARD_STATUS_B;
sd_set_status(sd);

if (last_status & CARD_IS_LOCKED)
  ! if (((last_status & APP_CMD) &&
 req->cmd == 41) ||
(!(last_status & APP_CMD) &&
 (sd_cmd_class[req->cmd] == 0 ||
  sd_cmd_class[req->cmd] == 7 ||
  !   req->cmd == 16 || req->cmd == 55))) {
sd->card_status |= ILLEGAL_COMMAND;
fprintf(stderr, "SD: Card is locked\n");
return 0;
  --- 1265,1279 

sd->card_status &= ~CARD_STATUS_B;
sd_set_status(sd);
  + last_status = sd->card_status;

if (last_status & CARD_IS_LOCKED)
  ! if (!(((last_status & APP_CMD) &&
 req->cmd == 41) ||
(!(last_status & APP_CMD) &&
 (sd_cmd_class[req->cmd] == 0 ||
  sd_cmd_class[req->cmd] == 7 ||
  !   req->cmd == 16 || req->cmd == 55 {
sd->card_status |= ILLEGAL_COMMAND;
fprintf(stderr, "SD: Card is locked\n");
return 0;

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/597641/+subscriptions



Re: [Qemu-devel] [PATCH] scsi virtio-blk usb-msd: Clean up device init error messages

2011-12-21 Thread Amit Shah
On (Wed) 21 Dec 2011 [11:37:57], Markus Armbruster wrote:
> Replace
> 
> error_report("DEVICE-NAME: MESSAGE");
> 
> by just
> 
> error_report("MESSAGE");
> 
> in block device init functions.
> 
> DEVICE-NAME is bogus in some cases: it's "scsi-disk" for device
> scsi-hd and scsi-cd, "virtio-blk-pci" for virtio-blk-s390, and
> "usb-msd" for usb-storage.
> 
> There is no real need to put a device name in the message, because
> error_report() points to the offending command line option already:
> 
> $ qemu-system-x86_64 --nodefaults --enable-kvm -vnc :0 -S -monitor stdio -usb 
> -device virtio-blk-pci
> upstream-qemu: -device virtio-blk-pci: virtio-blk-pci: drive property not set
> upstream-qemu: -device virtio-blk-pci: Device 'virtio-blk-pci' could not be 
> initialized
> 
> And for a monitor command, it's obvious anyway:
> 
> $ qemu-system-x86_64 --nodefaults --enable-kvm -vnc :0 -S -monitor stdio -usb
> (qemu) device_add virtio-blk-pci
> virtio-blk-pci: drive property not set
> Device 'virtio-blk-pci' could not be initialized
> 
> Reported-by: Amit Shah 
> Signed-off-by: Markus Armbruster 

Acked-by: Amit Shah 

Amit



[Qemu-devel] memory: Why subpage is introduced?

2011-12-21 Thread Zhi Yong Wu
HI,

For memory management, i have several questions as below:

1.) Why is subpage introduced? what is its goal?

2.) How to render MemoryRegion into one disjoint flatrange list? That
rendering function is a bit difficult to understand. Can anyone simply
explain it?

3.) What are separately the meanings of these flags? such as
IO_MEM_RAM, IO_MEM_ROM, IO_MEM_UNASSIGNED, IO_MEM_*, IO_MEM_ROMD, and
IO_MEM_SUBPAGE.

Can anyone give some helpful comments?


-- 
Regards,

Zhi Yong Wu



Re: [Qemu-devel] [PATCH v2 01/10] qapi: add Visitor interfaces for uint*_t and int*_t

2011-12-21 Thread Paolo Bonzini

On 12/20/2011 09:22 PM, Michael Roth wrote:

The main goal is to abstract away data serialization schemes
(QObject->JSON, C->QEMUFile, etc).


Right.  And the simplest way to abstract the scheme is to provide a 
backend-independent representation of device state.


As a convention, I'll call:

- "device state" the fields in the struct

- "QEMUFile" the serialized output that VMState save produces

- "device state representation" a representation of device state that is 
independent of the actual serialization backend.


- "device configuration" is the parameters that are used to create the 
device (coming from compile-time constants or the command-line)



In the case of a JSON-based serialization, the visitor interface for
fixed-with types would end up serializing everything as
int64_t/double, but QEMUFile requires byte-length affinity to remain
backward-compatible, so that information must be passed on to the
Visitor interface when we call it.


When creating the device state representation from either device state 
or QEMUFile, you need the byte length to fetch fields correctly.


When recreating device state from its representation, or saving to 
QEMUFile state, you need the byte length to store fields correctly.


However, you do not need the byte length in the device state 
representation.  In all four cases (from/to device state, from/to 
QEMUFile) the byte length can be fetched from the VMState.



As always, you can implement that in many ways. However, I think the
point of using Visitors is not to remove QEMUFile. It is to provide a
backend-independent representation that backends can transform and that
secondarily can be exposed by QOM.


Agreed, it's just a matter of wanting to maintain that information from
start to finish.


I don't see the point of maintaining the type hints from start to 
finish, as long as you can reconstruct it any time you want it.



On top of this the representation he passes to visitors is somewhat
redundant. For example, VMState has "equal" fields; they are fields that
are serialized but are really fixed at compile- or realize-time. Such
fields should not be part of the backend-independent representation.
With Michael's approach they are, and that's quite deep in the
implementation.


You mean, for instance, put_int32()/get_int32_equal()? If so, I'm not
sure I follow. In that case we use a Visitor purely to
serialize/deserialize an int32, vmstate adds the *_equal() interface as
helper function on top of that, but it's not part of the Visitor
interfaces.


It should be only in QEMUFile, because it's one of its quirks.  It is a 
separate property that is fixed at compile-time or realize-time, so it's 
part of device configuration; it's not part of device state representation.



Yes, this is accurate, but I see the goals differently. We should:

(1) First and foremost, provide a backend-independent representation of
device state so that we can add other backends later.

(2) Serialize this with QEMUFile, both for backwards-compatibility and
to ensure that the whole thing works.

Whether you do (2) directly with QEMUFile or, like Michael does, with
QEMUFile*Visitors is secondary. I don't have big objections to either
approach. However, the series is missing (1).


I'll fix up the existing non-QEMUFile Visitor backends with base-class
implementations for all the new interfaces. Beyond that, is there
anything else missing to achieve 1)?


I think almost nothing (you need to integrate into the QOM properties, 
and to handle a few special cases such as VMSTATE_EQUAL and 
VMSTATE_UNUSED).  That's quite good.


However, once you do this, you will still be serializing QEMUFile 
directly from device state rather than from device state representation. 
 This is fine, but not what we should do when working on BER 
serialization or similar.


Paolo



Re: [Qemu-devel] [PATCH v2 01/10] qapi: add Visitor interfaces for uint*_t and int*_t

2011-12-21 Thread Paolo Bonzini

On 12/20/2011 09:56 PM, Anthony Liguori wrote:

As always, you can implement that in many ways. However, I think the
point of using Visitors is not to remove QEMUFile.


Yes, it is.


The point of using Visitors is to provide a standard representation of 
device state.  QEMUFile is but one consumer of that representation, 
along with any other migration filter.



We need something like a migration filter capability where we can
encapsulate this kind of logic such that we can ween VMState away
from these things (and ultimately switch to an IDL compiler).

We can't do a migration filter until we have something like Michael's
series.


Agreed.  I'm saying that the QEMUFile serialization should be the first 
example of a migration filter.  Mike's patch add a layer of indirection 
in the code, but not in the data.  We can't do a migration filter until 
we have device state represented as introspectable data.



Yes, this is accurate, but I see the goals differently. We should:

(1) First and foremost, provide a backend-independent representation
of device state so that we can add other backends later.


And Mike's series does this, no?


No.


(2) Serialize this with QEMUFile, both for backwards-compatibility and
to ensure that the whole thing works.


Mike's series also does this, no?


It serializes device state directly, without going through a 
backend-independent representation.  It doesn't provide a place in which 
you can plug a filter.


Paolo



Re: [Qemu-devel] memory: Why subpage is introduced?

2011-12-21 Thread Avi Kivity
On 12/21/2011 02:09 PM, Zhi Yong Wu wrote:
> HI,
>
> For memory management, i have several questions as below:
>
> 1.) Why is subpage introduced? what is its goal?

A TLB entry spans one page; a subpage is a way of dispatching accesses
through that tlb entry to various memory regions.

> 2.) How to render MemoryRegion into one disjoint flatrange list? That
> rendering function is a bit difficult to understand. Can anyone simply
> explain it?

What exactly don't you understand?

>
> 3.) What are separately the meanings of these flags? such as
> IO_MEM_RAM, IO_MEM_ROM, IO_MEM_UNASSIGNED, IO_MEM_*, IO_MEM_ROMD, and
> IO_MEM_SUBPAGE.

RAM = RAM
ROM = ROM
UNASSIGNED = nothing handles this range
ROMD = ROM when read, device (i.e. callbacks) when written
SUBPAGE = dispatch using the lower address bits to obtain final I/O handler.

-- 
error compiling committee.c: too many arguments to function




[Qemu-devel] [PATCH 4/8] Remove support for version 3 ram_load

2011-12-21 Thread Avi Kivity
Version 3 ram_load depends on ram_addrs, which are not stable.  Version 4
was introduced in 0.13 (and RHEL 6), so this means live migration from 0.12
and earlier to 1.1 or later will not work.

Signed-off-by: Avi Kivity 
---
 arch_init.c |   18 --
 1 files changed, 4 insertions(+), 14 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index 8a3f052..9b8a1f3 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -366,7 +366,7 @@ int ram_load(QEMUFile *f, void *opaque, int version_id)
 int flags;
 int error;
 
-if (version_id < 3 || version_id > 4) {
+if (version_id < 4 || version_id > 4) {
 return -EINVAL;
 }
 
@@ -377,11 +377,7 @@ int ram_load(QEMUFile *f, void *opaque, int version_id)
 addr &= TARGET_PAGE_MASK;
 
 if (flags & RAM_SAVE_FLAG_MEM_SIZE) {
-if (version_id == 3) {
-if (addr != ram_bytes_total()) {
-return -EINVAL;
-}
-} else {
+if (version_id == 4) {
 /* Synchronize RAM block list */
 char id[256];
 ram_addr_t length;
@@ -419,10 +415,7 @@ int ram_load(QEMUFile *f, void *opaque, int version_id)
 void *host;
 uint8_t ch;
 
-if (version_id == 3)
-host = qemu_get_ram_ptr(addr);
-else
-host = host_from_stream_offset(f, addr, flags);
+host = host_from_stream_offset(f, addr, flags);
 if (!host) {
 return -EINVAL;
 }
@@ -438,10 +431,7 @@ int ram_load(QEMUFile *f, void *opaque, int version_id)
 } else if (flags & RAM_SAVE_FLAG_PAGE) {
 void *host;
 
-if (version_id == 3)
-host = qemu_get_ram_ptr(addr);
-else
-host = host_from_stream_offset(f, addr, flags);
+host = host_from_stream_offset(f, addr, flags);
 
 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
 }
-- 
1.7.7.1




[Qemu-devel] [PATCH 5/8] Convert ram_load() to the memory API

2011-12-21 Thread Avi Kivity
Signed-off-by: Avi Kivity 
---
 arch_init.c |4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index 9b8a1f3..381c055 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -344,7 +344,7 @@ int ram_save_live(Monitor *mon, QEMUFile *f, int stage, 
void *opaque)
 return NULL;
 }
 
-return block->host + offset;
+return memory_region_get_ram_ptr(block->mr) + offset;
 }
 
 len = qemu_get_byte(f);
@@ -353,7 +353,7 @@ int ram_save_live(Monitor *mon, QEMUFile *f, int stage, 
void *opaque)
 
 QLIST_FOREACH(block, &ram_list.blocks, next) {
 if (!strncmp(id, block->idstr, sizeof(id)))
-return block->host + offset;
+return memory_region_get_ram_ptr(block->mr) + offset;
 }
 
 fprintf(stderr, "Can't find block %s!\n", id);
-- 
1.7.7.1




Re: [Qemu-devel] [PATCH v4 04/11] ARM: exynos4210: IRQ subsystem support.

2011-12-21 Thread Peter Maydell
On 19 December 2011 11:53, Evgeny Voevodin  wrote:

> +static uint64_t exynos4210_gic_cpu_read(void *opaque, target_phys_addr_t 
> offset,
> +        unsigned size)
> +{
> +    Exynos4210GicState *s = (Exynos4210GicState *) opaque;
> +    DPRINTF_EXYNOS4210_GIC("CPU%d: read offset 0x%x\n",
> +            gic_get_current_cpu(), offset);
> +    return gic_cpu_read(&s->gic, gic_get_current_cpu(), offset & ~0x8000);
> +}

arm_gic.c exposes the CPU and distributor interfaces as their own
memory regions now -- you shouldn't need any of this intermediate
layer of functions.

(Reviewing the rest of this series is on my todo list but I can't
guarantee I'll get to it until after Christmas now.)

-- PMM



[Qemu-devel] [PATCH 0/8] Convert live migration to memory API

2011-12-21 Thread Avi Kivity
These patches, on top of "vmstate, memory: decouple vmstate from memory API",
convert live migration to use the memory API.

Patch 4 is an ABI change, please review carefully.

Avi Kivity (8):
  Store MemoryRegion in RAMBlock
  Switch ram_save to the memory API
  Sort RAMBlocks by ID for migration, not by ram_addr
  Remove support for version 3 ram_load
  Convert ram_load() to the memory API
  memory: obsolete cpu_physical_memory_[gs]et_dirty_tracking()
  xen: convert framebuffer dirty tracking to memory API
  memory: obsolete more dirty memory related functions

 arch_init.c |   71 +++---
 cpu-all.h   |   54 +-
 exec-obsolete.h |   51 +++
 exec.c  |   11 +---
 memory.c|2 +
 xen-all.c   |5 +--
 6 files changed, 82 insertions(+), 112 deletions(-)

-- 
1.7.7.1




Re: [Qemu-devel] [PATCH 3/8] Sort RAMBlocks by ID for migration, not by ram_addr

2011-12-21 Thread Anthony Liguori

On 12/21/2011 07:34 AM, Avi Kivity wrote:

ram_addr is (a) unstable (b) going away.  Sort by idstr instead.

Signed-off-by: Avi Kivity


I don't see this as a problem, per say, but this is a significant behavioral 
change.  ram_addr does correspond roughly to the location in memory and 
historically we would send memory starting from 0 upward whereas now, the order 
that we send RAMBlocks will be random for all intents and purposes.


Again, I don't think it's a problem, but we should note this in the commit 
message in case it creates a problem down the road.


Regards,

Anthony Liguori


---
  arch_init.c |8 ++--
  1 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index 2743bfd..8a3f052 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -217,12 +217,8 @@ static int block_compar(const void *a, const void *b)
  {
  RAMBlock * const *ablock = a;
  RAMBlock * const *bblock = b;
-if ((*ablock)->offset<  (*bblock)->offset) {
-return -1;
-} else if ((*ablock)->offset>  (*bblock)->offset) {
-return 1;
-}
-return 0;
+
+return strcmp((*ablock)->idstr, (*bblock)->idstr);
  }

  static void sort_ram_list(void)





Re: [Qemu-devel] [PATCH 4/8] Remove support for version 3 ram_load

2011-12-21 Thread Anthony Liguori

On 12/21/2011 07:34 AM, Avi Kivity wrote:

Version 3 ram_load depends on ram_addrs, which are not stable.  Version 4
was introduced in 0.13 (and RHEL 6), so this means live migration from 0.12
and earlier to 1.1 or later will not work.


Can you please make a note on http://wiki.qemu.org/ChangeLog/Next

Reviewed-by: Anthony Liguori 

Regards,

Anthony Liguori



Signed-off-by: Avi Kivity
---
  arch_init.c |   18 --
  1 files changed, 4 insertions(+), 14 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index 8a3f052..9b8a1f3 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -366,7 +366,7 @@ int ram_load(QEMUFile *f, void *opaque, int version_id)
  int flags;
  int error;

-if (version_id<  3 || version_id>  4) {
+if (version_id<  4 || version_id>  4) {
  return -EINVAL;
  }

@@ -377,11 +377,7 @@ int ram_load(QEMUFile *f, void *opaque, int version_id)
  addr&= TARGET_PAGE_MASK;

  if (flags&  RAM_SAVE_FLAG_MEM_SIZE) {
-if (version_id == 3) {
-if (addr != ram_bytes_total()) {
-return -EINVAL;
-}
-} else {
+if (version_id == 4) {
  /* Synchronize RAM block list */
  char id[256];
  ram_addr_t length;
@@ -419,10 +415,7 @@ int ram_load(QEMUFile *f, void *opaque, int version_id)
  void *host;
  uint8_t ch;

-if (version_id == 3)
-host = qemu_get_ram_ptr(addr);
-else
-host = host_from_stream_offset(f, addr, flags);
+host = host_from_stream_offset(f, addr, flags);
  if (!host) {
  return -EINVAL;
  }
@@ -438,10 +431,7 @@ int ram_load(QEMUFile *f, void *opaque, int version_id)
  } else if (flags&  RAM_SAVE_FLAG_PAGE) {
  void *host;

-if (version_id == 3)
-host = qemu_get_ram_ptr(addr);
-else
-host = host_from_stream_offset(f, addr, flags);
+host = host_from_stream_offset(f, addr, flags);

  qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
  }





Re: [Qemu-devel] [PATCH 0/8] Convert live migration to memory API

2011-12-21 Thread Anthony Liguori

On 12/21/2011 07:34 AM, Avi Kivity wrote:

These patches, on top of "vmstate, memory: decouple vmstate from memory API",
convert live migration to use the memory API.

Patch 4 is an ABI change, please review carefully.


Using ram_addr_t for migration was badly, badly broken.  I think it's been long 
enough that we can safely remove that logic.


Other than my request for a better commit message, this whole series:

Reviewed-by: Anthony Liguori 

Regards,

Anthony Liguori



Avi Kivity (8):
   Store MemoryRegion in RAMBlock
   Switch ram_save to the memory API
   Sort RAMBlocks by ID for migration, not by ram_addr
   Remove support for version 3 ram_load
   Convert ram_load() to the memory API
   memory: obsolete cpu_physical_memory_[gs]et_dirty_tracking()
   xen: convert framebuffer dirty tracking to memory API
   memory: obsolete more dirty memory related functions

  arch_init.c |   71 +++---
  cpu-all.h   |   54 +-
  exec-obsolete.h |   51 +++
  exec.c  |   11 +---
  memory.c|2 +
  xen-all.c   |5 +--
  6 files changed, 82 insertions(+), 112 deletions(-)






[Qemu-devel] [PATCH 6/8] memory: obsolete cpu_physical_memory_[gs]et_dirty_tracking()

2011-12-21 Thread Avi Kivity
The getter is no longer used, so it is completely removed.

Signed-off-by: Avi Kivity 
---
 arch_init.c |7 +++
 cpu-all.h   |4 
 exec-obsolete.h |2 ++
 exec.c  |   10 --
 memory.c|2 ++
 5 files changed, 7 insertions(+), 18 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index 381c055..d3adacf 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -251,7 +251,7 @@ int ram_save_live(Monitor *mon, QEMUFile *f, int stage, 
void *opaque)
 int ret;
 
 if (stage < 0) {
-cpu_physical_memory_set_dirty_tracking(0);
+memory_global_dirty_log_stop();
 return 0;
 }
 
@@ -274,8 +274,7 @@ int ram_save_live(Monitor *mon, QEMUFile *f, int stage, 
void *opaque)
 }
 }
 
-/* Enable dirty memory tracking */
-cpu_physical_memory_set_dirty_tracking(1);
+memory_global_dirty_log_start();
 
 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
 
@@ -320,7 +319,7 @@ int ram_save_live(Monitor *mon, QEMUFile *f, int stage, 
void *opaque)
 while ((bytes_sent = ram_save_block(f)) != 0) {
 bytes_transferred += bytes_sent;
 }
-cpu_physical_memory_set_dirty_tracking(0);
+memory_global_dirty_log_stop();
 }
 
 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
diff --git a/cpu-all.h b/cpu-all.h
index 4acaa8b..2fdb856 100644
--- a/cpu-all.h
+++ b/cpu-all.h
@@ -566,10 +566,6 @@ void cpu_physical_memory_reset_dirty(ram_addr_t start, 
ram_addr_t end,
  int dirty_flags);
 void cpu_tlb_update_dirty(CPUState *env);
 
-int cpu_physical_memory_set_dirty_tracking(int enable);
-
-int cpu_physical_memory_get_dirty_tracking(void);
-
 void dump_exec_info(FILE *f, fprintf_function cpu_fprintf);
 #endif /* !CONFIG_USER_ONLY */
 
diff --git a/exec-obsolete.h b/exec-obsolete.h
index 3a2faae..880105e 100644
--- a/exec-obsolete.h
+++ b/exec-obsolete.h
@@ -61,6 +61,8 @@ static inline void 
cpu_register_physical_memory(target_phys_addr_t start_addr,
 void qemu_register_coalesced_mmio(target_phys_addr_t addr, ram_addr_t size);
 void qemu_unregister_coalesced_mmio(target_phys_addr_t addr, ram_addr_t size);
 
+int cpu_physical_memory_set_dirty_tracking(int enable);
+
 #endif
 
 #endif
diff --git a/exec.c b/exec.c
index a4116d9..28c057c 100644
--- a/exec.c
+++ b/exec.c
@@ -2008,19 +2008,9 @@ int cpu_physical_memory_set_dirty_tracking(int enable)
 {
 int ret = 0;
 in_migration = enable;
-if (enable) {
-memory_global_dirty_log_start();
-} else {
-memory_global_dirty_log_stop();
-}
 return ret;
 }
 
-int cpu_physical_memory_get_dirty_tracking(void)
-{
-return in_migration;
-}
-
 static inline void tlb_update_dirty(CPUTLBEntry *tlb_entry)
 {
 ram_addr_t ram_addr;
diff --git a/memory.c b/memory.c
index f7b3d50..868ffd0 100644
--- a/memory.c
+++ b/memory.c
@@ -1493,6 +1493,7 @@ void memory_global_dirty_log_start(void)
 {
 MemoryListener *listener;
 
+cpu_physical_memory_set_dirty_tracking(1);
 global_dirty_log = true;
 QLIST_FOREACH(listener, &memory_listeners, link) {
 listener->log_global_start(listener);
@@ -1507,6 +1508,7 @@ void memory_global_dirty_log_stop(void)
 QLIST_FOREACH(listener, &memory_listeners, link) {
 listener->log_global_stop(listener);
 }
+cpu_physical_memory_set_dirty_tracking(0);
 }
 
 static void listener_add_address_space(MemoryListener *listener,
-- 
1.7.7.1




[Qemu-devel] [PATCH 3/8] Sort RAMBlocks by ID for migration, not by ram_addr

2011-12-21 Thread Avi Kivity
ram_addr is (a) unstable (b) going away.  Sort by idstr instead.

Signed-off-by: Avi Kivity 
---
 arch_init.c |8 ++--
 1 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index 2743bfd..8a3f052 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -217,12 +217,8 @@ static int block_compar(const void *a, const void *b)
 {
 RAMBlock * const *ablock = a;
 RAMBlock * const *bblock = b;
-if ((*ablock)->offset < (*bblock)->offset) {
-return -1;
-} else if ((*ablock)->offset > (*bblock)->offset) {
-return 1;
-}
-return 0;
+
+return strcmp((*ablock)->idstr, (*bblock)->idstr);
 }
 
 static void sort_ram_list(void)
-- 
1.7.7.1




Re: [Qemu-devel] Regression: cold plug with device_add fails assertion

2011-12-21 Thread Anthony Liguori

On 12/21/2011 05:01 AM, Markus Armbruster wrote:

Test case:

$ echo -e 'device_add usb-mouse\nq' | qemu-system-x86_64 -vnc :0 -monitor stdio 
-usb

Works in v1.0, doesn't work in master's commit ab0115e1:

qemu-system-x86_64: /work/armbru/qemu/hw/qdev.c:97: qdev_create_from_info: 
Assertion `bus->allow_hotplug' failed.

Same for virtio-blk-pci, so it's not specific to USB.


Yes, see:

http://mid.gmane.org/1324334380-25278-1-git-send-email-aligu...@us.ibm.com

Which is in the latest qemu.git.

I also added a test to qemu-test to catch this.

Regards,

Anthony Liguori



git-bisect:

8eb02831af6c5534d8712cb8db1104852cac26e9 is the first bad commit
commit 8eb02831af6c5534d8712cb8db1104852cac26e9
Author: Anthony Liguori
Date:   Mon Dec 12 14:29:37 2011 -0600

 dev: add an anonymous peripheral container

 Signed-off-by: Anthony Liguori






Re: [Qemu-devel] [RFC] Migration convergence - a suggestion

2011-12-21 Thread Anthony Liguori

On 12/21/2011 03:47 AM, Ronen Hod wrote:

On 12/20/2011 03:39 PM, Anthony Liguori wrote:

On 12/20/2011 01:06 AM, Ronen Hod wrote:

Well the issue is not new, anyhow, following a conversation with Orit ...

Since we want the migration to finish, I believe that the "migration speed"
parameter alone cannot do the job.
I suggest using two distinct parameters:
1. Migration speed - will be used to limit the network resources utilization
2. aggressionLevel - A number between 0.0 and 1.0, where low values imply
minimal interruption to the guest, and 1.0 mean that the guest will be
completely stalled.

In any case the migration will have to do its work and finish given any actual
migration-speed, so even low aggressionLevel values will sometimes imply that
the guest will be throttled substantially.

The algorithm:
The aggressionLevel should determine the targetGuest%CPU (how much CPU time we
want to allocate to the guest)


QEMU has no way to limit the guest CPU time.


Wouldn't any "yield" (sleep / whatever) limit the guest's CPU time, be it in
qemu or in KVM.


Practically speaking, not really.


My intention is to suggest an algorithm that is based on guest throttling.
Looking at the relevant BZs, I do not see how we can avoid it. I certainly have
no claims regarding the architecture.
Avi and mst, believe that it is better to continuously control the guest's CPU
from the outside (libvirt) using cgroups. Although less responsive to changes,
it should still work.


Yes, that's really the only way to do it I think.  You'll need to use the CFS 
bandwidth controller.



In the meantime, I also discovered that everybody has a different point of view
regarding the requirements. Regardless, I believe that the same basic mechanics
(once decided), can do the work
Some relevant configuration "requirements" are:
1. Max bandwidth
2. Min CPU per guest
3. Max guest stall time
4. Max migration time
These requirements will often conflict, and may imply changes in behavior over
time.

I would also suggest that the management GUI will let the user select the
aggression-level (or whatever), and display the implication on all the other
parameters (total-time, %CPU) based on the current behavior of the guest and
network.


It really depends on your management GUI.  If you have a task oriented GUI, you 
are unlikely to have a "live migration" feature.  Rather, you'll have a 
mechanism to enable load balancing (which may use live migration) or you'll have 
an "evacuate node" feature to allow system maintenance.


Your downtime policy really depends on the feature.  The best thing for QEMU to 
do is make sure we expose all of the necessary hooks to allow a wide variety of 
policies to be implemented.


Regards,

Anthony Liguori



Regards, Ronen



Regards,

Anthony Liguori








Re: [Qemu-devel] [PATCH 01/27] qom: add the base Object class

2011-12-21 Thread Paolo Bonzini

On 12/20/2011 05:51 PM, Anthony Liguori wrote:

This class provides the main building block for QEMU Object Model and is
extensively documented in the header file.  It is largely inspired by GObject.

Signed-off-by: Anthony Liguori
---
  Makefile.objs |2 +
  hw/object.c   |  469 +
  hw/object.h   |  427 +++
  3 files changed, 898 insertions(+), 0 deletions(-)
  create mode 100644 hw/object.c
  create mode 100644 hw/object.h

diff --git a/Makefile.objs b/Makefile.objs
index f753d83..b86e8a1 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -122,6 +122,8 @@ common-obj-$(CONFIG_WIN32) += version.o

  common-obj-$(CONFIG_SPICE) += ui/spice-core.o ui/spice-input.o 
ui/spice-display.o spice-qemu-char.o

+common-obj-y += object.o
+
  audio-obj-y = audio.o noaudio.o wavaudio.o mixeng.o
  audio-obj-$(CONFIG_SDL) += sdlaudio.o
  audio-obj-$(CONFIG_OSS) += ossaudio.o
diff --git a/hw/object.c b/hw/object.c
new file mode 100644
index 000..620e63f
--- /dev/null
+++ b/hw/object.c
@@ -0,0 +1,469 @@
+/*
+ * QEMU Object Model
+ *
+ * Copyright IBM, Corp. 2011
+ *
+ * Authors:
+ *  Anthony Liguori
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "object.h"
+
+#define MAX_INTERFACES 32
+
+typedef struct InterfaceImpl
+{
+const char *parent;
+void (*interface_initfn)(ObjectClass *class, void *data);
+Type type;
+} InterfaceImpl;
+
+typedef struct TypeImpl
+{
+const char *name;
+Type type;


What's the need for "Type"?  You can use simply the TypeImpl * and drop 
type_get_instance.  Outside object.h it can be an opaque pointer.



+
+size_t class_size;
+
+size_t instance_size;
+
+void (*base_init)(ObjectClass *klass);
+void (*base_finalize)(ObjectClass *klass);
+
+void (*class_init)(ObjectClass *klass, void *data);
+void (*class_finalize)(ObjectClass *klass, void *data);
+
+void *class_data;
+
+void (*instance_init)(Object *obj);
+void (*instance_finalize)(Object *obj);
+
+bool abstract;
+
+const char *parent;
+
+ObjectClass *class;
+
+int num_interfaces;
+InterfaceImpl interfaces[MAX_INTERFACES];


... this way you can also allocate dynamically and use a variable-sized 
array for interfaces.



+} TypeImpl;
+
+static int num_types = 1;
+static TypeImpl type_table[1024];


... and you can also drop this.


+Type type_register_static(const TypeInfo *info)
+{
+Type type = num_types++;
+TypeImpl *ti;
+
+ti =&type_table[type];
+
+assert(info->name != NULL);
+
+printf("Added type %s ->  %s\n", info->name, info->parent);
+
+ti->name = info->name;


Why no strdup here?


+ti->parent = info->parent;


Please store a Type or a pointer to TypeImpl.  Otherwise, I don't see 
the need to distinguish TypeInfo/InterfaceInfo and 
TypeImpl/InterfaceImpl at all.  The only difference is num_interfaces, and


for (i = 0; i < ti->num_interfaces; i++) {

can be replaced just as well with

for (i = 0; i < ti->interfaces[i].type; i++) {


+ti->type = type;
+
+ti->class_size = info->class_size;
+ti->instance_size = info->instance_size;
+
+ti->base_init = info->base_init;
+ti->base_finalize = info->base_finalize;
+
+ti->class_init = info->class_init;
+ti->class_finalize = info->class_finalize;
+ti->class_data = info->class_data;
+
+ti->instance_init = info->instance_init;
+ti->instance_finalize = info->instance_finalize;
+
+ti->abstract = info->abstract;
+
+if (info->interfaces) {
+int i;
+
+for (i = 0; info->interfaces[i].type; i++) {
+ti->interfaces[i].parent = info->interfaces[i].type;
+ti->interfaces[i].interface_initfn = 
info->interfaces[i].interface_initfn;
+ti->num_interfaces++;
+}
+}
+
+return type;


The return value is unused.  Looks like a premature optimization or a 
leftover from GObject.



+}
+
+static Type type_register_anonymous(const TypeInfo *info)
+{
+Type type = num_types++;
+TypeImpl *ti;
+char buffer[32];
+static int count;
+
+ti =&type_table[type];
+
+snprintf(buffer, sizeof(buffer), "", count++);
+ti->name = g_strdup(buffer);


g_strdup_printf, please.  However, this has exactly one use in 
type_class_interface_init.  Can you make the name something like 
, so that the meaning is more clear?



+ti->parent = g_strdup(info->parent);
+ti->type = type;
+
+ti->class_size = info->class_size;
+ti->instance_size = info->instance_size;
+
+ti->base_init = info->base_init;
+ti->base_finalize = info->base_finalize;
+
+ti->class_init = info->class_init;
+ti->class_finalize = info->class_finalize;
+ti->class_data = info->class_data;
+
+ti->instance_init = info->instance_init;
+ti->instance_finalize = info->instance_finalize;
+
+if

[Qemu-devel] [PATCH 2/8] Switch ram_save to the memory API

2011-12-21 Thread Avi Kivity
Avoid using ram_addr_t, instead use (MemoryRegion *, offset) pairs.

Signed-off-by: Avi Kivity 
---
 arch_init.c |   34 ++
 1 files changed, 14 insertions(+), 20 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index ceef26e..2743bfd 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -117,24 +117,22 @@ static int ram_save_block(QEMUFile *f)
 {
 RAMBlock *block = last_block;
 ram_addr_t offset = last_offset;
-ram_addr_t current_addr;
 int bytes_sent = 0;
+MemoryRegion *mr;
 
 if (!block)
 block = QLIST_FIRST(&ram_list.blocks);
 
-current_addr = block->offset + offset;
-
 do {
-if (cpu_physical_memory_get_dirty(current_addr, MIGRATION_DIRTY_FLAG)) 
{
+mr = block->mr;
+if (memory_region_get_dirty(mr, offset, DIRTY_MEMORY_MIGRATION)) {
 uint8_t *p;
 int cont = (block == last_block) ? RAM_SAVE_FLAG_CONTINUE : 0;
 
-cpu_physical_memory_reset_dirty(current_addr,
-current_addr + TARGET_PAGE_SIZE,
-MIGRATION_DIRTY_FLAG);
+memory_region_reset_dirty(mr, offset, TARGET_PAGE_SIZE,
+  DIRTY_MEMORY_MIGRATION);
 
-p = block->host + offset;
+p = memory_region_get_ram_ptr(mr) + offset;
 
 if (is_dup_page(p, *p)) {
 qemu_put_be64(f, offset | cont | RAM_SAVE_FLAG_COMPRESS);
@@ -166,10 +164,7 @@ static int ram_save_block(QEMUFile *f)
 if (!block)
 block = QLIST_FIRST(&ram_list.blocks);
 }
-
-current_addr = block->offset + offset;
-
-} while (current_addr != last_block->offset + last_offset);
+} while (block != last_block || offset != last_offset);
 
 last_block = block;
 last_offset = offset;
@@ -186,9 +181,9 @@ static ram_addr_t ram_save_remaining(void)
 
 QLIST_FOREACH(block, &ram_list.blocks, next) {
 ram_addr_t addr;
-for (addr = block->offset; addr < block->offset + block->length;
- addr += TARGET_PAGE_SIZE) {
-if (cpu_physical_memory_get_dirty(addr, MIGRATION_DIRTY_FLAG)) {
+for (addr = 0; addr < block->length; addr += TARGET_PAGE_SIZE) {
+if (memory_region_get_dirty(block->mr, addr,
+DIRTY_MEMORY_MIGRATION)) {
 count++;
 }
 }
@@ -275,11 +270,10 @@ int ram_save_live(Monitor *mon, QEMUFile *f, int stage, 
void *opaque)
 
 /* Make sure all dirty bits are set */
 QLIST_FOREACH(block, &ram_list.blocks, next) {
-for (addr = block->offset; addr < block->offset + block->length;
- addr += TARGET_PAGE_SIZE) {
-if (!cpu_physical_memory_get_dirty(addr,
-   MIGRATION_DIRTY_FLAG)) {
-cpu_physical_memory_set_dirty(addr);
+for (addr = 0; addr < block->length; addr += TARGET_PAGE_SIZE) {
+if (!memory_region_get_dirty(block->mr, addr,
+ DIRTY_MEMORY_MIGRATION)) {
+memory_region_set_dirty(block->mr, addr);
 }
 }
 }
-- 
1.7.7.1




[Qemu-devel] [PATCH 1/8] Store MemoryRegion in RAMBlock

2011-12-21 Thread Avi Kivity
As a step in moving live migration from RAMBlocks to MemoryRegions,
store the MemoryRegion in a RAMBlock.

Signed-off-by: Avi Kivity 
---
 cpu-all.h |1 +
 exec.c|1 +
 2 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/cpu-all.h b/cpu-all.h
index 734833a..4acaa8b 100644
--- a/cpu-all.h
+++ b/cpu-all.h
@@ -476,6 +476,7 @@ extern ram_addr_t ram_size;
 #define RAM_PREALLOC_MASK   (1 << 0)
 
 typedef struct RAMBlock {
+struct MemoryRegion *mr;
 uint8_t *host;
 ram_addr_t offset;
 ram_addr_t length;
diff --git a/exec.c b/exec.c
index dffceb9..a4116d9 100644
--- a/exec.c
+++ b/exec.c
@@ -2793,6 +2793,7 @@ ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void 
*host,
 size = TARGET_PAGE_ALIGN(size);
 new_block = g_malloc0(sizeof(*new_block));
 
+new_block->mr = mr;
 new_block->offset = find_ram_offset(size);
 if (host) {
 new_block->host = host;
-- 
1.7.7.1




[Qemu-devel] [PATCH 7/8] xen: convert framebuffer dirty tracking to memory API

2011-12-21 Thread Avi Kivity
Signed-off-by: Avi Kivity 
---
 xen-all.c |5 ++---
 1 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/xen-all.c b/xen-all.c
index df70592..8f9db95 100644
--- a/xen-all.c
+++ b/xen-all.c
@@ -408,7 +408,6 @@ static int xen_sync_dirty_bitmap(XenIOState *state,
  ram_addr_t size)
 {
 target_phys_addr_t npages = size >> TARGET_PAGE_BITS;
-target_phys_addr_t vram_offset = 0;
 const int width = sizeof(unsigned long) * 8;
 unsigned long bitmap[(npages + width - 1) / width];
 int rc, i, j;
@@ -425,7 +424,6 @@ static int xen_sync_dirty_bitmap(XenIOState *state,
 } else if (state->log_for_dirtybit != physmap) {
 return -1;
 }
-vram_offset = physmap->phys_offset;
 
 rc = xc_hvm_track_dirty_vram(xen_xc, xen_domid,
  start_addr >> TARGET_PAGE_BITS, npages,
@@ -439,7 +437,8 @@ static int xen_sync_dirty_bitmap(XenIOState *state,
 while (map != 0) {
 j = ffsl(map) - 1;
 map &= ~(1ul << j);
-cpu_physical_memory_set_dirty(vram_offset + (i * width + j) * 
TARGET_PAGE_SIZE);
+memory_region_set_dirty(framebuffer,
+(i * width + j) * TARGET_PAGE_SIZE);
 };
 }
 
-- 
1.7.7.1




[Qemu-devel] [PATCH 8/8] memory: obsolete more dirty memory related functions

2011-12-21 Thread Avi Kivity
No longer used outside memory.c and exec.c.

Signed-off-by: Avi Kivity 
---
 cpu-all.h   |   49 -
 exec-obsolete.h |   49 +
 2 files changed, 49 insertions(+), 49 deletions(-)

diff --git a/cpu-all.h b/cpu-all.h
index 2fdb856..31d5b27 100644
--- a/cpu-all.h
+++ b/cpu-all.h
@@ -515,55 +515,6 @@ extern int mem_prealloc;
 /* Set if TLB entry is an IO callback.  */
 #define TLB_MMIO(1 << 5)
 
-#define VGA_DIRTY_FLAG   0x01
-#define CODE_DIRTY_FLAG  0x02
-#define MIGRATION_DIRTY_FLAG 0x08
-
-/* read dirty bit (return 0 or 1) */
-static inline int cpu_physical_memory_is_dirty(ram_addr_t addr)
-{
-return ram_list.phys_dirty[addr >> TARGET_PAGE_BITS] == 0xff;
-}
-
-static inline int cpu_physical_memory_get_dirty_flags(ram_addr_t addr)
-{
-return ram_list.phys_dirty[addr >> TARGET_PAGE_BITS];
-}
-
-static inline int cpu_physical_memory_get_dirty(ram_addr_t addr,
-int dirty_flags)
-{
-return ram_list.phys_dirty[addr >> TARGET_PAGE_BITS] & dirty_flags;
-}
-
-static inline void cpu_physical_memory_set_dirty(ram_addr_t addr)
-{
-ram_list.phys_dirty[addr >> TARGET_PAGE_BITS] = 0xff;
-}
-
-static inline int cpu_physical_memory_set_dirty_flags(ram_addr_t addr,
-  int dirty_flags)
-{
-return ram_list.phys_dirty[addr >> TARGET_PAGE_BITS] |= dirty_flags;
-}
-
-static inline void cpu_physical_memory_mask_dirty_range(ram_addr_t start,
-int length,
-int dirty_flags)
-{
-int i, mask, len;
-uint8_t *p;
-
-len = length >> TARGET_PAGE_BITS;
-mask = ~dirty_flags;
-p = ram_list.phys_dirty + (start >> TARGET_PAGE_BITS);
-for (i = 0; i < len; i++) {
-p[i] &= mask;
-}
-}
-
-void cpu_physical_memory_reset_dirty(ram_addr_t start, ram_addr_t end,
- int dirty_flags);
 void cpu_tlb_update_dirty(CPUState *env);
 
 void dump_exec_info(FILE *f, fprintf_function cpu_fprintf);
diff --git a/exec-obsolete.h b/exec-obsolete.h
index 880105e..698d7fc 100644
--- a/exec-obsolete.h
+++ b/exec-obsolete.h
@@ -63,6 +63,55 @@ void qemu_unregister_coalesced_mmio(target_phys_addr_t addr, 
ram_addr_t size);
 
 int cpu_physical_memory_set_dirty_tracking(int enable);
 
+#define VGA_DIRTY_FLAG   0x01
+#define CODE_DIRTY_FLAG  0x02
+#define MIGRATION_DIRTY_FLAG 0x08
+
+/* read dirty bit (return 0 or 1) */
+static inline int cpu_physical_memory_is_dirty(ram_addr_t addr)
+{
+return ram_list.phys_dirty[addr >> TARGET_PAGE_BITS] == 0xff;
+}
+
+static inline int cpu_physical_memory_get_dirty_flags(ram_addr_t addr)
+{
+return ram_list.phys_dirty[addr >> TARGET_PAGE_BITS];
+}
+
+static inline int cpu_physical_memory_get_dirty(ram_addr_t addr,
+int dirty_flags)
+{
+return ram_list.phys_dirty[addr >> TARGET_PAGE_BITS] & dirty_flags;
+}
+
+static inline void cpu_physical_memory_set_dirty(ram_addr_t addr)
+{
+ram_list.phys_dirty[addr >> TARGET_PAGE_BITS] = 0xff;
+}
+
+static inline int cpu_physical_memory_set_dirty_flags(ram_addr_t addr,
+  int dirty_flags)
+{
+return ram_list.phys_dirty[addr >> TARGET_PAGE_BITS] |= dirty_flags;
+}
+
+static inline void cpu_physical_memory_mask_dirty_range(ram_addr_t start,
+int length,
+int dirty_flags)
+{
+int i, mask, len;
+uint8_t *p;
+
+len = length >> TARGET_PAGE_BITS;
+mask = ~dirty_flags;
+p = ram_list.phys_dirty + (start >> TARGET_PAGE_BITS);
+for (i = 0; i < len; i++) {
+p[i] &= mask;
+}
+}
+
+void cpu_physical_memory_reset_dirty(ram_addr_t start, ram_addr_t end,
+ int dirty_flags);
 #endif
 
 #endif
-- 
1.7.7.1




Re: [Qemu-devel] [PATCH 01/27] qom: add the base Object class

2011-12-21 Thread Anthony Liguori

On 12/21/2011 07:35 AM, Paolo Bonzini wrote:

On 12/20/2011 05:51 PM, Anthony Liguori wrote:

This class provides the main building block for QEMU Object Model and is
extensively documented in the header file. It is largely inspired by GObject.

Signed-off-by: Anthony Liguori
---
Makefile.objs | 2 +
hw/object.c | 469 +
hw/object.h | 427 +++
3 files changed, 898 insertions(+), 0 deletions(-)
create mode 100644 hw/object.c
create mode 100644 hw/object.h

diff --git a/Makefile.objs b/Makefile.objs
index f753d83..b86e8a1 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -122,6 +122,8 @@ common-obj-$(CONFIG_WIN32) += version.o

common-obj-$(CONFIG_SPICE) += ui/spice-core.o ui/spice-input.o
ui/spice-display.o spice-qemu-char.o

+common-obj-y += object.o
+
audio-obj-y = audio.o noaudio.o wavaudio.o mixeng.o
audio-obj-$(CONFIG_SDL) += sdlaudio.o
audio-obj-$(CONFIG_OSS) += ossaudio.o
diff --git a/hw/object.c b/hw/object.c
new file mode 100644
index 000..620e63f
--- /dev/null
+++ b/hw/object.c
@@ -0,0 +1,469 @@
+/*
+ * QEMU Object Model
+ *
+ * Copyright IBM, Corp. 2011
+ *
+ * Authors:
+ * Anthony Liguori
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "object.h"
+
+#define MAX_INTERFACES 32
+
+typedef struct InterfaceImpl
+{
+ const char *parent;
+ void (*interface_initfn)(ObjectClass *class, void *data);
+ Type type;
+} InterfaceImpl;
+
+typedef struct TypeImpl
+{
+ const char *name;
+ Type type;


What's the need for "Type"? You can use simply the TypeImpl * and drop
type_get_instance. Outside object.h it can be an opaque pointer.


It's a bit nicer for type_register to return a handle that can later be 
unregistered (although that's not currently implemented).


You could have it return TypeImpl * of course.  GObject uses a simpler GType but 
they don't have the notion of a symbolic type name.


I used a symbolic type name to avoid the problem of dependencies.  In order to 
create a type in gobject, you have to reference the parent's GType which usually 
means you have to call the _get_type() function which acts as a singleton which 
registers the type.


Since you have to specify the parent via a function call, you can't define the 
type in a unit-level static structure which I viewed as a critical requirement.


So it might be worth refactoring it away but I'd prefer to do that in the future 
once we're sure it's not useful.



+
+ size_t class_size;
+
+ size_t instance_size;
+
+ void (*base_init)(ObjectClass *klass);
+ void (*base_finalize)(ObjectClass *klass);
+
+ void (*class_init)(ObjectClass *klass, void *data);
+ void (*class_finalize)(ObjectClass *klass, void *data);
+
+ void *class_data;
+
+ void (*instance_init)(Object *obj);
+ void (*instance_finalize)(Object *obj);
+
+ bool abstract;
+
+ const char *parent;
+
+ ObjectClass *class;
+
+ int num_interfaces;
+ InterfaceImpl interfaces[MAX_INTERFACES];


... this way you can also allocate dynamically and use a variable-sized array
for interfaces.


This could be made dynamic fairly easily.



+} TypeImpl;
+
+static int num_types = 1;
+static TypeImpl type_table[1024];


... and you can also drop this.


Yes, this could be gotten rid of.




+Type type_register_static(const TypeInfo *info)
+{
+ Type type = num_types++;
+ TypeImpl *ti;
+
+ ti =&type_table[type];
+
+ assert(info->name != NULL);
+
+ printf("Added type %s -> %s\n", info->name, info->parent);
+
+ ti->name = info->name;


Why no strdup here?


'static' means that the strings are const char *.




+ ti->parent = info->parent;


Please store a Type or a pointer to TypeImpl.


It needs to be a symbolic name because the TypeImpl may not exist yet since we 
can't guarantee registration order.


This ends up being a tremendous simplification because we don't have to care 
about type registration order.



Otherwise, I don't see the need to
distinguish TypeInfo/InterfaceInfo and TypeImpl/InterfaceImpl at all. The only
difference is num_interfaces, and

for (i = 0; i < ti->num_interfaces; i++) {

can be replaced just as well with

for (i = 0; i < ti->interfaces[i].type; i++) {


InterfaceInfo is specifically limited in what it contains to prevent someone 
from shooting themselves in the foot.  It may be possible to just use TypeImpl 
in the backend but we should keep InterfaceInfo limited.



+ ti->type = type;
+
+ ti->class_size = info->class_size;
+ ti->instance_size = info->instance_size;
+
+ ti->base_init = info->base_init;
+ ti->base_finalize = info->base_finalize;
+
+ ti->class_init = info->class_init;
+ ti->class_finalize = info->class_finalize;
+ ti->class_data = info->class_data;
+
+ ti->instance_init = info->instance_init;
+ ti->instance_finalize = info->instance_finalize;
+
+ ti->abstract = info->abstract;
+
+ if (info->interfaces) {
+ int i;
+
+ for (i = 0; info->interfaces[i].typ

Re: [Qemu-devel] [PATCH v2 01/10] qapi: add Visitor interfaces for uint*_t and int*_t

2011-12-21 Thread Anthony Liguori

On 12/21/2011 06:35 AM, Paolo Bonzini wrote:

On 12/20/2011 09:56 PM, Anthony Liguori wrote:

As always, you can implement that in many ways. However, I think the
point of using Visitors is not to remove QEMUFile.


Yes, it is.


The point of using Visitors is to provide a standard representation of device
state. QEMUFile is but one consumer of that representation, along with any other
migration filter.


Can you do a quick code mock up of how you'd expect this to work?  I'm not sure 
if I'm just being dense, but I'm having a really hard time understanding what 
you're suggesting.  How I see this evolving with Mike's series is:


struct DeviceStateClass {
   ObjectClass parent;

   void (*load_state)(DeviceState *obj, Visitor *v, const char *name,
  Error **errp);
   void (*save_state)(DeviceState *obj, Visitor *v, const char *name,
  Error **errp);
};

The PCIDevice base class would expose:

/* protected */
void pci_load_state(PCIDevice *obj, Visitor *v, const char *name,
Error **errp)
{
visit_start_struct(v, "PCIDevice", name, errp);
visit_type_int8(v, &obj->reg[0], "reg[0]", errp);
..
visit_end_struct(v, errp);
}

Or:

static VMStateDescription pci_device_desc = {
  .name = "PCIDevice",
  .fields = {
 VMSTATE_UINT8(PCIDevice, reg[0]),
 ...
  }
};

void pci_load_state(PCIDevice *obj, Visitor *v, const char *name,
Error **errp)
{
visit_with_vmstate(v, obj, &pci_device_desc, name, errp);
}

A subclass would do:

static void my_device_load_state(DeviceState *obj, Visitor *v, const char *name,
 Error **errp)
{
MyDevice *s = MY_DEVICE(obj);
visit_start_struct(v, "MyDevice", name, errp);
pci_load_state(PCI_DEVICE(obj), v, "super", errp);
visit_end_struct(v, errp);
}

static void my_device_class_init(ObjectClass *klass, void *data)
{
   DeviceClass *dc = DEVICE_CLASS(klass);

   dc->load_state = my_device_load_state;
   ...
}

There's no reference at all to QEMUFile.  The load_state/save_state methods can 
be exposed as a "state" property too.


Once the series 2/4 lands and Mike's series, implementing this should be very 
straight forward.


Regards,

Anthony Liguori



Re: [Qemu-devel] [PATCH v4 04/11] ARM: exynos4210: IRQ subsystem support.

2011-12-21 Thread Evgeny Voevodin

On 12/21/2011 05:50 PM, Peter Maydell wrote:

On 19 December 2011 11:53, Evgeny Voevodin  wrote:


+static uint64_t exynos4210_gic_cpu_read(void *opaque, target_phys_addr_t 
offset,
+unsigned size)
+{
+Exynos4210GicState *s = (Exynos4210GicState *) opaque;
+DPRINTF_EXYNOS4210_GIC("CPU%d: read offset 0x%x\n",
+gic_get_current_cpu(), offset);
+return gic_cpu_read(&s->gic, gic_get_current_cpu(), offset&  ~0x8000);
+}

arm_gic.c exposes the CPU and distributor interfaces as their own
memory regions now -- you shouldn't need any of this intermediate
layer of functions.

(Reviewing the rest of this series is on my todo list but I can't
guarantee I'll get to it until after Christmas now.)

-- PMM

These functions are not actually for splitting CPU and Distributer 
interfaces.
In our board we have two GICs - internal and external. Internal GIC is 
completely

matching arm_gic.c.

Internal GIC CPU[n] and Distributer[n] interfaces are at 0x100 and 
0x1000 offsets from

0x1050 base.

But external GIC is different.
It's CPU[0] interface is at 0x0 offset from 0x1048 base
and
  CPU[1] interface is at 0x8000 offset from 0x1048 base

It's Distributer[0] interface is at 0x0 offset from 0x1049 base
and
  Distributer[1] interface is at 0x8000 offset from 0x1049 base

[n] - is corresponding to SMP CPU Core.

So, we need these wrapper functions for External GIC.
In public accessed documentation internal GIC is not covered for some 
reason.


--
Kind regards,
Evgeny Voevodin,
Leading Software Engineer,
ASWG, Moscow R&D center, Samsung Electronics
e-mail: e.voevo...@samsung.com




Re: [Qemu-devel] [PATCH 3/8] Sort RAMBlocks by ID for migration, not by ram_addr

2011-12-21 Thread Avi Kivity
On 12/21/2011 03:55 PM, Anthony Liguori wrote:
> On 12/21/2011 07:34 AM, Avi Kivity wrote:
>> ram_addr is (a) unstable (b) going away.  Sort by idstr instead.
>>
>> Signed-off-by: Avi Kivity
>
> I don't see this as a problem, per say, but this is a significant
> behavioral change.  ram_addr does correspond roughly to the location
> in memory and historically we would send memory starting from 0 upward
> whereas now, the order that we send RAMBlocks will be random for all
> intents and purposes.
>
> Again, I don't think it's a problem, but we should note this in the
> commit message in case it creates a problem down the road.
>

I (and the new commit message) note that the sort order used to be
random a while ago (before b2e0a138e), so this isn't entirely new.

-- 
error compiling committee.c: too many arguments to function




Re: [Qemu-devel] [PATCH 4/8] Remove support for version 3 ram_load

2011-12-21 Thread Avi Kivity
On 12/21/2011 03:57 PM, Anthony Liguori wrote:
> On 12/21/2011 07:34 AM, Avi Kivity wrote:
>> Version 3 ram_load depends on ram_addrs, which are not stable. 
>> Version 4
>> was introduced in 0.13 (and RHEL 6), so this means live migration
>> from 0.12
>> and earlier to 1.1 or later will not work.
>
> Can you please make a note on http://wiki.qemu.org/ChangeLog/Next

Done.

-- 
error compiling committee.c: too many arguments to function




Re: [Qemu-devel] [PATCH 01/27] qom: add the base Object class

2011-12-21 Thread Paolo Bonzini

On 12/21/2011 03:35 PM, Anthony Liguori wrote:

+Type type_register_static(const TypeInfo *info)
+{
+ Type type = num_types++;
+ TypeImpl *ti;
+
+ ti =&type_table[type];
+
+ assert(info->name != NULL);
+
+ printf("Added type %s -> %s\n", info->name, info->parent);
+
+ ti->name = info->name;


Why no strdup here?


'static' means that the strings are const char *.


Sure, but if you later want to unregister, the ownership of ->name and 
->parent need to be the same for all paths.  In this case 
type_register_anonymous makes it "owned by object.c" and 
type_register_static makes it "owned by caller".



+ ti->parent = info->parent;


Please store a Type or a pointer to TypeImpl.


It needs to be a symbolic name because the TypeImpl may not exist yet
since we can't guarantee registration order.

This ends up being a tremendous simplification because we don't have to
care about type registration order.


Got it now.  However, let's cache the Type as soon as we resolve it the 
first time.



Otherwise, I don't see the need to
distinguish TypeInfo/InterfaceInfo and TypeImpl/InterfaceImpl at all.
The only
difference is num_interfaces, and

for (i = 0; i < ti->num_interfaces; i++) {

can be replaced just as well with

for (i = 0; i < ti->interfaces[i].type; i++) {


InterfaceInfo is specifically limited in what it contains to prevent
someone from shooting themselves in the foot. It may be possible to just
use TypeImpl in the backend but we should keep InterfaceInfo limited.


Let's keep Impl/Info separate, but use the difference so that we can 
cache name->type mappings as much as possible.



+ return type;


The return value is unused. Looks like a premature optimization or a
leftover
from GObject.


It will be used to allow type unregistration. Think about dynamic
loadable/unloadable modules.


Ack.


+ for (i = 1; i< num_types; i++) {
+ if (strcmp(name, type_table[i].name) == 0) {
+ return i;
+ }
+ }


Please use a hash table here. Ultimately object creation might be
in hot paths. For example I would like to turn SCSIRequests into
QOM objects. It would let me reuse the reference counting as well
as help with migration.


Ack.


But in any case, this function should be called as little as
possible, and should not be static in case other places want to
cache the outcome.


Hrm, I don't think it should ever be used outside of object.c. What are
you thinking re: caching.


I'm thinking of using


+static void object_interface_init(Object *obj, InterfaceImpl *iface)
+{
+ TypeImpl *ti = type_get_instance(iface->type);
+ Interface *iface_obj;
+
+ iface_obj = INTERFACE(object_new(ti->name));
+ iface_obj->obj = obj;
+
+ obj->interfaces = g_slist_prepend(obj->interfaces, iface_obj);


Please use a QSIMPLEQ.


I saw your github comment and looked at it. Turns out, Interface
shouldn't be public at all (because you only ever subclass
InterfaceClass, never Interface).


I agree that Interface and INTERFACE() should be private to object.c, 
but it's not a big deal to leave them as opaque in object.h.  Leaving 
Interface as an opaque type is enough in order to use QSIMPLEQ.



+void object_initialize(void *data, const char *typename)
+{
+ TypeImpl *ti = type_get_instance(type_get_by_name(typename));


I should be able to pass directly a Type (or a TypeImpl, whatever
happens of my
suggestion above) here.


The Type is not public. It's a dynamic value and unless we introduce a
_get_type() function (which would create a dependency problem), there's
no easy way to do it.


True, we need to keep types as strings, but we can do some kind of lazy 
initialization.  See IRC conversation + later.



Symbolic names really help simplify things by allowing dynamic
dependency resolution.


Yes, I understand this now.  But once we're out of the initialization 
path we should be able to avoid them in hot paths.

At the very least provide a function that takes a string and one that
takes a Type/TypeImpl, and always use the latter when you tail call.


You're concerned about performance?


Yes.  At least for a simple subclass of Object, object_new should not be 
_that_ much slower than a g_malloc.



Typo, you want deinit. And again, let's avoid type names. This is C,
not BASIC.


Ack on the deinit. But see above re: type names.


Yeah, I see now the problem with dependencies, but I'm pretty sure we'll 
reach a compromise. :)



+typedef uint64_t Type;


struct TypeImpl;
typedef struct TypeImpl *Type;


Not so sure on this.


It doesn't really matter, it can be refactored away later.  But on IRC 
you said it worked out nicely.


The point is when to use strings and when to use type handles.  It 
doesn't matter whether the handles are direct pointers or indices into a 
registry.





+typedef struct ObjectClass ObjectClass;
+typedef struct Object Object;
+
+typedef struct TypeInfo TypeInfo;
+
+typedef struct InterfaceClass InterfaceClass;
+typedef struct Interface Interface;
+typedef struct InterfaceInfo InterfaceInfo;
+
+#define TYPE_OBJECT N

[Qemu-devel] [PATCH] qemu-io: add option to enable tracing

2011-12-21 Thread Stefan Hajnoczi
It can be useful to enable QEMU tracing when trying out block layer
interfaces via qemu-io.  Tracing can be enabled using the new -t FILE
option where the given file contains a list of trace events to enable
(just like the qemu --trace events=FILE option).

  $ echo qemu_vfree >my-events
  $ ./qemu-io -t my-events ...

Remember to use ./configure --enable-trace-backend=BACKEND when building
qemu-io.

Signed-off-by: Stefan Hajnoczi 
---
 qemu-io.c |8 +++-
 1 files changed, 7 insertions(+), 1 deletions(-)

diff --git a/qemu-io.c b/qemu-io.c
index ffa62fb..ad91fd6 100644
--- a/qemu-io.c
+++ b/qemu-io.c
@@ -17,6 +17,7 @@
 #include "qemu-common.h"
 #include "block_int.h"
 #include "cmd.h"
+#include "trace/control.h"
 
 #define VERSION"0.0.1"
 
@@ -1722,6 +1723,7 @@ static void usage(const char *name)
 "  -g, --growable   allow file to grow (only applies to protocols)\n"
 "  -m, --misalign   misalign allocations for O_DIRECT\n"
 "  -k, --native-aio use kernel AIO implementation (on Linux only)\n"
+"  -t, --trace FILE enable trace events listed in the given file\n"
 "  -h, --help   display this help and exit\n"
 "  -V, --versionoutput version information and exit\n"
 "\n",
@@ -1733,7 +1735,7 @@ int main(int argc, char **argv)
 {
 int readonly = 0;
 int growable = 0;
-const char *sopt = "hVc:rsnmgk";
+const char *sopt = "hVc:rsnmgkt:";
 const struct option lopt[] = {
 { "help", 0, NULL, 'h' },
 { "version", 0, NULL, 'V' },
@@ -1745,6 +1747,7 @@ int main(int argc, char **argv)
 { "misalign", 0, NULL, 'm' },
 { "growable", 0, NULL, 'g' },
 { "native-aio", 0, NULL, 'k' },
+{ "trace", 1, NULL, 't' },
 { NULL, 0, NULL, 0 }
 };
 int c;
@@ -1776,6 +1779,9 @@ int main(int argc, char **argv)
 case 'k':
 flags |= BDRV_O_NATIVE_AIO;
 break;
+case 't':
+trace_backend_init(optarg, NULL);
+break;
 case 'V':
 printf("%s version %s\n", progname, VERSION);
 exit(0);
-- 
1.7.7.3




Re: [Qemu-devel] [PATCH v2 01/10] qapi: add Visitor interfaces for uint*_t and int*_t

2011-12-21 Thread Paolo Bonzini

On 12/21/2011 03:45 PM, Anthony Liguori wrote:

On 12/21/2011 06:35 AM, Paolo Bonzini wrote:

On 12/20/2011 09:56 PM, Anthony Liguori wrote:

As always, you can implement that in many ways. However, I think the
point of using Visitors is not to remove QEMUFile.


Yes, it is.


The point of using Visitors is to provide a standard representation of
device
state. QEMUFile is but one consumer of that representation, along with
any other
migration filter.


Can you do a quick code mock up of how you'd expect this to work?


void
vmstate_get(Device *obj, Visitor *v, void *opaque, const char *name)
{
/* Use the VMStateDescription in opaque to add fields to Visitor
   from the struct.  */
}

void
vmstate_set(Device *obj, Visitor *v, void *opaque, const char *name)
{
/* Use the VMStateDescription in opaque to retrieve fields from
   Visitor and store them in the struct.  */
}

void
vmstate_load_state(Device *obj, Visitor *v, QEMUFile )
{
VMStateDescription *vmsd = ???; /* something from the DeviceClass */

/* Use the VMStateDescription in opaque to retrieve fields from
   Visitor and store them in the struct.  This is basically the
   VMState interpreter currently in savevm.c, only fetching fields
   from v rather than the file.  */
}

void
vmstate_save_state(Device *obj, Visitor *v, QEMUFile *out)
{
VMStateDescription *vmsd = ???; /* something from the DeviceClass */

/* Use the VMStateDescription in opaque to fetch fields from
   Visitor and store them in the file.  This is basically the
   other VMState interpreter currently in savevm.c, but fetching
   fields from v rather than the struct.  */
}

void
qdev_add_vmstate(Device *obj, VMStateDescription *vmsd)
{
qdev_add_property(obj, "vmstate", vmstate_get, vmstate_set, vmsd);
qdev_add_interface(obj, "QEMUFileSerializable", vmstate_load_state,
   vmstate_save_state);
}


savevm.c:

Visitor *v = qmp_output_visitor_new();
qdev_property_get(obj, "vmstate", v);
QObject *qobj = qmp_visitor_get_obj(v);
qmp_visitor_free(v);

Visitor *v = qmp_input_visitor_new(qobj);
QEMUFileSerializable *s = QEMU_FILE_SERIALIZABLE(obj);
s->save_state(obj, v, outfile);
qmp_visitor_free(v);

...

Visitor *v = qmp_output_visitor_new();
QEMUFileSerializable *s = QEMU_FILE_SERIALIZABLE(obj);
s->load_state(obj, v, outfile);
QObject *qobj = qmp_visitor_get_obj(v);
qmp_visitor_free(v);

Visitor *v = qmp_input_visitor_new(qobj);
QEMUFileSerializable *s = QEMU_FILE_SERIALIZABLE(obj);
qdev_property_set(obj, "vmstate", v);
qmp_visitor_free(v);

-

Yes, this makes QEMUFile serialization special.  But that's because it's 
legacy, and it needs to do strange things and store things beyond the 
contents of the vmstate.


Take for example "unused" fields.  In Mike's implementation, if you try 
to pass a QMPOutputVisitor to save_state you might get a "unused" entry 
that is an array of zeros.  I have no idea what happens if a single 
VMStateDescription has more than one VMSTATE_UNUSED field. 
QEMUFileOutputVisitor completely ignores the field names, so it probably 
works but would break with QMPOutputVisitor.


Other serialization backends should not need any hooks in the devices 
beyond save_state and load_state.


Paolo



[Qemu-devel] [PATCH v3 0/6] block: zero writes

2011-12-21 Thread Stefan Hajnoczi
This series adds an interface for efficient writes when data contains all
zeros.  It also takes advantage of this new interface by extending the
copy-on-read feature to perform zero-detection.

The details of efficient zero representations depend on the image format.  This
series includes a patch for the QED image format to write special "zero
clusters" that keep the image file compact.  In the future qcow2v3 could also
support an efficient zero representation.

The new BlockDriver interface is called .bdrv_co_write_zeroes() and is
optional.  If the interface is not implemented by a BlockDriver then a regular
.bdrv_co_writev() operation will be performed.  The public interface is called
bdrv_co_write_zeroes() and can be tested via the new qemu-io write -z option.

Copy-on-read is extended to detect zeroes and invoke the
.bdrv_co_write_zeroes() interface when possible.  As a result we avoid bloating
the image file if the backing file contains zeroes.

My motivation for this feature is efficient image streaming.  The destination
file must stay compact even when copying zeroes from the source file.

We now only do zero detection for copy-on-read requests, whereas previous
revisions of this patch series scanned all write requests for zeroes.  The old
behavior wasted CPU cycles in most cases but we could add a feature to
explicitly scan guest writes for zeroes in the future, if desired.

v2:
 * Introduce .bdrv_co_write_zeroes() [Kevin]
 * Perform zero detection only on copy-on-read requests [Kevin]

Stefan Hajnoczi (6):
  cutils: extract buffer_is_zero() from qemu-img.c
  block: add .bdrv_co_write_zeroes() interface
  block: perform zero-detection during copy-on-read
  qed: replace is_write with flags field
  qed: add .bdrv_co_write_zeroes() support
  qemu-io: add write -z option for bdrv_co_write_zeroes

 block.c   |   64 +
 block.h   |7 +++
 block/qed.c   |  125 ++---
 block/qed.h   |7 +++-
 block_int.h   |8 
 cutils.c  |   34 +++
 qemu-common.h |2 +
 qemu-img.c|   46 +++--
 qemu-io.c |   77 +++
 trace-events  |3 +-
 10 files changed, 300 insertions(+), 73 deletions(-)

-- 
1.7.7.3




[Qemu-devel] [PATCH v3 1/6] cutils: extract buffer_is_zero() from qemu-img.c

2011-12-21 Thread Stefan Hajnoczi
The qemu-img.c:is_not_zero() function checks if a buffer contains all
zeroes.  This function will come in handy for zero-detection in the
block layer, so clean it up and move it to cutils.c.

Note that the function now returns true if the buffer is all zeroes.
This avoids the double-negatives (i.e. !is_not_zero()) that the old
function can cause in callers.

Signed-off-by: Stefan Hajnoczi 
---
 cutils.c  |   34 ++
 qemu-common.h |2 ++
 qemu-img.c|   46 +++---
 3 files changed, 43 insertions(+), 39 deletions(-)

diff --git a/cutils.c b/cutils.c
index 24b3fe3..c9560c3 100644
--- a/cutils.c
+++ b/cutils.c
@@ -301,6 +301,40 @@ void qemu_iovec_memset_skip(QEMUIOVector *qiov, int c, 
size_t count,
 }
 }
 
+/*
+ * Checks if a buffer is all zeroes
+ *
+ * Attention! The len must be a multiple of 4 * sizeof(long) due to
+ * restriction of optimizations in this function.
+ */
+bool buffer_is_zero(const void *buf, size_t len)
+{
+/*
+ * Use long as the biggest available internal data type that fits into the
+ * CPU register and unroll the loop to smooth out the effect of memory
+ * latency.
+ */
+
+size_t i;
+long d0, d1, d2, d3;
+const long * const data = buf;
+
+len /= sizeof(long);
+
+for (i = 0; i < len; i += 4) {
+d0 = data[i + 0];
+d1 = data[i + 1];
+d2 = data[i + 2];
+d3 = data[i + 3];
+
+if (d0 || d1 || d2 || d3) {
+return false;
+}
+}
+
+return true;
+}
+
 #ifndef _WIN32
 /* Sets a specific flag */
 int fcntl_setfl(int fd, int flag)
diff --git a/qemu-common.h b/qemu-common.h
index b2de015..95fa2b2 100644
--- a/qemu-common.h
+++ b/qemu-common.h
@@ -293,6 +293,8 @@ void qemu_iovec_memset(QEMUIOVector *qiov, int c, size_t 
count);
 void qemu_iovec_memset_skip(QEMUIOVector *qiov, int c, size_t count,
 size_t skip);
 
+bool buffer_is_zero(const void *buf, size_t len);
+
 void qemu_progress_init(int enabled, float min_skip);
 void qemu_progress_end(void);
 void qemu_progress_print(float delta, int max);
diff --git a/qemu-img.c b/qemu-img.c
index 01cc0d3..c4bcf41 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -515,40 +515,6 @@ static int img_commit(int argc, char **argv)
 }
 
 /*
- * Checks whether the sector is not a zero sector.
- *
- * Attention! The len must be a multiple of 4 * sizeof(long) due to
- * restriction of optimizations in this function.
- */
-static int is_not_zero(const uint8_t *sector, int len)
-{
-/*
- * Use long as the biggest available internal data type that fits into the
- * CPU register and unroll the loop to smooth out the effect of memory
- * latency.
- */
-
-int i;
-long d0, d1, d2, d3;
-const long * const data = (const long *) sector;
-
-len /= sizeof(long);
-
-for(i = 0; i < len; i += 4) {
-d0 = data[i + 0];
-d1 = data[i + 1];
-d2 = data[i + 2];
-d3 = data[i + 3];
-
-if (d0 || d1 || d2 || d3) {
-return 1;
-}
-}
-
-return 0;
-}
-
-/*
  * Returns true iff the first sector pointed to by 'buf' contains at least
  * a non-NUL byte.
  *
@@ -557,20 +523,22 @@ static int is_not_zero(const uint8_t *sector, int len)
  */
 static int is_allocated_sectors(const uint8_t *buf, int n, int *pnum)
 {
-int v, i;
+bool is_zero;
+int i;
 
 if (n <= 0) {
 *pnum = 0;
 return 0;
 }
-v = is_not_zero(buf, 512);
+is_zero = buffer_is_zero(buf, 512);
 for(i = 1; i < n; i++) {
 buf += 512;
-if (v != is_not_zero(buf, 512))
+if (is_zero != buffer_is_zero(buf, 512)) {
 break;
+}
 }
 *pnum = i;
-return v;
+return !is_zero;
 }
 
 /*
@@ -955,7 +923,7 @@ static int img_convert(int argc, char **argv)
 if (n < cluster_sectors) {
 memset(buf + n * 512, 0, cluster_size - n * 512);
 }
-if (is_not_zero(buf, cluster_size)) {
+if (!buffer_is_zero(buf, cluster_size)) {
 ret = bdrv_write_compressed(out_bs, sector_num, buf,
 cluster_sectors);
 if (ret != 0) {
-- 
1.7.7.3




[Qemu-devel] [PATCH v3 3/6] block: perform zero-detection during copy-on-read

2011-12-21 Thread Stefan Hajnoczi
Copy-on-Read populates the image file with data read from a backing
image.  In order to avoid bloating the image file when all zeroes are
read we should scan the buffer and perform an optimized zero write
operation.

Signed-off-by: Stefan Hajnoczi 
---
 block.c |   14 +++---
 1 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/block.c b/block.c
index 5ebbd4d..967a583 100644
--- a/block.c
+++ b/block.c
@@ -1506,6 +1506,7 @@ static int coroutine_fn 
bdrv_co_copy_on_readv(BlockDriverState *bs,
  */
 void *bounce_buffer;
 
+BlockDriver *drv = bs->drv;
 struct iovec iov;
 QEMUIOVector bounce_qiov;
 int64_t cluster_sector_num;
@@ -1526,14 +1527,21 @@ static int coroutine_fn 
bdrv_co_copy_on_readv(BlockDriverState *bs,
 iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
 qemu_iovec_init_external(&bounce_qiov, &iov, 1);
 
-ret = bs->drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
- &bounce_qiov);
+ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
+ &bounce_qiov);
 if (ret < 0) {
 goto err;
 }
 
-ret = bs->drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
+if (drv->bdrv_co_write_zeroes &&
+buffer_is_zero(bounce_buffer, iov.iov_len)) {
+ret = drv->bdrv_co_write_zeroes(bs, cluster_sector_num,
+cluster_nb_sectors);
+} else {
+ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
   &bounce_qiov);
+}
+
 if (ret < 0) {
 /* It might be okay to ignore write errors for guest requests.  If this
  * is a deliberate copy-on-read then we don't want to ignore the error.
-- 
1.7.7.3




[Qemu-devel] [PATCH v3 4/6] qed: replace is_write with flags field

2011-12-21 Thread Stefan Hajnoczi
Per-request attributes like read/write are currently implemented as bool
fields in the QEDAIOCB struct.  This becomes unwiedly as the number of
attributes grows.  For example, the qed_aio_setup() function would have
to take multiple bool arguments and at call sites it would be hard to
distinguish the meaning of each bool.

Instead use a flags field with bitmask constants.  This will be used
when zero write support is added.

Signed-off-by: Stefan Hajnoczi 
---
 block/qed.c  |   15 ---
 block/qed.h  |6 +-
 trace-events |2 +-
 3 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/block/qed.c b/block/qed.c
index 8da3ebe..b66dd17 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -1233,8 +1233,8 @@ static void qed_aio_next_io(void *opaque, int ret)
 {
 QEDAIOCB *acb = opaque;
 BDRVQEDState *s = acb_to_s(acb);
-QEDFindClusterFunc *io_fn =
-acb->is_write ? qed_aio_write_data : qed_aio_read_data;
+QEDFindClusterFunc *io_fn = (acb->flags & QED_AIOCB_WRITE) ?
+qed_aio_write_data : qed_aio_read_data;
 
 trace_qed_aio_next_io(s, acb, ret, acb->cur_pos + acb->cur_qiov.size);
 
@@ -1264,14 +1264,14 @@ static BlockDriverAIOCB *qed_aio_setup(BlockDriverState 
*bs,
int64_t sector_num,
QEMUIOVector *qiov, int nb_sectors,
BlockDriverCompletionFunc *cb,
-   void *opaque, bool is_write)
+   void *opaque, int flags)
 {
 QEDAIOCB *acb = qemu_aio_get(&qed_aio_pool, bs, cb, opaque);
 
 trace_qed_aio_setup(bs->opaque, acb, sector_num, nb_sectors,
- opaque, is_write);
+opaque, flags);
 
-acb->is_write = is_write;
+acb->flags = flags;
 acb->finished = NULL;
 acb->qiov = qiov;
 acb->qiov_offset = 0;
@@ -1291,7 +1291,7 @@ static BlockDriverAIOCB 
*bdrv_qed_aio_readv(BlockDriverState *bs,
 BlockDriverCompletionFunc *cb,
 void *opaque)
 {
-return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, false);
+return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
 }
 
 static BlockDriverAIOCB *bdrv_qed_aio_writev(BlockDriverState *bs,
@@ -1300,7 +1300,8 @@ static BlockDriverAIOCB 
*bdrv_qed_aio_writev(BlockDriverState *bs,
  BlockDriverCompletionFunc *cb,
  void *opaque)
 {
-return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, true);
+return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb,
+ opaque, QED_AIOCB_WRITE);
 }
 
 static BlockDriverAIOCB *bdrv_qed_aio_flush(BlockDriverState *bs,
diff --git a/block/qed.h b/block/qed.h
index 62cbd3b..abed147 100644
--- a/block/qed.h
+++ b/block/qed.h
@@ -123,12 +123,16 @@ typedef struct QEDRequest {
 CachedL2Table *l2_table;
 } QEDRequest;
 
+enum {
+QED_AIOCB_WRITE = 0x0001,   /* read or write? */
+};
+
 typedef struct QEDAIOCB {
 BlockDriverAIOCB common;
 QEMUBH *bh;
 int bh_ret; /* final return status for completion bh */
 QSIMPLEQ_ENTRY(QEDAIOCB) next;  /* next request */
-bool is_write;  /* false - read, true - write */
+int flags;  /* QED_AIOCB_* bits ORed together */
 bool *finished; /* signal for cancel completion */
 uint64_t end_pos;   /* request end on block device, in bytes */
 
diff --git a/trace-events b/trace-events
index fd2d7d9..6d063c4 100644
--- a/trace-events
+++ b/trace-events
@@ -309,7 +309,7 @@ qed_need_check_timer_cb(void *s) "s %p"
 qed_start_need_check_timer(void *s) "s %p"
 qed_cancel_need_check_timer(void *s) "s %p"
 qed_aio_complete(void *s, void *acb, int ret) "s %p acb %p ret %d"
-qed_aio_setup(void *s, void *acb, int64_t sector_num, int nb_sectors, void 
*opaque, int is_write) "s %p acb %p sector_num %"PRId64" nb_sectors %d opaque 
%p is_write %d"
+qed_aio_setup(void *s, void *acb, int64_t sector_num, int nb_sectors, void 
*opaque, int flags) "s %p acb %p sector_num %"PRId64" nb_sectors %d opaque %p 
flags %#x"
 qed_aio_next_io(void *s, void *acb, int ret, uint64_t cur_pos) "s %p acb %p 
ret %d cur_pos %"PRIu64
 qed_aio_read_data(void *s, void *acb, int ret, uint64_t offset, size_t len) "s 
%p acb %p ret %d offset %"PRIu64" len %zu"
 qed_aio_write_data(void *s, void *acb, int ret, uint64_t offset, size_t len) 
"s %p acb %p ret %d offset %"PRIu64" len %zu"
-- 
1.7.7.3




[Qemu-devel] [PATCH v3 2/6] block: add .bdrv_co_write_zeroes() interface

2011-12-21 Thread Stefan Hajnoczi
The ability to zero regions of an image file is a useful primitive for
higher-level features such as image streaming or zero write detection.

Image formats may support an optimized metadata representation instead
of writing zeroes into the image file.  This allows zero writes to be
potentially faster than regular write operations and also preserve
sparseness of the image file.

The .bdrv_co_write_zeroes() interface should be implemented by block
drivers that wish to provide efficient zeroing.

Note that this operation is different from the discard operation, which
may leave the contents of the region indeterminate.  That means
discarded blocks are not guaranteed to contain zeroes and may contain
junk data instead.

Signed-off-by: Stefan Hajnoczi 
---
 block.c  |   50 --
 block.h  |7 +++
 block_int.h  |8 
 trace-events |1 +
 4 files changed, 60 insertions(+), 6 deletions(-)

diff --git a/block.c b/block.c
index 3f072f6..5ebbd4d 100644
--- a/block.c
+++ b/block.c
@@ -64,7 +64,7 @@ static int coroutine_fn bdrv_co_writev_em(BlockDriverState 
*bs,
 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
-int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
+int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, bool write_zeroes);
 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
int64_t sector_num,
QEMUIOVector *qiov,
@@ -1291,7 +1291,7 @@ static void coroutine_fn bdrv_rw_co_entry(void *opaque)
  rwco->nb_sectors, rwco->qiov);
 } else {
 rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
-  rwco->nb_sectors, rwco->qiov);
+  rwco->nb_sectors, rwco->qiov, false);
 }
 }
 
@@ -1608,11 +1608,37 @@ int coroutine_fn bdrv_co_readv(BlockDriverState *bs, 
int64_t sector_num,
 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov);
 }
 
+static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
+int64_t sector_num, int nb_sectors)
+{
+BlockDriver *drv = bs->drv;
+QEMUIOVector qiov;
+struct iovec iov;
+int ret;
+
+/* First try the efficient write zeroes operation */
+if (drv->bdrv_co_write_zeroes) {
+return drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
+}
+
+/* Fall back to bounce buffer if write zeroes is unsupported */
+iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE;
+iov.iov_base = qemu_blockalign(bs, iov.iov_len);
+memset(iov.iov_base, 0, iov.iov_len);
+qemu_iovec_init_external(&qiov, &iov, 1);
+
+ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);
+
+qemu_vfree(iov.iov_base);
+return ret;
+}
+
 /*
  * Handle a write request in coroutine context
  */
 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
-int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
+int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
+bool write_zeroes)
 {
 BlockDriver *drv = bs->drv;
 BdrvTrackedRequest req;
@@ -1639,7 +1665,11 @@ static int coroutine_fn 
bdrv_co_do_writev(BlockDriverState *bs,
 
 tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
 
-ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
+if (write_zeroes) {
+ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
+} else {
+ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
+}
 
 if (bs->dirty_bitmap) {
 set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
@@ -1659,7 +1689,15 @@ int coroutine_fn bdrv_co_writev(BlockDriverState *bs, 
int64_t sector_num,
 {
 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
 
-return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov);
+return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, false);
+}
+
+int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
+  int64_t sector_num, int nb_sectors)
+{
+trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
+
+return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, true);
 }
 
 /**
@@ -3143,7 +3181,7 @@ static void coroutine_fn bdrv_co_do_rw(void *opaque)
 acb->req.nb_sectors, acb->req.qiov);
 } else {
 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
-acb->req.nb_sectors, acb->req.qiov);
+acb->req.nb_sectors, acb->req.qiov, false);
 }
 
 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
diff --git a/block.h b/block.h
index 3bd4398..51b90c7 100644
--- a/block.h
+++ b/block.h
@@ -144,6 +144,13 @@ int coroutine_fn bdrv_co_readv(BlockDriverState *bs, 
int64_t sector_num,
 int nb_sectors, QE

[Qemu-devel] [PATCH v3 5/6] qed: add .bdrv_co_write_zeroes() support

2011-12-21 Thread Stefan Hajnoczi
Zero writes are a dedicated interface for writing regions of zeroes into
the image file.  If clusters are not yet allocated it is possible to use
an efficient metadata representation which keeps the image file compact
and does not store individual zero bytes.

Implementing this for the QED image format is fairly straightforward.
The only issue is that when a zero write touches an existing cluster we
have to allocate a bounce buffer and perform a regular write.

Signed-off-by: Stefan Hajnoczi 
---
 block/qed.c |  110 ++
 block/qed.h |1 +
 2 files changed, 103 insertions(+), 8 deletions(-)

diff --git a/block/qed.c b/block/qed.c
index b66dd17..a041d31 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -875,6 +875,12 @@ static void qed_aio_complete(QEDAIOCB *acb, int ret)
 qemu_iovec_destroy(&acb->cur_qiov);
 qed_unref_l2_cache_entry(acb->request.l2_table);
 
+/* Free the buffer we may have allocated for zero writes */
+if (acb->flags & QED_AIOCB_ZERO) {
+qemu_vfree(acb->qiov->iov[0].iov_base);
+acb->qiov->iov[0].iov_base = NULL;
+}
+
 /* Arrange for a bh to invoke the completion function */
 acb->bh_ret = ret;
 acb->bh = qemu_bh_new(qed_aio_complete_bh, acb);
@@ -941,9 +947,8 @@ static void qed_aio_write_l1_update(void *opaque, int ret)
 /**
  * Update L2 table with new cluster offsets and write them out
  */
-static void qed_aio_write_l2_update(void *opaque, int ret)
+static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset)
 {
-QEDAIOCB *acb = opaque;
 BDRVQEDState *s = acb_to_s(acb);
 bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1;
 int index;
@@ -959,7 +964,7 @@ static void qed_aio_write_l2_update(void *opaque, int ret)
 
 index = qed_l2_index(s, acb->cur_pos);
 qed_update_l2_table(s, acb->request.l2_table->table, index, 
acb->cur_nclusters,
- acb->cur_cluster);
+ offset);
 
 if (need_alloc) {
 /* Write out the whole new L2 table */
@@ -976,6 +981,12 @@ err:
 qed_aio_complete(acb, ret);
 }
 
+static void qed_aio_write_l2_update_cb(void *opaque, int ret)
+{
+QEDAIOCB *acb = opaque;
+qed_aio_write_l2_update(acb, ret, acb->cur_cluster);
+}
+
 /**
  * Flush new data clusters before updating the L2 table
  *
@@ -990,7 +1001,7 @@ static void qed_aio_write_flush_before_l2_update(void 
*opaque, int ret)
 QEDAIOCB *acb = opaque;
 BDRVQEDState *s = acb_to_s(acb);
 
-if (!bdrv_aio_flush(s->bs->file, qed_aio_write_l2_update, opaque)) {
+if (!bdrv_aio_flush(s->bs->file, qed_aio_write_l2_update_cb, opaque)) {
 qed_aio_complete(acb, -EIO);
 }
 }
@@ -1019,7 +1030,7 @@ static void qed_aio_write_main(void *opaque, int ret)
 if (s->bs->backing_hd) {
 next_fn = qed_aio_write_flush_before_l2_update;
 } else {
-next_fn = qed_aio_write_l2_update;
+next_fn = qed_aio_write_l2_update_cb;
 }
 }
 
@@ -1081,6 +1092,18 @@ static bool qed_should_set_need_check(BDRVQEDState *s)
 return !(s->header.features & QED_F_NEED_CHECK);
 }
 
+static void qed_aio_write_zero_cluster(void *opaque, int ret)
+{
+QEDAIOCB *acb = opaque;
+
+if (ret) {
+qed_aio_complete(acb, ret);
+return;
+}
+
+qed_aio_write_l2_update(acb, 0, 1);
+}
+
 /**
  * Write new data cluster
  *
@@ -1092,6 +1115,7 @@ static bool qed_should_set_need_check(BDRVQEDState *s)
 static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
 {
 BDRVQEDState *s = acb_to_s(acb);
+BlockDriverCompletionFunc *cb;
 
 /* Cancel timer when the first allocating request comes in */
 if (QSIMPLEQ_EMPTY(&s->allocating_write_reqs)) {
@@ -1109,14 +1133,26 @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t 
len)
 
 acb->cur_nclusters = qed_bytes_to_clusters(s,
 qed_offset_into_cluster(s, acb->cur_pos) + len);
-acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters);
 qemu_iovec_copy(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
 
+if (acb->flags & QED_AIOCB_ZERO) {
+/* Skip ahead if the clusters are already zero */
+if (acb->find_cluster_ret == QED_CLUSTER_ZERO) {
+qed_aio_next_io(acb, 0);
+return;
+}
+
+cb = qed_aio_write_zero_cluster;
+} else {
+cb = qed_aio_write_prefill;
+acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters);
+}
+
 if (qed_should_set_need_check(s)) {
 s->header.features |= QED_F_NEED_CHECK;
-qed_write_header(s, qed_aio_write_prefill, acb);
+qed_write_header(s, cb, acb);
 } else {
-qed_aio_write_prefill(acb, 0);
+cb(acb, 0);
 }
 }
 
@@ -1131,6 +1167,16 @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t 
len)
  */
 static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
 {
+/* Allocate buffer for zero writes

[Qemu-devel] [PATCH v3 6/6] qemu-io: add write -z option for bdrv_co_write_zeroes

2011-12-21 Thread Stefan Hajnoczi
Extend the qemu-io write command with the -z option to call
bdrv_co_write_zeroes().  Exposing the zero write interface from qemu-io
allows us to write tests that exercise this new block layer interface.

Signed-off-by: Stefan Hajnoczi 
---
 qemu-io.c |   77 ++--
 1 files changed, 69 insertions(+), 8 deletions(-)

diff --git a/qemu-io.c b/qemu-io.c
index ffa62fb..48e3393 100644
--- a/qemu-io.c
+++ b/qemu-io.c
@@ -218,6 +218,51 @@ static int do_pwrite(char *buf, int64_t offset, int count, 
int *total)
 return 1;
 }
 
+typedef struct {
+int64_t offset;
+int count;
+int *total;
+int ret;
+bool done;
+} CoWriteZeroes;
+
+static void coroutine_fn co_write_zeroes_entry(void *opaque)
+{
+CoWriteZeroes *data = opaque;
+
+data->ret = bdrv_co_write_zeroes(bs, data->offset / BDRV_SECTOR_SIZE,
+ data->count / BDRV_SECTOR_SIZE);
+data->done = true;
+if (data->ret < 0) {
+*data->total = data->ret;
+return;
+}
+
+*data->total = data->count;
+}
+
+static int do_co_write_zeroes(int64_t offset, int count, int *total)
+{
+Coroutine *co;
+CoWriteZeroes data = {
+.offset = offset,
+.count  = count,
+.total  = total,
+.done   = false,
+};
+
+co = qemu_coroutine_create(co_write_zeroes_entry);
+qemu_coroutine_enter(co, &data);
+while (!data.done) {
+qemu_aio_wait();
+}
+if (data.ret < 0) {
+return data.ret;
+} else {
+return 1;
+}
+}
+
 static int do_load_vmstate(char *buf, int64_t offset, int count, int *total)
 {
 *total = bdrv_load_vmstate(bs, (uint8_t *)buf, offset, count);
@@ -643,6 +688,7 @@ static void write_help(void)
 " -P, -- use different pattern to fill file\n"
 " -C, -- report statistics in a machine parsable format\n"
 " -q, -- quiet mode, do not show I/O statistics\n"
+" -z, -- write zeroes using bdrv_co_write_zeroes\n"
 "\n");
 }
 
@@ -654,7 +700,7 @@ static const cmdinfo_t write_cmd = {
 .cfunc  = write_f,
 .argmin = 2,
 .argmax = -1,
-.args   = "[-abCpq] [-P pattern ] off len",
+.args   = "[-bCpqz] [-P pattern ] off len",
 .oneline= "writes a number of bytes at a specified offset",
 .help   = write_help,
 };
@@ -662,16 +708,16 @@ static const cmdinfo_t write_cmd = {
 static int write_f(int argc, char **argv)
 {
 struct timeval t1, t2;
-int Cflag = 0, pflag = 0, qflag = 0, bflag = 0;
+int Cflag = 0, pflag = 0, qflag = 0, bflag = 0, Pflag = 0, zflag = 0;
 int c, cnt;
-char *buf;
+char *buf = NULL;
 int64_t offset;
 int count;
 /* Some compilers get confused and warn if this is not initialized.  */
 int total = 0;
 int pattern = 0xcd;
 
-while ((c = getopt(argc, argv, "bCpP:q")) != EOF) {
+while ((c = getopt(argc, argv, "bCpP:qz")) != EOF) {
 switch (c) {
 case 'b':
 bflag = 1;
@@ -683,6 +729,7 @@ static int write_f(int argc, char **argv)
 pflag = 1;
 break;
 case 'P':
+Pflag = 1;
 pattern = parse_pattern(optarg);
 if (pattern < 0) {
 return 0;
@@ -691,6 +738,9 @@ static int write_f(int argc, char **argv)
 case 'q':
 qflag = 1;
 break;
+case 'z':
+zflag = 1;
+break;
 default:
 return command_usage(&write_cmd);
 }
@@ -700,8 +750,13 @@ static int write_f(int argc, char **argv)
 return command_usage(&write_cmd);
 }
 
-if (bflag && pflag) {
-printf("-b and -p cannot be specified at the same time\n");
+if (bflag + pflag + zflag > 1) {
+printf("-b, -p, or -z cannot be specified at the same time\n");
+return 0;
+}
+
+if (zflag && Pflag) {
+printf("-z and -P cannot be specified at the same time\n");
 return 0;
 }
 
@@ -732,13 +787,17 @@ static int write_f(int argc, char **argv)
 }
 }
 
-buf = qemu_io_alloc(count, pattern);
+if (!zflag) {
+buf = qemu_io_alloc(count, pattern);
+}
 
 gettimeofday(&t1, NULL);
 if (pflag) {
 cnt = do_pwrite(buf, offset, count, &total);
 } else if (bflag) {
 cnt = do_save_vmstate(buf, offset, count, &total);
+} else if (zflag) {
+cnt = do_co_write_zeroes(offset, count, &total);
 } else {
 cnt = do_write(buf, offset, count, &total);
 }
@@ -758,7 +817,9 @@ static int write_f(int argc, char **argv)
 print_report("wrote", &t2, offset, count, total, cnt, Cflag);
 
 out:
-qemu_io_free(buf);
+if (!zflag) {
+qemu_io_free(buf);
+}
 
 return 0;
 }
-- 
1.7.7.3




Re: [Qemu-devel] [PATCH v3 0/6] block: zero writes

2011-12-21 Thread Stefan Hajnoczi
On Wed, Dec 21, 2011 at 4:00 PM, Stefan Hajnoczi
 wrote:
> This series adds an interface for efficient writes when data contains all
> zeros.  It also takes advantage of this new interface by extending the
> copy-on-read feature to perform zero-detection.
>
> The details of efficient zero representations depend on the image format.  
> This
> series includes a patch for the QED image format to write special "zero
> clusters" that keep the image file compact.  In the future qcow2v3 could also
> support an efficient zero representation.
>
> The new BlockDriver interface is called .bdrv_co_write_zeroes() and is
> optional.  If the interface is not implemented by a BlockDriver then a regular
> .bdrv_co_writev() operation will be performed.  The public interface is called
> bdrv_co_write_zeroes() and can be tested via the new qemu-io write -z option.
>
> Copy-on-read is extended to detect zeroes and invoke the
> .bdrv_co_write_zeroes() interface when possible.  As a result we avoid 
> bloating
> the image file if the backing file contains zeroes.
>
> My motivation for this feature is efficient image streaming.  The destination
> file must stay compact even when copying zeroes from the source file.
>
> We now only do zero detection for copy-on-read requests, whereas previous
> revisions of this patch series scanned all write requests for zeroes.  The old
> behavior wasted CPU cycles in most cases but we could add a feature to
> explicitly scan guest writes for zeroes in the future, if desired.
>
> v2:

Typo:
s/v2/v3/

Stefan



Re: [Qemu-devel] [PATCH v2 01/10] qapi: add Visitor interfaces for uint*_t and int*_t

2011-12-21 Thread Anthony Liguori

On 12/21/2011 09:39 AM, Paolo Bonzini wrote:

On 12/21/2011 03:45 PM, Anthony Liguori wrote:

On 12/21/2011 06:35 AM, Paolo Bonzini wrote:

On 12/20/2011 09:56 PM, Anthony Liguori wrote:

As always, you can implement that in many ways. However, I think the
point of using Visitors is not to remove QEMUFile.


Yes, it is.


The point of using Visitors is to provide a standard representation of
device
state. QEMUFile is but one consumer of that representation, along with
any other
migration filter.


Can you do a quick code mock up of how you'd expect this to work?


void
vmstate_get(Device *obj, Visitor *v, void *opaque, const char *name)
{
/* Use the VMStateDescription in opaque to add fields to Visitor
from the struct. */
}

void
vmstate_set(Device *obj, Visitor *v, void *opaque, const char *name)
{
/* Use the VMStateDescription in opaque to retrieve fields from
Visitor and store them in the struct. */
}

void
vmstate_load_state(Device *obj, Visitor *v, QEMUFile )
{
VMStateDescription *vmsd = ???; /* something from the DeviceClass */

/* Use the VMStateDescription in opaque to retrieve fields from
Visitor and store them in the struct. This is basically the
VMState interpreter currently in savevm.c, only fetching fields
from v rather than the file. */
}

void
vmstate_save_state(Device *obj, Visitor *v, QEMUFile *out)
{
VMStateDescription *vmsd = ???; /* something from the DeviceClass */

/* Use the VMStateDescription in opaque to fetch fields from
Visitor and store them in the file. This is basically the
other VMState interpreter currently in savevm.c, but fetching
fields from v rather than the struct. */
}

void
qdev_add_vmstate(Device *obj, VMStateDescription *vmsd)
{
qdev_add_property(obj, "vmstate", vmstate_get, vmstate_set, vmsd);
qdev_add_interface(obj, "QEMUFileSerializable", vmstate_load_state,
vmstate_save_state);


Ok, I get what you're suggesting.  You would like to continue to keep 
VMStateDescription describing both the QEMUFile format and the fields.


I'm suggesting something fundamentally different.  What I see us doing is 
breaking VMStateDescription apart into two different things.  One would be a 
pure description of current fields.  The other one would be the description of 
the old protocol.


The later would be fed to a migration filter and the former would live off of 
DeviceState.


By doing Mike's series, we can do this incrementally by splitting the 
description up, and invoking the filter post_load/pre_save.


One of the reasons for this split up is because I would like to generate the 
first table by IDL and make the second table not dependent on structure members 
(so we don't end up with all the hacks we have with dummy struct fields).


Regards,

Anthony Liguori


}


savevm.c:

Visitor *v = qmp_output_visitor_new();
qdev_property_get(obj, "vmstate", v);
QObject *qobj = qmp_visitor_get_obj(v);
qmp_visitor_free(v);

Visitor *v = qmp_input_visitor_new(qobj);
QEMUFileSerializable *s = QEMU_FILE_SERIALIZABLE(obj);
s->save_state(obj, v, outfile);
qmp_visitor_free(v);

...

Visitor *v = qmp_output_visitor_new();
QEMUFileSerializable *s = QEMU_FILE_SERIALIZABLE(obj);
s->load_state(obj, v, outfile);
QObject *qobj = qmp_visitor_get_obj(v);
qmp_visitor_free(v);

Visitor *v = qmp_input_visitor_new(qobj);
QEMUFileSerializable *s = QEMU_FILE_SERIALIZABLE(obj);
qdev_property_set(obj, "vmstate", v);
qmp_visitor_free(v);

-

Yes, this makes QEMUFile serialization special. But that's because it's legacy,
and it needs to do strange things and store things beyond the contents of the
vmstate.

Take for example "unused" fields. In Mike's implementation, if you try to pass a
QMPOutputVisitor to save_state you might get a "unused" entry that is an array
of zeros. I have no idea what happens if a single VMStateDescription has more
than one VMSTATE_UNUSED field. QEMUFileOutputVisitor completely ignores the
field names, so it probably works but would break with QMPOutputVisitor.

Other serialization backends should not need any hooks in the devices beyond
save_state and load_state.

Paolo






Re: [Qemu-devel] [PATCH 3/9] arm: add missing v7 cp15 registers

2011-12-21 Thread Mark Langsdorf
On 12/20/2011 01:48 PM, Peter Maydell wrote:
> On 20 December 2011 19:10, Mark Langsdorf  wrote:
>> diff --git a/target-arm/helper.c b/target-arm/helper.c
>> index 816c4c4..37110bc 100644
>> --- a/target-arm/helper.c
>> +++ b/target-arm/helper.c
>> @@ -2197,6 +2197,13 @@ uint32_t HELPER(get_cp15)(CPUState *env, uint32_t
>> insn)
>>  * 0x200 << ($rn & 0xfff), when MMU is off.  */
>> goto bad_reg;
>> }
>> +if (ARM_CPUID(env) == ARM_CPUID_CORTEXA9) {
>> +switch (crm) {
>> +case 0:
>> +return env->cp15.c15_scubase;
>> +}
>> +goto bad_reg;
>> +}
> 
> This is underdecoded: the A9 has two registers in c15,c0:
> CRn Op1 CRm Op2 Name
> 15  0   c0  0   Power Control Register
> 15  4   c0  0   Configuration Base Address
> 
> I'm guessing you're after the Configuration Base Address register.
> (please call the struct name something vaguely relating to the
> official register name, incidentally.)

Apparently it's called scubase in the Linux code, but I'll
fix the name for qemu.

>> return 0;
>> }
>>  bad_reg:
> 
> This commit leaves the register with a reset value of 0, which
> isn't right (we only implement A9MP, not A9UP, so the reset value
> should be settable by the board at init time somehow depending
> where the a9mpcore_priv device is mapped. Not sure what the
> cleanest way to do that is.)

I can add a DEFINE_PROP_UINT32 to a9mpcore_priv, which would give
anyone who wants to set the property an easy way to do so. I'm
not sure how to hook the a9mpcore_priv to the CPUARMState, though.
They don't appear to reference each other.

--Mark Langsdorf
Calxeda, Inc.



Re: [Qemu-devel] [PATCH v3 1/6] cutils: extract buffer_is_zero() from qemu-img.c

2011-12-21 Thread Eric Blake
On 12/21/2011 09:00 AM, Stefan Hajnoczi wrote:
> The qemu-img.c:is_not_zero() function checks if a buffer contains all
> zeroes.  This function will come in handy for zero-detection in the
> block layer, so clean it up and move it to cutils.c.
> 
> Note that the function now returns true if the buffer is all zeroes.
> This avoids the double-negatives (i.e. !is_not_zero()) that the old
> function can cause in callers.

Are there plans to improve the efficiency of buffer_is_zero to take
advantage of metadata about sparseness?

That is, there are cases where we can use metadata to prove a region of
a file is sparse, without having to read every byte within that region.
 Now that this series is giving QED special metadata that marks a zero
cluster, it is faster to query if that metadata exists denoting a zero
cluster than it is to read the entire cluster and check for non-zero.
Likewise, with regular files, the kernel provides lseek(SEEK_HOLE) (or
the older, lower-level, ioctl(FS_IOC_FIEMAP)); which at least GNU
coreutils is using for efficient sparse detection in source files.

-- 
Eric Blake   ebl...@redhat.com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-devel] [PATCH v3 2/6] block: add .bdrv_co_write_zeroes() interface

2011-12-21 Thread Christoph Hellwig
On Wed, Dec 21, 2011 at 04:00:36PM +, Stefan Hajnoczi wrote:
> The ability to zero regions of an image file is a useful primitive for
> higher-level features such as image streaming or zero write detection.
> 
> Image formats may support an optimized metadata representation instead
> of writing zeroes into the image file.  This allows zero writes to be
> potentially faster than regular write operations and also preserve
> sparseness of the image file.
> 
> The .bdrv_co_write_zeroes() interface should be implemented by block
> drivers that wish to provide efficient zeroing.
> 
> Note that this operation is different from the discard operation, which
> may leave the contents of the region indeterminate.  That means
> discarded blocks are not guaranteed to contain zeroes and may contain
> junk data instead.

Most real life discard operations zero the data, and both the ATA and SCSI
spec allow the device to set a bit which gurantees this behaviour.  I think
we also should make these one interface, and if the caller needs it to
actually zero out the discarded blocks it should check if the discard
implementation guarantees that.




Re: [Qemu-devel] [PATCH v2 01/10] qapi: add Visitor interfaces for uint*_t and int*_t

2011-12-21 Thread Paolo Bonzini

On 12/21/2011 05:24 PM, Anthony Liguori wrote:

Ok, I get what you're suggesting.  You would like to continue to keep
VMStateDescription describing both the QEMUFile format and the fields.


I don't "like" to do that, but yes, that's what I'm suggesting. :)

You envision having the front-end (state->introspectable state 
representation) and back-end (state representation->serialization) 
completely decoupled in the future, with migration filters in the middle 
if necessary.


I actually agree this is as the final direction.  For this reason, the 
first step should be to decouple the front-end and backend and actually 
have this introspectable state representation.  Then you can also break 
VMStateDescription apart into a front-end and backend part.


Mike's patches change the way you write to QEMUFile, so the new code 
_does_ go in the right direction.  However, they fail to provide an 
introspectable, backend-independent state representation.  Let's put it 
in a few diagrams:


where we are now:struct ---> QEMUFile
 ^
 |
VMStateDescription


what Mike's patches do:  struct ---> QEMUFileVisitor ---> QEMUFile
 ^
 |
VMStateDescription


what I proposed: struct ---> QMPOutputVisitor ---> QObject
 ^
 |
VMStateDescription

 QObject ---> QEMUFile
  ^
  |
   VMStateDescription

where you want to go:struct ---> QMPOutputVisitor ---> QObject
 ^
 |
DeviceStateDescription

QObject ---> QEMUFileOutputVisitor ---> QEMUFile
 ^
 |
  VMStateDescription

As Mike's patches stand, I'm worried that they would make further steps 
harder rather than easier.  This is because I'm not convinced that the 
QEMUFileOutputVisitor is actually useful.


However, the new code from Mike's patches is very close to the 
backend-independent representation from the VMStateDescription.  So, 
Mike's patches could be morphed into this pretty easily:


struct ---> QMPOutputVisitor ---> QObject
^
|
 VMStateDescription

struct ---> QEMUFile \
^ \ that's savevm.c,
| / unchanged
   VMStateDescription/

This is an intermediate state that lets us do the next steps:

- serialize to QEMUFile from a QObject;

- reintroduce Mike's QEMUFileOutputVisitor [I don't really see the 
benefit of this];


- split the DeviceStateDescription and the VMStateDescription;

None of these three steps are a prerequisite for introducing a new 
migration format.



One of the reasons for this split up is because I would like to generate
the first table by IDL and make the second table not dependent on
structure members (so we don't end up with all the hacks we have with
dummy struct fields).


That would be a few years away.  Let's reason in incremental steps.

Paolo



Re: [Qemu-devel] [PATCH 00/17] Support mismatched host and guest logical block sizes

2011-12-21 Thread Christoph Hellwig
On Wed, Dec 14, 2011 at 01:40:22PM +0100, Paolo Bonzini wrote:
> If the partitions are aligned, the OS will always issue aligned requests, 
> because file system blocks are already 4k.

It won't nessecarily.  For example XFS will do a fair amount of sub-blocksize
I/O for metadata and the log.  Note that if you tell XFS that the
minimal_io_size is 4k it will at least align the log write to it, which
is what really matters.




Re: [Qemu-devel] [PATCH 00/17] Support mismatched host and guest logical block sizes

2011-12-21 Thread Paolo Bonzini

On 12/21/2011 05:55 PM, Christoph Hellwig wrote:

On Wed, Dec 14, 2011 at 01:40:22PM +0100, Paolo Bonzini wrote:

If the partitions are aligned, the OS will always issue aligned requests,
because file system blocks are already 4k.


It won't nessecarily.  For example XFS will do a fair amount of sub-blocksize
I/O for metadata and the log.  Note that if you tell XFS that the
minimal_io_size is 4k it will at least align the log write to it, which
is what really matters.


The physical_block_size doesn't matter?

Paolo




Re: [Qemu-devel] [RFC] Device isolation infrastructure v2

2011-12-21 Thread Joerg Roedel
On Wed, Dec 21, 2011 at 02:32:35PM +1100, David Gibson wrote:
> On Mon, Dec 19, 2011 at 04:41:56PM +0100, Joerg Roedel wrote:
> > For 1) I think the current solution with the iommu_group file is fine.
> > It is somewhat expensive for user-space to figure out the per-group
> > device-sets, but that is a one-time effort so it doesn't really matter.
> > Probably we can rename 'iommu_group' to 'isolation_group' or
> > something.
> 
> Hrm.  Alex's group code also provides no in-kernel way to enumerate a
> group, short of walking every device in the system.  And it provides
> no way to attach information to a group.  It just seems foolish to me
> to have this concept without some kind of in-kernel handle on it, and
> if you're managing the in-kernel representation you might as well
> expose it to userspace there as well.

I agree that we should have an in-kernel handle for groups in the
future. This would be generic code the iommu-drivers can use to manage
their groups and where the pci-quirks can chime in. But we don't need
that immediatly, lets add this incrementally.
>From the implementation side I would prefer to introduce a dev->iommu
pointer (a generalization of the dev->archdata->iommu pointer we have on
x86, ia64 and arm already) and link this to the handle of the group.
This makes it also easy to walk all devices in a group within the
kernel.
I still disagree that we need to expose this to user-space in any other
way then what we currently have. It is just not worth the effort when it
is used that rarely. It is perfectly fine if user-space has to scan the
sysfs device tree to build its own group data structures.

> > Regarding 2), I think providing user-space a way to unbind groups of
> > devices from their drivers is a horrible idea.
> 
> Well, I'm not wed to unbinding all the drivers at once.  But what I
> *do* think is essential is that we can atomically switch off automatic
> driver matching for the whole group.  Without that nothing really
> stops a driver reattaching to the things you've unbound, so you end up
> bailing a leakey boat.

Okay, makes sense for hotadd to have this. What we need from the
driver-core here is a method to disable driver probing for a device. The
iommu-layer can then call this method on the DEVICE_ADD notifier if
required. This also can be done incrementally to the current VFIO
implementation.

> Ok, that's not the usage model I had in mind.  What I'm thinking here
> is that the admin removes groups that will be used in guests from the
> host kernel (probably via boot-time scripts).  The guests will pick
> them up later, so that when a guest quits then restarts, we don't have
> devices appearing transiently in the host.

This 'remove-groups-from-the-host' thing can easily be done (also at
boot time) by just binding the devices in question to the VFIO driver.
And the devices can stay with VFIO for the lifetime of the system (or
until the host want to make use of it). I don't think we need any
additional logic for that.

> > For the remaining two questions I think the concept of a default-domain
> > is helpful.  The default-domain is a per-group domain which is created
> > by the iommu-driver at initialization time. It is the domain each device
> > is assigned to when it is not assigned to any other domain (which means
> > that each device/group is always attached to a domain). The default
> > domain will be used by the DMA-API layer. This implicitly means, that a
> > device which is not in the default-domain can't be used with the
> > dma-api. The dma_supported() function will return false for those
> > devices.
> 
> But.. by definition every device in the group must belong to the same
> domain.  So how is this "default domain" in any way different from
> "current domain".

The default domain is created by the iommu-drivers for all groups it
finds on the system and the devices are automatically assigned to it at
iommu-driver initialization time. This domain will be used for mapping
dma-api calls.
The "current domain" on the other side can also be a domain created by
VFIO (or KVM as it is today) for mapping the device dma space into a
guest.

> The domain_{attach,detach} functions absolutely should be group based
> not device based.  That's what it means to be a group.

I disagree with that. This would force all iommu-drivers that have no
group-concept (basically all current ARM iommu implementations) to work
on groups too. That makes the iommu-api much harder to use.

We could do instead:

1) Force users of the iommu-api to bind each device in the group
   individually.
2) Attach all devices in the group automatically to the same
   domain if a single device is attached to it.

I have no strong opinion which way we go, but have a slight favour for
way 2. To make it harder to mis-use we can force that way 2 only works
if the group is in its default-domain and there are no pending
dma-allocations.

> > Question 4) is also solved with the defaul

[Qemu-devel] [Bug 883136] Re: qemu on ARM hosts aborts on startup because makecontext() always fails

2011-12-21 Thread Dr. David Alan Gilbert
Attached is a 1st cut of a makecontext/setcontext/getcontext/swapcontext set 
for ARM - not ready to go into libc yet;
currently it builds standalone and links (and passes) with a test of mine.  
Next stop libc and a lot of cleanup.


** Changed in: qemu-linaro
 Assignee: (unassigned) => Dr. David Alan Gilbert (davidgil-uk)

** Attachment added: "Start of a set of context routines"
   
https://bugs.launchpad.net/qemu-linaro/+bug/883136/+attachment/2642884/+files/allcontext.S

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/883136

Title:
  qemu on ARM hosts aborts on startup because makecontext() always fails

Status in QEMU:
  New
Status in Linaro QEMU:
  New

Bug description:
  qemu has recently grown a coroutines implementation. There are two
  versions, one using the makecontext/setcontext/swapcontext functions
  from ucontext.h, and one falling back to implementing coroutines as
  separate glib threads. configure chooses the former if the platform
  has a makecontext().

  Unfortunately ARM eglibc provides a makecontext() which always fails
  ENOSYS, which means the configure check passes but when qemu starts it
  abort()s.

  The best fix for this is probably going to involve making the
  coroutine implementation runtime-selectable.

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/883136/+subscriptions



Re: [Qemu-devel] [PATCH 00/25] nbd asynchronous operation

2011-12-21 Thread Paolo Bonzini

On 12/15/2011 12:09 PM, Kevin Wolf wrote:

>  This needs rebasing due to commit 3a93113 (fix typo: delete redundant
>  semicolon, 2011-11-29).  To avoid further spamming, I placed the whole
>  thing at git://github.com/bonzini/qemu.git in branch nbd-server.

I won't get to this (and virtio-scsi) before I go on vacation next week,
and probably for some time after it. Feel free to send a pull request to
Anthony when you think the series have received enough review and are
tested well enough.

Maybe Nick is interested in reviewing this series, CCed him.


Anthony, I think this is tested well enough (by me :) but it hasn't 
received review and likely won't.  If that's fine with you, it's pushed 
at git://github.com/bonzini/qemu.git in branch nbd-server.


The sheepdog bits are quite old and have been reviewed by Kazutaka.

Paolo



Re: [Qemu-devel] [PATCH 00/25] nbd asynchronous operation

2011-12-21 Thread Anthony Liguori

On 12/21/2011 12:11 PM, Paolo Bonzini wrote:

On 12/15/2011 12:09 PM, Kevin Wolf wrote:

> This needs rebasing due to commit 3a93113 (fix typo: delete redundant
> semicolon, 2011-11-29). To avoid further spamming, I placed the whole
> thing at git://github.com/bonzini/qemu.git in branch nbd-server.

I won't get to this (and virtio-scsi) before I go on vacation next week,
and probably for some time after it. Feel free to send a pull request to
Anthony when you think the series have received enough review and are
tested well enough.

Maybe Nick is interested in reviewing this series, CCed him.


Anthony, I think this is tested well enough (by me :) but it hasn't received
review and likely won't.


Does that mean we need a:

Network Block Device
M: Paolo Bonzini 
S: Maintained
F: block/block-nbd.c
F: nbd.*
F: qemu-nbd.c

In MAINTAINERS?  Please send a patch if you're willing.

If that's fine with you, it's pushed at

git://github.com/bonzini/qemu.git in branch nbd-server.


Please send a proper pull request.

Regards,

Anthony Liguori



The sheepdog bits are quite old and have been reviewed by Kazutaka.

Paolo





Re: [Qemu-devel] [PATCH v4 04/11] ARM: exynos4210: IRQ subsystem support.

2011-12-21 Thread Peter Maydell
On 21 December 2011 15:08, Evgeny Voevodin  wrote:
> On 12/21/2011 05:50 PM, Peter Maydell wrote:
>> arm_gic.c exposes the CPU and distributor interfaces as their own
>> memory regions now -- you shouldn't need any of this intermediate
>> layer of functions.

> These functions are not actually for splitting CPU and Distributer
> interfaces.
> In our board we have two GICs - internal and external. Internal GIC is
> completely
> matching arm_gic.c.
>
> Internal GIC CPU[n] and Distributer[n] interfaces are at 0x100 and 0x1000
> offsets from
> 0x1050 base.
>
> But external GIC is different.
> It's CPU[0] interface is at 0x0 offset from 0x1048 base
> and
>      CPU[1] interface is at 0x8000 offset from 0x1048 base
>
> It's Distributer[0] interface is at 0x0 offset from 0x1049 base
> and
>      Distributer[1] interface is at 0x8000 offset from 0x1049 base
>
> [n] - is corresponding to SMP CPU Core.
>
> So, we need these wrapper functions for External GIC.

I don't understand this reasoning. If there are two GICs then
you should just instantiate two GIC devices and map and/or alias
their memory regions at the right addresses. The reason why
the distributor and CPU interfaces are exposed as multiple
memory regions is exactly so you can put them at different
offsets for different boards/CPUs. If arm_gic doesn't
provide suitably split up memory regions then it should be
fixed to do so.

-- PMM



Re: [Qemu-devel] [PATCH 3/9] arm: add missing v7 cp15 registers

2011-12-21 Thread Peter Maydell
On 21 December 2011 16:37, Mark Langsdorf  wrote:
> On 12/20/2011 01:48 PM, Peter Maydell wrote:
>> This commit leaves the register with a reset value of 0, which
>> isn't right (we only implement A9MP, not A9UP, so the reset value
>> should be settable by the board at init time somehow depending
>> where the a9mpcore_priv device is mapped. Not sure what the
>> cleanest way to do that is.)
>
> I can add a DEFINE_PROP_UINT32 to a9mpcore_priv, which would give
> anyone who wants to set the property an easy way to do so. I'm
> not sure how to hook the a9mpcore_priv to the CPUARMState, though.
> They don't appear to reference each other.

Yes, we don't really model the private peripherals very sensibly:
they should be an integral part of the CPU but for historical
reasons they've been done as a totally separate device. I don't
think a property is the right thing, though -- ideally the a9mpcore
device should be able to find out for itself where it was mapped.

Avi: is there a way for a device (sysbus device in this case) to
find out for one of its memory regions where it has been mapped
in the address space? The context here is that the Cortex-A9
has a cp15 register whose value is "base address of the private
peripherals", and it would be nice not to have to have boards
saying both "map mmio region at X" and "set property so register
reads as X"... [You could argue that hardware implementations
have to do the equivalent of both of these things separately,
I suppose, but it's still not very pretty.]

thanks
-- PMM



[Qemu-devel] [PATCH 0/5] VFIO core framework

2011-12-21 Thread Alex Williamson
This series includes the core framework for the VFIO driver.
VFIO is a userspace driver interface meant to replace both the
KVM device assignment code as well as interfaces like UIO.  Please
see patch 1/5 for a complete description of VFIO, what it can do,
and how it's designed.

This version and the VFIO PCI bus driver, for exposing PCI devices
through VFIO, can be found here:

git://github.com/awilliam/linux-vfio.git vfio-next-20111221

A development version of qemu which includes a full working
vfio-pci driver, indepdendent of KVM support, can be found here:

git://github.com/awilliam/qemu-vfio.git vfio-ng

Thanks,

Alex

PS - I'll be mostly unavailable over the holidays, but wanted to get
this out for review and comparison to the isolation APIs being proposed.

---

Alex Williamson (5):
  vfio: VFIO core Kconfig and Makefile
  vfio: VFIO core IOMMU mapping support
  vfio: VFIO core group interface
  vfio: VFIO core header
  vfio: Introduce documentation for VFIO driver


 Documentation/ioctl/ioctl-number.txt |1 
 Documentation/vfio.txt   |  352 ++
 MAINTAINERS  |8 
 drivers/Kconfig  |2 
 drivers/Makefile |1 
 drivers/vfio/Kconfig |8 
 drivers/vfio/Makefile|3 
 drivers/vfio/vfio_iommu.c|  593 +
 drivers/vfio/vfio_main.c | 1201 ++
 drivers/vfio/vfio_private.h  |   36 +
 include/linux/vfio.h |  353 ++
 11 files changed, 2558 insertions(+), 0 deletions(-)
 create mode 100644 Documentation/vfio.txt
 create mode 100644 drivers/vfio/Kconfig
 create mode 100644 drivers/vfio/Makefile
 create mode 100644 drivers/vfio/vfio_iommu.c
 create mode 100644 drivers/vfio/vfio_main.c
 create mode 100644 drivers/vfio/vfio_private.h
 create mode 100644 include/linux/vfio.h



[Qemu-devel] [PATCH 4/5] vfio: VFIO core IOMMU mapping support

2011-12-21 Thread Alex Williamson
Backing for operations on the IOMMU object, including DMA
mapping and unmapping.

Signed-off-by: Alex Williamson 
---

 drivers/vfio/vfio_iommu.c |  593 +
 1 files changed, 593 insertions(+), 0 deletions(-)
 create mode 100644 drivers/vfio/vfio_iommu.c

diff --git a/drivers/vfio/vfio_iommu.c b/drivers/vfio/vfio_iommu.c
new file mode 100644
index 000..b6644ec
--- /dev/null
+++ b/drivers/vfio/vfio_iommu.c
@@ -0,0 +1,593 @@
+/*
+ * VFIO: IOMMU DMA mapping support
+ *
+ * Copyright (C) 2011 Red Hat, Inc.  All rights reserved.
+ * Author: Alex Williamson 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Derived from original vfio:
+ * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
+ * Author: Tom Lyon, p...@cisco.com
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "vfio_private.h"
+
+struct vfio_dma_map_entry {
+   struct list_headlist;
+   dma_addr_t  iova;   /* Device address */
+   unsigned long   vaddr;  /* Process virtual addr */
+   longnpage;  /* Number of pages */
+   int prot;   /* IOMMU_READ/WRITE */
+};
+
+/* This code handles mapping and unmapping of user data buffers
+ * into DMA'ble space using the IOMMU */
+
+#define NPAGE_TO_SIZE(npage)   ((size_t)(npage) << PAGE_SHIFT)
+
+struct vwork {
+   struct mm_struct*mm;
+   longnpage;
+   struct work_struct  work;
+};
+
+/* delayed decrement/increment for locked_vm */
+static void vfio_lock_acct_bg(struct work_struct *work)
+{
+   struct vwork *vwork = container_of(work, struct vwork, work);
+   struct mm_struct *mm;
+
+   mm = vwork->mm;
+   down_write(&mm->mmap_sem);
+   mm->locked_vm += vwork->npage;
+   up_write(&mm->mmap_sem);
+   mmput(mm);
+   kfree(vwork);
+}
+
+static void vfio_lock_acct(long npage)
+{
+   struct vwork *vwork;
+   struct mm_struct *mm;
+
+   if (!current->mm)
+   return; /* process exited */
+
+   if (down_write_trylock(¤t->mm->mmap_sem)) {
+   current->mm->locked_vm += npage;
+   up_write(¤t->mm->mmap_sem);
+   return;
+   }
+
+   /* Couldn't get mmap_sem lock, so must setup to update
+* mm->locked_vm later. If locked_vm were atomic, we
+* wouldn't need this silliness */
+   vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
+   if (!vwork)
+   return;
+   mm = get_task_mm(current);
+   if (!mm) {
+   kfree(vwork);
+   return;
+   }
+   INIT_WORK(&vwork->work, vfio_lock_acct_bg);
+   vwork->mm = mm;
+   vwork->npage = npage;
+   schedule_work(&vwork->work);
+}
+
+/* Some mappings aren't backed by a struct page, for example an mmap'd
+ * MMIO range for our own or another device.  These use a different
+ * pfn conversion and shouldn't be tracked as locked pages. */
+static bool is_invalid_reserved_pfn(unsigned long pfn)
+{
+   if (pfn_valid(pfn)) {
+   bool reserved;
+   struct page *tail = pfn_to_page(pfn);
+   struct page *head = compound_trans_head(tail);
+   reserved = !!(PageReserved(head));
+   if (head != tail) {
+   /* "head" is not a dangling pointer
+* (compound_trans_head takes care of that)
+* but the hugepage may have been split
+* from under us (and we may not hold a
+* reference count on the head page so it can
+* be reused before we run PageReferenced), so
+* we've to check PageTail before returning
+* what we just read.  */
+   smp_rmb();
+   if (PageTail(tail))
+   return reserved;
+   }
+   return PageReserved(tail);
+   }
+
+   return true;
+}
+
+static int put_pfn(unsigned long pfn, int prot)
+{
+   if (!is_invalid_reserved_pfn(pfn)) {
+   struct page *page = pfn_to_page(pfn);
+   if (prot & IOMMU_WRITE)
+   SetPageDirty(page);
+   put_page(page);
+   return 1;
+   }
+   return 0;
+}
+
+/* Unmap DMA region */
+static long __vfio_dma_do_unmap(struct vfio_iommu *iommu, dma_addr_t iova,
+long npage, int prot)
+{
+   long i, unlocked = 0;
+
+   for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
+   unsigned long pfn;
+
+   pfn = iommu_iova_to_phys(iommu->domain, iova) >> PAGE_SHIFT

[Qemu-devel] [PATCH 3/5] vfio: VFIO core group interface

2011-12-21 Thread Alex Williamson
This provides the base group management with conduits to the
IOMMU driver and VFIO bus drivers.

Signed-off-by: Alex Williamson 
---

 drivers/vfio/vfio_main.c| 1201 +++
 drivers/vfio/vfio_private.h |   36 +
 2 files changed, 1237 insertions(+), 0 deletions(-)
 create mode 100644 drivers/vfio/vfio_main.c
 create mode 100644 drivers/vfio/vfio_private.h

diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
new file mode 100644
index 000..e4f0f92
--- /dev/null
+++ b/drivers/vfio/vfio_main.c
@@ -0,0 +1,1201 @@
+/*
+ * VFIO framework
+ *
+ * Copyright (C) 2011 Red Hat, Inc.  All rights reserved.
+ * Author: Alex Williamson 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Derived from original vfio:
+ * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
+ * Author: Tom Lyon, p...@cisco.com
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "vfio_private.h"
+
+#define DRIVER_VERSION "0.2"
+#define DRIVER_AUTHOR  "Alex Williamson "
+#define DRIVER_DESC"VFIO - User Level meta-driver"
+
+static struct vfio {
+   dev_t   devt;
+   struct cdev cdev;
+   struct list_headgroup_list;
+   struct mutexlock;
+   struct kref kref;
+   struct class*class;
+   struct idr  idr;
+   wait_queue_head_t   release_q;
+} vfio;
+
+static const struct file_operations vfio_group_fops;
+
+struct vfio_group {
+   dev_t   devt;
+   unsigned intgroupid;
+   struct bus_type *bus;
+   struct vfio_iommu   *iommu;
+   struct list_headdevice_list;
+   struct list_headiommu_next;
+   struct list_headgroup_next;
+   struct device   *dev;
+   struct kobject  *devices_kobj;
+   int refcnt;
+   booltainted;
+};
+
+struct vfio_device {
+   struct device   *dev;
+   const struct vfio_device_ops*ops;
+   struct vfio_group   *group;
+   struct list_headdevice_next;
+   boolattached;
+   booldeleteme;
+   int refcnt;
+   void*device_data;
+};
+
+/*
+ * Helper functions called under vfio.lock
+ */
+
+/* Return true if any devices within a group are opened */
+static bool __vfio_group_devs_inuse(struct vfio_group *group)
+{
+   struct list_head *pos;
+
+   list_for_each(pos, &group->device_list) {
+   struct vfio_device *device;
+
+   device = list_entry(pos, struct vfio_device, device_next);
+   if (device->refcnt)
+   return true;
+   }
+   return false;
+}
+
+/* Return true if any of the groups attached to an iommu are opened.
+ * We can only tear apart merged groups when nothing is left open. */
+static bool __vfio_iommu_groups_inuse(struct vfio_iommu *iommu)
+{
+   struct list_head *pos;
+
+   list_for_each(pos, &iommu->group_list) {
+   struct vfio_group *group;
+
+   group = list_entry(pos, struct vfio_group, iommu_next);
+   if (group->refcnt)
+   return true;
+   }
+   return false;
+}
+
+/* An iommu is "in use" if it has a file descriptor open or if any of
+ * the groups assigned to the iommu have devices open. */
+static bool __vfio_iommu_inuse(struct vfio_iommu *iommu)
+{
+   struct list_head *pos;
+
+   if (iommu->refcnt)
+   return true;
+
+   list_for_each(pos, &iommu->group_list) {
+   struct vfio_group *group;
+
+   group = list_entry(pos, struct vfio_group, iommu_next);
+
+   if (__vfio_group_devs_inuse(group))
+   return true;
+   }
+   return false;
+}
+
+static void __vfio_group_set_iommu(struct vfio_group *group,
+  struct vfio_iommu *iommu)
+{
+   if (group->iommu)
+   list_del(&group->iommu_next);
+   if (iommu)
+   list_add(&group->iommu_next, &iommu->group_list);
+
+   group->iommu = iommu;
+}
+
+static void __vfio_iommu_detach_dev(struct vfio_iommu *iommu,
+   struct vfio_device *device)
+{
+   if (WARN_ON(!iommu->domain && device->attached))
+   return;
+
+   if (!device->attached)
+   return;
+
+   iommu_detach_device(iommu->domain, device->dev);
+   device->attached = false;
+}
+
+static void __vfio_iommu_detach_group(struct vfio_iommu *iommu

[Qemu-devel] [PATCH 5/5] vfio: VFIO core Kconfig and Makefile

2011-12-21 Thread Alex Williamson
Enable the base code.

Signed-off-by: Alex Williamson 
---

 MAINTAINERS   |8 
 drivers/Kconfig   |2 ++
 drivers/Makefile  |1 +
 drivers/vfio/Kconfig  |8 
 drivers/vfio/Makefile |3 +++
 5 files changed, 22 insertions(+), 0 deletions(-)
 create mode 100644 drivers/vfio/Kconfig
 create mode 100644 drivers/vfio/Makefile

diff --git a/MAINTAINERS b/MAINTAINERS
index 9f7b469..b1f7230 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7142,6 +7142,14 @@ S:   Maintained
 F: Documentation/filesystems/vfat.txt
 F: fs/fat/
 
+VFIO DRIVER
+M: Alex Williamson 
+L: k...@vger.kernel.org
+S: Maintained
+F: Documentation/vfio.txt
+F: drivers/vfio/
+F: include/linux/vfio.h
+
 VIDEOBUF2 FRAMEWORK
 M: Pawel Osciak 
 M: Marek Szyprowski 
diff --git a/drivers/Kconfig b/drivers/Kconfig
index d5138e6..f168bf3 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -114,6 +114,8 @@ source "drivers/auxdisplay/Kconfig"
 
 source "drivers/uio/Kconfig"
 
+source "drivers/vfio/Kconfig"
+
 source "drivers/vlynq/Kconfig"
 
 source "drivers/virtio/Kconfig"
diff --git a/drivers/Makefile b/drivers/Makefile
index 4ef810e..f715919 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -59,6 +59,7 @@ obj-$(CONFIG_ATM) += atm/
 obj-$(CONFIG_FUSION)   += message/
 obj-y  += firewire/
 obj-$(CONFIG_UIO)  += uio/
+obj-$(CONFIG_VFIO) += vfio/
 obj-y  += cdrom/
 obj-y  += auxdisplay/
 obj-$(CONFIG_PCCARD)   += pcmcia/
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
new file mode 100644
index 000..9acb1e7
--- /dev/null
+++ b/drivers/vfio/Kconfig
@@ -0,0 +1,8 @@
+menuconfig VFIO
+   tristate "VFIO Non-Privileged userspace driver framework"
+   depends on IOMMU_API
+   help
+ VFIO provides a framework for secure userspace device drivers.
+ See Documentation/vfio.txt for more details.
+
+ If you don't know what to do here, say N.
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
new file mode 100644
index 000..088faf1
--- /dev/null
+++ b/drivers/vfio/Makefile
@@ -0,0 +1,3 @@
+vfio-y := vfio_main.o vfio_iommu.o
+
+obj-$(CONFIG_VFIO) := vfio.o




[Qemu-devel] [PATCH 1/5] vfio: Introduce documentation for VFIO driver

2011-12-21 Thread Alex Williamson
Including rationale for design, example usage and API description.

Signed-off-by: Alex Williamson 
---

 Documentation/vfio.txt |  352 
 1 files changed, 352 insertions(+), 0 deletions(-)
 create mode 100644 Documentation/vfio.txt

diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
new file mode 100644
index 000..09a5a5b
--- /dev/null
+++ b/Documentation/vfio.txt
@@ -0,0 +1,352 @@
+VFIO - "Virtual Function I/O"[1]
+---
+Many modern system now provide DMA and interrupt remapping facilities
+to help ensure I/O devices behave within the boundaries they've been
+allotted.  This includes x86 hardware with AMD-Vi and Intel VT-d,
+POWER systems with Partitionable Endpoints (PEs) and embedded PowerPC
+systems such as Freescale PAMU.  The VFIO driver is an IOMMU/device
+agnostic framework for exposing direct device access to userspace, in
+a secure, IOMMU protected environment.  In other words, this allows
+safe[2], non-privileged, userspace drivers.
+
+Why do we want that?  Virtual machines often make use of direct device
+access ("device assignment") when configured for the highest possible
+I/O performance.  From a device and host perspective, this simply
+turns the VM into a userspace driver, with the benefits of
+significantly reduced latency, higher bandwidth, and direct use of
+bare-metal device drivers[3].
+
+Some applications, particularly in the high performance computing
+field, also benefit from low-overhead, direct device access from
+userspace.  Examples include network adapters (often non-TCP/IP based)
+and compute accelerators.  Prior to VFIO, these drivers had to either
+go through the full development cycle to become proper upstream
+driver, be maintained out of tree, or make use of the UIO framework,
+which has no notion of IOMMU protection, limited interrupt support,
+and requires root privileges to access things like PCI configuration
+space.
+
+The VFIO driver framework intends to unify these, replacing both the
+KVM PCI specific device assignment code as well as provide a more
+secure, more featureful userspace driver environment than UIO.
+
+Groups, Devices, and IOMMUs
+---
+
+Userspace drivers are primarily concerned with manipulating individual
+devices and setting up mappings in the IOMMU for those devices.
+Unfortunately, the IOMMU doesn't always have the granularity to track
+mappings for an individual device.  Sometimes this is a topology
+barrier, such as a PCIe-to-PCI bridge interposing the device and
+IOMMU, other times this is an IOMMU limitation.  In any case, the
+reality is that devices are not always independent with respect to the
+IOMMU.  Translations setup for one device can be used by another
+device in these scenarios.
+
+The IOMMU API exposes these relationships by identifying an "IOMMU
+group" for these dependent devices.  Devices on the same bus with the
+same IOMMU group (or just "group" for this document) are not isolated
+from each other with respect to DMA mappings.  For userspace usage,
+this logically means that instead of being able to grant ownership of
+an individual device, we must grant ownership of a group, which may
+contain one or more devices.
+
+These groups therefore become a fundamental component of VFIO and the
+working unit we use for exposing devices and granting permissions to
+userspace.  In addition, VFIO make efforts to ensure the integrity of
+the group for user access.  This includes ensuring that all devices
+within the group are controlled by VFIO (vs native host drivers)
+before allowing a user to access any member of the group or the IOMMU
+mappings, as well as maintaining the group viability as devices are
+dynamically added or removed from the system.
+
+To access a device through VFIO, a user must open a character device
+for the group that the device belongs to and then issue an ioctl to
+retrieve a file descriptor for the individual device.  This ensures
+that the user has permissions to the group (file based access to the
+/dev entry) and allows a check point at which VFIO can deny access to
+the device if the group is not viable (all devices within the group
+controlled by VFIO).  A file descriptor for the IOMMU is obtain in the
+same fashion.
+
+VFIO defines a standard set of APIs for access to devices and a
+modular interface for adding new, bus-specific VFIO device drivers.
+We call these "VFIO bus drivers".  The vfio-pci module is an example
+of a bus driver for exposing PCI devices.  When the bus driver module
+is loaded it enumerates all of the devices for it's bus, registering
+each device with the vfio core along with a set of callbacks.  For
+buses that support hotplug, the bus driver also adds itself to the
+notification chain for such events.  The callbacks registered with
+each device implement the VFIO device

Re: [Qemu-devel] [PATCH 1/3] virtio-console: Check if chardev backends available before calling into them

2011-12-21 Thread Anthony Liguori

On 12/21/2011 12:58 AM, Amit Shah wrote:

For the callback functions invoked by the virtio-serial-bus code, check
if we have chardev backends registered before we call into the chardev
functions.

Signed-off-by: Amit Shah


Applied all.  Thanks.

Regards,

Anthony Liguori


---
  hw/virtio-console.c |   11 +++
  1 files changed, 11 insertions(+), 0 deletions(-)

diff --git a/hw/virtio-console.c b/hw/virtio-console.c
index d3351c8..dbbea76 100644
--- a/hw/virtio-console.c
+++ b/hw/virtio-console.c
@@ -27,6 +27,11 @@ static ssize_t flush_buf(VirtIOSerialPort *port, const 
uint8_t *buf, size_t len)
  VirtConsole *vcon = DO_UPCAST(VirtConsole, port, port);
  ssize_t ret;

+if (!vcon->chr) {
+/* If there's no backend, we can just say we consumed all data. */
+return len;
+}
+
  ret = qemu_chr_fe_write(vcon->chr, buf, len);
  trace_virtio_console_flush_buf(port->id, len, ret);

@@ -52,6 +57,9 @@ static void guest_open(VirtIOSerialPort *port)
  {
  VirtConsole *vcon = DO_UPCAST(VirtConsole, port, port);

+if (!vcon->chr) {
+return;
+}
  qemu_chr_fe_open(vcon->chr);
  }

@@ -60,6 +68,9 @@ static void guest_close(VirtIOSerialPort *port)
  {
  VirtConsole *vcon = DO_UPCAST(VirtConsole, port, port);

+if (!vcon->chr) {
+return;
+}
  qemu_chr_fe_close(vcon->chr);
  }






[Qemu-devel] [PATCH 2/5] vfio: VFIO core header

2011-12-21 Thread Alex Williamson
This defines both the user and bus driver APIs.

Signed-off-by: Alex Williamson 
---

 Documentation/ioctl/ioctl-number.txt |1 
 include/linux/vfio.h |  353 ++
 2 files changed, 354 insertions(+), 0 deletions(-)
 create mode 100644 include/linux/vfio.h

diff --git a/Documentation/ioctl/ioctl-number.txt 
b/Documentation/ioctl/ioctl-number.txt
index af76fde..69825b0 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -88,6 +88,7 @@ Code  Seq#(hex)   Include FileComments
and kernel/power/user.c
 '8'all SNP8023 advanced NIC card

+';'64-83   linux/vfio.h
 '@'00-0F   linux/radeonfb.hconflict!
 '@'00-0F   drivers/video/aty/aty128fb.cconflict!
 'A'00-1F   linux/apm_bios.hconflict!
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
new file mode 100644
index 000..2769dfb
--- /dev/null
+++ b/include/linux/vfio.h
@@ -0,0 +1,353 @@
+/*
+ * VFIO API definition
+ *
+ * Copyright (C) 2011 Red Hat, Inc.  All rights reserved.
+ * Author: Alex Williamson 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef VFIO_H
+#define VFIO_H
+
+#include 
+
+#ifdef __KERNEL__  /* Internal VFIO-core/bus driver API */
+
+/**
+ * struct vfio_device_ops - VFIO bus driver device callbacks
+ *
+ * @match: Return true if buf describes the device
+ * @claim: Force driver to attach to device
+ * @open: Called when userspace receives file descriptor for device
+ * @release: Called when userspace releases file descriptor for device
+ * @read: Perform read(2) on device file descriptor
+ * @write: Perform write(2) on device file descriptor
+ * @ioctl: Perform ioctl(2) on device file descriptor, supporting VFIO_DEVICE_*
+ * operations documented below
+ * @mmap: Perform mmap(2) on a region of the device file descriptor
+ */
+struct vfio_device_ops {
+   bool(*match)(struct device *dev, const char *buf);
+   int (*claim)(struct device *dev);
+   int (*open)(void *device_data);
+   void(*release)(void *device_data);
+   ssize_t (*read)(void *device_data, char __user *buf,
+   size_t count, loff_t *ppos);
+   ssize_t (*write)(void *device_data, const char __user *buf,
+size_t count, loff_t *size);
+   long(*ioctl)(void *device_data, unsigned int cmd,
+unsigned long arg);
+   int (*mmap)(void *device_data, struct vm_area_struct *vma);
+};
+
+/**
+ * vfio_group_add_dev() - Add a device to the vfio-core
+ *
+ * @dev: Device to add
+ * @ops: VFIO bus driver callbacks for device
+ *
+ * This registration makes the VFIO core aware of the device, creates
+ * groups objects as required and exposes chardevs under /dev/vfio.
+ *
+ * Return 0 on success, errno on failure.
+ */
+extern int vfio_group_add_dev(struct device *dev,
+ const struct vfio_device_ops *ops);
+
+/**
+ * vfio_group_del_dev() - Remove a device from the vfio-core
+ *
+ * @dev: Device to remove
+ *
+ * Remove a device previously added to the VFIO core, removing groups
+ * and chardevs as necessary.
+ */
+extern void vfio_group_del_dev(struct device *dev);
+
+/**
+ * vfio_bind_dev() - Indicate device is bound to the VFIO bus driver and
+ *   register private data structure for ops callbacks.
+ *
+ * @dev: Device being bound
+ * @device_data: VFIO bus driver private data
+ *
+ * This registration indicate that a device previously registered with
+ * vfio_group_add_dev() is now available for use by the VFIO core.  When
+ * all devices within a group are available, the group is viable and my
+ * be used by userspace drivers.  Typically called from VFIO bus driver
+ * probe function.
+ *
+ * Return 0 on success, errno on failure
+ */
+extern int vfio_bind_dev(struct device *dev, void *device_data);
+
+/**
+ * vfio_unbind_dev() - Indicate device is unbinding from VFIO bus driver
+ *
+ * @dev: Device being unbound
+ *
+ * De-registration of the device previously registered with vfio_bind_dev()
+ * from VFIO.  Upon completion, the device is no longer available for use by
+ * the VFIO core.  Typically called from the VFIO bus driver remove function.
+ * The VFIO core will attempt to release the device from users and may take
+ * measures to free the device and/or block as necessary.
+ *
+ * Returns pointer to private device_data structure registered with
+ * vfio_bind_dev().
+ */
+extern void *vfio_unbind_dev(struct device *dev);
+
+
+/**
+ * offsetofend(TYPE, MEMBER)
+ *
+ * @TYPE: The type of the structure
+ * @MEMBER: The member within the structure to get the end offset of
+ *
+ * Simple helper macro f

Re: [Qemu-devel] [PATCH 8/9] Add xgmac ethernet model

2011-12-21 Thread Mark Langsdorf
On 12/20/2011 02:24 PM, Peter Maydell wrote:
> On 20 December 2011 19:15, Mark Langsdorf  wrote:
>> This adds very basic support for xgmac block. Missing things include:
>>
>> - statistics counters
>> - WoL support
>> - rx checksum offload
>> - chained descriptors (only linear descriptor ring)
>> - broadcast and multicast handling
> 
> So, er, what's an xgmac? Any public documentation? Are you planning
> to submit a board model that uses this?

It's a Synopis XG-Mac.

I have code for a board model that uses it, but it uses the pre-
MemoryRegion API including direct calls to
cpu_register_physical_memory() and qemu_ram_alloc(). I can't
submit it until I get it updated. Any suggestions for a good
example of converting a board model from one API to the other?

Thanks,
Mark Langsdorf
Calxeda, Inc.




[Qemu-devel] VMDK: footer must take precedence over header when present

2011-12-21 Thread B Gordon
VMDK: footer must take precedence over header when present

In e.g. streamOptimized VMDKs from VSphere 4 with this flag set the
header l1_table is bogus and only the footer l1_table can be used to
correctly read extents.

Also reverts recent change to VMDK4Header so order of rgd_ and
gd_offset matches the VMDK spec.

https://bugs.launchpad.net/qemu/+bug/907063

Signed-off-by: B.B. Gordon 

diff --git a/block/vmdk.c b/block/vmdk.c
index 5623ac1..77ff9e1 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -35,6 +35,7 @@
 #define VMDK4_FLAG_RGD (1 << 1)
 #define VMDK4_FLAG_COMPRESS (1 << 16)
 #define VMDK4_FLAG_MARKER (1 << 17)
+#define VMDK4_HEADER_AT_END 0xULL
 
 typedef struct {
 uint32_t version;
@@ -57,8 +58,8 @@ typedef struct {
 int64_t desc_offset;
 int64_t desc_size;
 int32_t num_gtes_per_gte;
-int64_t gd_offset;
 int64_t rgd_offset;
+int64_t gd_offset;
 int64_t grain_offset;
 char filler[1];
 char check_bytes[4];
@@ -443,11 +445,27 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
 VMDK4Header header;
 VmdkExtent *extent;
 int64_t l1_backup_offset = 0;
+struct stat sb;
 
 ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header));
 if (ret < 0) {
 return ret;
 }
+if (le64_to_cpu(header.gd_offset) == VMDK4_HEADER_AT_END) {
+uint64_t filesize;
+if (stat(bs->filename, &sb) == -1) {
+fprintf(stderr, "Unable to get file size.\n");
+return -1;
+}
+filesize = (((uint64_t)sb.st_size + (512 - 1)) &
~(uint64_t)(512 - 1));
+ret = bdrv_pread(bs->file,
+ (filesize - 1024 + sizeof(magic)),
+ &header,
+ sizeof(header));
+if (ret != sizeof(header)) {
+return ret;
+}
+}
 if (header.capacity == 0 && header.desc_offset) {
 return vmdk_open_desc_file(bs, flags, header.desc_offset << 9);
 }

-- 
http://www.fastmail.fm - Choose from over 50 domains or use your own




Re: [Qemu-devel] [PATCH] w32: QEMU applications with SDL are always GUI applications

2011-12-21 Thread TeLeMan
--
SUN OF A BEACH



On Mon, Dec 19, 2011 at 16:15, Stefan Weil  wrote:
> Am 19.12.2011 03:12, schrieb TeLeMan:
>
>> On Sat, Dec 17, 2011 at 07:12, Stefan Weil  wrote:
>>>
>>> Am 16.12.2011 04:24, schrieb TeLeMan:
>>>
 On Sun, Dec 4, 2011 at 05:32, Stefan Weil  wrote:
>
>
> Since commit 1d14ffa97eacd3cb722271eaf6f093038396eac4 (in 2005),
> QEMU applications on W32 don't use the default SDL compiler flags:
>
> Instead of a GUI application, a console application is created.
>
> This has disadvantages (there is always an empty console window) and
> no obvious reason, so this patch removes the strange flag modification.
>
> The SDL GUI applications still can be run from a console window
> and even send stdout and stderr to that console by setting environment
> variable SDL_STDIO_REDIRECT=no.


 Did you test it? Windows GUI applications can not send stdout to the
 startup console window unless they create their own console window.
>>>
>>>
>>>
>>> I did, but obviously not good enough:
>>>
>>> in an msys rxvt console the QEMU executables work as I wrote
>>> in the commit message. So msys-rxvt and some other applications
>>> (SciTE for example) allow running GUI applications with
>>> stdio channels.
>>>
>>> The Windows command prompt (cmd.exe) is different, and so is
>>> the normal MSYS console. Here console output does not work,
>>> and I also noticed problems when running an emulation with
>>> latest QEMU (application hangs).
>>>
>>> It seems to be difficult to get a solution which works for
>>> several scenarios:
>>>
>>> * It should be possible to create a link which starts
>>>  an emulation with parameters and only one window (SDL,
>>>  no extra console window). This needs a GUI application
>>>  (or is it possible for a console application to suppress
>>>  or close the console window?).
>>>
>>> * It must be possible to see stdout and stderr output.
>>>  Default today: both are written to files in the program
>>>  directory. This is bad because normally users have no
>>>  write access there. It also does not allow running
>>>  more than one emulation with separated output.
>>>
>>> * It should be possible to get stdout and stderr directly
>>>  to the console. This is needed for running with curses,
>>>  and it is useful when asking for -help.
>>>
>>> * It must be possible to run QEMU executables from cmd.exe.
>>>
>>> * It should be possible to run QEMU executables from other
>>>  shells (msys command line, msys rxvt, cygwin command line,
>>>  ...).
>>>
>>> What would you suggest?
>>
>> Add a configure option and let users decide which one is better.
>
>
> Then you get executables with same name but different behavior,
> so it's always necessary to specify which configure options were used.
> It makes documentation and support more difficult.
>
> My favorite solution would be executables which work for all
> (or at least most) typical use cases.
>
> If that is impossible, maybe pairs of executables (a windows gui
> and a console version for each system emulation) would also be
> a reasonable choice. This only needs an additional objcopy (which
> makes a copy and changes the subsystem) to get qemu-system-i386c.exe
> from qemu-system-i386.exe.
>
> There still remains the problem with stdout.txt and stderr.txt
> which are used by default for any console output. This is
> unusual for console applications, and it does not work when
> QEMU was installed because of missing write access.
SDL-1.3 removed the stdio-redirect feature, so we should ignore it.

Blue Swirl, can you revert this commit?
>
>



[Qemu-devel] Seem thread Competition

2011-12-21 Thread ZhouPeng
Hi,

I meet the err:

# virsh dumpxml 63
error: internal error cannot parse json {"timestamp": {"seconds":
1323332828, "microseco{"timest35p":39}, "econds":PICE_D 1CO3233TE828,
Dmicrosec "ds"data"52}, : {"ser: "SPrCE_DIS"ONNEC{ED", "port": {"serve
": {6299": "62, "famiamily": "ipv:", "hist": pv4"1 "hos6":
"182.168.12.231"}, "2lient.: {"p2rt": 3350196, "fa"ily": "}, "cl,
"ho{sport":t"35020", ": "19y": "i2.4", "168.1.": "19}: lexical error:
invalid string in json text.
  ds": 1323332828, "microseco{"timest35p":39}, "econds":PICE_D

This similar bug seems has been reported by:
https://bugzilla.redhat.com/show_bug.cgi?id=744105

The patch(http://cgit.freedesktop.org/spice/qemu/commit/?h=spice.v42)
seems be related with the competition.
But I searched the upstream qemu, it has not been merged in.

My qemu is qemu-kvm-0.15.1 on 3.1.2-1.fc16.x86_64

Is there any plan about this bug, it seems be some serious.

Thanks,
-- 
Zhou Peng



Re: [Qemu-devel] OEM Windows in Qemu

2011-12-21 Thread inbox
Michael,

Thanks for the response.  Since you wrote the patch I'm using I'd say
you are the most qualified to answer questions about it. :)


>Note that for winXP, the only thing needed from the bios is to _mention_ -
>anywhere in its memory - name of your manufacturer. That is, you can
>add any table with just a string - say - "ASUS_Notebook" in it, winXP
>does an equivalent of memmem() function on the bios content to find if
>it is supposed to be right OEM.

I'm not an expert on this by any means, but what I've read on the net is
that Windows looks for some special bytes in RAM that correspond to the
OEM manufacturer.  Apparently this is set in the OEMBIOS.DAT or .CAT
file on the installation cd rom along with the location in memory where
it looks for the data.  This is an example listing which I've read
specifies the RAM addresses, offset, and bytes to look for.

DELL (OEMBIOS.CAT CRC32=B6F0EEFD)

f000,e076,0010,Dell System
f000,e840,0010,Dell Computer
f000,49a9,0010,Dell System
f000,e05e,0010,Dell System
f000,e838,0018,Dell Inc


>WinXP requires "SLIC version 1.0", which is reduced to just having a string
>with the name of your OEM in the bios (one possible place is the SLIC table).
>More recent version of SLIC (2.1 I think) is needed to activate windows7.

This is the part that is confusing me.  I've read that SLIC 2.0 is
backward compatible with SLIC 1.0 so XP should activate just fine with a
working SLIC 2.0.  And your patch does apparently produce a signed SLIC
2.0 
But my OEM copy of XP complains about the BIOS produced with your patch.
 I can only guess there is some critical piece missing that Windows 7
doesn't care about.


>While I'm the author of the howto you mentioned, so my "opinion" here is
>biased, but still I can say that several other people used this way to
>run oem versions of windows7 and windowsVista in their VMs, and sent me
>their thanks. I also found this way mentioned in vmware-related forums.

You can add my name to the list of people offers thanks for that patch. 
:) I'm sure I'll want to install Windows 7 at some point also.


>I've no idea what does these tools do. For testing I just boot linux
>and check if it can see the tables with the content I've used (somewhere
>in /sys/firmware/acpi/tables). For further testing I boot my OEM-preinstalled
>copy of windows to verify it still thinks it is OEM-activated. I also
>tried to actually activate win7 in a VM, using slic+certificate from
>a "random" OEM (these are available on the 'net despite being M$ high-secret),
>and it worked just fine too.

I haven't yet installed a linux VM to check the acpi/tables but here is
what is on the host:
/*
 * Intel ACPI Component Architecture
 * AML Disassembler version 20100528
 *
 * Disassembly of slic.dat, Mon Dec 19 22:31:44 2011
 *
 * ACPI Data Table [SLIC]
 *
 * Format: [HexOffset DecimalOffset ByteLength]  FieldName : FieldValue
 */

[000h   4]Signature : "SLIC"/* Software
Licensing Description Table */
[004h 0004  4] Table Length : 0176
[008h 0008  1] Revision : 01
[009h 0009  1] Checksum : 5F
[00Ah 0010  6]   Oem ID : "DELL  "
[010h 0016  8] Oem Table ID : "M07"
[018h 0024  4] Oem Revision : 27D80202
[01Ch 0028  4]  Asl Compiler ID : "ASL "
[020h 0032  4]Asl Compiler Revision : 0061

These are the results of a dmidecode -t0

SMBIOS 2.4 present.

Handle 0x, DMI type 0, 24 bytes
BIOS Information
Vendor: Dell Inc.
Version: A06
Release Date: 02/02/2008
Address: 0xF
Runtime Size: 64 kB
ROM Size: 576 kB

I'm guessing this may be significant because the DMI data is different
in the VM.  Looking at a memory dump of the VM ACPI tables I see several
tables: RSDP, RSDT, FACP, SSDT, APIC, HPET, SLIC, FACS, and DSDT.
RSDT and SLIC shows what we would expect: "OEM ID= DELL" "OEM Table ID =
M07" "OEM Revision = 27D80202".  In other words all the exported SLIC
data.  But the other tables show "OEM ID = BOCHS" "OEM Table ID = BXPC"
"OEM Revision = 0001"  Maybe Windows XP reads from the "wrong"
tables?

I assume Seabios reads all of that from src/config.h is that right?  I
could just change that data and recompile, but would I need to change
anything else also?  Would that create maybe some other problems down
the road?

>Besides all this, you obviously should have the right OEM version of
>windows, wich "knows" this very OEM you're pretending to be (if you're
>installing new VM and not using a pre-installed copy). For win7 this
>means valid certificate belonging to this OEM is installed in the system.

Right.  It bears mentioning that the OEM cd I have activates properly
when installed directly on the host.

>I wont provide any further details about it, because someone thinks it
>is hackish and "blackish" territory.

You can mail me offlist for that if you like. :)

Brian




Re: [Qemu-devel] OEM Windows in Qemu

2011-12-21 Thread Michael Tokarev
On 22.12.2011 08:44, in...@expertcomputerrepair.com wrote:
[]
>> WinXP requires "SLIC version 1.0", which is reduced to just having a string
>> with the name of your OEM in the bios (one possible place is the SLIC table).
>> More recent version of SLIC (2.1 I think) is needed to activate windows7.
> 
> This is the part that is confusing me.  I've read that SLIC 2.0 is
> backward compatible with SLIC 1.0 so XP should activate just fine with a
> working SLIC 2.0.  And your patch does apparently produce a signed SLIC 2.0 

"My patch" does not produce any SLIC at all.  The instructions mentions
using SLIC from your machine - "my patch" is just a way to _embed_ a given
data into VM, not a way to "produce" anything.  You get in your VM what
you have outside, either in a file or in your own BIOS, depending on where
you took that data from.

> But my OEM copy of XP complains about the BIOS produced with your patch.
>  I can only guess there is some critical piece missing that Windows 7
> doesn't care about.

Well.  It should be both - win7 in my case cared about alot more details
than winXP.  But I must admit that I never actually installed oem version
of winXP, -- I used an installed version to verify what it needs, and
found that I can just mention the right string in the BIOS.  Maybe it
is not sufficient for actual "activation" procedure, I dunno.

[]
> I'm guessing this may be significant because the DMI data is different
> in the VM.  Looking at a memory dump of the VM ACPI tables I see several
> tables: RSDP, RSDT, FACP, SSDT, APIC, HPET, SLIC, FACS, and DSDT.
> RSDT and SLIC shows what we would expect: "OEM ID= DELL" "OEM Table ID =
> M07" "OEM Revision = 27D80202".  In other words all the exported SLIC
> data.  But the other tables show "OEM ID = BOCHS" "OEM Table ID = BXPC"
> "OEM Revision = 0001"  Maybe Windows XP reads from the "wrong"
> tables?

Yes, the patch only changes RSDT to match SLIC - this is what win7 verifies,
all other tables does not matter for win7.  And yes, it might be different
for winXP - you may try setting all tables in VM to be of DELL OEM and see
what happens.

> I assume Seabios reads all of that from src/config.h is that right?  I
> could just change that data and recompile, but would I need to change
> anything else also?  Would that create maybe some other problems down
> the road?

neither seabios nor qemu/kvm actually use these OEM strings, so there
should be no problems.

/mjt



Re: [Qemu-devel] [PATCH] w32: QEMU applications with SDL are always GUI applications

2011-12-21 Thread Stefan Weil

Am 22.12.2011 02:50, schrieb TeLeMan:

SDL-1.3 removed the stdio-redirect feature, so we should ignore it.

Blue Swirl, can you revert this commit?


I'd prefer to keep it. There are good reason why SDL applications
are linked as Windows applications by default, so we should use
this default.

Yes, it's no perfect solution, but even for developers who want
to see stdout and stderr, it works when you use a rxvt console
or something equivalent.

You said that you use a self-compiled SDL library.
Then you can change your sdl-config or sdl.pc so that it
sets the SDL linker options according to your needs.

What about my proposal to create each system emulation in two
variants (qemu-system-i386.exe, qemu-system-i386w.exe)?
Do you think this might be a solution which fits everybody's
needs?

Regards,
Stefan Weil




Re: [Qemu-devel] [PATCH v4 04/11] ARM: exynos4210: IRQ subsystem support.

2011-12-21 Thread Evgeny Voevodin

On 12/22/2011 12:31 AM, Peter Maydell wrote:

On 21 December 2011 15:08, Evgeny Voevodin  wrote:

On 12/21/2011 05:50 PM, Peter Maydell wrote:

arm_gic.c exposes the CPU and distributor interfaces as their own
memory regions now -- you shouldn't need any of this intermediate
layer of functions.

These functions are not actually for splitting CPU and Distributer
interfaces.
In our board we have two GICs - internal and external. Internal GIC is
completely
matching arm_gic.c.

Internal GIC CPU[n] and Distributer[n] interfaces are at 0x100 and 0x1000
offsets from
0x1050 base.

But external GIC is different.
It's CPU[0] interface is at 0x0 offset from 0x1048 base
and
  CPU[1] interface is at 0x8000 offset from 0x1048 base

It's Distributer[0] interface is at 0x0 offset from 0x1049 base
and
  Distributer[1] interface is at 0x8000 offset from 0x1049 base

[n] - is corresponding to SMP CPU Core.

So, we need these wrapper functions for External GIC.

I don't understand this reasoning. If there are two GICs then
you should just instantiate two GIC devices and map and/or alias
their memory regions at the right addresses. The reason why
the distributor and CPU interfaces are exposed as multiple
memory regions is exactly so you can put them at different
offsets for different boards/CPUs. If arm_gic doesn't
provide suitably split up memory regions then it should be
fixed to do so.

-- PMM


One of our GICs (internal) plus private memory region is represented
as "a9mpcore_priv" device. This implementation fits the documentation.

Second GIC (external) is represented as "exynos4210.gic" with splitted
mapping for CPU (0x1048) and Distributer (0x1049) (we used
arm_gic.c availability to split CPU and Distributer memories).

The reason for creation of this device with it's own read/write 
functions is:


CPU and Distributer registers which are banked per SMP Core in internal GIC
are not banked in external GIC and their offsets could not be used as is 
with

arm_gic.c.
External GIC registers in comparison to Internal GIC registers are moved
from base by offset n * 0x8000 for each SMP Core, where n is SMP Core index.
The rest functionality of external GIC is identical to internal GIC and 
arm_gic.c.
So, we can use arm_gic.c in external GIC if we will pass correct offsets 
to it's

read/write functions. To obtain arm_gic.c compliant addresses, we introduced
exynos4210_gic/dist_read/write functions. They obtain arm_gic.c compliant
offsets for primary and secondary SMP Cores and simply call arm_gic.c
functions to do all the work.

--
Kind regards,
Evgeny Voevodin,
Leading Software Engineer,
ASWG, Moscow R&D center, Samsung Electronics
e-mail: e.voevo...@samsung.com




Re: [Qemu-devel] [PATCH] w32: QEMU applications with SDL are always GUI applications

2011-12-21 Thread TeLeMan
On Thu, Dec 22, 2011 at 14:45, Stefan Weil  wrote:
> Am 22.12.2011 02:50, schrieb TeLeMan:
>
>> SDL-1.3 removed the stdio-redirect feature, so we should ignore it.
>>
>> Blue Swirl, can you revert this commit?
>
>
> I'd prefer to keep it. There are good reason why SDL applications
> are linked as Windows applications by default, so we should use
> this default.
>
> Yes, it's no perfect solution, but even for developers who want
> to see stdout and stderr, it works when you use a rxvt console
> or something equivalent.
Now the MinGW default shell is a console application and rxvt is unmaintained.

> You said that you use a self-compiled SDL library.
> Then you can change your sdl-config or sdl.pc so that it
> sets the SDL linker options according to your needs.
>
> What about my proposal to create each system emulation in two
> variants (qemu-system-i386.exe, qemu-system-i386w.exe)?
> Do you think this might be a solution which fits everybody's
> needs?
Yes, I agree this solution. But because qemu-system-i386.exe will be
also a console application, we can revert your commit at first.

> Regards,
> Stefan Weil
>
>