Re: [Qemu-devel] slow virtio network with vhost=on and multiple cores

2013-03-17 Thread Michael S. Tsirkin
On Fri, Mar 15, 2013 at 08:23:44AM +0100, Peter Lieven wrote:
> On 15.03.2013 00:04, Davide Guerri wrote:
> >Yes this is definitely an option :)
> >
> >Just for curiosity, what is the effect of "in-kernel irqchip"?
> 
> it emulates the irqchip in-kernel (in the KVM kernel module) which
> avoids userspace exits to qemu. in your particular case I remember
> that it made all IRQs delivered to vcpu0 only. So I think this is a workaround
> and not the real fix. I think Michael is right that it is a
> client kernel bug. It would be good to find out what it is and ask
> the 2.6.32 maintainers to include it. i further have seen that
> with more recent kernels and inkernel-irqchip the irqs are delivered
> to vcpu0 only again (without multiqueue).
>
> >Is it possible to disable it on a "live" domain?
> 
> try it. i don't know. you definitely have to do a live migration for it,
> but I have no clue if the VM will survive this.
> 
> Peter

I doubt you can migrate VMs between irqchip/non irqchip configurations.

> >
> >Cheers,
> >  Davide
> >
> >
> >On 14/mar/2013, at 19:21, Peter Lieven  wrote:
> >
> >>
> >>Am 14.03.2013 um 19:15 schrieb Davide Guerri :
> >>
> >>>Of course I can do some test but a kernel upgrade is not an option here :(
> >>
> >>disabling the in-kernel irqchip (default since 1.2.0) should also help, 
> >>maybe this is an option.
> >>
> >>Peter
> >>
> >>
> >



Re: [Qemu-devel] [PATCH 1/2] virtio-scsi: Set _DRIVER_OK flag before scsi target scanning

2013-03-17 Thread Michael S. Tsirkin
On Fri, Mar 15, 2013 at 09:45:15AM +0800, Asias He wrote:
> Before we start scsi target scanning, we need to set the
> VIRTIO_CONFIG_S_DRIVER_OK flag so the device can do setup properly.
> 
> This fixes a bug when booting tcm_vhost with seabios.
> 
> Signed-off-by: Asias He 
> Acked-by: Paolo Bonzini 

Acked-by: Michael S. Tsirkin 

> ---
>  src/virtio-scsi.c | 5 +++--
>  1 file changed, 3 insertions(+), 2 deletions(-)
> 
> diff --git a/src/virtio-scsi.c b/src/virtio-scsi.c
> index 879ddfb..4de1255 100644
> --- a/src/virtio-scsi.c
> +++ b/src/virtio-scsi.c
> @@ -147,6 +147,9 @@ init_virtio_scsi(struct pci_device *pci)
>  goto fail;
>  }
>  
> +vp_set_status(ioaddr, VIRTIO_CONFIG_S_ACKNOWLEDGE |
> +  VIRTIO_CONFIG_S_DRIVER | VIRTIO_CONFIG_S_DRIVER_OK);
> +
>  int i, tot;
>  for (tot = 0, i = 0; i < 256; i++)
>  tot += virtio_scsi_scan_target(pci, ioaddr, vq, i);
> @@ -154,8 +157,6 @@ init_virtio_scsi(struct pci_device *pci)
>  if (!tot)
>  goto fail;
>  
> -vp_set_status(ioaddr, VIRTIO_CONFIG_S_ACKNOWLEDGE |
> -  VIRTIO_CONFIG_S_DRIVER | VIRTIO_CONFIG_S_DRIVER_OK);
>  return;
>  
>  fail:
> -- 
> 1.8.1.4



Re: [Qemu-devel] [PATCH 2/2] virtio-scsi: Pack struct virtio_scsi_{req_cmd, resp_cmd}

2013-03-17 Thread Michael S. Tsirkin
On Fri, Mar 15, 2013 at 09:45:16AM +0800, Asias He wrote:
> Device needs the exact size of these data structures. Prevent padding.
> 
> This fixes guest hang when booting seabios + tcm_vhost.
> 
> Signed-off-by: Asias He 

Acked-by: Michael S. Tsirkin 

> ---
>  src/virtio-scsi.h | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/src/virtio-scsi.h b/src/virtio-scsi.h
> index bbfbf30..96c3701 100644
> --- a/src/virtio-scsi.h
> +++ b/src/virtio-scsi.h
> @@ -26,7 +26,7 @@ struct virtio_scsi_req_cmd {
>  u8 prio;
>  u8 crn;
>  char cdb[VIRTIO_SCSI_CDB_SIZE];
> -};
> +} __attribute__((packed));
>  
>  /* This is the first element of the "in" scatter-gather list. */
>  struct virtio_scsi_resp_cmd {
> @@ -36,7 +36,7 @@ struct virtio_scsi_resp_cmd {
>  u8 status;
>  u8 response;
>  u8 sense[VIRTIO_SCSI_SENSE_SIZE];
> -};
> +} __attribute__((packed));
>  
>  #define VIRTIO_SCSI_S_OK0
>  
> -- 
> 1.8.1.4



Re: [Qemu-devel] [PATCH] target-mips: fix rndrashift_short_acc and code for EXTR_ instructions

2013-03-17 Thread Aurelien Jarno
On Fri, Mar 15, 2013 at 06:56:19PM +0100, Petar Jovanovic wrote:
> From: Petar Jovanovic 
> 
> Fix for rndrashift_short_acc to set correct value to higher 64 bits.
> This change also corrects conditions when bit 23 of the DSPControl register
> is set.
> 
> The existing test files have been extended with several examples that
> trigger the issues. One bug/example in the test file for EXTR_RS_W has been
> found and reported by Klaus Peichl.
> 
> Signed-off-by: Petar Jovanovic 
> ---
>  target-mips/dsp_helper.c  |   23 +++--
>  tests/tcg/mips/mips32-dsp/extr_r_w.c  |   23 +
>  tests/tcg/mips/mips32-dsp/extr_rs_w.c |   46 
> +
>  tests/tcg/mips/mips32-dsp/extr_w.c|   23 +
>  4 files changed, 101 insertions(+), 14 deletions(-)
> 
> diff --git a/target-mips/dsp_helper.c b/target-mips/dsp_helper.c
> index 472be35..c7df595 100644
> --- a/target-mips/dsp_helper.c
> +++ b/target-mips/dsp_helper.c
> @@ -517,13 +517,8 @@ static inline void mipsdsp_rndrashift_short_acc(int64_t 
> *p,
>  
>  acc = ((int64_t)env->active_tc.HI[ac] << 32) |
>((int64_t)env->active_tc.LO[ac] & 0x);
> -if (shift == 0) {
> -p[0] = acc << 1;
> -p[1] = (acc >> 63) & 0x01;
> -} else {
> -p[0] = acc >> (shift - 1);
> -p[1] = 0;
> -}
> +p[0] = (shift == 0) ? (acc << 1) : (acc >> (shift - 1));
> +p[1] = (acc >> 63) & 0x01;
>  }
>  
>  /* 128 bits long. p[0] is LO, p[1] is HI */
> @@ -3161,8 +3156,8 @@ target_ulong helper_extr_w(target_ulong ac, 
> target_ulong shift,
>  tempDL[1] += 1;
>  }
>  
> -if ((!(tempDL[1] == 0 && (tempDL[0] & MIPSDSP_LHI) == 0x00)) &&
> -(!(tempDL[1] == 1 && (tempDL[0] & MIPSDSP_LHI) == MIPSDSP_LHI))) {
> +if (((tempDL[1] & 0x01) != 0 || (tempDL[0] & MIPSDSP_LHI) != 0) &&
> +((tempDL[1] & 0x01) != 1 || (tempDL[0] & MIPSDSP_LHI) != 
> MIPSDSP_LHI)) {
>  set_DSPControl_overflow_flag(1, 23, env);
>  }
>  
> @@ -3187,8 +3182,8 @@ target_ulong helper_extr_r_w(target_ulong ac, 
> target_ulong shift,
>  tempDL[1] += 1;
>  }
>  
> -if ((tempDL[1] != 0 || (tempDL[0] & MIPSDSP_LHI) != 0) &&
> -(tempDL[1] != 1 && (tempDL[0] & MIPSDSP_LHI) != MIPSDSP_LHI)) {
> +if (((tempDL[1] & 0x01) != 0 || (tempDL[0] & MIPSDSP_LHI) != 0) &&
> +((tempDL[1] & 0x01) != 1 || (tempDL[0] & MIPSDSP_LHI) != 
> MIPSDSP_LHI)) {
>  set_DSPControl_overflow_flag(1, 23, env);
>  }
>  
> @@ -3214,9 +3209,9 @@ target_ulong helper_extr_rs_w(target_ulong ac, 
> target_ulong shift,
>  }
>  tempI = tempDL[0] >> 1;
>  
> -if ((tempDL[1] != 0 || (tempDL[0] & MIPSDSP_LHI) != 0) &&
> -(tempDL[1] != 1 || (tempDL[0] & MIPSDSP_LHI) != MIPSDSP_LHI)) {
> -temp64 = tempDL[1];
> +if (((tempDL[1] & 0x01) != 0 || (tempDL[0] & MIPSDSP_LHI) != 0) &&
> +((tempDL[1] & 0x01) != 1 || (tempDL[0] & MIPSDSP_LHI) != 
> MIPSDSP_LHI)) {
> +temp64 = tempDL[1] & 0x01;
>  if (temp64 == 0) {
>  tempI = 0x7FFF;
>  } else {
> diff --git a/tests/tcg/mips/mips32-dsp/extr_r_w.c 
> b/tests/tcg/mips/mips32-dsp/extr_r_w.c
> index 02e0224..489c193 100644
> --- a/tests/tcg/mips/mips32-dsp/extr_r_w.c
> +++ b/tests/tcg/mips/mips32-dsp/extr_r_w.c
> @@ -67,5 +67,28 @@ int main()
>  assert(dsp == 0);
>  assert(result == rt);
>  
> +/* Clear dspcontrol */
> +dsp = 0;
> +__asm
> +("wrdsp %0\n\t"
> + :
> + : "r"(dsp)
> +);
> +
> +ach = 0x;
> +acl = 0x;
> +result = 0;
> +__asm
> +("mthi %2, $ac1\n\t"
> + "mtlo %3, $ac1\n\t"
> + "extr_r.w %0, $ac1, 0x1F\n\t"
> + "rddsp %1\n\t"
> + : "=r"(rt), "=r"(dsp)
> + : "r"(ach), "r"(acl)
> + );
> +dsp = (dsp >> 23) & 0x01;
> +assert(dsp == 0);
> +assert(result == rt);
> +
>  return 0;
>  }
> diff --git a/tests/tcg/mips/mips32-dsp/extr_rs_w.c 
> b/tests/tcg/mips/mips32-dsp/extr_rs_w.c
> index c3a22ee..f9d2ed6 100644
> --- a/tests/tcg/mips/mips32-dsp/extr_rs_w.c
> +++ b/tests/tcg/mips/mips32-dsp/extr_rs_w.c
> @@ -67,5 +67,51 @@ int main()
>  assert(dsp == 0);
>  assert(result == rt);
>  
> +/* Clear dspcontrol */
> +dsp = 0;
> +__asm
> +("wrdsp %0\n\t"
> + :
> + : "r"(dsp)
> +);
> +
> +ach = 0x8000;
> +acl = 0x;
> +result = 0x8000;
> +__asm
> +("mthi %2, $ac1\n\t"
> + "mtlo %3, $ac1\n\t"
> + "extr_rs.w %0, $ac1, 0x1F\n\t"
> + "rddsp %1\n\t"
> + : "=r"(rt), "=r"(dsp)
> + : "r"(ach), "r"(acl)
> +);
> +dsp = (dsp >> 23) & 0x01;
> +assert(dsp == 1);
> +assert(result == rt);
> +
> +/* Clear dspcontrol */
> +dsp = 0;
> +__asm
> +("wrdsp %0\n\t"
> + :
> + : "r"(dsp)
> +);
> +
> +ach = 0x;
> +

Re: [Qemu-devel] [PATCH] hw/vexpress: set default block type to SD

2013-03-17 Thread Aurelien Jarno
On Sun, Mar 17, 2013 at 12:23:31AM +, Peter Maydell wrote:
> On 17 March 2013 00:04, Aurelien Jarno  wrote:
> > On Fri, Mar 15, 2013 at 03:45:11PM +, Peter Maydell wrote:
> >> On 15 March 2013 15:42, Aurelien Jarno  wrote:
> >> > On Fri, Mar 15, 2013 at 03:35:48PM +, Peter Maydell wrote:
> >> >> I'm not convinced this is a good thing -- I think you should have
> >> >> to know that you're attaching an SD card and not a hard disk,
> >> >> because the performance is much worse. In particular if you
> >> >> don't specify 'cache=writeback' your performance will be
> >> >> dreadful, so you need to do something different from hard
> >> >> disks anyhow.
> >>
> >> > Being a good thing or not, the current code is wrong: the default
> >> > interface type is set to SCSI (I guess it has been copied from
> >> > versatilepb.c), while the vexpress machine has no SCSI
> >> > interface.
> >>
> >> I agree we shouldn't be claiming to have a scsi interface, yes.
> >>
> >
> > Does it mean you are going to accept the patch?
> 
> No, I meant I'd accept a patch which stops us reporting that
> we have a SCSI interface when we don't.

That's what the patch does, by correctly saying it's an SD interface and
not a SCSI one.

> > In any case let me give you some more arguments in favor of it. Even if
> > you believe that users should always provide a cache= argument, I don't
> > think it should be done by setting a wrong default interface. Users are
> > likely to simply google for a command line and paste it without
> > understanding the consequences of cache=writeback. The way to go there
> > is to make the writeback argument mandatory for some machines if you
> > really believe it's needed by all users.
> 
> writeback shouldn't be mandatory randomly for some machines and
> interfaces and not others.

I don't get your point there. You don't want to make it mandatory at the
QEMU level, but you want users to understand they have to specify it for
some machines, by making if= mandatory randomly for some machines and
not others?

> > That said I don't share this opinion. I have made some tests comparing
> > a versatile and a vexpress machine, running Debian Wheezy, armel for the
> > first one and armhf for the second one. This has been done on a Core i5
> > 2500 machine with a ST2000DM001 hard drive. Here are the results:
> >
> >   | boot | install build-dep | build lm-sensors |
> >   +--+---+--+
> >   versatile   | 1:09 | 2:27  | 4:09 |
> >   versatile cache=wb  | 1:08 | 2:25  | 3:44 |
> >   vexpress| 1:11 | 3:07  | 3:49 |
> >   vexpress cache=wb   | 1:07 | 3:07  | 3:47 |
> >
> >   Note: the install build-dep time doesn't include the download time.
> >
> > So even if the two systems are not directly comparable, it shows that
> > the SD card emulation is indeed slower than the hard disk one. That said
> > while cache=writeback makes a difference for the versatile platform, it
> > doesn't really change anything for the vexpress platform. Therefore
> > forcing the users to add this option doesn't seem to be a good idea.
> 
> Maybe the default has changed. Certainly it used to be the case that
> sd card emulation was incredibly slow without some kind of caching
> option, because it writes 512 bytes at a time, synchronously, and
> the SD card interface has no way for the guest to say 'write; write;
> write; ok now make sure that's all committed'.
> 

I don't know what has changed, but with the current code the argument
saying that the vexpress machine without cache=writeback has dreadful
performance is wrong.

-- 
Aurelien Jarno  GPG: 1024D/F1BCDB73
aurel...@aurel32.net http://www.aurel32.net



[Qemu-devel] [PATCH] vmxcap: Update according to SDM of January 2013

2013-03-17 Thread Jan Kiszka
From: Jan Kiszka 

This adds reporting of VMCS shadowing, #VE, IA32_SMBASE, unrestricted
VMWRITE and fixes the range of the MSEG revision ID.

Signed-off-by: Jan Kiszka 
---
 scripts/kvm/vmxcap |6 +-
 1 files changed, 5 insertions(+), 1 deletions(-)

diff --git a/scripts/kvm/vmxcap b/scripts/kvm/vmxcap
index a79f816..c90eda4 100755
--- a/scripts/kvm/vmxcap
+++ b/scripts/kvm/vmxcap
@@ -168,6 +168,8 @@ controls = [
 11: 'RDRAND exiting',
 12: 'Enable INVPCID',
 13: 'Enable VM functions',
+14: 'VMCS shadowing',
+18: 'EPT-violation #VE'
 },
 cap_msr = MSR_IA32_VMX_PROCBASED_CTLS2,
 ),
@@ -212,10 +214,12 @@ controls = [
 6: 'HLT activity state',
 7: 'Shutdown activity state',
 8: 'Wait-for-SIPI activity state',
+15: 'IA32_SMBASE support',
 (16,24): 'Number of CR3-target values',
 (25,27): 'MSR-load/store count recommenation',
 28: 'IA32_SMM_MONITOR_CTL[2] can be set to 1',
-(32,62): 'MSEG revision identifier',
+29: 'VMWRITE to VM-exit information fields',
+(32,63): 'MSEG revision identifier',
 },
 msr = MSR_IA32_VMX_MISC_CTLS,
 ),
-- 
1.7.3.4



Re: [Qemu-devel] [PATCH] hw/vexpress: set default block type to SD

2013-03-17 Thread Peter Maydell
On 17 March 2013 10:17, Aurelien Jarno  wrote:
> On Sun, Mar 17, 2013 at 12:23:31AM +, Peter Maydell wrote:
>> On 17 March 2013 00:04, Aurelien Jarno  wrote:
>> > On Fri, Mar 15, 2013 at 03:45:11PM +, Peter Maydell wrote:
>> >> I agree we shouldn't be claiming to have a scsi interface, yes.
>> >>
>> >
>> > Does it mean you are going to accept the patch?
>>
>> No, I meant I'd accept a patch which stops us reporting that
>> we have a SCSI interface when we don't.
>
> That's what the patch does, by correctly saying it's an SD interface and
> not a SCSI one.

The patch does two things:
 * stops claiming a default SCSI interface [obviously right]
 * starts claiming a default SD interface [what we're arguing about]

>> > In any case let me give you some more arguments in favor of it. Even if
>> > you believe that users should always provide a cache= argument, I don't
>> > think it should be done by setting a wrong default interface. Users are
>> > likely to simply google for a command line and paste it without
>> > understanding the consequences of cache=writeback. The way to go there
>> > is to make the writeback argument mandatory for some machines if you
>> > really believe it's needed by all users.
>>
>> writeback shouldn't be mandatory randomly for some machines and
>> interfaces and not others.
>
> I don't get your point there. You don't want to make it mandatory at the
> QEMU level, but you want users to understand they have to specify it for
> some machines, by making if= mandatory randomly for some machines and
> not others?

My point is that I don't think there's a good solution to "modelled SD
cards aren't a good substitute for modelled hard disks".

That said, I want to postpone the rest of this conversation until I've
had a chance to look into what has changed regarding performance of
SD with the default cache settings.

-- PMM



Re: [Qemu-devel] [PATCH 0/2] Fix booting tcm_vhost + seabios

2013-03-17 Thread Kevin O'Connor
On Fri, Mar 15, 2013 at 09:45:14AM +0800, Asias He wrote:
> Asias He (2):
>   virtio-scsi: Set _DRIVER_OK flag before scsi target scanning
>   virtio-scsi: Pack struct virtio_scsi_{req_cmd,resp_cmd}

Thanks.  I pushed these patches.

-Kevin



Re: [Qemu-devel] [PATCH 0/7] pci: Create PCI Express bus type

2013-03-17 Thread Michael S. Tsirkin
On Thu, Mar 14, 2013 at 04:00:53PM -0600, Alex Williamson wrote:
> When setting PCIe capabilities we need to know the type of bus we
> have.  On secondary buses we could poke around on the parent bridge
> device to infer this data, but on root buses there's no parent device.
> By creating a new PCIE TypeInfo we can inherit everything about PCI
> buses while still allowing us to differentiate and potentially extend
> in the future.
> 
> The first benefactor of this change is included here, mangling
> Endpoints to Root Complex Integrated Endpoints, allowing nec-usb-xhci
> to work on the root bus of q35 with Windows.  I also plan to use this
> to drop link capabilities, control, and status on all Integrated
> Endpoints and for PCIe capability modifications on assigned devices.
> Thanks,
> 
> Alex

Applied, thanks!

> ---
> 
> Alex Williamson (7):
>   pci: Create and register a new PCI Express TypeInfo
>   pci: Move PCI and PCIE type defines
>   pci: Allow PCI bus creation interfaces to specify the type of bus
>   pci: Q35, Root Ports, and Switches create PCI Express buses
>   pci: Create pci_bus_is_express helper
>   pci: Create and use API to determine root buses
>   pcie: Mangle types to match topology
> 
> 
>  hw/alpha_typhoon.c  |2 +-
>  hw/apb_pci.c|4 ++--
>  hw/bonito.c |2 +-
>  hw/dec_pci.c|7 ++-
>  hw/grackle_pci.c|2 +-
>  hw/gt64xxx.c|2 +-
>  hw/i82801b11.c  |2 +-
>  hw/ioh3420.c|2 +-
>  hw/pci/pci.c|   36 ++--
>  hw/pci/pci.h|   13 ++---
>  hw/pci/pci_bridge.c |5 ++---
>  hw/pci/pci_bridge.h |2 +-
>  hw/pci/pci_bus.h|3 ---
>  hw/pci/pcie.c   |   13 +
>  hw/pci_bridge_dev.c |2 +-
>  hw/piix_pci.c   |2 +-
>  hw/ppc4xx_pci.c |2 +-
>  hw/ppce500_pci.c|2 +-
>  hw/prep_pci.c   |2 +-
>  hw/q35.c|3 ++-
>  hw/sh_pci.c |2 +-
>  hw/spapr_pci.c  |2 +-
>  hw/unin_pci.c   |4 ++--
>  hw/versatile_pci.c  |2 +-
>  hw/xio3130_downstream.c |2 +-
>  hw/xio3130_upstream.c   |2 +-
>  26 files changed, 80 insertions(+), 42 deletions(-)



Re: [Qemu-devel] [PATCH 7/7] pcie: Mangle types to match topology

2013-03-17 Thread Michael S. Tsirkin
On Thu, Mar 14, 2013 at 04:01:35PM -0600, Alex Williamson wrote:
> Windows will fail to start drivers for devices with an Endpoint type
> PCIe capability attached to a Root Complex (code 10 - Device cannot
> start).  The proper type for such a device is Root Complex Integrated
> Endpoint.  Devices don't care which they are, so do this conversion
> automatically.
> 
> This allows the Windows driver to load for nec-usb-xhci when attached
> to pcie.0 of a q35 machine.
> 
> Signed-off-by: Alex Williamson 

I think it's a bit ugly from the API perspective,
in that an integrated endpoint is not converted to a regular one.

I think it would be cleaner to have pcie_cap_init do exactly
what it's told to do, maybe failing if you give it an
incorrect configuration. On top of this add
pcie_endpoint_cap_init which sets the type explicitly.
Hmm?

Not critical, so applied as is for now.

> ---
>  hw/pci/pcie.c |   13 +
>  1 file changed, 13 insertions(+)
> 
> diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
> index 485c94c..bcfbae4 100644
> --- a/hw/pci/pcie.c
> +++ b/hw/pci/pcie.c
> @@ -48,6 +48,19 @@ int pcie_cap_init(PCIDevice *dev, uint8_t offset, uint8_t 
> type, uint8_t port)
>  
>  assert(pci_is_express(dev));
>  
> +/*
> + * Mangle type to convert Endpoints to Root Complex Integrated Endpoints.
> + * Windows will report Code 10 (device cannot start) for regular 
> Endpoints
> + * on the Root Complex.
> + */
> +if (pci_bus_is_express(dev->bus) && pci_bus_is_root(dev->bus)) {
> +switch (type) {
> +case PCI_EXP_TYPE_ENDPOINT:
> +type = PCI_EXP_TYPE_RC_END;
> +break;
> +}
> +}
> +
>  pos = pci_add_capability(dev, PCI_CAP_ID_EXP, offset,
>   PCI_EXP_VER2_SIZEOF);
>  if (pos < 0) {



Re: [Qemu-devel] [PATCH] hw/vexpress: set default block type to SD

2013-03-17 Thread Paolo Bonzini
Il 15/03/2013 16:35, Peter Maydell ha scritto:
> On 5 March 2013 00:44, Aurelien Jarno  wrote:
>> On Tue, Mar 05, 2013 at 08:22:57AM +0800, Peter Maydell wrote:
>>> What effect does this actually have on the user experience?
>>
>> The effect is that the user doesn't have to specify the interface type.
>> Basically:
>>
>>   -drive file=/path/to/file,if=sd
>> can be replaced by
>>   -drive file=/path/to/file
>>
>> It means the user doesn't have to know the details of the machine to
>> know how to attach a disk. Note that the user here can also be a script,
>> which then becomes a bit simpler.
> 
> I'm not convinced this is a good thing -- I think you should have
> to know that you're attaching an SD card and not a hard disk,
> because the performance is much worse. In particular if you
> don't specify 'cache=writeback' your performance will be
> dreadful, so you need to do something different from hard
> disks anyhow.

cache=writeback has been the default for a few releases.

Paolo




Re: [Qemu-devel] [PATCH] hw/vexpress: set default block type to SD

2013-03-17 Thread Paolo Bonzini
Il 17/03/2013 01:04, Aurelien Jarno ha scritto:
>   | boot | install build-dep | build lm-sensors |
>   +--+---+--+
>   versatile   | 1:09 | 2:27  | 4:09 |
>   versatile cache=wb  | 1:08 | 2:25  | 3:44 |

Strange, cache=wb should be the default.

The real problem with SD is that it doesn't have a flush command, hence
it cannot be made safe against power losses.  But the solution is to add
support for virtio-mmio in those boards.

Paolo

>   vexpress| 1:11 | 3:07  | 3:49 |
>   vexpress cache=wb   | 1:07 | 3:07  | 3:47 |
> 
>   Note: the install build-dep time doesn't include the download time.
> 
> So even if the two systems are not directly comparable, it shows that
> the SD card emulation is indeed slower than the hard disk one. That said
> while cache=writeback makes a difference for the versatile platform, it
> doesn't really change anything for the vexpress platform. Therefore
> forcing the users to add this option doesn't seem to be a good idea.




Re: [Qemu-devel] [PATCH] hw/vexpress: set default block type to SD

2013-03-17 Thread Peter Maydell
On 17 March 2013 18:51, Paolo Bonzini  wrote:
> Il 17/03/2013 01:04, Aurelien Jarno ha scritto:
>>   | boot | install build-dep | build lm-sensors |
>>   +--+---+--+
>>   versatile   | 1:09 | 2:27  | 4:09 |
>>   versatile cache=wb  | 1:08 | 2:25  | 3:44 |
>
> Strange, cache=wb should be the default.
>
> The real problem with SD is that it doesn't have a flush command, hence
> it cannot be made safe against power losses.

So, two things here:
(a) the way to make it safe against power loss is that when the
guest writes a block of data it has to really hit the emulated
disk, because this is how the hardware works
(b) I thought this was the reason cache=wb wasn't the default (ie
that we weren't defaulting to 'may lose data on powerloss'). At the
time I last looked into command line options for these boards I'm
pretty sure it wasn't the default, because the performance improvement
from turning on caching was huge.

-- PMM



Re: [Qemu-devel] [PATCH] pvevent: pvevent device driver

2013-03-17 Thread Blue Swirl
On Thu, Mar 14, 2013 at 8:51 AM, Hu Tao  wrote:
> pvevent device is a qemu simulated device through which guest panic
> event is sent to host.
>
> ref: http://lists.nongnu.org/archive/html/qemu-devel/2013-03/msg02293.html
>
> Signed-off-by: Hu Tao 
> ---
>  drivers/platform/x86/Kconfig   |   7 +++
>  drivers/platform/x86/Makefile  |   2 +
>  drivers/platform/x86/pvevent.c | 115 
> +
>  3 files changed, 124 insertions(+)
>  create mode 100644 drivers/platform/x86/pvevent.c
>
> diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
> index 7ab0b2f..369135e 100644
> --- a/drivers/platform/x86/Kconfig
> +++ b/drivers/platform/x86/Kconfig
> @@ -768,4 +768,11 @@ config APPLE_GMUX
>   graphics as well as the backlight. Currently only backlight
>   control is supported by the driver.
>
> +config PVEVENT
> +   tristate "pvevent device support"
> +   depends on ACPI
> +   ---help---
> + This driver provides support for pvevent device, which is a qemu
> + simulated device through which guest panic event is sent to host.
> +
>  endif # X86_PLATFORM_DEVICES
> diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
> index bf7e4f9..8779396 100644
> --- a/drivers/platform/x86/Makefile
> +++ b/drivers/platform/x86/Makefile
> @@ -50,3 +50,5 @@ obj-$(CONFIG_INTEL_MID_POWER_BUTTON)  += 
> intel_mid_powerbtn.o
>  obj-$(CONFIG_INTEL_OAKTRAIL)   += intel_oaktrail.o
>  obj-$(CONFIG_SAMSUNG_Q10)  += samsung-q10.o
>  obj-$(CONFIG_APPLE_GMUX)   += apple-gmux.o
> +
> +obj-$(CONFIG_PVEVENT)   += pvevent.o
> diff --git a/drivers/platform/x86/pvevent.c b/drivers/platform/x86/pvevent.c
> new file mode 100644
> index 000..00ef7f4
> --- /dev/null
> +++ b/drivers/platform/x86/pvevent.c
> @@ -0,0 +1,115 @@
> +/*
> + *  pvevent.c - pvevent Device Support
> + *
> + *  Copyright (C) 2013 Fujitsu.
> + *
> + *  This program is free software; you can redistribute it and/or modify
> + *  it under the terms of the GNU General Public License as published by
> + *  the Free Software Foundation; either version 2 of the License, or
> + *  (at your option) any later version.
> + *
> + *  This program is distributed in the hope that it will be useful,
> + *  but WITHOUT ANY WARRANTY; without even the implied warranty of
> + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + *  GNU General Public License for more details.
> + *
> + *  You should have received a copy of the GNU General Public License
> + *  along with this program; if not, write to the Free Software
> + *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

Current address of FSF is:

51 Franklin Street, Fifth Floor
Boston, MA 02110-1301
USA

I'd use the web version recommended by FSF.

> + */
> +
> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +MODULE_AUTHOR("Hu Tao ");
> +MODULE_DESCRIPTION("pvevent device driver");
> +MODULE_LICENSE("GPL");
> +
> +static int pvevent_add(struct acpi_device *device);
> +static int pvevent_remove(struct acpi_device *device);
> +
> +static const struct acpi_device_id pvevent_device_ids[] = {
> +   { "MSFT0001", 0},
> +   { "", 0},
> +};
> +MODULE_DEVICE_TABLE(acpi, pvevent_device_ids);
> +
> +#define PVEVENT_PANICKED   (1 << 0)
> +
> +static acpi_handle handle;
> +
> +static struct acpi_driver pvevent_driver = {
> +   .name = "pvevent",
> +   .class ="QEMU",
> +   .ids =  pvevent_device_ids,
> +   .ops =  {
> +   .add =  pvevent_add,
> +   .remove =   pvevent_remove,
> +   },
> +   .owner =THIS_MODULE,
> +};
> +
> +static void
> +pvevent_send_event(unsigned int event)
> +{
> +   union acpi_object arg;
> +   struct acpi_object_list arg_list;
> +
> +   if (!handle)
> +   return;
> +
> +   arg.type = ACPI_TYPE_INTEGER;
> +   arg.integer.value = event;
> +
> +   arg_list.count = 1;
> +   arg_list.pointer = &arg;
> +
> +   acpi_evaluate_object(handle, "WRPT", &arg_list, NULL);
> +}
> +
> +static int
> +pvevent_panic_notify(struct notifier_block *nb, unsigned long code,
> +void *unused)
> +{
> +   pvevent_send_event(PVEVENT_PANICKED);
> +   return NOTIFY_DONE;
> +}
> +
> +static struct notifier_block pvevent_panic_nb = {
> +   .notifier_call = pvevent_panic_notify,
> +};
> +
> +static int pvevent_add(struct acpi_device *device)
> +{
> +   acpi_status status;
> +   u64 ret;
> +
> +   status = acpi_evaluate_integer(device->handle, "_STA", NULL,
> +  &ret);
> +
> +   if (ACPI_FAILURE(status) || !ret)
> +   return -ENODEV;
> +
> +   handle = device->handle;
> +   atomic_notifier_chain_register(&panic_

Re: [Qemu-devel] [PATCH] hw/vexpress: set default block type to SD

2013-03-17 Thread Aurelien Jarno
On Sun, Mar 17, 2013 at 07:51:00PM +0100, Paolo Bonzini wrote:
> Il 17/03/2013 01:04, Aurelien Jarno ha scritto:
> >   | boot | install build-dep | build lm-sensors |
> >   +--+---+--+
> >   versatile   | 1:09 | 2:27  | 4:09 |
> >   versatile cache=wb  | 1:08 | 2:25  | 3:44 |
> 
> Strange, cache=wb should be the default.

I agree it's strange, maybe there were other effects, but I am sure when
I did the tests cache=wb the build time was clearly shorter.

> The real problem with SD is that it doesn't have a flush command, hence
> it cannot be made safe against power losses.  But the solution is to add
> support for virtio-mmio in those boards.
> 
> Paolo
> 
> >   vexpress| 1:11 | 3:07  | 3:49 |
> >   vexpress cache=wb   | 1:07 | 3:07  | 3:47 |
> > 

Here the results are more consistent with the default being cache=wb.

I'll try to do the same tests on another machine to see if the results
are still the same.

-- 
Aurelien Jarno  GPG: 1024D/F1BCDB73
aurel...@aurel32.net http://www.aurel32.net



Re: [Qemu-devel] [PATCH] hw/vexpress: set default block type to SD

2013-03-17 Thread Aurelien Jarno
On Sun, Mar 17, 2013 at 06:54:47PM +, Peter Maydell wrote:
> On 17 March 2013 18:51, Paolo Bonzini  wrote:
> > Il 17/03/2013 01:04, Aurelien Jarno ha scritto:
> >>   | boot | install build-dep | build lm-sensors |
> >>   +--+---+--+
> >>   versatile   | 1:09 | 2:27  | 4:09 |
> >>   versatile cache=wb  | 1:08 | 2:25  | 3:44 |
> >
> > Strange, cache=wb should be the default.
> >
> > The real problem with SD is that it doesn't have a flush command, hence
> > it cannot be made safe against power losses.
> 
> So, two things here:
> (a) the way to make it safe against power loss is that when the
> guest writes a block of data it has to really hit the emulated
> disk, because this is how the hardware works
> (b) I thought this was the reason cache=wb wasn't the default (ie
> that we weren't defaulting to 'may lose data on powerloss'). At the
> time I last looked into command line options for these boards I'm
> pretty sure it wasn't the default, because the performance improvement
> from turning on caching was huge.
> 

For the record, this has been changed in this commit:

commit 1f212b9d3edd8679bafd3bcf0301795206438724
Author: Paolo Bonzini 
Date:   Thu Aug 9 16:07:21 2012 +0200

blockdev: flip default cache mode from writethrough to writeback

Now all major device models (IDE, SCSI, virtio) can choose between
writethrough and writeback at run-time, and virtio will even revert
to writethrough if the guest is not capable of sending flushes.  So
we can change the default to writeback at last.

Tested, for lack of a better idea, with a breakpoint on bdrv_open
and all cache choices one by one.

Signed-off-by: Paolo Bonzini 
Signed-off-by: Kevin Wolf 

-- 
Aurelien Jarno  GPG: 1024D/F1BCDB73
aurel...@aurel32.net http://www.aurel32.net



[Qemu-devel] [Bug 1156313] [NEW] X86-64 flags handling broken

2013-03-17 Thread Torbjorn Granlund
Public bug reported:

The current qemu sources cause improper handling of flags on x86-64.
This bug seems to have shown up a few weeks ago.

A plain install of Debian GNU/Linux makes user processes catch
spurious signals.  The kernel seems to run stably, though.

The ADX feature works very poorly.  It might be related; at least it
allows for reproducibly provoking invalid behaviour.

Here is a test case:


qemumain.c
#include 
long adx();
int
main ()
{
  printf ("%lx\n", adx (0xffbeef, 17));
  return 0;
}

qemuadx.s:
.globl  adx
adx:xor %rax, %rax
1:  dec %rdi
jnz 1b
.byte 0xf3, 0x48, 0x0f, 0x38, 0xf6, 0xc0# adox  %rax, %rax
.byte 0x66, 0x48, 0x0f, 0x38, 0xf6, 0xc0# adcx  %rax, %rax
ret


Compile and execute:
$ gcc -m64 qemumain.c qemuadx.s
$ a.out
ff8000378cd8

Expected output is simply "0".  The garbage value varies between qemu
compiles and guest systems.

Note that one needs a recent GNU assembler in order to handle adox and
adcx.  For convenience I have supplied them as byte sequences.

Explanation and feeble analysis:

The 0xffbeef argument is a loop count.  It is necessary to loop for a
while in order to trigger this bug.  If the loop count is decreased,
the bug will be seen intermittently; the lower the count, the less
frequent the invalid behaviour.

It seems like a reasonable assumption that this bug is related to
flags handling at context switch.  Presumably, qemu keeps flags state
in some internal format, then recomputes them when needing to form the
eflags register, as needed for example for context switching.

I haven't tried to reproduce this bug using qemu-x86_64 and SYSROOT,
but I strongly suspect that to be impossible.  I use
qemu-system-x86_64 and the guest Debian GNU/Linux x86_64 (version
6.0.6) .

The bug happens also with the guest FreeBSD x86_64 version 9.1.  (The
iteration count for triggering the problem 50% of the runs is not the
same when using the kernel Linux and FreeBSD's kernel, presumably due
to different ticks.)

The bug happens much more frequently for a loaded system; in fact, the
loop count can be radically decreased if two instances of the trigger
program are run in parallel.

** Affects: qemu
 Importance: Undecided
 Status: New

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1156313

Title:
  X86-64 flags handling broken

Status in QEMU:
  New

Bug description:
  The current qemu sources cause improper handling of flags on x86-64.
  This bug seems to have shown up a few weeks ago.

  A plain install of Debian GNU/Linux makes user processes catch
  spurious signals.  The kernel seems to run stably, though.

  The ADX feature works very poorly.  It might be related; at least it
  allows for reproducibly provoking invalid behaviour.

  Here is a test case:

  
  qemumain.c
  #include 
  long adx();
  int
  main ()
  {
printf ("%lx\n", adx (0xffbeef, 17));
return 0;
  }
  
  qemuadx.s:
  .globl  adx
  adx:xor %rax, %rax
  1:  dec %rdi
  jnz 1b
  .byte 0xf3, 0x48, 0x0f, 0x38, 0xf6, 0xc0# adox  %rax, %rax
  .byte 0x66, 0x48, 0x0f, 0x38, 0xf6, 0xc0# adcx  %rax, %rax
  ret
  

  Compile and execute:
  $ gcc -m64 qemumain.c qemuadx.s
  $ a.out
  ff8000378cd8

  Expected output is simply "0".  The garbage value varies between qemu
  compiles and guest systems.

  Note that one needs a recent GNU assembler in order to handle adox and
  adcx.  For convenience I have supplied them as byte sequences.

  Explanation and feeble analysis:

  The 0xffbeef argument is a loop count.  It is necessary to loop for a
  while in order to trigger this bug.  If the loop count is decreased,
  the bug will be seen intermittently; the lower the count, the less
  frequent the invalid behaviour.

  It seems like a reasonable assumption that this bug is related to
  flags handling at context switch.  Presumably, qemu keeps flags state
  in some internal format, then recomputes them when needing to form the
  eflags register, as needed for example for context switching.

  I haven't tried to reproduce this bug using qemu-x86_64 and SYSROOT,
  but I strongly suspect that to be impossible.  I use
  qemu-system-x86_64 and the guest Debian GNU/Linux x86_64 (version
  6.0.6) .

  The bug happens also with the guest FreeBSD x86_64 version 9.1.  (The
  iteration count for triggering the problem 50% of the runs is not the
  same when using the kernel Linux and FreeB

Re: [Qemu-devel] [PATCH 0/4] Fix JSON string formatter

2013-03-17 Thread Blue Swirl
On Thu, Mar 14, 2013 at 5:49 PM, Markus Armbruster  wrote:
> This should unbreak "make check" on machines where char is unsigned.
> Blue, please give it a whirl.

With the patches applied there are no errors, thanks.
Tested-by: Blue Swirl 

Though test-coroutine seems to hang, maybe fallout from recent
coroutine changes.

>
> The JSON parser is still as broken as ever.  Left for another day.
>
> Markus Armbruster (4):
>   unicode: New mod_utf8_codepoint()
>   check-qjson: Fix up a few bogus comments
>   check-qjson: Test noncharacters other than U+FFFE, U+FFFF in strings
>   qjson: to_json() case QTYPE_QSTRING is buggy, rewrite
>
>  include/qemu-common.h |   3 +
>  qobject/qjson.c   | 102 --
>  tests/check-qjson.c   | 280 
> +-
>  util/Makefile.objs|   1 +
>  util/unicode.c|  96 +
>  5 files changed, 306 insertions(+), 176 deletions(-)
>  create mode 100644 util/unicode.c
>
> --
> 1.7.11.7
>



[Qemu-devel] [Bug 1155403] Re: virtio cdrom detected as hard disk

2013-03-17 Thread Phillip Susi
It seems the installer does not consider virtio devices when doing its
search.  It also seems the installer does not have the virtio-scsi
module, and it seems a bit wasteful to go through a layer of scsi
emulation.

Shouldn't qemu at least warn you that the media= argument does nothing on
virtio devices instead of silently ignoring it?  Or better yet,
shouldn't it automatically use virtio-scsi instead?

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1155403

Title:
  virtio cdrom detected as hard disk

Status in QEMU:
  Invalid
Status in “qemu” package in Ubuntu:
  Invalid

Bug description:
  Trying to install Ubuntu or Debian using virtio to emulate the cdrom
  fails.  This appears to be due to the drive appearing to be a hard
  disk, rather than a cdrom, despite the media=cdrom argument to qemu.

  I'm not sure if this is a bug in qemu, or the kernel virtio driver?

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/1155403/+subscriptions



[Qemu-devel] [PATCH] Advertise --libdir in configure --help output

2013-03-17 Thread Doug Goldstein
The configure script allows you to supply a libdir via --libdir but was
not advertising this in --help.

Signed-off-by: Doug Goldstein 
CC: qemu-triv...@nongnu.org
---
 configure | 1 +
 1 file changed, 1 insertion(+)

diff --git a/configure b/configure
index 46a7594..497ce29 100755
--- a/configure
+++ b/configure
@@ -1050,6 +1050,7 @@ echo "  --mandir=PATHinstall man pages in 
PATH"
 echo "  --datadir=PATH   install firmware in PATH$confsuffix"
 echo "  --docdir=PATHinstall documentation in PATH$confsuffix"
 echo "  --bindir=PATHinstall binaries in PATH"
+echo "  --libdir=PATHinstall libraries in PATH"
 echo "  --sysconfdir=PATHinstall config in PATH$confsuffix"
 echo "  --localstatedir=PATH install local state in PATH"
 echo "  --with-confsuffix=SUFFIX suffix for QEMU data inside datadir and 
sysconfdir [$confsuffix]"
-- 
1.8.1.5




Re: [Qemu-devel] [PATCH v10 0/4] Moxie CPU port

2013-03-17 Thread Blue Swirl
On Sun, Mar 10, 2013 at 2:07 PM, Anthony Green  wrote:
> This version of the patch includes a bug fix and some formatting fixes
> identified by Blue Swirl here:
>
> http://lists.gnu.org/archive/html/qemu-devel/2013-03/msg01530.html
>
> Please consider applying this version of the patch series.

The series does not build due to recent CPU refactoring, please update.

>
> Thanks!
>
> AG
>
>
> Anthony Green (4):
>   Add moxie target code
>   Add moxie disassembler
>   Add sample moxie system
>   Add top level changes for moxie
>
>  MAINTAINERS   |   5 +
>  arch_init.c   |   2 +
>  configure |   9 +-
>  cpu-exec.c|   2 +
>  default-configs/moxie-softmmu.mak |   2 +
>  disas.c   |   6 +
>  disas/Makefile.objs   |   1 +
>  disas/moxie.c | 360 +++
>  hw/moxie/Makefile.objs|   6 +
>  hw/moxie/moxiesim.c   | 174 +++
>  include/disas/bfd.h   |   2 +
>  include/sysemu/arch_init.h|   1 +
>  qapi-schema.json  |   6 +-
>  target-moxie/Makefile.objs|   2 +
>  target-moxie/cpu.c| 172 +++
>  target-moxie/cpu.h| 169 +++
>  target-moxie/helper.c | 171 +++
>  target-moxie/helper.h |   9 +
>  target-moxie/machine.c|  28 ++
>  target-moxie/machine.h|   1 +
>  target-moxie/mmu.c|  36 ++
>  target-moxie/mmu.h|  19 +
>  target-moxie/translate.c  | 926 
> ++
>  23 files changed, 2105 insertions(+), 4 deletions(-)
>  create mode 100644 default-configs/moxie-softmmu.mak
>  create mode 100644 disas/moxie.c
>  create mode 100644 hw/moxie/Makefile.objs
>  create mode 100644 hw/moxie/moxiesim.c
>  create mode 100644 target-moxie/Makefile.objs
>  create mode 100644 target-moxie/cpu.c
>  create mode 100644 target-moxie/cpu.h
>  create mode 100644 target-moxie/helper.c
>  create mode 100644 target-moxie/helper.h
>  create mode 100644 target-moxie/machine.c
>  create mode 100644 target-moxie/machine.h
>  create mode 100644 target-moxie/mmu.c
>  create mode 100644 target-moxie/mmu.h
>  create mode 100644 target-moxie/translate.c
>
> --
> 1.8.1.4
>
>



Re: [Qemu-devel] [PULL 00/17] arm-devs queue

2013-03-17 Thread Blue Swirl
Thanks, pulled.

On Fri, Mar 15, 2013 at 4:56 PM, Peter Maydell  wrote:
> Another arm-devs pullreq: xilinx fixes from Peter C, the pl330
> model, and my vexpress fixes for the system control regs.
> Please pull.
>
> thanks
> -- PMM
>
>
> The following changes since commit dc0b0616f726956001be09e9a65a6e0b0bd939db:
>
>   Merge remote-tracking branch 'stefanha/block' into staging (2013-03-15 
> 10:47:21 -0500)
>
> are available in the git repository at:
>
>
>   git://git.linaro.org/people/pmaydell/qemu-arm.git arm-devs.next
>
> for you to fetch changes up to f8b9fe249a706bfed61e0bb66c73394553696382:
>
>   xilinx_spips: QOM styling fixes (2013-03-15 16:41:59 +)
>
> 
> Nathan Rossi (2):
>   xilinx_spips: Fix bus setup conditional check
>   xilinx_spips: Add missing dual-bus snoop commands
>
> Peter Crosthwaite (5):
>   iov: Factor out hexdumper
>   pl330: Initial version
>   xilinx_zynq: added pl330 to machine model
>   xilinx_spips: Set unused IRQs to NULL
>   xilinx_spips: QOM styling fixes
>
> Peter Maydell (10):
>   hw/vexpress: Pass proc_id via VEDBoardInfo
>   hw/arm_sysctl: Handle SYS_CFGCTRL in a more structured way
>   hw/arm_sysctl: Implement SYS_CFG_MUXFPGA writes as a no-op
>   hw/arm_sysctl: Implement SYS_CFG_DVIMODE as a no-op
>   hw/arm_sysctl: Convert from qdev init to instance_init
>   qdev: Implement (variable length) array properties
>   hw/arm_sysctl: Implement SYS_CFG_VOLT
>   hw/vexpress: Pass voltage sensor properties to sysctl device
>   hw/arm_sysctl: Implement SYS_CFG_OSC function
>   hw/vexpress: Set reset values for daughterboard oscillators
>
>  default-configs/arm-softmmu.mak |1 +
>  hw/Makefile.objs|1 +
>  hw/arm/vexpress.c   |   81 +-
>  hw/arm/xilinx_zynq.c|   24 +
>  hw/arm_sysctl.c |  261 +-
>  hw/pl330.c  | 1654 
> +++
>  hw/qdev-core.h  |3 +
>  hw/qdev-properties.c|  104 +++
>  hw/qdev-properties.h|   39 +
>  hw/xilinx_spips.c   |   64 +-
>  include/qemu-common.h   |6 +
>  util/Makefile.objs  |1 +
>  util/hexdump.c  |   37 +
>  util/iov.c  |   36 +-
>  14 files changed, 2234 insertions(+), 78 deletions(-)
>  create mode 100644 hw/pl330.c
>  create mode 100644 util/hexdump.c



Re: [Qemu-devel] [PATCH] s390: Fix cpu refactoring fallout.

2013-03-17 Thread Blue Swirl
Thanks, applied.

On Fri, Mar 15, 2013 at 9:57 AM, Cornelia Huck  wrote:
> Commit 259186a7 "cpu: Move halted and interrupt_request fields to CPUState"
> seems to have missed one instance in target-s390x/kvm.c:
>
> /home/cohuck/git/qemu/target-s390x/kvm.c: In function 
> ‘kvm_arch_process_async_events’:
> /home/cohuck/git/qemu/target-s390x/kvm.c:319: error: ‘CPUS390XState’ has no 
> member named ‘halted’
> /home/cohuck/git/qemu/target-s390x/kvm.c:320: warning: control reaches end of 
> non-void function
> make[1]: *** [target-s390x/kvm.o] Error 1
>
> Let's just switch to cs->halted.
>
> Signed-off-by: Cornelia Huck 
> ---
>  target-s390x/kvm.c | 3 +--
>  1 file changed, 1 insertion(+), 2 deletions(-)
>
> diff --git a/target-s390x/kvm.c b/target-s390x/kvm.c
> index 8f111ae..644f484 100644
> --- a/target-s390x/kvm.c
> +++ b/target-s390x/kvm.c
> @@ -315,8 +315,7 @@ void kvm_arch_post_run(CPUState *cpu, struct kvm_run *run)
>
>  int kvm_arch_process_async_events(CPUState *cs)
>  {
> -S390CPU *cpu = S390_CPU(cs);
> -return cpu->env.halted;
> +return cs->halted;
>  }
>
>  void kvm_s390_interrupt_internal(S390CPU *cpu, int type, uint32_t parm,
> --
> 1.7.12.4
>
>



[Qemu-devel] SSD Trim Support- Current State?

2013-03-17 Thread d hee
What is the current state of SSD Trim/Discard support for Qemu/KVM guests?

I run a Qemu/KVM host with these KVM guests...they are all Linux OSes and use
raw images: 1) 20 gig Linux web-server 2) Subsonic music streaming server for
my Android phone 3) Postfix server for automated emails 4) Linux guest for
sandbox for testing. All of this is on my own personal system as a hobby and not
a business "production" system. If I run them off an SSD drive without
trim support, will I see performance impacted as the guest blocks slowly fill
up without being marked for reuse?


Re: [Qemu-devel] TCG broken in system mode (was TCG assertion with qemu-system-mipsel)

2013-03-17 Thread Aurélien Jarno
On Wed, Mar 06, 2013 at 07:10:17AM +0100, Aurélien Jarno wrote:
> On Wed, Mar 06, 2013 at 11:05:15AM +0900, Yeongkyoon Lee wrote:
> > On 03/05/2013 11:18 PM, Aurélien Jarno wrote:
> > >On Mon, Mar 04, 2013 at 05:37:31PM +0100, Aurélien Jarno wrote:
> > >>Hi,
> > >>
> > >>On Sat, Feb 23, 2013 at 11:10:18PM +0100, Stefan Weil wrote:
> > >>>This assertion occurred with latest git master:
> > >>>
> > >>>qemu-system-mipsel: /src/qemu/tcg/tcg-op.h:2589:
> > >>>  tcg_gen_goto_tb: Assertion `(tcg_ctx.goto_tb_issue_mask & (1 << idx))
> > >>>== 0' failed.
> > >>>Aborted
> > >>>
> > >>>QEMU was built with --enable-debug and running a Debian MIPS Lenny (NFS
> > >>>root).
> > >>>The assertion happened when running "apt-get update" in the guest.
> > >>>
> > >>Is it something reproducible or more or less random? Have you Cc:ed
> > >>Richard because it's related to the latest patches?
> > >>
> > >>On my side I am experiencing random segfaults in various guests (at
> > >>least PowerPC, MIPS, SH4 and ARM). I have found a way to bisect it, even
> > >>if it is quite long (building Perl + the testsuite). Currently I know
> > >>that 1.3 is affected, while 1.2 is not.
> > >>
> > >I have found that the issue comes from the following commits, which
> > >unfortunately are not bisectable one by one (though it won't change the
> > >results a lot):
> > >
> > > commit b76f0d8c2e3eac94bc7fd90a510cb7426b2a2699
> > > Author: Yeongkyoon Lee 
> > > Date:   Wed Oct 31 16:04:25 2012 +0900
> > > tcg: Optimize qemu_ld/st by generating slow paths at the end of a 
> > > block
> > > Add optimized TCG qemu_ld/st generation which locates the code of 
> > > TLB miss
> > > cases at the end of a block after generating the other IRs.
> > > Currently, this optimization supports only i386 and x86_64 hosts.
> > > Signed-off-by: Yeongkyoon Lee 
> > > Signed-off-by: Blue Swirl 
> > > commit fdbb84d1332ae0827d60f1a2ca03c7d5678c6edd
> > > Author: Yeongkyoon Lee 
> > > Date:   Wed Oct 31 16:04:24 2012 +0900
> > > tcg: Add extended GETPC mechanism for MMU helpers with ldst 
> > > optimization
> > > Add GETPC_EXT which is used by MMU helpers to selectively 
> > > calculate the code
> > > address of accessing guest memory when called from a qemu_ld/st 
> > > optimized code
> > > or a C function. Currently, it supports only i386 and x86-64 
> > > hosts.
> > > Signed-off-by: Yeongkyoon Lee 
> > > Signed-off-by: Blue Swirl 
> > > commit 32761257c0b9fa7ee04d2871a6e48a41f119c469
> > > Author: Yeongkyoon Lee 
> > > Date:   Wed Oct 31 16:04:23 2012 +0900
> > > configure: Add CONFIG_QEMU_LDST_OPTIMIZATION for TCG qemu_ld/st 
> > > optimization
> > > Enable CONFIG_QEMU_LDST_OPTIMIZATION for TCG qemu_ld/st 
> > > optimization only when
> > > a host is i386 or x86_64.
> > > Signed-off-by: Yeongkyoon Lee 
> > > Signed-off-by: Blue Swirl 
> > >
> > >I will try to understand why.
> > >
> > >
> > 
> > Hi Aurélien,
> > Do you mean that those random segfaults occurred only when
> > configured with "--enable-debug"?
> > Although I cannot see how my commits affect debug built image at a
> > glance, I'll do double-check.
> > Thanks.
> 
> The problem is there even without configuring QEMU with --enable-debug.
> It just doesn't happen very often, and very randomly. The only way to
> reproduce it each time is to launch a big task in the guest (for me
> building Perl) and see if it completes or not. It can take up to one
> hour until it happens.
> 
> I should clarify that the segfault is on the guest side.
> 
> I have tried to look at your patches, and so far I haven't found the
> issue. It seems the two first patches are fine, ie I have verified the
> return address is always correctly computed.
> 

I still haven't found the issue, but on the other hand I can't find any
problem in your code, after reading it dozens of times. I also tried to
modify it as little as possible while issuing the slow path back inside
the TB and it fixes the problem. So it really looks to be due to
the slow path being at the end of the TB, and not to a bug in the code
generating it. After adding various checks, I am also convinced the
address computed in GETPC_EXT() is always correct. I have to say I am
running out of ideas.

One way to reproduce the issue more easily is to reduce the size of the
generated code buffer, for example by setting it to 512kB for both
MIN_CODE_GEN_BUFFER_SIZE and MAX_CODE_GEN_BUFFER_SIZE in
translate-all.c. That way booting an ARM guest triggers plenty of
segmentation faults or other strange issues with your patch but not
without.

OTOH increasing this size makes the issue almost disappear even when
building perl including the testsuite (for that it has to be at least
512MB).

-- 
Aurelien Jarno  GPG: 1024D/F1BCDB73
aurel...@aurel32.net http://www.aurel32.net



Re: [Qemu-devel] [PATCH v8 00/24] hw/arm: add Faraday A369 SoC platform support

2013-03-17 Thread Kuo-Jung Su
2013/3/15 Peter Maydell :
> On 15 March 2013 13:15, Kuo-Jung Su  wrote:
>>  default-configs/arm-softmmu.mak |1 +
>>  hw/Makefile.objs|2 +
>>  hw/arm/Makefile.objs|   21 ++
>>  hw/arm/faraday.h|   61 
>>  hw/arm/faraday_a369.c   |  174 ++
>>  hw/arm/faraday_a369_kpd.c   |  231 +
>>  hw/arm/faraday_a369_scu.c   |  182 ++
>>  hw/arm/faraday_a369_soc.c   |  342 +++
>>  hw/arm/ftahbc020.c  |  202 +++
>>  hw/arm/ftapbbrg020.c|  468 ++
>>  hw/arm/ftapbbrg020.h|   44 +++
>>  hw/arm/ftddrii030.c |  183 ++
>>  hw/arm/ftdmac020.c  |  595 
>>  hw/arm/ftdmac020.h  |  107 ++
>>  hw/arm/ftgmac100.c  |  708 
>> +++
>>  hw/arm/ftgmac100.h  |  239 +
>>  hw/arm/fti2c010.c   |  212 
>>  hw/arm/fti2c010.h   |   71 
>>  hw/arm/ftintc020.c  |  302 +
>>  hw/arm/ftintc020.h  |   57 
>>  hw/arm/ftkbc010.h   |   44 +++
>>  hw/arm/ftlcdc200.c  |  510 
>>  hw/arm/ftlcdc200.h  |  112 +++
>>  hw/arm/ftlcdc200_template.h |  439 
>>  hw/arm/ftmac110.c   |  661 
>>  hw/arm/ftmac110.h   |  168 ++
>>  hw/arm/ftnandc021.c |  516 
>>  hw/arm/ftnandc021.h |   84 +
>>  hw/arm/ftpwmtmr010.c|  258 ++
>>  hw/arm/ftpwmtmr010.h|   31 ++
>>  hw/arm/ftrtc011.c   |  383 +
>>  hw/arm/ftrtc011.h   |   53 +++
>>  hw/arm/ftsdc010.c   |  354 
>>  hw/arm/ftsdc010.h   |   90 +
>>  hw/arm/ftspi020.c   |  337 +++
>>  hw/arm/ftspi020.h   |   81 +
>>  hw/arm/ftssp010.c   |  494 +++
>>  hw/arm/ftssp010.h   |   98 ++
>>  hw/arm/fttmr010.c   |  445 
>>  hw/arm/fttmr010.h   |   41 +++
>>  hw/arm/fttsc010.c   |  260 ++
>>  hw/arm/fttsc010.h   |   39 +++
>>  hw/arm/ftwdt010.c   |  209 
>>  hw/arm/ftwdt010.h   |   35 ++
>
> I don't have time to do a full review right now, but a lot
> of these files are in the wrong place. If it looks like a
> device it doesn't belong in hw/arm (top level board models
> and things that directly reference the CPU only).
>
> -- PMM

Got it, thanks.

-- 
Best wishes,
Kuo-Jung Su



Re: [Qemu-devel] [PATCH v8 05/24] hw/arm: add FTDDRII030 DDRII controller support

2013-03-17 Thread Kuo-Jung Su
2013/3/16 Peter Crosthwaite :
> Hi Kuo-Jung,
>
> On Fri, Mar 15, 2013 at 11:13 PM, Kuo-Jung Su  wrote:
>> From: Kuo-Jung Su 
>>
>> The FTDDRII030 is a DDRII SDRAM controller which is responsible for
>> SDRAM initialization.
>> In QEMU we emulate only the SDRAM enable function.
>>
>> Signed-off-by: Kuo-Jung Su 
>> ---
>>  hw/arm/Makefile.objs  |1 +
>>  hw/arm/faraday_a369_soc.c |9 +++
>>  hw/arm/ftddrii030.c   |  183 
>> +
>>  3 files changed, 193 insertions(+)
>>  create mode 100644 hw/arm/ftddrii030.c
>>
>> diff --git a/hw/arm/Makefile.objs b/hw/arm/Makefile.objs
>> index af36b01..0bbf838 100644
>> --- a/hw/arm/Makefile.objs
>> +++ b/hw/arm/Makefile.objs
>> @@ -39,3 +39,4 @@ obj-y += faraday_a369.o faraday_a369_soc.o 
>> faraday_a369_scu.o \
>>  faraday_a369_kpd.o
>>  obj-y += ftintc020.o
>>  obj-y += ftahbc020.o
>> +obj-y += ftddrii030.o
>> diff --git a/hw/arm/faraday_a369_soc.c b/hw/arm/faraday_a369_soc.c
>> index 01b4395..e8a63bb 100644
>> --- a/hw/arm/faraday_a369_soc.c
>> +++ b/hw/arm/faraday_a369_soc.c
>> @@ -158,6 +158,15 @@ a369soc_device_init(FaradaySoCState *s)
>>  fprintf(stderr, "a369soc: Unable to set soc link for FTAHBC020\n");
>>  abort();
>>  }
>> +
>> +/* ftddrii030 */
>> +ds = sysbus_create_simple("ftddrii030", 0x9310, NULL);
>> +s->ddrc = ds;
>> +object_property_set_link(OBJECT(ds), OBJECT(s), "soc", &local_errp);
>> +if (local_errp) {
>> +fprintf(stderr, "a369soc: Unable to set soc link for FTDDRII030\n");
>> +abort();
>> +}
>>  }
>>
>>  static void a369soc_realize(DeviceState *dev, Error **errp)
>> diff --git a/hw/arm/ftddrii030.c b/hw/arm/ftddrii030.c
>> new file mode 100644
>> index 000..90a5842
>> --- /dev/null
>> +++ b/hw/arm/ftddrii030.c
>> @@ -0,0 +1,183 @@
>> +/*
>> + * Faraday DDRII controller
>> + *
>> + * Copyright (c) 2012 Faraday Technology
>> + * Written by Dante Su 
>> + *
>> + * This code is licensed under GNU GPL v2+
>> + */
>> +
>> +#include "hw/hw.h"
>> +#include "hw/sysbus.h"
>> +#include "hw/devices.h"
>> +#include "sysemu/sysemu.h"
>> +
>> +#include "faraday.h"
>> +
>> +#define REG_MCR 0x00/* memory configuration register */
>> +#define REG_MSR 0x04/* memory status register */
>> +#define REG_REVR0x50/* revision register */
>> +
>> +#define MSR_INIT_OK BIT(8)  /* DDR2 initial is completed */
>> +#define MSR_CMD_MRS BIT(0)  /* start MRS command */
>> +
>> +#define CFG_REGSIZE (0x50 / 4)
>> +
>> +#define TYPE_FTDDRII030 "ftddrii030"
>> +
>> +typedef struct Ftddrii030State {
>> +SysBusDevice busdev;
>> +MemoryRegion iomem;
>> +
>> +FaradaySoCState *soc;
>> +/* HW register cache */
>> +uint32_t regs[CFG_REGSIZE];
>> +} Ftddrii030State;
>> +
>> +#define FTDDRII030(obj) \
>> +OBJECT_CHECK(Ftddrii030State, obj, TYPE_FTDDRII030)
>> +
>> +#define DDR_REG32(s, off) \
>> +((s)->regs[(off) / 4])
>> +
>> +static uint64_t
>> +ftddrii030_mem_read(void *opaque, hwaddr addr, unsigned size)
>> +{
>> +Ftddrii030State *s = FTDDRII030(opaque);
>> +uint64_t ret = 0;
>> +
>> +if (s->soc->ddr_inited) {
>> +DDR_REG32(s, REG_MSR) |= MSR_INIT_OK;
>> +}
>> +
>> +switch (addr) {
>> +case REG_MCR ... (CFG_REGSIZE - 1) * 4:
>> +ret = s->regs[addr / 4];
>> +break;
>> +case REG_REVR:
>> +ret = 0x100;/* rev. = 0.1.0 */
>> +break;
>> +default:
>> +qemu_log_mask(LOG_GUEST_ERROR,
>> +"ftddrii030: undefined memory access@%#" HWADDR_PRIx "\n", 
>> addr);
>> +break;
>> +}
>> +
>> +return ret;
>> +}
>> +
>> +static void
>> +ftddrii030_mem_write(void *opaque, hwaddr addr, uint64_t val, unsigned size)
>> +{
>> +Ftddrii030State *s = FTDDRII030(opaque);
>> +
>> +switch (addr) {
>> +case REG_MCR:
>> +DDR_REG32(s, REG_MCR) = (uint32_t)val & 0x;
>> +break;
>> +case REG_MSR:
>> +val = (val & 0x3f) | (DDR_REG32(s, REG_MSR) & MSR_INIT_OK);
>> +if (!s->soc->ddr_inited && (val & MSR_CMD_MRS)) {
>> +val &= ~MSR_CMD_MRS;
>> +val |= MSR_INIT_OK;
>> +memory_region_add_subregion(s->soc->as,
>> +s->soc->ram_base,
>> +s->soc->ram);
>
> I feel like this is overstepping the bounds of the device. It's
> modifying the internals of the parent device (the SoC itself). AFAICT,
> this device does not need awareness of where the RAM is to live in the
> address map, that's the responsibility of the machine model. It might
> be cleaner to model the actual RAM as a second sysbus memory region
> then leave it up to the machine model to decide where in the address map
> it should live. This device just adds/removes the ram from the second
> region without knowing where it lives and the machine model maps the
> RAM to its actual

Re: [Qemu-devel] [PATCH v8 13/24] hw/arm: add FTI2C010 I2C controller support

2013-03-17 Thread Kuo-Jung Su
2013/3/16 Peter Crosthwaite :
> On Fri, Mar 15, 2013 at 11:15 PM, Kuo-Jung Su  wrote:
>> From: Kuo-Jung Su 
>>
>> The FTI2C010 is a simple I2C master controller.
>>
>> Signed-off-by: Kuo-Jung Su 
>> ---
>>  hw/arm/Makefile.objs  |1 +
>>  hw/arm/faraday_a369_soc.c |6 ++
>>  hw/arm/fti2c010.c |  212 
>> +
>>  hw/arm/fti2c010.h |   71 +++
>>  4 files changed, 290 insertions(+)
>>  create mode 100644 hw/arm/fti2c010.c
>>  create mode 100644 hw/arm/fti2c010.h
>>
>> diff --git a/hw/arm/Makefile.objs b/hw/arm/Makefile.objs
>> index 2622f3f..63ba519 100644
>> --- a/hw/arm/Makefile.objs
>> +++ b/hw/arm/Makefile.objs
>> @@ -46,3 +46,4 @@ obj-y += ftrtc011.o
>>  obj-y += ftdmac020.o
>>  obj-y += ftapbbrg020.o
>>  obj-y += ftnandc021.o
>> +obj-y += fti2c010.o
>> diff --git a/hw/arm/faraday_a369_soc.c b/hw/arm/faraday_a369_soc.c
>> index 0088915..89610d9 100644
>> --- a/hw/arm/faraday_a369_soc.c
>> +++ b/hw/arm/faraday_a369_soc.c
>> @@ -241,6 +241,12 @@ a369soc_device_init(FaradaySoCState *s)
>>  req = qdev_get_gpio_in(s->hdma[0], 15);
>>  qdev_connect_gpio_out(s->hdma[0], 15, ack);
>>  qdev_connect_gpio_out(ds, 0, req);
>> +
>> +/* fti2c010 */
>> +ds = sysbus_create_simple("fti2c010", 0x9290, s->pic[51]);
>> +s->i2c[0] = ds;
>> +ds = sysbus_create_simple("fti2c010", 0x92A0, s->pic[52]);
>> +s->i2c[1] = ds;
>>  }
>>
>>  static void a369soc_realize(DeviceState *dev, Error **errp)
>> diff --git a/hw/arm/fti2c010.c b/hw/arm/fti2c010.c
>> new file mode 100644
>> index 000..95f6a9d
>> --- /dev/null
>> +++ b/hw/arm/fti2c010.c
>> @@ -0,0 +1,212 @@
>> +/*
>> + * QEMU model of the FTI2C010 Controller
>> + *
>> + * Copyright (C) 2012 Faraday Technology
>> + * Written by Dante Su 
>> + *
>> + * This file is licensed under GNU GPL v2+.
>> + */
>> +
>> +#include "hw/sysbus.h"
>> +#include "hw/i2c.h"
>> +#include "sysemu/sysemu.h"
>> +
>> +#include "fti2c010.h"
>> +
>> +#define I2C_RD  1
>> +#define I2C_WR  0
>> +
>> +#define TYPE_FTI2C010   "fti2c010"
>> +
>> +typedef struct Fti2c010State {
>> +SysBusDevice busdev;
>> +MemoryRegion mmio;
>> +
>> +qemu_irq irq;
>> +i2c_bus *bus;
>> +
>> +uint8_t  recv;/* I2C RD = 1; I2C WR = 0 */
>> +uint8_t  addr;/* 7-bits device address */
>> +
>> +/* HW register cache */
>> +uint32_t cr;
>> +uint32_t sr;
>> +uint32_t cdr;
>> +uint32_t dr;
>> +uint32_t tgsr;
>> +} Fti2c010State;
>> +
>> +#define FTI2C010(obj) \
>> +OBJECT_CHECK(Fti2c010State, obj, TYPE_FTI2C010)
>> +
>> +static void
>> +fti2c010_update_irq(Fti2c010State *s)
>> +{
>> +uint32_t sr = extract32(s->sr, 4, 8);
>> +uint32_t cr = extract32(s->cr, 8, 8);
>> +qemu_set_irq(s->irq, (sr & cr) ? 1 : 0);
>> +}
>> +
>> +static uint64_t
>> +fti2c010_mem_read(void *opaque, hwaddr addr, unsigned size)
>> +{
>> +Fti2c010State *s = FTI2C010(opaque);
>> +uint32_t ret = 0;
>> +
>> +switch (addr) {
>> +case REG_CR:
>> +return s->cr;
>> +case REG_SR:
>> +ret = s->sr | (i2c_bus_busy(s->bus) ? SR_BB : 0);
>> +s->sr &= 0xf00f;/* clear RC status bits */
>> +fti2c010_update_irq(s);
>> +break;
>> +case REG_CDR:
>> +return s->cdr;
>> +case REG_DR:
>> +return s->dr;
>> +case REG_TGSR:
>> +return s->tgsr;
>> +case REG_BMR:
>> +return 0x0003;  /* Slave mode: SCL=1, SDA=1 */
>> +case REG_REVR:
>> +return 0x00011000;  /* REV. 1.10.0 */
>> +default:
>> +qemu_log_mask(LOG_GUEST_ERROR,
>> +"fti2c010: undefined memory access@%#" HWADDR_PRIx "\n", addr);
>> +break;
>> +}
>> +
>> +return ret;
>> +}
>> +
>> +static void
>> +fti2c010_mem_write(void *opaque, hwaddr addr, uint64_t val, unsigned size)
>> +{
>> +Fti2c010State *s = FTI2C010(opaque);
>> +
>> +switch (addr) {
>> +case REG_CR:
>> +s->cr = (uint32_t)val;
>> +if (s->cr & CR_I2CRST) {
>> +s->dr = 0;
>> +s->sr = 0;
>> +} else if ((s->cr & CR_MASTER_EN) && (s->cr & CR_TBEN)) {
>> +s->sr &= ~SR_ACK;
>> +if (s->cr & CR_START) {
>> +s->recv = (s->dr & I2C_RD) ? 1 : 0;
>> +s->addr = extract32(s->dr, 1, 7);
>> +if (!i2c_start_transfer(s->bus, s->addr, s->recv)) {
>
> This is the one and only use of s->addr AFAICT, and it's used
> immediately after unconditional assignment. You should drop addr
> completely and just inline to save on redundant device state (the
> desired information is in s->dr(8:1)).
>

Got it, thanks.

>> +s->sr |= SR_DT | SR_ACK;
>> +} else {
>> +s->sr &= ~SR_DT;
>> +}
>> +} else {
>> +if (s->recv) {
>> +s->dr = i2c_recv(s->bus);
>> +s->sr |= SR_DR;
>> +} else {

Re: [Qemu-devel] [PATCH v8 05/24] hw/arm: add FTDDRII030 DDRII controller support

2013-03-17 Thread Peter Crosthwaite
On Mon, Mar 18, 2013 at 11:12 AM, Kuo-Jung Su  wrote:
> 2013/3/16 Peter Crosthwaite :
>> Hi Kuo-Jung,
>>
>> On Fri, Mar 15, 2013 at 11:13 PM, Kuo-Jung Su  wrote:
>>> From: Kuo-Jung Su 
>>>
>>> The FTDDRII030 is a DDRII SDRAM controller which is responsible for
>>> SDRAM initialization.
>>> In QEMU we emulate only the SDRAM enable function.
>>>
>>> Signed-off-by: Kuo-Jung Su 
>>> ---
>>>  hw/arm/Makefile.objs  |1 +
>>>  hw/arm/faraday_a369_soc.c |9 +++
>>>  hw/arm/ftddrii030.c   |  183 
>>> +
>>>  3 files changed, 193 insertions(+)
>>>  create mode 100644 hw/arm/ftddrii030.c
>>>
>>> diff --git a/hw/arm/Makefile.objs b/hw/arm/Makefile.objs
>>> index af36b01..0bbf838 100644
>>> --- a/hw/arm/Makefile.objs
>>> +++ b/hw/arm/Makefile.objs
>>> @@ -39,3 +39,4 @@ obj-y += faraday_a369.o faraday_a369_soc.o 
>>> faraday_a369_scu.o \
>>>  faraday_a369_kpd.o
>>>  obj-y += ftintc020.o
>>>  obj-y += ftahbc020.o
>>> +obj-y += ftddrii030.o
>>> diff --git a/hw/arm/faraday_a369_soc.c b/hw/arm/faraday_a369_soc.c
>>> index 01b4395..e8a63bb 100644
>>> --- a/hw/arm/faraday_a369_soc.c
>>> +++ b/hw/arm/faraday_a369_soc.c
>>> @@ -158,6 +158,15 @@ a369soc_device_init(FaradaySoCState *s)
>>>  fprintf(stderr, "a369soc: Unable to set soc link for FTAHBC020\n");
>>>  abort();
>>>  }
>>> +
>>> +/* ftddrii030 */
>>> +ds = sysbus_create_simple("ftddrii030", 0x9310, NULL);
>>> +s->ddrc = ds;
>>> +object_property_set_link(OBJECT(ds), OBJECT(s), "soc", &local_errp);
>>> +if (local_errp) {
>>> +fprintf(stderr, "a369soc: Unable to set soc link for 
>>> FTDDRII030\n");
>>> +abort();
>>> +}
>>>  }
>>>
>>>  static void a369soc_realize(DeviceState *dev, Error **errp)
>>> diff --git a/hw/arm/ftddrii030.c b/hw/arm/ftddrii030.c
>>> new file mode 100644
>>> index 000..90a5842
>>> --- /dev/null
>>> +++ b/hw/arm/ftddrii030.c
>>> @@ -0,0 +1,183 @@
>>> +/*
>>> + * Faraday DDRII controller
>>> + *
>>> + * Copyright (c) 2012 Faraday Technology
>>> + * Written by Dante Su 
>>> + *
>>> + * This code is licensed under GNU GPL v2+
>>> + */
>>> +
>>> +#include "hw/hw.h"
>>> +#include "hw/sysbus.h"
>>> +#include "hw/devices.h"
>>> +#include "sysemu/sysemu.h"
>>> +
>>> +#include "faraday.h"
>>> +
>>> +#define REG_MCR 0x00/* memory configuration register */
>>> +#define REG_MSR 0x04/* memory status register */
>>> +#define REG_REVR0x50/* revision register */
>>> +
>>> +#define MSR_INIT_OK BIT(8)  /* DDR2 initial is completed */
>>> +#define MSR_CMD_MRS BIT(0)  /* start MRS command */
>>> +
>>> +#define CFG_REGSIZE (0x50 / 4)
>>> +
>>> +#define TYPE_FTDDRII030 "ftddrii030"
>>> +
>>> +typedef struct Ftddrii030State {
>>> +SysBusDevice busdev;
>>> +MemoryRegion iomem;
>>> +
>>> +FaradaySoCState *soc;
>>> +/* HW register cache */
>>> +uint32_t regs[CFG_REGSIZE];
>>> +} Ftddrii030State;
>>> +
>>> +#define FTDDRII030(obj) \
>>> +OBJECT_CHECK(Ftddrii030State, obj, TYPE_FTDDRII030)
>>> +
>>> +#define DDR_REG32(s, off) \
>>> +((s)->regs[(off) / 4])
>>> +
>>> +static uint64_t
>>> +ftddrii030_mem_read(void *opaque, hwaddr addr, unsigned size)
>>> +{
>>> +Ftddrii030State *s = FTDDRII030(opaque);
>>> +uint64_t ret = 0;
>>> +
>>> +if (s->soc->ddr_inited) {
>>> +DDR_REG32(s, REG_MSR) |= MSR_INIT_OK;
>>> +}
>>> +
>>> +switch (addr) {
>>> +case REG_MCR ... (CFG_REGSIZE - 1) * 4:
>>> +ret = s->regs[addr / 4];
>>> +break;
>>> +case REG_REVR:
>>> +ret = 0x100;/* rev. = 0.1.0 */
>>> +break;
>>> +default:
>>> +qemu_log_mask(LOG_GUEST_ERROR,
>>> +"ftddrii030: undefined memory access@%#" HWADDR_PRIx "\n", 
>>> addr);
>>> +break;
>>> +}
>>> +
>>> +return ret;
>>> +}
>>> +
>>> +static void
>>> +ftddrii030_mem_write(void *opaque, hwaddr addr, uint64_t val, unsigned 
>>> size)
>>> +{
>>> +Ftddrii030State *s = FTDDRII030(opaque);
>>> +
>>> +switch (addr) {
>>> +case REG_MCR:
>>> +DDR_REG32(s, REG_MCR) = (uint32_t)val & 0x;
>>> +break;
>>> +case REG_MSR:
>>> +val = (val & 0x3f) | (DDR_REG32(s, REG_MSR) & MSR_INIT_OK);
>>> +if (!s->soc->ddr_inited && (val & MSR_CMD_MRS)) {
>>> +val &= ~MSR_CMD_MRS;
>>> +val |= MSR_INIT_OK;
>>> +memory_region_add_subregion(s->soc->as,
>>> +s->soc->ram_base,
>>> +s->soc->ram);
>>
>> I feel like this is overstepping the bounds of the device. It's
>> modifying the internals of the parent device (the SoC itself). AFAICT,
>> this device does not need awareness of where the RAM is to live in the
>> address map; that's the responsibility of the machine model. It might
>> be cleaner to model the actual RAM as a second sysbus memory region
>> then leave it up th

Re: [Qemu-devel] [Qemu-ppc] [PATCH 5/5] pseries: Move XICS initialization before cpu initialization

2013-03-17 Thread David Gibson
On Fri, Mar 15, 2013 at 01:33:18PM +0100, Alexander Graf wrote:
> 
> On 14.03.2013, at 02:53, David Gibson wrote:
> 
> > Currently, the pseries machine initializes the cpus, then the XICS
> > interrupt controller.  However, to support the upcoming in-kernel XICS
> > implementation we will need to initialize the irq controller before the
> > vcpus.  This patch makes the necessary rearrangement.  This means the
> 
> We're changing that notion in the in-kernel XICS discussions.  The flow will 
> look like this:
> 
>   * create vcpus
>   * create XICS
>   * foreach (vcpu)
>   * enable_cap(vcpu, CAP_XICS_SERVER, xics_handle)
> 
> However, that means we still need to know the maximum number of
> supported vcpus during the create phase. That number can be bigger
> than smp_cpus though, since you probably want to support hotplug add
> of CPUs later on.
> 
> Can't we just make the number of supported "interrupt servers" a
> constant?

I suppose, but we need an allocation for each one, so it's a bit ugly.
In any case although the comment is a bit out of date, this patch also
creates a logical place to put per-cpu XICS initialization which we
will still need for the new interface.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: Digital signature


Re: [Qemu-devel] [Qemu-ppc] [PATCH 5/5] pseries: Move XICS initialization before cpu initialization

2013-03-17 Thread Alexander Graf

On 18.03.2013, at 03:55, David Gibson wrote:

> On Fri, Mar 15, 2013 at 01:33:18PM +0100, Alexander Graf wrote:
>> 
>> On 14.03.2013, at 02:53, David Gibson wrote:
>> 
>>> Currently, the pseries machine initializes the cpus, then the XICS
>>> interrupt controller.  However, to support the upcoming in-kernel XICS
>>> implementation we will need to initialize the irq controller before the
>>> vcpus.  This patch makes the necessary rearrangement.  This means the
>> 
>> We're changing that notion in the in-kernel XICS discussions.  The flow will 
>> look like this:
>> 
>>  * create vcpus
>>  * create XICS
>>  * foreach (vcpu)
>>  * enable_cap(vcpu, CAP_XICS_SERVER, xics_handle)
>> 
>> However, that means we still need to know the maximum number of
>> supported vcpus during the create phase. That number can be bigger
>> than smp_cpus though, since you probably want to support hotplug add
>> of CPUs later on.
>> 
>> Can't we just make the number of supported "interrupt servers" a
>> constant?
> 
> I suppose, but we need an allocation for each one, so it's a bit ugly.
> In any case although the comment is a bit out of date, this patch also
> creates a logical place to put per-cpu XICS initialization which we
> will still need for the new interface.

So how would you model CPU hotplug add?


Alex




[Qemu-devel] [RFC PATCH RDMA support v4: 05/10] reuse function for parsing the QMP 'migrate' string

2013-03-17 Thread mrhines
From: "Michael R. Hines" 


Signed-off-by: Michael R. Hines 
---
 include/qemu/sockets.h |1 +
 util/qemu-sockets.c|2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/qemu/sockets.h b/include/qemu/sockets.h
index ae5c21c..5066fca 100644
--- a/include/qemu/sockets.h
+++ b/include/qemu/sockets.h
@@ -48,6 +48,7 @@ typedef void NonBlockingConnectHandler(int fd, void *opaque);
 int inet_listen_opts(QemuOpts *opts, int port_offset, Error **errp);
 int inet_listen(const char *str, char *ostr, int olen,
 int socktype, int port_offset, Error **errp);
+InetSocketAddress *inet_parse(const char *str, Error **errp);
 int inet_connect_opts(QemuOpts *opts, Error **errp,
   NonBlockingConnectHandler *callback, void *opaque);
 int inet_connect(const char *str, Error **errp);
diff --git a/util/qemu-sockets.c b/util/qemu-sockets.c
index 83e4e08..6b60b63 100644
--- a/util/qemu-sockets.c
+++ b/util/qemu-sockets.c
@@ -485,7 +485,7 @@ err:
 }
 
 /* compatibility wrapper */
-static InetSocketAddress *inet_parse(const char *str, Error **errp)
+InetSocketAddress *inet_parse(const char *str, Error **errp)
 {
 InetSocketAddress *addr;
 const char *optstr, *h;
-- 
1.7.10.4




[Qemu-devel] [RFC PATCH RDMA support v4: 00/10] cleaner ramblocks and documentation

2013-03-17 Thread mrhines
From: "Michael R. Hines" 

Changes since v3:

- Compile-tested with and without --enable-rdma is working.
- Updated docs/rdma.txt (included below)
- Merged with latest pull queue from Paolo
- Implemented qemu_ram_foreach_block()

mrhines@mrhinesdev:~/qemu$ git diff --stat master
Makefile.objs |1 +
arch_init.c   |   28 +-
configure |   25 ++
docs/rdma.txt |  190 +++
exec.c|   21 ++
include/exec/cpu-common.h |6 +
include/migration/migration.h |3 +
include/migration/qemu-file.h |   10 +
include/migration/rdma.h  |  269 
include/qemu/sockets.h|1 +
migration-rdma.c  |  205 
migration.c   |   19 +-
rdma.c| 1511 
++
savevm.c  |  172 +-
util/qemu-sockets.c   |2 +-
15 files changed, 2445 insertions(+), 18 deletions(-)

QEMUFileRDMA:
==

QEMUFileRDMA introduces a couple of new functions:

1. qemu_rdma_get_buffer()  (QEMUFileOps rdma_read_ops)
2. qemu_rdma_put_buffer()  (QEMUFileOps rdma_write_ops)

These two functions provide an RDMA transport
(not a protocol) without changing the upper-level
users of QEMUFile that depend on a bytestream abstraction.

In order to provide the same bytestream interface 
for RDMA, we use SEND messages instead of sockets.
The operations themselves and the protocol built on 
top of QEMUFile used throughout the migration 
process do not change whatsoever.

An infiniband SEND message is the standard ibverbs
message used by applications of infiniband hardware.
The only difference between a SEND message and an RDMA
message is that SEND messages cause completion notifications
to be posted to the completion queue (CQ) on the 
infiniband receiver side, whereas RDMA messages (used
for pc.ram) do not (to behave like an actual DMA).

Messages in infiniband require two things:

1. registration of the memory that will be transmitted
2. (SEND only) work requests to be posted on both
   sides of the network before the actual transmission
   can occur.

RDMA messages are much easier to deal with. Once the memory
on the receiver side is registered and pinned, we're
basically done. All that is required is for the sender
side to start dumping bytes onto the link.

SEND messages require more coordination because the
receiver must have reserved space (using a receive
work request) on the receive queue (RQ) before QEMUFileRDMA
can start using them to carry all the bytes as
a transport for migration of device state.

After the initial connection setup (migration-rdma.c),
this coordination starts by having both sides post
a single work request to the RQ before any users
of QEMUFile are activated.

Once an initial receive work request is posted,
we have a put_buffer()/get_buffer() implementation
that looks like this:

Logically:

qemu_rdma_get_buffer():

1. A user on top of QEMUFile calls ops->get_buffer(),
   which calls us.
2. We transmit an empty SEND to let the sender know that 
   we are *ready* to receive some bytes from QEMUFileRDMA.
   These bytes will come in the form of another SEND.
3. Before attempting to receive that SEND, we post another
   RQ work request to replace the one we just used up.
4. Block on a CQ event channel and wait for the SEND
   to arrive.
5. When the send arrives, librdmacm will unblock us
   and we can consume the bytes (described later).
   
qemu_rdma_put_buffer(): 

1. A user on top of QEMUFile calls ops->put_buffer(),
   which calls us.
2. Block on the CQ event channel waiting for a SEND
   from the receiver to tell us that the receiver
   is *ready* for us to transmit some new bytes.
3. When the "ready" SEND arrives, librdmacm will 
   unblock us and we immediately post a RQ work request
   to replace the one we just used up.
4. Now, we can actually deliver the bytes that
   put_buffer() wants and return. 

NOTE: This entire sequence of events is designed this
way to mimic the operations of a bytestream and is not
typical of an infiniband application. (Something like MPI
would not 'ping-pong' messages like this and would not
block after every request, which would normally defeat
the purpose of using zero-copy infiniband in the first place).

Finally, how do we handoff the actual bytes to get_buffer()?

Again, because we're trying to "fake" a bytestream abstraction
using an analogy not unlike individual UDP frames, we have
to hold on to the bytes received from SEND in memory.

Each time we get to "Step 5" above for get_buffer(),
the bytes from SEND are copied into a local holding buffer.

Then, we return the number of bytes requested by get_buffer()
and leave the remaining bytes in the buffer until get_buffer()
comes around for another pass.

If the buffer is empty, then we follow the same steps
listed above for qem

[Qemu-devel] [RFC PATCH RDMA support v4: 07/10] connection-establishment for RDMA

2013-03-17 Thread mrhines
From: "Michael R. Hines" 


Signed-off-by: Michael R. Hines 
---
 migration-rdma.c |  205 ++
 1 file changed, 205 insertions(+)
 create mode 100644 migration-rdma.c

diff --git a/migration-rdma.c b/migration-rdma.c
new file mode 100644
index 000..e1ea055
--- /dev/null
+++ b/migration-rdma.c
@@ -0,0 +1,205 @@
+/*
+ *  Copyright (C) 2013 Michael R. Hines 
+ *  Copyright (C) 2013 Jiuxing Liu 
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; under version 2 of the License.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+#include "migration/rdma.h"
+#include "qemu-common.h"
+#include "migration/migration.h"
+#include "migration/qemu-file.h"
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+//#define DEBUG_MIGRATION_RDMA
+
+#ifdef DEBUG_MIGRATION_RDMA
+#define DPRINTF(fmt, ...) \
+do { printf("migration-rdma: " fmt, ## __VA_ARGS__); } while (0)
+#else
+#define DPRINTF(fmt, ...) \
+do { } while (0)
+#endif
+
+static int rdma_accept_incoming_migration(RDMAData *rdma, Error **errp)
+{
+int ret;
+
+ret = qemu_rdma_migrate_listen(rdma, rdma->host, rdma->port);
+if (ret) {
+qemu_rdma_print("rdma migration: error listening!");
+goto err_rdma_server_wait;
+}
+
+ret = qemu_rdma_alloc_qp(&rdma->rdma_ctx);
+if (ret) {
+qemu_rdma_print("rdma migration: error allocating qp!");
+goto err_rdma_server_wait;
+}
+
+ret = qemu_rdma_migrate_accept(&rdma->rdma_ctx, NULL, NULL, NULL, 0);
+if (ret) {
+qemu_rdma_print("rdma migration: error accepting connection!");
+goto err_rdma_server_wait;
+}
+
+ret = qemu_rdma_post_recv_qemu_file(rdma);
+if (ret) {
+qemu_rdma_print("rdma migration: error posting second qemu file 
recv!");
+goto err_rdma_server_wait;
+}
+
+ret = qemu_rdma_post_send_remote_info(rdma);
+if (ret) {
+qemu_rdma_print("rdma migration: error sending remote info!");
+goto err_rdma_server_wait;
+}
+
+ret = qemu_rdma_wait_for_wrid(rdma, RDMA_WRID_SEND_REMOTE_INFO);
+if (ret < 0) {
+qemu_rdma_print("rdma migration: polling remote info error!");
+goto err_rdma_server_wait;
+}
+
+rdma->total_bytes = 0;
+rdma->enabled = 1;
+qemu_rdma_dump_gid("server_connect", rdma->rdma_ctx.cm_id);
+return 0;
+
+err_rdma_server_wait:
+qemu_rdma_cleanup(rdma);
+return -1;
+
+}
+
+int rdma_start_incoming_migration(const char * host_port, Error **errp)
+{
+RDMAData *rdma = g_malloc0(sizeof(RDMAData));
+QEMUFile *f;
+int ret;
+
+if ((ret = qemu_rdma_data_init(rdma, host_port, errp)) < 0)
+return ret; 
+
+ret = qemu_rdma_server_init(rdma, NULL);
+
+DPRINTF("Starting RDMA-based incoming migration\n");
+
+if (!ret) {
+DPRINTF("qemu_rdma_server_init success\n");
+ret = qemu_rdma_server_prepare(rdma, NULL);
+
+if (!ret) {
+DPRINTF("qemu_rdma_server_prepare success\n");
+
+ret = rdma_accept_incoming_migration(rdma, NULL);
+if(!ret)
+DPRINTF("qemu_rdma_accept_incoming_migration success\n");
+f = qemu_fopen_rdma(rdma, "rb");
+if (f == NULL) {
+fprintf(stderr, "could not qemu_fopen RDMA\n");
+ret = -EIO;
+}
+
+process_incoming_migration(f);
+}
+}
+
+return ret;
+}
+
+void rdma_start_outgoing_migration(void *opaque, const char *host_port, Error 
**errp)
+{
+RDMAData *rdma = g_malloc0(sizeof(RDMAData));
+MigrationState *s = opaque;
+int ret;
+
+if (qemu_rdma_data_init(rdma, host_port, errp) < 0)
+return; 
+
+ret = qemu_rdma_client_init(rdma, NULL);
+if(!ret) {
+DPRINTF("qemu_rdma_client_init success\n");
+ret = qemu_rdma_client_connect(rdma, NULL);
+
+if(!ret) {
+s->file = qemu_fopen_rdma(rdma, "wb");
+DPRINTF("qemu_rdma_client_connect success\n");
+migrate_fd_connect(s);
+return;
+}
+}
+
+migrate_fd_error(s);
+}
+
+size_t save_rdma_page(QEMUFile *f, ram_addr_t block_offset, ram_addr_t offset, 
int cont, size_t size)
+{
+int ret;
+size_t bytes_sent = 0;
+ram_addr_t current_addr;
+RDMAData * rdma = migrate_use_rdma(f);
+
+current_addr = block_offset + offset;
+
+/*
+ * Add this page to the current 'chunk'. If the chunk
+ * is full, an actual

[Qemu-devel] [RFC PATCH RDMA support v4: 02/10] check for CONFIG_RDMA

2013-03-17 Thread mrhines
From: "Michael R. Hines" 

Make both rdma.c and migration-rdma.c conditionally built.

Signed-off-by: Michael R. Hines 
---
 Makefile.objs |1 +
 1 file changed, 1 insertion(+)

diff --git a/Makefile.objs b/Makefile.objs
index f99841c..d12208b 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -58,6 +58,7 @@ common-obj-$(CONFIG_POSIX) += os-posix.o
 common-obj-$(CONFIG_LINUX) += fsdev/
 
 common-obj-y += migration.o migration-tcp.o
+common-obj-$(CONFIG_RDMA) += migration-rdma.o rdma.o
 common-obj-y += qemu-char.o #aio.o
 common-obj-y += block-migration.o
 common-obj-y += page_cache.o xbzrle.o
-- 
1.7.10.4




[Qemu-devel] [RFC PATCH RDMA support v4: 01/10] ./configure --enable-rdma

2013-03-17 Thread mrhines
From: "Michael R. Hines" 


Signed-off-by: Michael R. Hines 
---
 configure |   25 +
 1 file changed, 25 insertions(+)

diff --git a/configure b/configure
index 46a7594..bdc6b13 100755
--- a/configure
+++ b/configure
@@ -170,6 +170,7 @@ xfs=""
 
 vhost_net="no"
 kvm="no"
+rdma="no"
 gprof="no"
 debug_tcg="no"
 debug="no"
@@ -904,6 +905,10 @@ for opt do
   ;;
   --enable-gtk) gtk="yes"
   ;;
+  --enable-rdma) rdma="yes"
+  ;;
+  --disable-rdma) rdma="no"
+  ;;
   --with-gtkabi=*) gtkabi="$optarg"
   ;;
   --enable-tpm) tpm="yes"
@@ -1104,6 +1109,8 @@ echo "  --enable-bluez   enable bluez stack 
connectivity"
 echo "  --disable-slirp  disable SLIRP userspace network connectivity"
 echo "  --disable-kvmdisable KVM acceleration support"
 echo "  --enable-kvm enable KVM acceleration support"
+echo "  --disable-rdma   disable RDMA-based migration support"
+echo "  --enable-rdmaenable RDMA-based migration support"
 echo "  --enable-tcg-interpreter enable TCG with bytecode interpreter (TCI)"
 echo "  --disable-nptl   disable usermode NPTL support"
 echo "  --enable-nptlenable usermode NPTL support"
@@ -1766,6 +1773,18 @@ EOF
   libs_softmmu="$sdl_libs $libs_softmmu"
 fi
 
+if test "$rdma" = "yes" ; then
+  cat > $TMPC <
+int main(void) { return 0; }
+EOF
+  rdma_libs="-lrdmacm -libverbs"
+  if ! compile_prog "" "$rdma_libs" ; then
+  feature_not_found "rdma"
+  fi
+
+fi
+
 ##
 # VNC TLS/WS detection
 if test "$vnc" = "yes" -a \( "$vnc_tls" != "no" -o "$vnc_ws" != "no" \) ; then
@@ -3412,6 +3431,7 @@ echo "Linux AIO support $linux_aio"
 echo "ATTR/XATTR support $attr"
 echo "Install blobs $blobs"
 echo "KVM support   $kvm"
+echo "RDMA support  $rdma"
 echo "TCG interpreter   $tcg_interpreter"
 echo "fdt support   $fdt"
 echo "preadv support$preadv"
@@ -4384,6 +4404,11 @@ if [ "$pixman" = "internal" ]; then
   echo "config-host.h: subdir-pixman" >> $config_host_mak
 fi
 
+if test "$rdma" = "yes" ; then
+echo "CONFIG_RDMA=y" >> $config_host_mak
+echo "LIBS+=$rdma_libs" >> $config_host_mak
+fi
+
 # build tree in object directory in case the source is not in the current 
directory
 DIRS="tests tests/tcg tests/tcg/cris tests/tcg/lm32"
 DIRS="$DIRS pc-bios/optionrom pc-bios/spapr-rtas"
-- 
1.7.10.4




[Qemu-devel] [RFC PATCH RDMA support v4: 09/10] check for QMP string and bypass nonblock() calls

2013-03-17 Thread mrhines
From: "Michael R. Hines" 

Since we're not using TCP anymore, we skip these calls.

Also print a little extra text while debugging, like "gbps"
which is helpful to know how the link is being utilized.

Signed-off-by: Michael R. Hines 
---
 include/migration/migration.h |3 +++
 migration.c   |   19 +--
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/include/migration/migration.h b/include/migration/migration.h
index bb617fd..88ab5f6 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -20,6 +20,7 @@
 #include "qemu/notify.h"
 #include "qapi/error.h"
 #include "migration/vmstate.h"
+#include "migration/rdma.h"
 #include "qapi-types.h"
 
 struct MigrationParams {
@@ -102,6 +103,7 @@ uint64_t xbzrle_mig_bytes_transferred(void);
 uint64_t xbzrle_mig_pages_transferred(void);
 uint64_t xbzrle_mig_pages_overflow(void);
 uint64_t xbzrle_mig_pages_cache_miss(void);
+uint64_t delta_norm_mig_bytes_transferred(void);
 
 /**
  * @migrate_add_blocker - prevent migration from proceeding
@@ -122,6 +124,7 @@ int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t 
*new_buf, int slen,
 int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen);
 
 int migrate_use_xbzrle(void);
+void *migrate_use_rdma(QEMUFile *f);
 int64_t migrate_xbzrle_cache_size(void);
 
 int64_t xbzrle_cache_resize(int64_t new_size);
diff --git a/migration.c b/migration.c
index 185d112..634437a 100644
--- a/migration.c
+++ b/migration.c
@@ -15,6 +15,7 @@
 
 #include "qemu-common.h"
 #include "migration/migration.h"
+#include "migration/rdma.h"
 #include "monitor/monitor.h"
 #include "migration/qemu-file.h"
 #include "sysemu/sysemu.h"
@@ -77,6 +78,8 @@ void qemu_start_incoming_migration(const char *uri, Error 
**errp)
 
 if (strstart(uri, "tcp:", &p))
 tcp_start_incoming_migration(p, errp);
+else if (strstart(uri, "rdma:", &p))
+rdma_start_incoming_migration(p, errp);
 #if !defined(WIN32)
 else if (strstart(uri, "exec:", &p))
 exec_start_incoming_migration(p, errp);
@@ -118,10 +121,11 @@ static void process_incoming_migration_co(void *opaque)
 void process_incoming_migration(QEMUFile *f)
 {
 Coroutine *co = qemu_coroutine_create(process_incoming_migration_co);
-int fd = qemu_get_fd(f);
-
-assert(fd != -1);
-socket_set_nonblock(fd);
+if(!migrate_use_rdma(f)) {
+int fd = qemu_get_fd(f);
+assert(fd != -1);
+socket_set_nonblock(fd);
+}
 qemu_coroutine_enter(co, f);
 }
 
@@ -404,6 +408,8 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk,
 
 if (strstart(uri, "tcp:", &p)) {
 tcp_start_outgoing_migration(s, p, &local_err);
+} else if (strstart(uri, "rdma:", &p)) {
+rdma_start_outgoing_migration(s, p, &local_err);
 #if !defined(WIN32)
 } else if (strstart(uri, "exec:", &p)) {
 exec_start_outgoing_migration(s, p, &local_err);
@@ -545,8 +551,9 @@ static void *migration_thread(void *opaque)
 max_size = bandwidth * migrate_max_downtime() / 100;
 
 DPRINTF("transferred %" PRIu64 " time_spent %" PRIu64
-" bandwidth %g max_size %" PRId64 "\n",
-transferred_bytes, time_spent, bandwidth, max_size);
+" bandwidth %g (%0.2f mbps) max_size %" PRId64 "\n",
+transferred_bytes, time_spent, 
+bandwidth, Gbps(transferred_bytes, time_spent), max_size);
 /* if we haven't sent anything, we don't want to recalculate
1 is a small enough number for our purposes */
 if (s->dirty_bytes_rate && transferred_bytes > 1) {
-- 
1.7.10.4




[Qemu-devel] [RFC PATCH RDMA support v4: 10/10] send pc.ram over RDMA

2013-03-17 Thread mrhines
From: "Michael R. Hines" 


Signed-off-by: Michael R. Hines 
---
 arch_init.c |   28 +++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/arch_init.c b/arch_init.c
index 98e2bc6..b013cc8 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -45,6 +45,7 @@
 #include "exec/address-spaces.h"
 #include "hw/pcspk.h"
 #include "migration/page_cache.h"
+#include "migration/rdma.h"
 #include "qemu/config-file.h"
 #include "qmp-commands.h"
 #include "trace.h"
@@ -225,6 +226,18 @@ static void acct_clear(void)
 memset(&acct_info, 0, sizeof(acct_info));
 }
 
+/*
+ * RDMA pc.ram doesn't go through QEMUFile directly,
+ * but still needs to be accounted for...
+ */
+uint64_t delta_norm_mig_bytes_transferred(void)
+{
+static uint64_t last_norm_pages = 0;
+uint64_t delta_bytes = (acct_info.norm_pages - last_norm_pages) * 
TARGET_PAGE_SIZE;
+last_norm_pages = acct_info.norm_pages; 
+return delta_bytes;
+}
+
 uint64_t dup_mig_bytes_transferred(void)
 {
 return acct_info.dup_pages * TARGET_PAGE_SIZE;
@@ -463,7 +476,11 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
 
 /* In doubt sent page as normal */
 bytes_sent = -1;
-if (is_dup_page(p)) {
+if (migrate_use_rdma(f)) {
+/* for now, mapping the page is slower than RDMA */
+acct_info.norm_pages++;
+bytes_sent = save_rdma_page(f, block->offset, offset, cont, 
TARGET_PAGE_SIZE);
+} else if (is_dup_page(p)) {
 acct_info.dup_pages++;
 bytes_sent = save_block_hdr(f, block, offset, cont,
 RAM_SAVE_FLAG_COMPRESS);
@@ -648,6 +665,15 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
 
 qemu_mutex_unlock_ramlist();
 
+/*
+ * Don't go to the next iteration without
+ * ensuring RDMA transfers have completed.
+ */
+if ((ret = qemu_drain(f)) < 0) {
+fprintf(stderr, "failed to drain RDMA first!\n");
+return ret;
+}
+
 if (ret < 0) {
 bytes_transferred += total_sent;
 return ret;
-- 
1.7.10.4




[Qemu-devel] [RFC PATCH RDMA support v4: 08/10] introduce QEMUFileRDMA

2013-03-17 Thread mrhines
From: "Michael R. Hines" 

This compiles with and without --enable-rdma.

Signed-off-by: Michael R. Hines 
---
 include/migration/qemu-file.h |   10 +++
 savevm.c  |  172 ++---
 2 files changed, 172 insertions(+), 10 deletions(-)

diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h
index df81261..9046751 100644
--- a/include/migration/qemu-file.h
+++ b/include/migration/qemu-file.h
@@ -51,23 +51,33 @@ typedef int (QEMUFileCloseFunc)(void *opaque);
  */
 typedef int (QEMUFileGetFD)(void *opaque);
 
+/* 
+ * 'drain' from a QEMUFile perspective means
+ * to flush the outbound send buffer
+ * (if one exists). (Only used by RDMA right now)
+ */
+typedef int (QEMUFileDrainFunc)(void *opaque);
+
 typedef struct QEMUFileOps {
 QEMUFilePutBufferFunc *put_buffer;
 QEMUFileGetBufferFunc *get_buffer;
 QEMUFileCloseFunc *close;
 QEMUFileGetFD *get_fd;
+QEMUFileDrainFunc *drain;
 } QEMUFileOps;
 
 QEMUFile *qemu_fopen_ops(void *opaque, const QEMUFileOps *ops);
 QEMUFile *qemu_fopen(const char *filename, const char *mode);
 QEMUFile *qemu_fdopen(int fd, const char *mode);
 QEMUFile *qemu_fopen_socket(int fd, const char *mode);
+QEMUFile *qemu_fopen_rdma(void *opaque, const char *mode);
 QEMUFile *qemu_popen_cmd(const char *command, const char *mode);
 int qemu_get_fd(QEMUFile *f);
 int qemu_fclose(QEMUFile *f);
 int64_t qemu_ftell(QEMUFile *f);
 void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, int size);
 void qemu_put_byte(QEMUFile *f, int v);
+int qemu_drain(QEMUFile *f);
 
 static inline void qemu_put_ubyte(QEMUFile *f, unsigned int v)
 {
diff --git a/savevm.c b/savevm.c
index 35c8d1e..9b90b7f 100644
--- a/savevm.c
+++ b/savevm.c
@@ -32,6 +32,7 @@
 #include "qemu/timer.h"
 #include "audio/audio.h"
 #include "migration/migration.h"
+#include "migration/rdma.h"
 #include "qemu/sockets.h"
 #include "qemu/queue.h"
 #include "sysemu/cpus.h"
@@ -143,6 +144,13 @@ typedef struct QEMUFileSocket
 QEMUFile *file;
 } QEMUFileSocket;
 
+typedef struct QEMUFileRDMA
+{
+void *rdma;
+size_t len;
+QEMUFile *file;
+} QEMUFileRDMA;
+
 typedef struct {
 Coroutine *co;
 int fd;
@@ -178,6 +186,66 @@ static int socket_get_fd(void *opaque)
 return s->fd;
 }
 
+/*
+ * SEND messages for non-live state only.
+ * pc.ram is handled elsewhere...
+ */
+static int qemu_rdma_put_buffer(void *opaque, const uint8_t *buf, int64_t pos, 
int size)
+{
+QEMUFileRDMA *r = opaque;
+size_t remaining = size;
+uint8_t * data = (void *) buf;
+
+/*
+ * Although we're sending non-live
+ * state here, push out any writes that
+ * we're queued up for pc.ram anyway.
+ */
+if (qemu_rdma_write_flush(r->rdma) < 0)
+return -EIO;
+
+while(remaining) {
+r->len = MIN(remaining, RDMA_SEND_INCREMENT);
+remaining -= r->len;
+
+if(qemu_rdma_exchange_send(r->rdma, data, r->len) < 0)
+return -EINVAL;
+
+data += r->len;
+}
+
+return size;
+} 
+
+/*
+ * RDMA links don't use bytestreams, so we have to
+ * return bytes to QEMUFile opportunistically.
+ */
+static int qemu_rdma_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int 
size)
+{
+QEMUFileRDMA *r = opaque;
+
+/*
+ * First, we hold on to the last SEND message we 
+ * were given and dish out the bytes until we run 
+ * out of bytes.
+ */
+if((r->len = qemu_rdma_fill(r->rdma, buf, size)))
+   return r->len; 
+
+ /*
+  * Once we run out, we block and wait for another
+  * SEND message to arrive.
+  */
+if(qemu_rdma_exchange_recv(r->rdma) < 0)
+   return -EINVAL;
+
+/*
+ * SEND was received with new bytes, now try again.
+ */
+return qemu_rdma_fill(r->rdma, buf, size);
+} 
+
 static int socket_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size)
 {
 QEMUFileSocket *s = opaque;
@@ -390,16 +458,24 @@ static const QEMUFileOps socket_write_ops = {
 .close =  socket_close
 };
 
-QEMUFile *qemu_fopen_socket(int fd, const char *mode)
+static bool qemu_mode_is_not_valid(const char * mode)
 {
-QEMUFileSocket *s = g_malloc0(sizeof(QEMUFileSocket));
-
 if (mode == NULL ||
 (mode[0] != 'r' && mode[0] != 'w') ||
 mode[1] != 'b' || mode[2] != 0) {
 fprintf(stderr, "qemu_fopen: Argument validity check failed\n");
-return NULL;
+return true;
 }
+
+return false;
+}
+
+QEMUFile *qemu_fopen_socket(int fd, const char *mode)
+{
+QEMUFileSocket *s = g_malloc0(sizeof(QEMUFileSocket));
+
+if(qemu_mode_is_not_valid(mode))
+   return NULL;
 
 s->fd = fd;
 if (mode[0] == 'w') {
@@ -411,16 +487,66 @@ QEMUFile *qemu_fopen_socket(int fd, const char *mode)
 return s->file;
 }
 
+static int qemu_rdma_close(void *opaque)
+{
+QEMUFileRDMA *r = opaque;
+if(r->rdma) {
+qemu_rdma_cleanup(r->rdma);
+g_free(r->rdma);
+}
+g_free(r

[Qemu-devel] [RFC PATCH RDMA support v4: 03/10] more verbose documentation of the RDMA transport

2013-03-17 Thread mrhines
From: "Michael R. Hines" 

This tries to cover all the questions I got the last time.

Please do tell me what is not clear, and I'll revise again.

Signed-off-by: Michael R. Hines 
---
 docs/rdma.txt |  208 +
 1 file changed, 208 insertions(+)
 create mode 100644 docs/rdma.txt

diff --git a/docs/rdma.txt b/docs/rdma.txt
new file mode 100644
index 000..2a48ab0
--- /dev/null
+++ b/docs/rdma.txt
@@ -0,0 +1,208 @@
+Changes since v3:
+
+- Compile-tested with and without --enable-rdma is working.
+- Updated docs/rdma.txt (included below)
+- Merged with latest pull queue from Paolo
+- Implemented qemu_ram_foreach_block()
+
+mrhines@mrhinesdev:~/qemu$ git diff --stat master
+Makefile.objs |1 +
+arch_init.c   |   28 +-
+configure |   25 ++
+docs/rdma.txt |  190 +++
+exec.c|   21 ++
+include/exec/cpu-common.h |6 +
+include/migration/migration.h |3 +
+include/migration/qemu-file.h |   10 +
+include/migration/rdma.h  |  269 
+include/qemu/sockets.h|1 +
+migration-rdma.c  |  205 
+migration.c   |   19 +-
+rdma.c| 1511 
++
+savevm.c  |  172 +-
+util/qemu-sockets.c   |2 +-
+15 files changed, 2445 insertions(+), 18 deletions(-)
+
+QEMUFileRDMA:
+==
+
+QEMUFileRDMA introduces a couple of new functions:
+
+1. qemu_rdma_get_buffer()  (QEMUFileOps rdma_read_ops)
+2. qemu_rdma_put_buffer()  (QEMUFileOps rdma_write_ops)
+
+These two functions provide an RDMA transport
+(not a protocol) without changing the upper-level
+users of QEMUFile that depend on a bytestream abstraction.
+
+In order to provide the same bytestream interface 
+for RDMA, we use SEND messages instead of sockets.
+The operations themselves and the protocol built on 
+top of QEMUFile used throughout the migration 
+process do not change whatsoever.
+
+An infiniband SEND message is the standard ibverbs
+message used by applications of infiniband hardware.
+The only difference between a SEND message and an RDMA
+message is that SEND messages cause completion notifications
+to be posted to the completion queue (CQ) on the 
+infiniband receiver side, whereas RDMA messages (used
+for pc.ram) do not (to behave like an actual DMA).
+
+Messages in infiniband require two things:
+
+1. registration of the memory that will be transmitted
+2. (SEND only) work requests to be posted on both
+   sides of the network before the actual transmission
+   can occur.
+
+RDMA messages are much easier to deal with. Once the memory
+on the receiver side is registered and pinned, we're
+basically done. All that is required is for the sender
+side to start dumping bytes onto the link.
+
+SEND messages require more coordination because the
+receiver must have reserved space (using a receive
+work request) on the receive queue (RQ) before QEMUFileRDMA
+can start using them to carry all the bytes as
+a transport for migration of device state.
+
+After the initial connection setup (migration-rdma.c),
+this coordination starts by having both sides post
+a single work request to the RQ before any users
+of QEMUFile are activated.
+
+Once an initial receive work request is posted,
+we have a put_buffer()/get_buffer() implementation
+that looks like this:
+
+Logically:
+
+qemu_rdma_get_buffer():
+
+1. A user on top of QEMUFile calls ops->get_buffer(),
+   which calls us.
+2. We transmit an empty SEND to let the sender know that 
+   we are *ready* to receive some bytes from QEMUFileRDMA.
+   These bytes will come in the form of another SEND.
+3. Before attempting to receive that SEND, we post another
+   RQ work request to replace the one we just used up.
+4. Block on a CQ event channel and wait for the SEND
+   to arrive.
+5. When the send arrives, librdmacm will unblock us
+   and we can consume the bytes (described later).
+   
+qemu_rdma_put_buffer(): 
+
+1. A user on top of QEMUFile calls ops->put_buffer(),
+   which calls us.
+2. Block on the CQ event channel waiting for a SEND
+   from the receiver to tell us that the receiver
+   is *ready* for us to transmit some new bytes.
+3. When the "ready" SEND arrives, librdmacm will 
+   unblock us and we immediately post a RQ work request
+   to replace the one we just used up.
+4. Now, we can actually deliver the bytes that
+   put_buffer() wants and return. 
+
+NOTE: This entire sequence of events is designed this
+way to mimic the operations of a bytestream and is not
+typical of an infiniband application. (Something like MPI
+would not 'ping-pong' messages like this and would not
+block after every request, which would normally defeat
+the purpose of using zero-copy infiniband in the first place).
+
+Finally, how do we han

[Qemu-devel] [RFC PATCH RDMA support v4: 04/10] iterators for getting the RAMBlocks

2013-03-17 Thread mrhines
From: "Michael R. Hines" 

This introduces:
1. qemu_ram_foreach_block
2. qemu_ram_count_blocks

Both used in communicating the RAMBlocks
to each side for later memory registration.

Signed-off-by: Michael R. Hines 
---
 exec.c|   21 +
 include/exec/cpu-common.h |6 ++
 2 files changed, 27 insertions(+)

diff --git a/exec.c b/exec.c
index 8a6aac3..a985da8 100644
--- a/exec.c
+++ b/exec.c
@@ -2629,3 +2629,24 @@ bool cpu_physical_memory_is_io(hwaddr phys_addr)
  memory_region_is_romd(section->mr));
 }
 #endif
+
+void qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
+{
+RAMBlock *block;
+
+QTAILQ_FOREACH(block, &ram_list.blocks, next) {
+func(block->host, block->offset, block->length, opaque);
+}
+}
+
+int qemu_ram_count_blocks(void)
+{
+RAMBlock *block;
+int total = 0;
+
+QTAILQ_FOREACH(block, &ram_list.blocks, next) {
+total++;
+}
+
+return total;
+}
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index 2e5f11f..aea3fe0 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -119,6 +119,12 @@ extern struct MemoryRegion io_mem_rom;
 extern struct MemoryRegion io_mem_unassigned;
 extern struct MemoryRegion io_mem_notdirty;
 
+typedef void  (RAMBlockIterFunc)(void *host_addr, 
+ram_addr_t offset, ram_addr_t length, void *opaque); 
+
+void qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque);
+int qemu_ram_count_blocks(void);
+
 #endif
 
 #endif /* !CPU_COMMON_H */
-- 
1.7.10.4




[Qemu-devel] [RFC PATCH RDMA support v4: 06/10] core RDMA migration code (rdma.c)

2013-03-17 Thread mrhines
From: "Michael R. Hines" 


Signed-off-by: Michael R. Hines 
---
 include/migration/rdma.h |  244 
 rdma.c   | 1532 ++
 2 files changed, 1776 insertions(+)
 create mode 100644 include/migration/rdma.h
 create mode 100644 rdma.c

diff --git a/include/migration/rdma.h b/include/migration/rdma.h
new file mode 100644
index 000..a6c521a
--- /dev/null
+++ b/include/migration/rdma.h
@@ -0,0 +1,244 @@
+/*
+ *  Copyright (C) 2013 Michael R. Hines 
+ *  Copyright (C) 2013 Jiuxing Liu 
+ *
+ *  RDMA data structures and helper functions (for migration)
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; under version 2 of the License.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, see .
+ */
+
+#ifndef _RDMA_H
+#define _RDMA_H
+
+#include "config-host.h"
+#ifdef CONFIG_RDMA 
+#include 
+#endif
+#include "monitor/monitor.h"
+#include "exec/cpu-common.h"
+#include "migration/migration.h"
+
+#define Gbps(bytes, ms) ((double) bytes * 8.0 / ((double) ms / 1000.0)) \
+/ 1000.0 / 1000.0
+#define qemu_rdma_print(msg) fprintf(stderr, msg "\n")
+//#define qemu_rdma_print(msg) error_setg(errp, msg)
+
+#define RDMA_CHUNK_REGISTRATION
+
+#define RDMA_LAZY_REGISTRATION
+
+#define RDMA_REG_CHUNK_SHIFT 20
+#define RDMA_REG_CHUNK_SIZE (1UL << (RDMA_REG_CHUNK_SHIFT))
+#define RDMA_REG_CHUNK_INDEX(start_addr, host_addr) \
+(((unsigned long)(host_addr) >> RDMA_REG_CHUNK_SHIFT) - \
+((unsigned long)(start_addr) >> RDMA_REG_CHUNK_SHIFT))
+#define RDMA_REG_NUM_CHUNKS(rdma_ram_block) \
+(RDMA_REG_CHUNK_INDEX((rdma_ram_block)->local_host_addr,\
+(rdma_ram_block)->local_host_addr +\
+(rdma_ram_block)->length) + 1)
+#define RDMA_REG_CHUNK_START(rdma_ram_block, i) ((uint8_t *)\
+unsigned long)((rdma_ram_block)->local_host_addr) >> \
+RDMA_REG_CHUNK_SHIFT) + (i)) << \
+RDMA_REG_CHUNK_SHIFT))
+#define RDMA_REG_CHUNK_END(rdma_ram_block, i) \
+(RDMA_REG_CHUNK_START(rdma_ram_block, i) + \
+ RDMA_REG_CHUNK_SIZE)
+
+/*
+ * This is only for non-live state being migrated.
+ * Instead of RDMA_WRITE messages, we use RDMA_SEND
+ * messages for that state, which requires a different
+ * delivery design than main memory.
+ */
+#define RDMA_SEND_INCREMENT 32768
+#define QEMU_FILE_RDMA_MAX (512 * 1024)
+
+#define RDMA_BLOCKING
+
+#ifdef CONFIG_RDMA
+enum {
+RDMA_WRID_NONE = 0,
+RDMA_WRID_RDMA,
+RDMA_WRID_SEND_REMOTE_INFO,
+RDMA_WRID_RECV_REMOTE_INFO,
+RDMA_WRID_SEND_QEMU_FILE = 1000,
+RDMA_WRID_RECV_QEMU_FILE = 2000,
+};
+
+typedef struct RDMAContext {
+/* cm_id also has ibv_conext, rdma_event_channel, and ibv_qp in
+   cm_id->verbs, cm_id->channel, and cm_id->qp. */
+struct rdma_cm_id *cm_id;
+struct rdma_cm_id *listen_id;
+
+struct ibv_context *verbs;
+struct rdma_event_channel *channel;
+struct ibv_qp *qp;
+
+struct ibv_comp_channel *comp_channel;
+struct ibv_pd *pd;
+struct ibv_cq *cq;
+} RDMAContext;
+
+typedef struct RDMALocalBlock {
+uint8_t *local_host_addr;
+uint64_t remote_host_addr;
+uint64_t offset;
+uint64_t length;
+struct ibv_mr **pmr;
+struct ibv_mr *mr;
+uint32_t remote_rkey;
+} RDMALocalBlock;
+
+typedef struct RDMARemoteBlock {
+uint64_t remote_host_addr;
+uint64_t offset;
+uint64_t length;
+uint32_t remote_rkey;
+} RDMARemoteBlock;
+
+typedef struct RDMALocalBlocks {
+int num_blocks;
+RDMALocalBlock *block;
+} RDMALocalBlocks;
+
+typedef struct RDMARemoteBlocks {
+int * num_blocks;
+RDMARemoteBlock *block;
+void * remote_info_area;
+int info_size;
+} RDMARemoteBlocks;
+
+typedef struct RDMAData {
+char *host;
+int port;
+int enabled;
+int gidx;
+union ibv_gid gid;
+uint8_t b;
+
+RDMAContext rdma_ctx;
+RDMALocalBlocks rdma_local_ram_blocks;
+
+/* This is used for synchronization: We use
+   IBV_WR_SEND to send it after all IBV_WR_RDMA_WRITEs
+   are done. When the receiver gets it, it can be certain
+   that all the RDMAs are completed. */
+int sync;
+struct ibv_mr *sync_mr;
+
+/* This is used for the server to write the remote
+   ram blocks info. */
+RDMARemoteBlocks remote_info;
+struct ibv_mr *remote_info_mr;
+
+/* This is used by the migration protocol to transmit
+ * device and CPU state that's not part of the

Re: [Qemu-devel] [Qemu-ppc] [PATCH 5/5] pseries: Move XICS initialization before cpu initialization

2013-03-17 Thread David Gibson
On Mon, Mar 18, 2013 at 04:12:11AM +0100, Alexander Graf wrote:
> 
> On 18.03.2013, at 03:55, David Gibson wrote:
> 
> > On Fri, Mar 15, 2013 at 01:33:18PM +0100, Alexander Graf wrote:
> >> 
> >> On 14.03.2013, at 02:53, David Gibson wrote:
> >> 
> >>> Currently, the pseries machine initializes the cpus, then the XICS
> >>> interrupt controller.  However, to support the upcoming in-kernel XICS
> >>> implementation we will need to initialize the irq controller before the
> >>> vcpus.  This patch makes the necesssary rearrangement.  This means the
> >> 
> >> We're changing that notion in the in-kernel XICS discussions.  The flow 
> >> will look like this:
> >> 
> >>  * create vcpus
> >>  * create XICS
> >>  * foreach (vcpu)
> >>  * enable_cap(vcpu, CAP_XICS_SERVER, xics_handle)
> >> 
> >> However, that means we still need to know the maximum number of
> >> supported vcpus during the create phase. That number can be bigger
> >> than smp_cpus though, since you probably want to support hotplug add
> >> of CPUs later on.
> >> 
> >> Can't we just make the number of supported "interrupt servers" a
> >> constant?
> > 
> > I suppose, but we need an allocation for each one, so its a bit ugly.
> > In any case although the comment is a bit out of date, this patch also
> > creates a logical place to put per-cpu XICS initialization which we
> > will still need for the new interface.
> 
> So how would you model CPU hotplug add?

Add headroom to the XICS setup based on whatever information we have
about maximum pluggable CPUs.  To use the PAPR hotplug interfaces we
already need a notion of max # CPUs, we can't have that just be open
ended.  We'd also add a call to xics_cpu_setup() to the hotplug add
path, obviously.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: Digital signature


[Qemu-devel] [PATCH v3 0/2] bridge helper: includedir conf arg

2013-03-17 Thread Doug Goldstein
The goal is to support an 'includedir' to include all files within a
directory specified in the bridge.conf file. The rationale is to allow
libvirt to be able to configure interfaces for use by unprivileged
users by just simply generating a new configuration file to the directory.

Change from v3:
- Integrated review changes from Blue Swirl

Change from v2:
- Integrated review changes from Corey Bryant
- Integrated review changes from Stefan Hajnoczi

Change from v1:
- Reversed patch order to make the series clearer
- Integrated review changes from Corey Bryant
- Integrated review changes from Stefan Hajnoczi

Doug Goldstein (2):
  bridge helper: unified error cleanup for parse_acl_file
  bridge helper: support conf dirs

 qemu-bridge-helper.c | 75 +++-
 1 file changed, 63 insertions(+), 12 deletions(-)

-- 
1.8.1.5




[Qemu-devel] [PATCH v3 1/2] bridge helper: unified error cleanup for parse_acl_file

2013-03-17 Thread Doug Goldstein
Handle errors and cleanup from the error in a unified place for
parse_acl_file().

Signed-off-by: Doug Goldstein 
Reviewed-by: Stefan Hajnoczi 
Reviewed-by: Corey Bryant 
CC: Anthony Liguori 
CC: Richa Marwaha 
CC: Corey Bryant 
TO: qemu-devel@nongnu.org
---
 qemu-bridge-helper.c | 28 
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/qemu-bridge-helper.c b/qemu-bridge-helper.c
index 287bfd5..95486e7 100644
--- a/qemu-bridge-helper.c
+++ b/qemu-bridge-helper.c
@@ -74,11 +74,12 @@ static int parse_acl_file(const char *filename, ACLList 
*acl_list)
 {
 FILE *f;
 char line[4096];
+int ret = -EINVAL;
 ACLRule *acl_rule;
 
 f = fopen(filename, "r");
 if (f == NULL) {
-return -1;
+return -errno;
 }
 
 while (fgets(line, sizeof(line), f) != NULL) {
@@ -102,9 +103,8 @@ static int parse_acl_file(const char *filename, ACLList 
*acl_list)
 
 if (arg == NULL) {
 fprintf(stderr, "Invalid config line:\n  %s\n", line);
-fclose(f);
-errno = EINVAL;
-return -1;
+ret = -EINVAL;
+goto failure;
 }
 
 *arg = 0;
@@ -142,15 +142,17 @@ static int parse_acl_file(const char *filename, ACLList 
*acl_list)
 parse_acl_file(arg, acl_list);
 } else {
 fprintf(stderr, "Unknown command `%s'\n", cmd);
-fclose(f);
-errno = EINVAL;
-return -1;
+ret = -EINVAL;
+goto failure;
 }
 }
 
+ret = 0;
+
+failure:
 fclose(f);
 
-return 0;
+return ret;
 }
 
 static bool has_vnet_hdr(int fd)
@@ -238,7 +240,7 @@ int main(int argc, char **argv)
 ACLRule *acl_rule;
 ACLList acl_list;
 int access_allowed, access_denied;
-int ret = EXIT_SUCCESS;
+int ret;
 
 #ifdef CONFIG_LIBCAP
 /* if we're run from an suid binary, immediately drop privileges preserving
@@ -272,9 +274,10 @@ int main(int argc, char **argv)
 
 /* parse default acl file */
 QSIMPLEQ_INIT(&acl_list);
-if (parse_acl_file(DEFAULT_ACL_FILE, &acl_list) == -1) {
-fprintf(stderr, "failed to parse default acl file `%s'\n",
-DEFAULT_ACL_FILE);
+ret = parse_acl_file(DEFAULT_ACL_FILE, &acl_list);
+if (ret < 0) {
+fprintf(stderr, "failed to parse default acl file `%s': %s\n",
+DEFAULT_ACL_FILE, strerror(ret));
 ret = EXIT_FAILURE;
 goto cleanup;
 }
@@ -416,6 +419,7 @@ int main(int argc, char **argv)
 /* ... */
 
 /* profit! */
+ret = EXIT_SUCCESS;
 
 cleanup:
 
-- 
1.8.1.5




[Qemu-devel] [PATCH v3 2/2] bridge helper: support conf dirs

2013-03-17 Thread Doug Goldstein
Allow the bridge helper to take a config directory rather than having to
specify every file in the directory manually via an include statement.

Signed-off-by: Doug Goldstein 
Reviewed-by: Stefan Hajnoczi 
Reviewed-by: Corey Bryant 
CC: Anthony Liguori 
CC: Richa Marwaha 
CC: Corey Bryant 
TO: qemu-devel@nongnu.org
---
 qemu-bridge-helper.c | 47 +++
 1 file changed, 47 insertions(+)

diff --git a/qemu-bridge-helper.c b/qemu-bridge-helper.c
index 95486e7..b647848 100644
--- a/qemu-bridge-helper.c
+++ b/qemu-bridge-helper.c
@@ -16,6 +16,7 @@
 #include "config-host.h"
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -70,12 +71,28 @@ static void usage(void)
 "Usage: qemu-bridge-helper [--use-vnet] --br=bridge 
--fd=unixfd\n");
 }
 
+static int filter_bridge_conf_dir(const struct dirent *entry)
+{
+ssize_t len = strlen(entry->d_name);
+
+/* We only want files ending in .conf */
+if (len > 5 &&
+strcmp(".conf", &entry->d_name[len-5]) == 0) {
+return 1;
+}
+
+return 0;
+}
+
 static int parse_acl_file(const char *filename, ACLList *acl_list)
 {
 FILE *f;
 char line[4096];
 int ret = -EINVAL;
 ACLRule *acl_rule;
+struct dirent **include_list = NULL;
+int i, include_count = 0;
+char *conf_file;
 
 f = fopen(filename, "r");
 if (f == NULL) {
@@ -137,6 +154,31 @@ static int parse_acl_file(const char *filename, ACLList 
*acl_list)
 snprintf(acl_rule->iface, IFNAMSIZ, "%s", arg);
 }
 QSIMPLEQ_INSERT_TAIL(acl_list, acl_rule, entry);
+} else if (strcmp(cmd, "includedir") == 0) {
+include_count = scandir(arg, &include_list,
+filter_bridge_conf_dir, alphasort);
+if (include_count < 0) {
+ret = -errno;
+fprintf(stderr, "Unable to retrieve conf files from '%s': 
%s\n",
+arg, strerror(errno));
+goto failure;
+}
+
+for (i = 0; i < include_count; i++) {
+conf_file = g_strdup_printf("%s/%s", arg,
+include_list[i]->d_name);
+
+/* ignore errors like 'include' cmd */
+parse_acl_file(conf_file, acl_list);
+
+g_free(conf_file);
+free(include_list[i]);
+include_list[i] = NULL;
+}
+free(include_list);
+include_list = NULL;
+include_count = 0;
+
 } else if (strcmp(cmd, "include") == 0) {
 /* ignore errors */
 parse_acl_file(arg, acl_list);
@@ -152,6 +194,11 @@ static int parse_acl_file(const char *filename, ACLList 
*acl_list)
 failure:
 fclose(f);
 
+for (i = 0; i < include_count; i++) {
+free(include_list[i]);
+}
+free(include_list);
+
 return ret;
 }
 
-- 
1.8.1.5




Re: [Qemu-devel] [PATCH][RFC 0/14] implement power chip

2013-03-17 Thread li guang
ping ...


在 2013-03-15五的 08:59 +0800,li guang写道:
> Hi, Anthony
> 
> Sorry to bother you,
> can you please help to see it this work is
> worth to go on, or ...
> 
> or someone else can help to see this patch-set.
> 
> Thanks All.
> 
> 
> 在 2013-03-13三的 16:01 +0800,liguang写道:
> > By now, all devices of QEMU do not have much more
> > power management consideration, for example, if
> > system do suspend, it will call all registered notifiers,
> > this was loosely required, and the code to do power management
> > state transition seems just do 'ugly emulation', rather than be
> > conscious with whole system devices, same condition with reset(it
> > has been embedded in DeviceClass, good!),
> > shutdown, in real world, commonly all devices' power are controlled
> > by a power chip, then all power sequence can be done just
> > issue commands to this chip.
> > so, I come across an idea to implement qdev'ed power device, and
> > make all qdev struct of devices aware of self power management(add
> > on/off/wakeup/suspend ... filed for DeviceClass), this will
> > bring tidy power management, and the emulation will more like what
> > happened in real world.
> > 
> > Of course, it's only a patch-set for RFC, I'd like to ask all 
> > developers to help correct this idea, if it's worth to implement, 
> > I'll go head to refactor more.
> > 
> > Li Guang (14)
> >  gitignore: ignore more files
> >  qdev: add power management method
> >  qdev: remove redundant abort()
> >  qdev: add power on/off/suspend/wakeup handler
> >  power: add power chip emulation
> >  sysemu: remove PowerReason in sysemu.h
> >  acpi: refactor acpi wakeup function
> >  ich9: make lpc's reset also do pm_reset
> >  ich9: do lpc's power on by reset function
> >  piix4: refactor piix4's power callbacks
> >  pckbd: refactor pckbd's power callbacks
> >  ps2: call ps2_{kbd,mouse}_reset in kbd_reset
> >  parallel: refactor parallel_reset function
> >  uhci: refactor uhci's power callbacks
> > 
> > .gitignore  |   3 +++
> > Makefile.objs   |   1 +
> > hw/acpi.c   |  20 +---
> > hw/acpi.h   |   3 ++-
> > hw/acpi_ich9.c  |   4 ++--
> > hw/ich9.h   |   1 +
> > hw/lpc_ich9.c   |  12 ++-
> > hw/parallel.c   |  10 ++
> > hw/pckbd.c  |  25 --
> > hw/piix4.c  |  14 --
> > hw/ps2.c|   8 
> > hw/ps2.h|   2 ++
> > hw/qdev-core.h  |  15 ++
> > hw/qdev.c   |  99 +--
> > hw/usb/hcd-uhci.c   |  10 ++
> > include/sysemu/sysemu.h |   7 +--
> > power.c | 133 
> > +
> > power.h |  41 +
> > 18 files changed, 365 insertions(+), 43 deletions(-)
> >  create mode 100644 power.c
> >  create mode 100644 power.h
> > 
> > 
> > 
> 





[Qemu-devel] [PATCH] sheepdog: show error message for halt status

2013-03-17 Thread Liu Yuan
From: Liu Yuan 

Sheepdog (in neither quorum nor unsafe mode) will refuse to serve IO requests
when the number of alive nodes is less than the number of copies specified by
the user. In that case it returns 0x19 to the QEMU client, which currently
doesn't recognize it.

This patch adds an error description for when the QEMU client receives it,
rather than plainly printing 'Invalid error code'.

Cc: MORITA Kazutaka 
Cc: Kevin Wolf 
Cc: Stefan Hajnoczi 
Signed-off-by: Liu Yuan 
---
 block/sheepdog.c |2 ++
 1 file changed, 2 insertions(+)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index 4245328..54d3e53 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -65,6 +65,7 @@
 #define SD_RES_WAIT_FOR_FORMAT  0x16 /* Waiting for a format operation */
 #define SD_RES_WAIT_FOR_JOIN0x17 /* Waiting for other nodes joining */
 #define SD_RES_JOIN_FAILED   0x18 /* Target node had failed to join sheepdog */
+#define SD_RES_HALT  0x19 /* Sheepdog is stopped serving IO request */
 
 /*
  * Object ID rules
@@ -344,6 +345,7 @@ static const char * sd_strerror(int err)
 {SD_RES_WAIT_FOR_FORMAT, "Sheepdog is waiting for a format operation"},
 {SD_RES_WAIT_FOR_JOIN, "Sheepdog is waiting for other nodes joining"},
 {SD_RES_JOIN_FAILED, "Target node had failed to join sheepdog"},
+{SD_RES_HALT, "Sheepdog is stopped serving IO request"},
 };
 
 for (i = 0; i < ARRAY_SIZE(errors); ++i) {
-- 
1.7.9.5




[Qemu-devel] [PATCH] HLFS driver for QEMU

2013-03-17 Thread harryxiyou
From: Harry Wei 

HLFS is an HDFS-based (Hadoop Distributed File System) Log-Structured File
System. Strictly speaking, HLFS is currently not a file system but a
block-storage system, in which we simplify LFS to fit block-level storage. So
you could also call HLFS HLBS (HDFS-based Log-Structured Block-storage
System). HLFS has two modes: local mode and HDFS mode. HDFS is write-once,
read-many, so HLFS implements LBS (Log-Structured Block-storage System) to
support random reads and writes. LBS is based on the basic theory of LFS but
differs from it in that LBS fits block storage better. See
http://code.google.com/p/cloudxy/wiki/WHAT_IS_CLOUDXY
for details about HLFS.

Currently, HLFS supports the following features:

1. Portions of POSIX --- only the interfaces that VM images need.
2. Random read/write.
3. Large file storage (TB).
4. Snapshots (linear snapshots and tree snapshots), clone,
block compression, cache, etc.
5. Multiple copies of the data.
6. The cluster can be expanded dynamically.
...

Signed-off-by: Harry Wei 

---
 block/Makefile.objs |2 +-
 block/hlfs.c|  515 +++
 configure   |   51 +
 3 files changed, 567 insertions(+), 1 deletion(-)
 create mode 100644 block/hlfs.c

diff --git a/block/Makefile.objs b/block/Makefile.objs
index c067f38..723c7a5 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -8,7 +8,7 @@ block-obj-$(CONFIG_POSIX) += raw-posix.o
 block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
 
 ifeq ($(CONFIG_POSIX),y)
-block-obj-y += nbd.o sheepdog.o
+block-obj-y += nbd.o sheepdog.o hlfs.o
 block-obj-$(CONFIG_LIBISCSI) += iscsi.o
 block-obj-$(CONFIG_CURL) += curl.o
 block-obj-$(CONFIG_RBD) += rbd.o
diff --git a/block/hlfs.c b/block/hlfs.c
new file mode 100644
index 000..331feae
--- /dev/null
+++ b/block/hlfs.c
@@ -0,0 +1,514 @@
+/*
+ * Block driver for HLFS(HDFS-based Log-structured File System)
+ *
+ * Copyright (c) 2013, Kang Hua 
+ * Copyright (c) 2013, Wang Sen 
+ * Copyright (c) 2013, Harry Wei 
+ *
+ * This program is free software. You can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * Reference:
+ * http://code.google.com/p/cloudxy
+ */
+
+#include "qemu-common.h"
+#include "qemu/error-report.h"
+#include "qemu/sockets.h"
+#include "block/block_int.h"
+#include "qemu/bitops.h"
+#include "api/hlfs.h"
+#include "storage_helper.h"
+#include "comm_define.h"
+#include "snapshot_helper.h"
+#include "address.h"
+
+#define DEBUG_HLBS
+#undef dprintf
+#ifdef DEBUG_HLBS
+#define dprintf(fmt, args...) \
+do {\
+fprintf(stdout, "%s %d: " fmt, __func__, __LINE__, ##args); \
+} while (0)
+#else
+#define dprintf(fmt, args...)
+#endif
+
+#define HLBS_MAX_VDI_SIZE (8192ULL*8192ULL*8192ULL*8192ULL)
+#define SECTOR_SIZE 512
+
+typedef struct BDRVHLBSState {
+struct hlfs_ctrl *hctrl;
+char *snapshot;
+char *uri;
+} BDRVHLBSState;
+
+/*
+ * Parse a filename.
+ *
+ * file name format must be one of the following:
+ *1. [vdiname]
+ *2. [vdiname]%[snapshot]
+ ** vdiname format --
+ ** local:///tmp/testenv/testfs
+ ** hdfs:///tmp/testenv/testfs
+ ** hdfs://localhost:8020/tmp/testenv/testfs
+ ** hdfs://localhost/tmp/testenv/testfs
+ ** hdfs://192.168.0.1:8020/tmp/testenv/testfs
+ */
+
+static int parse_vdiname(BDRVHLBSState *s, const char *filename, char *vdi,
+char *snapshot)
+{
+if (!filename) {
+return -1;
+}
+
+gchar **v = g_strsplit(filename, "%", 2);
+if (g_strv_length(v) == 1) {
+strcpy(vdi, v[0]);
+s->uri = g_strdup(vdi);
+} else if (g_strv_length(v) == 2) {
+strcpy(vdi, v[0]);
+strcpy(snapshot, v[1]);
+s->uri = g_strdup(vdi);
+s->snapshot = g_strdup(snapshot);
+} else {
+goto out;
+}
+
+return 0;
+out:
+g_strfreev(v);
+return -1;
+}
+
+static int hlbs_open(BlockDriverState *bs, const char *filename, int flags)
+{
+int ret = 0;
+BDRVHLBSState *s = bs->opaque;
+char vdi[256];
+char snapshot[HLFS_FILE_NAME_MAX];
+
+strstart(filename, "hlfs:", (const char **)&filename);
+memset(snapshot, 0, sizeof(snapshot));
+memset(vdi, 0, sizeof(vdi));
+
+if (parse_vdiname(s, filename, vdi, snapshot) < 0) {
+goto out;
+}
+
+HLFS_CTRL *ctrl = init_hlfs(vdi);
+if (strlen(snapshot)) {
+dprintf("snapshot:%s was open.\n", snapshot);
+ret = hlfs_open_by_snapshot(ctrl, snapshot, 1);
+} else {
+ret = hlfs_open(ctrl, 1);
+}
+g_assert(ret == 0);
+s->hctrl = ctrl;
+bs->total_sectors = ctrl->sb.max_fs_size * 1024 * 1024 / SECTOR_SIZE;
+return 0;
+out:
+if (s->hctrl) {
+hlfs_close(s->hctrl);
+deinit_hlfs(s->hctrl);
+}
+return -1;
+}
+
+static int hlbs_create(const char *filename, QEMUOptionParameter *options)
+{
+int64_t vdi_size = 0;
+char

Re: [Qemu-devel] [RFC] qmp interface for save vmstate to image

2013-03-17 Thread Wenchao Xia
于 2013-3-15 22:51, Stefan Hajnoczi 写道:
> On Fri, Mar 15, 2013 at 03:24:38PM +0800, Wenchao Xia wrote:
>>I'd like to add a new way to save vmstate, which will based on the
>> migration thread, but will write contents to block images, instead
>> of fd as stream. Following is the method to add API:
> 
> Hi Wenchao,
> What use cases are there besides saving vmstate to a raw image?
> 
> I'm curious if you're proposing this since there is no "file:" URI or
> because you really want to do things like saving vmstate into a qcow2
> file or over NBD.
> 
> Stefan
> 
Hi, Stefan
  The most common use cases would be "raw" and "qcow2"; this is flexible and
can be chosen by the user. In this way, existing block layer features in qemu
can be used, such as tagging zeros. I haven't checked the buffer/cache status
in the qemu block layer, but if there is one, it can also benefit.
  The user can specify "raw" or "qcow2" according to the host configuration. If
there are dedicated storage components underneath, they can use "raw" to
skip qemu's block layer.

-- 
Best Regards

Wenchao Xia




[Qemu-devel] [PATCH] HLFS driver for QEMU

2013-03-17 Thread harryxiyou
From: Harry Wei 

HLFS is an HDFS-based (Hadoop Distributed File System) Log-Structured File
System. Strictly speaking, HLFS is currently not a file system but a
block-storage system, in which we simplify LFS to fit block-level storage. So
you could also call HLFS HLBS (HDFS-based Log-Structured Block-storage
System). HLFS has two modes: local mode and HDFS mode. HDFS is write-once,
read-many, so HLFS implements LBS (Log-Structured Block-storage System) to
support random reads and writes. LBS is based on the basic theory of LFS but
differs from it in that LBS fits block storage better. See
http://code.google.com/p/cloudxy/wiki/WHAT_IS_CLOUDXY
for details about HLFS.

Currently, HLFS supports the following features:

1. Portions of POSIX --- only the interfaces that VM images need.
2. Random read/write.
3. Large file storage (TB).
4. Snapshots (linear snapshots and tree snapshots), clone,
block compression, cache, etc.
5. Multiple copies of the data.
6. The cluster can be expanded dynamically.
...

Signed-off-by: Harry Wei 

---
 block/Makefile.objs |2 +-
 block/hlfs.c|  515 +++
 configure   |   51 +
 3 files changed, 567 insertions(+), 1 deletion(-)
 create mode 100644 block/hlfs.c

diff --git a/block/Makefile.objs b/block/Makefile.objs
index c067f38..723c7a5 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -8,7 +8,7 @@ block-obj-$(CONFIG_POSIX) += raw-posix.o
 block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
 
 ifeq ($(CONFIG_POSIX),y)
-block-obj-y += nbd.o sheepdog.o
+block-obj-y += nbd.o sheepdog.o hlfs.o
 block-obj-$(CONFIG_LIBISCSI) += iscsi.o
 block-obj-$(CONFIG_CURL) += curl.o
 block-obj-$(CONFIG_RBD) += rbd.o
diff --git a/block/hlfs.c b/block/hlfs.c
new file mode 100644
index 000..331feae
--- /dev/null
+++ b/block/hlfs.c
@@ -0,0 +1,514 @@
+/*
+ * Block driver for HLFS(HDFS-based Log-structured File System)
+ *
+ * Copyright (c) 2013, Kang Hua 
+ * Copyright (c) 2013, Wang Sen 
+ * Copyright (c) 2013, Harry Wei 
+ *
+ * This program is free software. You can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * Reference:
+ * http://code.google.com/p/cloudxy
+ */
+
+#include "qemu-common.h"
+#include "qemu/error-report.h"
+#include "qemu/sockets.h"
+#include "block/block_int.h"
+#include "qemu/bitops.h"
+#include "api/hlfs.h"
+#include "storage_helper.h"
+#include "comm_define.h"
+#include "snapshot_helper.h"
+#include "address.h"
+
+#define DEBUG_HLBS
+#undef dprintf
+#ifdef DEBUG_HLBS
+#define dprintf(fmt, args...) \
+do {\
+fprintf(stdout, "%s %d: " fmt, __func__, __LINE__, ##args); \
+} while (0)
+#else
+#define dprintf(fmt, args...)
+#endif
+
+#define HLBS_MAX_VDI_SIZE (8192ULL*8192ULL*8192ULL*8192ULL)
+#define SECTOR_SIZE 512
+
+typedef struct BDRVHLBSState {
+struct hlfs_ctrl *hctrl;
+char *snapshot;
+char *uri;
+} BDRVHLBSState;
+
+/*
+ * Parse a filename.
+ *
+ * file name format must be one of the following:
+ *1. [vdiname]
+ *2. [vdiname]%[snapshot]
+ ** vdiname format --
+ ** local:///tmp/testenv/testfs
+ ** hdfs:///tmp/testenv/testfs
+ ** hdfs://localhost:8020/tmp/testenv/testfs
+ ** hdfs://localhost/tmp/testenv/testfs
+ ** hdfs://192.168.0.1:8020/tmp/testenv/testfs
+ */
+
+static int parse_vdiname(BDRVHLBSState *s, const char *filename, char *vdi,
+char *snapshot)
+{
+if (!filename) {
+return -1;
+}
+
+gchar **v = g_strsplit(filename, "%", 2);
+if (g_strv_length(v) == 1) {
+strcpy(vdi, v[0]);
+s->uri = g_strdup(vdi);
+} else if (g_strv_length(v) == 2) {
+strcpy(vdi, v[0]);
+strcpy(snapshot, v[1]);
+s->uri = g_strdup(vdi);
+s->snapshot = g_strdup(snapshot);
+} else {
+goto out;
+}
+
+return 0;
+out:
+g_strfreev(v);
+return -1;
+}
+
+static int hlbs_open(BlockDriverState *bs, const char *filename, int flags)
+{
+int ret = 0;
+BDRVHLBSState *s = bs->opaque;
+char vdi[256];
+char snapshot[HLFS_FILE_NAME_MAX];
+
+strstart(filename, "hlfs:", (const char **)&filename);
+memset(snapshot, 0, sizeof(snapshot));
+memset(vdi, 0, sizeof(vdi));
+
+if (parse_vdiname(s, filename, vdi, snapshot) < 0) {
+goto out;
+}
+
+HLFS_CTRL *ctrl = init_hlfs(vdi);
+if (strlen(snapshot)) {
+dprintf("snapshot:%s was open.\n", snapshot);
+ret = hlfs_open_by_snapshot(ctrl, snapshot, 1);
+} else {
+ret = hlfs_open(ctrl, 1);
+}
+g_assert(ret == 0);
+s->hctrl = ctrl;
+bs->total_sectors = ctrl->sb.max_fs_size * 1024 * 1024 / SECTOR_SIZE;
+return 0;
+out:
+if (s->hctrl) {
+hlfs_close(s->hctrl);
+deinit_hlfs(s->hctrl);
+}
+return -1;
+}
+
+static int hlbs_create(const char *filename, QEMUOptionParameter *options)
+{
+int64_t vdi_size = 0;
+char