from:"Cathy Avery"

[PATCH] scsi: storvsc: Allow only one remove lun work item to be issued per lun

2017-10-17 Thread Cathy Avery

When running multipath on a VM if all available paths go down
the driver can schedule large amounts of storvsc_remove_lun
work items to the same lun. In response to the failing paths
typically storvsc responds by taking host->scan_mutex and issuing
a TUR per lun. If there has been heavy IO to the failed device
all the failed IOs are returned from the host. A remove lun work
item is issued per failed IO. If the outstanding TURs have not been
completed in a timely manner the scan_mutex is never released or
released too late. Consequently the many remove lun work items are
not completed as scsi_remove_device also tries to take host->scan_mutex.
This results in dragging the VM down and sometimes completely.

This patch only allows one remove lun to be issued to a particular
lun while it is an instantiated member of the scsi stack.

Changes since v1:

Use single threaded workqueue to serialize work in
storvsc_handle_error [Christoph Hellwig]

Signed-off-by: Cathy Avery 
---
 drivers/scsi/storvsc_drv.c | 27 ++-
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index 5e7200f..6febcdb 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -486,6 +486,8 @@ struct hv_host_device {
unsigned int port;
unsigned char path;
unsigned char target;
+   struct workqueue_struct *handle_error_wq;
+   char work_q_name[20];
 };
 
 struct storvsc_scan_work {
@@ -922,6 +924,7 @@ static void storvsc_handle_error(struct vmscsi_request 
*vm_srb,
 {
struct storvsc_scan_work *wrk;
void (*process_err_fn)(struct work_struct *work);
+   struct hv_host_device *host_dev = shost_priv(host);
bool do_work = false;
 
switch (SRB_STATUS(vm_srb->srb_status)) {
@@ -988,7 +991,7 @@ static void storvsc_handle_error(struct vmscsi_request 
*vm_srb,
wrk->lun = vm_srb->lun;
wrk->tgt_id = vm_srb->target_id;
INIT_WORK(&wrk->work, process_err_fn);
-   schedule_work(&wrk->work);
+   queue_work(host_dev->handle_error_wq, &wrk->work);
 }
 
 
@@ -1803,10 +1806,19 @@ static int storvsc_probe(struct hv_device *device,
if (stor_device->num_sc != 0)
host->nr_hw_queues = stor_device->num_sc + 1;
 
+   /*
+* Set the error handler work queue.
+*/
+   snprintf(host_dev->work_q_name, sizeof(host_dev->work_q_name),
+"storvsc_error_wq_%d", host->host_no);
+   host_dev->handle_error_wq =
+   create_singlethread_workqueue(host_dev->work_q_name);
+   if (!host_dev->handle_error_wq)
+   goto err_out2;
/* Register the HBA and start the scsi bus scan */
ret = scsi_add_host(host, &device->device);
if (ret != 0)
-   goto err_out2;
+   goto err_out3;
 
if (!dev_is_ide) {
scsi_scan_host(host);
@@ -1815,7 +1827,7 @@ static int storvsc_probe(struct hv_device *device,
 device->dev_instance.b[4]);
ret = scsi_add_device(host, 0, target, 0);
if (ret)
-   goto err_out3;
+   goto err_out4;
}
 #if IS_ENABLED(CONFIG_SCSI_FC_ATTRS)
if (host->transportt == fc_transport_template) {
@@ -1827,14 +1839,17 @@ static int storvsc_probe(struct hv_device *device,
fc_host_port_name(host) = stor_device->port_name;
stor_device->rport = fc_remote_port_add(host, 0, &ids);
if (!stor_device->rport)
-   goto err_out3;
+   goto err_out4;
}
 #endif
return 0;
 
-err_out3:
+err_out4:
scsi_remove_host(host);
 
+err_out3:
+   destroy_workqueue(host_dev->handle_error_wq);
+
 err_out2:
/*
 * Once we have connected with the host, we would need to
@@ -1858,6 +1873,7 @@ static int storvsc_remove(struct hv_device *dev)
 {
struct storvsc_device *stor_device = hv_get_drvdata(dev);
struct Scsi_Host *host = stor_device->host;
+   struct hv_host_device *host_dev = shost_priv(host);
 
 #if IS_ENABLED(CONFIG_SCSI_FC_ATTRS)
if (host->transportt == fc_transport_template) {
@@ -1865,6 +1881,7 @@ static int storvsc_remove(struct hv_device *dev)
fc_remove_host(host);
}
 #endif
+   destroy_workqueue(host_dev->handle_error_wq);
scsi_remove_host(host);
storvsc_dev_remove(dev);
scsi_host_put(host);
-- 
2.5.0

[PATCH V2] scsi: storvsc: Allow only one remove lun work item to be issued per lun

2017-10-17 Thread Cathy Avery

When running multipath on a VM if all available paths go down
the driver can schedule large amounts of storvsc_remove_lun
work items to the same lun. In response to the failing paths
typically storvsc responds by taking host->scan_mutex and issuing
a TUR per lun. If there has been heavy IO to the failed device
all the failed IOs are returned from the host. A remove lun work
item is issued per failed IO. If the outstanding TURs have not been
completed in a timely manner the scan_mutex is never released or
released too late. Consequently the many remove lun work items are
not completed as scsi_remove_device also tries to take host->scan_mutex.
This results in dragging the VM down and sometimes completely.

This patch only allows one remove lun to be issued to a particular
lun while it is an instantiated member of the scsi stack.

Changes since v1:

Use single threaded workqueue to serialize work in
storvsc_handle_error [Christoph Hellwig]

Signed-off-by: Cathy Avery 
---
 drivers/scsi/storvsc_drv.c | 27 ++-
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index 5e7200f..6febcdb 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -486,6 +486,8 @@ struct hv_host_device {
unsigned int port;
unsigned char path;
unsigned char target;
+   struct workqueue_struct *handle_error_wq;
+   char work_q_name[20];
 };
 
 struct storvsc_scan_work {
@@ -922,6 +924,7 @@ static void storvsc_handle_error(struct vmscsi_request 
*vm_srb,
 {
struct storvsc_scan_work *wrk;
void (*process_err_fn)(struct work_struct *work);
+   struct hv_host_device *host_dev = shost_priv(host);
bool do_work = false;
 
switch (SRB_STATUS(vm_srb->srb_status)) {
@@ -988,7 +991,7 @@ static void storvsc_handle_error(struct vmscsi_request 
*vm_srb,
wrk->lun = vm_srb->lun;
wrk->tgt_id = vm_srb->target_id;
INIT_WORK(&wrk->work, process_err_fn);
-   schedule_work(&wrk->work);
+   queue_work(host_dev->handle_error_wq, &wrk->work);
 }
 
 
@@ -1803,10 +1806,19 @@ static int storvsc_probe(struct hv_device *device,
if (stor_device->num_sc != 0)
host->nr_hw_queues = stor_device->num_sc + 1;
 
+   /*
+* Set the error handler work queue.
+*/
+   snprintf(host_dev->work_q_name, sizeof(host_dev->work_q_name),
+"storvsc_error_wq_%d", host->host_no);
+   host_dev->handle_error_wq =
+   create_singlethread_workqueue(host_dev->work_q_name);
+   if (!host_dev->handle_error_wq)
+   goto err_out2;
/* Register the HBA and start the scsi bus scan */
ret = scsi_add_host(host, &device->device);
if (ret != 0)
-   goto err_out2;
+   goto err_out3;
 
if (!dev_is_ide) {
scsi_scan_host(host);
@@ -1815,7 +1827,7 @@ static int storvsc_probe(struct hv_device *device,
 device->dev_instance.b[4]);
ret = scsi_add_device(host, 0, target, 0);
if (ret)
-   goto err_out3;
+   goto err_out4;
}
 #if IS_ENABLED(CONFIG_SCSI_FC_ATTRS)
if (host->transportt == fc_transport_template) {
@@ -1827,14 +1839,17 @@ static int storvsc_probe(struct hv_device *device,
fc_host_port_name(host) = stor_device->port_name;
stor_device->rport = fc_remote_port_add(host, 0, &ids);
if (!stor_device->rport)
-   goto err_out3;
+   goto err_out4;
}
 #endif
return 0;
 
-err_out3:
+err_out4:
scsi_remove_host(host);
 
+err_out3:
+   destroy_workqueue(host_dev->handle_error_wq);
+
 err_out2:
/*
 * Once we have connected with the host, we would need to
@@ -1858,6 +1873,7 @@ static int storvsc_remove(struct hv_device *dev)
 {
struct storvsc_device *stor_device = hv_get_drvdata(dev);
struct Scsi_Host *host = stor_device->host;
+   struct hv_host_device *host_dev = shost_priv(host);
 
 #if IS_ENABLED(CONFIG_SCSI_FC_ATTRS)
if (host->transportt == fc_transport_template) {
@@ -1865,6 +1881,7 @@ static int storvsc_remove(struct hv_device *dev)
fc_remove_host(host);
}
 #endif
+   destroy_workqueue(host_dev->handle_error_wq);
scsi_remove_host(host);
storvsc_dev_remove(dev);
scsi_host_put(host);
-- 
2.5.0

Re: [PATCH V2] scsi: storvsc: Allow only one remove lun work item to be issued per lun

2017-10-31 Thread Cathy Avery


On 10/31/2017 08:24 AM, Martin K. Petersen wrote:

If you use alloc_ordered_workqueue directly instead of
create_singlethread_workqueue you can pass a format string and don't
need the separate allocation.

But I'm not sure if Tejun is fine with using __WQ_LEGACY directly..

The only thing that flag does is exempting the workqueue from possible
flush deadlock check as we don't know whether WQ_MEM_RECLAIM on a
legacy workqueue is intentional.  There's no reason to add it when
converting to alloc_ordered_workqueue().  Just decide whether it needs
forward progress guarantee and use WQ_MEM_RECLAIM if so.

Cathy?



Sorry for the delay. Long was working on a similar problem and we needed 
to add a couple of extra patches. I was thinking of sending all three in 
series but I can send the V3 of this now and follow up with the 
additional patches. Does that make sense?

[PATCH V3] scsi: storvsc: Allow only one remove lun work item to be issued per lun

2017-10-31 Thread Cathy Avery

When running multipath on a VM if all available paths go down
the driver can schedule large amounts of storvsc_remove_lun
work items to the same lun. In response to the failing paths
typically storvsc responds by taking host->scan_mutex and issuing
a TUR per lun. If there has been heavy IO to the failed device
all the failed IOs are returned from the host. A remove lun work
item is issued per failed IO. If the outstanding TURs have not been
completed in a timely manner the scan_mutex is never released or
released too late. Consequently the many remove lun work items are
not completed as scsi_remove_device also tries to take host->scan_mutex.
This results in dragging the VM down and sometimes completely.

This patch only allows one remove lun to be issued to a particular
lun while it is an instantiated member of the scsi stack.

Changes since v1:
Use single threaded workqueue to serialize work in
storvsc_handle_error [Christoph Hellwig]

Changes since v2:
Replaced create_singlethread_workqueue with
alloc_ordered_workqueue [Christoph Hellwig]

Added reviewed by's.

Signed-off-by: Cathy Avery 
Reviewed-by: Christoph Hellwig 
Reviewed-by: Long Li 
---
 drivers/scsi/storvsc_drv.c | 26 +-
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index 5e7200f..7e3cd12 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -486,6 +486,7 @@ struct hv_host_device {
unsigned int port;
unsigned char path;
unsigned char target;
+   struct workqueue_struct *handle_error_wq;
 };
 
 struct storvsc_scan_work {
@@ -922,6 +923,7 @@ static void storvsc_handle_error(struct vmscsi_request 
*vm_srb,
 {
struct storvsc_scan_work *wrk;
void (*process_err_fn)(struct work_struct *work);
+   struct hv_host_device *host_dev = shost_priv(host);
bool do_work = false;
 
switch (SRB_STATUS(vm_srb->srb_status)) {
@@ -988,7 +990,7 @@ static void storvsc_handle_error(struct vmscsi_request 
*vm_srb,
wrk->lun = vm_srb->lun;
wrk->tgt_id = vm_srb->target_id;
INIT_WORK(&wrk->work, process_err_fn);
-   schedule_work(&wrk->work);
+   queue_work(host_dev->handle_error_wq, &wrk->work);
 }
 
 
@@ -1803,10 +1805,19 @@ static int storvsc_probe(struct hv_device *device,
if (stor_device->num_sc != 0)
host->nr_hw_queues = stor_device->num_sc + 1;
 
+   /*
+* Set the error handler work queue.
+*/
+   host_dev->handle_error_wq =
+   alloc_ordered_workqueue("storvsc_error_wq_%d",
+   WQ_MEM_RECLAIM,
+   host->host_no);
+   if (!host_dev->handle_error_wq)
+   goto err_out2;
/* Register the HBA and start the scsi bus scan */
ret = scsi_add_host(host, &device->device);
if (ret != 0)
-   goto err_out2;
+   goto err_out3;
 
if (!dev_is_ide) {
scsi_scan_host(host);
@@ -1815,7 +1826,7 @@ static int storvsc_probe(struct hv_device *device,
 device->dev_instance.b[4]);
ret = scsi_add_device(host, 0, target, 0);
if (ret)
-   goto err_out3;
+   goto err_out4;
}
 #if IS_ENABLED(CONFIG_SCSI_FC_ATTRS)
if (host->transportt == fc_transport_template) {
@@ -1827,14 +1838,17 @@ static int storvsc_probe(struct hv_device *device,
fc_host_port_name(host) = stor_device->port_name;
stor_device->rport = fc_remote_port_add(host, 0, &ids);
if (!stor_device->rport)
-   goto err_out3;
+   goto err_out4;
}
 #endif
return 0;
 
-err_out3:
+err_out4:
scsi_remove_host(host);
 
+err_out3:
+   destroy_workqueue(host_dev->handle_error_wq);
+
 err_out2:
/*
 * Once we have connected with the host, we would need to
@@ -1858,6 +1872,7 @@ static int storvsc_remove(struct hv_device *dev)
 {
struct storvsc_device *stor_device = hv_get_drvdata(dev);
struct Scsi_Host *host = stor_device->host;
+   struct hv_host_device *host_dev = shost_priv(host);
 
 #if IS_ENABLED(CONFIG_SCSI_FC_ATTRS)
if (host->transportt == fc_transport_template) {
@@ -1865,6 +1880,7 @@ static int storvsc_remove(struct hv_device *dev)
fc_remove_host(host);
}
 #endif
+   destroy_workqueue(host_dev->handle_error_wq);
scsi_remove_host(host);
storvsc_dev_remove(dev);
scsi_host_put(host);
-- 
2.5.0

Re: [PATCH] storvsc: Avoid excessive host scan on controller change

2017-11-06 Thread Cathy Avery


On 10/31/2017 05:58 PM, Long Li wrote:

From: Long Li 

When there are multiple disks attached to the same SCSI controller,
the host may send several VSTOR_OPERATION_REMOVE_DEVICE or
VSTOR_OPERATION_ENUMERATE_BUS messages in a row, to indicate there is a
change on the SCSI controller. In response, storvsc rescans the SCSI host.

There is no need to do multiple scans on the same host. Fix the code to do
only one scan.

Signed-off-by: Long Li 
---
  drivers/scsi/storvsc_drv.c | 26 +++---
  1 file changed, 11 insertions(+), 15 deletions(-)

diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index 6febcdb..b602f52 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -488,6 +488,8 @@ struct hv_host_device {
unsigned char target;
struct workqueue_struct *handle_error_wq;
char work_q_name[20];
+   struct work_struct host_scan_work;
+   struct Scsi_Host *host;
  };
  
  struct storvsc_scan_work {

@@ -516,13 +518,12 @@ static void storvsc_device_scan(struct work_struct *work)
  
  static void storvsc_host_scan(struct work_struct *work)

  {
-   struct storvsc_scan_work *wrk;
struct Scsi_Host *host;
struct scsi_device *sdev;
+   struct hv_host_device *host_device =
+   container_of(work, struct hv_host_device, host_scan_work);
  
-	wrk = container_of(work, struct storvsc_scan_work, work);

-   host = wrk->host;
-
+   host = host_device->host;
/*
 * Before scanning the host, first check to see if any of the
 * currrently known devices have been hot removed. We issue a
@@ -542,8 +543,6 @@ static void storvsc_host_scan(struct work_struct *work)
 * Now scan the host to discover LUNs that may have been added.
 */
scsi_scan_host(host);
-
-   kfree(wrk);
  }
  
  static void storvsc_remove_lun(struct work_struct *work)

@@ -1119,8 +1118,7 @@ static void storvsc_on_receive(struct storvsc_device 
*stor_device,
 struct vstor_packet *vstor_packet,
 struct storvsc_cmd_request *request)
  {
-   struct storvsc_scan_work *work;
-
+   struct hv_host_device *host_dev;
switch (vstor_packet->operation) {
case VSTOR_OPERATION_COMPLETE_IO:
storvsc_on_io_completion(stor_device, vstor_packet, request);
@@ -1128,13 +1126,9 @@ static void storvsc_on_receive(struct storvsc_device 
*stor_device,
  
  	case VSTOR_OPERATION_REMOVE_DEVICE:

case VSTOR_OPERATION_ENUMERATE_BUS:
-   work = kmalloc(sizeof(struct storvsc_scan_work), GFP_ATOMIC);
-   if (!work)
-   return;
-
-   INIT_WORK(&work->work, storvsc_host_scan);
-   work->host = stor_device->host;
-   schedule_work(&work->work);
+   host_dev = shost_priv(stor_device->host);
+   queue_work(
+   host_dev->handle_error_wq, &host_dev->host_scan_work);
break;
  
  	case VSTOR_OPERATION_FCHBA_DATA:

@@ -1747,6 +1741,7 @@ static int storvsc_probe(struct hv_device *device,
  
  	host_dev->port = host->host_no;

host_dev->dev = device;
+   host_dev->host = host;
  
  
  	stor_device = kzalloc(sizeof(struct storvsc_device), GFP_KERNEL);

@@ -1815,6 +1810,7 @@ static int storvsc_probe(struct hv_device *device,
create_singlethread_workqueue(host_dev->work_q_name);
if (!host_dev->handle_error_wq)
goto err_out2;
+   INIT_WORK(&host_dev->host_scan_work, storvsc_host_scan);
/* Register the HBA and start the scsi bus scan */
ret = scsi_add_host(host, &device->device);
if (ret != 0)


I've tested this patch with both a multipath failover while running fio 
and taking hyperV snap shots while luns are being hot added and removed.


Tested-by: Cathy Avery

[PATCH 0/2] KVM: SVM: Track physical cpu and asid_generation via the vmcb

2021-01-12 Thread Cathy Avery

In the cases where vmcbs change processors from one vmrun to another updated
information in the vmcb from a prior run can potentially be lost. By tracking
the physical cpu and asid_generation per vmcb instead of svm->vcpu the following
scenario illustrated by Paolo can be avoided.

 -  -
 pCPU 1 pCPU 2
 -  -
 run VMCB02
run VMCB02 (*)
run VMCB01
 run VMCB01 (**)
 run VMCB02 (***)
 -  -

 After the point marked (*), while L2 runs, some fields change in VMCB02.
 When the processor vmexits back to L0, VMCB02 is marked clean.

 At the point marked (**), svm->vcpu.cpu becomes 1 again.

 Therefore, at the point marked (***) you will get svm->vcpu.cpu == cpu
 and the VMCB02 will not be marked dirty.  The processor can then 
incorrectly
 use some data that is cached from before point (*).

Theses patches are intended for the kvm nested-svm branch.

The patches have been tested on nested fedora VMs, kvm self tests, and 
kvm-unit-tests.
They have not been tested on SEV.

Cathy Avery (2):
  KVM: nSVM: Track the physical cpu of the vmcb vmrun through the vmcb
  KVM: nSVM: Track the ASID generation of the vmcb vmrun through the
vmcb

 arch/x86/kvm/svm/svm.c | 45 +++---
 arch/x86/kvm/svm/svm.h |  3 ++-
 2 files changed, 27 insertions(+), 21 deletions(-)

-- 
2.20.1

[PATCH 1/2] KVM: nSVM: Track the physical cpu of the vmcb vmrun through the vmcb

2021-01-12 Thread Cathy Avery

This patch moves the physical cpu tracking from the vcpu
to the vmcb in svm_switch_vmcb. If either vmcb01 or vmcb02
change physical cpus from one vmrun to the next the vmcb's
previous cpu is preserved for comparison with the current
cpu and the vmcb is marked dirty if different. This prevents
the processor from using old cached data for a vmcb that may
have been updated on a prior run on a different processor.

It also moves the physical cpu check from svm_vcpu_load
to pre_svm_run as the check only needs to be done at run.

Suggested-by: Paolo Bonzini 
Signed-off-by: Cathy Avery 
---
 arch/x86/kvm/svm/svm.c | 28 
 arch/x86/kvm/svm/svm.h |  1 +
 2 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 62390fbc9233..5b6998ff7aa1 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1287,6 +1287,11 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool 
init_event)
 
 void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
 {
+   /*
+   * Track the old VMCB so the new VMCB will be marked
+   * dirty at its next vmrun.
+   */
+
svm->current_vmcb = target_vmcb;
svm->vmcb = target_vmcb->ptr;
svm->vmcb_pa = target_vmcb->pa;
@@ -1299,11 +1304,12 @@ void svm_switch_vmcb(struct vcpu_svm *svm, struct 
kvm_vmcb_info *target_vmcb)
svm->asid_generation = 0;
 
/*
-   * Workaround: we don't yet track the physical CPU that
-   * target_vmcb has run on.
+   * Track the physical CPU the target_vmcb is running on
+   * in order to mark the VMCB dirty if the cpu changes at
+   * its next vmrun.
*/
 
-   vmcb_mark_all_dirty(svm->vmcb);
+   svm->current_vmcb->cpu = svm->vcpu.cpu;
 }
 
 static int svm_create_vcpu(struct kvm_vcpu *vcpu)
@@ -1386,11 +1392,6 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
int i;
 
-   if (unlikely(cpu != vcpu->cpu)) {
-   svm->asid_generation = 0;
-   vmcb_mark_all_dirty(svm->vmcb);
-   }
-
 #ifdef CONFIG_X86_64
rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
 #endif
@@ -3176,6 +3177,17 @@ static void pre_svm_run(struct vcpu_svm *svm)
 {
struct svm_cpu_data *sd = per_cpu(svm_data, svm->vcpu.cpu);
 
+   /*
+   * If the previous vmrun of the vmcb occurred on
+   * a different physical cpu then we must mark the vmcb dirty.
+   */
+
+if (unlikely(svm->current_vmcb->cpu != svm->vcpu.cpu)) {
+   svm->asid_generation = 0;
+   vmcb_mark_all_dirty(svm->vmcb);
+   svm->current_vmcb->cpu = svm->vcpu.cpu;
+}
+
if (sev_guest(svm->vcpu.kvm))
return pre_sev_run(svm, svm->vcpu.cpu);
 
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 86bf443d4b2a..05baee934d03 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -85,6 +85,7 @@ struct kvm_vcpu;
 struct kvm_vmcb_info {
struct vmcb *ptr;
unsigned long pa;
+   int cpu;
 };
 
 struct svm_nested_state {
-- 
2.20.1

[PATCH 2/2] KVM: nSVM: Track the ASID generation of the vmcb vmrun through the vmcb

2021-01-12 Thread Cathy Avery

This patch moves the asid_generation from the vcpu to the vmcb
in order to track the ASID generation that was active the last
time the vmcb was run. If sd->asid_generation changes between
two runs, the old ASID is invalid and must be changed.

Suggested-by: Paolo Bonzini 
Signed-off-by: Cathy Avery 
---
 arch/x86/kvm/svm/svm.c | 21 +++--
 arch/x86/kvm/svm/svm.h |  2 +-
 2 files changed, 8 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 5b6998ff7aa1..73e63f7a0acf 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1214,7 +1214,7 @@ static void init_vmcb(struct vcpu_svm *svm)
save->cr3 = 0;
save->cr4 = 0;
}
-   svm->asid_generation = 0;
+   svm->current_vmcb->asid_generation = 0;
svm->asid = 0;
 
svm->nested.vmcb12_gpa = 0;
@@ -1296,13 +1296,6 @@ void svm_switch_vmcb(struct vcpu_svm *svm, struct 
kvm_vmcb_info *target_vmcb)
svm->vmcb = target_vmcb->ptr;
svm->vmcb_pa = target_vmcb->pa;
 
-   /*
-   * Workaround: we don't yet track the ASID generation
-   * that was active the last time target_vmcb was run.
-   */
-
-   svm->asid_generation = 0;
-
/*
* Track the physical CPU the target_vmcb is running on
* in order to mark the VMCB dirty if the cpu changes at
@@ -1347,7 +1340,6 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
 
svm_switch_vmcb(svm, &svm->vmcb01);
 
-   svm->asid_generation = 0;
init_vmcb(svm);
 
svm_init_osvw(vcpu);
@@ -1784,7 +1776,7 @@ static void new_asid(struct vcpu_svm *svm, struct 
svm_cpu_data *sd)
vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
}
 
-   svm->asid_generation = sd->asid_generation;
+   svm->current_vmcb->asid_generation = sd->asid_generation;
svm->asid = sd->next_asid++;
 }
 
@@ -3179,11 +3171,12 @@ static void pre_svm_run(struct vcpu_svm *svm)
 
/*
* If the previous vmrun of the vmcb occurred on
-   * a different physical cpu then we must mark the vmcb dirty.
+   * a different physical cpu then we must mark the vmcb dirty,
+   * update the asid_generation, and assign a new asid.
*/
 
 if (unlikely(svm->current_vmcb->cpu != svm->vcpu.cpu)) {
-   svm->asid_generation = 0;
+   svm->current_vmcb->asid_generation = 0;
vmcb_mark_all_dirty(svm->vmcb);
svm->current_vmcb->cpu = svm->vcpu.cpu;
 }
@@ -3192,7 +3185,7 @@ static void pre_svm_run(struct vcpu_svm *svm)
return pre_sev_run(svm, svm->vcpu.cpu);
 
/* FIXME: handle wraparound of asid_generation */
-   if (svm->asid_generation != sd->asid_generation)
+   if (svm->current_vmcb->asid_generation != sd->asid_generation)
new_asid(svm, sd);
 }
 
@@ -3399,7 +3392,7 @@ void svm_flush_tlb(struct kvm_vcpu *vcpu)
if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
else
-   svm->asid_generation--;
+   svm->current_vmcb->asid_generation--;
 }
 
 static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 05baee934d03..edcfc2a4e77e 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -86,6 +86,7 @@ struct kvm_vmcb_info {
struct vmcb *ptr;
unsigned long pa;
int cpu;
+   uint64_t asid_generation;
 };
 
 struct svm_nested_state {
@@ -115,7 +116,6 @@ struct vcpu_svm {
struct kvm_vmcb_info *current_vmcb;
struct svm_cpu_data *svm_data;
u32 asid;
-   uint64_t asid_generation;
uint64_t sysenter_esp;
uint64_t sysenter_eip;
uint64_t tsc_aux;
-- 
2.20.1

[PATCH v2 1/2] KVM: nSVM: Track the physical cpu of the vmcb vmrun through the vmcb

2021-01-13 Thread Cathy Avery

This patch moves the physical cpu tracking from the vcpu
to the vmcb in svm_switch_vmcb. If either vmcb01 or vmcb02
change physical cpus from one vmrun to the next the vmcb's
previous cpu is preserved for comparison with the current
cpu and the vmcb is marked dirty if different. This prevents
the processor from using old cached data for a vmcb that may
have been updated on a prior run on a different processor.

It also moves the physical cpu check from svm_vcpu_load
to pre_svm_run as the check only needs to be done at run.

Suggested-by: Paolo Bonzini 
Signed-off-by: Cathy Avery 
---
 arch/x86/kvm/svm/svm.c | 23 +++
 arch/x86/kvm/svm/svm.h |  1 +
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 62390fbc9233..be41b7ebfcea 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1299,11 +1299,12 @@ void svm_switch_vmcb(struct vcpu_svm *svm, struct 
kvm_vmcb_info *target_vmcb)
svm->asid_generation = 0;
 
/*
-   * Workaround: we don't yet track the physical CPU that
-   * target_vmcb has run on.
+   * Track the physical CPU the target_vmcb is running on
+   * in order to mark the VMCB dirty if the cpu changes at
+   * its next vmrun.
*/
 
-   vmcb_mark_all_dirty(svm->vmcb);
+   svm->current_vmcb->cpu = svm->vcpu.cpu;
 }
 
 static int svm_create_vcpu(struct kvm_vcpu *vcpu)
@@ -1386,11 +1387,6 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
int i;
 
-   if (unlikely(cpu != vcpu->cpu)) {
-   svm->asid_generation = 0;
-   vmcb_mark_all_dirty(svm->vmcb);
-   }
-
 #ifdef CONFIG_X86_64
rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
 #endif
@@ -3176,6 +3172,17 @@ static void pre_svm_run(struct vcpu_svm *svm)
 {
struct svm_cpu_data *sd = per_cpu(svm_data, svm->vcpu.cpu);
 
+   /*
+   * If the previous vmrun of the vmcb occurred on
+   * a different physical cpu then we must mark the vmcb dirty.
+   */
+
+if (unlikely(svm->current_vmcb->cpu != svm->vcpu.cpu)) {
+   svm->asid_generation = 0;
+   vmcb_mark_all_dirty(svm->vmcb);
+   svm->current_vmcb->cpu = svm->vcpu.cpu;
+}
+
if (sev_guest(svm->vcpu.kvm))
return pre_sev_run(svm, svm->vcpu.cpu);
 
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 86bf443d4b2a..05baee934d03 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -85,6 +85,7 @@ struct kvm_vcpu;
 struct kvm_vmcb_info {
struct vmcb *ptr;
unsigned long pa;
+   int cpu;
 };
 
 struct svm_nested_state {
-- 
2.20.1

[PATCH v2 2/2] KVM: nSVM: Track the ASID generation of the vmcb vmrun through the vmcb

2021-01-13 Thread Cathy Avery

This patch moves the asid_generation from the vcpu to the vmcb
in order to track the ASID generation that was active the last
time the vmcb was run. If sd->asid_generation changes between
two runs, the old ASID is invalid and must be changed.

Suggested-by: Paolo Bonzini 
Signed-off-by: Cathy Avery 
---
 arch/x86/kvm/svm/svm.c | 21 +++--
 arch/x86/kvm/svm/svm.h |  2 +-
 2 files changed, 8 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index be41b7ebfcea..e9326d3f6583 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1214,7 +1214,7 @@ static void init_vmcb(struct vcpu_svm *svm)
save->cr3 = 0;
save->cr4 = 0;
}
-   svm->asid_generation = 0;
+   svm->current_vmcb->asid_generation = 0;
svm->asid = 0;
 
svm->nested.vmcb12_gpa = 0;
@@ -1291,13 +1291,6 @@ void svm_switch_vmcb(struct vcpu_svm *svm, struct 
kvm_vmcb_info *target_vmcb)
svm->vmcb = target_vmcb->ptr;
svm->vmcb_pa = target_vmcb->pa;
 
-   /*
-   * Workaround: we don't yet track the ASID generation
-   * that was active the last time target_vmcb was run.
-   */
-
-   svm->asid_generation = 0;
-
/*
* Track the physical CPU the target_vmcb is running on
* in order to mark the VMCB dirty if the cpu changes at
@@ -1342,7 +1335,6 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
 
svm_switch_vmcb(svm, &svm->vmcb01);
 
-   svm->asid_generation = 0;
init_vmcb(svm);
 
svm_init_osvw(vcpu);
@@ -1779,7 +1771,7 @@ static void new_asid(struct vcpu_svm *svm, struct 
svm_cpu_data *sd)
vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
}
 
-   svm->asid_generation = sd->asid_generation;
+   svm->current_vmcb->asid_generation = sd->asid_generation;
svm->asid = sd->next_asid++;
 }
 
@@ -3174,11 +3166,12 @@ static void pre_svm_run(struct vcpu_svm *svm)
 
/*
* If the previous vmrun of the vmcb occurred on
-   * a different physical cpu then we must mark the vmcb dirty.
+   * a different physical cpu then we must mark the vmcb dirty,
+   * update the asid_generation, and assign a new asid.
*/
 
 if (unlikely(svm->current_vmcb->cpu != svm->vcpu.cpu)) {
-   svm->asid_generation = 0;
+   svm->current_vmcb->asid_generation = 0;
vmcb_mark_all_dirty(svm->vmcb);
svm->current_vmcb->cpu = svm->vcpu.cpu;
 }
@@ -3187,7 +3180,7 @@ static void pre_svm_run(struct vcpu_svm *svm)
return pre_sev_run(svm, svm->vcpu.cpu);
 
/* FIXME: handle wraparound of asid_generation */
-   if (svm->asid_generation != sd->asid_generation)
+   if (svm->current_vmcb->asid_generation != sd->asid_generation)
new_asid(svm, sd);
 }
 
@@ -3394,7 +3387,7 @@ void svm_flush_tlb(struct kvm_vcpu *vcpu)
if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
else
-   svm->asid_generation--;
+   svm->current_vmcb->asid_generation--;
 }
 
 static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 05baee934d03..edcfc2a4e77e 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -86,6 +86,7 @@ struct kvm_vmcb_info {
struct vmcb *ptr;
unsigned long pa;
int cpu;
+   uint64_t asid_generation;
 };
 
 struct svm_nested_state {
@@ -115,7 +116,6 @@ struct vcpu_svm {
struct kvm_vmcb_info *current_vmcb;
struct svm_cpu_data *svm_data;
u32 asid;
-   uint64_t asid_generation;
uint64_t sysenter_esp;
uint64_t sysenter_eip;
uint64_t tsc_aux;
-- 
2.20.1

[PATCH v2 0/2] ] KVM: SVM: Track physical cpu and asid_generation via the vmcb

2021-01-13 Thread Cathy Avery

In the cases where vmcbs change processors from one vmrun to another updated
information in the vmcb from a prior run can potentially be lost. By tracking
the physical cpu and asid_generation per vmcb instead of svm->vcpu the following
scenario illustrated by Paolo can be avoided.

 -  -
 pCPU 1 pCPU 2
 -  -
 run VMCB02
run VMCB02 (*)
run VMCB01
 run VMCB01 (**)
 run VMCB02 (***)
 -  -

 After the point marked (*), while L2 runs, some fields change in VMCB02.
 When the processor vmexits back to L0, VMCB02 is marked clean.

 At the point marked (**), svm->vcpu.cpu becomes 1 again.

 Therefore, at the point marked (***) you will get svm->vcpu.cpu == cpu
 and the VMCB02 will not be marked dirty.  The processor can then 
incorrectly
 use some data that is cached from before point (*).

Theses patches are intended for the kvm nested-svm branch.

The patches have been tested on nested fedora VMs, kvm self tests, and 
kvm-unit-tests.
They have not been tested on SEV.

Changes v1 -> v2:
- Remove outdated comment from svm_switch_vmcb().

Cathy Avery (2):
  KVM: nSVM: Track the physical cpu of the vmcb vmrun through the vmcb
  KVM: nSVM: Track the ASID generation of the vmcb vmrun through the
vmcb

 arch/x86/kvm/svm/svm.c | 40 
 arch/x86/kvm/svm/svm.h |  3 ++-
 2 files changed, 22 insertions(+), 21 deletions(-)

-- 
2.20.1

[PATCH] KVM: nSVM: Additions to optimizing L12 to L2 vmcb.save copies

2021-03-17 Thread Cathy Avery

Extend using the vmcb12 control clean field to determine which
vmcb12.save registers were marked dirty in order to minimize
register copies by including the CR bit.

This patch also fixes the init of last_vmcb12_gpa by using an invalid
physical address instead of 0.

Tested:
kvm-unit-tests
kvm selftests
Fedora L1 L2

Signed-off-by: Cathy Avery 
---
 arch/x86/kvm/svm/nested.c | 9 ++---
 arch/x86/kvm/svm/svm.c| 2 +-
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 8523f60adb92..6f9a40e002bc 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -449,9 +449,12 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm 
*svm, struct vmcb *vmcb12
}
 
kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED);
-   svm_set_efer(&svm->vcpu, vmcb12->save.efer);
-   svm_set_cr0(&svm->vcpu, vmcb12->save.cr0);
-   svm_set_cr4(&svm->vcpu, vmcb12->save.cr4);
+
+   if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_CR))) {
+   svm_set_efer(&svm->vcpu, vmcb12->save.efer);
+   svm_set_cr0(&svm->vcpu, vmcb12->save.cr0);
+   svm_set_cr4(&svm->vcpu, vmcb12->save.cr4);
+   }
 
svm->vcpu.arch.cr2 = vmcb12->save.cr2;
 
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 271196400495..41f5cd1009ca 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1234,7 +1234,7 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
svm->asid = 0;
 
svm->nested.vmcb12_gpa = 0;
-   svm->nested.last_vmcb12_gpa = 0;
+   svm->nested.last_vmcb12_gpa = -1;
vcpu->arch.hflags = 0;
 
if (!kvm_pause_in_guest(vcpu->kvm)) {
-- 
2.26.2

[PATCH] KVM: nSVM: Optimize L12 to L2 vmcb.save copies

2021-03-01 Thread Cathy Avery

Use the vmcb12 control clean field to determine which vmcb12.save
registers were marked dirty in order to minimize register copies
when switching from L1 to L2. Those L12 registers marked as dirty need
to be copied to L2's vmcb as they will be used to update the vmcb
state cache for the L2 VMRUN.  In the case where we have a different
vmcb12 from the last L2 VMRUN all vmcb12.save registers must be
copied over to L2.save.

Tested:
kvm-unit-tests
kvm selftests
Fedora L1 L2

Suggested-by: Paolo Bonzini 
Signed-off-by: Cathy Avery 
---
 arch/x86/kvm/svm/nested.c | 43 ++-
 arch/x86/kvm/svm/svm.c|  1 +
 arch/x86/kvm/svm/svm.h|  6 ++
 3 files changed, 36 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 90a1704b5752..c1d5944ee473 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -422,39 +422,54 @@ void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm)
 
 static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb 
*vmcb12)
 {
+   bool new_vmcb12 = false;
+
nested_vmcb02_compute_g_pat(svm);
 
/* Load the nested guest state */
-   svm->vmcb->save.es = vmcb12->save.es;
-   svm->vmcb->save.cs = vmcb12->save.cs;
-   svm->vmcb->save.ss = vmcb12->save.ss;
-   svm->vmcb->save.ds = vmcb12->save.ds;
-   svm->vmcb->save.cpl = vmcb12->save.cpl;
-   vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
 
-   svm->vmcb->save.gdtr = vmcb12->save.gdtr;
-   svm->vmcb->save.idtr = vmcb12->save.idtr;
-   vmcb_mark_dirty(svm->vmcb, VMCB_DT);
+   if (svm->nested.vmcb12_gpa != svm->nested.last_vmcb12_gpa) {
+   new_vmcb12 = true;
+   svm->nested.last_vmcb12_gpa = svm->nested.vmcb12_gpa;
+   }
+
+   if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_SEG))) {
+   svm->vmcb->save.es = vmcb12->save.es;
+   svm->vmcb->save.cs = vmcb12->save.cs;
+   svm->vmcb->save.ss = vmcb12->save.ss;
+   svm->vmcb->save.ds = vmcb12->save.ds;
+   svm->vmcb->save.cpl = vmcb12->save.cpl;
+   vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
+   }
+
+   if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DT))) {
+   svm->vmcb->save.gdtr = vmcb12->save.gdtr;
+   svm->vmcb->save.idtr = vmcb12->save.idtr;
+   vmcb_mark_dirty(svm->vmcb, VMCB_DT);
+   }
 
kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED);
svm_set_efer(&svm->vcpu, vmcb12->save.efer);
svm_set_cr0(&svm->vcpu, vmcb12->save.cr0);
svm_set_cr4(&svm->vcpu, vmcb12->save.cr4);
 
-   svm->vcpu.arch.cr2 = vmcb12->save.cr2;
+   svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = vmcb12->save.cr2;
+
kvm_rax_write(&svm->vcpu, vmcb12->save.rax);
kvm_rsp_write(&svm->vcpu, vmcb12->save.rsp);
kvm_rip_write(&svm->vcpu, vmcb12->save.rip);
 
/* In case we don't even reach vcpu_run, the fields are not updated */
-   svm->vmcb->save.cr2 = svm->vcpu.arch.cr2;
svm->vmcb->save.rax = vmcb12->save.rax;
svm->vmcb->save.rsp = vmcb12->save.rsp;
svm->vmcb->save.rip = vmcb12->save.rip;
 
-   svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1;
-   svm->vcpu.arch.dr6  = vmcb12->save.dr6 | DR6_ACTIVE_LOW;
-   vmcb_mark_dirty(svm->vmcb, VMCB_DR);
+   /* These bits will be set properly on the first execution when 
new_vmc12 is true */
+   if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) {
+   svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1;
+   svm->vcpu.arch.dr6  = vmcb12->save.dr6 | DR6_ACTIVE_LOW;
+   vmcb_mark_dirty(svm->vmcb, VMCB_DR);
+   }
 }
 
 static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 54610270f66a..9761a7ca8100 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1232,6 +1232,7 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
svm->asid = 0;
 
svm->nested.vmcb12_gpa = 0;
+   svm->nested.last_vmcb12_gpa = 0;
vcpu->arch.hflags = 0;
 
if (!kvm_pause_in_guest(vcpu->kvm)) {
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index fbbb26dd0f73..911868d4584c 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -93,6 +93,7 @@ struct svm_nested_state {
u64 hsave_msr;
u64 vm_cr_msr;
u64 vmcb12_gpa;
+   u64 last_vmcb12_gpa;
 
/* These are the merged vectors */
u32 *msrpm;
@@ -247,6 +248,11 @

Re: [PATCH] KVM: nSVM: Optimize L12 to L2 vmcb.save copies

2021-03-02 Thread Cathy Avery


On 3/1/21 7:59 PM, Sean Christopherson wrote:

On Mon, Mar 01, 2021, Cathy Avery wrote:

kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED);
svm_set_efer(&svm->vcpu, vmcb12->save.efer);
svm_set_cr0(&svm->vcpu, vmcb12->save.cr0);
svm_set_cr4(&svm->vcpu, vmcb12->save.cr4);

Why not utilize VMCB_CR?
I was going to tackle CR in a follow up patch. I should have mentioned 
that but it makes sense to go ahead and do it now.



-   svm->vcpu.arch.cr2 = vmcb12->save.cr2;
+   svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = vmcb12->save.cr2;

Same question for VMCB_CR2.

Also, isn't writing svm->vmcb->save.cr2 unnecessary since svm_vcpu_run()
unconditionally writes it?

Alternatively, it shouldn't be too much work to add proper dirty tracking for
CR2.  VMX has to write the real CR2 every time because there's no VMCS field,
but I assume can avoid the write and dirty update on the majority of VMRUNs.


I 'll take a look at CR2 as well.

Thanks for the feedback,

Cathy




+
kvm_rax_write(&svm->vcpu, vmcb12->save.rax);
kvm_rsp_write(&svm->vcpu, vmcb12->save.rsp);
kvm_rip_write(&svm->vcpu, vmcb12->save.rip);
  
  	/* In case we don't even reach vcpu_run, the fields are not updated */

-   svm->vmcb->save.cr2 = svm->vcpu.arch.cr2;
svm->vmcb->save.rax = vmcb12->save.rax;
svm->vmcb->save.rsp = vmcb12->save.rsp;
svm->vmcb->save.rip = vmcb12->save.rip;
  
-	svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1;

-   svm->vcpu.arch.dr6  = vmcb12->save.dr6 | DR6_ACTIVE_LOW;
-   vmcb_mark_dirty(svm->vmcb, VMCB_DR);
+   /* These bits will be set properly on the first execution when 
new_vmc12 is true */
+   if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) {
+   svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1;
+   svm->vcpu.arch.dr6  = vmcb12->save.dr6 | DR6_ACTIVE_LOW;
+   vmcb_mark_dirty(svm->vmcb, VMCB_DR);
+   }
  }
  
  static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 54610270f66a..9761a7ca8100 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1232,6 +1232,7 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
svm->asid = 0;
  
  	svm->nested.vmcb12_gpa = 0;

+   svm->nested.last_vmcb12_gpa = 0;

We should use INVALID_PAGE, '0' is a legal physical address and could
theoretically get a false negative on the "new_vmcb12" check.


vcpu->arch.hflags = 0;
  
  	if (!kvm_pause_in_guest(vcpu->kvm)) {

diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index fbbb26dd0f73..911868d4584c 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -93,6 +93,7 @@ struct svm_nested_state {
u64 hsave_msr;
u64 vm_cr_msr;
u64 vmcb12_gpa;
+   u64 last_vmcb12_gpa;
  
  	/* These are the merged vectors */

u32 *msrpm;
@@ -247,6 +248,11 @@ static inline void vmcb_mark_dirty(struct vmcb *vmcb, int 
bit)
vmcb->control.clean &= ~(1 << bit);
  }
  
+static inline bool vmcb_is_dirty(struct vmcb *vmcb, int bit)

+{
+return !test_bit(bit, (unsigned long *)&vmcb->control.clean);
+}
+
  static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
  {
return container_of(vcpu, struct vcpu_svm, vcpu);
--
2.26.2

[PATCH 1/1] scsi: storvsc: Filter out storvsc messages CD-ROM medium not present

2016-05-23 Thread Cathy Avery

When a virtual scsi DVD device is present with no image file
attached the storvsc driver logs all resulting unnecessary sense errors
whenever IO is issued to the device.

[storvsc] Sense Key : Not Ready [current]
[storvsc] Add. Sense: Medium not present - tray closed

Signed-off-by: Cathy Avery 
---
 drivers/scsi/storvsc_drv.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index 3ddcabb..f0efa07 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -966,7 +966,9 @@ static void storvsc_command_completion(struct 
storvsc_cmd_request *cmd_request,
if (scmnd->result) {
if (scsi_normalize_sense(scmnd->sense_buffer,
SCSI_SENSE_BUFFERSIZE, &sense_hdr) &&
-   do_logging(STORVSC_LOGGING_ERROR))
+   !(sense_hdr.sense_key == NOT_READY &&
+sense_hdr.asc == 0x03A) &&
+   do_logging(STORVSC_LOGGING_ERROR)) 
scsi_print_sense_hdr(scmnd->device, "storvsc",
 &sense_hdr);
}
-- 
2.5.0

Re: [PATCH v8 net-next 1/1] hv_sock: introduce Hyper-V Sockets

2016-04-26 Thread Cathy Avery


Hi,

I will be working with Dexuan to possibly port this functionality into RHEL.

Here are my initial comments. Mostly stylistic. They are prefaced by CAA.

Thanks,

Cathy Avery

On 04/07/2016 09:36 PM, Dexuan Cui wrote:

Hyper-V Sockets (hv_sock) supplies a byte-stream based communication
mechanism between the host and the guest. It's somewhat like TCP over
VMBus, but the transportation layer (VMBus) is much simpler than IP.

With Hyper-V Sockets, applications between the host and the guest can talk
to each other directly by the traditional BSD-style socket APIs.

Hyper-V Sockets is only available on new Windows hosts, like Windows Server
2016. More info is in this article "Make your own integration services":
https://msdn.microsoft.com/en-us/virtualization/hyperv_on_windows/develop/make_mgmt_service

The patch implements the necessary support in the guest side by introducing
a new socket address family AF_HYPERV.

Signed-off-by: Dexuan Cui
Cc: "K. Y. Srinivasan"
Cc: Haiyang Zhang
Cc: Vitaly Kuznetsov
---
  MAINTAINERS |2 +
  include/linux/hyperv.h  |   16 +
  include/linux/socket.h  |5 +-
  include/net/af_hvsock.h |   51 ++
  include/uapi/linux/hyperv.h |   25 +
  net/Kconfig |1 +
  net/Makefile|1 +
  net/hv_sock/Kconfig |   10 +
  net/hv_sock/Makefile|3 +
  net/hv_sock/af_hvsock.c | 1483 +++
  10 files changed, 1595 insertions(+), 2 deletions(-)
  create mode 100644 include/net/af_hvsock.h
  create mode 100644 net/hv_sock/Kconfig
  create mode 100644 net/hv_sock/Makefile
  create mode 100644 net/hv_sock/af_hvsock.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 67d99dd..7b6f203 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5267,7 +5267,9 @@ F:drivers/pci/host/pci-hyperv.c
  F:drivers/net/hyperv/
  F:drivers/scsi/storvsc_drv.c
  F:drivers/video/fbdev/hyperv_fb.c
+F: net/hv_sock/
  F:include/linux/hyperv.h
+F: include/net/af_hvsock.h
  F:tools/hv/
  F:Documentation/ABI/stable/sysfs-bus-vmbus
  
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h

index aa0fadc..b92439d 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1338,4 +1338,20 @@ extern __u32 vmbus_proto_version;
  
  int vmbus_send_tl_connect_request(const uuid_le *shv_guest_servie_id,

  const uuid_le *shv_host_servie_id);
+struct vmpipe_proto_header {
+   u32 pkt_type;
+   u32 data_size;
+} __packed;
+
+#define HVSOCK_HEADER_LEN  (sizeof(struct vmpacket_descriptor) + \
+sizeof(struct vmpipe_proto_header))
+
+/* See 'prev_indices' in hv_ringbuffer_read(), hv_ringbuffer_write() */
+#define PREV_INDICES_LEN   (sizeof(u64))
+
+#define HVSOCK_PKT_LEN(payload_len)(HVSOCK_HEADER_LEN + \
+   ALIGN((payload_len), 8) + \
+   PREV_INDICES_LEN)
+#define HVSOCK_MIN_PKT_LEN HVSOCK_PKT_LEN(1)
+
  #endif /* _HYPERV_H */
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 73bf6c6..88b1ccd 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -201,8 +201,8 @@ struct ucred {
  #define AF_NFC39  /* NFC sockets  */
  #define AF_VSOCK  40  /* vSockets */
  #define AF_KCM41  /* Kernel Connection Multiplexor*/
-
-#define AF_MAX 42  /* For now.. */
+#define AF_HYPERV  42  /* Hyper-V Sockets  */
+#define AF_MAX 43  /* For now.. */
  
  /* Protocol families, same as address families. */

  #define PF_UNSPEC AF_UNSPEC
@@ -249,6 +249,7 @@ struct ucred {
  #define PF_NFCAF_NFC
  #define PF_VSOCK  AF_VSOCK
  #define PF_KCMAF_KCM
+#define PF_HYPERV  AF_HYPERV
  #define PF_MAXAF_MAX
  
  /* Maximum queue length specifiable by listen.  */

diff --git a/include/net/af_hvsock.h b/include/net/af_hvsock.h
new file mode 100644
index 000..a5aa28d
--- /dev/null
+++ b/include/net/af_hvsock.h
@@ -0,0 +1,51 @@
+#ifndef __AF_HVSOCK_H__
+#define __AF_HVSOCK_H__
+
+#include 
+#include 
+#include 
+
+#define VMBUS_RINGBUFFER_SIZE_HVSOCK_RECV (5 * PAGE_SIZE)
+#define VMBUS_RINGBUFFER_SIZE_HVSOCK_SEND (5 * PAGE_SIZE)
+
+#define HVSOCK_RCV_BUF_SZ  VMBUS_RINGBUFFER_SIZE_HVSOCK_RECV
+#define HVSOCK_SND_BUF_SZ  PAGE_SIZE
+
+#define sk_to_hvsock(__sk)((struct hvsock_sock *)(__sk))
+#define hvsock_to_sk(__hvsk)   ((struct sock *)(__hvsk))
+
+struct hvsock_sock {
+   /* sk must be the first member. */
+   struct sock sk;
+
+   struct sockaddr_hv local_addr;
+   struct sockaddr_hv remote_addr;
+
+   /* protected by the global hvsock_mutex */
+   struct list_head bound_list;
+   struct list_head connected_list;
+
+   struct list_head accept_qu

Re: [PATCH v2] Drivers: hv: kvp: fix IP Failover

2016-03-30 Thread Cathy Avery

t if VersionAndFeatures.HypervisorPresent
   * is set by CPUID(HVCPUID_VERSION_FEATURES).
   */

Acked-by: Cathy Avery

Re: [PATCH v2] Drivers: hv: kvp: fix IP Failover

2016-03-30 Thread Cathy Avery


Sorry acking wrong email.

Thanks,

Cathy

On 03/30/2016 08:21 AM, Cathy Avery wrote:



On 03/29/2016 08:30 AM, Vitaly Kuznetsov wrote:

Hyper-V VMs can be replicated to another hosts and there is a feature to
set different IP for replicas, it is called 'Failover TCP/IP'. When
such guest starts Hyper-V host sends it KVP_OP_SET_IP_INFO message as 
soon
as we finish negotiation procedure. The problem is that it can happen 
(and

it actually happens) before userspace daemon connects and we reply with
HV_E_FAIL to the message. As there are no repetitions we fail to set the
requested IP.

Solve the issue by postponing our reply to the negotiation message till
userspace daemon is connected. We can't wait too long as there is a
host-side timeout (cca. 75 seconds) and if we fail to reply in this time
frame the whole KVP service will become inactive. The solution is not
ideal - if it takes userspace daemon more than 60 seconds to connect
IP Failover will still fail but I don't see a solution with our current
separation between kernel and userspace parts.

Other two modules (VSS and FCOPY) don't require such delay, leave them
untouched.

Signed-off-by: Vitaly Kuznetsov 
---
Changes since v1:
- Cancel handshake timeout work on module unload.
---
  drivers/hv/hv_kvp.c   | 31 +++
  drivers/hv/hyperv_vmbus.h |  5 +
  2 files changed, 36 insertions(+)

diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c
index 9b9b370..cb1a916 100644
--- a/drivers/hv/hv_kvp.c
+++ b/drivers/hv/hv_kvp.c
@@ -78,9 +78,11 @@ static void kvp_send_key(struct work_struct *dummy);
static void kvp_respond_to_host(struct hv_kvp_msg *msg, int error);
  static void kvp_timeout_func(struct work_struct *dummy);
+static void kvp_host_handshake_func(struct work_struct *dummy);
  static void kvp_register(int);
static DECLARE_DELAYED_WORK(kvp_timeout_work, kvp_timeout_func);
+static DECLARE_DELAYED_WORK(kvp_host_handshake_work, 
kvp_host_handshake_func);

  static DECLARE_WORK(kvp_sendkey_work, kvp_send_key);
static const char kvp_devname[] = "vmbus/hv_kvp";
@@ -130,6 +132,11 @@ static void kvp_timeout_func(struct work_struct 
*dummy)

  hv_poll_channel(kvp_transaction.recv_channel, kvp_poll_wrapper);
  }
  +static void kvp_host_handshake_func(struct work_struct *dummy)
+{
+hv_poll_channel(kvp_transaction.recv_channel, 
hv_kvp_onchannelcallback);

+}
+
  static int kvp_handle_handshake(struct hv_kvp_msg *msg)
  {
  switch (msg->kvp_hdr.operation) {
@@ -154,6 +161,12 @@ static int kvp_handle_handshake(struct 
hv_kvp_msg *msg)

  pr_debug("KVP: userspace daemon ver. %d registered\n",
   KVP_OP_REGISTER);
  kvp_register(dm_reg_value);
+
+/*
+ * If we're still negotiating with the host cancel the timeout
+ * work to not poll the channel twice.
+ */
+cancel_delayed_work_sync(&kvp_host_handshake_work);
  hv_poll_channel(kvp_transaction.recv_channel, kvp_poll_wrapper);
return 0;
@@ -594,7 +607,22 @@ void hv_kvp_onchannelcallback(void *context)
  struct icmsg_negotiate *negop = NULL;
  int util_fw_version;
  int kvp_srv_version;
+static enum {NEGO_NOT_STARTED,
+ NEGO_IN_PROGRESS,
+ NEGO_FINISHED} host_negotiatied = NEGO_NOT_STARTED;
  +if (host_negotiatied == NEGO_NOT_STARTED &&
+kvp_transaction.state < HVUTIL_READY) {
+/*
+ * If userspace daemon is not connected and host is asking
+ * us to negotiate we need to delay to not lose messages.
+ * This is important for Failover IP setting.
+ */
+host_negotiatied = NEGO_IN_PROGRESS;
+schedule_delayed_work(&kvp_host_handshake_work,
+  HV_UTIL_NEGO_TIMEOUT * HZ);
+return;
+}
  if (kvp_transaction.state > HVUTIL_READY)
  return;
  @@ -672,6 +700,8 @@ void hv_kvp_onchannelcallback(void *context)
  vmbus_sendpacket(channel, recv_buffer,
 recvlen, requestid,
 VM_PKT_DATA_INBAND, 0);
+
+host_negotiatied = NEGO_FINISHED;
  }
}
@@ -708,6 +738,7 @@ hv_kvp_init(struct hv_util_service *srv)
  void hv_kvp_deinit(void)
  {
  kvp_transaction.state = HVUTIL_DEVICE_DYING;
+cancel_delayed_work_sync(&kvp_host_handshake_work);
  cancel_delayed_work_sync(&kvp_timeout_work);
  cancel_work_sync(&kvp_sendkey_work);
  hvutil_transport_destroy(hvt);
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index 12321b9..8b07f9c 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -36,6 +36,11 @@
  #define HV_UTIL_TIMEOUT 30
/*
+ * Timeout for guest-host handshake for services.
+ */
+#define HV_UTIL_NEGO_TIMEOUT 60
+
+/*
   * The below CPUID leaves are present if 
VersionAndFeatures.HypervisorPresent

   * is set by CPUID(HVCPUID_VERSION_FEATURES).
   */

Acked-by: Cathy Avery

[PATCH kvm-unit-tests] svm: Fix nmi hlt test to fail test correctly

2020-04-28 Thread Cathy Avery

The last test does not return vmmcall on fail resulting
in passing the entire test.

Signed-off-by: Cathy Avery 
---
 x86/svm_tests.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/x86/svm_tests.c b/x86/svm_tests.c
index 2b84e4d..65008ba 100644
--- a/x86/svm_tests.c
+++ b/x86/svm_tests.c
@@ -1480,6 +1480,7 @@ static void nmi_hlt_test(struct svm_test *test)
 if (!nmi_fired) {
 report(nmi_fired, "intercepted pending NMI not dispatched");
 set_test_stage(test, -1);
+vmmcall();
 }
 
 set_test_stage(test, 3);
-- 
2.20.1

[PATCH] [hv] storvsc: Payload buffer incorrectly sized for 32 bit kernels.

2016-11-22 Thread Cathy Avery

On a 32 bit kernel sizeof(void *) is not 64 bits as hv_mpb_array
requires. Also the buffer needs to be cleared or the upper bytes
could contain junk.

Suggested-by: Vitaly Kuznets 
Signed-off-by: Cathy Avery 
---
 drivers/scsi/storvsc_drv.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index 8ccfc9e..b4a8c9d 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -1495,11 +1495,12 @@ static int storvsc_queuecommand(struct Scsi_Host *host, 
struct scsi_cmnd *scmnd)
if (sg_count) {
if (sg_count > MAX_PAGE_BUFFER_COUNT) {
 
-   payload_sz = (sg_count * sizeof(void *) +
+   payload_sz = (sg_count * sizeof(u64) +
  sizeof(struct vmbus_packet_mpb_array));
payload = kmalloc(payload_sz, GFP_ATOMIC);
if (!payload)
return SCSI_MLQUEUE_DEVICE_BUSY;
+   memset(payload, 0, payload_sz);
}
 
payload->range.len = length;
-- 
2.5.0

[PATCH v2] [hv] storvsc: Payload buffer incorrectly sized for 32 bit kernels.

2016-11-23 Thread Cathy Avery

On a 32 bit kernel sizeof(void *) is not 64 bits as hv_mpb_array
requires. Also the buffer needs to be cleared or the upper bytes
will contain junk.

Suggested-by: Vitaly Kuznetsov 
Signed-off-by: Cathy Avery 

ChangeLog:

v1) Initial submission
v2) Remove memset and replace kmalloc with kzalloc.
---
 drivers/scsi/storvsc_drv.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index 8ccfc9e..05526b7 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -1495,9 +1495,9 @@ static int storvsc_queuecommand(struct Scsi_Host *host, 
struct scsi_cmnd *scmnd)
if (sg_count) {
if (sg_count > MAX_PAGE_BUFFER_COUNT) {
 
-   payload_sz = (sg_count * sizeof(void *) +
+   payload_sz = (sg_count * sizeof(u64) +
  sizeof(struct vmbus_packet_mpb_array));
-   payload = kmalloc(payload_sz, GFP_ATOMIC);
+   payload = kzalloc(payload_sz, GFP_ATOMIC);
if (!payload)
return SCSI_MLQUEUE_DEVICE_BUSY;
}
-- 
2.5.0

[PATCH 0/2] scsi: Create a lightweight FC Transport option for Virtual FC Hosts.

2017-01-18 Thread Cathy Avery

Currently virtual FC hosts or lightweight hosts are not able to be
manually scanned  via sysfs due to the fact that they do not
contain the complete characteristic set associated with a normal FC host
( missing rports, vports, etc ) and are not consistent with the current FC
transport model.

Patch 1: The patch provides a lightweight option to the current FC 
transport class. The new option is selected by a driver when it 
indicates it wants the lightweight transport in fc_function_template.

Patch 2: storvsc elects using the new lightweight FC host option.

Cathy Avery (2):
  scsi: scsi_transport_fc: Provide a lightweight option for Virtual FC
Hosts.
  scsi: storvsc: Add support for FC lightweight host.

 drivers/scsi/scsi_transport_fc.c | 125 +--
 drivers/scsi/storvsc_drv.c   |   6 +-
 include/scsi/scsi_transport_fc.h |   1 +
 3 files changed, 123 insertions(+), 9 deletions(-)

-- 
2.5.0

[PATCH 1/2] scsi: scsi_transport_fc: Provide a lightweight option for Virtual FC Hosts.

2017-01-18 Thread Cathy Avery

The patch provides a means to offer a lightweight option to the
current FC transport class. The new option is selected by a
driver when it indicates it wants the lightweight
transport via fc_function_template.

Signed-off-by: Cathy Avery 
---
 drivers/scsi/scsi_transport_fc.c | 125 +--
 include/scsi/scsi_transport_fc.h |   1 +
 2 files changed, 122 insertions(+), 4 deletions(-)

diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c
index 03577bd..4adc669 100644
--- a/drivers/scsi/scsi_transport_fc.c
+++ b/drivers/scsi/scsi_transport_fc.c
@@ -50,6 +50,15 @@ static int fc_bsg_hostadd(struct Scsi_Host *, struct 
fc_host_attrs *);
 static int fc_bsg_rportadd(struct Scsi_Host *, struct fc_rport *);
 static void fc_bsg_remove(struct request_queue *);
 static void fc_bsg_goose_queue(struct fc_rport *);
+static int fc_host_lw_setup(struct Scsi_Host *, struct fc_host_attrs *);
+static int fc_host_hw_setup(struct Scsi_Host *, struct fc_host_attrs *);
+static int fc_host_hw_remove(struct fc_host_attrs *);
+static struct scsi_transport_template *
+   fc_attach_lw_transport(struct fc_function_template *);
+static struct scsi_transport_template *
+   fc_attach_hw_transport(struct fc_function_template *);
+static void fc_remove_lw_host(struct Scsi_Host *);
+static void fc_remove_hw_host(struct Scsi_Host *, struct fc_host_attrs *);
 
 /*
  * Module Parameters
@@ -352,6 +361,10 @@ struct fc_internal {
 
 #define to_fc_internal(tmpl)   container_of(tmpl, struct fc_internal, t)
 
+
+static void fc_release_lw_transport(struct fc_internal *);
+static void fc_release_hw_transport(struct fc_internal *);
+
 static int fc_target_setup(struct transport_container *tc, struct device *dev,
   struct device *cdev)
 {
@@ -387,7 +400,26 @@ static int fc_host_setup(struct transport_container *tc, 
struct device *dev,
 {
struct Scsi_Host *shost = dev_to_shost(dev);
struct fc_host_attrs *fc_host = shost_to_fc_host(shost);
+   struct fc_internal *i = to_fc_internal(shost->transportt);
+
+   if (i->f->lightweight_transport)
+   return fc_host_lw_setup(shost, fc_host);
+
+   return fc_host_hw_setup(shost, fc_host);
+}
+
+static int fc_host_lw_setup(struct Scsi_Host *shost,
+   struct fc_host_attrs *fc_host)
+{
+   fc_host->node_name = -1;
+   fc_host->port_name = -1;
+
+   return 0;
+}
 
+static int fc_host_hw_setup(struct Scsi_Host *shost,
+   struct fc_host_attrs *fc_host)
+{
/*
 * Set default values easily detected by the midlayer as
 * failure cases.  The scsi lldd is responsible for initializing
@@ -468,7 +500,16 @@ static int fc_host_remove(struct transport_container *tc, 
struct device *dev,
 {
struct Scsi_Host *shost = dev_to_shost(dev);
struct fc_host_attrs *fc_host = shost_to_fc_host(shost);
+   struct fc_internal *i = to_fc_internal(shost->transportt);
+
+   if (i->f->lightweight_transport)
+   return 0;
 
+   return fc_host_hw_remove(fc_host);
+}
+
+static int fc_host_hw_remove(struct fc_host_attrs *fc_host)
+{
fc_bsg_remove(fc_host->rqst_q);
return 0;
 }
@@ -2175,6 +2216,49 @@ static int fc_it_nexus_response(struct Scsi_Host *shost, 
u64 nexus, int result)
 struct scsi_transport_template *
 fc_attach_transport(struct fc_function_template *ft)
 {
+   if (ft->lightweight_transport)
+   return fc_attach_lw_transport(ft);
+
+   return fc_attach_hw_transport(ft);
+}
+EXPORT_SYMBOL(fc_attach_transport);
+
+
+struct scsi_transport_template *
+fc_attach_lw_transport(struct fc_function_template *ft)
+{
+   int count;
+   struct fc_internal *i;
+
+   i = kzalloc(sizeof(struct fc_internal),
+   GFP_KERNEL);
+
+   if (unlikely(!i))
+   return NULL;
+
+   i->t.host_attrs.ac.attrs = &i->host_attrs[0];
+   i->t.host_attrs.ac.class = &fc_host_class.class;
+   i->t.host_attrs.ac.match = fc_host_match;
+   i->t.host_size = sizeof(struct fc_host_attrs);
+   transport_container_register(&i->t.host_attrs);
+
+   i->f = ft;
+
+   count = 0;
+   SETUP_HOST_ATTRIBUTE_RD(node_name);
+   SETUP_HOST_ATTRIBUTE_RD(port_name);
+
+   BUG_ON(count > FC_HOST_NUM_ATTRS);
+
+   i->host_attrs[count] = NULL;
+
+   return &i->t;
+}
+
+
+struct scsi_transport_template *
+fc_attach_hw_transport(struct fc_function_template *ft)
+{
int count;
struct fc_internal *i = kzalloc(sizeof(struct fc_internal),
GFP_KERNEL);
@@ -2318,12 +2402,27 @@ fc_attach_transport(struct fc_function_template *ft)
 
return &i->t;
 }
-EXPORT_SYMBOL(fc_attach_transport);
 
 void fc_release_transport(struct scsi_transport_template *t)
 {
s

[PATCH 2/2] scsi: storvsc: Add support for FC lightweight host.

2017-01-18 Thread Cathy Avery

Enable FC lightweight host option so that the luns exposed by
the driver may be manually scanned.

Signed-off-by: Cathy Avery 
---
 drivers/scsi/storvsc_drv.c | 6 +-
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index 888e16e..fc1d6ba 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -1882,6 +1882,7 @@ static struct hv_driver storvsc_drv = {
 static struct fc_function_template fc_transport_functions = {
.show_host_node_name = 1,
.show_host_port_name = 1,
+   .lightweight_transport = 1,
 };
 #endif
 
@@ -1906,11 +1907,6 @@ static int __init storvsc_drv_init(void)
fc_transport_template = fc_attach_transport(&fc_transport_functions);
if (!fc_transport_template)
return -ENODEV;
-
-   /*
-* Install Hyper-V specific timeout handler.
-*/
-   fc_transport_template->eh_timed_out = storvsc_eh_timed_out;
 #endif
 
ret = vmbus_driver_register(&storvsc_drv);
-- 
2.5.0

Re: [PATCH 1/2] scsi: scsi_transport_fc: Provide a lightweight option for Virtual FC Hosts.

2017-01-19 Thread Cathy Avery




On 01/19/2017 10:11 AM, Christoph Hellwig wrote:

On Wed, Jan 18, 2017 at 03:28:57PM -0500, Cathy Avery wrote:

The patch provides a means to offer a lightweight option to the
current FC transport class. The new option is selected by a
driver when it indicates it wants the lightweight
transport via fc_function_template.

This need some really good documentation in the code and changelog
what "lightweight" means.  It's a pretty horrible term, btw.


Thanks, I will work on better documentation and a better name.

Cathy

Re: [PATCH 2/2] scsi: storvsc: Add support for FC lightweight host.

2017-01-23 Thread Cathy Avery


Hi,

There is no way to issue a lip directly as  the current client for this 
feature ( storvsc ) does not handle that request as a physical fc hba 
can. Storvsc only has two fc attributes exposed -  port_name and node_name.


You can rescan the bus with the standard echo "- - -" > 
/sys/class/scsi_host/hostX/scan.


Cathy


On 01/22/2017 10:13 PM, Fam Zheng wrote:

On Wed, 01/18 15:28, Cathy Avery wrote:

Enable FC lightweight host option so that the luns exposed by
the driver may be manually scanned.

Hi Cathy, out of curiosity: how does this relate to issue_lip operation? And how
to trigger manual scan with this patch?

Fam

[PATCH v2 2/2] scsi: storvsc: Add support for FC lightweight host.

2017-01-26 Thread Cathy Avery

Included in the current storvsc driver for Hyper-V is the ability
to access luns on an FC fabric via a virtualized fiber channel
adapter exposed by the Hyper-V host. This was done to provide an
interface for existing customer tools that was more consistent
with a conventional FC device. The driver attaches to the FC
transport to allow host and port names to be published
under /sys/class/fc_host/hostX.

A problem arose when attaching to the FC transport. The scsi_scan
code attempts to call fc_user_scan which has basically become a no-op
due to the virtualized nature of the FC host ( missing rports,
vports, etc ).  At this point you cannot refresh the scsi bus after
mapping or unmapping luns on the SAN without a reboot.

This patch consists of:

1) storvsc elects to use the new lightweight FC host option by enabling it
in fc_function_template facilitating a fuctional scsi_scan.

2) Removes an original workaround dealing with replacing the eh_timed_out
function.  Patch 1 will not set the scsi_transport_template.eh_timed_out
function directly during lightweight fc_attach_transport(). It instead
relies on whatever was indicated as the scsi_host_template timeout handler
during scsi_times_out() scsi_error.c. So the workaround is no longer
necessary.

Signed-off-by: Cathy Avery 
---
 drivers/scsi/storvsc_drv.c | 12 +++-
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index 888e16e..d487e00 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -1878,10 +1878,17 @@ static struct hv_driver storvsc_drv = {
.remove = storvsc_remove,
 };
 
+/*
+ * The driver cannot functionally back all transport
+ * attributes so select only those pertinent including
+ * the lightweight model.
+ */
+
 #if IS_ENABLED(CONFIG_SCSI_FC_ATTRS)
 static struct fc_function_template fc_transport_functions = {
.show_host_node_name = 1,
.show_host_port_name = 1,
+   .lightweight_transport = 1,
 };
 #endif
 
@@ -1906,11 +1913,6 @@ static int __init storvsc_drv_init(void)
fc_transport_template = fc_attach_transport(&fc_transport_functions);
if (!fc_transport_template)
return -ENODEV;
-
-   /*
-* Install Hyper-V specific timeout handler.
-*/
-   fc_transport_template->eh_timed_out = storvsc_eh_timed_out;
 #endif
 
ret = vmbus_driver_register(&storvsc_drv);
-- 
2.5.0

[PATCH v2 0/2] scsi: storvsc: Add support for FC lightweight host.

2017-01-26 Thread Cathy Avery

This patch set is based on the following patch submission
and email exchange:

[PATCH 1/1] scsi: storvsc: Support manual scan of FC hosts on Hyper-V
K. Y. Srinivasan kys at microsoft.com
Sat Mar 12 21:52:48 UTC 2016

http://driverdev.linuxdriverproject.org/pipermail/driverdev-devel/2016-March/087116.html

Included in the current storvsc driver for Hyper-V is the ability
to access luns on an FC fabric via a virtualized fiber channel
adapter exposed by the Hyper-V host. This was done to provide an
interface for existing customer tools that was more consistent with
a conventional FC device. The driver attaches to the FC transport 
to allow host and port names to be published under 
/sys/class/fc_host/hostX.

A problem arose when attaching to the FC transport. The scsi_scan code
attempts to call fc_user_scan which has basically become a no-op 
due to the virtualized nature of the FC host 
( missing rports, vports, etc ). At this point you cannot refresh
the scsi bus after mapping or unmapping luns on the SAN without
a reboot.

The patch above attempted to address the problem of not being
able to scan FC hosts on a Hyper-V guest by setting
fc_transport_template->user_scan = NULL but it was rejected
in favor of a new "lightweight" version of the FC transport that
only provides the bare minimum functionality of the standard FC model.
This new transport option would be more suitable for FC transports
running on a VM and provide some flexibility in the future.

The patches below offer a method to incorporate the new 
lightweight FC option into the existing transport
and storvsc drivers.

Patch 1: scsi_transport_fc.h, scsi_transport_fc.c

1) Adds the lightweight_transport option to fc_function_template.
Based on this selection the transport will either be lightweight
or default to heavyweight.

2) Divides the applicable export functions into two sets.
The lightweight functions involve FC attributes port_name and 
node name. The functions that deal with targets, rports, etc
are not used. The heavyweight default contains the original
standard physical FC hba attribute set. 

3) All top level FC class directories such fc_remote_ports,
fc_transport, and fc_vports are still created when the transport
driver is loaded. They are just not populated when running in
lightweight mode. Conceptually both lightweight and heavyweight
clients could coexist.

4) fc_transport_template->user_scan is now null and the bus 
can be scanned.

Patch 2: storvsc.c

1) storvsc elects to use the new lightweight FC host option 
by enabling it in fc_function_template. 

2) Removes an original workaround dealing with replacing
the eh_timed_out function. Patch 1 will not set the 
scsi_transport_template.eh_timed_out function directly during
lightweight fc_attach_transport(). It instead relies on
whatever was indicated as the scsi_host_template timeout handler
during scsi_times_out() scsi_error.c. So the workaround is 
no longer necessary.


It has been suggested that the word lightweight may not be
the best choice of terms when describing the new FC transport
option.  I can offer a few new ones but I am not particularly
imaginative. 

Virtual FC
Mini FC
Host only FC 


Changes from V1:

Added more comments and documentation in the code regarding 
the lightweight feature.


Cathy Avery (2):
  scsi: scsi_transport_fc: Provide a lightweight option for Virtual FC
Hosts.
  scsi: storvsc: Add support for FC lightweight host.

 drivers/scsi/scsi_transport_fc.c | 144 +--
 drivers/scsi/storvsc_drv.c   |  12 ++--
 include/scsi/scsi_transport_fc.h |   2 +
 3 files changed, 149 insertions(+), 9 deletions(-)

-- 
2.5.0

[PATCH v2 1/2] scsi: scsi_transport_fc: Provide a lightweight option for Virtual FC Hosts.

2017-01-26 Thread Cathy Avery

This patch provides a means to offer a lightweight option to the
current FC transport class. This new transport option is more
suitable for FC transports running on a VM that virtualizes their
FCs and that do not possess rports or vports whereas the heavyweight
option maintains the standard physical FC hba attuibute set.

The patch performs the following:

1) Adds the lightweight_transport option to fc_function_template. Based
on this selection the transport will either be lightweight or default
to heavyweight.

2) Divides the applicable export functions into two sets. The lightweight
functions involve FC attributes port_name and node name. The functions
that deal with targets, rports, etc are not used. The heavyweight default
contains the original standard physical FC hba attribute set.

3) All top level FC class directories such fc_remote_ports, fc_transport,
and fc_vports are still created when the transport driver loads.
They are just not populated when running in lightweight mode. Conceptually
both lightweight and heavyweight clients could coexist.

4) fc_transport_template->user_scan is now null and the bus can be scanned.

Signed-off-by: Cathy Avery 
---
 drivers/scsi/scsi_transport_fc.c | 144 +--
 include/scsi/scsi_transport_fc.h |   2 +
 2 files changed, 142 insertions(+), 4 deletions(-)

diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c
index 03577bd..615057c 100644
--- a/drivers/scsi/scsi_transport_fc.c
+++ b/drivers/scsi/scsi_transport_fc.c
@@ -52,6 +52,25 @@ static void fc_bsg_remove(struct request_queue *);
 static void fc_bsg_goose_queue(struct fc_rport *);
 
 /*
+ * These functions define the actions taken on behalf of either
+ * a virtualized client which is considered to be lightweigth
+ * having only port name and node name as attributes in sysfs and
+ * which does not possess rports or vports vs the heavyweigth
+ * mode which is intended for fully functional clients such as
+ * physical HBAs.
+ */
+
+static int fc_host_lw_setup(struct Scsi_Host *, struct fc_host_attrs *);
+static int fc_host_hw_setup(struct Scsi_Host *, struct fc_host_attrs *);
+static int fc_host_hw_remove(struct fc_host_attrs *);
+static struct scsi_transport_template *
+   fc_attach_lw_transport(struct fc_function_template *);
+static struct scsi_transport_template *
+   fc_attach_hw_transport(struct fc_function_template *);
+static void fc_remove_lw_host(struct Scsi_Host *);
+static void fc_remove_hw_host(struct Scsi_Host *, struct fc_host_attrs *);
+
+/*
  * Module Parameters
  */
 
@@ -352,6 +371,10 @@ struct fc_internal {
 
 #define to_fc_internal(tmpl)   container_of(tmpl, struct fc_internal, t)
 
+
+static void fc_release_lw_transport(struct fc_internal *);
+static void fc_release_hw_transport(struct fc_internal *);
+
 static int fc_target_setup(struct transport_container *tc, struct device *dev,
   struct device *cdev)
 {
@@ -387,7 +410,33 @@ static int fc_host_setup(struct transport_container *tc, 
struct device *dev,
 {
struct Scsi_Host *shost = dev_to_shost(dev);
struct fc_host_attrs *fc_host = shost_to_fc_host(shost);
+   struct fc_internal *i = to_fc_internal(shost->transportt);
+
+   /*
+* Check if the client driver has selected a minimum set
+* of fc transport attributes to initialize. Otherwise
+* initialize all currently available attributes.
+*/
+
+   if (i->f->lightweight_transport)
+   return fc_host_lw_setup(shost, fc_host);
+
+   return fc_host_hw_setup(shost, fc_host);
+}
+
+static int fc_host_lw_setup(struct Scsi_Host *shost,
+   struct fc_host_attrs *fc_host)
+{
+   /* Only node_name and port_name are used */
+   fc_host->node_name = -1;
+   fc_host->port_name = -1;
+
+   return 0;
+}
 
+static int fc_host_hw_setup(struct Scsi_Host *shost,
+   struct fc_host_attrs *fc_host)
+{
/*
 * Set default values easily detected by the midlayer as
 * failure cases.  The scsi lldd is responsible for initializing
@@ -468,7 +517,16 @@ static int fc_host_remove(struct transport_container *tc, 
struct device *dev,
 {
struct Scsi_Host *shost = dev_to_shost(dev);
struct fc_host_attrs *fc_host = shost_to_fc_host(shost);
+   struct fc_internal *i = to_fc_internal(shost->transportt);
+
+   if (i->f->lightweight_transport)
+   return 0;
 
+   return fc_host_hw_remove(fc_host);
+}
+
+static int fc_host_hw_remove(struct fc_host_attrs *fc_host)
+{
fc_bsg_remove(fc_host->rqst_q);
return 0;
 }
@@ -2175,6 +2233,51 @@ static int fc_it_nexus_response(struct Scsi_Host *shost, 
u64 nexus, int result)
 struct scsi_transport_template *
 fc_attach_transport(struct fc_function_template *ft)
 {
+   if (ft->lightweight_transport)
+   return fc_attach_l

Re: [PATCH 2/2] scsi: storvsc: Add support for FC lightweight host.

2017-01-19 Thread Cathy Avery




On 01/18/2017 06:15 PM, Dan Carpenter wrote:

On Wed, Jan 18, 2017 at 03:28:58PM -0500, Cathy Avery wrote:

Enable FC lightweight host option so that the luns exposed by
the driver may be manually scanned.

Signed-off-by: Cathy Avery 
---
  drivers/scsi/storvsc_drv.c | 6 +-
  1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index 888e16e..fc1d6ba 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -1882,6 +1882,7 @@ static struct hv_driver storvsc_drv = {
  static struct fc_function_template fc_transport_functions = {
.show_host_node_name = 1,
.show_host_port_name = 1,
+   .lightweight_transport = 1,
  };
  #endif
  
@@ -1906,11 +1907,6 @@ static int __init storvsc_drv_init(void)

fc_transport_template = fc_attach_transport(&fc_transport_functions);
if (!fc_transport_template)
return -ENODEV;
-
-   /*
-* Install Hyper-V specific timeout handler.
-*/
-   fc_transport_template->eh_timed_out = storvsc_eh_timed_out;

I don't undestand how removing this is related.


Its not related but it is also not necessary so I took it out. The 
default scsi timeout handler will be used.


I can certainly put it back.

Cathy


regards,
dan carpenter

Re: [PATCH 2/2] scsi: storvsc: Add support for FC lightweight host.

2017-01-22 Thread Cathy Avery

I'm sorry.  In my zeal to push out this patch I have done a poor job of 
communication on a number of levels.


The first patch which deals with the fc transport changes will not set 
the scsi_transport_template.eh_timed_out function directly during 
lightweight fc_attach_transport(). It instead relies on whatever was 
indicated as the scsi_host_template timeout handler during 
inscsi_times_out() scsi_error.c.


So yes in a sense it is related but now I believe I understand your 
point. Perhaps this would fall more under the heading of post 
fc_transport implementation storvsc cleanup necessitating its own patch.


I will break it out in the next go round.

Thanks,

Cathy


On 01/20/2017 04:31 AM, Dan Carpenter wrote:

On Thu, Jan 19, 2017 at 12:55:27PM -0500, Cathy Avery wrote:


On 01/18/2017 06:15 PM, Dan Carpenter wrote:

On Wed, Jan 18, 2017 at 03:28:58PM -0500, Cathy Avery wrote:

Enable FC lightweight host option so that the luns exposed by
the driver may be manually scanned.

Signed-off-by: Cathy Avery 
---
  drivers/scsi/storvsc_drv.c | 6 +-
  1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index 888e16e..fc1d6ba 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -1882,6 +1882,7 @@ static struct hv_driver storvsc_drv = {
  static struct fc_function_template fc_transport_functions = {
.show_host_node_name = 1,
.show_host_port_name = 1,
+   .lightweight_transport = 1,
  };
  #endif
@@ -1906,11 +1907,6 @@ static int __init storvsc_drv_init(void)
fc_transport_template = fc_attach_transport(&fc_transport_functions);
if (!fc_transport_template)
return -ENODEV;
-
-   /*
-* Install Hyper-V specific timeout handler.
-*/
-   fc_transport_template->eh_timed_out = storvsc_eh_timed_out;

I don't undestand how removing this is related.

Its not related but it is also not necessary so I took it out. The
default scsi timeout handler will be used.

I can certainly put it back.

I'm not sure that we understand each other properly.

Has this patch already been committed?  If so, then there is no need to
put it back.

But it if hasn't been committed, can you resend the patches with that
bit broken out into a separate patch with its own changelog?  Patches
should only do one thing but you're  saying that it's doing two
unrelated things.

regards,
dan carpenter

Re: [PATCH v15 net-next 1/1] hv_sock: introduce Hyper-V Sockets

2016-07-08 Thread Cathy Avery

It's too bad about having to allocate the send/receive rings on a per 
socket basis. Hopefully this will change in the future.


I have built kernel 4.7.0-rc6 with this patch and did a quick test on 
the driver using rhel7 over hyperv Windows Server 2016 TP5. These were 
basic send and receive tests ( host to vm and vm to host ) using apps 
provided by Dexuan.


Reviewed-by: Cathy Avery 
Tested-by: Cathy Avery 

On 07/08/2016 03:47 AM, Dexuan Cui wrote:

Hyper-V Sockets (hv_sock) supplies a byte-stream based communication
mechanism between the host and the guest. It's somewhat like TCP over
VMBus, but the transportation layer (VMBus) is much simpler than IP.

With Hyper-V Sockets, applications between the host and the guest can talk
to each other directly by the traditional BSD-style socket APIs.

Hyper-V Sockets is only available on new Windows hosts, like Windows Server
2016. More info is in this article "Make your own integration services":
https://msdn.microsoft.com/en-us/virtualization/hyperv_on_windows/develop/make_mgmt_service

The patch implements the necessary support in the guest side by introducing
a new socket address family AF_HYPERV.

Signed-off-by: Dexuan Cui 
Cc: "K. Y. Srinivasan" 
Cc: Haiyang Zhang 
Cc: Vitaly Kuznetsov 
Cc: Cathy Avery 
---

You can also get the patch here (2764221d):
https://github.com/dcui/linux/commits/decui/hv_sock/net-next/20160708_v15

For the change log before v12, please see https://lkml.org/lkml/2016/5/15/31

In v12, the changes are mainly the following:

1) remove the module params as David suggested.

2) use 5 exact pages for VMBus send/recv rings, respectively.
The host side's design of the feature requires 5 exact pages for recv/send
rings respectively -- this is suboptimal considering memory consumption,
however unluckily we have to live with it, before the host comes up with
a new design in the future. :-(

3) remove the per-connection static send/recv buffers
Instead, we allocate and free the buffers dynamically only when we recv/send
data. This means: when a connection is idle, no memory is consumed as
recv/send buffers at all.

In v13:
I return ENOMEM on buffer alllocation failure

Actually "man read/write" says "Other errors may occur, depending on the
object connected to fd". "man send/recv" indeed lists ENOMEM.
Considering AF_HYPERV is a new socket type, ENOMEM seems OK here.
In the long run, I think we should add a new API in the VMBus driver,
allowing data copy from VMBus ringbuffer into user mode buffer directly.
This way, we can even eliminate this temporary buffer.

In v14:
fix some coding style issues pointed out by David.

In v15:
Just some stylistic changes addressing comments from Joe Perches and
Olaf Hering -- thank you!
- add a GPL blurb.
- define a new macro PAGE_SIZE_4K and use it to replace PAGE_SIZE
- change sk_to_hvsock/hvsock_to_sk() from macros to inline functions
- remove a not-very-useful pr_err()
- fix some typos in comment and coding style issues.

Looking forward to your comments!

  MAINTAINERS |2 +
  include/linux/hyperv.h  |   13 +
  include/linux/socket.h  |4 +-
  include/net/af_hvsock.h |   78 +++
  include/uapi/linux/hyperv.h |   24 +
  net/Kconfig |1 +
  net/Makefile|1 +
  net/hv_sock/Kconfig |   10 +
  net/hv_sock/Makefile|3 +
  net/hv_sock/af_hvsock.c | 1523 +++
  10 files changed, 1658 insertions(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 50f69ba..6eaa26f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5514,7 +5514,9 @@ F:drivers/pci/host/pci-hyperv.c
  F:drivers/net/hyperv/
  F:drivers/scsi/storvsc_drv.c
  F:drivers/video/fbdev/hyperv_fb.c
+F: net/hv_sock/
  F:include/linux/hyperv.h
+F: include/net/af_hvsock.h
  F:tools/hv/
  F:Documentation/ABI/stable/sysfs-bus-vmbus
  
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h

index 50f493e..1cda6ea5 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1508,5 +1508,18 @@ static inline void commit_rd_index(struct vmbus_channel 
*channel)
vmbus_set_event(channel);
  }
  
+struct vmpipe_proto_header {

+   u32 pkt_type;
+   u32 data_size;
+};
+
+#define HVSOCK_HEADER_LEN  (sizeof(struct vmpacket_descriptor) + \
+sizeof(struct vmpipe_proto_header))
+
+/* See 'prev_indices' in hv_ringbuffer_read(), hv_ringbuffer_write() */
+#define PREV_INDICES_LEN   (sizeof(u64))
  
+#define HVSOCK_PKT_LEN(payload_len)	(HVSOCK_HEADER_LEN + \

+   ALIGN((payload_len), 8) + \
+   PREV_INDICES_LEN)
  #endif /* _HYPERV_H */
diff --git a/include/linux/socket.h b/include/linux/socket.h
index b5cc5a6..0b68b58 100644
--- a/include/linux/socket.h
+++ b/i

[RFC PATCH] scsi: scsi_transport_fc: Create a lightweight option for Virtual FC Hosts.

2017-01-04 Thread Cathy Avery

This patch represents an attempt to resurrect the conversation
based on the submission of patch:

[PATCH 1/1] scsi: storvsc: Support manual scan of FC hosts on Hyper-V
K. Y. Srinivasan kys at microsoft.com
Sat Mar 12 21:52:48 UTC 2016

http://driverdev.linuxdriverproject.org/pipermail/driverdev-devel/2016-March/087116.html

That patch attempted to address the problem of not being able to scan FC
hosts on a Hyper-V guest via sysfs due to the fact that they did not
contain the complete characteristic set associated with a normal FC host
( missing rports, vports, etc ). These new lightweight hosts as they
were subsequently referred to are not consistent with the current FC
transport model.

The patch below provides a method to reconcile the issue by offering
a lightweight option to the current FC transport class. The new option
is selected by a driver when it indicates it wants the lightweight
transport in fc_function_template. I have included the changes for
storvsc_drv.c in this patch as an example of a driver making use of the
lightweight transport option.

Signed-off-by: Cathy Avery 
---
 drivers/scsi/scsi_transport_fc.c | 125 +--
 drivers/scsi/storvsc_drv.c   |   6 +-
 include/scsi/scsi_transport_fc.h |   1 +
 3 files changed, 123 insertions(+), 9 deletions(-)

diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c
index 03577bd..4adc669 100644
--- a/drivers/scsi/scsi_transport_fc.c
+++ b/drivers/scsi/scsi_transport_fc.c
@@ -50,6 +50,15 @@ static int fc_bsg_hostadd(struct Scsi_Host *, struct 
fc_host_attrs *);
 static int fc_bsg_rportadd(struct Scsi_Host *, struct fc_rport *);
 static void fc_bsg_remove(struct request_queue *);
 static void fc_bsg_goose_queue(struct fc_rport *);
+static int fc_host_lw_setup(struct Scsi_Host *, struct fc_host_attrs *);
+static int fc_host_hw_setup(struct Scsi_Host *, struct fc_host_attrs *);
+static int fc_host_hw_remove(struct fc_host_attrs *);
+static struct scsi_transport_template *
+   fc_attach_lw_transport(struct fc_function_template *);
+static struct scsi_transport_template *
+   fc_attach_hw_transport(struct fc_function_template *);
+static void fc_remove_lw_host(struct Scsi_Host *);
+static void fc_remove_hw_host(struct Scsi_Host *, struct fc_host_attrs *);
 
 /*
  * Module Parameters
@@ -352,6 +361,10 @@ struct fc_internal {
 
 #define to_fc_internal(tmpl)   container_of(tmpl, struct fc_internal, t)
 
+
+static void fc_release_lw_transport(struct fc_internal *);
+static void fc_release_hw_transport(struct fc_internal *);
+
 static int fc_target_setup(struct transport_container *tc, struct device *dev,
   struct device *cdev)
 {
@@ -387,7 +400,26 @@ static int fc_host_setup(struct transport_container *tc, 
struct device *dev,
 {
struct Scsi_Host *shost = dev_to_shost(dev);
struct fc_host_attrs *fc_host = shost_to_fc_host(shost);
+   struct fc_internal *i = to_fc_internal(shost->transportt);
+
+   if (i->f->lightweight_transport)
+   return fc_host_lw_setup(shost, fc_host);
+
+   return fc_host_hw_setup(shost, fc_host);
+}
+
+static int fc_host_lw_setup(struct Scsi_Host *shost,
+   struct fc_host_attrs *fc_host)
+{
+   fc_host->node_name = -1;
+   fc_host->port_name = -1;
+
+   return 0;
+}
 
+static int fc_host_hw_setup(struct Scsi_Host *shost,
+   struct fc_host_attrs *fc_host)
+{
/*
 * Set default values easily detected by the midlayer as
 * failure cases.  The scsi lldd is responsible for initializing
@@ -468,7 +500,16 @@ static int fc_host_remove(struct transport_container *tc, 
struct device *dev,
 {
struct Scsi_Host *shost = dev_to_shost(dev);
struct fc_host_attrs *fc_host = shost_to_fc_host(shost);
+   struct fc_internal *i = to_fc_internal(shost->transportt);
+
+   if (i->f->lightweight_transport)
+   return 0;
 
+   return fc_host_hw_remove(fc_host);
+}
+
+static int fc_host_hw_remove(struct fc_host_attrs *fc_host)
+{
fc_bsg_remove(fc_host->rqst_q);
return 0;
 }
@@ -2175,6 +2216,49 @@ static int fc_it_nexus_response(struct Scsi_Host *shost, 
u64 nexus, int result)
 struct scsi_transport_template *
 fc_attach_transport(struct fc_function_template *ft)
 {
+   if (ft->lightweight_transport)
+   return fc_attach_lw_transport(ft);
+
+   return fc_attach_hw_transport(ft);
+}
+EXPORT_SYMBOL(fc_attach_transport);
+
+
+struct scsi_transport_template *
+fc_attach_lw_transport(struct fc_function_template *ft)
+{
+   int count;
+   struct fc_internal *i;
+
+   i = kzalloc(sizeof(struct fc_internal),
+   GFP_KERNEL);
+
+   if (unlikely(!i))
+   return NULL;
+
+   i->t.host_attrs.ac.attrs = &i->host_attrs[0];
+   i->t.host_attrs.ac.class = &fc_host_class.class;
+

[PATCH v2] scsi: storvsc: Add support for FC rport.

2017-03-17 Thread Cathy Avery

Included in the current storvsc driver for Hyper-V is the ability
to access luns on an FC fabric via a virtualized fiber channel
adapter exposed by the Hyper-V host. The driver also attaches to
the FC transport to allow host and port names to be published under
/sys/class/fc_host/hostX. Current customer tools running on the VM
require that these names be available in the well known standard
location under fc_host/hostX.

A problem arose when attaching to the FC transport. The scsi_scan
code attempts to call fc_user_scan which has basically become a no-op
due to the fact that the virtualized FC device does not expose rports.
At this point you cannot refresh the scsi bus after mapping or unmapping
luns on the SAN without a reboot of the VM.

This patch stubs in an rport per fc_host in storvsc so that the
requirement of a defined rport is now met within the fc_transport and
echo "- - -" > /sys/class/scsi_host/hostX/scan now works.

Signed-off-by: Cathy Avery 
---
Changes since v1:
- Fix fc_rport_identifiers init [Stephen Hemminger]
- Better error checking
---
 drivers/scsi/storvsc_drv.c | 23 ++-
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index 638e5f4..37646d1 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -478,6 +478,9 @@ struct storvsc_device {
 */
u64 node_name;
u64 port_name;
+#if IS_ENABLED(CONFIG_SCSI_FC_ATTRS)
+   struct fc_rport *rport;
+#endif
 };
 
 struct hv_host_device {
@@ -1816,19 +1819,27 @@ static int storvsc_probe(struct hv_device *device,
target = (device->dev_instance.b[5] << 8 |
 device->dev_instance.b[4]);
ret = scsi_add_device(host, 0, target, 0);
-   if (ret) {
-   scsi_remove_host(host);
-   goto err_out2;
-   }
+   if (ret)
+   goto err_out3;
}
 #if IS_ENABLED(CONFIG_SCSI_FC_ATTRS)
if (host->transportt == fc_transport_template) {
+   struct fc_rport_identifiers ids = {
+   .roles = FC_PORT_ROLE_FCP_TARGET,
+   };
+
fc_host_node_name(host) = stor_device->node_name;
fc_host_port_name(host) = stor_device->port_name;
+   stor_device->rport = fc_remote_port_add(host, 0, &ids);
+   if (!stor_device->rport)
+   goto err_out3;
}
 #endif
return 0;
 
+err_out3:
+   scsi_remove_host(host);
+
 err_out2:
/*
 * Once we have connected with the host, we would need to
@@ -1854,8 +1865,10 @@ static int storvsc_remove(struct hv_device *dev)
struct Scsi_Host *host = stor_device->host;
 
 #if IS_ENABLED(CONFIG_SCSI_FC_ATTRS)
-   if (host->transportt == fc_transport_template)
+   if (host->transportt == fc_transport_template) {
+   fc_remote_port_delete(stor_device->rport);
fc_remove_host(host);
+   }
 #endif
scsi_remove_host(host);
storvsc_dev_remove(dev);
-- 
2.5.0

Re: [PATCH] scsi: storvsc: Add support for FC rport.

2017-03-13 Thread Cathy Avery


Hi,

I haven't received any feedback yet.

Should I resend?

Thanks,

Cathy

On 02/28/2017 01:45 PM, Cathy Avery wrote:

Included in the current storvsc driver for Hyper-V is the ability
to access luns on an FC fabric via a virtualized fiber channel
adapter exposed by the Hyper-V host. The driver also attaches to the FC
transport to allow host and port names to be published under
/sys/class/fc_host/hostX. Current customer tools running on the VM
require that these names be available in the well known standard
location under fc_host/hostX.

A problem arose when attaching to the FC transport. The scsi_scan
code attempts to call fc_user_scan which has basically become a no-op
due to the fact that the virtualized FC device does not expose rports.
At this point you cannot refresh the scsi bus after mapping or unmapping
luns on the SAN without a reboot of the VM.

This patch stubs in an rport per fc_host in storvsc so that the
requirement of a defined rport is now met within the fc_transport and
echo "- - -" > /sys/class/scsi_host/hostX/scan now works.

Signed-off-by: Cathy Avery 
---
  drivers/scsi/storvsc_drv.c | 15 ++-
  1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index 585e54f..6d7b932 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -478,6 +478,9 @@ struct storvsc_device {
 */
u64 node_name;
u64 port_name;
+#if IS_ENABLED(CONFIG_SCSI_FC_ATTRS)
+   struct fc_rport *rport;
+#endif
  };
  
  struct hv_host_device {

@@ -1823,8 +1826,16 @@ static int storvsc_probe(struct hv_device *device,
}
  #if IS_ENABLED(CONFIG_SCSI_FC_ATTRS)
if (host->transportt == fc_transport_template) {
+   struct fc_rport_identifiers ids;
+
+   ids.node_name = 0;
+   ids.port_name = 0;
+   ids.port_id = 0;
+   ids.roles |= FC_PORT_ROLE_FCP_TARGET;
+
fc_host_node_name(host) = stor_device->node_name;
fc_host_port_name(host) = stor_device->port_name;
+   stor_device->rport = fc_remote_port_add(host, 0, &ids);
}
  #endif
return 0;
@@ -1854,8 +1865,10 @@ static int storvsc_remove(struct hv_device *dev)
struct Scsi_Host *host = stor_device->host;
  
  #if IS_ENABLED(CONFIG_SCSI_FC_ATTRS)

-   if (host->transportt == fc_transport_template)
+   if (host->transportt == fc_transport_template) {
+   fc_remote_port_delete(stor_device->rport);
fc_remove_host(host);
+   }
  #endif
scsi_remove_host(host);
storvsc_dev_remove(dev);

[PATCH] scsi: storvsc: Add support for FC rport.

2017-03-14 Thread Cathy Avery

Included in the current storvsc driver for Hyper-V is the ability
to access luns on an FC fabric via a virtualized fiber channel
adapter exposed by the Hyper-V host. The driver also attaches to the FC
transport to allow host and port names to be published under
/sys/class/fc_host/hostX. Current customer tools running on the VM
require that these names be available in the well known standard
location under fc_host/hostX.

A problem arose when attaching to the FC transport. The scsi_scan
code attempts to call fc_user_scan which has basically become a no-op
due to the fact that the virtualized FC device does not expose rports.
At this point you cannot refresh the scsi bus after mapping or unmapping
luns on the SAN without a reboot of the VM.

This patch stubs in an rport per fc_host in storvsc so that the
requirement of a defined rport is now met within the fc_transport and
echo "- - -" > /sys/class/scsi_host/hostX/scan now works.

Signed-off-by: Cathy Avery 
---
 drivers/scsi/storvsc_drv.c | 15 ++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index 638e5f4..c6c0316 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -478,6 +478,9 @@ struct storvsc_device {
 */
u64 node_name;
u64 port_name;
+#if IS_ENABLED(CONFIG_SCSI_FC_ATTRS)
+   struct fc_rport *rport;
+#endif
 };
 
 struct hv_host_device {
@@ -1823,8 +1826,16 @@ static int storvsc_probe(struct hv_device *device,
}
 #if IS_ENABLED(CONFIG_SCSI_FC_ATTRS)
if (host->transportt == fc_transport_template) {
+   struct fc_rport_identifiers ids;
+
+   ids.node_name = 0;
+   ids.port_name = 0;
+   ids.port_id = 0;
+   ids.roles |= FC_PORT_ROLE_FCP_TARGET;
+
fc_host_node_name(host) = stor_device->node_name;
fc_host_port_name(host) = stor_device->port_name;
+   stor_device->rport = fc_remote_port_add(host, 0, &ids);
}
 #endif
return 0;
@@ -1854,8 +1865,10 @@ static int storvsc_remove(struct hv_device *dev)
struct Scsi_Host *host = stor_device->host;
 
 #if IS_ENABLED(CONFIG_SCSI_FC_ATTRS)
-   if (host->transportt == fc_transport_template)
+   if (host->transportt == fc_transport_template) {
+   fc_remote_port_delete(stor_device->rport);
fc_remove_host(host);
+   }
 #endif
scsi_remove_host(host);
storvsc_dev_remove(dev);
-- 
2.5.0

Re: [PATCH] scsi: storvsc: Add support for FC rport.

2017-03-14 Thread Cathy Avery


Good catch. Thanks!

On 03/14/2017 12:42 PM, Stephen Hemminger wrote:

On Tue, 14 Mar 2017 12:01:03 -0400
Cathy Avery  wrote:


  #if IS_ENABLED(CONFIG_SCSI_FC_ATTRS)
if (host->transportt == fc_transport_template) {
+   struct fc_rport_identifiers ids;
+
+   ids.node_name = 0;
+   ids.port_name = 0;
+   ids.port_id = 0;
+   ids.roles |= FC_PORT_ROLE_FCP_TARGET;

Since the variable ids is on the stack, it is uninitialized data.
Doing a OR with uninitialized data is not correct.

Better off to use C99 style iniatializer and skip the zero fields.

struct fc_rport_identifiers ids = {
.roles = FC_PORT_ROLE_FCP_TARGET,
};

[PATCH] scsi: storvsc: Add support for FC rport.

2017-02-28 Thread Cathy Avery

Included in the current storvsc driver for Hyper-V is the ability
to access luns on an FC fabric via a virtualized fiber channel
adapter exposed by the Hyper-V host. The driver also attaches to the FC
transport to allow host and port names to be published under
/sys/class/fc_host/hostX. Current customer tools running on the VM
require that these names be available in the well known standard
location under fc_host/hostX.

A problem arose when attaching to the FC transport. The scsi_scan
code attempts to call fc_user_scan which has basically become a no-op
due to the fact that the virtualized FC device does not expose rports.
At this point you cannot refresh the scsi bus after mapping or unmapping
luns on the SAN without a reboot of the VM.

This patch stubs in an rport per fc_host in storvsc so that the
requirement of a defined rport is now met within the fc_transport and
echo "- - -" > /sys/class/scsi_host/hostX/scan now works.

Signed-off-by: Cathy Avery 
---
 drivers/scsi/storvsc_drv.c | 15 ++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index 585e54f..6d7b932 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -478,6 +478,9 @@ struct storvsc_device {
 */
u64 node_name;
u64 port_name;
+#if IS_ENABLED(CONFIG_SCSI_FC_ATTRS)
+   struct fc_rport *rport;
+#endif
 };
 
 struct hv_host_device {
@@ -1823,8 +1826,16 @@ static int storvsc_probe(struct hv_device *device,
}
 #if IS_ENABLED(CONFIG_SCSI_FC_ATTRS)
if (host->transportt == fc_transport_template) {
+   struct fc_rport_identifiers ids;
+
+   ids.node_name = 0;
+   ids.port_name = 0;
+   ids.port_id = 0;
+   ids.roles |= FC_PORT_ROLE_FCP_TARGET;
+
fc_host_node_name(host) = stor_device->node_name;
fc_host_port_name(host) = stor_device->port_name;
+   stor_device->rport = fc_remote_port_add(host, 0, &ids);
}
 #endif
return 0;
@@ -1854,8 +1865,10 @@ static int storvsc_remove(struct hv_device *dev)
struct Scsi_Host *host = stor_device->host;
 
 #if IS_ENABLED(CONFIG_SCSI_FC_ATTRS)
-   if (host->transportt == fc_transport_template)
+   if (host->transportt == fc_transport_template) {
+   fc_remote_port_delete(stor_device->rport);
fc_remove_host(host);
+   }
 #endif
scsi_remove_host(host);
storvsc_dev_remove(dev);
-- 
2.5.0

[PATCH] PCI: hv: Fix interrupt cleanup path

2016-07-12 Thread Cathy Avery

SR-IOV disabled from the host causes a memory leak.
pci-hyperv usually first receives a PCI_EJECT notification
and then proceeds to delete the hpdev list entry in
hv_eject_device_work(). Later in hv_msi_free() since the
device is no longer on the device list hpdev is NULL
and hv_msi_free returns without freeing int_desc as part of
hv_int_desc_free().

Signed-off-by: Cathy Avery 
---
 drivers/pci/host/pci-hyperv.c | 14 --
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/drivers/pci/host/pci-hyperv.c b/drivers/pci/host/pci-hyperv.c
index 7e9b2de..449d053 100644
--- a/drivers/pci/host/pci-hyperv.c
+++ b/drivers/pci/host/pci-hyperv.c
@@ -732,16 +732,18 @@ static void hv_msi_free(struct irq_domain *domain, struct 
msi_domain_info *info,
 
pdev = msi_desc_to_pci_dev(msi);
hbus = info->data;
-   hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
-   if (!hpdev)
+   int_desc = irq_data_get_irq_chip_data(irq_data);
+   if (!int_desc)
return;
 
-   int_desc = irq_data_get_irq_chip_data(irq_data);
-   if (int_desc) {
-   irq_data->chip_data = NULL;
-   hv_int_desc_free(hpdev, int_desc);
+   irq_data->chip_data = NULL;
+   hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
+   if (!hpdev) {
+   kfree(int_desc);
+   return;
}
 
+   hv_int_desc_free(hpdev, int_desc);
put_pcichild(hpdev, hv_pcidev_ref_by_slot);
 }
 
-- 
2.5.0

Re: [PATCH] KVM: SVM: Use a separate vmcb for the nested L2 guest

2020-10-08 Thread Cathy Avery


On 10/8/20 6:54 AM, Maxim Levitsky wrote:

On Thu, 2020-10-08 at 13:39 +0300, Maxim Levitsky wrote:

On Thu, 2020-10-08 at 13:23 +0300, Maxim Levitsky wrote:

On Thu, 2020-10-08 at 07:52 +0200, Paolo Bonzini wrote:

On 08/10/20 00:14, Maxim Levitsky wrote:

+   if (svm->vmcb01->control.asid == 0)
+   svm->vmcb01->control.asid = svm->nested.vmcb02->control.asid;

I think that the above should be done always. The asid field is currently host
controlled only (that is L2 value is ignored, selective ASID tlb flush is not
advertized to the guest and lnvlpga is emulated as invlpg).

Yes, in fact I suggested that ASID should be in svm->asid and moved to
svm->vmcb->asid in svm_vcpu_run.  Then there's no need to special case
it in nested code.

This makes lot of sense!

This should be a patch coming before this one.


1. Something wrong with memory types - like guest is using UC memory for 
everything.
 I can't completely rule that out yet

You can print g_pat and see if it is all zeroes.

I don't even need to print it. I know that it is never set anywhere, unless 
guest writes it,
but now that I look at it, we set it to a default value and there is no code to 
set it to
default value for vmcb02. This is it. now my fedora guest boots just fine!

I played a lot with g_pat, and yet this didn't occur to me . I was that close 
:-(
I knew that it has to be something with memory types, but it never occured to me
that guest just doesn't write IA32_PAT and uses our value which we set in 
init_vmcb



In general I think it's better to be explicit with vmcb01 vs. vmcb02,
like Cathy did, but I can see it's a matter of personal preference to
some extent.

I also think so in general, but in the code that is outside 'is_guest_mode'
IMHO it is better to refer to vmcb01 as vmcb, although now that I think of
it, its not wrong to do so either.


My windows hyper-v guest doesn't boot though and I know why.

As we know the vmcb save area has extra state which vmrun/vmexit don't touch.
Now suppose a nested hypervisor wants to enter a nested guest.

It will do vmload, which will load the extra state from the nested vmcb (vmcb12
or as I woudl say the vmcb that nested hypervisor thinks that it is using),
to the CPU. This can cause some vmexits I think, but this doesn't matter much.

Now the nested hypervisor does vmrun. The extra state of L2 guest is in CPU 
registers,
and it is untouched. We do vmsave on vmcb01 to preserve that state, but later
when we do vmload on vmcb02 prior to vmenter on it, which loads stale state 
from it.
The same issue happens the other way around on nested vmexit.

I fixed this by doing nested_svm_vmloadsave, but that should be probably be
optimized with dirty bits. Now though I guess the goal it to make
it work first.

With this fixed HyperV boots fine, and even passes the 'works' test of booting
the windows 10 with hyperv enabled nested itself and starting the vm inside,
which makes that VM L3 (in addition to windows itself that runs as L3 in 
relation to hyper-v)

https://i.imgur.com/sRYqtVV.png

In summary this is the diff of fixes (just pasted to email, probably mangled):


diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 0a06e62010d8c..7293ba23b3cbc 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -436,6 +436,9 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
 WARN_ON(svm->vmcb == svm->nested.vmcb02);
  
 svm->nested.vmcb02->control = svm->vmcb01->control;

+
+   nested_svm_vmloadsave(svm->vmcb01, svm->nested.vmcb02);
+
 svm->vmcb = svm->nested.vmcb02;
 svm->vmcb_pa = svm->nested.vmcb02_pa;
 load_nested_vmcb_control(svm, &nested_vmcb->control);
@@ -622,6 +625,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
 if (svm->vmcb01->control.asid == 0)
 svm->vmcb01->control.asid = svm->nested.vmcb02->control.asid;
  
+   nested_svm_vmloadsave(svm->nested.vmcb02, svm->vmcb01);

 svm->vmcb = svm->vmcb01;
 svm->vmcb_pa = svm->nested.vmcb01_pa;
  
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c

index b66239b26885d..ee9f87fe611f2 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1097,6 +1097,7 @@ static void init_vmcb(struct vcpu_svm *svm)
 clr_cr_intercept(svm, INTERCEPT_CR3_READ);
 clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
 save->g_pat = svm->vcpu.arch.pat;
+   svm->nested.vmcb02->save.g_pat = svm->vcpu.arch.pat;


I had noticed that g_pat was not set in vmcb02 and set it to vmcb01's 
value which was not helpful but I did not try the current vcpu value.


I am getting a #UD which I suspected had something to do with cr3 but 
I'll know more after I add your suggestions.


emu-system-x86-1647  [033]   3167.589402: kvm_nested_vmexit_inject: 
reason: UD excp ext_inf1: 0x ext_inf2: 
0x ext_int: 0x ext_int_err: 0x0

Re: [PATCH] KVM: SVM: Use a separate vmcb for the nested L2 guest

2020-10-08 Thread Cathy Avery


On 10/8/20 9:11 AM, Maxim Levitsky wrote:

On Thu, 2020-10-08 at 08:46 -0400, Cathy Avery wrote:

On 10/8/20 6:54 AM, Maxim Levitsky wrote:

On Thu, 2020-10-08 at 13:39 +0300, Maxim Levitsky wrote:

On Thu, 2020-10-08 at 13:23 +0300, Maxim Levitsky wrote:

On Thu, 2020-10-08 at 07:52 +0200, Paolo Bonzini wrote:

On 08/10/20 00:14, Maxim Levitsky wrote:

+   if (svm->vmcb01->control.asid == 0)
+   svm->vmcb01->control.asid = svm->nested.vmcb02->control.asid;

I think that the above should be done always. The asid field is currently host
controlled only (that is L2 value is ignored, selective ASID tlb flush is not
advertized to the guest and lnvlpga is emulated as invlpg).

Yes, in fact I suggested that ASID should be in svm->asid and moved to
svm->vmcb->asid in svm_vcpu_run.  Then there's no need to special case
it in nested code.

This makes lot of sense!

This should be a patch coming before this one.


1. Something wrong with memory types - like guest is using UC memory for 
everything.
  I can't completely rule that out yet

You can print g_pat and see if it is all zeroes.

I don't even need to print it. I know that it is never set anywhere, unless 
guest writes it,
but now that I look at it, we set it to a default value and there is no code to 
set it to
default value for vmcb02. This is it. now my fedora guest boots just fine!

I played a lot with g_pat, and yet this didn't occur to me . I was that close 
:-(
I knew that it has to be something with memory types, but it never occured to me
that guest just doesn't write IA32_PAT and uses our value which we set in 
init_vmcb



In general I think it's better to be explicit with vmcb01 vs. vmcb02,
like Cathy did, but I can see it's a matter of personal preference to
some extent.

I also think so in general, but in the code that is outside 'is_guest_mode'
IMHO it is better to refer to vmcb01 as vmcb, although now that I think of
it, its not wrong to do so either.


My windows hyper-v guest doesn't boot though and I know why.

As we know the vmcb save area has extra state which vmrun/vmexit don't touch.
Now suppose a nested hypervisor wants to enter a nested guest.

It will do vmload, which will load the extra state from the nested vmcb (vmcb12
or as I woudl say the vmcb that nested hypervisor thinks that it is using),
to the CPU. This can cause some vmexits I think, but this doesn't matter much.

Now the nested hypervisor does vmrun. The extra state of L2 guest is in CPU 
registers,
and it is untouched. We do vmsave on vmcb01 to preserve that state, but later
when we do vmload on vmcb02 prior to vmenter on it, which loads stale state 
from it.
The same issue happens the other way around on nested vmexit.

I fixed this by doing nested_svm_vmloadsave, but that should be probably be
optimized with dirty bits. Now though I guess the goal it to make
it work first.

With this fixed HyperV boots fine, and even passes the 'works' test of booting
the windows 10 with hyperv enabled nested itself and starting the vm inside,
which makes that VM L3 (in addition to windows itself that runs as L3 in 
relation to hyper-v)

https://i.imgur.com/sRYqtVV.png

In summary this is the diff of fixes (just pasted to email, probably mangled):


diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 0a06e62010d8c..7293ba23b3cbc 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -436,6 +436,9 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
  WARN_ON(svm->vmcb == svm->nested.vmcb02);
   
  svm->nested.vmcb02->control = svm->vmcb01->control;

+
+   nested_svm_vmloadsave(svm->vmcb01, svm->nested.vmcb02);
+
  svm->vmcb = svm->nested.vmcb02;
  svm->vmcb_pa = svm->nested.vmcb02_pa;
  load_nested_vmcb_control(svm, &nested_vmcb->control);
@@ -622,6 +625,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
  if (svm->vmcb01->control.asid == 0)
  svm->vmcb01->control.asid = svm->nested.vmcb02->control.asid;
   
+   nested_svm_vmloadsave(svm->nested.vmcb02, svm->vmcb01);

  svm->vmcb = svm->vmcb01;
  svm->vmcb_pa = svm->nested.vmcb01_pa;
   
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c

index b66239b26885d..ee9f87fe611f2 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1097,6 +1097,7 @@ static void init_vmcb(struct vcpu_svm *svm)
  clr_cr_intercept(svm, INTERCEPT_CR3_READ);
  clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
  save->g_pat = svm->vcpu.arch.pat;
+   svm->nested.vmcb02->save.g_pat = svm->vcpu.arch.pat;

I had noticed that g_pat was not set in vmcb02 and set it to vmcb01's
value which was not helpful but I did not try the current vcpu value.

Re: [PATCH] KVM: SVM: Use a separate vmcb for the nested L2 guest

2020-10-09 Thread Cathy Avery


On 10/8/20 6:23 AM, Maxim Levitsky wrote:

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 0a06e62010d8c..7293ba23b3cbc 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -436,6 +436,9 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
 WARN_ON(svm->vmcb == svm->nested.vmcb02);
  
 svm->nested.vmcb02->control = svm->vmcb01->control;

+
+   nested_svm_vmloadsave(svm->vmcb01, svm->nested.vmcb02);
+
 svm->vmcb = svm->nested.vmcb02;
 svm->vmcb_pa = svm->nested.vmcb02_pa;
 load_nested_vmcb_control(svm, &nested_vmcb->control);
@@ -622,6 +625,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
 if (svm->vmcb01->control.asid == 0)
 svm->vmcb01->control.asid = svm->nested.vmcb02->control.asid;
  
+   nested_svm_vmloadsave(svm->nested.vmcb02, svm->vmcb01);

 svm->vmcb = svm->vmcb01;
 svm->vmcb_pa = svm->nested.vmcb01_pa;
  
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c

index b66239b26885d..ee9f87fe611f2 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1097,6 +1097,7 @@ static void init_vmcb(struct vcpu_svm *svm)
 clr_cr_intercept(svm, INTERCEPT_CR3_READ);
 clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
 save->g_pat = svm->vcpu.arch.pat;
+   svm->nested.vmcb02->save.g_pat = svm->vcpu.arch.pat;
 save->cr3 = 0;
 save->cr4 = 0;
 }


OK this worked for me. Thanks!

[PATCH v2 1/2] KVM: SVM: Move asid to vcpu_svm

2020-10-11 Thread Cathy Avery

Move asid to svm->asid to allow for vmcb assignment
during svm_vcpu_run without regard to which level
guest is running.

Signed-off-by: Cathy Avery 
---
 arch/x86/kvm/svm/svm.c | 4 +++-
 arch/x86/kvm/svm/svm.h | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index d4e18bda19c7..619980a5d540 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1101,6 +1101,7 @@ static void init_vmcb(struct vcpu_svm *svm)
save->cr4 = 0;
}
svm->asid_generation = 0;
+   svm->asid = 0;
 
svm->nested.vmcb = 0;
svm->vcpu.arch.hflags = 0;
@@ -1663,7 +1664,7 @@ static void new_asid(struct vcpu_svm *svm, struct 
svm_cpu_data *sd)
}
 
svm->asid_generation = sd->asid_generation;
-   svm->vmcb->control.asid = sd->next_asid++;
+   svm->asid = sd->next_asid++;
 
vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
 }
@@ -3446,6 +3447,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu 
*vcpu)
 
sync_lapic_to_cr8(vcpu);
 
+   svm->vmcb->control.asid = svm->asid;
svm->vmcb->save.cr2 = vcpu->arch.cr2;
 
/*
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index a798e1731709..862f0d2405e8 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -104,6 +104,7 @@ struct vcpu_svm {
struct vmcb *vmcb;
unsigned long vmcb_pa;
struct svm_cpu_data *svm_data;
+   u32 asid;
uint64_t asid_generation;
uint64_t sysenter_esp;
uint64_t sysenter_eip;
-- 
2.20.1

[PATCH v2 0/2] KVM: SVM: Create separate vmcbs for L1 and L2

2020-10-11 Thread Cathy Avery

svm->vmcb will now point to either a separate vmcb L1 ( not nested ) or L2 vmcb 
( nested ).

Changes:
v1 -> v2
 - Removed unnecessary update check of L1 save.cr3 during nested_svm_vmexit.
 - Moved vmcb01_pa to svm.
 - Removed get_host_vmcb() function.
 - Updated vmsave/vmload corresponding vmcb state during L2
   enter and exit which fixed the L2 load issue.
 - Moved asid workaround to a new patch which adds asid to svm.
 - Init previously uninitialized L2 vmcb save.gpat and save.cr4

Tested:
kvm-unit-tests
kvm self tests
Loaded fedora nested guest on fedora

Cathy Avery (2):
  KVM: SVM: Move asid to vcpu_svm
  KVM: SVM: Use a separate vmcb for the nested L2 guest

 arch/x86/kvm/svm/nested.c | 117 +-
 arch/x86/kvm/svm/svm.c|  46 ---
 arch/x86/kvm/svm/svm.h|  50 +---
 3 files changed, 93 insertions(+), 120 deletions(-)

-- 
2.20.1

[PATCH v2 2/2] KVM: SVM: Use a separate vmcb for the nested L2 guest

2020-10-11 Thread Cathy Avery

svm->vmcb will now point to either a separate vmcb L1 ( not nested ) or L2 vmcb 
( nested ).

Issues:

1) There is some wholesale copying of vmcb.save and vmcb.contol
   areas which will need to be refined.

Tested:
kvm-unit-tests
kvm self tests
Loaded fedora nested guest on fedora

Signed-off-by: Cathy Avery 
---
 arch/x86/kvm/svm/nested.c | 117 +-
 arch/x86/kvm/svm/svm.c|  42 +++---
 arch/x86/kvm/svm/svm.h|  49 +---
 3 files changed, 89 insertions(+), 119 deletions(-)

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index e90bc436f584..bcce92f0a90c 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -75,12 +75,12 @@ static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu 
*vcpu)
 static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
 {
struct vcpu_svm *svm = to_svm(vcpu);
-   struct vmcb *hsave = svm->nested.hsave;
 
WARN_ON(mmu_is_nested(vcpu));
 
vcpu->arch.mmu = &vcpu->arch.guest_mmu;
-   kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, hsave->save.cr4, 
hsave->save.efer,
+   kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, svm->vmcb01->save.cr4,
+   svm->vmcb01->save.efer,
svm->nested.ctl.nested_cr3);
vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3;
vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr;
@@ -105,7 +105,7 @@ void recalc_intercepts(struct vcpu_svm *svm)
return;
 
c = &svm->vmcb->control;
-   h = &svm->nested.hsave->control;
+   h = &svm->vmcb01->control;
g = &svm->nested.ctl;
 
svm->nested.host_intercept_exceptions = h->intercept_exceptions;
@@ -403,7 +403,7 @@ static void nested_prepare_vmcb_control(struct vcpu_svm 
*svm)
 
svm->vmcb->control.int_ctl =
(svm->nested.ctl.int_ctl & ~mask) |
-   (svm->nested.hsave->control.int_ctl & mask);
+   (svm->vmcb01->control.int_ctl & mask);
 
svm->vmcb->control.virt_ext= svm->nested.ctl.virt_ext;
svm->vmcb->control.int_vector  = svm->nested.ctl.int_vector;
@@ -432,6 +432,16 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 
vmcb_gpa,
int ret;
 
svm->nested.vmcb = vmcb_gpa;
+
+   WARN_ON(svm->vmcb == svm->nested.vmcb02);
+
+   svm->nested.vmcb02->control = svm->vmcb01->control;
+   svm->nested.vmcb02->save.cr4 = svm->vmcb01->save.cr4;
+
+   nested_svm_vmloadsave(svm->vmcb01, svm->nested.vmcb02);
+
+   svm->vmcb = svm->nested.vmcb02;
+   svm->vmcb_pa = svm->nested.vmcb02_pa;
load_nested_vmcb_control(svm, &nested_vmcb->control);
nested_prepare_vmcb_save(svm, nested_vmcb);
nested_prepare_vmcb_control(svm);
@@ -450,8 +460,6 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
 {
int ret;
struct vmcb *nested_vmcb;
-   struct vmcb *hsave = svm->nested.hsave;
-   struct vmcb *vmcb = svm->vmcb;
struct kvm_host_map map;
u64 vmcb_gpa;
 
@@ -496,29 +504,14 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
kvm_clear_exception_queue(&svm->vcpu);
kvm_clear_interrupt_queue(&svm->vcpu);
 
-   /*
-* Save the old vmcb, so we don't need to pick what we save, but can
-* restore everything when a VMEXIT occurs
-*/
-   hsave->save.es = vmcb->save.es;
-   hsave->save.cs = vmcb->save.cs;
-   hsave->save.ss = vmcb->save.ss;
-   hsave->save.ds = vmcb->save.ds;
-   hsave->save.gdtr   = vmcb->save.gdtr;
-   hsave->save.idtr   = vmcb->save.idtr;
-   hsave->save.efer   = svm->vcpu.arch.efer;
-   hsave->save.cr0= kvm_read_cr0(&svm->vcpu);
-   hsave->save.cr4= svm->vcpu.arch.cr4;
-   hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
-   hsave->save.rip= kvm_rip_read(&svm->vcpu);
-   hsave->save.rsp= vmcb->save.rsp;
-   hsave->save.rax= vmcb->save.rax;
-   if (npt_enabled)
-   hsave->save.cr3= vmcb->save.cr3;
-   else
-   hsave->save.cr3= kvm_read_cr3(&svm->vcpu);
-
-   copy_vmcb_control_area(&hsave->control, &vmcb->control);
+   svm->vmcb01->save.efer   = svm->vcpu.arch.efer;
+   svm->vmcb01->save.cr0= kvm_read_cr0(&svm->vcpu);
+   svm->vmcb01->save.cr4= svm->vcpu.arch.cr4;
+   svm->vmcb01->save.rflags = kvm_get_rflags(&svm->vcpu);
+   svm->vmcb01->save.rip= kvm_rip_read(&svm->vcpu);
+
+   if (!npt_enabled)

[PATCH kvm-unit-tests] svm: Test V_IRQ injection

2020-05-09 Thread Cathy Avery

Test V_IRQ injection from L1 to L2 with V_TPR less
than or greater than V_INTR_PRIO. Also test VINTR
intercept with differing V_TPR and V_INTR_PRIO.

Signed-off-by: Cathy Avery 
---
 x86/svm_tests.c | 150 
 1 file changed, 150 insertions(+)

diff --git a/x86/svm_tests.c b/x86/svm_tests.c
index 65008ba..aa6f3c2 100644
--- a/x86/svm_tests.c
+++ b/x86/svm_tests.c
@@ -1595,6 +1595,153 @@ static bool exc_inject_check(struct svm_test *test)
 return count_exc == 1 && get_test_stage(test) == 3;
 }
 
+static volatile bool virq_fired;
+
+static void virq_isr(isr_regs_t *regs)
+{
+virq_fired = true;
+}
+
+static void virq_inject_prepare(struct svm_test *test)
+{
+handle_irq(0xf1, virq_isr);
+default_prepare(test);
+vmcb->control.int_ctl = V_INTR_MASKING_MASK | V_IRQ_MASK |
+(0x0f << V_INTR_PRIO_SHIFT); // Set to the highest 
priority
+vmcb->control.int_vector = 0xf1;
+virq_fired = false;
+set_test_stage(test, 0);
+}
+
+static void virq_inject_test(struct svm_test *test)
+{
+if (virq_fired) {
+report(false, "virtual interrupt fired before L2 sti");
+set_test_stage(test, -1);
+vmmcall();
+}
+
+irq_enable();
+asm volatile ("nop");
+irq_disable();
+
+if (!virq_fired) {
+report(false, "virtual interrupt not fired after L2 sti");
+set_test_stage(test, -1);
+}
+
+vmmcall();
+
+if (virq_fired) {
+report(false, "virtual interrupt fired before L2 sti after VINTR 
intercept");
+set_test_stage(test, -1);
+vmmcall();
+}
+
+irq_enable();
+asm volatile ("nop");
+irq_disable();
+
+if (!virq_fired) {
+report(false, "virtual interrupt not fired after return from VINTR 
intercept");
+set_test_stage(test, -1);
+}
+
+vmmcall();
+
+irq_enable();
+asm volatile ("nop");
+irq_disable();
+
+if (virq_fired) {
+report(false, "virtual interrupt fired when V_IRQ_PRIO less than 
V_TPR");
+set_test_stage(test, -1);
+}
+
+vmmcall();
+vmmcall();
+}
+
+static bool virq_inject_finished(struct svm_test *test)
+{
+vmcb->save.rip += 3;
+
+switch (get_test_stage(test)) {
+case 0:
+if (vmcb->control.exit_code != SVM_EXIT_VMMCALL) {
+report(false, "VMEXIT not due to vmmcall. Exit reason 0x%x",
+   vmcb->control.exit_code);
+return true;
+}
+if (vmcb->control.int_ctl & V_IRQ_MASK) {
+report(false, "V_IRQ not cleared on VMEXIT after firing");
+return true;
+}
+virq_fired = false;
+vmcb->control.intercept |= (1ULL << INTERCEPT_VINTR);
+vmcb->control.int_ctl = V_INTR_MASKING_MASK | V_IRQ_MASK |
+(0x0f << V_INTR_PRIO_SHIFT);
+break;
+
+case 1:
+if (vmcb->control.exit_code != SVM_EXIT_VINTR) {
+report(false, "VMEXIT not due to vintr. Exit reason 0x%x",
+   vmcb->control.exit_code);
+return true;
+}
+if (virq_fired) {
+report(false, "V_IRQ fired before SVM_EXIT_VINTR");
+return true;
+}
+vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR);
+break;
+
+case 2:
+if (vmcb->control.exit_code != SVM_EXIT_VMMCALL) {
+report(false, "VMEXIT not due to vmmcall. Exit reason 0x%x",
+   vmcb->control.exit_code);
+return true;
+}
+virq_fired = false;
+// Set irq to lower priority
+vmcb->control.int_ctl = V_INTR_MASKING_MASK | V_IRQ_MASK |
+(0x08 << V_INTR_PRIO_SHIFT);
+// Raise guest TPR
+vmcb->control.int_ctl |= 0x0a & V_TPR_MASK;
+break;
+
+case 3:
+if (vmcb->control.exit_code != SVM_EXIT_VMMCALL) {
+report(false, "VMEXIT not due to vmmcall. Exit reason 0x%x",
+   vmcb->control.exit_code);
+return true;
+}
+vmcb->control.intercept |= (1ULL << INTERCEPT_VINTR);
+break;
+
+case 4:
+// INTERCEPT_VINTR should be ignored because V_INTR_PRIO < V_TPR
+if (vmcb->control.exit_code != SVM_EXIT_VMMCALL) {
+report(false, "VMEXIT not due to vmmcall. Exit reason 0x%x",
+   vmcb->control.exit_code);
+return true;
+}
+break;
+
+default:
+return true;
+}
+
+inc_test_stage(test);
+
+return get_test_stage(test) == 5;
+}
+
+static bool virq_inject_check(struct svm_test *test)
+{
+return get_test_stage(test) == 5;
+}
+
 #define TEST(name) { #name,

[PATCH kvm-unit-tests 2/2] svm: INIT intercept test

2020-06-08 Thread Cathy Avery

INIT vcpu 2 and intercept the INIT. This test
will leave the vcpu in an unusable state.

Signed-off-by: Cathy Avery 
---
 x86/svm_tests.c | 40 
 1 file changed, 40 insertions(+)

diff --git a/x86/svm_tests.c b/x86/svm_tests.c
index c1abd55..a4dbe91 100644
--- a/x86/svm_tests.c
+++ b/x86/svm_tests.c
@@ -1789,6 +1789,43 @@ static bool virq_inject_check(struct svm_test *test)
 return get_test_stage(test) == 5;
 }
 
+static volatile bool init_intercept;
+
+static void init_signal_intercept_prepare(struct svm_test *test)
+{
+
+vmcb_ident(vmcb);
+vmcb->control.intercept |= (1ULL << INTERCEPT_INIT);
+init_intercept = false;
+}
+
+static void init_signal_test(struct svm_test *test)
+{
+apic_icr_write(APIC_DEST_SELF | APIC_DEST_PHYSICAL | APIC_DM_INIT | 
APIC_INT_ASSERT, 0);
+}
+
+static bool init_signal_finished(struct svm_test *test)
+{
+vmcb->save.rip += 3;
+
+if (vmcb->control.exit_code != SVM_EXIT_INIT) {
+report(false, "VMEXIT not due to init intercept. Exit reason 0x%x",
+   vmcb->control.exit_code);
+return true;
+}
+
+init_intercept = true;
+
+report(true, "INIT to vcpu intercepted");
+
+return true;
+}
+
+static bool init_signal_check(struct svm_test *test)
+{
+return init_intercept;
+}
+
 #define TEST(name) { #name, .v2 = name }
 
 /*
@@ -1950,6 +1987,9 @@ struct svm_test svm_tests[] = {
 { "virq_inject", default_supported, virq_inject_prepare,
   default_prepare_gif_clear, virq_inject_test,
   virq_inject_finished, virq_inject_check },
+{ "svm_init_signal_intercept_test", default_supported, 
init_signal_intercept_prepare,
+  default_prepare_gif_clear, init_signal_test,
+  init_signal_finished, init_signal_check, .on_vcpu = 2 },
 TEST(svm_guest_state_test),
 { NULL, NULL, NULL, NULL, NULL, NULL, NULL }
 };
-- 
2.20.1

[PATCH kvm-unit-tests 1/2] svm: Add ability to execute test via test_run on a vcpu other than vcpu 0

2020-06-08 Thread Cathy Avery

When running tests that can result in a vcpu being left in an
indeterminate state it is useful to be able to run the test on
a vcpu other than 0. This patch allows test_run to be executed
on any vcpu indicated by the on_vcpu member of the svm_test struct.
The initialized state of the vcpu0 registers used to populate the
vmcb is carried forward to the other vcpus.

Signed-off-by: Cathy Avery 
---
 x86/svm.c | 49 -
 x86/svm.h | 13 +
 2 files changed, 61 insertions(+), 1 deletion(-)

diff --git a/x86/svm.c b/x86/svm.c
index 41685bf..9f7ae7e 100644
--- a/x86/svm.c
+++ b/x86/svm.c
@@ -367,6 +367,45 @@ test_wanted(const char *name, char *filters[], int 
filter_count)
 }
 }
 
+static void set_additional_vpcu_regs(struct extra_vcpu_info *info)
+{
+wrmsr(MSR_VM_HSAVE_PA, info->hsave);
+wrmsr(MSR_EFER, rdmsr(MSR_EFER) | EFER_SVME);
+wrmsr(MSR_EFER, rdmsr(MSR_EFER) | EFER_NX);
+write_cr3(info->cr3);
+write_cr4(info->cr4);
+write_cr0(info->cr0);
+write_dr6(info->dr6);
+write_dr7(info->dr7);
+write_cr2(info->cr2);
+wrmsr(MSR_IA32_CR_PAT, info->g_pat);
+wrmsr(MSR_IA32_DEBUGCTLMSR, info->dbgctl);
+}
+
+static void get_additional_vcpu_regs(struct extra_vcpu_info *info)
+{
+info->hsave = rdmsr(MSR_VM_HSAVE_PA);
+info->cr3 = read_cr3();
+info->cr4 = read_cr4();
+info->cr0 = read_cr0();
+info->dr7 = read_dr7();
+info->dr6 = read_dr6();
+info->cr2 = read_cr2();
+info->g_pat = rdmsr(MSR_IA32_CR_PAT);
+info->dbgctl = rdmsr(MSR_IA32_DEBUGCTLMSR);
+}
+
+static void init_additional_vcpu_regs(void)
+{
+int i;
+struct extra_vcpu_info info;
+
+get_additional_vcpu_regs(&info);
+
+for (i = 1; i < cpu_count(); i++)
+on_cpu(i, (void *)set_additional_vpcu_regs, &info);
+}
+
 int main(int ac, char **av)
 {
int i = 0;
@@ -384,6 +423,8 @@ int main(int ac, char **av)
 
setup_svm();
 
+init_additional_vcpu_regs();
+
vmcb = alloc_page();
 
for (; svm_tests[i].name != NULL; i++) {
@@ -392,7 +433,13 @@ int main(int ac, char **av)
if (svm_tests[i].supported && !svm_tests[i].supported())
continue;
if (svm_tests[i].v2 == NULL) {
-   test_run(&svm_tests[i]);
+   if (svm_tests[i].on_vcpu) {
+   if (cpu_count() <= svm_tests[i].on_vcpu)
+   continue;
+   on_cpu(svm_tests[i].on_vcpu, (void *)test_run, 
&svm_tests[i]);
+   }
+   else
+   test_run(&svm_tests[i]);
} else {
vmcb_ident(vmcb);
v2_test = &(svm_tests[i]);
diff --git a/x86/svm.h b/x86/svm.h
index 645deb7..d023a32 100644
--- a/x86/svm.h
+++ b/x86/svm.h
@@ -338,6 +338,7 @@ struct svm_test {
ulong scratch;
/* Alternative test interface. */
void (*v2)(void);
+   int on_vcpu;
 };
 
 struct regs {
@@ -360,6 +361,18 @@ struct regs {
u64 rflags;
 };
 
+struct extra_vcpu_info {
+   u64 hsave;
+   u64 cr3;
+   u64 cr4;
+   u64 cr0;
+   u64 dr7;
+   u64 dr6;
+   u64 cr2;
+   u64 g_pat;
+   u64 dbgctl;
+};
+
 typedef void (*test_guest_func)(struct svm_test *);
 
 u64 *npt_get_pte(u64 address);
-- 
2.20.1

[PATCH kvm-unit-tests 0/2] svm: INIT test and test_run on selected vcpu

2020-06-08 Thread Cathy Avery

INIT intercept test and the ability to execute test_run
on a selected vcpu.

Cathy Avery (2):
  svm: Add ability to execute test via test_run on a vcpu other than
vcpu 0
  svm: INIT intercept test

 x86/svm.c   | 49 -
 x86/svm.h   | 13 +
 x86/svm_tests.c | 40 
 3 files changed, 101 insertions(+), 1 deletion(-)

-- 
2.20.1

[PATCH v3 1/2] KVM: SVM: Track asid from vcpu_svm

2020-10-26 Thread Cathy Avery

Track asid from svm->asid to allow for vmcb assignment
without regard to which level guest is running.

Suggested-by: Paolo Bonzini 
Signed-off-by: Cathy Avery 
---
 arch/x86/kvm/svm/svm.c | 16 ++--
 arch/x86/kvm/svm/svm.h |  2 ++
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index d4e18bda19c7..83b4f56883f8 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1101,6 +1101,7 @@ static void init_vmcb(struct vcpu_svm *svm)
save->cr4 = 0;
}
svm->asid_generation = 0;
+   svm->asid = 0;
 
svm->nested.vmcb = 0;
svm->vcpu.arch.hflags = 0;
@@ -1659,11 +1660,11 @@ static void new_asid(struct vcpu_svm *svm, struct 
svm_cpu_data *sd)
if (sd->next_asid > sd->max_asid) {
++sd->asid_generation;
sd->next_asid = sd->min_asid;
-   svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
+   sd->flush_all_asid = true;
}
 
svm->asid_generation = sd->asid_generation;
-   svm->vmcb->control.asid = sd->next_asid++;
+   svm->asid = sd->next_asid++;
 
vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
 }
@@ -3030,6 +3031,17 @@ static void pre_svm_run(struct vcpu_svm *svm)
/* FIXME: handle wraparound of asid_generation */
if (svm->asid_generation != sd->asid_generation)
new_asid(svm, sd);
+
+   if (sd->flush_all_asid) {
+   svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
+   sd->flush_all_asid = false;
+   vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
+   }
+
+   if (unlikely(svm->asid != svm->vmcb->control.asid))
+   vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
+
+   svm->vmcb->control.asid = svm->asid;
 }
 
 static void svm_inject_nmi(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index a798e1731709..22832362bced 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -104,6 +104,7 @@ struct vcpu_svm {
struct vmcb *vmcb;
unsigned long vmcb_pa;
struct svm_cpu_data *svm_data;
+   u32 asid;
uint64_t asid_generation;
uint64_t sysenter_esp;
uint64_t sysenter_eip;
@@ -164,6 +165,7 @@ struct svm_cpu_data {
int cpu;
 
u64 asid_generation;
+   bool flush_all_asid;
u32 max_asid;
u32 next_asid;
u32 min_asid;
-- 
2.20.1

[PATCH v3 2/2] KVM: SVM: Use a separate vmcb for the nested L2 guest

2020-10-26 Thread Cathy Avery

svm->vmcb will now point to either a separate vmcb L1 ( not nested ) or L2 vmcb 
( nested ).

Issues:

1) There is some wholesale copying of vmcb.save and vmcb.contol
   areas which will need to be refined.

Tested:
kvm-unit-tests
kvm self tests
Loaded fedora nested guest on fedora

Signed-off-by: Cathy Avery 
---
 arch/x86/kvm/svm/nested.c | 125 ++
 arch/x86/kvm/svm/svm.c|  42 +++--
 arch/x86/kvm/svm/svm.h|  49 ---
 3 files changed, 94 insertions(+), 122 deletions(-)

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index e90bc436f584..3acab2bf99a5 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -75,12 +75,12 @@ static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu 
*vcpu)
 static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
 {
struct vcpu_svm *svm = to_svm(vcpu);
-   struct vmcb *hsave = svm->nested.hsave;
 
WARN_ON(mmu_is_nested(vcpu));
 
vcpu->arch.mmu = &vcpu->arch.guest_mmu;
-   kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, hsave->save.cr4, 
hsave->save.efer,
+   kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, svm->vmcb01->save.cr4,
+   svm->vmcb01->save.efer,
svm->nested.ctl.nested_cr3);
vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3;
vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr;
@@ -105,7 +105,7 @@ void recalc_intercepts(struct vcpu_svm *svm)
return;
 
c = &svm->vmcb->control;
-   h = &svm->nested.hsave->control;
+   h = &svm->vmcb01->control;
g = &svm->nested.ctl;
 
svm->nested.host_intercept_exceptions = h->intercept_exceptions;
@@ -403,7 +403,7 @@ static void nested_prepare_vmcb_control(struct vcpu_svm 
*svm)
 
svm->vmcb->control.int_ctl =
(svm->nested.ctl.int_ctl & ~mask) |
-   (svm->nested.hsave->control.int_ctl & mask);
+   (svm->vmcb01->control.int_ctl & mask);
 
svm->vmcb->control.virt_ext= svm->nested.ctl.virt_ext;
svm->vmcb->control.int_vector  = svm->nested.ctl.int_vector;
@@ -426,12 +426,29 @@ static void nested_prepare_vmcb_control(struct vcpu_svm 
*svm)
vmcb_mark_all_dirty(svm->vmcb);
 }
 
+static void svm_switch_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb, unsigned 
long vmcb_pa)
+{
+   if (svm->vmcb == vmcb)
+   return;
+
+   svm->vmcb = vmcb;
+   svm->vmcb_pa = vmcb_pa;
+}
+
 int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
  struct vmcb *nested_vmcb)
 {
int ret;
 
svm->nested.vmcb = vmcb_gpa;
+
+   WARN_ON(svm->vmcb == svm->nested.vmcb02);
+
+   svm->nested.vmcb02->control = svm->vmcb01->control;
+   svm->nested.vmcb02->save.cr4 = svm->vmcb01->save.cr4;
+
+   nested_svm_vmloadsave(svm->vmcb01, svm->nested.vmcb02);
+   svm_switch_vmcb(svm, svm->nested.vmcb02, svm->nested.vmcb02_pa);
load_nested_vmcb_control(svm, &nested_vmcb->control);
nested_prepare_vmcb_save(svm, nested_vmcb);
nested_prepare_vmcb_control(svm);
@@ -450,8 +467,6 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
 {
int ret;
struct vmcb *nested_vmcb;
-   struct vmcb *hsave = svm->nested.hsave;
-   struct vmcb *vmcb = svm->vmcb;
struct kvm_host_map map;
u64 vmcb_gpa;
 
@@ -496,29 +511,14 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
kvm_clear_exception_queue(&svm->vcpu);
kvm_clear_interrupt_queue(&svm->vcpu);
 
-   /*
-* Save the old vmcb, so we don't need to pick what we save, but can
-* restore everything when a VMEXIT occurs
-*/
-   hsave->save.es = vmcb->save.es;
-   hsave->save.cs = vmcb->save.cs;
-   hsave->save.ss = vmcb->save.ss;
-   hsave->save.ds = vmcb->save.ds;
-   hsave->save.gdtr   = vmcb->save.gdtr;
-   hsave->save.idtr   = vmcb->save.idtr;
-   hsave->save.efer   = svm->vcpu.arch.efer;
-   hsave->save.cr0= kvm_read_cr0(&svm->vcpu);
-   hsave->save.cr4= svm->vcpu.arch.cr4;
-   hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
-   hsave->save.rip= kvm_rip_read(&svm->vcpu);
-   hsave->save.rsp= vmcb->save.rsp;
-   hsave->save.rax= vmcb->save.rax;
-   if (npt_enabled)
-   hsave->save.cr3= vmcb->save.cr3;
-   else
-   hsave->save.cr3= kvm_read_cr3(&svm->vcpu);
-
-   copy_vmcb_control_area(&hsave->control, &vmcb->control);
+   s

[PATCH v3 0/2] KVM: SVM: Create separate vmcbs for L1 and L2

2020-10-26 Thread Cathy Avery

svm->vmcb will now point to either a separate vmcb L1 ( not nested ) or L2 vmcb 
( nested ).

Changes:
v2 -> v3
 - Added vmcb switching helper.
 - svm_set_nested_state always forces to L1 before determining state
   to set. This is more like vmx and covers any potential L2 to L2 nested state 
switch.
 - Moved svm->asid tracking to pre_svm_run and added ASID set dirty bit
   checking.

v1 -> v2
 - Removed unnecessary update check of L1 save.cr3 during nested_svm_vmexit.
 - Moved vmcb01_pa to svm.
 - Removed get_host_vmcb() function.
 - Updated vmsave/vmload corresponding vmcb state during L2
   enter and exit which fixed the L2 load issue.
 - Moved asid workaround to a new patch which adds asid to svm.
 - Init previously uninitialized L2 vmcb save.gpat and save.cr4

Tested:
kvm-unit-tests
kvm self tests
Loaded fedora nested guest on fedora

Cathy Avery (2):
  KVM: SVM: Track asid from vcpu_svm
  KVM: SVM: Use a separate vmcb for the nested L2 guest

 arch/x86/kvm/svm/nested.c | 125 ++
 arch/x86/kvm/svm/svm.c|  58 +++---
 arch/x86/kvm/svm/svm.h|  51 +---
 3 files changed, 110 insertions(+), 124 deletions(-)

-- 
2.20.1

Re: [PATCH v3 0/2] KVM: SVM: Create separate vmcbs for L1 and L2

2020-11-12 Thread Cathy Avery

That would be the master branch of 
git://git.kernel.org/pub/scm/virt/kvm/kvm.git where the last commit was 
969df928fee43b4219646a57c7beaf2c0635


I was originally working off of the queue branch but there were issues 
with the prior commits passing the various tests.


Cathy

On 11/11/20 4:35 PM, Babu Moger wrote:

Hi Cathy,
I was going to test these patches. But it did not apply on my tree.
Tried on kvm(https://git.kernel.org/pub/scm/virt/kvm/kvm.git) and
Mainline
(https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git). What
is your base tree?
thanks
Babu


-Original Message-
From: Cathy Avery 
Sent: Monday, October 26, 2020 12:42 PM
To: linux-kernel@vger.kernel.org; k...@vger.kernel.org; pbonz...@redhat.com
Cc: vkuzn...@redhat.com; Huang2, Wei ;
mlevi...@redhat.com; sean.j.christopher...@intel.com
Subject: [PATCH v3 0/2] KVM: SVM: Create separate vmcbs for L1 and L2

svm->vmcb will now point to either a separate vmcb L1 ( not nested ) or L2 vmcb
( nested ).

Changes:
v2 -> v3
  - Added vmcb switching helper.
  - svm_set_nested_state always forces to L1 before determining state
to set. This is more like vmx and covers any potential L2 to L2 nested state
switch.
  - Moved svm->asid tracking to pre_svm_run and added ASID set dirty bit
checking.

v1 -> v2
  - Removed unnecessary update check of L1 save.cr3 during nested_svm_vmexit.
  - Moved vmcb01_pa to svm.
  - Removed get_host_vmcb() function.
  - Updated vmsave/vmload corresponding vmcb state during L2
enter and exit which fixed the L2 load issue.
  - Moved asid workaround to a new patch which adds asid to svm.
  - Init previously uninitialized L2 vmcb save.gpat and save.cr4

Tested:
kvm-unit-tests
kvm self tests
Loaded fedora nested guest on fedora

Cathy Avery (2):
   KVM: SVM: Track asid from vcpu_svm
   KVM: SVM: Use a separate vmcb for the nested L2 guest

  arch/x86/kvm/svm/nested.c | 125 ++
  arch/x86/kvm/svm/svm.c|  58 +++---
  arch/x86/kvm/svm/svm.h|  51 +---
  3 files changed, 110 insertions(+), 124 deletions(-)

--
2.20.1

[PATCH kvm-unit-tests v2 3/3] svm: INIT intercept test

2020-07-17 Thread Cathy Avery

INIT vcpu 2 and intercept the INIT. This test
will leave the vcpu in an unusable state.

Signed-off-by: Cathy Avery 
---
 x86/svm_tests.c | 40 
 1 file changed, 40 insertions(+)

diff --git a/x86/svm_tests.c b/x86/svm_tests.c
index 698eb20..b43c19f 100644
--- a/x86/svm_tests.c
+++ b/x86/svm_tests.c
@@ -1939,6 +1939,43 @@ static bool init_startup_check(struct svm_test *test)
 return cpu_online_count == orig_cpu_count;
 }
 
+static volatile bool init_intercept;
+
+static void init_intercept_prepare(struct svm_test *test)
+{
+init_intercept = false;
+vmcb_ident(vmcb);
+vmcb->control.intercept |= (1ULL << INTERCEPT_INIT);
+}
+
+static void init_intercept_test(struct svm_test *test)
+{
+apic_icr_write(APIC_DEST_SELF | APIC_DEST_PHYSICAL | APIC_DM_INIT | 
APIC_INT_ASSERT, 0);
+}
+
+static bool init_intercept_finished(struct svm_test *test)
+{
+vmcb->save.rip += 3;
+
+if (vmcb->control.exit_code != SVM_EXIT_INIT) {
+report(false, "VMEXIT not due to init intercept. Exit reason 0x%x",
+   vmcb->control.exit_code);
+
+return true;
+}
+
+init_intercept = true;
+
+report(true, "INIT to vcpu intercepted");
+
+return true;
+}
+
+static bool init_intercept_check(struct svm_test *test)
+{
+return init_intercept;
+}
+
 #define TEST(name) { #name, .v2 = name }
 
 /*
@@ -2255,6 +2292,9 @@ struct svm_test svm_tests[] = {
 { "svm_init_startup_test", smp_supported, init_startup_prepare,
   default_prepare_gif_clear, null_test,
   init_startup_finished, init_startup_check },
+{ "svm_init_intercept_test", smp_supported, init_intercept_prepare,
+  default_prepare_gif_clear, init_intercept_test,
+  init_intercept_finished, init_intercept_check, .on_vcpu = 2 },
 TEST(svm_guest_state_test),
 { NULL, NULL, NULL, NULL, NULL, NULL, NULL }
 };
-- 
2.20.1

[PATCH kvm-unit-tests v2 1/3] svm: Add ability to execute test via test_run on a vcpu other than vcpu 0

2020-07-17 Thread Cathy Avery

When running tests that can result in a vcpu being left in an
indeterminate state it is useful to be able to run the test on
a vcpu other than 0. This patch allows test_run to be executed
on any vcpu indicated by the on_vcpu member of the svm_test struct.
The initialized state of the vcpu0 registers used to populate the
vmcb is carried forward to the other vcpus.

Signed-off-by: Cathy Avery 
---
 lib/x86/vm.c | 18 ++
 lib/x86/vm.h |  7 +++
 x86/svm.c| 24 +++-
 x86/svm.h|  2 ++
 4 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/lib/x86/vm.c b/lib/x86/vm.c
index 41d6d96..e223bb4 100644
--- a/lib/x86/vm.c
+++ b/lib/x86/vm.c
@@ -2,6 +2,7 @@
 #include "libcflat.h"
 #include "vmalloc.h"
 #include "alloc_page.h"
+#include "smp.h"
 
 pteval_t *install_pte(pgd_t *cr3,
  int pte_level,
@@ -139,9 +140,18 @@ static void setup_mmu_range(pgd_t *cr3, phys_addr_t start, 
size_t len)
install_pages(cr3, phys, max - phys, (void *)(ulong)phys);
 }
 
+static void set_additional_vcpu_vmregs(struct vm_vcpu_info *info)
+{
+   write_cr3(info->cr3);
+   write_cr4(info->cr4);
+   write_cr0(info->cr0);
+}
+
 void *setup_mmu(phys_addr_t end_of_memory)
 {
 pgd_t *cr3 = alloc_page();
+struct vm_vcpu_info info;
+int i;
 
 memset(cr3, 0, PAGE_SIZE);
 
@@ -166,6 +176,14 @@ void *setup_mmu(phys_addr_t end_of_memory)
 printf("cr0 = %lx\n", read_cr0());
 printf("cr3 = %lx\n", read_cr3());
 printf("cr4 = %lx\n", read_cr4());
+
+info.cr3 = read_cr3();
+info.cr4 = read_cr4();
+info.cr0 = read_cr0();
+
+for (i = 1; i < cpu_count(); i++)
+on_cpu(i, (void *)set_additional_vcpu_vmregs, &info);
+
 return cr3;
 }
 
diff --git a/lib/x86/vm.h b/lib/x86/vm.h
index 8750a1e..3a1432f 100644
--- a/lib/x86/vm.h
+++ b/lib/x86/vm.h
@@ -45,4 +45,11 @@ static inline void *current_page_table(void)
 
 void split_large_page(unsigned long *ptep, int level);
 void force_4k_page(void *addr);
+
+struct vm_vcpu_info {
+u64 cr3;
+u64 cr4;
+u64 cr0;
+};
+
 #endif
diff --git a/x86/svm.c b/x86/svm.c
index d8c8272..975c477 100644
--- a/x86/svm.c
+++ b/x86/svm.c
@@ -275,6 +275,17 @@ static void test_run(struct svm_test *test)
irq_enable();
 
report(test->succeeded(test), "%s", test->name);
+
+if (test->on_vcpu)
+   test->on_vcpu_done = true;
+}
+
+static void set_additional_vpcu_msr(void *msr_efer)
+{
+   void *hsave = alloc_page();
+
+   wrmsr(MSR_VM_HSAVE_PA, virt_to_phys(hsave));
+   wrmsr(MSR_EFER, (ulong)msr_efer | EFER_SVME | EFER_NX);
 }
 
 static void setup_svm(void)
@@ -294,6 +305,9 @@ static void setup_svm(void)
if (!npt_supported())
return;
 
+   for (i = 1; i < cpu_count(); i++)
+   on_cpu(i, (void *)set_additional_vpcu_msr, (void 
*)rdmsr(MSR_EFER));
+
printf("NPT detected - running all tests with NPT enabled\n");
 
/*
@@ -396,7 +410,15 @@ int main(int ac, char **av)
if (svm_tests[i].supported && !svm_tests[i].supported())
continue;
if (svm_tests[i].v2 == NULL) {
-   test_run(&svm_tests[i]);
+   if (svm_tests[i].on_vcpu) {
+   if (cpu_count() <= svm_tests[i].on_vcpu)
+   continue;
+   on_cpu_async(svm_tests[i].on_vcpu, (void 
*)test_run, &svm_tests[i]);
+   while (!svm_tests[i].on_vcpu_done)
+   cpu_relax();
+   }
+   else
+   test_run(&svm_tests[i]);
} else {
vmcb_ident(vmcb);
v2_test = &(svm_tests[i]);
diff --git a/x86/svm.h b/x86/svm.h
index f8e7429..1e60d52 100644
--- a/x86/svm.h
+++ b/x86/svm.h
@@ -348,6 +348,8 @@ struct svm_test {
ulong scratch;
/* Alternative test interface. */
void (*v2)(void);
+   int on_vcpu;
+   bool on_vcpu_done;
 };
 
 struct regs {
-- 
2.20.1

[PATCH kvm-unit-tests v2 2/3] svm: INIT and STARTUP ipi test

2020-07-17 Thread Cathy Avery

Init the vcpu and issue the STARTUP ipi to indicate the vcpu
should execute its startup routine.

Signed-off-by: Cathy Avery 
---
 x86/cstart64.S  |  1 +
 x86/svm_tests.c | 57 +
 2 files changed, 58 insertions(+)

diff --git a/x86/cstart64.S b/x86/cstart64.S
index 3ae98d3..dfd7320 100644
--- a/x86/cstart64.S
+++ b/x86/cstart64.S
@@ -7,6 +7,7 @@
 .globl tss_descr
 .globl gdt64_desc
 .globl online_cpus
+.globl cpu_online_count
 
 ipi_vector = 0x20
 
diff --git a/x86/svm_tests.c b/x86/svm_tests.c
index 3b0d019..698eb20 100644
--- a/x86/svm_tests.c
+++ b/x86/svm_tests.c
@@ -17,6 +17,8 @@ static void *scratch_page;
 
 #define LATENCY_RUNS 100
 
+extern u16 cpu_online_count;
+
 u64 tsc_start;
 u64 tsc_end;
 
@@ -1885,6 +1887,58 @@ static bool reg_corruption_check(struct svm_test *test)
 return get_test_stage(test) == 1;
 }
 
+static void get_tss_entry(void *data)
+{
+struct descriptor_table_ptr gdt;
+struct segment_desc64 *gdt_table;
+struct segment_desc64 *tss_entry;
+u16 tr = 0;
+
+sgdt(&gdt);
+tr = str();
+gdt_table = (struct segment_desc64 *) gdt.base;
+tss_entry = &gdt_table[tr / sizeof(struct segment_desc64)];
+*((struct segment_desc64 **)data) = tss_entry;
+}
+
+static int orig_cpu_count;
+
+static void init_startup_prepare(struct svm_test *test)
+{
+struct segment_desc64 *tss_entry;
+int i;
+
+vmcb_ident(vmcb);
+
+on_cpu(1, get_tss_entry, &tss_entry);
+
+orig_cpu_count = cpu_online_count;
+
+apic_icr_write(APIC_DEST_PHYSICAL | APIC_DM_INIT | APIC_INT_ASSERT,
+   id_map[1]);
+
+delay(1ULL);
+
+--cpu_online_count;
+
+*(uint64_t *)tss_entry &= ~DESC_BUSY;
+
+apic_icr_write(APIC_DEST_PHYSICAL | APIC_DM_STARTUP, id_map[1]);
+
+for (i = 0; i < 5 && cpu_online_count < orig_cpu_count; i++)
+   delay(1ULL);
+}
+
+static bool init_startup_finished(struct svm_test *test)
+{
+return true;
+}
+
+static bool init_startup_check(struct svm_test *test)
+{
+return cpu_online_count == orig_cpu_count;
+}
+
 #define TEST(name) { #name, .v2 = name }
 
 /*
@@ -2198,6 +2252,9 @@ struct svm_test svm_tests[] = {
 { "reg_corruption", default_supported, reg_corruption_prepare,
   default_prepare_gif_clear, reg_corruption_test,
   reg_corruption_finished, reg_corruption_check },
+{ "svm_init_startup_test", smp_supported, init_startup_prepare,
+  default_prepare_gif_clear, null_test,
+  init_startup_finished, init_startup_check },
 TEST(svm_guest_state_test),
 { NULL, NULL, NULL, NULL, NULL, NULL, NULL }
 };
-- 
2.20.1

[PATCH kvm-unit-tests v2 0/3] svm: INIT test and test_run on selected vcpu

2020-07-17 Thread Cathy Avery

INIT intercept test and the ability to execute test_run
on a selected vcpu

Changes from v1:

1) Incorporated feedback:
- DR6/DR7/CR2/DEBUGCTL should not be need.
- HSAVE should be set to a different page for each vCPU
- The on_cpu to set EFER should be in setup_svm
- The on_cpu to set cr0/cr3/cr4 should be in setup_vm.

2) Execute tests on selected vcpu using on_cpu_async so the tests
may use the on_cpu functions without causing an ipi_lock deadlock.

3) Added additional test svm_init_startup_test which inits the vcpu and
restarts with sipi.

Cathy Avery (3):
  svm: Add ability to execute test via test_run on a vcpu other than
vcpu 0
  svm: INIT and STARTUP ipi test
  svm: INIT intercept test

 lib/x86/vm.c| 18 +
 lib/x86/vm.h|  7 
 x86/cstart64.S  |  1 +
 x86/svm.c   | 24 +++-
 x86/svm.h   |  2 +
 x86/svm_tests.c | 97 +
 6 files changed, 148 insertions(+), 1 deletion(-)

-- 
2.20.1

Re: [PATCH] KVM: SVM: Use a separate vmcb for the nested L2 guest

2020-09-21 Thread Cathy Avery


On 9/18/20 5:11 PM, Wei Huang wrote:

On 09/17 03:23, Cathy Avery wrote:

svm->vmcb will now point to either a separate vmcb L1 ( not nested ) or L2 vmcb 
( nested ).

Issues:

1) There is some wholesale copying of vmcb.save and vmcb.contol
areas which will need to be refined.

2) There is a workaround in nested_svm_vmexit() where

if (svm->vmcb01->control.asid == 0)
svm->vmcb01->control.asid = svm->nested.vmcb02->control.asid;

This was done as a result of the kvm selftest 'state_test'. In that
test svm_set_nested_state() is called before svm_vcpu_run().
The asid is assigned by svm_vcpu_run -> pre_svm_run for the current
vmcb which is now vmcb02 as we are in nested mode subsequently
vmcb01.control.asid is never set as it should be.

Tested:
kvm-unit-tests
kvm self tests

I was able to run some basic nested SVM tests using this patch. Full L2 VM
(Fedora) had some problem to boot after grub loading kernel.
Thanks for the feedback and my responses are below.  I am looking into 
the load issue.


Comments below.


Signed-off-by: Cathy Avery 
---
  arch/x86/kvm/svm/nested.c | 116 ++
  arch/x86/kvm/svm/svm.c|  41 +++---
  arch/x86/kvm/svm/svm.h|  10 ++--
  3 files changed, 81 insertions(+), 86 deletions(-)

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index e90bc436f584..0a06e62010d8 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -75,12 +75,12 @@ static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu 
*vcpu)
  static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
  {
struct vcpu_svm *svm = to_svm(vcpu);
-   struct vmcb *hsave = svm->nested.hsave;
  
  	WARN_ON(mmu_is_nested(vcpu));
  
  	vcpu->arch.mmu = &vcpu->arch.guest_mmu;

-   kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, hsave->save.cr4, 
hsave->save.efer,
+   kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, svm->vmcb01->save.cr4,
+   svm->vmcb01->save.efer,
svm->nested.ctl.nested_cr3);
vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3;
vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr;
@@ -105,7 +105,7 @@ void recalc_intercepts(struct vcpu_svm *svm)
return;
  
  	c = &svm->vmcb->control;

-   h = &svm->nested.hsave->control;
+   h = &svm->vmcb01->control;
g = &svm->nested.ctl;
  
  	svm->nested.host_intercept_exceptions = h->intercept_exceptions;

@@ -403,7 +403,7 @@ static void nested_prepare_vmcb_control(struct vcpu_svm 
*svm)
  
  	svm->vmcb->control.int_ctl =

(svm->nested.ctl.int_ctl & ~mask) |
-   (svm->nested.hsave->control.int_ctl & mask);
+   (svm->vmcb01->control.int_ctl & mask);
  
  	svm->vmcb->control.virt_ext= svm->nested.ctl.virt_ext;

svm->vmcb->control.int_vector  = svm->nested.ctl.int_vector;
@@ -432,6 +432,12 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 
vmcb_gpa,
int ret;
  
  	svm->nested.vmcb = vmcb_gpa;

+
+   WARN_ON(svm->vmcb == svm->nested.vmcb02);
+
+   svm->nested.vmcb02->control = svm->vmcb01->control;

This part is a bit confusing. svm->vmcb01->control contains the control
info from L0 hypervisor to L1 VM. Shouldn't vmcb02->control use the info
from the control info contained in nested_vmcb?
It does during nested_prepare_vmcb_control().  Now that there is a 
separate vmcb01 and vmcb02 vmcb02.control still relies on the original 
contents of vmcb01.control so vmcb02.control is initialized by a whole 
sale copy of vmcb01.control which is admittedly excessive ( patch issues 
section ) and needs to be refined to just what is needed.



+   svm->vmcb = svm->nested.vmcb02;
+   svm->vmcb_pa = svm->nested.vmcb02_pa;
load_nested_vmcb_control(svm, &nested_vmcb->control);
nested_prepare_vmcb_save(svm, nested_vmcb);
nested_prepare_vmcb_control(svm);
@@ -450,8 +456,6 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
  {
int ret;
struct vmcb *nested_vmcb;
-   struct vmcb *hsave = svm->nested.hsave;
-   struct vmcb *vmcb = svm->vmcb;
struct kvm_host_map map;
u64 vmcb_gpa;
  
@@ -496,29 +500,17 @@ int nested_svm_vmrun(struct vcpu_svm *svm)

kvm_clear_exception_queue(&svm->vcpu);
kvm_clear_interrupt_queue(&svm->vcpu);
  
-	/*

-* Save the old vmcb, so we don't need to pick what we save, but can
-* restore everything when a VMEXIT occurs
-*/
-   hsave->save.es = vmcb->save.es;
-   hsave->save.cs = vmcb->save.cs;
-   hsave->save.ss = vmcb->save.ss;
-

Re: [PATCH] KVM: SVM: Use a separate vmcb for the nested L2 guest

2020-09-18 Thread Cathy Avery


On 9/18/20 11:16 AM, Babu Moger wrote:

Cathy,
Thanks for the patches. It cleans up the code nicely.
But there are some issues with the patch. I was able to bring the L1 guest
with your patch. But when I tried to load L2 guest it crashed. I am
thinking It is mostly due to save/restore part of vmcb. Few comments below.


-Original Message-
From: Cathy Avery 
Sent: Thursday, September 17, 2020 2:23 PM
To: linux-kernel@vger.kernel.org; k...@vger.kernel.org; pbonz...@redhat.com
Cc: vkuzn...@redhat.com; Huang2, Wei 
Subject: [PATCH] KVM: SVM: Use a separate vmcb for the nested L2 guest

svm->vmcb will now point to either a separate vmcb L1 ( not nested ) or L2 vmcb
( nested ).

Issues:

1) There is some wholesale copying of vmcb.save and vmcb.contol
areas which will need to be refined.

2) There is a workaround in nested_svm_vmexit() where

if (svm->vmcb01->control.asid == 0)
svm->vmcb01->control.asid = svm->nested.vmcb02->control.asid;

This was done as a result of the kvm selftest 'state_test'. In that
test svm_set_nested_state() is called before svm_vcpu_run().
The asid is assigned by svm_vcpu_run -> pre_svm_run for the current
vmcb which is now vmcb02 as we are in nested mode subsequently
vmcb01.control.asid is never set as it should be.

Tested:
kvm-unit-tests
kvm self tests

Signed-off-by: Cathy Avery 
---
  arch/x86/kvm/svm/nested.c | 116 ++
  arch/x86/kvm/svm/svm.c|  41 +++---
  arch/x86/kvm/svm/svm.h|  10 ++--
  3 files changed, 81 insertions(+), 86 deletions(-)

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index
e90bc436f584..0a06e62010d8 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -75,12 +75,12 @@ static unsigned long nested_svm_get_tdp_cr3(struct
kvm_vcpu *vcpu)  static void nested_svm_init_mmu_context(struct kvm_vcpu
*vcpu)  {
struct vcpu_svm *svm = to_svm(vcpu);
-   struct vmcb *hsave = svm->nested.hsave;

WARN_ON(mmu_is_nested(vcpu));

vcpu->arch.mmu = &vcpu->arch.guest_mmu;
-   kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, hsave->save.cr4,
hsave->save.efer,
+   kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, svm->vmcb01-

save.cr4,

+   svm->vmcb01->save.efer,
svm->nested.ctl.nested_cr3);
vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3;
vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr;
@@ -105,7 +105,7 @@ void recalc_intercepts(struct vcpu_svm *svm)
return;

c = &svm->vmcb->control;
-   h = &svm->nested.hsave->control;
+   h = &svm->vmcb01->control;
g = &svm->nested.ctl;

svm->nested.host_intercept_exceptions = h->intercept_exceptions;
@@ -403,7 +403,7 @@ static void nested_prepare_vmcb_control(struct
vcpu_svm *svm)

svm->vmcb->control.int_ctl =
(svm->nested.ctl.int_ctl & ~mask) |
-   (svm->nested.hsave->control.int_ctl & mask);
+   (svm->vmcb01->control.int_ctl & mask);

svm->vmcb->control.virt_ext= svm->nested.ctl.virt_ext;
svm->vmcb->control.int_vector  = svm->nested.ctl.int_vector;
@@ -432,6 +432,12 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64
vmcb_gpa,
int ret;

svm->nested.vmcb = vmcb_gpa;
+
+   WARN_ON(svm->vmcb == svm->nested.vmcb02);
+
+   svm->nested.vmcb02->control = svm->vmcb01->control;
+   svm->vmcb = svm->nested.vmcb02;
+   svm->vmcb_pa = svm->nested.vmcb02_pa;
load_nested_vmcb_control(svm, &nested_vmcb->control);
nested_prepare_vmcb_save(svm, nested_vmcb);
nested_prepare_vmcb_control(svm);
@@ -450,8 +456,6 @@ int nested_svm_vmrun(struct vcpu_svm *svm)  {
int ret;
struct vmcb *nested_vmcb;
-   struct vmcb *hsave = svm->nested.hsave;
-   struct vmcb *vmcb = svm->vmcb;
struct kvm_host_map map;
u64 vmcb_gpa;

@@ -496,29 +500,17 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
kvm_clear_exception_queue(&svm->vcpu);
kvm_clear_interrupt_queue(&svm->vcpu);

-   /*
-* Save the old vmcb, so we don't need to pick what we save, but can
-* restore everything when a VMEXIT occurs
-*/
-   hsave->save.es = vmcb->save.es;
-   hsave->save.cs = vmcb->save.cs;
-   hsave->save.ss = vmcb->save.ss;
-   hsave->save.ds = vmcb->save.ds;
-   hsave->save.gdtr   = vmcb->save.gdtr;
-   hsave->save.idtr   = vmcb->save.idtr;
-   hsave->save.efer   = svm->vcpu.arch.efer;
-   hsave->save.cr0= kvm_read_cr0(&s

[PATCH] KVM: SVM: Use a separate vmcb for the nested L2 guest

2020-09-17 Thread Cathy Avery

svm->vmcb will now point to either a separate vmcb L1 ( not nested ) or L2 vmcb 
( nested ).

Issues:

1) There is some wholesale copying of vmcb.save and vmcb.contol
   areas which will need to be refined.

2) There is a workaround in nested_svm_vmexit() where

   if (svm->vmcb01->control.asid == 0)
   svm->vmcb01->control.asid = svm->nested.vmcb02->control.asid;

   This was done as a result of the kvm selftest 'state_test'. In that
   test svm_set_nested_state() is called before svm_vcpu_run().
   The asid is assigned by svm_vcpu_run -> pre_svm_run for the current
   vmcb which is now vmcb02 as we are in nested mode subsequently
   vmcb01.control.asid is never set as it should be.

Tested:
kvm-unit-tests
kvm self tests

Signed-off-by: Cathy Avery 
---
 arch/x86/kvm/svm/nested.c | 116 ++
 arch/x86/kvm/svm/svm.c|  41 +++---
 arch/x86/kvm/svm/svm.h|  10 ++--
 3 files changed, 81 insertions(+), 86 deletions(-)

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index e90bc436f584..0a06e62010d8 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -75,12 +75,12 @@ static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu 
*vcpu)
 static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
 {
struct vcpu_svm *svm = to_svm(vcpu);
-   struct vmcb *hsave = svm->nested.hsave;
 
WARN_ON(mmu_is_nested(vcpu));
 
vcpu->arch.mmu = &vcpu->arch.guest_mmu;
-   kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, hsave->save.cr4, 
hsave->save.efer,
+   kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, svm->vmcb01->save.cr4,
+   svm->vmcb01->save.efer,
svm->nested.ctl.nested_cr3);
vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3;
vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr;
@@ -105,7 +105,7 @@ void recalc_intercepts(struct vcpu_svm *svm)
return;
 
c = &svm->vmcb->control;
-   h = &svm->nested.hsave->control;
+   h = &svm->vmcb01->control;
g = &svm->nested.ctl;
 
svm->nested.host_intercept_exceptions = h->intercept_exceptions;
@@ -403,7 +403,7 @@ static void nested_prepare_vmcb_control(struct vcpu_svm 
*svm)
 
svm->vmcb->control.int_ctl =
(svm->nested.ctl.int_ctl & ~mask) |
-   (svm->nested.hsave->control.int_ctl & mask);
+   (svm->vmcb01->control.int_ctl & mask);
 
svm->vmcb->control.virt_ext= svm->nested.ctl.virt_ext;
svm->vmcb->control.int_vector  = svm->nested.ctl.int_vector;
@@ -432,6 +432,12 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 
vmcb_gpa,
int ret;
 
svm->nested.vmcb = vmcb_gpa;
+
+   WARN_ON(svm->vmcb == svm->nested.vmcb02);
+
+   svm->nested.vmcb02->control = svm->vmcb01->control;
+   svm->vmcb = svm->nested.vmcb02;
+   svm->vmcb_pa = svm->nested.vmcb02_pa;
load_nested_vmcb_control(svm, &nested_vmcb->control);
nested_prepare_vmcb_save(svm, nested_vmcb);
nested_prepare_vmcb_control(svm);
@@ -450,8 +456,6 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
 {
int ret;
struct vmcb *nested_vmcb;
-   struct vmcb *hsave = svm->nested.hsave;
-   struct vmcb *vmcb = svm->vmcb;
struct kvm_host_map map;
u64 vmcb_gpa;
 
@@ -496,29 +500,17 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
kvm_clear_exception_queue(&svm->vcpu);
kvm_clear_interrupt_queue(&svm->vcpu);
 
-   /*
-* Save the old vmcb, so we don't need to pick what we save, but can
-* restore everything when a VMEXIT occurs
-*/
-   hsave->save.es = vmcb->save.es;
-   hsave->save.cs = vmcb->save.cs;
-   hsave->save.ss = vmcb->save.ss;
-   hsave->save.ds = vmcb->save.ds;
-   hsave->save.gdtr   = vmcb->save.gdtr;
-   hsave->save.idtr   = vmcb->save.idtr;
-   hsave->save.efer   = svm->vcpu.arch.efer;
-   hsave->save.cr0= kvm_read_cr0(&svm->vcpu);
-   hsave->save.cr4= svm->vcpu.arch.cr4;
-   hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
-   hsave->save.rip= kvm_rip_read(&svm->vcpu);
-   hsave->save.rsp= vmcb->save.rsp;
-   hsave->save.rax= vmcb->save.rax;
-   if (npt_enabled)
-   hsave->save.cr3= vmcb->save.cr3;
-   else
-   hsave->save.cr3= kvm_read_cr3(&svm->vcpu);
-
-   copy_vmcb_control_area(&hsave->control, &vmcb->control);
+
+   /* Update vmcb0. We will restore

[PATCH] scsi: storvsc: Allow only one remove lun work item to be issued per lun

2017-04-15 Thread Cathy Avery

When running multipath on a VM if all available paths go down
the driver can schedule large amounts of storvsc_remove_lun
work items to the same lun. In response to the failing paths
typically storvsc responds by taking host->scan_mutex and issuing
a TUR per lun. If there has been heavy IO to the failed device
all the failed IOs are returned from the host. A remove lun work
item is issued per failed IO. If the outstanding TURs have not been
completed in a timely manner the scan_mutex is never released or
released too late. Consequently the many remove lun work items are
not completed as scsi_remove_device also tries to take host->scan_mutex.
This results in dragging the VM down and sometimes completely.

This patch only allows one remove lun to be issued to a particular
lun while it is an instantiated member of the scsi stack.

Signed-off-by: Cathy Avery 
---
 drivers/scsi/storvsc_drv.c | 33 +++--
 1 file changed, 31 insertions(+), 2 deletions(-)

diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index 016639d..9dbb5bf 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -478,6 +478,10 @@ struct storvsc_device {
u64 port_name;
 };
 
+struct storvsc_dev_hostdata {
+   atomic_t req_remove_lun;
+};
+
 struct hv_host_device {
struct hv_device *dev;
unsigned int port;
@@ -918,6 +922,8 @@ static void storvsc_handle_error(struct vmscsi_request 
*vm_srb,
u8 asc, u8 ascq)
 {
struct storvsc_scan_work *wrk;
+   struct storvsc_dev_hostdata *hostdata;
+   struct scsi_device *sdev;
void (*process_err_fn)(struct work_struct *work);
bool do_work = false;
 
@@ -953,8 +959,17 @@ static void storvsc_handle_error(struct vmscsi_request 
*vm_srb,
}
break;
case SRB_STATUS_INVALID_LUN:
-   do_work = true;
-   process_err_fn = storvsc_remove_lun;
+   sdev = scsi_device_lookup(host, 0, vm_srb->target_id,
+ vm_srb->lun);
+   if (sdev) {
+   hostdata = sdev->hostdata;
+   if (hostdata &&
+   !atomic_cmpxchg(&hostdata->req_remove_lun, 0, 1)) {
+   do_work = true;
+   process_err_fn = storvsc_remove_lun;
+   }
+   scsi_device_put(sdev);
+   }
break;
case SRB_STATUS_ABORTED:
if (vm_srb->srb_status & SRB_STATUS_AUTOSENSE_VALID &&
@@ -1426,9 +1441,22 @@ static int storvsc_device_configure(struct scsi_device 
*sdevice)
sdevice->no_write_same = 0;
}
 
+   sdevice->hostdata = kzalloc(sizeof(struct storvsc_dev_hostdata),
+   GFP_ATOMIC);
+   if (!sdevice->hostdata)
+   return -ENOMEM;
+
return 0;
 }
 
+static void storvsc_device_destroy(struct scsi_device *sdevice)
+{
+   if (sdevice->hostdata) {
+   kfree(sdevice->hostdata);
+   sdevice->hostdata = NULL;
+   }
+}
+
 static int storvsc_get_chs(struct scsi_device *sdev, struct block_device * 
bdev,
   sector_t capacity, int *info)
 {
@@ -1669,6 +1697,7 @@ static struct scsi_host_template scsi_driver = {
.eh_timed_out = storvsc_eh_timed_out,
.slave_alloc =  storvsc_device_alloc,
.slave_configure =  storvsc_device_configure,
+   .slave_destroy =storvsc_device_destroy,
.cmd_per_lun =  255,
.this_id =  -1,
.use_clustering =   ENABLE_CLUSTERING,
-- 
2.5.0

[PATCH v3 1/2] scsi: scsi_transport_fc: Add dummy initiator role to rport

2017-04-17 Thread Cathy Avery

This patch allows scsi drivers that expose virturalized fibre channel
devices but that do not expose rports to successfully rescan the scsi
bus via echo "- - -" > /sys/class/scsi_host/hostX/scan.
Drivers can create a pseudo rport and indicate
FC_PORT_ROLE_FCP_DUMMY_INITIATOR as the rport's role in
fc_rport_identifiers. This insures that a valid scsi_target_id
is assigned to the newly created rport and it can meet the
requirements of fc_user_scan_tgt calling scsi_scan_target.

Signed-off-by: Cathy Avery 
---
 drivers/scsi/scsi_transport_fc.c | 10 ++
 include/scsi/scsi_transport_fc.h |  1 +
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c
index 2d753c9..de85602 100644
--- a/drivers/scsi/scsi_transport_fc.c
+++ b/drivers/scsi/scsi_transport_fc.c
@@ -289,9 +289,10 @@ static const struct {
u32 value;
char*name;
 } fc_port_role_names[] = {
-   { FC_PORT_ROLE_FCP_TARGET,  "FCP Target" },
-   { FC_PORT_ROLE_FCP_INITIATOR,   "FCP Initiator" },
-   { FC_PORT_ROLE_IP_PORT, "IP Port" },
+   { FC_PORT_ROLE_FCP_TARGET,  "FCP Target" },
+   { FC_PORT_ROLE_FCP_INITIATOR,   "FCP Initiator" },
+   { FC_PORT_ROLE_IP_PORT, "IP Port" },
+   { FC_PORT_ROLE_FCP_DUMMY_INITIATOR, "FCP Dummy Initiator" },
 };
 fc_bitfield_name_search(port_roles, fc_port_role_names)
 
@@ -2628,7 +2629,8 @@ fc_remote_port_create(struct Scsi_Host *shost, int 
channel,
spin_lock_irqsave(shost->host_lock, flags);
 
rport->number = fc_host->next_rport_number++;
-   if (rport->roles & FC_PORT_ROLE_FCP_TARGET)
+   if ((rport->roles & FC_PORT_ROLE_FCP_TARGET) ||
+   (rport->roles & FC_PORT_ROLE_FCP_DUMMY_INITIATOR))
rport->scsi_target_id = fc_host->next_target_id++;
else
rport->scsi_target_id = -1;
diff --git a/include/scsi/scsi_transport_fc.h b/include/scsi/scsi_transport_fc.h
index b21b8aa5..6e208bb 100644
--- a/include/scsi/scsi_transport_fc.h
+++ b/include/scsi/scsi_transport_fc.h
@@ -162,6 +162,7 @@ enum fc_tgtid_binding_type  {
 #define FC_PORT_ROLE_FCP_TARGET0x01
 #define FC_PORT_ROLE_FCP_INITIATOR 0x02
 #define FC_PORT_ROLE_IP_PORT   0x04
+#define FC_PORT_ROLE_FCP_DUMMY_INITIATOR   0x08
 
 /* The following are for compatibility */
 #define FC_RPORT_ROLE_UNKNOWN  FC_PORT_ROLE_UNKNOWN
-- 
2.5.0

[PATCH v3 0/2] scsi: storvsc: Add support for FC rport

2017-04-17 Thread Cathy Avery

The updated patch set provides a way for drivers ( specifically
storvsc in this case ) that expose virturalized fc devices
but that do not expose rports to be manually scanned. This is
done via creating a pseudo rport in storvsc and a
corresponding dummy initiator rport role in the fc transport.

Changes since v2:
- Additional patch adding FC_PORT_ROLE_FCP_DUMMY_INITIATOR role
  to fc_transport
- Changed storvsc rport role to FC_PORT_ROLE_FCP_DUMMY_INITIATOR

Changes since v1:
- Fix fc_rport_identifiers init [Stephen Hemminger]
- Better error checking


Cathy Avery (2):
  scsi: scsi_transport_fc: Add dummy initiator role to rport
  scsi: storvsc: Add support for FC rport.

 drivers/scsi/scsi_transport_fc.c | 10 ++
 drivers/scsi/storvsc_drv.c   | 23 ++-
 include/scsi/scsi_transport_fc.h |  1 +
 3 files changed, 25 insertions(+), 9 deletions(-)

-- 
2.5.0

[PATCH v3 2/2] scsi: storvsc: Add support for FC rport.

2017-04-17 Thread Cathy Avery

Included in the current storvsc driver for Hyper-V is the ability
to access luns on an FC fabric via a virtualized fiber channel
adapter exposed by the Hyper-V host. The driver also attaches to
the FC transport to allow host and port names to be published under
/sys/class/fc_host/hostX. Current customer tools running on the VM
require that these names be available in the well known standard
location under fc_host/hostX.

This patch stubs in an rport per fc_host and sets its rport role
as FC_PORT_ROLE_FCP_DUMMY_INITIATOR to indicate to the fc_transport
that it is a pseudo rport in order to scan the scsi stack via
echo "- - -" > /sys/class/scsi_host/hostX/scan.

Signed-off-by: Cathy Avery 
---
 drivers/scsi/storvsc_drv.c | 23 ++-
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index 016639d..1ec8579 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -476,6 +476,9 @@ struct storvsc_device {
 */
u64 node_name;
u64 port_name;
+#if IS_ENABLED(CONFIG_SCSI_FC_ATTRS)
+   struct fc_rport *rport;
+#endif
 };
 
 struct hv_host_device {
@@ -1823,19 +1826,27 @@ static int storvsc_probe(struct hv_device *device,
target = (device->dev_instance.b[5] << 8 |
 device->dev_instance.b[4]);
ret = scsi_add_device(host, 0, target, 0);
-   if (ret) {
-   scsi_remove_host(host);
-   goto err_out2;
-   }
+   if (ret)
+   goto err_out3;
}
 #if IS_ENABLED(CONFIG_SCSI_FC_ATTRS)
if (host->transportt == fc_transport_template) {
+   struct fc_rport_identifiers ids = {
+   .roles = FC_PORT_ROLE_FCP_DUMMY_INITIATOR,
+   };
+
fc_host_node_name(host) = stor_device->node_name;
fc_host_port_name(host) = stor_device->port_name;
+   stor_device->rport = fc_remote_port_add(host, 0, &ids);
+   if (!stor_device->rport)
+   goto err_out3;
}
 #endif
return 0;
 
+err_out3:
+   scsi_remove_host(host);
+
 err_out2:
/*
 * Once we have connected with the host, we would need to
@@ -1861,8 +1872,10 @@ static int storvsc_remove(struct hv_device *dev)
struct Scsi_Host *host = stor_device->host;
 
 #if IS_ENABLED(CONFIG_SCSI_FC_ATTRS)
-   if (host->transportt == fc_transport_template)
+   if (host->transportt == fc_transport_template) {
+   fc_remote_port_delete(stor_device->rport);
fc_remove_host(host);
+   }
 #endif
scsi_remove_host(host);
storvsc_dev_remove(dev);
-- 
2.5.0

Re: [PATCH] scsi: storvsc: Allow only one remove lun work item to be issued per lun

2017-04-17 Thread Cathy Avery


On 04/15/2017 10:06 AM, Christoph Hellwig wrote:

Just add a singlethreaded workqueue for storvsc_handle_error and you'll
get serialization for all error handling for free.


The problem I am seeing is that many work items can be queued up for the 
same lun before it goes away. The single threaded queue would have to allow
for only a queue of one and no more. Either that or each work item for a 
particular lun must have the same memory address so it gets

rejected if it you try to queue a remove to the same lun twice.

Maybe I am not understanding your suggestion correctly.

Thanks,

Cathy

[PATCH v2] scsi: storvsc: Add support for FC rport.

2017-04-03 Thread Cathy Avery

Included in the current storvsc driver for Hyper-V is the ability
to access luns on an FC fabric via a virtualized fiber channel
adapter exposed by the Hyper-V host. The driver also attaches to
the FC transport to allow host and port names to be published under
/sys/class/fc_host/hostX. Current customer tools running on the VM
require that these names be available in the well known standard
location under fc_host/hostX.

A problem arose when attaching to the FC transport. The scsi_scan
code attempts to call fc_user_scan which has basically become a no-op
due to the fact that the virtualized FC device does not expose rports.
At this point you cannot refresh the scsi bus after mapping or unmapping
luns on the SAN without a reboot of the VM.

This patch stubs in an rport per fc_host in storvsc so that the
requirement of a defined rport is now met within the fc_transport and
echo "- - -" > /sys/class/scsi_host/hostX/scan now works.

Signed-off-by: Cathy Avery 
---
Changes since v1:
- Fix fc_rport_identifiers init [Stephen Hemminger]
- Better error checking
---
 drivers/scsi/storvsc_drv.c | 23 ++-
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index 638e5f4..37646d1 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -478,6 +478,9 @@ struct storvsc_device {
 */
u64 node_name;
u64 port_name;
+#if IS_ENABLED(CONFIG_SCSI_FC_ATTRS)
+   struct fc_rport *rport;
+#endif
 };
 
 struct hv_host_device {
@@ -1816,19 +1819,27 @@ static int storvsc_probe(struct hv_device *device,
target = (device->dev_instance.b[5] << 8 |
 device->dev_instance.b[4]);
ret = scsi_add_device(host, 0, target, 0);
-   if (ret) {
-   scsi_remove_host(host);
-   goto err_out2;
-   }
+   if (ret)
+   goto err_out3;
}
 #if IS_ENABLED(CONFIG_SCSI_FC_ATTRS)
if (host->transportt == fc_transport_template) {
+   struct fc_rport_identifiers ids = {
+   .roles = FC_PORT_ROLE_FCP_TARGET,
+   };
+
fc_host_node_name(host) = stor_device->node_name;
fc_host_port_name(host) = stor_device->port_name;
+   stor_device->rport = fc_remote_port_add(host, 0, &ids);
+   if (!stor_device->rport)
+   goto err_out3;
}
 #endif
return 0;
 
+err_out3:
+   scsi_remove_host(host);
+
 err_out2:
/*
 * Once we have connected with the host, we would need to
@@ -1854,8 +1865,10 @@ static int storvsc_remove(struct hv_device *dev)
struct Scsi_Host *host = stor_device->host;
 
 #if IS_ENABLED(CONFIG_SCSI_FC_ATTRS)
-   if (host->transportt == fc_transport_template)
+   if (host->transportt == fc_transport_template) {
+   fc_remote_port_delete(stor_device->rport);
fc_remove_host(host);
+   }
 #endif
scsi_remove_host(host);
storvsc_dev_remove(dev);
-- 
2.5.0

Re: [PATCH v2] scsi: storvsc: Add support for FC rport.

2017-04-03 Thread Cathy Avery


On 04/03/2017 08:17 AM, Christoph Hellwig wrote:

if (host->transportt == fc_transport_template) {
+   struct fc_rport_identifiers ids = {
+   .roles = FC_PORT_ROLE_FCP_TARGET,
+   };

I don't think storvsc ever acts as FCP target.


In order to implement the work around so that the scsi scan works 
indicating FC_PORT_ROLE_FCP_TARGET as a role was necessary due to its 
test in fc_scsi_scan_rport. The idea here is to avoid making any changes 
to the fc_transport driver which was of some concern.



Thanks,

Cathy

[PATCH v3 0/2] scsi: storvsc: Add support for FC rport

2017-04-05 Thread Cathy Avery

The updated patch set provides a way for drivers ( specifically
storvsc in this case ) that expose virturalized fc devices
but that do not expose rports to be manually scanned. This is
done via creating a pseudo rport in storvsc and a
corresponding dummy initiator rport role in the fc transport.

Changes since v2:
- Additional patch adding FC_PORT_ROLE_FCP_DUMMY_INITIATOR role
  to fc_transport
- Changed storvsc rport role to FC_PORT_ROLE_FCP_DUMMY_INITIATOR

Changes since v1:
- Fix fc_rport_identifiers init [Stephen Hemminger]
- Better error checking


Cathy Avery (2):
  scsi: scsi_transport_fc: Add dummy initiator role to rport
  scsi: storvsc: Add support for FC rport.

 drivers/scsi/scsi_transport_fc.c | 10 ++
 drivers/scsi/storvsc_drv.c   | 23 ++-
 include/scsi/scsi_transport_fc.h |  1 +
 3 files changed, 25 insertions(+), 9 deletions(-)

-- 
2.5.0

[PATCH v3 2/2] scsi: storvsc: Add support for FC rport.

2017-04-05 Thread Cathy Avery

Included in the current storvsc driver for Hyper-V is the ability
to access luns on an FC fabric via a virtualized fiber channel
adapter exposed by the Hyper-V host. The driver also attaches to
the FC transport to allow host and port names to be published under
/sys/class/fc_host/hostX. Current customer tools running on the VM
require that these names be available in the well known standard
location under fc_host/hostX.

This patch stubs in an rport per fc_host and sets its rport role
as FC_PORT_ROLE_FCP_DUMMY_INITIATOR to indicate to the fc_transport
that it is a pseudo rport in order to scan the scsi stack via
echo "- - -" > /sys/class/scsi_host/hostX/scan.

Signed-off-by: Cathy Avery 
---
 drivers/scsi/storvsc_drv.c | 23 ++-
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index 016639d..1ec8579 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -476,6 +476,9 @@ struct storvsc_device {
 */
u64 node_name;
u64 port_name;
+#if IS_ENABLED(CONFIG_SCSI_FC_ATTRS)
+   struct fc_rport *rport;
+#endif
 };
 
 struct hv_host_device {
@@ -1823,19 +1826,27 @@ static int storvsc_probe(struct hv_device *device,
target = (device->dev_instance.b[5] << 8 |
 device->dev_instance.b[4]);
ret = scsi_add_device(host, 0, target, 0);
-   if (ret) {
-   scsi_remove_host(host);
-   goto err_out2;
-   }
+   if (ret)
+   goto err_out3;
}
 #if IS_ENABLED(CONFIG_SCSI_FC_ATTRS)
if (host->transportt == fc_transport_template) {
+   struct fc_rport_identifiers ids = {
+   .roles = FC_PORT_ROLE_FCP_DUMMY_INITIATOR,
+   };
+
fc_host_node_name(host) = stor_device->node_name;
fc_host_port_name(host) = stor_device->port_name;
+   stor_device->rport = fc_remote_port_add(host, 0, &ids);
+   if (!stor_device->rport)
+   goto err_out3;
}
 #endif
return 0;
 
+err_out3:
+   scsi_remove_host(host);
+
 err_out2:
/*
 * Once we have connected with the host, we would need to
@@ -1861,8 +1872,10 @@ static int storvsc_remove(struct hv_device *dev)
struct Scsi_Host *host = stor_device->host;
 
 #if IS_ENABLED(CONFIG_SCSI_FC_ATTRS)
-   if (host->transportt == fc_transport_template)
+   if (host->transportt == fc_transport_template) {
+   fc_remote_port_delete(stor_device->rport);
fc_remove_host(host);
+   }
 #endif
scsi_remove_host(host);
storvsc_dev_remove(dev);
-- 
2.5.0

[PATCH v3 1/2] scsi: scsi_transport_fc: Add dummy initiator role to rport

2017-04-05 Thread Cathy Avery

This patch allows scsi drivers that expose virturalized fibre channel
devices but that do not expose rports to successfully rescan the scsi
bus via echo "- - -" > /sys/class/scsi_host/hostX/scan.
Drivers can create a pseudo rport and indicate
FC_PORT_ROLE_FCP_DUMMY_INITIATOR as the rport's role in
fc_rport_identifiers. This insures that a valid scsi_target_id
is assigned to the newly created rport and it can meet the
requirements of fc_user_scan_tgt calling scsi_scan_target.

Signed-off-by: Cathy Avery 
---
 drivers/scsi/scsi_transport_fc.c | 10 ++
 include/scsi/scsi_transport_fc.h |  1 +
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c
index 2d753c9..de85602 100644
--- a/drivers/scsi/scsi_transport_fc.c
+++ b/drivers/scsi/scsi_transport_fc.c
@@ -289,9 +289,10 @@ static const struct {
u32 value;
char*name;
 } fc_port_role_names[] = {
-   { FC_PORT_ROLE_FCP_TARGET,  "FCP Target" },
-   { FC_PORT_ROLE_FCP_INITIATOR,   "FCP Initiator" },
-   { FC_PORT_ROLE_IP_PORT, "IP Port" },
+   { FC_PORT_ROLE_FCP_TARGET,  "FCP Target" },
+   { FC_PORT_ROLE_FCP_INITIATOR,   "FCP Initiator" },
+   { FC_PORT_ROLE_IP_PORT, "IP Port" },
+   { FC_PORT_ROLE_FCP_DUMMY_INITIATOR, "FCP Dummy Initiator" },
 };
 fc_bitfield_name_search(port_roles, fc_port_role_names)
 
@@ -2628,7 +2629,8 @@ fc_remote_port_create(struct Scsi_Host *shost, int 
channel,
spin_lock_irqsave(shost->host_lock, flags);
 
rport->number = fc_host->next_rport_number++;
-   if (rport->roles & FC_PORT_ROLE_FCP_TARGET)
+   if ((rport->roles & FC_PORT_ROLE_FCP_TARGET) ||
+   (rport->roles & FC_PORT_ROLE_FCP_DUMMY_INITIATOR))
rport->scsi_target_id = fc_host->next_target_id++;
else
rport->scsi_target_id = -1;
diff --git a/include/scsi/scsi_transport_fc.h b/include/scsi/scsi_transport_fc.h
index b21b8aa5..6e208bb 100644
--- a/include/scsi/scsi_transport_fc.h
+++ b/include/scsi/scsi_transport_fc.h
@@ -162,6 +162,7 @@ enum fc_tgtid_binding_type  {
 #define FC_PORT_ROLE_FCP_TARGET0x01
 #define FC_PORT_ROLE_FCP_INITIATOR 0x02
 #define FC_PORT_ROLE_IP_PORT   0x04
+#define FC_PORT_ROLE_FCP_DUMMY_INITIATOR   0x08
 
 /* The following are for compatibility */
 #define FC_RPORT_ROLE_UNKNOWN  FC_PORT_ROLE_UNKNOWN
-- 
2.5.0

[PATCH] scsi: storvsc: Fix scsi_cmd error assignments in storvsc_handle_error

2017-12-19 Thread Cathy Avery

When an I/O is returned with an srb_status of SRB_STATUS_INVALID_LUN
which has zero good_bytes it must be assigned an error. Otherwise
the I/O will be continuously requeued and will cause a deadlock in the
case where disks are being hot added and removed. sd_probe_async will
wait forever for its I/O to complete while holding scsi_sd_probe_domain.

Also returning the default error of DID_TARGET_FAILURE causes
multipath to not retry the I/O resulting in applications receiving I/O
errors before a failover can occur.

Signed-off-by: Cathy Avery 
Signed-off-by: Long Li 
---
 drivers/scsi/storvsc_drv.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index 1b06cf0..3b3d1d0 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -953,10 +953,11 @@ static void storvsc_handle_error(struct vmscsi_request 
*vm_srb,
case TEST_UNIT_READY:
break;
default:
-   set_host_byte(scmnd, DID_TARGET_FAILURE);
+   set_host_byte(scmnd, DID_ERROR);
}
break;
case SRB_STATUS_INVALID_LUN:
+   set_host_byte(scmnd, DID_NO_CONNECT);
do_work = true;
process_err_fn = storvsc_remove_lun;
break;
-- 
2.5.0

71 matches

Mail list logo