/* Help string handed to PetscInitialize() in main(). */
static char help[] = "MWE\n";

#include <petscmat.h>
#include <petsc/private/petscimpl.h>

int main(int argc,char **argv)
{
  Mat            A,B;
  Vec            t,v;
  PetscScalar    *vv,*aa;
  PetscInt       n=30,k=6,l=0,i,Istart,Iend,nloc,bs,test=1;
  PetscErrorCode ierr;

  ierr = PetscInitialize(&argc,&argv,(char*)0,help);if (ierr) return ierr;
  ierr = PetscOptionsGetInt(NULL,NULL,"-n",&n,NULL);CHKERRQ(ierr);
  ierr = PetscOptionsGetInt(NULL,NULL,"-k",&k,NULL);CHKERRQ(ierr);
  ierr = PetscOptionsGetInt(NULL,NULL,"-test",&test,NULL);CHKERRQ(ierr);

  /* sparse matrix */
  ierr = MatCreate(PETSC_COMM_WORLD,&A);CHKERRQ(ierr);
  ierr = MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,n,n);CHKERRQ(ierr);
  ierr = MatSetType(A,MATAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetFromOptions(A);CHKERRQ(ierr);
  ierr = MatSetUp(A);CHKERRQ(ierr);

  ierr = MatGetOwnershipRange(A,&Istart,&Iend);CHKERRQ(ierr);
  for (i=Istart;i<Iend;i++) {
    if (i>0) { ierr = MatSetValue(A,i,i-1,-1.0,INSERT_VALUES);CHKERRQ(ierr); }
    if (i<n-1) { ierr = MatSetValue(A,i,i+1,-1.0,INSERT_VALUES);CHKERRQ(ierr); }
    ierr = MatSetValue(A,i,i,2.0,INSERT_VALUES);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  /* template vector */
  ierr = MatCreateVecs(A,NULL,&t);CHKERRQ(ierr);

  /* long vector, contains the stacked columns of an nxk dense matrix */
  ierr = VecGetLocalSize(t,&nloc);CHKERRQ(ierr);
  ierr = VecGetBlockSize(t,&bs);CHKERRQ(ierr);
  ierr = VecCreate(PetscObjectComm((PetscObject)t),&v);CHKERRQ(ierr);
  ierr = VecSetType(v,((PetscObject)t)->type_name);CHKERRQ(ierr);
  ierr = VecSetSizes(v,k*nloc,k*n);CHKERRQ(ierr);
  ierr = VecSetBlockSize(v,bs);CHKERRQ(ierr);

  /* dense matrix that contains the columns of v */
  ierr = VecCUDAGetArray(v,&vv);CHKERRQ(ierr);
  if (test==1) {
    ierr = MatCreateDenseCUDA(PetscObjectComm((PetscObject)v),nloc,PETSC_DECIDE,n,k-l,vv,&B);CHKERRQ(ierr); /* pass a pointer to avoid allocation of storage */
    ierr = MatDenseCUDAPlaceArray(B,NULL);CHKERRQ(ierr);  /* replace with a null pointer, the value after BVRestoreMat */
    ierr = MatDenseCUDAPlaceArray(B,vv+l*nloc);CHKERRQ(ierr);  /* set the actual pointer */
  } else {
    ierr = MatCreateDenseCUDA(PetscObjectComm((PetscObject)v),nloc,PETSC_DECIDE,n,k-l,NULL,&B);CHKERRQ(ierr);
    ierr = MatDenseCUDAPlaceArray(B,vv+l*nloc);CHKERRQ(ierr);  /* set the actual pointer */
  }
  ierr = VecCUDARestoreArray(v,&vv);CHKERRQ(ierr);

  /* use B here*/

  /* finished using B */
  ierr = MatDenseCUDAGetArray(B,&aa);CHKERRQ(ierr);
  vv = aa-l*nloc;
  ierr = MatDenseCUDAResetArray(B);CHKERRQ(ierr);
  ierr = VecCUDARestoreArray(v,&vv);CHKERRQ(ierr);


  /* free work space */
  ierr = MatDestroy(&A);CHKERRQ(ierr);
  ierr = VecDestroy(&t);CHKERRQ(ierr);
  ierr = VecDestroy(&v);CHKERRQ(ierr);
  ierr = PetscFinalize();
  return ierr;
}
