Hi,

This gives me more combinations than you got with SAS. Also, when a dummy==1 firm ends up with duplicate matches, it keeps the dummy==0 match whose dimension is closest (minimum difference in dimension).
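To make that "closest dimension" rule concrete before the full script below, here is a tiny sketch of the which.min(abs(...)) step fun1New() relies on when one dummy==1 firm has several candidate matches. The numbers are made up, not from your data:

treat_dim    <- 1000                # dimension of a dummy==1 firm (made-up value)
control_dims <- c(400, 950, 1800)   # dimensions of its candidate dummy==0 matches (made-up)
# keep the candidate whose dimension is nearest to the dummy==1 firm
control_dims[which.min(abs(treat_dim - control_dims))]
#[1] 950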
final3New <- read.table(file="real_data_cecilia.txt", sep="\t", header=T)
final3New1 <- read.csv("real_data_cecilia_new.csv")

fun3 <- function(dat){
  if(any(duplicated(dat))){
    indx <- which(duplicated(dat))
    row.names(dat) <- 1:nrow(dat)
    dat1 <- subset(dat[indx,], dummy==1)
    dat0 <- subset(dat[indx,], dummy==0)
    indx1 <- as.numeric(row.names(dat1))
    indx11 <- sort(c(indx1, indx1+1))
    indx0 <- as.numeric(row.names(dat0))
    indx00 <- sort(c(indx0, indx0-1))
    indx10 <- sort(c(indx11, indx00))
    res <- dat[-indx10,]
    res
  } else {
    dat
  }
}

fun1New <- function(dat, percent, number){
  lst1 <- split(dat, list(dat$year, dat$industry))
  lst2 <- lst1[lapply(lst1, nrow) > 1]
  lst3 <- lapply(lst2, function(x) {
    CombN1 <- combn(seq_len(nrow(x)), 2)
    lapply(split(CombN1, col(CombN1)), function(y){
      x1 <- x[y,]
      x1[sum(x1$dummy)==1,]
    })
  })
  lst4 <- lapply(lst3, function(x) x[lapply(x, nrow) > 0])
  lst5 <- lst4[lapply(lst4, length) > 0]
  lst6 <- lapply(lst5, function(x){
    lapply(x, function(y){
      x1 <- abs(diff(y$dimension)) < number
      x2 <- y$dimension[2] + (y$dimension[2]*percent)
      x3 <- y$dimension[2] - (y$dimension[2]*percent)
      x4 <- y$dimension[1] + (y$dimension[1]*percent)
      x5 <- y$dimension[1] - (y$dimension[1]*percent)
      x6 <- (y$dimension[1] < x2) & (y$dimension[1] > x3)
      x7 <- (y$dimension[2] < x4) & (y$dimension[2] > x5)
      y[((x6 & x1) | (x7 & x1)),]
    })
  })
  lst7 <- lapply(lst6, function(x) x[lapply(x, nrow) > 0])
  lst8 <- lst7[lapply(lst7, length) > 0]
  lst9 <- lapply(lst8, function(x) do.call(rbind, x))
  lst10 <- lapply(lst9, function(x) {
    row.names(x) <- 1:nrow(x)
    x1 <- x[x$dummy==1,]
    do.call(rbind, lapply(split(x1, x1$dimension), function(y){
      indx1 <- sort(c(as.numeric(row.names(y)), as.numeric(row.names(y))+1))
      x2 <- x[indx1,]
      x3 <- subset(x2, dummy==0)
      x4 <- x3[which.min(abs(x2$dimension[1]-x3$dimension)),]
      rbind(x2[1,], x4)
    }))
  })
  res <- do.call(rbind, lapply(lst10, fun3))
  row.names(res) <- 1:nrow(res)
  res
}

####1st dataset
res10PercentHigh <- fun1New(final3New, 0.10, 500000000)
dim(res10PercentHigh)
#[1] 764 5
dim(unique(res10PercentHigh))
#[1] 764 5
nrow(subset(res10PercentHigh, dummy==0))
#[1] 382
nrow(subset(res10PercentHigh, dummy==1))
#[1] 382

res10PercentLow <- fun1New(final3New, 0.10, 50)
dim(res10PercentLow)
#[1] 294 5
dim(unique(res10PercentLow))
#[1] 294 5
nrow(subset(res10PercentLow, dummy==0))
#[1] 147
nrow(subset(res10PercentLow, dummy==1))
#[1] 147

res5PercentHigh <- fun1New(final3New, 0.05, 500000000)
dim(res5PercentHigh)
#[1] 630 5
dim(unique(res5PercentHigh))
#[1] 630 5
nrow(subset(res5PercentHigh, dummy==0))
#[1] 315
nrow(subset(res5PercentHigh, dummy==1))
#[1] 315

res5PercentLow <- fun1New(final3New, 0.05, 50)
dim(res5PercentLow)
#[1] 294 5
dim(unique(res5PercentLow))
#[1] 294 5
nrow(subset(res5PercentLow, dummy==0))
#[1] 147
nrow(subset(res5PercentLow, dummy==1))
#[1] 147

#######2nd dataset
res10PercentHigh <- fun1New(final3New1, 0.10, 500000000)
dim(res10PercentHigh)
#[1] 760 5
dim(unique(res10PercentHigh))
#[1] 760 5
nrow(subset(res10PercentHigh, dummy==0))
#[1] 380
nrow(subset(res10PercentHigh, dummy==1))
#[1] 380

res10PercentLow <- fun1New(final3New1, 0.10, 100)
dim(res10PercentLow)
#[1] 418 5
dim(unique(res10PercentLow))
#[1] 418 5
nrow(subset(res10PercentLow, dummy==0))
#[1] 209
nrow(subset(res10PercentLow, dummy==1))
#[1] 209

res5PercentHigh <- fun1New(final3New1, 0.05, 500000000)
dim(res5PercentHigh)
#[1] 640 5
dim(unique(res5PercentHigh))
#[1] 640 5
nrow(subset(res5PercentHigh, dummy==0))
#[1] 320
nrow(subset(res5PercentHigh, dummy==1))
#[1] 320

res5PercentLow <- fun1New(final3New1, 0.05, 50)
dim(res5PercentLow)
#[1] 310 5
dim(unique(res5PercentLow))
#[1] 310 5
nrow(subset(res5PercentLow, dummy==0))
#[1] 155
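If you want to see the percent/number thresholds acting on something small, here is a hedged toy check. The data frame is entirely made up, and the firm column is just a hypothetical stand-in for whatever identifier your fifth column holds; only the other column names (year, industry, dummy, dimension) are the ones fun1New() actually uses:

toy <- data.frame(firm      = 1:4,
                  year      = 2010,
                  industry  = "A",
                  dummy     = c(1, 0, 0, 1),
                  dimension = c(100, 104, 250, 500))
# with percent = 0.10 and number = 1000, only firm 2 lies within 10% of firm 1's
# dimension, so the call should return just that one matched pair, along the lines of:
fun1New(toy, 0.10, 1000)
#  firm year industry dummy dimension
#1    1 2010        A     1       100
#2    2 2010        A     0       104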
nrow(subset(res5PercentLow, dummy==1))
#[1] 155

res20PercentHigh <- fun1New(final3New1, 0.20, 500000000)
dim(res20PercentHigh)
#[1] 846 5
dim(unique(res20PercentHigh))
#[1] 846 5
nrow(subset(res20PercentHigh, dummy==0))
#[1] 423
nrow(subset(res20PercentHigh, dummy==1))
#[1] 423

A.K.

----- Original Message -----
From: Cecilia Carmo <cecilia.ca...@ua.pt>
To: arun <smartpink...@yahoo.com>
Cc:
Sent: Sunday, June 16, 2013 5:57 AM
Subject: RE: matched samples, dataframe, panel data

With the script and file I sent you, the old fun1() and fun2() gave 350 combinations, and it was possible to have more. Now, with the new fun1() and fun3(), I get fewer, so it is not OK, is it?

> res10Percent<- fun1New(final3New2,0.10,500000000)
> res10F3<- fun3(res10Percent)
> dim(res10F3)
[1] 600 5
> nrow(subset(res10F3,dummy==0))
[1] 300
> nrow(subset(res10F3,dummy==1))
[1] 300

Sorry for making you spend so much time. I thought it would be easier.

Cecília

________________________________________
From: arun [smartpink...@yahoo.com]
Sent: Friday, 14 June 2013 23:09
To: Cecilia Carmo
Subject: Re: matched samples, dataframe, panel data

One thing I forgot to mention: I used fun3() because I found fun2() still had some problems getting the correct dimensions. You can check the results of fun1() and fun3() and see whether all the combinations are captured. Then, if I get a chance, I will correct fun2().

""""" And you conclude that they are the same! """""""
Here, also, I am not concluding anything.

A.K.

----- Original Message -----
From: arun <smartpink...@yahoo.com>
To: Cecilia Carmo <cecilia.ca...@ua.pt>
Cc: R help <r-help@r-project.org>
Sent: Friday, June 14, 2013 6:05 PM
Subject: Re: matched samples, dataframe, panel data

Hi,

I changed fun1(). Now it should be possible to get all the possible combinations within each group.
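In case it helps to see the "all possible combinations within each group" part on its own, the pairing inside fun1New() rests on combn(); with a toy group of 4 firms (group size made up just for illustration) it enumerates all 6 possible pairs:

combn(seq_len(4), 2)
#     [,1] [,2] [,3] [,4] [,5] [,6]
#[1,]    1    1    1    2    2    3
#[2,]    2    3    4    3    4    4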
final3New <- read.table(file="real_data_cecilia.txt", sep="\t", header=T)
final3New1 <- read.csv("real_data_cecilia_new.csv")

fun1New <- function(dat, percent, number){
  lst1 <- split(dat, list(dat$year, dat$industry))
  lst2 <- lst1[lapply(lst1, nrow) > 1]
  lst3 <- lapply(lst2, function(x) {
    CombN1 <- combn(seq_len(nrow(x)), 2)
    lapply(split(CombN1, col(CombN1)), function(y){
      x1 <- x[y,]
      x1[sum(x1$dummy)==1,]
    })
  })
  lst4 <- lapply(lst3, function(x) x[lapply(x, nrow) > 0])
  lst5 <- lst4[lapply(lst4, length) > 0]
  lst6 <- lapply(lst5, function(x){
    lapply(x, function(y){
      x1 <- abs(diff(y$dimension)) < number
      x2 <- y$dimension[2] + (y$dimension[2]*percent)
      x3 <- y$dimension[2] - (y$dimension[2]*percent)
      x4 <- (y$dimension[1] < x2) & (y$dimension[1] > x3)
      y[x4 & x1,]
    })
  })
  lst7 <- lapply(lst6, function(x) x[lapply(x, nrow) > 0])
  lst8 <- lst7[lapply(lst7, length) > 0]
  res <- do.call(rbind, lapply(lst8, function(x){
    do.call(rbind, x)
  }))
  row.names(res) <- 1:nrow(res)
  res
}

##Applying fun1New
res5Percent <- fun1New(final3New, 0.05, 50)
dim(res5Percent)
#[1] 718 5
res5PercentHigh <- fun1New(final3New, 0.05, 500000)
dim(res5PercentHigh)
#[1] 2788 5
res5Percent1 <- fun1New(final3New1, 0.05, 50)
dim(res5Percent1)
#[1] 870 5
res5Percent1High <- fun1New(final3New1, 0.05, 500000)
dim(res5Percent1High)
#[1] 2902 5
res10Percent <- fun1New(final3New, 0.10, 200)
dim(res10Percent)
#[1] 2928 5
res10Percent1 <- fun1New(final3New1, 0.10, 200)
dim(res10Percent1)
#[1] 3092 5

fun3 <- function(dat){
  indx <- duplicated(dat)
  dat1 <- subset(dat[indx,], dummy==1)
  dat0 <- subset(dat[indx,], dummy==0)
  indx1 <- as.numeric(row.names(dat1))
  indx11 <- sort(c(indx1, indx1+1))
  indx0 <- as.numeric(row.names(dat0))
  indx00 <- sort(c(indx0, indx0-1))
  indx10 <- sort(c(indx11, indx00))
  res <- dat[-indx10,]
  res
}

#Applying fun3()
res5F3 <- fun3(res5Percent)
dim(res5F3)
#[1] 278 5
res5F3High <- fun3(res5PercentHigh)
dim(res5F3High)
#[1] 546 5
res5F3_1 <- fun3(res5Percent1)
dim(res5F3_1)
#[1] 302 5
res5F3High_1 <- fun3(res5Percent1High)
dim(res5F3High_1)
#[1] 570 5
res10F3 <- fun3(res10Percent)
dim(res10F3)
#[1] 462 5
res10F3_1 <- fun3(res10Percent1)
dim(res10F3_1)
#[1] 474 5

nrow(subset(res5F3, dummy==0))
#[1] 139
nrow(subset(res5F3, dummy==1))
#[1] 139
nrow(subset(res5F3High, dummy==1))
#[1] 273
nrow(subset(res5F3High, dummy==0))
#[1] 273
nrow(subset(res10F3, dummy==0))
#[1] 231
nrow(subset(res10F3, dummy==1))
#[1] 231
nrow(subset(res10F3_1, dummy==1))
#[1] 237
nrow(subset(res10F3_1, dummy==0))
#[1] 237
dim(unique(res5F3))
#[1] 278 5
dim(unique(res5F3High))
#[1] 546 5
dim(unique(res10F3_1))
#[1] 474 5
dim(unique(res10F3))
#[1] 462 5

A.K.

________________________________
From: Cecilia Carmo <cecilia.ca...@ua.pt>
To: arun <smartpink...@yahoo.com>
Sent: Friday, June 14, 2013 10:44 AM
Subject: me again

There are some matches missing; that is, it is possible to have more matches. I'm sending you a script and the data.
Thank you.

Cecília

______________________________________________
R-help@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-help
PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
and provide commented, minimal, self-contained, reproducible code.