Thanks, AK. The three codes worked as expected. Again, thanks so much for understanding my problem and proving the right solutions. Atem.
On Saturday, November 9, 2013 6:27 PM, arun <smartpink...@yahoo.com> wrote: HI, The code could be shortened by using ?merge or ?join(). library(plyr) ##Using the output from `lst6` lst7 <- lapply(lst6,function(x) {x1 <- data.frame(Year=rep(1961:2005,each=12),Mo=rep(1:12,45)); x2 <-join(x1,x,type="left",by=c("Year","Mo"))}) ##rest are the same (only change in object names) sapply(lst7,nrow) lst8 <-lapply(lst7,function(x) data.frame(col1=unlist(data.frame(t(x)[-c(1:2),]),use.names=FALSE))) lst9<- lapply(seq_along(lst8),function(i){ x<- lst11[[i]] colnames(x)<- lstf1[i] row.names(x)<- 1:nrow(x) x }) sapply(lst9,nrow) res2New <- do.call(cbind,lst9) dim(res2New) #[1] 16740 98 res2New[res2New ==-9999.9]<-NA # change missing value identifier as in your data set which(res2New==-9999.9) #integer(0) dates1<-seq.Date(as.Date('1Jan1961',format="%d%b%Y"),as.Date('31Dec2005',format="%d%b%Y"),by="day") dates2<- as.character(dates1) sldat<- split(dates2,list(gsub("-.*","",dates2))) lst12<-lapply(sldat,function(x) lapply(split(x,gsub(".*-(.*)-.*","\\1",x)), function(y){x1<-as.numeric(gsub(".*-.*-(.*)","\\1",y));if((31-max(x1))>0) {x2<-seq(max(x1)+1,31,1);x3<-paste0(unique(gsub("(.*-.*-).*","\\1",y)),x2);c(y,x3)} else y} )) any(sapply(lst12,function(x) any(lapply(x,length)!=31))) #[1] FALSE lst22<-lapply(lst12,function(x) unlist(x,use.names=FALSE)) sapply(lst22,length) dates3<-unlist(lst22,use.names=FALSE) length(dates3) res3New <- data.frame(dates=dates3,res2New,stringsAsFactors=FALSE) str(res3New) res3New$dates<-as.Date(res3New$dates) res4New <- res3New[!is.na(res3New$dates),] res4New[1:3,1:3] dim(res4New) colnames(res4) <- colnames(res4New) identical(res4,res4New) #[1] TRUE A.K. On Saturday, November 9, 2013 5:46 PM, arun <smartpink...@yahoo.com> wrote: Hi, Try: library(stringr) ##### Created the selected files (98) in a separate working folder (SubsetFiles1) (refer to my previous mail) filelst <- list.files() #Sublst <- filelst[1:2] res <- lapply(filelst,function(x) {con <- file(x) Lines1 <- readLines(con) close(con) Lines2 <- Lines1[-1] Lines3 <- str_split(Lines2,"-9999.9M") Lines4 <- str_trim(unlist(lapply(Lines3,function(x) {x[x==""] <- NA paste(x,collapse=" ")}))) Lines5 <- gsub("(\\d+)[A-Za-z]","\\1",Lines4) res1 <- read.table(text=Lines5,sep="",header=FALSE,fill=TRUE) res1}) ##Created another folder "Modified" to store the "res" files lapply(seq_along(res),function(i) write.table(res[[i]],paste("/home/arunksa111/Zl/Modified",paste0("Mod_",filelst[i]),sep="/"),row.names=FALSE,quote=FALSE)) lstf1 <- list.files(path="/home/arunksa111/Zl/Modified") lst1 <- lapply(lstf1,function(x) readLines(paste("/home/arunksa111/Zl/Modified",x,sep="/"))) which(lapply(lst1,function(x) length(grep("\\d+-9999.9",x)))>0 ) #[1] 7 11 14 15 30 32 39 40 42 45 46 53 60 65 66 68 69 70 73 74 75 78 80 82 83 #[26] 86 87 90 91 93 lst2 <- lapply(lst1,function(x) gsub("(\\d+)(-9999.9)","\\1 \\2",x)) #lapply(lst2,function(x) x[grep("\\d+-9999.9",x)]) ##checking for the pattern lst3 <- lapply(lst2,function(x) {x<-gsub("(-9999.9)(-9999.9)","\\1 \\2",x)})# #lapply(lst3,function(x) x[grep("\\d+-9999.9",x)]) ##checking for the pattern # lapply(lst3,function(x) x[grep("-9999.9",x)]) ###second check lst4 <- lapply(lst3,function(x) gsub("(Day) (\\d+)","\\1_\\2", x[-1])) #removed the additional header "V1", "V2", etc. #sapply(lst4,function(x) length(strsplit(x[1]," ")[[1]])) #checking the number of columns that should be present lst5 <- lapply(lst4,function(x) unlist(lapply(x, function(y) word(y,1,33)))) lst6 <- lapply(lst5,function(x) read.table(text=x,header=TRUE,stringsAsFactors=FALSE,sep="",fill=TRUE)) # head(lst6[[94]],3) lst7 <- lapply(lst6,function(x) x[x$Year >=1961 & x$Year <=2005,]) #head(lst7[[45]],3) lst8 <- lapply(lst7,function(x) x[!is.na(x$Year),]) lst9 <- lapply(lst8,function(x) { if((min(x$Year)>1961)|(max(x$Year)<2005)){ n1<- (min(x$Year)-1961)*12 x1<- as.data.frame(matrix(NA,ncol=ncol(x),nrow=n1)) n2<- (2005-max(x$Year))*12 x2<- as.data.frame(matrix(NA,ncol=ncol(x),nrow=n2)) colnames(x1) <- colnames(x) colnames(x2) <- colnames(x) x3<- rbind(x1,x,x2) } else if((min(x$Year)==1961) & (max(x$Year)==2005)) { if((min(x$Mo[x$Year==1961])>1)|(max(x$Mo[x$Year==2005])<12)){ n1 <- min(x$Mo[x$Year==1961])-1 x1 <- as.data.frame(matrix(NA,ncol=ncol(x),nrow=n1)) n2 <- (12-max(x$Mo[x$Year==2005])) x2 <- as.data.frame(matrix(NA,ncol=ncol(x),nrow=n2)) colnames(x1) <- colnames(x) colnames(x2) <- colnames(x) x3 <- rbind(x1,x,x2) } else { x } } }) which(sapply(lst9,nrow)!=540) #[1] 45 46 54 64 65 66 70 75 97 lst10 <- lapply(lst9,function(x) {x1 <- x[!is.na(x$Year),] hx1 <- head(x1,1) tx1 <- tail(x1,1) x2 <- as.data.frame(matrix(NA, ncol=ncol(x), nrow=hx1$Mo-1)) x3 <- as.data.frame(matrix(NA,ncol=ncol(x),nrow=12-tx1$Mo)) colnames(x2) <- colnames(x) colnames(x3) <- colnames(x) if(nrow(x) < 540) rbind(x2,x,x3) else x }) which(sapply(lst10,nrow)!=540) #integer(0) lst11 <-lapply(lst10,function(x) data.frame(col1=unlist(data.frame(t(x)[-c(1:2),]),use.names=FALSE))) lst12<- lapply(seq_along(lst10),function(i){ x<- lst11[[i]] colnames(x)<- lstf1[i] row.names(x)<- 1:nrow(x) x }) res2 <- do.call(cbind,lst11) dim(res2) #[1] 16740 98 res2[res2==-9999.9]<-NA # change missing value identifier as in your data set which(res2==-9999.9) #integer(0) dates1<-seq.Date(as.Date('1Jan1961',format="%d%b%Y"),as.Date('31Dec2005',format="%d%b%Y"),by="day") dates2<- as.character(dates1) sldat<- split(dates2,list(gsub("-.*","",dates2))) lst12<-lapply(sldat,function(x) lapply(split(x,gsub(".*-(.*)-.*","\\1",x)), function(y){x1<-as.numeric(gsub(".*-.*-(.*)","\\1",y));if((31-max(x1))>0) {x2<-seq(max(x1)+1,31,1);x3<-paste0(unique(gsub("(.*-.*-).*","\\1",y)),x2);c(y,x3)} else y} )) any(sapply(lst12,function(x) any(lapply(x,length)!=31))) #[1] FALSE lst22<-lapply(lst12,function(x) unlist(x,use.names=FALSE)) sapply(lst22,length) dates3<-unlist(lst22,use.names=FALSE) length(dates3) res3 <- data.frame(dates=dates3,res2,stringsAsFactors=FALSE) str(res3) res3$dates<-as.Date(res3$dates) res4 <- res3[!is.na(res3$dates),] res4[1:3,1:3] dim(res4) #[1] 16436 99 A.K. On Friday, November 8, 2013 5:54 PM, Zilefac Elvis <zilefacel...@yahoo.com> wrote: Hi Ak, I think I figured out how to do the sub-setting. All I needed was to use column 3 in Temperature_inventory and select matching .txt files in the .zip file. The final result would be a subset of files whose IDs are in column 3 of temp_inventory. ************************************************************************* I also have this script which you developed for managing precipitation files. Now I want to use the same code for the temperature files I sent to you. I tried doing it with some errors. Please try these scripts on my temperature data. If you need further information let me know. Note here that -9999.99M is -9999.9M in the temperature files. library(stringr)# load it res<-lapply(temp,function(x) {con <- file(x); Lines1<- readLines(con); close(con); Lines2<-Lines1[-1];# myfiles contain headers in row 2, so I removed the headers Lines3<- str_split(Lines2,"-9999.99M"); Lines4<- str_trim(unlist(lapply(Lines3,function(x){x[x==""]<-NA;#replace missing identifier with NA paste(x,collapse=" ")}))); Lines5<- gsub("(\\d+)[A-Za-z]","\\1",Lines4); res<- read.table(text=Lines5,sep="",header=FALSE,fill=TRUE)}) lapply(res,head,2)# take a look at first two rows of res. lapply(seq_along(res),function(i) write.table(res[[i]],paste0(gsub(".txt","",temp[i]),".txt"),row.names=FALSE,quote=FALSE)) #******************************************************************************************************** # Then use the following as a continuation from the one above lstf1<- list.files(pattern=".txt") length(lstf1) fun2<- function(lstf){ lst1<-lapply(lstf,function(x) readLines(x)) lst2<-lapply(lst1,function(x) {gsub("(\\d+)(-9999.99)","\\1 \\2",x)})#change missing value identifier as in your data set lst3<-lapply(lst2,function(x) {x<-gsub("(\\d+)(-9999.99)","\\1 \\2",x)})#change missing value identifier as in your data set lst4<- lapply(lst3,function(x) read.table(text=x,header=TRUE,stringsAsFactors=FALSE,sep="",fill=TRUE)) lst5<- lapply(lst4,function(x) x[x$V1>=1961 & x$V1<=2005,]) lst6<- lapply(lst5,function(x) x[!is.na(x$V1),]) lst7<- lapply(lst6,function(x) { if((min(x$V1)>1961)|(max(x$V1)<2005)){ n1<- (min(x$V1)-1961)*12 x1<- as.data.frame(matrix(NA,ncol=ncol(x),nrow=n1)) n2<- (2005-max(x$V1))*12 x2<- as.data.frame(matrix(NA,ncol=ncol(x),nrow=n2)) x3<- rbind(x1,x,x2) } else { x } }) lst8<-lapply(lst7,function(x) data.frame(col1=unlist(data.frame(t(x)[-c(1:2),]),use.names=FALSE))) ####changed lst9<- lapply(seq_along(lst8),function(i){ x<- lst8[[i]] colnames(x)<- lstf1[i] row.names(x)<- 1:nrow(x) x }) do.call(cbind,lst9)} res<-fun2(lstf1) dim(res) res[res==-9999.99]<-NA # change missing value identifier as in your data set which(res==-9999.99)#change missing value identifier as in your data set dates1<-seq.Date(as.Date('1Jan1961',format="%d%b%Y"),as.Date('31Dec2005',format="%d%b%Y"),by="day") dates2<- as.character(dates1) sldat<- split(dates2,list(gsub("-.*","",dates2))) lst11<-lapply(sldat,function(x) lapply(split(x,gsub(".*-(.*)-.*","\\1",x)), function(y){x1<-as.numeric(gsub(".*-.*-(.*)","\\1",y));if((31-max(x1))>0) {x2<-seq(max(x1)+1,31,1);x3<-paste0(unique(gsub("(.*-.*-).*","\\1",y)),x2);c(y,x3)} else y} )) any(sapply(lst1,function(x) any(lapply(x,length)!=31))) lst22<-lapply(lst11,function(x) unlist(x,use.names=FALSE)) sapply(lst22,length) dates3<-unlist(lst22,use.names=FALSE) length(dates3) res1<- data.frame(dates=dates3,res,stringsAsFactors=FALSE) str(res1) res1$dates<-as.Date(res1$dates) res2<-res1[!is.na(res1$dates),] res2[1:3,1:3] dim(res2) write.csv(res2, file = "TemperatureAllstations.csv")# #*********************************************************************************** Waiting for your useful input. Thanks so much, Atem. Hi Atem, It is not clear what you wanted to do. If you want to transfer the subset of files from the main folder to a new location, then you may try: (make sure you create a copy of the original .txt folder before doing this) I created three sub folders and two files (BTemperature_Stations.txt and Tempearture inventory.csv) in my working directory. list.files() #[1] "BTemperature_Stations.txt" "Files1" ## Files1 folder contains all the .txt files; #SubsetFiles: created to subset the files that match the condition #[3] "FilesCopy" "SubsetFiles1" #FilesCopy. A copy of the Files1 folder #[5] "Tempearture inventory.csv" list.files(pattern="\\.") #[1] "BTemperature_Stations.txt" "Tempearture inventory.csv" fl1 <- list.files(pattern="\\.") dat1 <- read.table(fl1[1],header=TRUE,sep="",stringsAsFactors=FALSE,fill=TRUE,check.names=FALSE) dat2 <- read.csv(fl1[2],header=TRUE,sep=",",stringsAsFactors=FALSE,check.names=FALSE) vec1 <- dat1[,3][dat1[,3]%in% dat2[,3]] vec2 <- list.files(path="/home/arunksa111/Zl/Files1",recursive=TRUE) sum(gsub(".txt","",vec2) %in% vec1) #[1] 98 vec3 <- vec2[gsub(".txt","",vec2) %in% vec1] lapply(vec3, function(x) file.rename(paste("/home/arunksa111/Zl/Files1",x,sep="/"), paste("/home/arunksa111/Zl/SubsetFiles1",x,sep="/"))) #change the path accordingly. length(list.files(path="/home/arunksa111/Zl/SubsetFiles1")) #[1] 98 fileDim <- sapply(vec3,function(x) {x1 <-read.delim(paste("/home/arunksa111/Zl/SubsetFiles1",x,sep="/"),header=TRUE,stringsAsFactors=FALSE,sep=",",check.names=FALSE); dim(x1)}) fileDim[,1:3] # dn3011120.txt dn3011240.txt dn3011887.txt #[1,] 1151 791 1054 #[2,] 7 7 7 A.K. wrote: Hi AK, I want to select some files from a list of files. All are text files. The index for selection is found in column 3 of both files. Attached are my data files. Btemperature_Stations is my main file. Temperature inventory is my 'wanted' file and is a subset of Btemperature_Stations. Using column 3 in both files, select the files in Temperature inventory from Btemperature_Stations. The .zip file contains the .txt files which you will extract to a folder and do the selection in R. Thanks, Atem. [[alternative HTML version deleted]]
______________________________________________ R-help@r-project.org mailing list https://stat.ethz.ch/mailman/listinfo/r-help PLEASE do read the posting guide http://www.R-project.org/posting-guide.html and provide commented, minimal, self-contained, reproducible code.