Re: [R] select .txt from .txt in a directory

Zilefac Elvis Mon, 11 Nov 2013 13:01:44 -0800

Thanks, AK.
The three codes worked as expected.
Again, thanks so much for understanding my problem and proving the right 
solutions.
Atem.




On Saturday, November 9, 2013 6:27 PM, arun <smartpink...@yahoo.com> wrote:
 
HI,

The code could be shortened by using ?merge or ?join().
library(plyr)
##Using the output from `lst6`


lst7 <- lapply(lst6,function(x) {x1 <- 
data.frame(Year=rep(1961:2005,each=12),Mo=rep(1:12,45)); x2 
<-join(x1,x,type="left",by=c("Year","Mo"))})

##rest are the same (only change in object names)

 sapply(lst7,nrow)
 lst8 <-lapply(lst7,function(x) 
data.frame(col1=unlist(data.frame(t(x)[-c(1:2),]),use.names=FALSE))) 
  lst9<- lapply(seq_along(lst8),function(i){
    x<- lst11[[i]]
    colnames(x)<- lstf1[i]
    row.names(x)<- 1:nrow(x)
    x
  }) 
sapply(lst9,nrow)
res2New <- do.call(cbind,lst9)
 dim(res2New)
#[1] 16740    98
res2New[res2New ==-9999.9]<-NA # change missing value identifier as in your 
data set
which(res2New==-9999.9)
#integer(0)

dates1<-seq.Date(as.Date('1Jan1961',format="%d%b%Y"),as.Date('31Dec2005',format="%d%b%Y"),by="day")
dates2<- as.character(dates1)
sldat<- split(dates2,list(gsub("-.*","",dates2)))
lst12<-lapply(sldat,function(x) lapply(split(x,gsub(".*-(.*)-.*","\\1",x)), 
function(y){x1<-as.numeric(gsub(".*-.*-(.*)","\\1",y));if((31-max(x1))>0) 
{x2<-seq(max(x1)+1,31,1);x3<-paste0(unique(gsub("(.*-.*-).*","\\1",y)),x2);c(y,x3)}
 else y} ))
any(sapply(lst12,function(x) any(lapply(x,length)!=31)))
#[1] FALSE

lst22<-lapply(lst12,function(x) unlist(x,use.names=FALSE))
sapply(lst22,length)
dates3<-unlist(lst22,use.names=FALSE)
length(dates3)
res3New <- data.frame(dates=dates3,res2New,stringsAsFactors=FALSE)
str(res3New)
res3New$dates<-as.Date(res3New$dates)
res4New <- res3New[!is.na(res3New$dates),]
res4New[1:3,1:3]
dim(res4New)
colnames(res4) <- colnames(res4New)
 identical(res4,res4New)
#[1] TRUE

A.K.






On Saturday, November 9, 2013 5:46 PM, arun <smartpink...@yahoo.com> wrote:


Hi,
Try:
library(stringr)
##### Created the selected files (98) in a separate working  folder 
(SubsetFiles1) (refer to my previous mail)
filelst <- list.files()
#Sublst <- filelst[1:2]
res <- lapply(filelst,function(x) {con <- file(x)
     Lines1 <- readLines(con) close(con)
     Lines2 <- Lines1[-1]
     Lines3 <- str_split(Lines2,"-9999.9M")
     Lines4 <- str_trim(unlist(lapply(Lines3,function(x) {x[x==""] <- NA
     paste(x,collapse=" ")})))
     Lines5 <- gsub("(\\d+)[A-Za-z]","\\1",Lines4)
     res1 <- read.table(text=Lines5,sep="",header=FALSE,fill=TRUE)
     res1})

##Created another folder "Modified" to store the "res" files
lapply(seq_along(res),function(i) 
write.table(res[[i]],paste("/home/arunksa111/Zl/Modified",paste0("Mod_",filelst[i]),sep="/"),row.names=FALSE,quote=FALSE))

 lstf1 <- list.files(path="/home/arunksa111/Zl/Modified")  

lst1 <- lapply(lstf1,function(x) 
readLines(paste("/home/arunksa111/Zl/Modified",x,sep="/")))
 which(lapply(lst1,function(x) length(grep("\\d+-9999.9",x)))>0 )
 #[1]  7 11 14 15 30 32 39 40 42 45 46 53 60 65 66 68 69 70 73 74 75 78 80 82 83
#[26] 86 87 90 91 93

lst2 <- lapply(lst1,function(x) gsub("(\\d+)(-9999.9)","\\1 \\2",x))
 #lapply(lst2,function(x) x[grep("\\d+-9999.9",x)]) ##checking for the pattern

lst3 <- lapply(lst2,function(x) {x<-gsub("(-9999.9)(-9999.9)","\\1 \\2",x)})#
#lapply(lst3,function(x) x[grep("\\d+-9999.9",x)])  ##checking for the pattern
# lapply(lst3,function(x) x[grep("-9999.9",x)]) ###second check
lst4 <- lapply(lst3,function(x) gsub("(Day) (\\d+)","\\1_\\2", x[-1]))  
#removed the additional header "V1", "V2", etc.

#sapply(lst4,function(x) length(strsplit(x[1]," ")[[1]])) #checking the number 
of columns that should be present
lst5 <- lapply(lst4,function(x) unlist(lapply(x, function(y) word(y,1,33))))
lst6 <- lapply(lst5,function(x) 
read.table(text=x,header=TRUE,stringsAsFactors=FALSE,sep="",fill=TRUE))
# head(lst6[[94]],3)
lst7 <- lapply(lst6,function(x) x[x$Year >=1961 & x$Year <=2005,])
#head(lst7[[45]],3)
 lst8 <- lapply(lst7,function(x) x[!is.na(x$Year),])


lst9 <- lapply(lst8,function(x) {
    if((min(x$Year)>1961)|(max(x$Year)<2005)){
      n1<- (min(x$Year)-1961)*12
      x1<- as.data.frame(matrix(NA,ncol=ncol(x),nrow=n1))
      n2<- (2005-max(x$Year))*12
      x2<- as.data.frame(matrix(NA,ncol=ncol(x),nrow=n2))
       colnames(x1) <- colnames(x)
       colnames(x2) <- colnames(x)        
      x3<- rbind(x1,x,x2)
    }
   else if((min(x$Year)==1961) & (max(x$Year)==2005)) {
          if((min(x$Mo[x$Year==1961])>1)|(max(x$Mo[x$Year==2005])<12)){
       n1 <- min(x$Mo[x$Year==1961])-1
       x1 <- as.data.frame(matrix(NA,ncol=ncol(x),nrow=n1))
       n2 <- (12-max(x$Mo[x$Year==2005]))          
       x2 <- as.data.frame(matrix(NA,ncol=ncol(x),nrow=n2))
       colnames(x1) <- colnames(x)
       colnames(x2) <- colnames(x)
       x3 <- rbind(x1,x,x2)
      }
        else {    
        x
    }
     
    } })

which(sapply(lst9,nrow)!=540)
#[1] 45 46 54 64 65 66 70 75 97
lst10 <- lapply(lst9,function(x) {x1 <- x[!is.na(x$Year),]
             hx1 <- head(x1,1)
             tx1 <- tail(x1,1)
             x2 <- as.data.frame(matrix(NA, ncol=ncol(x), nrow=hx1$Mo-1))
             x3 <- as.data.frame(matrix(NA,ncol=ncol(x),nrow=12-tx1$Mo))
             colnames(x2) <- colnames(x)
             colnames(x3) <- colnames(x)
             if(nrow(x) < 540) rbind(x2,x,x3) else x  })
which(sapply(lst10,nrow)!=540)
#integer(0)



lst11 <-lapply(lst10,function(x) 
data.frame(col1=unlist(data.frame(t(x)[-c(1:2),]),use.names=FALSE))) 
  lst12<- lapply(seq_along(lst10),function(i){
    x<- lst11[[i]]
    colnames(x)<- lstf1[i]
    row.names(x)<- 1:nrow(x)
    x
  })
res2 <-  do.call(cbind,lst11)
 dim(res2)
#[1] 16740    98
 
res2[res2==-9999.9]<-NA # change missing value identifier as in your data set
which(res2==-9999.9)
#integer(0)

dates1<-seq.Date(as.Date('1Jan1961',format="%d%b%Y"),as.Date('31Dec2005',format="%d%b%Y"),by="day")
dates2<- as.character(dates1)
sldat<- split(dates2,list(gsub("-.*","",dates2)))
lst12<-lapply(sldat,function(x) lapply(split(x,gsub(".*-(.*)-.*","\\1",x)), 
function(y){x1<-as.numeric(gsub(".*-.*-(.*)","\\1",y));if((31-max(x1))>0) 
{x2<-seq(max(x1)+1,31,1);x3<-paste0(unique(gsub("(.*-.*-).*","\\1",y)),x2);c(y,x3)}
 else y} ))
any(sapply(lst12,function(x) any(lapply(x,length)!=31)))
#[1] FALSE

lst22<-lapply(lst12,function(x) unlist(x,use.names=FALSE))
sapply(lst22,length)
dates3<-unlist(lst22,use.names=FALSE)
length(dates3)
res3 <- data.frame(dates=dates3,res2,stringsAsFactors=FALSE)
str(res3)
res3$dates<-as.Date(res3$dates)
res4 <- res3[!is.na(res3$dates),]
res4[1:3,1:3]
dim(res4)
 #[1] 16436    99


A.K.





On Friday, November 8, 2013 5:54 PM, Zilefac Elvis <zilefacel...@yahoo.com> 
wrote:

Hi Ak,

I think I figured out how to do the sub-setting. All I needed was to use column 
3 in Temperature_inventory and select matching .txt files in the .zip file. The 
final result would be a subset of files whose IDs are in column 3 of 
temp_inventory.
*************************************************************************
I also have this script which you developed for managing precipitation files. 
Now I want to use the same code for the temperature files I sent to you. I 
tried doing it with some errors.
Please try these scripts on my temperature data. If you need further 
information let me know.
Note here that -9999.99M is -9999.9M in the temperature files.

library(stringr)# load it
res<-lapply(temp,function(x) {con <- file(x);
                                 Lines1<- readLines(con);
                                 close(con);
                                 Lines2<-Lines1[-1];# myfiles contain headers 
in row 2, so I removed the headers
                                 Lines3<- str_split(Lines2,"-9999.99M");
                                 Lines4<- 
str_trim(unlist(lapply(Lines3,function(x){x[x==""]<-NA;#replace missing 
identifier with NA
                                                                                
    paste(x,collapse=" ")})));
                                 Lines5<- gsub("(\\d+)[A-Za-z]","\\1",Lines4);
                                 res<- 
read.table(text=Lines5,sep="",header=FALSE,fill=TRUE)})
lapply(res,head,2)# take a look at first two rows of res.
lapply(seq_along(res),function(i) 
write.table(res[[i]],paste0(gsub(".txt","",temp[i]),".txt"),row.names=FALSE,quote=FALSE))
#********************************************************************************************************
# Then use the following as a continuation from the one above

lstf1<- list.files(pattern=".txt")
length(lstf1)
fun2<- function(lstf){
  lst1<-lapply(lstf,function(x) readLines(x))
  lst2<-lapply(lst1,function(x) {gsub("(\\d+)(-9999.99)","\\1 \\2",x)})#change 
missing value identifier as in your data set
  lst3<-lapply(lst2,function(x) {x<-gsub("(\\d+)(-9999.99)","\\1 
\\2",x)})#change missing value identifier as in your data set
  lst4<- lapply(lst3,function(x) 
read.table(text=x,header=TRUE,stringsAsFactors=FALSE,sep="",fill=TRUE))
  lst5<- lapply(lst4,function(x) x[x$V1>=1961 & x$V1<=2005,])
  lst6<- lapply(lst5,function(x) x[!is.na(x$V1),])
  lst7<- lapply(lst6,function(x) {
    if((min(x$V1)>1961)|(max(x$V1)<2005)){
      n1<- (min(x$V1)-1961)*12
      x1<- as.data.frame(matrix(NA,ncol=ncol(x),nrow=n1))
      n2<- (2005-max(x$V1))*12
      x2<- as.data.frame(matrix(NA,ncol=ncol(x),nrow=n2))
      x3<- rbind(x1,x,x2)
    }
    else {
      x
    } })
  lst8<-lapply(lst7,function(x) 
data.frame(col1=unlist(data.frame(t(x)[-c(1:2),]),use.names=FALSE))) ####changed
  lst9<- lapply(seq_along(lst8),function(i){
    x<- lst8[[i]]
    colnames(x)<- lstf1[i]
    row.names(x)<- 1:nrow(x)
    x
  })
  do.call(cbind,lst9)}
res<-fun2(lstf1)
dim(res)
res[res==-9999.99]<-NA # change missing value identifier as in your data set
which(res==-9999.99)#change missing value identifier as in your data set
dates1<-seq.Date(as.Date('1Jan1961',format="%d%b%Y"),as.Date('31Dec2005',format="%d%b%Y"),by="day")
dates2<- as.character(dates1)
sldat<- split(dates2,list(gsub("-.*","",dates2)))
lst11<-lapply(sldat,function(x) lapply(split(x,gsub(".*-(.*)-.*","\\1",x)), 
function(y){x1<-as.numeric(gsub(".*-.*-(.*)","\\1",y));if((31-max(x1))>0) 
{x2<-seq(max(x1)+1,31,1);x3<-paste0(unique(gsub("(.*-.*-).*","\\1",y)),x2);c(y,x3)}
 else y} ))
any(sapply(lst1,function(x) any(lapply(x,length)!=31)))
lst22<-lapply(lst11,function(x) unlist(x,use.names=FALSE))
sapply(lst22,length)
dates3<-unlist(lst22,use.names=FALSE)
length(dates3)
res1<- data.frame(dates=dates3,res,stringsAsFactors=FALSE)
str(res1)
res1$dates<-as.Date(res1$dates)
res2<-res1[!is.na(res1$dates),]
res2[1:3,1:3]
dim(res2)
write.csv(res2, file = "TemperatureAllstations.csv")#
#***********************************************************************************

Waiting for your useful input.

Thanks so much,
Atem.








Hi Atem,

It is not clear what you wanted to do.  If you want to transfer the subset of 
files from the main folder to a new location, then you may try: (make sure you 
create a copy of the original .txt folder before doing this)
I created three sub folders and two files (BTemperature_Stations.txt and 
Tempearture inventory.csv) in my working directory.


list.files()
#[1] "BTemperature_Stations.txt" "Files1"          ## Files1 folder contains 
all the .txt files; #SubsetFiles: created to subset the files that match the 
condition                 
#[3]
"FilesCopy"                 "SubsetFiles1"          #FilesCopy. A copy of the 
Files1 folder   
#[5] "Tempearture inventory.csv"




list.files(pattern="\\.")
#[1] "BTemperature_Stations.txt" "Tempearture inventory.csv"
fl1 <- list.files(pattern="\\.")
 dat1 <- 
read.table(fl1[1],header=TRUE,sep="",stringsAsFactors=FALSE,fill=TRUE,check.names=FALSE)
 dat2 <- 
read.csv(fl1[2],header=TRUE,sep=",",stringsAsFactors=FALSE,check.names=FALSE)
vec1 <- dat1[,3][dat1[,3]%in% dat2[,3]]
vec2 <- list.files(path="/home/arunksa111/Zl/Files1",recursive=TRUE)
 sum(gsub(".txt","",vec2) %in% vec1)
#[1] 98
vec3 <-  vec2[gsub(".txt","",vec2) %in% vec1]
lapply(vec3, function(x) 
file.rename(paste("/home/arunksa111/Zl/Files1",x,sep="/"), 
paste("/home/arunksa111/Zl/SubsetFiles1",x,sep="/"))) #change the path 
accordingly. 
length(list.files(path="/home/arunksa111/Zl/SubsetFiles1"))
#[1] 98

fileDim <- sapply(vec3,function(x) {x1 
<-read.delim(paste("/home/arunksa111/Zl/SubsetFiles1",x,sep="/"),header=TRUE,stringsAsFactors=FALSE,sep=",",check.names=FALSE);
 dim(x1)})
fileDim[,1:3]
#     dn3011120.txt dn3011240.txt dn3011887.txt
#[1,]          1151           791          1054
#[2,]             7             7             7


A.K.







 wrote:

Hi AK,


I want to select some files from a list of files. All are text files. The index 
for selection is found in column 3 of both files.


Attached are my data files.
Btemperature_Stations is my
main file.
Temperature inventory is my 'wanted' file and is a subset of 
Btemperature_Stations.
Using column 3 in both files, select the files in Temperature inventory from 
Btemperature_Stations.
The .zip file contains the .txt files which you will extract to a folder and do 
the selection in R.

Thanks,
Atem.
        [[alternative HTML version deleted]]

______________________________________________
R-help@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-help
PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
and provide commented, minimal, self-contained, reproducible code.

Re: [R] select .txt from .txt in a directory

Reply via email to