Re: [R] Quantile and rowMean from multiple files in a folder

arun Mon, 14 Apr 2014 22:54:08 -0700

Hi Atem,

I guess this is what you wanted.


###Q1: 
###
###working directory: Observed
 #Only one file per Site.  Assuming this is the case for the full dataset, then 
I guess there is no need to average

dir.create("final")
lst1 <- split(list.files(pattern = ".csv"), gsub("\\_.*", "", 
list.files(pattern = ".csv")))

lst2 <-  lapply(lst1,function(x1) lapply(x1, function(x2) {lines1 <- 
readLines(x2); header1 <- lines1[1:2]; dat1 <- 
read.table(text=lines1,header=FALSE,sep=",",stringsAsFactors=FALSE, skip=2); 
colnames(dat1) <- 
Reduce(paste,strsplit(header1,","));dat1[-c(nrow(dat1),nrow(dat1)-1),]}))


#different number of rows
 sapply(seq_along(lst2),function(i){lstN <- lapply(lst2[[i]],function(x) 
x[,-1]);sapply(lstN,function(x) nrow(x))})
 #[1] 9 9 9 8 2 9

#difference in number of columns
sapply(seq_along(lst2),function(i) {sapply(lst2[[i]],function(x) ncol(x))})
 #[1] 157 258 258  98 157 258

library(plyr)
library(stringr)

lst3 <- setNames(lapply(seq_along(lst2),function(i) 
{lapply(lst2[[i]],function(x) {names(x)[-1] <- paste(names(x)[-1], 
names(lst1)[i],sep="_"); names(x) <- str_trim(names(x)); x})[[1]]}), 
names(lst1)) 

df1 <- join_all(lst3,by="Year")
dim(df1)
 #[1]    9 1181 


sapply(split(names(df1)[-1] ,gsub(".*\\_","",names(df1)[-1])),function(x) {df2 
<- df1[,x];df3 <- data.frame(Percentiles=paste0(seq(0,100, by=1) ,"%"), 
numcolwise(function(y) 
quantile(y,seq(0,1,by=0.01),na.rm=TRUE))(df2),stringsAsFactors=FALSE);ncol(df3) 
})
 #G100 G101 G102 G103 G104 G105 
# 157  258  258   98  157  258 

lst4 <- split(names(df1)[-1] ,gsub(".*\\_","",names(df1)[-1]))

lapply(seq_along(lst4),function(i) {df2 <- df1[,lst4[[i]]]; df3 <- 
data.frame(Percentiles=paste0(seq(0,100, by=1) ,"%"), numcolwise(function(y) 
quantile(y,seq(0,1,by=0.01),na.rm=TRUE))(df2),stringsAsFactors=FALSE);df3[1:3,1:3];
 write.csv(df3,paste0(paste(getwd(), 
"final",paste(names(lst1)[[i]],"Quantile",sep="_"),sep="/"),".csv"),row.names=FALSE,quote=FALSE)})
 

ReadOut1 <- 
lapply(list.files(recursive=TRUE)[grep("Quantile",list.files(recursive=TRUE))],function(x)
 read.csv(x,header=TRUE,stringsAsFactors=FALSE)) 

sapply(ReadOut1,dim)
#     [,1] [,2] [,3] [,4] [,5] [,6]
 #[1,]  101  101  101  101  101  101 
#[2,]  157  258  258   98  157  258

lapply(ReadOut1,function(x) x[1:2,1:3])[1:3]
 #[[1]] 
#  Percentiles pav.DJF_G100 pav.MAM_G100 
#1          0%            0     0.640500 
#2          1%            0     0.664604 
# 
#[[2]] 
#  Percentiles txav.DJF_G101 txav.MAM_G101
 #1          0%      -13.8756      4.742400 
#2          1%      -13.8140      4.817184
 #
 #[[3]] 
#  Percentiles txav.DJF_G102 txav.MAM_G102
 #1          0%     -15.05000      4.520700
 #2          1%     -14.96833      4.543828 
#####
###Q2: 
###Observed data 

dir.create("Indices")
 names1 <- unlist(lapply(ReadOut1,function(x)
 names(x)[-1])) 
names2 <-  gsub("\\_.*","",names1)
 names3 <- unique(gsub("[.]", " ", names2)) 

res <- do.call(rbind,lapply(seq_along(lst4),function(i) {df2 <- 
df1[,lst4[[i]]];vec1 <- colMeans(df2,na.rm=TRUE); vec2 <- 
rep(NA,length(names3));names(vec2) <- paste(names3,names(lst4)[[i]],sep="_"); 
vec2[names(vec2) %in% names(vec1)] <- vec1; names(vec2) <- 
gsub("\\_.*","",names(vec2)); vec2  }))


lapply(seq_len(ncol(res)),function(i) {mat1 <- 
t(res[,i,drop=FALSE]);colnames(mat1) <- names(lst4); 
write.csv(mat1,paste0(paste(getwd(),"Indices", gsub(" 
","_",rownames(mat1)),sep="/"),".csv"),row.names=FALSE,quote=FALSE)})

##Output2:
ReadOut2 <- 
lapply(list.files(recursive=TRUE)[grep("Indices",list.files(recursive=TRUE))],function(x)
 read.csv(x,header=TRUE,stringsAsFactors=FALSE)) 

length(ReadOut2) 

#[1] 257


list.files(recursive=TRUE)[grep("Indices",list.files(recursive=TRUE))][1]
#[1] "Indices/pav_ANN.csv" 

res[,"pav ANN",drop=FALSE] 

#      pav ANN
#[1,] 1.298811
#[2,] 7.642922 

#[3,] 6.740011 

#[4,]       NA
#[5,] 1.296650 

#[6,] 6.887622 


ReadOut2[[1]]
#      G100     G101     G102 G103    G104     G105
#1 1.298811 7.642922 6.740011   NA 1.29665 6.887622 

###Sample data 

###Working directory changed to "sample" 

dir.create("Indices_colMeans")

lst1 <- 
split(list.files(pattern=".csv"),gsub("\\_.*","",list.files(pattern=".csv"))) 

lst2 <-  lapply(lst1,function(x1) lapply(x1, function(x2) {lines1 <- 
readLines(x2); header1 <- lines1[1:2]; dat1 <- 
read.table(text=lines1,header=FALSE,sep=",",stringsAsFactors=FALSE, skip=2); 
colnames(dat1) <- 
Reduce(paste,strsplit(header1,","));dat1[-c(nrow(dat1),nrow(dat1)-1),]}))
res1 <- do.call(rbind,lapply(seq_along(lst2),function(i) 
{rowMeans(do.call(cbind,lapply(lst2[[i]],function(x) 
colMeans(x[,-1],na.rm=TRUE))),na.rm=TRUE) })) 

lapply(seq_len(ncol(res1)),function(i){mat1 <- t(res1[,i,drop=FALSE]); 
colnames(mat1) <- 
names(lst2);write.csv(mat1,paste0(paste(getwd(),"Indices_colMeans",gsub(" 
","_",rownames(mat1)),sep="/"),".csv"),row.names=FALSE,quote=FALSE)})

##Output2 Sample
ReadOut2S <- 
lapply(list.files(recursive=TRUE)[grep("Indices",list.files(recursive=TRUE))],function(x)
 read.csv(x,header=TRUE,stringsAsFactors=FALSE)) 

length(ReadOut2S)
#[1] 257

list.files(recursive=TRUE)[grep("Indices",list.files(recursive=TRUE))][1] 

#[1] "Indices_colMeans/pav_ANN.csv" 

res1[,"pav ANN",drop=FALSE] 

#      pav ANN
#[1,] 1.545620
#[2,] 1.518553 

ReadOut2S[[1]] 

#     G100     G101 

#1 1.54562 1.518553 


A.K.


On Monday, April 14, 2014 1:05 AM, Zilefac Elvis <zilefacel...@yahoo.com> wrote:

Hi AK,

Q1) Please apply the Quantilecode.R to Observed.zip (attached). I tried but 
received an error which was self-explanatory but I could not change the 
dimensions in the code.


Q2) Please apply Quantilecode.R to both sample.zip and observed.zip. Here, 
instead of doing quantile(y, seq(0, 1, by = 0.01), take colMeans of the 
indices. 


I have tried to solve both Q1 and Q2 but still unable to control the dimensions.

Thanks,
Atem.
On Sunday, April 13, 2014 9:05 AM, arun <smartpink...@yahoo.com> wrote:



Hi Atem,

On my end, the codes are not formatted in the email as seen in the screen of 
formatR GUI.

I am attaching the .R file in case there is some difficulty for you.
Arun



On Sunday, April 13, 2014 10:54 AM, arun <smartpink...@yahoo.com> wrote:
Hi,

I am formatting the codes using library(formatR).  Hopefully, it will not be 
mangled in the email.
dir.create("final")
lst1 <- split(list.files(pattern = ".csv"), gsub("\\_.*", "", 
list.files(pattern = ".csv")))

lst2 <- lapply(lst1, function(x1) lapply(x1, function(x2) { lines1 <- 
readLines(x2) header1 <- lines1[1:2] dat1 <- read.table(text = lines1, header = 
FALSE, sep = ",", stringsAsFactors = FALSE,  skip = 2) colnames(dat1) <- 
Reduce(paste, strsplit(header1, ",")) dat1[-c(nrow(dat1), nrow(dat1) - 1), ]
}))

library(plyr) 

lapply(seq_along(lst2), function(i) { lstN <- lapply(lst2[[i]], function(x) x[, 
-1]) lstQ1 <- lapply(lstN, function(x) numcolwise(function(y) quantile(y, 
seq(0, 1,  by = 0.01), na.rm = TRUE))(x)) arr1 <- array(unlist(lstQ1), dim = 
c(dim(lstQ1[[1]]), length(lstQ1)), dimnames = list(NULL,  lapply(lstQ1, 
names)[[1]])) res <- rowMeans(arr1, dims = 2, na.rm = TRUE) colnames(res) <- 
gsub(" ", "_", colnames(res)) res1 <- data.frame(Percentiles = paste0(seq(0, 
100, by = 1), "%"), res, stringsAsFactors = FALSE) write.csv(res1, 
paste0(paste(getwd(), "final", paste(names(lst1)[[i]], "Quantile",  sep = "_"), 
sep = "/"), ".csv"), row.names = FALSE, quote = FALSE)
})

ReadOut1 <- lapply(list.files(recursive = TRUE)[grep("Quantile", 
list.files(recursive = TRUE))],  function(x) read.csv(x, header = TRUE, 
stringsAsFactors = FALSE))
sapply(ReadOut1,
dim)
#     [,1] [,2]
#[1,]  101  101
#[2,]  258  258

lapply(ReadOut1,function(x) x[1:2,1:3])
#[[1]]
#  Percentiles  txav_DJF txav_MAM
#1          0% -12.68566  7.09702
#2          1% -12.59062  7.15338
#
#[[2]]
#  Percentiles  txav_DJF txav_MAM
#1          0% -12.75516 6.841840
#2          1% -12.68244 6.910664 


###Q2:

dir.create("Indices")
names1 <- lapply(ReadOut1, function(x) names(x))[[1]]
lstNew <- simplify2array(ReadOut1)
lapply(2:nrow(lstNew), function(i) { dat1 <- data.frame(lstNew[1], 
do.call(cbind, lstNew[i, ]), stringsAsFactors = FALSE) colnames(dat1) <- 
c(rownames(lstNew)[1], paste(names(lst1), rep(rownames(lstNew)[i],  
length(lst1)), sep = "_")) write.csv(dat1, paste0(paste(getwd(), "Indices", 
rownames(lstNew)[i], sep = "/"),  ".csv"), row.names = FALSE, quote = FALSE)
}) ## Output2:
ReadOut2 <- lapply(list.files(recursive = TRUE)[grep("Indices", 
list.files(recursive = TRUE))],  function(x) read.csv(x, header = TRUE, 
stringsAsFactors = FALSE))
length(ReadOut2)
# [1] 257 


head(ReadOut2[[1]], 2)
#  Percentiles G100_pav_ANN G101_pav_ANN
#1          0%     1.054380     1.032740
#2          1% 
   1.069457     1.045689 


A.K.












On Sunday, April 13, 2014 2:46 AM, Zilefac Elvis <zilefacel...@yahoo.com> wrote:

Hi AK,
Q1) I need your help again. Using the previous data (attached) and the previous 
code below,instead of taking rowMeans, let's do quantile(x,seq(0,1,by=0.01)). 

Delete the last 2 rows (Trend and p<) in each file before doing 
quantile(x,seq(0,1,by=0.01)).

For example, assume that I want to
calculate quantile(x,seq(0,1,by=0.01)) for each column of Site G100. I will do 
so for the 5 sims of site G100 and then take their average. This will be 
approximately close to the true value than just calculating 
quantile(x,seq(0,1,by=0.01)) from one sim. Please do this same thing for all 
the files.

So, when you do rowMeans, it should be the mean of quantile(x,seq(0,1,by=0.01)) 
calculated from all sims in that Site.

Output

The number of files in "final" remains the same (2 files). The "Year" 
column(will be replaced) will contain  the names of 
quantile(x,seq(0,1,by=0.01)) such as  0%           1%           2%           3% 
          4%           5%           6%, ..., 98%    
     99%         100% . You can give this column any name such as "Percentiles".


Q2)  From the folder "final", please go to each file identified by site name, 
take a column, say col1 of txav  from each file, create a dataframe whose 
colnames are site codes (names of files in "final"). Create a folder called 
"Indices" and place this dataframe in it. The filename for the dataframe is 
txav, say. So, in "Indices", you will have one file having 3 columns [, 
c(Percentiles, G100,G101)]. The idea is that I want to be able to pick any 
column from files in "final" and form a dataframe from which I will generate my 
qqplot or boxplot.

Thanks very much AK.
Atem
This should be the final step of this my drama, at least for now.
#==============================================================================================================

dir.create("final")
lst1 <- 
split(list.files(pattern=".csv"),gsub("\\_.*","",list.files(pattern=".csv"))) 
lst2 <-  lapply(lst1,function(x1) lapply(x1, function(x2) {lines1 <- 
readLines(x2); header1 <- lines1[1:2]; dat1 <- 
read.table(text=lines1,header=FALSE,sep=",",stringsAsFactors=FALSE, skip=2); 
colnames(dat1) <- Reduce(paste,strsplit(header1,","));dat1}))

lstYear <- lapply(lst2,function(x) lapply(x, function(y) 
y[,1,drop=FALSE])[[1]]) 


lapply(seq_along(lst2),function(i) {lstN <-lapply(lst2[[i]],function(x) 
x[,-1]); arr1 <- 
array(unlist(lstN),dim=c(dim(lstN[[1]]),length(lstN)),dimnames=list(NULL,lapply(lstN,names)[[1]]));res
 <-
cbind(lstYear[[i]],rowMeans(arr1,dims=2,na.rm=TRUE)); names(res) <- 
gsub("\\_$","",gsub(" ", "_",names(res))); res[,1] <- gsub(" <", "",res[,1]); 
write.csv(res,paste0(paste(getwd(),"final",names(lst1)
[[i]],sep="/"),".csv"),row.names=FALSE,quote=FALSE)  }) 



#====================================================================================================

### Q1: working directory: Observed #Only one file per Site.  Assuming this is 
the
### case for the full dataset, then I guess there is no need to average
dir.create("final")
lst1 <- split(list.files(pattern = ".csv"), gsub("\\_.*", "", 
list.files(pattern = ".csv")))

lst2 <- lapply(lst1, function(x1) lapply(x1, function(x2) {
    lines1 <- readLines(x2)
    header1 <- lines1[1:2]
    dat1 <- read.table(text = lines1, header = FALSE, sep = ",", 
stringsAsFactors = FALSE, 
        skip = 2)
    colnames(dat1) <- Reduce(paste, strsplit(header1, ","))
    dat1[-c(nrow(dat1), nrow(dat1) - 1), ]
}))

# different number of rows
sapply(seq_along(lst2), function(i) {
    lstN <- lapply(lst2[[i]], function(x) x[, -1])
    sapply(lstN, function(x) nrow(x))
})
# [1] 9 9 9 8 2 9 difference in column number
sapply(seq_along(lst2), function(i) {
    sapply(lst2[[i]], function(x) ncol(x))
})
# [1] 157 258 258 98 157 258
library(plyr)
library(stringr)

lst3 <- setNames(lapply(seq_along(lst2), function(i) {
    lapply(lst2[[i]], function(x) {
        names(x)[-1] <- paste(names(x)[-1], names(lst1)[i], sep = "_")
        names(x) <- str_trim(names(x))
        x
    })[[1]]
}), names(lst1))

df1 <- join_all(lst3, by = "Year")
dim(df1)
# [1] 9 1181
sapply(split(names(df1)[-1], gsub(".*\\_", "", names(df1)[-1])), function(x) {
    df2 <- df1[, x]
    df3 <- data.frame(Percentiles = paste0(seq(0, 100, by = 1), "%"), 
numcolwise(function(y) quantile(y, 
        seq(0, 1, by = 0.01), na.rm = TRUE))(df2), stringsAsFactors = FALSE)
    ncol(df3)
})
# G100 G101 G102 G103 G104 G105 157 258 258 98 157 258
lst4 <- split(names(df1)[-1], gsub(".*\\_", "", names(df1)[-1]))

lapply(seq_along(lst4), function(i) {
    df2 <- df1[, lst4[[i]]]
    df3 <- data.frame(Percentiles = paste0(seq(0, 100, by = 1), "%"), 
numcolwise(function(y) quantile(y, 
        seq(0, 1, by = 0.01), na.rm = TRUE))(df2), stringsAsFactors = FALSE)
    df3[1:3, 1:3]
    write.csv(df3, paste0(paste(getwd(), "final", paste(names(lst1)[[i]], 
"Quantile", 
        sep = "_"), sep = "/"), ".csv"), row.names = FALSE, quote = FALSE)
})

ReadOut1 <- lapply(list.files(recursive = TRUE)[grep("Quantile", 
list.files(recursive = TRUE))], 
    function(x) read.csv(x, header = TRUE, stringsAsFactors = FALSE))
sapply(ReadOut1, dim)
# [,1] [,2] [,3] [,4] [,5] [,6] [1,] 101 101 101 101 101 101 [2,] 157 258 258 98
# 157 258
lapply(ReadOut1, function(x) x[1:2, 1:3])[1:3]
# [[1]] Percentiles pav.DJF_G100 pav.MAM_G100 1 0% 0 0.640500 2 1% 0 0.664604
# [[2]] Percentiles txav.DJF_G101 txav.MAM_G101 1 0% -13.8756 4.742400 2 1%
# -13.8140 4.817184 [[3]] Percentiles txav.DJF_G102 txav.MAM_G102 1 0% -15.05000
# 4.520700 2 1% -14.96833 4.543828

### Q2: Observed data

dir.create("Indices")

names1 <- unlist(lapply(ReadOut1, function(x) names(x)[-1]))
names2 <- gsub("\\_.*", "", names1)
names3 <- unique(gsub("[.]", " ", names2))

res <- do.call(rbind, lapply(seq_along(lst4), function(i) {
    df2 <- df1[, lst4[[i]]]
    vec1 <- colMeans(df2, na.rm = TRUE)
    vec2 <- rep(NA, length(names3))
    names(vec2) <- paste(names3, names(lst4)[[i]], sep = "_")
    vec2[names(vec2) %in% names(vec1)] <- vec1
    names(vec2) <- gsub("\\_.*", "", names(vec2))
    vec2
}))

lapply(seq_len(ncol(res)), function(i) {
    mat1 <- t(res[, i, drop = FALSE])
    colnames(mat1) <- names(lst4)
    write.csv(mat1, paste0(paste(getwd(), "Indices", gsub(" ", "_", 
rownames(mat1)), 
        sep = "/"), ".csv"), row.names = FALSE, quote = FALSE)
})

## Output2:
ReadOut2 <- lapply(list.files(recursive = TRUE)[grep("Indices", 
list.files(recursive = TRUE))], 
    function(x) read.csv(x, header = TRUE, stringsAsFactors = FALSE))
length(ReadOut2)
# [1] 257
list.files(recursive = TRUE)[grep("Indices", list.files(recursive = TRUE))][1]
# [1] 'Indices/pav_ANN.csv'
res[, "pav ANN", drop = FALSE]
# pav ANN [1,] 1.298811 [2,] 7.642922 [3,] 6.740011 [4,] NA [5,] 1.296650 [6,]
# 6.887622

ReadOut2[[1]]
# G100 G101 G102 G103 G104 G105 1 1.298811 7.642922 6.740011 NA 1.29665 6.887622

### Sample data Working directory changed to 'sample'
dir.create("Indices_colMeans")

lst1 <- split(list.files(pattern = ".csv"), gsub("\\_.*", "", 
list.files(pattern = ".csv")))

lst2 <- lapply(lst1, function(x1) lapply(x1, function(x2) {
    lines1 <- readLines(x2)
    header1 <- lines1[1:2]
    dat1 <- read.table(text = lines1, header = FALSE, sep = ",", 
stringsAsFactors = FALSE, 
        skip = 2)
    colnames(dat1) <- Reduce(paste, strsplit(header1, ","))
    dat1[-c(nrow(dat1), nrow(dat1) - 1), ]
}))
res1 <- do.call(rbind, lapply(seq_along(lst2), function(i) {
    rowMeans(do.call(cbind, lapply(lst2[[i]], function(x) colMeans(x[, -1], 
na.rm = TRUE))), 
        na.rm = TRUE)
}))

lapply(seq_len(ncol(res1)), function(i) {
    mat1 <- t(res1[, i, drop = FALSE])
    colnames(mat1) <- names(lst2)
    write.csv(mat1, paste0(paste(getwd(), "Indices_colMeans", gsub(" ", "_", 
rownames(mat1)), 
        sep = "/"), ".csv"), row.names = FALSE, quote = FALSE)
})

## Output2 Sample
ReadOut2S <- lapply(list.files(recursive = TRUE)[grep("Indices", 
list.files(recursive = TRUE))], 
    function(x) read.csv(x, header = TRUE, stringsAsFactors = FALSE))
length(ReadOut2S)
# [1] 257
list.files(recursive = TRUE)[grep("Indices", list.files(recursive = TRUE))][1]
# [1] 'Indices_colMeans/pav_ANN.csv'
res1[, "pav ANN", drop = FALSE]
# pav ANN [1,] 1.545620 [2,] 1.518553

ReadOut2S[[1]]
# G100 G101 1 1.54562 1.518553

______________________________________________
R-help@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-help
PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
and provide commented, minimal, self-contained, reproducible code.

Re: [R] Quantile and rowMean from multiple files in a folder

Reply via email to