Here is another possibility library(stringr)
readterm <- function(term, text) { lapply(str_split(text, fixed(term))[[1]][-1], fread, skip = 4, nrows = 5 ) } easymethod <- function(whalines) { whalines <- str_c(whalines, collapse = "\n") lapply(c(srchStr1, srchStr2), readterm, text = whalines ) } ## perhaps a tad slower, but I find it easier to follow. library(microbenchmark) bigwhalines <- strsplit( paste( rep( wha, 100 ) , collapse = "\n" ) , "\n" )[[ 1 ]] microbenchmark(slowresult <- slowmethod( bigwhalines ), fastresult <- fastmethod( bigwhalines ), easyresult <- easymethod( bigwhalines ) ) On Wed, Jul 24, 2019 at 10:41 PM Jeff Newmiller <jdnew...@dcn.davis.ca.us> wrote: > > Sorry, I was on my phone and did not see that you were already using but > completely missing the vectorized nature of these functions. > > Consider the following: > > ############# > # after executing your sample code > slowmethod <- function( whalines ) { > lines <- whalines > mc_list <- NULL > for (i in 1:length(lines)){ > # Look for start of water content > if(grepl(srchStr1, lines[i])){ > mc_list <- c(mc_list, i) > } > } > > tmp_list <- NULL > for (i in 1:length(lines)){ > # Look for start of temperature data > if(grepl(srchStr2, lines[i])){ > tmp_list <- c(tmp_list, i) > } > } > > # Store the water content arrays > wc <- list() > # Read all the moisture content profiles > for(i in 1:length(mc_list)){ > lineNum <- mc_list[i] + 3 > mct <- read.table( text = wha > , skip=lineNum > , nrows=5 > , col.names=c('depth','wc') > ) > wc[[i]] <- mct > } > > # Store the water temperature arrays > tmp <- list() > # Read all the temperature profiles > for(i in 1:length(tmp_list)){ > lineNum <- tmp_list[i] + 3 > tmpt <- read.table(text = wha, skip=lineNum, nrows=5, > col.names=c('depth','tmp')) > tmp[[i]] <- tmpt > } > list(mc = wc, wt = tmp ) > } > > library(data.table) > > fastmethod <- function( whalines ) { > # identify tabular formatted lines > idx_tbl0 <- grepl( "^\\s*[.\\d+-]+\\s+[.\\d+-]+\\s*$" > , whalines > , perl=TRUE ) > # identify groups of lines according to moisture content > mc_block <- grep( srchStr1, whalines, fixed = TRUE ) > mc_start <- rep( FALSE, length( whalines ) ) > mc_start[ mc_block+4L ] <- TRUE > grp_mc <- cumsum( mc_start ) > # identify contiguous block of tabular lines in each block > mc_tab <- 0 == ave( !idx_tbl0, grp_mc, FUN = cumsum ) > # extract moisture content data > mc_dta <- fread( text = whalines[ mc_tab ], header=FALSE ) > mclist <- split( mc_dta, grp_mc[ mc_tab ] ) > > # identify groups of lines according to moisture content > wt_block <- grep( srchStr2, whalines, fixed = TRUE ) > wt_start <- rep( FALSE, length( whalines ) ) > wt_start[ wt_block+4L ] <- TRUE > grp_wt <- cumsum( wt_start ) > # identify contiguous block of tabular lines in each block > wt_tab <- 0 == ave( !idx_tbl0, grp_wt, FUN = cumsum ) > # extract data frames > wt_dta <- fread( text = whalines[ wt_tab ], header=FALSE ) > wtlist <- split( wt_dta, grp_wt[ wt_tab ] ) > list(mc = mclist, wt = wtlist ) > } > library(microbenchmark) > bigwhalines <- strsplit( paste( rep( wha, 100 ) > , collapse = "\n" ) > , "\n" )[[ 1 ]] > microbenchmark( slowresult <- slowmethod( bigwhalines ) > , fastresult <- fastmethod( bigwhalines ) > ) > > > On Wed, 24 Jul 2019, Jeff Newmiller wrote: > > > ?readLines > > ?grep > > ?textConnection > > > > On July 24, 2019 11:54:07 AM PDT, "Morway, Eric via R-help" > > <r-help@r-project.org> wrote: > >> The small reproducible example below works, but is way too slow on the > >> real > >> problem. The real problem is attempting to extract ~2920 repeated > >> arrays > >> from a 60 Mb file and takes ~80 minutes. I'm wondering how I might > >> re-engineer the script to avoid opening and closing the file 2920 times > >> as > >> is the case now. That is, is there a way to keep the file open and > >> peel > >> out the arrays and stuff them into a list of data.tables, as is done in > >> the > >> small reproducible example below, but in a significantly faster way? > >> > >> wha <- " INITIAL PRESSURE HEAD > >> INITIAL TEMPERATURE SET TO 4.000E+00 DEGREES C > >> VS2DH - MedSand for TL test > >> > >> TOTAL ELAPSED TIME = 0.000000E+00 sec > >> TIME STEP 0 > >> > >> MOISTURE CONTENT > >> Z, IN > >> m X OR R DISTANCE, IN m > >> 0.500 > >> 0.075 0.1475 > >> 0.225 0.1475 > >> 0.375 0.1475 > >> 0.525 0.1475 > >> 0.675 0.1475 > >> blah > >> blah > >> blah > >> TEMPERATURE, IN DECREES C > >> Z, IN > >> m X OR R DISTANCE, IN m > >> 0.500 > >> 0.075 1.1475 > >> 0.225 2.1475 > >> 0.375 3.1475 > >> 0.525 4.1475 > >> 0.675 5.1475 > >> blah > >> blah > >> blah > >> > >> TOTAL ELAPSED TIME = 8.6400E+04 sec > >> TIME STEP 0 > >> > >> MOISTURE CONTENT > >> Z, IN > >> m X OR R DISTANCE, IN m > >> 0.500 > >> 0.075 0.1875 > >> 0.225 0.1775 > >> 0.375 0.1575 > >> 0.525 0.1675 > >> 0.675 0.1475 > >> blah > >> blah > >> blah TEMPERATURE, IN DECREES C > >> Z, IN > >> m X OR R DISTANCE, IN m > >> 0.500 > >> 0.075 1.1475 > >> 0.225 2.1475 > >> 0.375 3.1475 > >> 0.525 4.1475 > >> 0.675 5.1475 > >> blah > >> blah > >> blah" > >> > >> example_content <- textConnection(wha) > >> > >> srchStr1 <- ' MOISTURE CONTENT' > >> srchStr2 <- 'TEMPERATURE, IN DECREES C' > >> > >> lines <- readLines(example_content) > >> mc_list <- NULL > >> for (i in 1:length(lines)){ > >> # Look for start of water content > >> if(grepl(srchStr1, lines[i])){ > >> mc_list <- c(mc_list, i) > >> } > >> } > >> > >> tmp_list <- NULL > >> for (i in 1:length(lines)){ > >> # Look for start of temperature data > >> if(grepl(srchStr2, lines[i])){ > >> tmp_list <- c(tmp_list, i) > >> } > >> } > >> > >> # Store the water content arrays > >> wc <- list() > >> # Read all the moisture content profiles > >> for(i in 1:length(mc_list)){ > >> lineNum <- mc_list[i] + 3 > >> mct <- read.table(text = wha, skip=lineNum, nrows=5, > >> col.names=c('depth','wc')) > >> wc[[i]] <- mct > >> } > >> > >> # Store the water temperature arrays > >> tmp <- list() > >> # Read all the temperature profiles > >> for(i in 1:length(tmp_list)){ > >> lineNum <- tmp_list[i] + 3 > >> tmpt <- read.table(text = wha, skip=lineNum, nrows=5, > >> col.names=c('depth','tmp')) > >> tmp[[i]] <- tmpt > >> } > >> > >> # quick inspection > >> length(wc) > >> wc[[1]] > >> # Looks like what I'm after, but too slow in real world problem > >> > >> [[alternative HTML version deleted]] > >> > >> ______________________________________________ > >> R-help@r-project.org mailing list -- To UNSUBSCRIBE and more, see > >> https://stat.ethz.ch/mailman/listinfo/r-help > >> PLEASE do read the posting guide > >> http://www.R-project.org/posting-guide.html > >> and provide commented, minimal, self-contained, reproducible code. > > > > -- > > Sent from my phone. Please excuse my brevity. > > > > ______________________________________________ > > R-help@r-project.org mailing list -- To UNSUBSCRIBE and more, see > > https://stat.ethz.ch/mailman/listinfo/r-help > > PLEASE do read the posting guide http://www.R-project.org/posting-guide.html > > and provide commented, minimal, self-contained, reproducible code. > > > > --------------------------------------------------------------------------- > Jeff Newmiller The ..... ..... Go Live... > DCN:<jdnew...@dcn.davis.ca.us> Basics: ##.#. ##.#. Live Go... > Live: OO#.. Dead: OO#.. Playing > Research Engineer (Solar/Batteries O.O#. #.O#. with > /Software/Embedded Controllers) .OO#. .OO#. rocks...1k > > ______________________________________________ > R-help@r-project.org mailing list -- To UNSUBSCRIBE and more, see > https://stat.ethz.ch/mailman/listinfo/r-help > PLEASE do read the posting guide http://www.R-project.org/posting-guide.html > and provide commented, minimal, self-contained, reproducible code. ______________________________________________ R-help@r-project.org mailing list -- To UNSUBSCRIBE and more, see https://stat.ethz.ch/mailman/listinfo/r-help PLEASE do read the posting guide http://www.R-project.org/posting-guide.html and provide commented, minimal, self-contained, reproducible code.