
用R中的grep删除文本文件中的行,r,dataframe,data-cleaning,R,Dataframe,Data Cleaning,我有一个非常重复的文本文件,前几行是这样的: Filename: ROI: red_1 [Red] 20 points Basic Stats Min Max Mean Stdev Band 1 0.013282 0.133982 0.061581 0.034069 Band 2 0.009866 0.112935 0.042688 0.026618 Band 3 0.008


ROI: red_1 [Red] 20 points

Basic Stats      Min         Max        Mean       Stdev
     Band 1 0.013282    0.133982    0.061581    0.034069
     Band 2 0.009866    0.112935    0.042688    0.026618
     Band 3 0.008304    0.037059    0.018434    0.007515
     Band 4 0.004726    0.040089    0.018490    0.009605

Histogram         DN       Npts   Total  Percent     Acc Pct
Band 1      0.013282          1       1   5.0000      5.0000
Bin=0.00047 0.013755          0       1   0.0000      5.0000
            0.014228          0       1   0.0000      5.0000

Stats for ROI: red_5 [Red] 20 points
Basic Stats      Min         Max        Mean       Stdev
     Band 1 0.030513    0.180980    0.090056    0.044456
     Band 2 0.022289    0.157861    0.046419    0.030555
     Band 3 0.012533    0.046693    0.027343    0.008947
     Band 4 0.003332    0.041555    0.016888    0.007770

Histogram         DN       Npts   Total  Percent     Acc Pct
Band 1      0.030513          1       1   5.0000      5.0000
Bin=0.00059 0.031103          0       1   0.0000      5.0000
            0.031693          0       1   0.0000      5.0000
            0.032283          0       1   0.0000      5.0000
ROI                     Band    min         max         mean        stdev
red_1 [Red] 20 points   Band 1  0.013282    0.133982    0.061581    0.034069
red_1 [Red] 20 points   Band 2  0.009866    0.112935    0.042688    0.026618
red_1 [Red] 20 points   Band 3  0.008304    0.037059    0.018434    0.007515
red_1 [Red] 20 points   Band 4  0.004726    0.040089    0.018490    0.009605
red_2 [Red] 12 points   Band 1  0.032262    0.124425    0.078073    0.028031
red_2 [Red] 12 points   Band 2  0.021072    0.064156    0.037923    0.012178
red_2 [Red] 12 points   Band 3  0.013404    0.066043    0.036316    0.014787
red_2 [Red] 12 points   Band 4  0.005162    0.055781    0.015526    0.013255
red_3 [Red] 12 points   Band 1  0.037488    0.107830    0.057892    0.018964
red_3 [Red] 12 points   Band 2  0.028140    0.072370    0.045340    0.014507

ROI: red_1 [Red] 20 points

Basic Stats      Min         Max        Mean       Stdev
     Band 1 0.013282    0.133982    0.061581    0.034069
     Band 2 0.009866    0.112935    0.042688    0.026618
     Band 3 0.008304    0.037059    0.018434    0.007515
     Band 4 0.004726    0.040089    0.018490    0.009605

Stats for ROI: red_5 [Red] 20 points
Basic Stats      Min         Max        Mean       Stdev
     Band 1 0.030513    0.180980    0.090056    0.044456
     Band 2 0.022289    0.157861    0.046419    0.030555
     Band 3 0.012533    0.046693    0.027343    0.008947
     Band 4 0.003332    0.041555    0.016888    0.007770



Stats for ROI: red_5 [Red] 20 points
Basic Stats      Min         Max        Mean       Stdev
     Band 1 0.030513    0.180980    0.090056    0.044456
     Band 2 0.022289    0.157861    0.046419    0.030555
     Band 3 0.012533    0.046693    0.027343    0.008947
     Band 4 0.003332    0.041555    0.016888    0.007770

Histogram         DN       Npts   Total  Percent     Acc Pct
Band 1      0.030513          1       1   5.0000      5.0000
Bin=0.00059 0.031103          0       1   0.0000      5.0000
            0.031693          0       1   0.0000      5.0000
            0.032283          0       1   0.0000      5.0000
ROI                     Band    min         max         mean        stdev
red_1 [Red] 20 points   Band 1  0.013282    0.133982    0.061581    0.034069
red_1 [Red] 20 points   Band 2  0.009866    0.112935    0.042688    0.026618
red_1 [Red] 20 points   Band 3  0.008304    0.037059    0.018434    0.007515
red_1 [Red] 20 points   Band 4  0.004726    0.040089    0.018490    0.009605
red_2 [Red] 12 points   Band 1  0.032262    0.124425    0.078073    0.028031
red_2 [Red] 12 points   Band 2  0.021072    0.064156    0.037923    0.012178
red_2 [Red] 12 points   Band 3  0.013404    0.066043    0.036316    0.014787
red_2 [Red] 12 points   Band 4  0.005162    0.055781    0.015526    0.013255
red_3 [Red] 12 points   Band 1  0.037488    0.107830    0.057892    0.018964
red_3 [Red] 12 points   Band 2  0.028140    0.072370    0.045340    0.014507

# Collect the "Basic Stats" band rows of the ROI report into one data frame
# with the columns ROI, Band, min, max, mean and stdev (all kept as text).
#
# The report is scanned top to bottom: every line containing "ROI: " updates
# the current ROI label, and every line starting with "     Band" becomes one
# output row tagged with that label.

# Passing the path (not a connection object) lets readLines() open and close
# the file itself, so no connection is left dangling.
dat <- readLines("test2.txt")

# NA (rather than NULL) keeps the ROI column present even if a band row
# appears before the first ROI line.
roi <- NA_character_
rows <- list()

for (line in dat) {
  roi_pos <- regexpr("ROI: ", line, fixed = TRUE)[1]
  if (roi_pos > 0) {
    # Everything after the 5-character "ROI: " marker is the ROI label.
    roi <- substr(line, roi_pos + 5, nchar(line))
  }
  if (substr(line, 1, 9) == "     Band") {
    # Band rows are tab-separated: band name, min, max, mean, stdev.
    fields <- strsplit(trimws(line), "\t")[[1]]
    rows[[length(rows) + 1L]] <- data.frame(
      "ROI" = roi,
      "Band" = fields[1],
      "min" = fields[2],
      "max" = fields[3],
      "mean" = fields[4],
      "stdev" = fields[5]
    )
  }
}

# Bind once at the end instead of growing the data frame inside the loop;
# this is NULL when no band rows were found, as before.
out <- do.call(rbind, rows)



Stats for ROI: red_5 [Red] 20 points
Basic Stats      Min         Max        Mean       Stdev
     Band 1 0.030513    0.180980    0.090056    0.044456
     Band 2 0.022289    0.157861    0.046419    0.030555
     Band 3 0.012533    0.046693    0.027343    0.008947
     Band 4 0.003332    0.041555    0.016888    0.007770

Histogram         DN       Npts   Total  Percent     Acc Pct
Band 1      0.030513          1       1   5.0000      5.0000
Bin=0.00059 0.031103          0       1   0.0000      5.0000
            0.031693          0       1   0.0000      5.0000
            0.032283          0       1   0.0000      5.0000
ROI                     Band    min         max         mean        stdev
red_1 [Red] 20 points   Band 1  0.013282    0.133982    0.061581    0.034069
red_1 [Red] 20 points   Band 2  0.009866    0.112935    0.042688    0.026618
red_1 [Red] 20 points   Band 3  0.008304    0.037059    0.018434    0.007515
red_1 [Red] 20 points   Band 4  0.004726    0.040089    0.018490    0.009605
red_2 [Red] 12 points   Band 1  0.032262    0.124425    0.078073    0.028031
red_2 [Red] 12 points   Band 2  0.021072    0.064156    0.037923    0.012178
red_2 [Red] 12 points   Band 3  0.013404    0.066043    0.036316    0.014787
red_2 [Red] 12 points   Band 4  0.005162    0.055781    0.015526    0.013255
red_3 [Red] 12 points   Band 1  0.037488    0.107830    0.057892    0.018964
red_3 [Red] 12 points   Band 2  0.028140    0.072370    0.045340    0.014507

# Collect the "Basic Stats" band rows of the ROI report into one data frame
# with the columns ROI, Band, min, max, mean and stdev (all kept as text).
#
# The report is scanned top to bottom: every line containing "ROI: " updates
# the current ROI label, and every line starting with "     Band" becomes one
# output row tagged with that label.

# Passing the path (not a connection object) lets readLines() open and close
# the file itself, so no connection is left dangling.
dat <- readLines("test2.txt")

# NA (rather than NULL) keeps the ROI column present even if a band row
# appears before the first ROI line.
roi <- NA_character_
rows <- list()

for (line in dat) {
  roi_pos <- regexpr("ROI: ", line, fixed = TRUE)[1]
  if (roi_pos > 0) {
    # Everything after the 5-character "ROI: " marker is the ROI label.
    roi <- substr(line, roi_pos + 5, nchar(line))
  }
  if (substr(line, 1, 9) == "     Band") {
    # Band rows are tab-separated: band name, min, max, mean, stdev.
    fields <- strsplit(trimws(line), "\t")[[1]]
    rows[[length(rows) + 1L]] <- data.frame(
      "ROI" = roi,
      "Band" = fields[1],
      "min" = fields[2],
      "max" = fields[3],
      "mean" = fields[4],
      "stdev" = fields[5]
    )
  }
}

# Bind once at the end instead of growing the data frame inside the loop;
# this is NULL when no band rows were found, as before.
out <- do.call(rbind, rows)

数据框 `out` 应该已经非常接近您想要的结果。


# Build one data frame holding the "Basic Stats" table of every ROI in the
# report, with the ROI label prepended as the column `roi`.
input <- readLines("https://dl.dropboxusercontent.com/u/45095175/test2.txt")

# Match on "ROI: " (the same marker that is stripped below) so that a line
# merely containing the letters "ROI" cannot shift the labels out of step.
roi_lines <- grep("ROI: ", input, fixed = TRUE)
basic_stat_lines <- grep("Basic Stats", input)
roi_names <- sub("^.*ROI: ", "", input[roi_lines])

# The i-th "Basic Stats" header is paired with the i-th ROI label.
# seq_along() keeps the loop empty when no table is present.
roi_data <- lapply(seq_along(basic_stat_lines), function(i) {
  # Header line plus the four band rows that follow it, parsed as
  # tab-delimited text.
  stats <- read.delim(
    text = input[basic_stat_lines[i] + 0:4],
    stringsAsFactors = FALSE,
    check.names = FALSE
  )
  data.frame(roi = roi_names[i], stats, stringsAsFactors = FALSE)
})

roi_data_all <- do.call("rbind", roi_data)


# Build one data frame holding the "Basic Stats" table of every ROI in the
# report, with the ROI label prepended as the column `roi`.
input <- readLines("https://dl.dropboxusercontent.com/u/45095175/test2.txt")

# Match on "ROI: " (the same marker that is stripped below) so that a line
# merely containing the letters "ROI" cannot shift the labels out of step.
roi_lines <- grep("ROI: ", input, fixed = TRUE)
basic_stat_lines <- grep("Basic Stats", input)
roi_names <- sub("^.*ROI: ", "", input[roi_lines])

# The i-th "Basic Stats" header is paired with the i-th ROI label.
# seq_along() keeps the loop empty when no table is present.
roi_data <- lapply(seq_along(basic_stat_lines), function(i) {
  # Header line plus the four band rows that follow it, parsed as
  # tab-delimited text.
  stats <- read.delim(
    text = input[basic_stat_lines[i] + 0:4],
    stringsAsFactors = FALSE,
    check.names = FALSE
  )
  data.frame(roi = roi_names[i], stats, stringsAsFactors = FALSE)
})

roi_data_all <- do.call("rbind", roi_data)

#' Extract one kind of table from every ROI block of a statistics report
#'
#' The report is split into blocks at each line containing "ROI:". Within
#' each block the first section of the requested kind is parsed as
#' tab-separated text and tagged with the ROI name, colour and point count
#' taken from the block's "ROI:" line.
#'
#' @param file Path (or anything `readLines()` accepts) of the report.
#' @param what Either "Basic Stats" or "Histogram".
#' @return A list of data frames named "1", "2", ... by ROI block position.
#'   ROI blocks that contain no section of the requested kind are skipped.
#'   Stops with 'what value is not supported' for any other `what`.
myfun <- function(file, what) {
  if (!what %in% c("Basic Stats", "Histogram")) {
    stop('what value is not supported')
  }

  x <- readLines(file)
  roi_starts <- grep("ROI:", x)
  section_starts <- grep(what, x, fixed = TRUE)

  # One-past-the-end line of each ROI block. Using length(x) + 1 for the last
  # block keeps the final line of the file inside it.
  roi_ends <- c(roi_starts[-1L], length(x) + 1L)

  df_list <- list()
  for (counter in seq_along(roi_starts)) {
    low <- roi_starts[counter]
    high <- roi_ends[counter]

    in_block <- section_starts[section_starts > low & section_starts < high]
    if (length(in_block) == 0L) {
      next
    }
    min_ind <- min(in_block)

    # "ROI: red_1 [Red] 20 points" and "Stats for ROI: red_5 [Red] 20 points"
    # both reduce to c("red_1", "Red", "20") once the prefix is stripped, so
    # the result no longer depends on which block comes first.
    label <- sub("^.*ROI:\\s*", "", x[low])
    title <- gsub("\\[|\\]", "", strsplit(label, "\\s+")[[1L]][1:3])

    if (what == "Basic Stats") {
      # Header line plus the four band rows.
      fields <- strsplit(x[min_ind:(min_ind + 4L)], "\t")
      x1 <- data.frame(do.call("rbind", fields), stringsAsFactors = FALSE)
      colnames(x1) <- gsub(" ", "", unlist(x1[1L, ], use.names = FALSE))
      x1 <- x1[2:5, ]
      # convert from character to numeric data type
      x1[, 2:5] <- lapply(x1[, 2:5], as.numeric)
      x1$ROI <- title[1L]
      x1$color <- title[2L]
      x1$points <- title[3L]
    } else {
      fields <- strsplit(x[min_ind:(high - 1L)], "\t")
      # Blank lines split into zero fields and carry no data.
      fields <- fields[lengths(fields) > 0L]
      x1 <- data.frame(do.call("rbind", fields), stringsAsFactors = FALSE)
      colnames(x1) <- unlist(x1[1L, ], use.names = FALSE)
      colnames(x1)[1L] <- "Histogram"

      # Every "Histogram" header row opens one per-band section; its band
      # number and bin width are repeated over all rows of that section.
      is_header <- grepl("Histogram", x1$Histogram)
      section_len <- diff(c(which(is_header), nrow(x1) + 1L))
      x1$Band <- rep(
        gsub("[Band ]", "", grep("Band", x1$Histogram, value = TRUE)),
        section_len
      )
      x1$Bin <- rep(
        gsub("[Bin= ]", "", grep("Bin", x1$Histogram, value = TRUE)),
        section_len
      )
      x1 <- x1[!is_header, ]
      x1$Histogram <- NULL

      colnames(x1) <- gsub(" ", "", colnames(x1))
      x1$ROI <- title[1L]
      x1$color <- title[2L]
      x1$points <- title[3L]
      # convert from character to numeric data type
      # (every column except ROI and color)
      x1[, c(1:7, 10)] <- lapply(x1[, c(1:7, 10)], as.numeric)
    }

    df_list[[as.character(counter)]] <- x1
  }

  df_list
}


#' Extract one kind of table from every ROI block of a statistics report
#'
#' The report is split into blocks at each line containing "ROI:". Within
#' each block the first section of the requested kind is parsed as
#' tab-separated text and tagged with the ROI name, colour and point count
#' taken from the block's "ROI:" line.
#'
#' @param file Path (or anything `readLines()` accepts) of the report.
#' @param what Either "Basic Stats" or "Histogram".
#' @return A list of data frames named "1", "2", ... by ROI block position.
#'   ROI blocks that contain no section of the requested kind are skipped.
#'   Stops with 'what value is not supported' for any other `what`.
myfun <- function(file, what) {
  if (!what %in% c("Basic Stats", "Histogram")) {
    stop('what value is not supported')
  }

  x <- readLines(file)
  roi_starts <- grep("ROI:", x)
  section_starts <- grep(what, x, fixed = TRUE)

  # One-past-the-end line of each ROI block. Using length(x) + 1 for the last
  # block keeps the final line of the file inside it.
  roi_ends <- c(roi_starts[-1L], length(x) + 1L)

  df_list <- list()
  for (counter in seq_along(roi_starts)) {
    low <- roi_starts[counter]
    high <- roi_ends[counter]

    in_block <- section_starts[section_starts > low & section_starts < high]
    if (length(in_block) == 0L) {
      next
    }
    min_ind <- min(in_block)

    # "ROI: red_1 [Red] 20 points" and "Stats for ROI: red_5 [Red] 20 points"
    # both reduce to c("red_1", "Red", "20") once the prefix is stripped, so
    # the result no longer depends on which block comes first.
    label <- sub("^.*ROI:\\s*", "", x[low])
    title <- gsub("\\[|\\]", "", strsplit(label, "\\s+")[[1L]][1:3])

    if (what == "Basic Stats") {
      # Header line plus the four band rows.
      fields <- strsplit(x[min_ind:(min_ind + 4L)], "\t")
      x1 <- data.frame(do.call("rbind", fields), stringsAsFactors = FALSE)
      colnames(x1) <- gsub(" ", "", unlist(x1[1L, ], use.names = FALSE))
      x1 <- x1[2:5, ]
      # convert from character to numeric data type
      x1[, 2:5] <- lapply(x1[, 2:5], as.numeric)
      x1$ROI <- title[1L]
      x1$color <- title[2L]
      x1$points <- title[3L]
    } else {
      fields <- strsplit(x[min_ind:(high - 1L)], "\t")
      # Blank lines split into zero fields and carry no data.
      fields <- fields[lengths(fields) > 0L]
      x1 <- data.frame(do.call("rbind", fields), stringsAsFactors = FALSE)
      colnames(x1) <- unlist(x1[1L, ], use.names = FALSE)
      colnames(x1)[1L] <- "Histogram"

      # Every "Histogram" header row opens one per-band section; its band
      # number and bin width are repeated over all rows of that section.
      is_header <- grepl("Histogram", x1$Histogram)
      section_len <- diff(c(which(is_header), nrow(x1) + 1L))
      x1$Band <- rep(
        gsub("[Band ]", "", grep("Band", x1$Histogram, value = TRUE)),
        section_len
      )
      x1$Bin <- rep(
        gsub("[Bin= ]", "", grep("Bin", x1$Histogram, value = TRUE)),
        section_len
      )
      x1 <- x1[!is_header, ]
      x1$Histogram <- NULL

      colnames(x1) <- gsub(" ", "", colnames(x1))
      x1$ROI <- title[1L]
      x1$color <- title[2L]
      x1$points <- title[3L]
      # convert from character to numeric data type
      # (every column except ROI and color)
      x1[, c(1:7, 10)] <- lapply(x1[, c(1:7, 10)], as.numeric)
    }

    df_list[[as.character(counter)]] <- x1
  }

  df_list
}
