文本的R特征提取_R_Text_Nlp_Text Mining_Feature Extraction

文本的R特征提取

r text nlp

文本的R特征提取,r,text,nlp,text-mining,feature-extraction,R,Text,Nlp,Text Mining,Feature Extraction,我的问题是关于文本挖掘和文本处理我想从我的文本构建一个数据框架我的数据是： text <- c("#*TeX: The Program, #@Donald E. Knuth, #t1986, #c, #index68, "" #*Foundations of Databases., #@Serge Abiteboul,Richard Hull,Victor Vianu, #t1995, #c, #index69, #%1118192, #%189, #%1088975, #%97127

我的问题是关于文本挖掘和文本处理

我想根据我的文本构建一个数据框（data frame）

我的数据是：

# Sample input: one newline-delimited string holding two bibliographic
# records separated by a blank line. Every record starts with a "#*" title
# line; the other lines carry one tagged field each ("#@", "#t", "#c",
# "#index", "#%", "#!"), and "#%" may repeat within a record.
# The lines carry no trailing separators, so removing the tag from a line
# leaves the bare field value.
text <- c("#*TeX: The Program
#@Donald E. Knuth
#t1986
#c
#index68

#*Foundations of Databases.
#@Serge Abiteboul,Richard Hull,Victor Vianu
#t1995
#c
#index69
#%1118192
#%189
#%1088975
#%971271
#%832272
#!From the Book: This book will teach you how to write specifications of computer systems, using the language TLA+.")

新的改进版：
# Cut the raw input into one string per record: split at every newline that
# is immediately followed by a "#*" title tag, then echo the pieces.
text.n <- strsplit(text, "\n(?=#\\*)", perl = TRUE)[[1]]
text.n

# Each record becomes a character vector of its individual lines.
text.s <- lapply(text.n, function(record) {
  strsplit(record, "\n")[[1]]
})

# Anchored tag prefix for every output column.
patterns <- list(
  title = "^#\\*",
  autors = "^#@",
  year = "^#t",
  revue = "^#c",
  id_paper = "^#index",
  id_ref = "^#%",
  abstract = "^#!"
)

# For each record and each tag: keep the lines that start with the tag,
# remove the tag, and join repeated values with commas. A tag that never
# occurs yields "".
tex.l <- lapply(text.s, function(record_lines) {
  lapply(patterns, function(tag) {
    tagged <- record_lines[grepl(tag, record_lines)]
    paste(sub(tag, "", tagged), collapse = ",")
  })
})

# Flatten record by record into a matrix (one row per record), then convert
# to a data frame whose columns are named after the tags.
tex.m <- matrix(unlist(tex.l), ncol = length(tex.l[[1]]), byrow = TRUE)
tex.df <- as.data.frame(tex.m, stringsAsFactors = FALSE)
names(tex.df) <- names(patterns)

str(tex.df)

# 'data.frame': 2 obs. of  7 variables:
# $ title   : chr "TeX: The Program" "Foundations of Databases."
# $ autors  : chr "Donald E. Knuth" "Serge Abiteboul,Richard Hull,Victor Vianu"
# $ year    : chr "1986" "1995"
# $ revue   : chr "" ""
# $ id_paper: chr "68" "69"
# $ id_ref  : chr "" "1118192,189,1088975,971271,832272"
# $ abstract: chr "" "From the Book: This book will teach you how to write 
#                     specifications of computer systems, using the language TLA+."

这里有一个基于 @AkselA 答案的解决方案。这些内容无法只在评论中说明，因此我另外写了一个答案（我知道还可以把它格式化得更漂亮…）
到目前为止，您尝试了什么？我尝试了使用 grep 进行提取，但无法将 id_ref 合并到同一行中。请参阅下面的回答…您可以使用 `list()` 或 `paste0(..., collapse = ",")` 连接多个元素并将它们存储为单个条目。数据框的行数将等于标题的数量，因为每篇文章都必须有一个标题。@ManuelBickel：但我们最终只会得到一个向量。@Cincinatus：id_ref 会与之冲突。@ManuelBickel：没问题，只是停下来重新看一下。谢谢你的正则表达式模式，我原来得到的不是最理想的。非常感谢你，你的答案是正确的。但我只能采纳一个答案。谢谢你，你是个天才。
# Build a character matrix `df` with one row per record and one column per
# field. A record starts at each "#*" title line and runs up to the next one.
# NOTE(review): assumes `text` is a character vector holding one input line
# per element (e.g. the result of readLines()) - confirm against the caller.
coln <- c("title", "authors", "year", "revue", "id_paper", "id_ref", "abstract")

# Positions in `text` of the lines that carry each tag.
# NOTE(review): only the title pattern is anchored; the others also match a
# tag that appears in the middle of a line - confirm this is intended.
title_index <- grep("^#[*]", text)
authors_index <- grep("#@", text)
year_index <- grep("#t", text)
revue_index <- grep("#c", text)
id_paper_index <- grep("#index", text)
id_ref_index <- grep("#%", text)
id_refindex <- id_ref_index # previous (misspelled) name, kept as an alias
abstract_index <- grep("#!", text)

df <- matrix(NA, nrow = length(title_index), ncol = length(coln))
colnames(df) <- coln

stoc_index <- grep("#cSTOC", text)
sigir_index <- grep("#cSIGIR", text)

# Record number of every line: how many title lines occur at or before it
# (0 for lines that precede the first title).
record_of_line <- findInterval(seq_along(text), title_index)

field_index <- list(
  title = title_index,
  authors = authors_index,
  year = year_index,
  revue = revue_index,
  id_paper = id_paper_index,
  id_ref = id_ref_index,
  abstract = abstract_index
)

# Fill one column per field. Lines are stored as found (tag included).
# Grouping by record instead of by position keeps rows aligned when a field
# is missing from a record (cell stays NA) or occurs several times, as "#%"
# does (values are joined with commas into a single cell).
for (field in names(field_index)) {
  line_pos <- field_index[[field]]
  line_pos <- line_pos[record_of_line[line_pos] > 0]
  if (length(line_pos) == 0) {
    next
  }
  joined <- tapply(
    text[line_pos], record_of_line[line_pos], paste,
    collapse = ","
  )
  df[as.integer(names(joined)), field] <- as.vector(joined)
}

# One string per record: the input is split at each newline that directly
# precedes a "#*" title tag. The result is printed for inspection.
text.n <- strsplit(text, "\n(?=#\\*)", perl = TRUE)[[1]]
text.n

# strsplit() is vectorised, so this yields one character vector of lines
# per record.
text.s <- strsplit(text.n, "\n")

# Output column name -> regex for the tag that opens the matching lines.
patterns <- list(
  title    = "^#\\*",
  autors   = "^#@",
  year     = "^#t",
  revue    = "^#c",
  id_paper = "^#index",
  id_ref   = "^#%",
  abstract = "^#!"
)

# Per record, per tag: select the tagged lines, drop the tag itself and
# collapse multiple hits into one comma-separated string ("" if none).
tex.l <- lapply(text.s, function(lines) {
  lapply(patterns, function(tag_rx) {
    hits <- lines[grep(tag_rx, lines)]
    values <- sub(tag_rx, "", hits)
    paste(values, collapse = ",")
  })
})

# Row-wise matrix of the extracted values, converted to a data frame with
# the tag names as column names.
tex.m <- matrix(unlist(tex.l), ncol = length(tex.l[[1]]), byrow = TRUE)
tex.df <- setNames(
  as.data.frame(tex.m, stringsAsFactors = FALSE),
  names(patterns)
)

str(tex.df)

# 'data.frame': 2 obs. of  7 variables:
# $ title   : chr "TeX: The Program" "Foundations of Databases."
# $ autors  : chr "Donald E. Knuth" "Serge Abiteboul,Richard Hull,Victor Vianu"
# $ year    : chr "1986" "1995"
# $ revue   : chr "" ""
# $ id_paper: chr "68" "69"
# $ id_ref  : chr "" "1118192,189,1088975,971271,832272"
# $ abstract: chr "" "From the Book: This book will teach you how to write 
#                     specifications of computer systems, using the language TLA+."

# Split the input into individual documents: cut at every newline that is
# immediately followed by a "#*" title tag (lookahead, hence perl = TRUE).
text.s <- strsplit(text, "\n(?=#\\*)", perl = TRUE)[[1]]

#' Extract tagged fields from the lines of one document
#'
#' @param x Character vector holding the lines of a single document.
#' @param patterns Named list of regular expressions, one per field. The
#'   names become the names of the result.
#' @return A named list with one element per pattern: `NA` when no line
#'   matches, the matching line (tag included) when exactly one line matches,
#'   and a one-element list wrapping all matching lines when several match.
# NOTE(review): the default patterns start with `^*` rather than `^`; this
# looks like a typo but is part of the function's defaults - confirm before
# changing.
extract_info <- function(x, patterns = list(title = "^*#\\*",
                                            autors = "^*#@",
                                            year = "^*#t",
                                            revue = "^*#c",
                                            id_paper = "^*#index",
                                            id_ref = "^*#%",
                                            abstract = "^*#!")) {
  lapply(patterns, function(p) {
    # TRUE is spelled out: `T` is an ordinary variable that can be reassigned.
    extract <- grep(p, x, value = TRUE)
    # Shape the result by the number of hits so that every field occupies
    # exactly one cell when the per-document lists are row-bound later.
    if (length(extract) > 1) {
      list(extract)
    } else if (length(extract) == 0) {
      NA
    } else {
      extract
    }
  })
}

# Break every document into its lines (strsplit() handles the whole vector
# at once), run extract_info() on each set of lines, and stack the resulting
# per-document lists as the rows of one matrix.
do.call(rbind, lapply(strsplit(text.s, "\\n"), extract_info))

# title                         autors                                        year     revue id_paper   id_ref
# [1,] "#*TeX: The Program"          "#@Donald E. Knuth"                           "#t1986" "#c"  "#index68" NA    
# [2,] "#*Foundations of Databases." "#@Serge Abiteboul,Richard Hull,Victor Vianu" "#t1995" "#c"  "#index69" List,1
# abstract                                                                                                         
# [1,] NA                                                                                                               
# [2,] "#!From the Book: This book will teach you how to write specifications of computer systems, using th" [truncated]