Warning: file_get_contents(/data/phpspider/zhask/data//catemap/4/regex/16.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181

Warning: file_get_contents(/data/phpspider/zhask/data//catemap/4/r/64.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Regex R:构建文本分类器_Regex_R_Rules_Document Classification - Fatal编程技术网

Regex R:构建文本分类器

Regex R:构建文本分类器,regex,r,rules,document-classification,Regex,R,Rules,Document Classification,我的内容集必须根据一些规则进行分类 样本数据: 1 chin jeffrey hong kong wednesday october global business reporting cc subramanian raghuveer kumar m santhosh antoo ramesh subject request obtain global icis data dear team appreciate can distribute mon

我的内容集必须根据一些规则进行分类

样本数据:

    1     chin jeffrey hong kong  wednesday  october     global business reporting cc subramanian raghuveer   kumar m santhosh   antoo ramesh subject request  obtain global icis data dear team appreciate   can distribute   monthly basis  latest global icis data    ramesh antoo upon  availability regards jeffrey chin associate business risk strategy  efficiency brse  asia international  institutional banking australia  new zealand banking group limited f three exchange square  connaught place central hong kong phone    voice net   email jeffreychinanzcom brse   key business risk interface team within iib providing global support  strategic perspectives  policy procedures  reporting  includes risk appetite credit process quality assurance interlock  key stakeholders  well  iib support  key projects   project glue cacheorion working groups efficiency initiatives  business risk forums p please consider  environment  printing  email
    2      beren stuart vanuatu  monday  october     g s venkatesh ramesh sandeep talanki   h r nagaraj subject please approve  qlikview gpa access hi please process  following form  gpa access please email  requestor  line manager   access  granted raj can  please add  given  user  qlikview workgroup    gpa access form  requestors name lim tek kon vanuatu address lini high way port vila efate title  relationship manager emerging corporates employee id  lan id limtk bsbcc  authorising manager beren stuart vanuatu  yes   read  use  gpa dashboard business technical reason na  
    3     kumar m santhosh   behalf  relationshipbankingfinancesupport  friday  october     g s venkatesh cc global business reporting subject fw please approve  qlikview gpa access regards santhosh   faunt daniel png  wednesday  october     relationshipbankingfinancesupport cc amet sova subject fw please approve  qlikview gpa access hi   unable  approve  excel due  macro issues please process   amet sova  monday  october     faunt daniel png subject please approve  qlikview gpa access hello can  please review  attached form  click line manager approval  approve 
    4     thomson owen tonga  thursday  october     g s venkatesh ramesh sandeep talanki   h r nagaraj subject please approve  qlikview gpa access hi please process  following form  gpa access please email  requestor  line manager   access  granted raj can  please add  given  user  qlikview workgroup    gpa access form  requestors name hia viliami address head office fakafanua centre maufanga vuna road nukualofa tongatapu tonga nukualofa tongatapu title  nfc  amu manager employee id  lan id hiav bsbcc   authorising manager thomson owen tonga  yes   read  use  gpa dashboard business technical reason  
    5     kumar rajesh fiji  tuesday  october     g s venkatesh ramesh sandeep talanki   h r nagaraj subject please approve  qlikview gpa access hi please process  following form  gpa access please email  requestor  line manager   access  granted raj can  please add  given  user  qlikview workgroup    gpa access form  requestors
 #Rule Classifier-----
# Asker's first attempt at a rule-based classifier, kept exactly as posted.
# It is meant to return "Access" or "Report" for the input, but it does not
# work as written; the comments below point out why.
# NOTE(review): `x` is not defined inside this function, so it would have to
# exist in an enclosing environment for the comparisons to run at all.
rule <- function(out)
{
  # `for (i in out)` binds `i` to each element of `out`, not to a position.
  for(i in out)
  {
    # `out` is subscripted with that element; this `out[i]` call is where the
    # reported "invalid subscript type 'list'" error comes from.
    for(j in out[i])
    {
      # The same `x[j]` is compared with "Access" here and with
      # "gpa"/"qlikview" just below, so for a single value the inner test
      # cannot be TRUE once the outer one is.
      if(x[j]=="Access")
      {
        if(x[j]=="gpa" | x[j]=="qlikview")
        {
          return("Access")
        }
      }
      # Same pattern for the report rule: "Report" and "pfi"/"data" are
      # tested against the same `x[j]`.
      else if(x[j]=="Report")
      {
        if(x[j]=="pfi" | x[j]=="data" )
        {
          return("Report")
        }
      }
    }
  }
  # No explicit return is reached when no rule fires; the function then
  # yields the value of the `for` loop, which is NULL.
}
这只是一行值,在实时情况下,我必须遍历500-10000行,这里我提取了我要使用的单词

> O
$text
$text[[1]]
 [1] "qlikview" "gpa"      "access"   "gpa"      "access"   "access"   "qlikview" "gpa"      "access"  "gpa"     

$text[[2]]
 [1] "report"   "qlikview" "gpa"      "access"   "qlikview" "gpa"      "access"   "qlikview" "gpa"     
[10] "access"  

$text[[3]]
 [1] "qlikview" "gpa"      "access"   "gpa"      "access"   "access"   "qlikview" "gpa"      "access"  
[10] "gpa"     

$text[[4]]
 [1] "qlikview" "gpa"      "access"   "gpa"      "access"   "access"   "qlikview" "gpa"      "access"  
[10] "gpa"     

$text[[5]]
 [1] "report"   "qlikview" "gpa"      "access"   "access"   "gpa"      "access"   "qlikview" "gpa" "access"   "access"   "gpa"      "qlikview" "gpa"      "access"   "qlikview" "gpa"  "access"  

$text[[6]]
 [1] "report"   "qlikview" "access"   "access"   "report"   "qlikview" "access"   "access"   "gpa"     
[10] "qlikview" "access"   "access"   "qlikview" "access"   "access"  

$text[[7]]
 [1] "report" "report" "access" "access" "report" "report" "report" "report" "report" "report" "data"  "data"   "report" "access" "report" "report"

$text[[8]]
[1] "report"   "qlikview" "gpa"      "access"   "gpa"      "access"  

$text[[9]]
 [1] "report" "gpa"    "access" "access" "gpa"    "gpa"    "gpa"    "gpa"    "gpa"    "access" "gpa"   "gpa"    "gpa"    "report"

$text[[10]]
 [1] "report" "gpa"    "gpa"    "access" "gpa"    "access" "gpa"    "access" "gpa"    "gpa"    "report" "gpa"    "gpa"    "access" "gpa"    "gpa"    "gpa"    "gpa"    "gpa"  
现在我必须使用if条件在此基础上构建规则,同时,如何表示列表以循环每个文本,并检查是否存在“access”,然后检查是否存在“gpa”或“qlikview”,然后作为行值的访问返回(示例数据) 否则,如果存在“报告”,则检查“pfi”或“区域”返回报告

我已将上面的列表转换为如下所示的数据帧

代码:

预期产出

1      Access
2       Access
3       Access
4       Access
5       Access
6       Access
7       Report/Data
8       Access
9       Access
10      Access
11     Report/Data
12     Report/Data
13     Report/Data
14     Report/Data

dput(O$text[1:10])

list(c("qlikview", "gpa", "access", "gpa", "access", "access", 
"qlikview", "gpa", "access", "gpa"), c("report", "qlikview", 
"gpa", "access", "qlikview", "gpa", "access", "qlikview", "gpa", 
"access"), c("qlikview", "gpa", "access", "gpa", "access", "access", 
"qlikview", "gpa", "access", "gpa"), c("qlikview", "gpa", "access", 
"gpa", "access", "access", "qlikview", "gpa", "access", "gpa"
), c("report", "qlikview", "gpa", "access", "access", "gpa", 
"access", "qlikview", "gpa", "access", "access", "gpa", "qlikview", 
"gpa", "access", "qlikview", "gpa", "access"), c("report", "qlikview", 
"access", "access", "report", "qlikview", "access", "access", 
"gpa", "qlikview", "access", "access", "qlikview", "access", 
"access"), c("report", "report", "access", "access", "report", 
"report", "report", "report", "report", "report", "data", "data", 
"report", "access", "report", "report"), c("report", "qlikview", 
"gpa", "access", "gpa", "access"), c("report", "gpa", "access", 
"access", "gpa", "gpa", "gpa", "gpa", "gpa", "access", "gpa", 
"gpa", "gpa", "report"), c("report", "gpa", "gpa", "access", 
"gpa", "access", "gpa", "access", "gpa", "gpa", "report", "gpa", 
"gpa", "access", "gpa", "gpa", "gpa", "gpa", "gpa"))
rule(out)
#this is throwing some error -  Error in `[.default`(out, i) : invalid subscript type 'list' 

我知道这很幼稚,但我是新手,如果我在某些地方出错,请纠正我

您似乎认为 `for ... in` 循环会按整数索引进行迭代。实际上,它直接在列表中的对象上进行迭代,因此当 `out` 是一个列表时,`i` 本身就是列表元素,而您却把
i
当作索引来使用。
?lapply
将向您展示处理列表的更好方法

# Keyword tokens extracted from each message: one character vector per row.
text <- list(
  c(
    "qlikview", "gpa", "access", "gpa", "access", "access", "qlikview",
    "gpa", "access", "gpa"
  ),
  c(
    "report", "qlikview", "gpa", "access", "qlikview", "gpa", "access",
    "qlikview", "gpa", "access"
  ),
  c(
    "qlikview", "gpa", "access", "gpa", "access", "access", "qlikview",
    "gpa", "access", "gpa"
  ),
  c(
    "qlikview", "gpa", "access", "gpa", "access", "access", "qlikview",
    "gpa", "access", "gpa"
  ),
  c(
    "report", "qlikview", "gpa", "access", "access", "gpa", "access",
    "qlikview", "gpa", "access", "access", "gpa", "qlikview", "gpa",
    "access", "qlikview", "gpa", "access"
  ),
  c(
    "report", "qlikview", "access", "access", "report", "qlikview",
    "access", "access", "gpa", "qlikview", "access", "access", "qlikview",
    "access", "access"
  ),
  c(
    "report", "report", "access", "access", "report", "report", "report",
    "report", "report", "report", "data", "data", "report", "access",
    "report", "report"
  ),
  c("report", "qlikview", "gpa", "access", "gpa", "access"),
  c(
    "report", "gpa", "access", "access", "gpa", "gpa", "gpa", "gpa", "gpa",
    "access", "gpa", "gpa", "gpa", "report"
  ),
  c(
    "report", "gpa", "gpa", "access", "gpa", "access", "gpa", "access",
    "gpa", "gpa", "report", "gpa", "gpa", "access", "gpa", "gpa", "gpa",
    "gpa", "gpa"
  )
)

# cbind() on the list gives a one-column list-matrix named "text"; converting
# it yields a data frame whose `text` column holds one token vector per row.
O <- as.data.frame(cbind(text))

# Label every row of `out$text` by keyword rules.
#
# `out` must provide a `text` element holding one character vector of tokens
# per row. The result is a character vector with one label per row:
#   "Access"      - "access" together with "gpa" or "qlikview"
#   "Report/Data" - otherwise, "report" together with "pfi" or "data"
#   "NA"          - neither rule matched (the literal string, not NA)
rule <- function(out) {
  label_tokens <- function(tokens) {
    has <- function(word) word %in% tokens

    # The access rule is checked first, so it wins when both rules match.
    if (has("access") && (has("gpa") || has("qlikview"))) {
      "Access"
    } else if (has("report") && (has("pfi") || has("data"))) {
      "Report/Data"
    } else {
      "NA"
    }
  }

  vapply(out$text, label_tokens, character(1))
}

rule(O)
#  [1] "Access"      "Access"      "Access"      "Access"      "Access"      "Access"      "Report/Data" "Access"     
#  [9] "Access"      "Access"

如果您想知道如何去掉列表类型 c() 这个问题的答案,请将其作为单独的问题发布:运行 dput(O$text[1:10]),把输出粘贴到您的问题中,并说明您希望输出是什么样子。如果两条规则都不适用,我让函数执行 return("NA")
现在我得到的都是NA,如果没有给出,那么
被抛出为outputI我尝试用上面的代码(我已经给出了问题)修改它,但是我只得到了
Access
作为输出,因为它不转到else语句。文本“Access”在您的所有示例数据中,所以为什么您对它成功找到它感到惊讶?您应该在问题中添加您的预期输出。这是您想要的吗?否则,您将需要对输入数据运行dput。我使用了dput,但在dataframe中得到了相同的列表类型
# Keyword tokens extracted from each message: one character vector per row.
text <- list(
  c(
    "qlikview", "gpa", "access", "gpa", "access", "access", "qlikview",
    "gpa", "access", "gpa"
  ),
  c(
    "report", "qlikview", "gpa", "access", "qlikview", "gpa", "access",
    "qlikview", "gpa", "access"
  ),
  c(
    "qlikview", "gpa", "access", "gpa", "access", "access", "qlikview",
    "gpa", "access", "gpa"
  ),
  c(
    "qlikview", "gpa", "access", "gpa", "access", "access", "qlikview",
    "gpa", "access", "gpa"
  ),
  c(
    "report", "qlikview", "gpa", "access", "access", "gpa", "access",
    "qlikview", "gpa", "access", "access", "gpa", "qlikview", "gpa",
    "access", "qlikview", "gpa", "access"
  ),
  c(
    "report", "qlikview", "access", "access", "report", "qlikview",
    "access", "access", "gpa", "qlikview", "access", "access", "qlikview",
    "access", "access"
  ),
  c(
    "report", "report", "access", "access", "report", "report", "report",
    "report", "report", "report", "data", "data", "report", "access",
    "report", "report"
  ),
  c("report", "qlikview", "gpa", "access", "gpa", "access"),
  c(
    "report", "gpa", "access", "access", "gpa", "gpa", "gpa", "gpa", "gpa",
    "access", "gpa", "gpa", "gpa", "report"
  ),
  c(
    "report", "gpa", "gpa", "access", "gpa", "access", "gpa", "access",
    "gpa", "gpa", "report", "gpa", "gpa", "access", "gpa", "gpa", "gpa",
    "gpa", "gpa"
  )
)

# cbind() on the list gives a one-column list-matrix named "text"; converting
# it yields a data frame whose `text` column holds one token vector per row.
O <- as.data.frame(cbind(text))

# Label every row of `out$text` by keyword rules.
#
# `out` must provide a `text` element holding one character vector of tokens
# per row. The result is a character vector with one label per row:
#   "Access"      - "access" together with "gpa" or "qlikview"
#   "Report/Data" - otherwise, "report" together with "pfi" or "data"
#   "NA"          - neither rule matched (the literal string, not NA)
rule <- function(out) {
  label_tokens <- function(tokens) {
    has <- function(word) word %in% tokens

    # The access rule is checked first, so it wins when both rules match.
    if (has("access") && (has("gpa") || has("qlikview"))) {
      "Access"
    } else if (has("report") && (has("pfi") || has("data"))) {
      "Report/Data"
    } else {
      "NA"
    }
  }

  vapply(out$text, label_tokens, character(1))
}

rule(O)
#  [1] "Access"      "Access"      "Access"      "Access"      "Access"      "Access"      "Report/Data" "Access"     
#  [9] "Access"      "Access"