Parsing 在Scala中使用解析器组合器解析语句

Parsing 使用解析器组合器解析Scala语句,parsing,scala,dsl,parser-combinators,Parsing,Scala,Dsl,Parser Combinators,如何有效地解析(没有太多的代码混乱)下面这样的语句? 关键字/分隔符放置在[]内 新德里公司私人有限公司(自2009年1月至2012年1月)经理 使用解析组合符从文本中提取人名、公司名和日期范围。(预期输出显示在底部) 下面是为上述内容编写的代码- case class CompanyWithMonthDateRange(company:String, position:String, dateRange:List[MonthYear]) case class MonthY

如何有效地解析(没有太多的代码混乱)下面这样的语句? 关键字/分隔符放置在[]内

Manager, Delhi [for] The Company Pvt Ltd. [from] Jan, 2009 [to] Jan, 2012.

使用解析器组合器(parser combinators)从文本中提取人名、公司名和日期范围。(预期输出显示在底部)

下面是为上述内容编写的代码-



    /**
     * Result record produced by the parsers in this file.
     *
     * @param company   company name text
     * @param position  position/title text
     * @param dateRange month/year entries; the parsers in this file build it as List(start, end)
     */
    case class CompanyWithMonthDateRange(
      company: String,
      position: String,
      dateRange: List[MonthYear]
    )

    /**
     * A month paired with a year.
     *
     * @param month month text as it was matched (not normalised)
     * @param year  year as an integer
     */
    case class MonthYear(
      month: String,
      year: Int
    )

    /**
     * Parser-combinator grammar that reads a position, a company name and a
     * month/year range from free text and builds a [[CompanyWithMonthDateRange]].
     *
     * Whitespace skipping is disabled, so every blank must be consumed explicitly
     * through `space`.
     *
     * NOTE(review): every literal and regex below is lowercase and matched
     * case-sensitively, so input such as "Pvt" or "Jan" does not match unless it
     * is lowercased first.
     * NOTE(review): `keywords` matches prefixes, so `not(keywords)` also rejects
     * ordinary words that merely start with a keyword (e.g. "information").
     */
    object CompanyParser1 extends RegexParsers {
      override type Elem = Char
      override def skipWhitespace = false

      /** Words that must not be consumed as part of a free-text name. */
      def keywords: Parser[String] =
        "for" | "in" | "with" | "at" | "from" | "pvt" | "ltd" | "company" | "co" | "limited" | "inc" |
          "corporation" | "jan" | "feb" | "mar" | "apr" | "may" | "jun" | "jul" | "aug" | "sep" | "oct" |
          "nov" | "dec" | "to" | "till" | "until" | "upto"

      val date = ("""\d\d\d\d""".r | """\d\d""".r)
      val integer     = ("""(0|[1-9]\d*)""".r) ^^ { _.toInt }
      val comma = ("""\,""".r)
      val quote = ("""[\'\"]+""".r)
      val underscore  = ("""\_""".r)
      val dot = ("""\.""".r)
      val space = ("""\s+""".r) ^^ {case _ => ""}
      val colon = (""":""".r)
      val ampersand = ("""(\&|and)""".r)
      val hyphen = ("""\-""".r)
      val brackets = ("""[\(\)]+""".r)
      val newline = ("""[\n\r]""".r)
      val months = ("""(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)""".r)
      val toTillUntil = ("""(to|till|until|upto)""".r)
      val asWord = ("""(as)""".r)
      val fromWord = ("""from""".r)
      val forWithAt = ("""(in|for|with|at)""".r)
      // Regex alternation is ordered: "corporation" must precede "co", otherwise
      // only the "co" prefix of "corporation" would ever be matched.
      val companyExt = ("""(pvt|ltd|corporation|company|co|limited|inc)""".r)
      // `~>` discards the Unit produced by not(...), so no "~" wrapper leaks into results.
      val alphabets = not(keywords) ~> """[a-zA-Z]+""".r
      val name = not(keywords) ~> """[a-zA-Z][a-zA-Z\,\-\'\&\(\)]+\s+""".r

      /** A company suffix (pvt, ltd, ...) followed by any number of dots. */
      def possibleCompanyExts: Parser[String] = (companyExt <~ rep(dot)) ^^ { _.trim }

      /**
       * One or more non-keyword words, each optionally followed by punctuation and
       * blanks. Yields the matched text with each run of whitespace collapsed to a
       * single space.
       */
      def alphabetsExt: Parser[String] =
        rep1(alphabets ~ rep(quote | ampersand | hyphen | brackets | underscore | comma) ~ rep(space)) ^^ { tokens =>
          tokens.map { case word ~ punctuation ~ spaces =>
            word + punctuation.mkString + (if (spaces.nonEmpty) " " else "")
          }.mkString.trim
        }

      /** Free-text company name that must be followed by at least one company suffix (the suffix is dropped). */
      def companyNameExt: Parser[String] = alphabetsExt <~ rep(space) <~ rep1(possibleCompanyExts)

      def companyName: Parser[List[String]] = rep(alphabetsExt)

      def entityName: Parser[String] = rep1(alphabetsExt) ^^ { parts => parts.map(_.trim).mkString(" ") }

      /** Two- or four-digit number, then optional trailing punctuation and blanks; yields the number. */
      def dateWithEndingChars: Parser[Int] =
        (date <~ rep(comma | quote | dot | newline) <~ rep(space)) ^^ { _.toInt }

      /** Month abbreviation, then optional trailing punctuation and blanks; yields the month text. */
      def monthWithEndingChars: Parser[String] =
        months <~ rep(comma | quote | dot | newline) <~ rep(space)

      def monthWithDate: Parser[MonthYear] =
        monthWithEndingChars ~ dateWithEndingChars ^^ { case month ~ year => MonthYear(month, year) }

      /** `<month year> (to|till|until|upto) <month year>`, yielding List(start, end). */
      def monthDateRange: Parser[List[MonthYear]] =
        monthWithDate ~ rep(space) ~ toTillUntil ~ rep(space) ~ monthWithDate ^^ {
          case start ~ _ ~ _ ~ _ ~ end => List(start, end)
        }

      /** Company name followed by a date range; `position` is left empty here. */
      def companyWithMonthDateRange: Parser[CompanyWithMonthDateRange] =
        (companyNameExt ~ rep(space) ~ monthDateRange) ^^ {
          case company ~ _ ~ range => CompanyWithMonthDateRange(company = company, dateRange = range, position = "")
        }

      /** Full statement: position words, a linking word (in/for/with/at), then company and date range. */
      def positionWithCompanyWithMonthDateRange: Parser[CompanyWithMonthDateRange] =
        (rep1(name) ~ rep(space) ~ forWithAt ~ rep(space) ~ companyWithMonthDateRange) ^^ {
          case positions ~ _ ~ _ ~ _ ~ parsed => parsed.copy(position = positions.mkString(","))
        }

      /** Parses `input` as one full statement and prints either the parsed value or the failure. */
      def apply(input: String): Unit =
        parseAll(positionWithCompanyWithMonthDateRange, input) match {
          case Success(parsed, _) => println(parsed)
          case failure            => println(failure)
        }
    }

另外,如何删除上面文本中出现的不需要的“~”

谢谢,
Pawan

我写下面这段代码并不是为了解决你真正的问题,只是想把这句话解析成你提供的数据结构。我不确定它是否有用,仅供参考。

在您的 `CompanyWithMonthDateRange` 中,我不知道该把提取出的人名放在哪里,因此我把它省略了;要加上它应该很简单。

/**
 * Parses statements of the form
 * `<position>, <name> [for] <company> [from] <month>, <year> [to] <month>, <year>.`
 * into [[CompanyWithMonthDateRange]] values. The `<name>` part is read but not stored.
 *
 * NOTE(review): `Def` only stops at a period that is followed by a line break, so the
 * final statement of the input needs a newline after its closing period.
 */
object CompParser extends RegexParsers {
  import scala.util.Try

  val For = "[for]"
  val From = "[from]"
  val To = "[to]"
  val Keyword = For | From | To
  // Lazily captures free text up to (not including) the next '[' or a period
  // that is followed by optional blanks and a line break.
  val Def = """(?m)(?<=^|\]).*?(?=\[|(\.\s*[\n\r]+))""".r
  // Statement terminator: a literal period. The dot is escaped because a bare `.`
  // would accept any character.
  val End = """\.""".r

  /** Splits "a,b" into (a, b); any other number of comma-separated parts yields ("", ""). */
  private def splitPair(text: String): (String, String) =
    text.split(",") match {
      case Array(first, second) => (first, second)
      case _                    => ("", "")
    }

  /** Builds a MonthYear from (month, year-text); None when the year text is not an integer. */
  private def toMonthYear(parts: (String, String)): Option[MonthYear] =
    Try(parts._2.trim.toInt).toOption.map(year => MonthYear(parts._1, year))

  /**
   * One `[keyword] text` fragment, tagged with a map key:
   *  - `[for]`  -> "pos&com": (position taken from the text before the keyword, company text)
   *  - `[from]` -> "from":    (month, year-text)
   *  - `[to]`   -> "to":      (month, year-text)
   */
  val Construct: Parser[(String, (String, String))] = opt(Def) ~ Keyword ~ Def ^^ {
    case prefix ~ `For` ~ company => ("pos&com", (splitPair(prefix.getOrElse(""))._1, company))
    case _ ~ `From` ~ text        => ("from", splitPair(text))
    case _ ~ `To` ~ text          => ("to", splitPair(text))
  }

  /**
   * A run of constructs. Yields Some(record) only when the `[for]`, `[from]` and `[to]`
   * fragments are all present and both years are integers; otherwise None.
   */
  val Statement: Parser[Option[CompanyWithMonthDateRange]] = rep(Construct) ^^ { constructs =>
    val fields = constructs.toMap
    for {
      posCom <- fields.get("pos&com")
      from   <- fields.get("from").flatMap(toMonthYear)
      to     <- fields.get("to").flatMap(toMonthYear)
    } yield CompanyWithMonthDateRange(company = posCom._2, position = posCom._1, dateRange = List(from, to))
  }

  val Statements: Parser[List[Option[CompanyWithMonthDateRange]]] = rep(Statement <~ End)

  /** Parses `in` as a sequence of statements and prints the parsed list, or the failure. */
  def apply(in: String): Unit =
    parseAll(Statements, in) match {
      case Success(result, _) => println(result)
      case failure            => println(failure)
    }
}
输出为: inStr1:

List(Some(CompanyWithMonthDateRange(The Company Pvt Ltd. ,Manager,List(MonthYear(Jan,2009), MonthYear(Jan,2012)))))

inStr2:

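List(Some(CompanyWithMonthDateRange(The Company Pvt Ltd. ,Manager,List(MonthYear(Jan,2009), MonthYear(Jan,2012)))), Some(CompanyWithMonthDateRange(The Company Pvt Ltd. ,Employee,List(MonthYear(Feb,2010), MonthYear(Jun,2012)))), Some(CompanyWithMonthDateRange(The Company Pvt Ltd. ,HR,List(MonthYear(May,2010), MonthYear(July,2012)))))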

/**
 * Parses statements of the form
 * `<position>, <name> [for] <company> [from] <month>, <year> [to] <month>, <year>.`
 * into [[CompanyWithMonthDateRange]] values. The `<name>` part is read but not stored.
 *
 * NOTE(review): `Def` only stops at a period that is followed by a line break, so the
 * final statement of the input needs a newline after its closing period.
 */
object CompParser extends RegexParsers {
  import scala.util.Try

  val For = "[for]"
  val From = "[from]"
  val To = "[to]"
  val Keyword = For | From | To
  // Lazily captures free text up to (not including) the next '[' or a period
  // that is followed by optional blanks and a line break.
  val Def = """(?m)(?<=^|\]).*?(?=\[|(\.\s*[\n\r]+))""".r
  // Statement terminator: a literal period. The dot is escaped because a bare `.`
  // would accept any character.
  val End = """\.""".r

  /** Splits "a,b" into (a, b); any other number of comma-separated parts yields ("", ""). */
  private def splitPair(text: String): (String, String) =
    text.split(",") match {
      case Array(first, second) => (first, second)
      case _                    => ("", "")
    }

  /** Builds a MonthYear from (month, year-text); None when the year text is not an integer. */
  private def toMonthYear(parts: (String, String)): Option[MonthYear] =
    Try(parts._2.trim.toInt).toOption.map(year => MonthYear(parts._1, year))

  /**
   * One `[keyword] text` fragment, tagged with a map key:
   *  - `[for]`  -> "pos&com": (position taken from the text before the keyword, company text)
   *  - `[from]` -> "from":    (month, year-text)
   *  - `[to]`   -> "to":      (month, year-text)
   */
  val Construct: Parser[(String, (String, String))] = opt(Def) ~ Keyword ~ Def ^^ {
    case prefix ~ `For` ~ company => ("pos&com", (splitPair(prefix.getOrElse(""))._1, company))
    case _ ~ `From` ~ text        => ("from", splitPair(text))
    case _ ~ `To` ~ text          => ("to", splitPair(text))
  }

  /**
   * A run of constructs. Yields Some(record) only when the `[for]`, `[from]` and `[to]`
   * fragments are all present and both years are integers; otherwise None.
   */
  val Statement: Parser[Option[CompanyWithMonthDateRange]] = rep(Construct) ^^ { constructs =>
    val fields = constructs.toMap
    for {
      posCom <- fields.get("pos&com")
      from   <- fields.get("from").flatMap(toMonthYear)
      to     <- fields.get("to").flatMap(toMonthYear)
    } yield CompanyWithMonthDateRange(company = posCom._2, position = posCom._1, dateRange = List(from, to))
  }

  val Statements: Parser[List[Option[CompanyWithMonthDateRange]]] = rep(Statement <~ End)

  /** Parses `in` as a sequence of statements and prints the parsed list, or the failure. */
  def apply(in: String): Unit =
    parseAll(Statements, in) match {
      case Success(result, _) => println(result)
      case failure            => println(failure)
    }
}
/** Demo entry point: runs CompParser over a one-statement and a three-statement sample. */
object TestP extends App {
  // Single statement.
  val inStr1 = """ 
    Manager, Delhi [for] The Company Pvt Ltd. [from] Jan, 2009 [to] Jan, 2012. 
   """
  // Three statements, one per line.
  val inStr2 = """ 
    Manager, Delhi [for] The Company Pvt Ltd. [from] Jan, 2009 [to] Jan, 2012.
    Employee, Kate [for] The Company Pvt Ltd. [from] Feb, 2010 [to] Jun, 2012.  
    HR, Jane       [for] The Company Pvt Ltd. [from] May, 2010 [to] July, 2012. 
    """
  // Feed each sample to the parser, in order.
  Seq(inStr1, inStr2).foreach(CompParser(_))
}