Parsing 使用解析器组合器解析Scala语句
如何有效地解析(避免代码过于杂乱)下面这样的语句? 关键字/分隔符放置在 [] 内: Manager, Delhi [for] The Company Pvt Ltd. [from] Jan, 2009 [to] Jan, 2012. 使用解析器组合子(parser combinators)从文本中提取人名、公司名和日期范围。(预期输出显示在底部) 下面是为上述内容编写的代码-Parsing 使用解析器组合器解析Scala语句,parsing,scala,dsl,parser-combinators,Parsing,Scala,Dsl,Parser Combinators,如何有效地解析(避免代码过于杂乱)下面这样的语句? 关键字/分隔符放置在 [] 内: Manager, Delhi [for] The Company Pvt Ltd. [from] Jan, 2009 [to] Jan, 2012. 使用解析器组合子(parser combinators)从文本中提取人名、公司名和日期范围。(预期输出显示在底部) 下面是为上述内容编写的代码- case class CompanyWithMonthDateRange(company:String, position:String, dateRange:List[MonthYear]) case class MonthY
/**
 * A calendar month paired with a year, e.g. `MonthYear("Jan", 2009)`.
 *
 * @param month month name exactly as it was captured from the input text
 * @param year  numeric year
 */
case class MonthYear(month: String, year: Int)

/**
 * One extracted employment record.
 *
 * @param company   company name as captured from the input text
 * @param position  job title held at the company (may be empty when not captured)
 * @param dateRange the parsers in this file fill this with two entries: start, then end
 */
case class CompanyWithMonthDateRange(
  company: String,
  position: String,
  dateRange: List[MonthYear]
)
/**
 * Combinator grammar for free-text employment lines. The top-level production
 * (`positionWithCompanyWithMonthDateRange`) expects, in order:
 *
 *   position words, one of in/for/with/at, company words, one or more company
 *   suffixes (pvt, ltd, ...), `<month> <year>`, one of to/till/until/upto,
 *   `<month> <year>`.
 *
 * Whitespace skipping is disabled, so every production consumes its own separators.
 * All literals are lower-case. NOTE(review): input is presumably lower-cased before
 * it reaches `apply` — confirm against the caller.
 */
object CompanyParser1 extends RegexParsers {
  override type Elem = Char

  // Whitespace is significant: separators are consumed explicitly through `space`.
  override def skipWhitespace = false

  // Delimiter words that must never be swallowed as part of a name.
  private val keywordWords = List(
    "for", "in", "with", "at", "from", "pvt", "ltd", "company", "co", "limited", "inc",
    "corporation", "jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct",
    "nov", "dec", "to", "till", "until", "upto")

  // The negative lookahead makes a keyword match only when no letter follows it.
  // Without it, `not(keywords)` rejects every word that merely starts with a keyword
  // ("india", "tom", "attorney", ...). A following digit still counts ("jan2009").
  private val keywordPattern = keywordWords.mkString("(?:", "|", ")(?![a-zA-Z])").r

  /** Matches one delimiter keyword as a whole word. */
  def keywords: Parser[String] = keywordPattern

  val date = ("""\d\d\d\d""".r | """\d\d""".r)
  val integer = ("""(0|[1-9]\d*)""".r) ^^ { _.toInt }
  val comma = ("""\,""".r)
  val quote = ("""[\'\"]+""".r)
  val underscore = ("""\_""".r)
  val dot = ("""\.""".r)
  val space = ("""\s+""".r) ^^ { case _ => "" }
  val colon = (""":""".r)
  val ampersand = ("""(\&|and)""".r)
  val hyphen = ("""\-""".r)
  val brackets = ("""[\(\)]+""".r)
  val newline = ("""[\n\r]""".r)
  val months = ("""(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)""".r)
  val toTillUntil = ("""(to|till|until|upto)""".r)
  val asWord = ("""(as)""".r)
  val fromWord = ("""from""".r)
  val forWithAt = ("""(in|for|with|at)""".r)
  // "corporation" must precede "co"; otherwise "co" wins and leaves "rporation" behind.
  val companyExt = ("""(pvt|ltd|company|corporation|co|limited|inc)""".r)

  // Both parsers yield `() ~ text` because `not` succeeds with Unit; consumers below
  // pattern-match the text out instead of calling toString on the pair.
  val alphabets = not(keywords) ~ """[a-zA-Z]+""".r
  val name = not(keywords) ~ """[a-zA-Z][a-zA-Z\,\-\'\&\(\)]+\s+""".r

  /** One company suffix plus any trailing dots and whitespace, so runs like "pvt ltd." chain. */
  def possibleCompanyExts: Parser[String] =
    (companyExt <~ rep(dot) <~ rep(space)) ^^ (_.trim)

  /** One or more non-keyword words (each with directly attached punctuation), joined by single spaces. */
  def alphabetsExt: Parser[String] = {
    val punctuation = quote | ampersand | hyphen | brackets | underscore | comma
    rep1((alphabets ~ rep(punctuation)) <~ rep(space)) ^^ { chunks =>
      chunks.map { case (_ ~ word) ~ marks => word + marks.mkString }.mkString(" ")
    }
  }

  /** Company words followed by at least one suffix; the suffixes are matched but not returned. */
  def companyNameExt: Parser[String] =
    alphabetsExt <~ rep(space) <~ rep1(possibleCompanyExts)

  def companyName: Parser[List[String]] = rep(alphabetsExt)

  def entityName: Parser[String] =
    rep1(alphabetsExt) ^^ (_.map(_.trim).mkString(" "))

  /** A 2- or 4-digit number followed by optional punctuation and whitespace. */
  def dateWithEndingChars: Parser[Int] =
    (date <~ rep(comma | quote | dot | newline) <~ rep(space)) ^^ (_.toInt)

  /** A month abbreviation followed by optional punctuation and whitespace. */
  def monthWithEndingChars: Parser[String] =
    months <~ rep(comma | quote | dot | newline) <~ rep(space)

  def monthWithDate: Parser[MonthYear] =
    monthWithEndingChars ~ dateWithEndingChars ^^ { case month ~ year => MonthYear(month, year) }

  /** `<month> <year> (to|till|until|upto) <month> <year>` as a two-element list: start, end. */
  def monthDateRange: Parser[List[MonthYear]] =
    (monthWithDate <~ rep(space) <~ toTillUntil <~ rep(space)) ~ monthWithDate ^^ {
      case start ~ end => List(start, end)
    }

  def companyWithMonthDateRange: Parser[CompanyWithMonthDateRange] =
    (companyNameExt <~ rep(space)) ~ monthDateRange ^^ {
      case company ~ range =>
        CompanyWithMonthDateRange(company = company, dateRange = range, position = "")
    }

  /** Full line: position words, a preposition, then company and date range. */
  def positionWithCompanyWithMonthDateRange: Parser[CompanyWithMonthDateRange] =
    (rep1(name) <~ rep(space) <~ forWithAt <~ rep(space)) ~ companyWithMonthDateRange ^^ {
      case positionWords ~ record =>
        // Drop the Unit half of each `() ~ text` pair and the whitespace `name` consumed.
        record.copy(position = positionWords.map { case _ ~ text => text.trim }.mkString(","))
    }

  /** Parses `input` and prints either the extracted record or the parser's failure report. */
  def apply(input: String): Unit =
    parseAll(positionWithCompanyWithMonthDateRange, input) match {
      case Success(record, _) => println(record)
      case failure => println(failure)
    }
}
另外,如何去掉上述文本中出现的多余的 "~"?
谢谢,
Pawan

(回答)我写这段代码并不是为了解决你的实际问题,只是演示如何把这类语句解析成你给出的数据结构;不确定是否有用,仅供参考。在你的 CompanyWithMonthDateRange 中,我不知道该把提取出的人名放在哪里,所以这里省略了它;要加上应该很简单。
/**
 * Parses statements whose fields are introduced by the bracketed keywords
 * `[for]`, `[from]` and `[to]`, e.g.
 * `Manager, Delhi [for] The Company Pvt Ltd. [from] Jan, 2009 [to] Jan, 2012.`
 *
 * Each statement becomes `Some(CompanyWithMonthDateRange)` when all three keywords
 * are present and both years are integers, and `None` otherwise.
 */
object CompParser extends RegexParsers {
  val For = "[for]"
  val From = "[from]"
  val To = "[to]"
  val Keyword = For | From | To

  // Lazily matches text up to (not including) the next `[`, or a `.` that is followed
  // by optional whitespace and a line break.
  val Def = """(?m)(?<=^|\]).*?(?=\[|(\.\s*[\n\r]+))""".r

  // NOTE(review): the dot is unescaped, so this accepts any single character (other than
  // a line terminator), not only a period. Probably meant `\.`; left unchanged because
  // tightening it would turn currently accepted input into a parse failure.
  val End = """.""".r

  // Splits "a,b" into (a, b); anything that is not exactly two comma-separated parts
  // yields ("", "").
  private def splitPair(text: String): (String, String) =
    text.split(",") match {
      case Array(first, second) => (first, second)
      case _ => ("", "")
    }

  // One `<text>? [keyword] <text>` group, tagged by the keyword that introduced it.
  val Construct = opt(Def) ~ Keyword ~ Def ^^ {
    case before ~ `For` ~ company =>
      // The text before [for] is "position, name"; only the position is kept.
      ("pos&com", (splitPair(before.getOrElse(""))._1, company))
    case _ ~ `From` ~ monthAndYear => ("from", splitPair(monthAndYear))
    case _ ~ `To` ~ monthAndYear => ("to", splitPair(monthAndYear))
  }

  val Statement = rep(Construct) ^^ (Map() ++ _) ^^ { fields =>
    // A non-numeric year gives None rather than a NumberFormatException.
    def monthYear(key: String): Option[MonthYear] =
      fields.get(key).flatMap { case (month, year) =>
        scala.util.Try(year.trim.toInt).toOption.map(MonthYear(month, _))
      }

    for {
      posAndCompany <- fields.get("pos&com")
      from <- monthYear("from")
      to <- monthYear("to")
    } yield CompanyWithMonthDateRange(posAndCompany._2, posAndCompany._1, List(from, to))
  }

  val Statements = rep(Statement <~ End)

  /**
   * Parses `in` and prints the list of per-statement results. On a parse failure the
   * failure is printed as well and returned, so callers can still inspect it.
   */
  def apply(in: String): Any = {
    parseAll(Statements, in) match {
      case Success(results, _) => println(results)
      case failure =>
        println(failure)
        failure
    }
  }
}
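输出为:
inStr1:
List(Some(CompanyWithMonthDateRange(The Company Pvt Ltd. ,Manager,List(MonthYear(Jan,2009), MonthYear(Jan,2012)))))
inStr2:
List(Some(CompanyWithMonthDateRange(The Company Pvt Ltd. ,Manager,List(MonthYear(Jan,2009), MonthYear(Jan,2012)))),
Some(CompanyWithMonthDateRange(The Company Pvt Ltd. ,Employee,List(MonthYear(Feb,2010), MonthYear(Jun,2012)))),
Some(CompanyWithMonthDateRange(The Company Pvt Ltd. ,HR,List(MonthYear(May,2010), MonthYear(July,2012)))))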
/**
 * Parses statements whose fields are introduced by the bracketed keywords
 * `[for]`, `[from]` and `[to]`, e.g.
 * `Manager, Delhi [for] The Company Pvt Ltd. [from] Jan, 2009 [to] Jan, 2012.`
 *
 * Each statement becomes `Some(CompanyWithMonthDateRange)` when all three keywords
 * are present and both years are integers, and `None` otherwise.
 */
object CompParser extends RegexParsers {
  val For = "[for]"
  val From = "[from]"
  val To = "[to]"
  val Keyword = For | From | To

  // Lazily matches text up to (not including) the next `[`, or a `.` that is followed
  // by optional whitespace and a line break.
  val Def = """(?m)(?<=^|\]).*?(?=\[|(\.\s*[\n\r]+))""".r

  // NOTE(review): the dot is unescaped, so this accepts any single character (other than
  // a line terminator), not only a period. Probably meant `\.`; left unchanged because
  // tightening it would turn currently accepted input into a parse failure.
  val End = """.""".r

  // Splits "a,b" into (a, b); anything that is not exactly two comma-separated parts
  // yields ("", "").
  private def splitPair(text: String): (String, String) =
    text.split(",") match {
      case Array(first, second) => (first, second)
      case _ => ("", "")
    }

  // One `<text>? [keyword] <text>` group, tagged by the keyword that introduced it.
  val Construct = opt(Def) ~ Keyword ~ Def ^^ {
    case before ~ `For` ~ company =>
      // The text before [for] is "position, name"; only the position is kept.
      ("pos&com", (splitPair(before.getOrElse(""))._1, company))
    case _ ~ `From` ~ monthAndYear => ("from", splitPair(monthAndYear))
    case _ ~ `To` ~ monthAndYear => ("to", splitPair(monthAndYear))
  }

  val Statement = rep(Construct) ^^ (Map() ++ _) ^^ { fields =>
    // A non-numeric year gives None rather than a NumberFormatException.
    def monthYear(key: String): Option[MonthYear] =
      fields.get(key).flatMap { case (month, year) =>
        scala.util.Try(year.trim.toInt).toOption.map(MonthYear(month, _))
      }

    for {
      posAndCompany <- fields.get("pos&com")
      from <- monthYear("from")
      to <- monthYear("to")
    } yield CompanyWithMonthDateRange(posAndCompany._2, posAndCompany._1, List(from, to))
  }

  val Statements = rep(Statement <~ End)

  /**
   * Parses `in` and prints the list of per-statement results. On a parse failure the
   * failure is printed as well and returned, so callers can still inspect it.
   */
  def apply(in: String): Any = {
    parseAll(Statements, in) match {
      case Success(results, _) => println(results)
      case failure =>
        println(failure)
        failure
    }
  }
}
/** Demo entry point: runs CompParser over a one-statement and a three-statement sample. */
object TestP extends App {
  // The literal bodies stay flush-left so their content is unchanged.
  val inStr1 = """
Manager, Delhi [for] The Company Pvt Ltd. [from] Jan, 2009 [to] Jan, 2012.
"""
  val inStr2 = """
Manager, Delhi [for] The Company Pvt Ltd. [from] Jan, 2009 [to] Jan, 2012.
Employee, Kate [for] The Company Pvt Ltd. [from] Feb, 2010 [to] Jun, 2012.
HR, Jane [for] The Company Pvt Ltd. [from] May, 2010 [to] July, 2012.
"""

  // Parse the samples in declaration order.
  Seq(inStr1, inStr2).foreach(sample => CompParser(sample))
}