如何有效地解析(沒有太多代碼混亂)像下面的語句? 關鍵字/分隔符位於[]中。斯卡拉句子解析使用解析器組合器
經理,德里[供]本公司私人有限公司[從] 2009年1月[來]揚,2012年
的人姓名,公司名稱和日期範圍是從使用文本解析提取組合子。 (預期輸出在底部示出)
下面是上文
case class CompanyWithMonthDateRange(company:String, position:String, dateRange:List[MonthYear])
case class MonthYear(month:String, year:Int)
object CompanyParser1 extends RegexParsers {
override type Elem = Char
override def skipWhitespace = false
def keywords: Parser[String] = "for" | "in" | "with" |"at" | "from" | "pvt"|"ltd" | "company" | "co" | "limited" | "inc" | "corporation" | "jan" |\
"feb" | "mar" | "apr" | "may" | "jun" | "jul" | "aug" | "sep" | "nov" | "dec" | "to" | "till" | "until" | "upto"
val date = ("""\d\d\d\d""".r | """\d\d""".r)
val integer = ("""(0|[1-9]\d*)""".r) ^^ { _.toInt }
val comma = ("""\,""".r)
val quote = ("""[\'\"]+""".r)
val underscore = ("""\_""".r)
val dot = ("""\.""".r)
val space = ("""\s+""".r) ^^ {case _ => ""}
val colon = (""":""".r)
val ampersand = ("""(\&|and)""".r)
val hyphen = ("""\-""".r)
val brackets = ("""[\(\)]+""".r)
val newline = ("""[\n\r]""".r)
val months = ("""(jan|feb|mar|apr|may|jun|jul|aug|sep|nov|dec)""".r)
val toTillUntil = ("""(to|till|until|upto)""".r)
val asWord = ("""(as)""".r)
val fromWord = ("""from""".r)
val forWithAt = ("""(in|for|with|at)""".r)
val companyExt = ("""(pvt|ltd|company|co|limited|inc|corporation)""".r)
val alphabets = not(keywords)~"""[a-zA-Z]+""".r
val name = not(keywords)~"""[a-zA-Z][a-zA-Z\,\-\'\&\(\)]+\s+""".r
def possibleCompanyExts = companyExt <~ (dot *) ^^ {_.toString.trim}
def alphabetsExt = ((alphabets ~ ((quote | ampersand | hyphen | brackets | underscore | comma) *) <~ (space *))+) ^^ { case a => a.toString.trim}
def companyNameExt = (alphabetsExt <~ (space *) <~ (possibleCompanyExts+)) ^^ {_.toString
}
def companyName = alphabetsExt *
def entityName = (alphabetsExt+) ^^ {case l => l.map(s => s.trim).mkString(" ")}
def dateWithEndingChars = date <~ ((comma | quote | dot | newline) *) <~ (space *) ^^ {_.toInt}
def monthWithEndingChars = months <~ ((comma | quote | dot | newline) *) <~ (space *) ^^ { _.toString}
def monthWithDate = monthWithEndingChars ~ dateWithEndingChars ^^ { case a~b => MonthYear(a,b)}
def monthDateRange = monthWithDate ~ (space *) ~ toTillUntil ~ (space *) ~ monthWithDate ^^ { case a~s1~b~s2~c => List(a,c)}
def companyWithMonthDateRange = (companyNameExt ~ (space *) ~ monthDateRange) ^^ {
case a~b~c => CompanyWithMonthDateRange(company = a, dateRange = c, position = "")
}
def positionWithCompanyWithMonthDateRange = ((name+) ~ (space *) ~ forWithAt ~ (space *) ~ companyWithMonthDateRange) ^^ {
case a~s1~b~s2~c => c.copy(position = a.mkString(","))
}
def apply(input:String) = {
parseAll(positionWithCompanyWithMonthDateRange,input) match {
case Success(lup,_) => println(lup)
case x => println(x)
}
}
}
輸出寫的代碼應該像
CompanyWithMonthDateRange(List(((()~Company)~List()), ((()~fd)~List()), ((()~India)~List('))),(()~Manager,),(()~Delhi),List(MonthYear(mar,2010), MonthYear(jul,2012)))
此外,如何刪除不需要的「〜」出現在上面的文本中。
感謝, 爬完