Regex 采访:机器编码/正则表达式(比我的解决方案更好的选择)
以下是采访问题: 机器编码轮:(时间1小时) 表达式已给定,并且字符串Regex 采访:机器编码/正则表达式(比我的解决方案更好的选择),regex,algorithm,finite-automata,Regex,Algorithm,Finite Automata,以下是采访问题: 机器编码轮:(时间1小时) 表达式已给定,并且字符串testCase,需要评估testCase是否对表达式有效 表达式可能包含: 字母[a-z] 。(。表示[a-z]中的任何字符) '*'('*'具有与普通RegExp中相同的属性) '^'('^'表示字符串的开头) “$”(“$”表示字符串的结尾) 示例案例: Expression Test Case Valid ab ab true a*b aaaaaab
testCase
,需要评估testCase
是否对表达式有效
表达式可能包含:
- 字母
[a-z]
(。
表示。
中的任何字符)[a-z]
('*'
具有与普通RegExp中相同的属性)'*'
('^'
表示字符串的开头)'^'
(“$”
表示字符串的结尾)“$”
Expression Test Case Valid
ab ab true
a*b aaaaaab true
a*b*c* abc true
a*b*c aaabccc false
^abc*b abccccb true
^abc*b abbccccb false
^abcd$ abcd true
^abc*abc$ abcabc true
^abc.abc$ abczabc true
^ab..*abc$ abyxxxxabc true
我的做法:
ab
)、转换(a | b
)、(a*
)kleenstar。并为串联添加
+
。例如:
abc$ => .*+a+b+c
^ab..*abc$ => a+b+.+.*+a+b+c
(
paranethes>kleen\u star>串联>。
)
^
,在结束时忽略$
——其他任何地方都是无效的
然后我们构造一个二维网格,其中每一行表示表达式中的一个单元[1],每一列表示测试字符串中的一个字符
[1] :此处的“单位”是指单个字符,但*
应附加到前一个字符
因此,对于a*b*c
和aaabccc
,我们得到如下结果:
a a a b c c c
a*
b*
c
每个单元格都可以有一个表示有效性的布尔值
现在,对于每个单元格,如果以下任一项保持不变,则将其设置为有效:
- 左邻域中的值有效,行为
或x*
,列为*
(x
为任意字符x
) 这对应于与一个附加字符匹配的a-z
*
- 左上邻的值有效,行为
或x
,列为
(x
为任意字符x
) 这对应于单个字符匹配a-z
- 上邻中的值有效,行为
或x*
这对应于不匹配的*
*
V
表示有效)
由于右下角的单元格无效,因此返回invalid
运行时间:O(stringLength*expressionLength)
您应该注意到,我们主要是在探索网格的一小部分 这个解决方案可以通过使用使其成为递归解决方案来改进(并且只调用右下角单元格的递归解决方案) 这将使我们获得最佳情况下的性能
O(1)
,但最差情况下的性能仍然是O(stringLength*expressionLength)
我的解决方案假设表达式必须匹配整个字符串,这是从上面的示例无效的结果推断出来的(根据问题) 如果它可以匹配一个子字符串,我们可以稍微修改它,如果单元格位于顶行,则在以下情况下它是有效的:
- 该行为
或x*
*
- 行为
或x
,列为x
- 只要一小时,我们就可以使用简单的方法
将模式拆分为令牌:
a*b.c
=>{a*b.c}
如果模式不是以^
开头,则在开头添加*
,否则删除^
如果模式没有以$
结尾,则在结尾添加*
,否则删除$
然后我们使用递归:如果我们有循环模式,则使用3种方式(将模式索引增加1,将单词索引增加1,将两个索引增加1),如果不是循环模式,则使用一种方式(将两个索引增加1)
C中的示例代码#
使用系统;
使用System.Collections.Generic;
使用系统诊断;
使用System.Linq;
名称空间重新测试
{
班级计划
{
静态void Main(字符串[]参数)
{
Assert(IsMatch(“ab”,“ab”)==true);
Assert(IsMatch(“aaaaaa b”,“a*b”)==true);
Assert(IsMatch(“abc”,“a*b*c*”)==true);
Assert(IsMatch(“aaabccc”,“a*b*c”)==true);/*original false,但它应该是true*/
Assert(IsMatch(“abccccb”,“^abc*b”)==true);
Assert(IsMatch(“abbcccb”,“^abc*b”)==false);
Assert(IsMatch(“abcd”,“^abcd$”)==true);
Assert(IsMatch(“abcabc”,“^abc*abc$”)==true);
Assert(IsMatch(“abczabc”,“^abc.abc$”)==true);
Assert(IsMatch(“abyxxxabc”,“^ab..*abc$”==true);
}
静态布尔IsMatch(字符串输入、字符串模式)
{
列出标记=
a a a b c c c
a*
b*
c
a a a b c c c
a* V V V - - - -
b* - - - V - - -
c - - - - V - -
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
namespace ReTest
{
class Program
{
static void Main(string[] args)
{
Debug.Assert(IsMatch("ab", "ab") == true);
Debug.Assert(IsMatch("aaaaaab", "a*b") == true);
Debug.Assert(IsMatch("abc", "a*b*c*") == true);
Debug.Assert(IsMatch("aaabccc", "a*b*c") == true); /* original false, but it should be true */
Debug.Assert(IsMatch("abccccb", "^abc*b") == true);
Debug.Assert(IsMatch("abbccccb", "^abc*b") == false);
Debug.Assert(IsMatch("abcd", "^abcd$") == true);
Debug.Assert(IsMatch("abcabc", "^abc*abc$") == true);
Debug.Assert(IsMatch("abczabc", "^abc.abc$") == true);
Debug.Assert(IsMatch("abyxxxxabc", "^ab..*abc$") == true);
}
static bool IsMatch(string input, string pattern)
{
List<PatternToken> patternTokens = new List<PatternToken>();
for (int i = 0; i < pattern.Length; i++)
{
char token = pattern[i];
if (token == '^')
{
if (i == 0)
patternTokens.Add(new PatternToken { Token = token, Occurence = Occurence.Single });
else
throw new ArgumentException("input");
}
else if (char.IsLower(token) || token == '.')
{
if (i < pattern.Length - 1 && pattern[i + 1] == '*')
{
patternTokens.Add(new PatternToken { Token = token, Occurence = Occurence.Multiple });
i++;
}
else
patternTokens.Add(new PatternToken { Token = token, Occurence = Occurence.Single });
}
else if (token == '$')
{
if (i == pattern.Length - 1)
patternTokens.Add(new PatternToken { Token = token, Occurence = Occurence.Single });
else
throw new ArgumentException("input");
}
else
throw new ArgumentException("input");
}
PatternToken firstPatternToken = patternTokens.First();
if (firstPatternToken.Token == '^')
patternTokens.RemoveAt(0);
else
patternTokens.Insert(0, new PatternToken { Token = '.', Occurence = Occurence.Multiple });
PatternToken lastPatternToken = patternTokens.Last();
if (lastPatternToken.Token == '$')
patternTokens.RemoveAt(patternTokens.Count - 1);
else
patternTokens.Add(new PatternToken { Token = '.', Occurence = Occurence.Multiple });
return IsMatch(input, 0, patternTokens, 0);
}
static bool IsMatch(string input, int inputIndex, IList<PatternToken> pattern, int patternIndex)
{
if (inputIndex == input.Length)
{
if (patternIndex == pattern.Count || (patternIndex == pattern.Count - 1 && pattern[patternIndex].Occurence == Occurence.Multiple))
return true;
else
return false;
}
else if (inputIndex < input.Length && patternIndex < pattern.Count)
{
char c = input[inputIndex];
PatternToken patternToken = pattern[patternIndex];
if (patternToken.Token == '.' || patternToken.Token == c)
{
if (patternToken.Occurence == Occurence.Single)
return IsMatch(input, inputIndex + 1, pattern, patternIndex + 1);
else
return IsMatch(input, inputIndex, pattern, patternIndex + 1) ||
IsMatch(input, inputIndex + 1, pattern, patternIndex) ||
IsMatch(input, inputIndex + 1, pattern, patternIndex + 1);
}
else
return false;
}
else
return false;
}
class PatternToken
{
public char Token { get; set; }
public Occurence Occurence { get; set; }
public override string ToString()
{
if (Occurence == Occurence.Single)
return Token.ToString();
else
return Token.ToString() + "*";
}
}
enum Occurence
{
Single,
Multiple
}
}
}
/**
* @author Santhosh Kumar
*
*/
public class ExpressionProblemSolution {
public static void main(String[] args) {
System.out.println("---------- ExpressionProblemSolution - start ---------- \n");
ExpressionProblemSolution evs = new ExpressionProblemSolution();
evs.runMatchTests();
System.out.println("\n---------- ExpressionProblemSolution - end ---------- ");
}
// simple node structure to keep expression terms
class Node {
Character ch; // char [a-z]
Character sch; // special char (^, *, $, .)
Node next;
Node(Character ch1, Character sch1) {
ch = ch1;
sch = sch1;
}
Node add(Character ch1, Character sch1) {
this.next = new Node(ch1, sch1);
return this.next;
}
Node next() {
return this.next;
}
public String toString() {
return "[ch=" + ch + ", sch=" + sch + "]";
}
}
private boolean letters(char ch) {
return (ch >= 'a' && ch <= 'z');
}
private boolean specialChars(char ch) {
return (ch == '.' || ch == '^' || ch == '*' || ch == '$');
}
private void validate(String expression) {
// if expression has invalid chars throw runtime exception
if (expression == null) {
throw new RuntimeException(
"Expression can't be null, but it can be empty");
}
char[] expr = expression.toCharArray();
for (int i = 0; i < expr.length; i++) {
if (!letters(expr[i]) && !specialChars(expr[i])) {
throw new RuntimeException(
"Expression contains invalid char at position=" + i
+ ", invalid_char=" + expr[i]
+ " (allowed chars are 'a-z', *, . ^, * and $)");
}
}
}
// Parse the expression and split them into terms and add to list
// the list is FSM (Finite State Machine). The list is used during
// the process step to iterate through the machine states based
// on the input string
//
// expression = a*b*c has 3 terms -> [a*] [b*] [c]
// expression = ^ab.*c$ has 4 terms -> [^a] [b] [.*] [c$]
//
// Timing : O(n) n -> expression length
// Space : O(n) n -> expression length decides the no.of terms stored in the list
private Node preprocess(String expression) {
debug("preprocess - start [" + expression + "]");
validate(expression);
Node root = new Node(' ', ' '); // root node with empty values
Node current = root;
char[] expr = expression.toCharArray();
int i = 0, n = expr.length;
while (i < n) {
debug("i=" + i);
if (expr[i] == '^') { // it is prefix operator, so it always linked
// to the char after that
if (i + 1 < n) {
if (i == 0) { // ^ indicates start of the expression, so it
// must be first in the expr string
current = current.add(expr[i + 1], expr[i]);
i += 2;
continue;
} else {
throw new RuntimeException(
"Special char ^ should be present only at the first position of the expression (position="
+ i + ", char=" + expr[i] + ")");
}
} else {
throw new RuntimeException(
"Expression missing after ^ (position=" + i
+ ", char=" + expr[i] + ")");
}
} else if (letters(expr[i]) || expr[i] == '.') { // [a-z] or .
if (i + 1 < n) {
char nextCh = expr[i + 1];
if (nextCh == '$' && i + 1 != n - 1) { // if $, then it must
// be at the last
// position of the
// expression
throw new RuntimeException(
"Special char $ should be present only at the last position of the expression (position="
+ (i + 1)
+ ", char="
+ expr[i + 1]
+ ")");
}
if (nextCh == '$' || nextCh == '*') { // a* or b$
current = current.add(expr[i], nextCh);
i += 2;
continue;
} else {
current = current.add(expr[i], expr[i] == '.' ? expr[i]
: null);
i++;
continue;
}
} else { // a or b
current = current.add(expr[i], null);
i++;
continue;
}
} else {
throw new RuntimeException("Invalid char - (position=" + (i)
+ ", char=" + expr[i] + ")");
}
}
debug("preprocess - end");
return root;
}
// Traverse over the terms in the list and iterate and match the input string
// The terms list is the FSM (Finite State Machine); the end of list indicates
// end state. That is, input is valid and matching the expression
//
// Timing : O(n) for pre-processing + O(n) for processing = 2O(n) = ~O(n) where n -> expression length
// Timing : O(2n) ~ O(n)
// Space : O(n) where n -> expression length decides the no.of terms stored in the list
public boolean process(String expression, String testString) {
Node root = preprocess(expression);
print(root);
Node current = root.next();
if (root == null || current == null)
return false;
int i = 0;
int n = testString.length();
debug("input-string-length=" + n);
char[] test = testString.toCharArray();
// while (i < n && current != null) {
while (current != null) {
debug("process: i=" + i);
debug("process: ch=" + current.ch + ", sch=" + current.sch);
if (current.sch == null) { // no special char just [a-z] case
if (test[i] != current.ch) { // test char and current state char
// should match
return false;
} else {
i++;
current = current.next();
continue;
}
} else if (current.sch == '^') { // process start char
if (i == 0 && test[i] == current.ch) {
i++;
current = current.next();
continue;
} else {
return false;
}
} else if (current.sch == '$') { // process end char
if (i == n - 1 && test[i] == current.ch) {
i++;
current = current.next();
continue;
} else {
return false;
}
} else if (current.sch == '*') { // process repeat char
if (letters(current.ch)) { // like a* or b*
while (i < n && test[i] == current.ch)
i++; // move i till end of repeat char
current = current.next();
continue;
} else if (current.ch == '.') { // like .*
Node nextNode = current.next();
print(nextNode);
if (nextNode != null) {
Character nextChar = nextNode.ch;
Character nextSChar = nextNode.sch;
// a.*z = az or (you need to check the next state in the
// list)
if (test[i] == nextChar) { // test [i] == 'z'
i++;
current = current.next();
continue;
} else {
// a.*z = abz or
// a.*z = abbz
char tch = test[i]; // get 'b'
while (i + 1 < n && test[++i] == tch)
; // move i till end of repeat char
current = current.next();
continue;
}
}
} else { // like $* or ^*
debug("process: return false-1");
return false;
}
} else if (current.sch == '.') { // process any char
if (!letters(test[i])) {
return false;
}
i++;
current = current.next();
continue;
}
}
if (i == n && current == null) {
// string position is out of bound
// list is at end ie. exhausted both expression and input
// FSM reached the end state, hence the input is valid and matches the given expression
return true;
} else {
return false;
}
}
public void debug(Object str) {
boolean debug = false;
if (debug) {
System.out.println("[debug] " + str);
}
}
private void print(Node node) {
StringBuilder sb = new StringBuilder();
while (node != null) {
sb.append(node + " ");
node = node.next();
}
sb.append("\n");
debug(sb.toString());
}
public boolean match(String expr, String input) {
boolean result = process(expr, input);
System.out.printf("\n%-20s %-20s %-20s\n", expr, input, result);
return result;
}
public void runMatchTests() {
match("ab", "ab");
match("a*b", "aaaaaab");
match("a*b*c*", "abc");
match("a*b*c", "aaabccc");
match("^abc*b", "abccccb");
match("^abc*b", "abccccbb");
match("^abcd$", "abcd");
match("^abc*abc$", "abcabc");
match("^abc.abc$", "abczabc");
match("^ab..*abc$", "abyxxxxabc");
match("a*b*", ""); // handles empty input string
match("xyza*b*", "xyz");
}}
int regex_validate(char *reg, char *test) {
char *ptr = reg;
while (*test) {
switch(*ptr) {
case '.':
{
test++; ptr++; continue;
break;
}
case '*':
{
if (*(ptr-1) == *test) {
test++; continue;
}
else if (*(ptr-1) == '.' && (*test == *(test-1))) {
test++; continue;
}
else {
ptr++; continue;
}
break;
}
case '^':
{
ptr++;
while ( ptr && test && *ptr == *test) {
ptr++; test++;
}
if (!ptr && !test)
return 1;
if (ptr && test && (*ptr == '$' || *ptr == '*' || *ptr == '.')) {
continue;
}
else {
return 0;
}
break;
}
case '$':
{
if (*test)
return 0;
break;
}
default:
{
printf("default case.\n");
if (*ptr != *test) {
return 0;
}
test++; ptr++; continue;
}
break;
}
}
return 1;
}
int main () {
printf("regex=%d\n", regex_validate("ab", "ab"));
printf("regex=%d\n", regex_validate("a*b", "aaaaaab"));
printf("regex=%d\n", regex_validate("^abc.abc$", "abcdabc"));
printf("regex=%d\n", regex_validate("^abc*abc$", "abcabc"));
printf("regex=%d\n", regex_validate("^abc*b", "abccccb"));
printf("regex=%d\n", regex_validate("^abc*b", "abbccccb"));
return 0;
}