[c] 100行实现简单的正则表达式引擎 - 哆啦比猫的技术瞎扯 - Arch Linux · ドラえもん · 实时绘制
[c] 100行实现简单的正则表达式引擎
注:此方法效率不高,仅供学习,产品级代码请使用 regexp 或 pcre 库!
实现了 () | ? + * . 和字面量匹配这几个功能。转义没实现,不过应该是轻而易举的事。行首行尾匹配也没实现,因为默认就要匹配行首行尾的;如不希望匹配行首或行尾,需显式地加上 .*。没实现 non-greedy,不过设计时考虑到了,要加上这功能很容易,等有空了再说。
代码先贴这里,有时间了再来解释:
#include <stdio.h> #include <err.h> enum { RE_ANY = 0x0100, RE_SPLIT, RE_JUMP, RE_GROUP_START, RE_GROUP_END, RE_MATCH }; typedef struct RE_State { unsigned short c; // 0x0000-0x00FF: match literal; RE_* for specials char g, ng; // greedy/non-greedy jump } RE_State; typedef struct RE_Match { unsigned short s, e; } RE_Match; typedef struct RE_Regexp { RE_State state[256]; RE_Match match[256]; } RE_Regexp; RE_Regexp RE_compile(const char * regexp) { RE_Regexp re; RE_State * s_tail = re.state; void append(unsigned short c, char g, char ng) { *s_tail++ = (RE_State){ c, g, ng }; } void expr(RE_State * s, int group) { RE_State * s_head = s; void prepend(unsigned short c, char g, char ng) { RE_State * s = s_tail++; for (; s != s_head-1; s--) *(s+1) = *s; *s_head = (RE_State){ c, g, ng }; } for (; *regexp; regexp++) { switch (*regexp) { case '(': regexp++; s_head = s_tail; append(RE_GROUP_START, group, 0); expr(s_tail, group+1); if (*regexp != ')') err(1, "missing )."); append(RE_GROUP_END, group, 0); break; case ')': return; case '?': prepend(RE_SPLIT, 1, s_tail-s_head+1); break; case '+': append(RE_SPLIT, s_head-s_tail, 1); break; case '*': prepend(RE_SPLIT, 1, s_tail-s_head+2); append(RE_SPLIT, s_head-s_tail+1, 1); break; case '|': s_head = s; prepend(RE_SPLIT, 1, s_tail-s_head+2); s = s_tail; append(RE_JUMP, 0, 0); regexp++; expr(s_tail, group); s->g = s_tail-s; regexp--; break; case '.': s_head = s_tail; append(RE_ANY, 0, 0); break; default : s_head = s_tail; append(*regexp, 0, 0); } } } expr(re.state, 0); append(RE_MATCH, 0, 0); return re; } int RE_match(RE_Regexp * re, const char * str) { RE_Match * m = re->match; int match(RE_State * s, const char * p) { for (; ; p+=(*p!=0), s++) { //printf(">> @%d(%x)\t[%d]%c(%2.2X)\n", s-re->state, s->c, p-str, *p, *p); switch (s->c) { case RE_ANY: if (!*p) return 0; continue; case RE_SPLIT: if (match(s+s-> g, p)) return 1; if (match(s+s->ng, p)) return 1; return 0; case RE_JUMP: s += s->g-1; p--; break; case RE_MATCH: return !*p; case RE_GROUP_START: m[s->g].s = p-str; p--; break; case RE_GROUP_END : m[s->g].e = p-str; p--; break; default: if (*p != s->c) return 0; } } return 0; } return match(re->state, str); }
测试代码:
int main() { RE_Regexp re = RE_compile("a(((b|c)?d)+).*"); { char str[] = "ad"; if (RE_match(&re, str)) { str[re.match[0].e] = 0; printf("M%4d%4d -> %s\n", re.match[0].s, re.match[0].e, &str[re.match[0].s]); } else printf("!\n"); } { char str[] = "abd"; if (RE_match(&re, str)) { str[re.match[0].e] = 0; printf("M%4d%4d -> %s\n", re.match[0].s, re.match[0].e, &str[re.match[0].s]); } else printf("!\n"); } { char str[] = "acd"; if (RE_match(&re, str)) { str[re.match[0].e] = 0; printf("M%4d%4d -> %s\n", re.match[0].s, re.match[0].e, &str[re.match[0].s]); } else printf("!\n"); } { char str[] = "abddcddd"; if (RE_match(&re, str)) { str[re.match[0].e] = 0; printf("M%4d%4d -> %s\n", re.match[0].s, re.match[0].e, &str[re.match[0].s]); } else printf("!\n"); } { char str[] = "adddbdbcd"; if (RE_match(&re, str)) { str[re.match[0].e] = 0; printf("M%4d%4d -> %s\n", re.match[0].s, re.match[0].e, &str[re.match[0].s]); } else printf("!\n"); } return 0; }
凡未特殊声明(转载/翻译),所有文章均为原创。
by Giumo Xavier Clanjor (哆啦比猫/兰威举), 2010-2019.
本作品采用知识共享署名·非商业性使用·相同方式共享 3.0 中国大陆许可协议进行许可。
文中凡未特殊声明且未声明为引用的代码均以 MIT 协议授权。
blog comments powered by Disqus