[c] 100行实现简单的正则表达式引擎 - 哆啦比猫的技术瞎扯 - Arch Linux · ドラえもん · 实时绘制
[c] 100行实现简单的正则表达式引擎
注:此方法效率不高,仅供学习,产品级代码请使用 regexp 或 pcre 库!
实现了 () | ? + * . 和字面量匹配这几个功能。转义没实现,不过应该是轻而易举的事。行首行尾匹配也没实现,因为默认就要匹配行首行尾的;如不希望匹配行首或行尾,需显式地加上 .*。没实现 non-greedy,不过设计时考虑到了,要加上这功能很容易,等有空了再说。
代码先贴这里,有时间了再来解释:
#include <stdio.h>
#include <err.h>
enum { RE_ANY = 0x0100, RE_SPLIT, RE_JUMP,
RE_GROUP_START, RE_GROUP_END, RE_MATCH };
typedef struct RE_State
{
unsigned short c; // 0x0000-0x00FF: match literal; RE_* for specials
char g, ng; // greedy/non-greedy jump
} RE_State;
typedef struct RE_Match { unsigned short s, e; } RE_Match;
typedef struct RE_Regexp
{
RE_State state[256];
RE_Match match[256];
} RE_Regexp;
RE_Regexp RE_compile(const char * regexp)
{
RE_Regexp re;
RE_State * s_tail = re.state;
void append(unsigned short c, char g, char ng)
{
*s_tail++ = (RE_State){ c, g, ng };
}
void expr(RE_State * s, int group)
{
RE_State * s_head = s;
void prepend(unsigned short c, char g, char ng)
{
RE_State * s = s_tail++;
for (; s != s_head-1; s--) *(s+1) = *s;
*s_head = (RE_State){ c, g, ng };
}
for (; *regexp; regexp++) {
switch (*regexp) {
case '(':
regexp++;
s_head = s_tail;
append(RE_GROUP_START, group, 0);
expr(s_tail, group+1);
if (*regexp != ')') err(1, "missing ).");
append(RE_GROUP_END, group, 0);
break;
case ')': return;
case '?': prepend(RE_SPLIT, 1, s_tail-s_head+1); break;
case '+': append(RE_SPLIT, s_head-s_tail, 1); break;
case '*': prepend(RE_SPLIT, 1, s_tail-s_head+2);
append(RE_SPLIT, s_head-s_tail+1, 1);
break;
case '|':
s_head = s;
prepend(RE_SPLIT, 1, s_tail-s_head+2);
s = s_tail;
append(RE_JUMP, 0, 0);
regexp++;
expr(s_tail, group);
s->g = s_tail-s;
regexp--;
break;
case '.': s_head = s_tail; append(RE_ANY, 0, 0); break;
default : s_head = s_tail; append(*regexp, 0, 0);
}
}
}
expr(re.state, 0);
append(RE_MATCH, 0, 0);
return re;
}
int RE_match(RE_Regexp * re, const char * str)
{
RE_Match * m = re->match;
int match(RE_State * s, const char * p)
{
for (; ; p+=(*p!=0), s++) {
//printf(">> @%d(%x)\t[%d]%c(%2.2X)\n", s-re->state, s->c, p-str, *p, *p);
switch (s->c) {
case RE_ANY: if (!*p) return 0; continue;
case RE_SPLIT:
if (match(s+s-> g, p)) return 1;
if (match(s+s->ng, p)) return 1;
return 0;
case RE_JUMP: s += s->g-1; p--; break;
case RE_MATCH: return !*p;
case RE_GROUP_START: m[s->g].s = p-str; p--; break;
case RE_GROUP_END : m[s->g].e = p-str; p--; break;
default: if (*p != s->c) return 0;
}
}
return 0;
}
return match(re->state, str);
}
测试代码:
int main()
{
RE_Regexp re = RE_compile("a(((b|c)?d)+).*");
{
char str[] = "ad";
if (RE_match(&re, str)) {
str[re.match[0].e] = 0;
printf("M%4d%4d -> %s\n", re.match[0].s, re.match[0].e, &str[re.match[0].s]);
}
else printf("!\n");
}
{
char str[] = "abd";
if (RE_match(&re, str)) {
str[re.match[0].e] = 0;
printf("M%4d%4d -> %s\n", re.match[0].s, re.match[0].e, &str[re.match[0].s]);
}
else printf("!\n");
}
{
char str[] = "acd";
if (RE_match(&re, str)) {
str[re.match[0].e] = 0;
printf("M%4d%4d -> %s\n", re.match[0].s, re.match[0].e, &str[re.match[0].s]);
}
else printf("!\n");
}
{
char str[] = "abddcddd";
if (RE_match(&re, str)) {
str[re.match[0].e] = 0;
printf("M%4d%4d -> %s\n", re.match[0].s, re.match[0].e, &str[re.match[0].s]);
}
else printf("!\n");
}
{
char str[] = "adddbdbcd";
if (RE_match(&re, str)) {
str[re.match[0].e] = 0;
printf("M%4d%4d -> %s\n", re.match[0].s, re.match[0].e, &str[re.match[0].s]);
}
else printf("!\n");
}
return 0;
}
凡未特殊声明(转载/翻译),所有文章均为原创。
by Giumo Xavier Clanjor (哆啦比猫/兰威举), 2010-2019.
本作品采用知识共享署名·非商业性使用·相同方式共享 3.0 中国大陆许可协议进行许可。
文中凡未特殊声明且未声明为引用的代码均以 MIT 协议授权。
blog comments powered by Disqus