[c] 100行实现简单的正则表达式引擎 - 哆啦比猫的技术瞎扯 - Arch Linux · ドラえもん · 实时绘制

[c] 100行实现简单的正则表达式引擎

哆啦比猫 posted @ 2013年8月10日 22:00 in C/C++ with tags 正则表达式 c regexp , 5039 阅读

注：此方法效率不高，仅供学习，产品级代码请使用 regexp 或 pcre 库！

实现了 () | ? + * . 和字面量匹配这几个功能。转义没实现，不过应该是轻而易举的事。行首行尾匹配也没实现，因为默认就要匹配行首行尾的；如不希望匹配行首或行尾，需显式地加上 .*。没实现 non-greedy，不过设计时考虑到了，要加上这功能很容易，等有空了再说。

代码先贴这里，有时间了再来解释：

#include <stdio.h>
#include <err.h>

enum { RE_ANY = 0x0100, RE_SPLIT, RE_JUMP,
		RE_GROUP_START, RE_GROUP_END, RE_MATCH };
typedef struct RE_State
{
	unsigned short c;	// 0x0000-0x00FF: match literal; RE_* for specials
	char g, ng;			// greedy/non-greedy jump
} RE_State;
typedef struct RE_Match { unsigned short s, e; } RE_Match;
typedef struct RE_Regexp
{
	RE_State state[256];
	RE_Match match[256];
} RE_Regexp;

RE_Regexp RE_compile(const char * regexp)
{
	RE_Regexp re;
	RE_State * s_tail = re.state;
	void append(unsigned short c, char g, char ng)
	{
		*s_tail++ = (RE_State){ c, g, ng };
	}
	void expr(RE_State * s, int group)
	{
		RE_State * s_head = s;
		void prepend(unsigned short c, char g, char ng)
		{
			RE_State * s = s_tail++;
			for (; s != s_head-1; s--) *(s+1) = *s;
			*s_head = (RE_State){ c, g, ng };
		}
		for (; *regexp; regexp++) {
			switch (*regexp) {
				case '(':
					regexp++;
					s_head = s_tail;
					append(RE_GROUP_START, group, 0);
					expr(s_tail, group+1);
					if (*regexp != ')') err(1, "missing ).");
					append(RE_GROUP_END, group, 0);
					break;
				case ')': return;
				case '?': prepend(RE_SPLIT, 1, s_tail-s_head+1); break;
				case '+':  append(RE_SPLIT, s_head-s_tail, 1); break;
				case '*': prepend(RE_SPLIT, 1, s_tail-s_head+2);
						   append(RE_SPLIT, s_head-s_tail+1, 1);
						  break;
				case '|':
					s_head = s;
					prepend(RE_SPLIT, 1, s_tail-s_head+2);
					s = s_tail;
					append(RE_JUMP, 0, 0);
					regexp++;
					expr(s_tail, group);
					s->g = s_tail-s;
					regexp--;
					break;
				case '.': s_head = s_tail; append(RE_ANY, 0, 0); break;
				default : s_head = s_tail; append(*regexp, 0, 0);
			}
		}
	}
	expr(re.state, 0);
	append(RE_MATCH, 0, 0);
	return re;
}

int RE_match(RE_Regexp * re, const char * str)
{
	RE_Match * m = re->match;
	int match(RE_State * s, const char * p)
	{
		for (; ; p+=(*p!=0), s++) {
			//printf(">> @%d(%x)\t[%d]%c(%2.2X)\n", s-re->state, s->c, p-str, *p, *p);
			switch (s->c) {
				case RE_ANY: if (!*p) return 0; continue;
				case RE_SPLIT:
					if (match(s+s-> g, p)) return 1;
					if (match(s+s->ng, p)) return 1;
					return 0;
				case RE_JUMP: s += s->g-1; p--; break;
				case RE_MATCH: return !*p;
				case RE_GROUP_START: m[s->g].s = p-str; p--; break;
				case RE_GROUP_END  : m[s->g].e = p-str; p--; break;
				default: if (*p != s->c) return 0;
			}
		}
		return 0;
	}
	return match(re->state, str);
}

测试代码：

int main()
{
	RE_Regexp re = RE_compile("a(((b|c)?d)+).*");
	{
		char str[] = "ad";
		if (RE_match(&re, str)) {
			str[re.match[0].e] = 0;
			printf("M%4d%4d -> %s\n", re.match[0].s, re.match[0].e, &str[re.match[0].s]);
		}
		else printf("!\n");
	}
	{
		char str[] = "abd";
		if (RE_match(&re, str)) {
			str[re.match[0].e] = 0;
			printf("M%4d%4d -> %s\n", re.match[0].s, re.match[0].e, &str[re.match[0].s]);
		}
		else printf("!\n");
	}
	{
		char str[] = "acd";
		if (RE_match(&re, str)) {
			str[re.match[0].e] = 0;
			printf("M%4d%4d -> %s\n", re.match[0].s, re.match[0].e, &str[re.match[0].s]);
		}
		else printf("!\n");
	}
	{
		char str[] = "abddcddd";
		if (RE_match(&re, str)) {
			str[re.match[0].e] = 0;
			printf("M%4d%4d -> %s\n", re.match[0].s, re.match[0].e, &str[re.match[0].s]);
		}
		else printf("!\n");
	}
	{
		char str[] = "adddbdbcd";
		if (RE_match(&re, str)) {
			str[re.match[0].e] = 0;
			printf("M%4d%4d -> %s\n", re.match[0].s, re.match[0].e, &str[re.match[0].s]);
		}
		else printf("!\n");
	}
	return 0;
}

凡未特殊声明（转载/翻译），所有文章均为原创。
by Giumo Xavier Clanjor (哆啦比猫/兰威举), 2010-2019.
本作品采用知识共享署名·非商业性使用·相同方式共享 3.0 中国大陆许可协议进行许可。
文中凡未特殊声明且未声明为引用的代码均以 MIT 协议授权。

相关文章
全局相关文章