From 846cee5066814f448d05f70d5caccea92d4e3755 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Sun, 13 Mar 2011 05:46:29 -0700 Subject: [PATCH] Split out parsing and expression trees from regexp.y Start of splitting regexp.y into logical components instead of the mess it is today. Split out the expr-tree and parsing components from regexp.y int expr-tree.x and parse.y and since regexp.y no longer does parsing rename it to hfa.cc Some code cleanups snuck their way into this patch and since I am to lazy to redo it, I have left them in. Signed-off-by: John Johansen Acked-By: Steve Beattie --- parser/libapparmor_re/Makefile | 14 +- parser/libapparmor_re/apparmor_re.h | 2 + parser/libapparmor_re/expr-tree.cc | 576 ++++++++ parser/libapparmor_re/expr-tree.h | 627 +++++++++ parser/libapparmor_re/{regexp.y => hfa.cc} | 1386 +------------------- parser/libapparmor_re/parse.h | 27 + parser/libapparmor_re/parse.y | 266 ++++ parser/libapparmor_re/regexp.h | 10 - 8 files changed, 1538 insertions(+), 1370 deletions(-) create mode 100644 parser/libapparmor_re/expr-tree.cc create mode 100644 parser/libapparmor_re/expr-tree.h rename parser/libapparmor_re/{regexp.y => hfa.cc} (60%) create mode 100644 parser/libapparmor_re/parse.h create mode 100644 parser/libapparmor_re/parse.y delete mode 100644 parser/libapparmor_re/regexp.h diff --git a/parser/libapparmor_re/Makefile b/parser/libapparmor_re/Makefile index 3409f9a5b..7006744de 100644 --- a/parser/libapparmor_re/Makefile +++ b/parser/libapparmor_re/Makefile @@ -12,14 +12,20 @@ BISON := bison all : ${TARGET} -libapparmor_re.a: regexp.o +libapparmor_re.a: parse.o expr-tree.o hfa.o ar ${ARFLAGS} $@ $^ -regexp.o : regexp.cc apparmor_re.h +expr-tree.o: expr-tree.cc expr-tree.h $(LINK.cc) $< -c -o $@ -regexp.cc : regexp.y flex-tables.h ../immunix.h +hfa.o: hfa.cc apparmor_re.h + $(LINK.cc) $< -c -o $@ + +parse.o : parse.cc apparmor_re.h expr-tree.h + $(LINK.cc) $< -c -o $@ + +parse.cc : parse.y flex-tables.h ../immunix.h ${BISON} -o $@ 
$< clean: - rm -f regexp.o regexp.cc regexp.so regexp.a regexp ${TARGET} + rm -f *.o parse.cc ${TARGET} diff --git a/parser/libapparmor_re/apparmor_re.h b/parser/libapparmor_re/apparmor_re.h index 2cbd0d6df..fed69be16 100644 --- a/parser/libapparmor_re/apparmor_re.h +++ b/parser/libapparmor_re/apparmor_re.h @@ -10,6 +10,8 @@ #ifndef APPARMOR_RE_H #define APPARMOR_RE_H +#include + typedef enum dfaflags { DFA_CONTROL_EQUIV = 1 << 0, DFA_CONTROL_TREE_NORMAL = 1 << 1, diff --git a/parser/libapparmor_re/expr-tree.cc b/parser/libapparmor_re/expr-tree.cc new file mode 100644 index 000000000..2d5ca7738 --- /dev/null +++ b/parser/libapparmor_re/expr-tree.cc @@ -0,0 +1,576 @@ +/* + * (C) 2006, 2007 Andreas Gruenbacher + * Copyright (c) 2003-2008 Novell, Inc. (All rights reserved) + * Copyright 2009-2010 Canonical Ltd. + * + * The libapparmor library is licensed under the terms of the GNU + * Lesser General Public License, version 2.1. Please see the file + * COPYING.LGPL. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see . + * + * + * Functions to create/manipulate an expression tree for regular expressions + * that have been parsed. + * + * The expression tree can be used directly after the parse creates it, or + * it can be factored so that the set of important nodes is smaller. + * Having a reduced set of important nodes generally results in a dfa that + * is closer to minimum (fewer redundant states are created). It also + * results in fewer important nodes in a the state set during subset + * construction resulting in less memory used to create a dfa. 
+ * + * Generally it is worth doing expression tree simplification before dfa + * construction, if the regular expression tree contains any alternations. + * Even if the regular expression doesn't simplification should be fast + * enough that it can be used with minimal overhead. + */ + +#include +#include + +#include "expr-tree.h" +#include "apparmor_re.h" + + +/* Use a single static EpsNode as it carries no node specific information */ +EpsNode epsnode; + + +ostream& operator<<(ostream& os, uchar c) +{ + const char *search = "\a\033\f\n\r\t|*+[](). ", + *replace = "aefnrt|*+[](). ", *s; + + if ((s = strchr(search, c)) && *s != '\0') + os << '\\' << replace[s - search]; + else if (c < 32 || c >= 127) + os << '\\' << '0' << char('0' + (c >> 6)) + << char('0' + ((c >> 3) & 7)) << char('0' + (c & 7)); + else + os << (char)c; + return os; +} + +/** + * Text-dump a state (for debugging). + */ +ostream& operator<<(ostream& os, const NodeSet& state) +{ + os << '{'; + if (!state.empty()) { + NodeSet::iterator i = state.begin(); + for(;;) { + os << (*i)->label; + if (++i == state.end()) + break; + os << ','; + } + } + os << '}'; + return os; +} + +ostream& operator<<(ostream& os, Node& node) +{ + node.dump(os); + return os; +} + +/** + * hash_NodeSet - generate a hash for the Nodes in the set + */ +unsigned long hash_NodeSet(NodeSet *ns) +{ + unsigned long hash = 5381; + + for (NodeSet::iterator i = ns->begin(); i != ns->end(); i++) { + hash = ((hash << 5) + hash) + (unsigned long) *i; + } + + return hash; +} + + +/** + * label_nodes - label the node positions for pretty-printing debug output + * + * TODO: separate - node labels should be separate and optional, if not + * present pretty printing should use Node address + */ +void label_nodes(Node *root) +{ + int nodes = 1; + for (depth_first_traversal i(root); i; i++) + i->label = nodes++; +} + +/** + * Text-dump the syntax tree (for debugging). 
+ */ +void Node::dump_syntax_tree(ostream& os) +{ + for (depth_first_traversal i(this); i; i++) { + os << i->label << '\t'; + if ((*i)->child[0] == 0) + os << **i << '\t' << (*i)->followpos << endl; + else { + if ((*i)->child[1] == 0) + os << (*i)->child[0]->label << **i; + else + os << (*i)->child[0]->label << **i + << (*i)->child[1]->label; + os << '\t' << (*i)->firstpos + << (*i)->lastpos << endl; + } + } + os << endl; +} + +/* + * Normalize the regex parse tree for factoring and cancellations. Normalization + * reorganizes internal (alt and cat) nodes into a fixed "normalized" form that + * simplifies factoring code, in that it produces a canonicalized form for + * the direction being normalized so that the factoring code does not have + * to consider as many cases. + * + * left normalization (dir == 0) uses these rules + * (E | a) -> (a | E) + * (a | b) | c -> a | (b | c) + * (ab)c -> a(bc) + * + * right normalization (dir == 1) uses the same rules but reversed + * (a | E) -> (E | a) + * a | (b | c) -> (a | b) | c + * a(bc) -> (ab)c + * + * Note: This is written iteratively for a given node (the top node stays + * fixed and the children are rotated) instead of recursively. + * For a given node under examination rotate over nodes from + * dir to !dir, until no dir direction node meets the criteria. + * Then recurse to the children (which will have a different node type) + * to make sure they are normalized. + * Normalization of a child node is guaranteed to not affect the + * normalization of the parent. + * + * For cat nodes the depth first traverse order is guaranteed to be + * maintained. This is not necessary for altnodes. + * + * Eg. 
For left normalization + * + * |1 |1 + * / \ / \ + * |2 T -> a |2 + * / \ / \ + * |3 c b |3 + * / \ / \ + * a b c T + * + */ +static void rotate_node(Node *t, int dir) { + // (a | b) | c -> a | (b | c) + // (ab)c -> a(bc) + Node *left = t->child[dir]; + t->child[dir] = left->child[dir]; + left->child[dir] = left->child[!dir]; + left->child[!dir] = t->child[!dir]; + t->child[!dir] = left; +} + +void normalize_tree(Node *t, int dir) +{ + if (dynamic_cast(t)) + return; + + for (;;) { + if ((&epsnode == t->child[dir]) && + (&epsnode != t->child[!dir]) && + dynamic_cast(t)) { + // (E | a) -> (a | E) + // Ea -> aE + Node *c = t->child[dir]; + t->child[dir] = t->child[!dir]; + t->child[!dir] = c; + // Don't break here as 'a' may be a tree that + // can be pulled up. + } else if ((dynamic_cast(t) && + dynamic_cast(t->child[dir])) || + (dynamic_cast(t) && + dynamic_cast(t->child[dir]))) { + // (a | b) | c -> a | (b | c) + // (ab)c -> a(bc) + rotate_node(t, dir); + } else if (dynamic_cast(t) && + dynamic_cast(t->child[dir]) && + dynamic_cast(t->child[!dir])) { + // [a] | b -> b | [a] + Node *c = t->child[dir]; + t->child[dir] = t->child[!dir]; + t->child[!dir] = c; + } else { + break; + } + } + if (t->child[dir]) + normalize_tree(t->child[dir], dir); + if (t->child[!dir]) + normalize_tree(t->child[!dir], dir); +} + +//charset conversion is disabled for now, +//it hinders tree optimization in some cases, so it need to be either +//done post optimization, or have extra factoring rules added +#if 0 +static Node *merge_charset(Node *a, Node *b) +{ + if (dynamic_cast(a) && + dynamic_cast(b)) { + Chars chars; + chars.insert(dynamic_cast(a)->c); + chars.insert(dynamic_cast(b)->c); + CharSetNode *n = new CharSetNode(chars); + return n; + } else if (dynamic_cast(a) && + dynamic_cast(b)) { + Chars *chars = &dynamic_cast(b)->chars; + chars->insert(dynamic_cast(a)->c); + return b; + } else if (dynamic_cast(a) && + dynamic_cast(b)) { + Chars *from = &dynamic_cast(a)->chars; + Chars *to = 
&dynamic_cast(b)->chars; + for (Chars::iterator i = from->begin(); i != from->end(); i++) + to->insert(*i); + return b; + } + + //return ???; +} + +static Node *alt_to_charsets(Node *t, int dir) +{ +/* + Node *first = NULL; + Node *p = t; + Node *i = t; + for (;dynamic_cast(i);) { + if (dynamic_cast(i->child[dir]) || + dynamic_cast(i->child[dir])) { + if (!first) { + first = i; + p = i; + i = i->child[!dir]; + } else { + first->child[dir] = merge_charset(first->child[dir], + i->child[dir]); + p->child[!dir] = i->child[!dir]; + Node *tmp = i; + i = tmp->child[!dir]; + tmp->child[!dir] = NULL; + tmp->release(); + } + } else { + p = i; + i = i->child[!dir]; + } + } + // last altnode of chain check other dir as well + if (first && (dynamic_cast(i) || + dynamic_cast(i))) { + + } +*/ + +/* + if (dynamic_cast(t->child[dir]) || + dynamic_cast(t->child[dir])) + char_test = true; + (char_test && + (dynamic_cast(i->child[dir]) || + dynamic_cast(i->child[dir])))) { +*/ + return t; +} +#endif + +static Node *basic_alt_factor(Node *t, int dir) +{ + if (!dynamic_cast(t)) + return t; + + if (t->child[dir]->eq(t->child[!dir])) { + // (a | a) -> a + Node *tmp = t->child[dir]; + t->child[dir] = NULL; + t->release(); + return tmp; + } + + // (ab) | (ac) -> a(b|c) + if (dynamic_cast(t->child[dir]) && + dynamic_cast(t->child[!dir]) && + t->child[dir]->child[dir]->eq(t->child[!dir]->child[dir])) { + // (ab) | (ac) -> a(b|c) + Node *left = t->child[dir]; + Node *right = t->child[!dir]; + t->child[dir] = left->child[!dir]; + t->child[!dir] = right->child[!dir]; + right->child[!dir] = NULL; + right->release(); + left->child[!dir] = t; + return left; + } + + // a | (ab) -> a (E | b) -> a (b | E) + if (dynamic_cast(t->child[!dir]) && + t->child[dir]->eq(t->child[!dir]->child[dir])) { + Node *c = t->child[!dir]; + t->child[dir]->release(); + t->child[dir] = c->child[!dir]; + t->child[!dir] = &epsnode; + c->child[!dir] = t; + return c; + } + + // ab | (a) -> a (b | E) + if 
(dynamic_cast(t->child[dir]) && + t->child[dir]->child[dir]->eq(t->child[!dir])) { + Node *c = t->child[dir]; + t->child[!dir]->release(); + t->child[dir] = c->child[!dir]; + t->child[!dir] = &epsnode; + c->child[!dir] = t; + return c; + } + + return t; +} + +static Node *basic_simplify(Node *t, int dir) +{ + if (dynamic_cast(t) && + &epsnode == t->child[!dir]) { + // aE -> a + Node *tmp = t->child[dir]; + t->child[dir] = NULL; + t->release(); + return tmp; + } + + return basic_alt_factor(t, dir); +} + +/* + * assumes a normalized tree. reductions shown for left normalization + * aE -> a + * (a | a) -> a + ** factoring patterns + * a | (a | b) -> (a | b) + * a | (ab) -> a (E | b) -> a (b | E) + * (ab) | (ac) -> a(b|c) + * + * returns t - if no simplifications were made + * a new root node - if simplifications were made + */ +Node *simplify_tree_base(Node *t, int dir, bool &mod) +{ + if (dynamic_cast(t)) + return t; + + for (int i=0; i < 2; i++) { + if (t->child[i]) { + Node *c = simplify_tree_base(t->child[i], dir, mod); + if (c != t->child[i]) { + t->child[i] = c; + mod = true; + } + } + } + + // only iterate on loop if modification made + for (;; mod = true) { + + Node *tmp = basic_simplify(t, dir); + if (tmp != t) { + t = tmp; + continue; + } + + + /* all tests after this must meet 2 alt node condition */ + if (!dynamic_cast(t) || + !dynamic_cast(t->child[!dir])) + break; + + // a | (a | b) -> (a | b) + // a | (b | (c | a)) -> (b | (c | a)) + Node *p = t; + Node *i = t->child[!dir]; + for (;dynamic_cast(i); p = i, i = i->child[!dir]) { + if (t->child[dir]->eq(i->child[dir])) { + Node *tmp = t->child[!dir]; + t->child[!dir] = NULL; + t->release(); + t = tmp; + continue; + } + } + // last altnode of chain check other dir as well + if (t->child[dir]->eq(p->child[!dir])) { + Node *tmp = t->child[!dir]; + t->child[!dir] = NULL; + t->release(); + t = tmp; + continue; + } + + //exact match didn't work, try factoring front + //a | (ac | (ad | () -> (a (E | c)) | (...) 
+ //ab | (ac | (...)) -> (a (b | c)) | (...) + //ab | (a | (...)) -> (a (b | E)) | (...) + Node *pp; + int count = 0; + Node *subject = t->child[dir]; + Node *a = subject; + if (dynamic_cast(subject)) + a = subject->child[dir]; + + for (pp = p = t, i = t->child[!dir]; + dynamic_cast(i); ) { + if ((dynamic_cast(i->child[dir]) && + a->eq(i->child[dir]->child[dir])) || + (a->eq(i->child[dir]))) { + // extract matching alt node + p->child[!dir] = i->child[!dir]; + i->child[!dir] = subject; + subject = basic_simplify(i, dir); + if (dynamic_cast(subject)) + a = subject->child[dir]; + else + a = subject; + + i = p->child[!dir]; + count++; + } else { + pp = p; p = i; i = i->child[!dir]; + } + } + + // last altnode in chain check other dir as well + if ((dynamic_cast(i) && + a->eq(i->child[dir])) || + (a->eq(i))) { + count++; + if (t == p) { + t->child[dir] = subject; + t = basic_simplify(t, dir); + } else { + t->child[dir] = p->child[dir]; + p->child[dir] = subject; + pp->child[!dir] = basic_simplify(p, dir); + } + } else { + t->child[dir] = i; + p->child[!dir] = subject; + } + + if (count == 0) + break; + } + return t; +} + +int debug_tree(Node *t) +{ + int nodes = 1; + + if (!dynamic_cast(t)) { + if (t->child[0]) + nodes += debug_tree(t->child[0]); + if (t->child[1]) + nodes += debug_tree(t->child[1]); + } + return nodes; +} + +static void count_tree_nodes(Node *t, struct node_counts *counts) +{ + if (dynamic_cast(t)) { + counts->alt++; + count_tree_nodes(t->child[0], counts); + count_tree_nodes(t->child[1], counts); + } else if (dynamic_cast(t)) { + counts->cat++; + count_tree_nodes(t->child[0], counts); + count_tree_nodes(t->child[1], counts); + } else if (dynamic_cast(t)) { + counts->plus++; + count_tree_nodes(t->child[0], counts); + } else if (dynamic_cast(t)) { + counts->star++; + count_tree_nodes(t->child[0], counts); + } else if (dynamic_cast(t)) { + counts->charnode++; + } else if (dynamic_cast(t)) { + counts->any++; + } else if (dynamic_cast(t)) { + 
counts->charset++; + } else if (dynamic_cast(t)) { + counts->notcharset++; + } +} + +#include "stdio.h" +#include "stdint.h" +#include "apparmor_re.h" + +Node *simplify_tree(Node *t, dfaflags_t flags) +{ + bool update; + + if (flags & DFA_DUMP_TREE_STATS) { + struct node_counts counts = { 0, 0, 0, 0, 0, 0, 0, 0 }; + count_tree_nodes(t, &counts); + fprintf(stderr, "expr tree: c %d, [] %d, [^] %d, | %d, + %d, * %d, . %d, cat %d\n", counts.charnode, counts.charset, counts.notcharset, counts.alt, counts.plus, counts.star, counts.any, counts.cat); + } + do { + update = false; + //default to right normalize first as this reduces the number + //of trailing nodes which might follow an internal * + //or **, which is where state explosion can happen + //eg. in one test this makes the difference between + // the dfa having about 7 thousands states, + // and it having about 1.25 million states + int dir = 1; + if (flags & DFA_CONTROL_TREE_LEFT) + dir = 0; + for (int count = 0; count < 2; count++) { + bool modified; + do { + modified = false; + if (flags & DFA_CONTROL_TREE_NORMAL) + normalize_tree(t, dir); + t = simplify_tree_base(t, dir, modified); + if (modified) + update = true; + } while (modified); + if (flags & DFA_CONTROL_TREE_LEFT) + dir++; + else + dir--; + } + } while(update); + if (flags & DFA_DUMP_TREE_STATS) { + struct node_counts counts = { 0, 0, 0, 0, 0, 0, 0, 0 }; + count_tree_nodes(t, &counts); + fprintf(stderr, "simplified expr tree: c %d, [] %d, [^] %d, | %d, + %d, * %d, . %d, cat %d\n", counts.charnode, counts.charset, counts.notcharset, counts.alt, counts.plus, counts.star, counts.any, counts.cat); + } + return t; +} + diff --git a/parser/libapparmor_re/expr-tree.h b/parser/libapparmor_re/expr-tree.h new file mode 100644 index 000000000..6a3ec3113 --- /dev/null +++ b/parser/libapparmor_re/expr-tree.h @@ -0,0 +1,627 @@ +/* + * (C) 2006, 2007 Andreas Gruenbacher + * Copyright (c) 2003-2008 Novell, Inc. 
(All rights reserved) + * Copyright 2009-2010 Canonical Ltd. + * + * The libapparmor library is licensed under the terms of the GNU + * Lesser General Public License, version 2.1. Please see the file + * COPYING.LGPL. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see . + * + * + * Functions to create/manipulate an expression tree for regular expressions + * that have been parsed. + * + * The expression tree can be used directly after the parse creates it, or + * it can be factored so that the set of important nodes is smaller. + * Having a reduced set of important nodes generally results in a dfa that + * is closer to minimum (fewer redundant states are created). It also + * results in fewer important nodes in a the state set during subset + * construction resulting in less memory used to create a dfa. + * + * Generally it is worth doing expression tree simplification before dfa + * construction, if the regular expression tree contains any alternations. + * Even if the regular expression doesn't simplification should be fast + * enough that it can be used with minimal overhead. + */ +#ifndef __LIBAA_RE_EXPR_H +#define __LIBAA_RE_EXPR_H + +#include +#include +#include +#include + +#include "apparmor_re.h" + +using namespace std; + +typedef unsigned char uchar; +typedef set Chars; + +ostream& operator<<(ostream& os, uchar c); + +/* Compute the union of two sets. */ +template +set operator+(const set& a, const set& b) +{ + set c(a); + c.insert(b.begin(), b.end()); + return c; +} + +/** + * When creating DFAs from regex trees, a DFA state is constructed from + * a set of important nodes in the syntax tree. 
This includes AcceptNodes, + * which indicate that when a match ends in a particular state, the + * regular expressions that the AcceptNode belongs to match. + */ +class Node; +class ImportantNode; +typedef set NodeSet; + +/** + * Text-dump a state (for debugging). + */ +ostream& operator<<(ostream& os, const NodeSet& state); + +/** + * Out-edges from a state to another: we store the follow-set of Nodes + * for each input character that is not a default match in + * cases (i.e., following a CharNode or CharSetNode), and default + * matches in otherwise as well as in all matching explicit cases + * (i.e., following an AnyCharNode or NotCharSetNode). This avoids + * enumerating all the explicit transitions for default matches. + */ +typedef struct NodeCases { + typedef map::iterator iterator; + iterator begin() { return cases.begin(); } + iterator end() { return cases.end(); } + + NodeCases() : otherwise(0) { } + map cases; + NodeSet *otherwise; +} NodeCases; + + +ostream& operator<<(ostream& os, Node& node); + +/* An abstract node in the syntax tree. */ +class Node { +public: + Node() : + nullable(false) { child[0] = child[1] = 0; } + Node(Node *left) : + nullable(false) { child[0] = left; child[1] = 0; } + Node(Node *left, Node *right) : + nullable(false) { child[0] = left; child[1] = right; } + virtual ~Node() + { + if (child[0]) + child[0]->release(); + if (child[1]) + child[1]->release(); + } + + /** + * See the "Dragon Book" for an explanation of nullable, firstpos, + * lastpos, and followpos. 
+ */ + virtual void compute_nullable() { } + virtual void compute_firstpos() = 0; + virtual void compute_lastpos() = 0; + virtual void compute_followpos() { } + virtual int eq(Node *other) = 0; + virtual ostream& dump(ostream& os) = 0; + void dump_syntax_tree(ostream& os); + + bool nullable; + NodeSet firstpos, lastpos, followpos; + /* child 0 is left, child 1 is right */ + Node *child[2]; + + unsigned int label; /* unique number for debug etc */ + /** + * We indirectly release Nodes through a virtual function because + * accept and Eps Nodes are shared, and must be treated specially. + * We could use full reference counting here but the indirect release + * is sufficient and has less overhead + */ + virtual void release(void) { delete this; } +}; + + +class InnerNode : public Node { +public: + InnerNode() : Node() { }; + InnerNode(Node *left) : Node(left) {}; + InnerNode(Node *left, Node *right) : Node(left, right) { }; +}; + +class OneChildNode : public InnerNode { +public: + OneChildNode(Node *left) : InnerNode(left) { }; +}; + +class TwoChildNode : public InnerNode { +public: + TwoChildNode(Node *left, Node *right) : InnerNode(left, right) { }; +}; + +class LeafNode : public Node { +public: + LeafNode() : Node() { }; +}; + +/* Match nothing (//). */ +class EpsNode : public LeafNode { +public: + EpsNode() : LeafNode() + { + nullable = true; + label = 0; + } + void release(void) + { + /* don't delete Eps nodes because there is a single static + * instance shared by all trees. Look for epsnode in the code + */ + } + + void compute_firstpos() { } + void compute_lastpos() { } + int eq(Node *other) + { + if (dynamic_cast(other)) + return 1; + return 0; + } + ostream& dump(ostream& os) + { + return os << "[]"; + } +}; + +/** + * Leaf nodes in the syntax tree are important to us: they describe the + * characters that the regular expression matches. We also consider + * AcceptNodes important: they indicate when a regular expression matches. 
+ */ +class ImportantNode : public LeafNode { +public: + ImportantNode() : LeafNode() { } + void compute_firstpos() + { + firstpos.insert(this); + } + void compute_lastpos() { + lastpos.insert(this); + } + virtual void follow(NodeCases& cases) = 0; +}; + +/* common base class for all the different classes that contain + * character information. + */ +class CNode : public ImportantNode { +public: + CNode() : ImportantNode() { } +}; + +/* Match one specific character (/c/). */ +class CharNode : public CNode { +public: + CharNode(uchar c) : c(c) { } + void follow(NodeCases& cases) + { + NodeSet **x = &cases.cases[c]; + if (!*x) { + if (cases.otherwise) + *x = new NodeSet(*cases.otherwise); + else + *x = new NodeSet; + } + (*x)->insert(followpos.begin(), followpos.end()); + } + int eq(Node *other) + { + CharNode *o = dynamic_cast(other); + if (o) { + return c == o->c; + } + return 0; + } + ostream& dump(ostream& os) + { + return os << c; + } + + uchar c; +}; + +/* Match a set of characters (/[abc]/). */ +class CharSetNode : public CNode { +public: + CharSetNode(Chars& chars) : chars(chars) { } + void follow(NodeCases& cases) + { + for (Chars::iterator i = chars.begin(); i != chars.end(); i++) { + NodeSet **x = &cases.cases[*i]; + if (!*x) { + if (cases.otherwise) + *x = new NodeSet(*cases.otherwise); + else + *x = new NodeSet; + } + (*x)->insert(followpos.begin(), followpos.end()); + } + } + int eq(Node *other) + { + CharSetNode *o = dynamic_cast(other); + if (!o || chars.size() != o->chars.size()) + return 0; + + for (Chars::iterator i = chars.begin(), j = o->chars.begin(); + i != chars.end() && j != o->chars.end(); + i++, j++) { + if (*i != *j) + return 0; + } + return 1; + } + ostream& dump(ostream& os) + { + os << '['; + for (Chars::iterator i = chars.begin(); i != chars.end(); i++) + os << *i; + return os << ']'; + } + + Chars chars; +}; + +/* Match all except one character (/[^abc]/). 
*/ +class NotCharSetNode : public CNode { +public: + NotCharSetNode(Chars& chars) : chars(chars) { } + void follow(NodeCases& cases) + { + if (!cases.otherwise) + cases.otherwise = new NodeSet; + for (Chars::iterator j = chars.begin(); j != chars.end(); j++) { + NodeSet **x = &cases.cases[*j]; + if (!*x) + *x = new NodeSet(*cases.otherwise); + } + /* Note: Add to the nonmatching characters after copying away + * the old otherwise state for the matching characters. + */ + cases.otherwise->insert(followpos.begin(), followpos.end()); + for (NodeCases::iterator i = cases.begin(); i != cases.end(); + i++) { + if (chars.find(i->first) == chars.end()) + i->second->insert(followpos.begin(), + followpos.end()); + } + } + int eq(Node *other) + { + NotCharSetNode *o = dynamic_cast(other); + if (!o || chars.size() != o->chars.size()) + return 0; + + for (Chars::iterator i = chars.begin(), j = o->chars.begin(); + i != chars.end() && j != o->chars.end(); + i++, j++) { + if (*i != *j) + return 0; + } + return 1; + } + ostream& dump(ostream& os) + { + os << "[^"; + for (Chars::iterator i = chars.begin(); i != chars.end(); i++) + os << *i; + return os << ']'; + } + + Chars chars; +}; + +/* Match any character (/./). */ +class AnyCharNode : public CNode { +public: + AnyCharNode() { } + void follow(NodeCases& cases) + { + if (!cases.otherwise) + cases.otherwise = new NodeSet; + cases.otherwise->insert(followpos.begin(), followpos.end()); + for (NodeCases::iterator i = cases.begin(); i != cases.end(); + i++) + i->second->insert(followpos.begin(), followpos.end()); + } + int eq(Node *other) + { + if (dynamic_cast(other)) + return 1; + return 0; + } + ostream& dump(ostream& os) { + return os << "."; + } +}; + +/** + * Indicate that a regular expression matches. An AcceptNode itself + * doesn't match anything, so it will never generate any transitions. 
+ */ +class AcceptNode : public ImportantNode { +public: + AcceptNode() {} + void release(void) + { + /* don't delete AcceptNode via release as they are shared, and + * will be deleted when the table they are stored in is deleted + */ + } + + void follow(NodeCases& cases __attribute__((unused))) + { + /* Nothing to follow. */ + } + + /* requires accept nodes to be common by pointer */ + int eq(Node *other) + { + if (dynamic_cast(other)) + return (this == other); + return 0; + } +}; + +/* Match a node zero or more times. (This is a unary operator.) */ +class StarNode : public OneChildNode { +public: + StarNode(Node *left) : OneChildNode(left) + { + nullable = true; + } + void compute_firstpos() + { + firstpos = child[0]->firstpos; + } + void compute_lastpos() + { + lastpos = child[0]->lastpos; + } + void compute_followpos() + { + NodeSet from = child[0]->lastpos, to = child[0]->firstpos; + for(NodeSet::iterator i = from.begin(); i != from.end(); i++) { + (*i)->followpos.insert(to.begin(), to.end()); + } + } + int eq(Node *other) { + if (dynamic_cast(other)) + return child[0]->eq(other->child[0]); + return 0; + } + ostream& dump(ostream& os) + { + os << '('; + child[0]->dump(os); + return os << ")*"; + } +}; + +/* Match a node one or more times. (This is a unary operator.) 
*/ +class PlusNode : public OneChildNode { +public: + PlusNode(Node *left) : OneChildNode(left) { } + void compute_nullable() + { + nullable = child[0]->nullable; + } + void compute_firstpos() + { + firstpos = child[0]->firstpos; + } + void compute_lastpos() + { + lastpos = child[0]->lastpos; + } + void compute_followpos() + { + NodeSet from = child[0]->lastpos, to = child[0]->firstpos; + for(NodeSet::iterator i = from.begin(); i != from.end(); i++) { + (*i)->followpos.insert(to.begin(), to.end()); + } + } + int eq(Node *other) + { + if (dynamic_cast(other)) + return child[0]->eq(other->child[0]); + return 0; + } + ostream& dump(ostream& os) + { + os << '('; + child[0]->dump(os); + return os << ")+"; + } +}; + +/* Match a pair of consecutive nodes. */ +class CatNode : public TwoChildNode { +public: + CatNode(Node *left, Node *right) : TwoChildNode(left, right) { } + void compute_nullable() + { + nullable = child[0]->nullable && child[1]->nullable; + } + void compute_firstpos() + { + if (child[0]->nullable) + firstpos = child[0]->firstpos + child[1]->firstpos; + else + firstpos = child[0]->firstpos; + } + void compute_lastpos() + { + if (child[1]->nullable) + lastpos = child[0]->lastpos + child[1]->lastpos; + else + lastpos = child[1]->lastpos; + } + void compute_followpos() + { + NodeSet from = child[0]->lastpos, to = child[1]->firstpos; + for(NodeSet::iterator i = from.begin(); i != from.end(); i++) { + (*i)->followpos.insert(to.begin(), to.end()); + } + } + int eq(Node *other) { + if (dynamic_cast(other)) { + if (!child[0]->eq(other->child[0])) + return 0; + return child[1]->eq(other->child[1]); + } + return 0; + } + ostream& dump(ostream& os) + { + child[0]->dump(os); + child[1]->dump(os); + return os; + } +}; + +/* Match one of two alternative nodes. 
*/ +class AltNode : public TwoChildNode { +public: + AltNode(Node *left, Node *right) : TwoChildNode(left, right) { } + void compute_nullable() + { + nullable = child[0]->nullable || child[1]->nullable; + } + void compute_lastpos() + { + lastpos = child[0]->lastpos + child[1]->lastpos; + } + void compute_firstpos() + { + firstpos = child[0]->firstpos + child[1]->firstpos; + } + int eq(Node *other) { + if (dynamic_cast(other)) { + if (!child[0]->eq(other->child[0])) + return 0; + return child[1]->eq(other->child[1]); + } + return 0; + } + ostream& dump(ostream& os) + { + os << '('; + child[0]->dump(os); + os << '|'; + child[1]->dump(os); + os << ')'; + return os; + } +}; + + +/* Traverse the syntax tree depth-first in an iterator-like manner. */ +class depth_first_traversal { + stack pos; + void push_left(Node *node) + { + pos.push(node); + + while (dynamic_cast(node)) { + pos.push(node->child[0]); + node = node->child[0]; + } + } + +public: + depth_first_traversal(Node *node) + { + push_left(node); + } + Node *operator*() + { + return pos.top(); + } + Node* operator->() + { + return pos.top(); + } + operator bool() + { + return !pos.empty(); + } + void operator++(int) + { + Node *last = pos.top(); + pos.pop(); + + if (!pos.empty()) { + /* no need to dynamic cast, as we just popped a node so + * the top node must be an inner node */ + InnerNode *node = (InnerNode *)(pos.top()); + if (node->child[1] && node->child[1] != last) { + push_left(node->child[1]); + } + } + } +}; + +struct node_counts { + int charnode; + int charset; + int notcharset; + int alt; + int plus; + int star; + int any; + int cat; +}; + +extern EpsNode epsnode; + +int debug_tree(Node *t); +Node *simplify_tree(Node *t, dfaflags_t flags); +void label_nodes(Node *root); +unsigned long hash_NodeSet(NodeSet *ns); + + +/* Comparison operator for sets of . 
+ * Compare set hashes, and if the sets have the same hash + * do compare pointer comparison on set of , the pointer comparison + * allows us to determine which Sets of we have seen already from + * new ones when constructing the DFA. + */ +struct deref_less_than { + bool operator()(pair const & lhs, + pair const & rhs) const + { + if (lhs.first == rhs.first) + return *(lhs.second) < *(rhs.second); + else + return lhs.first < rhs.first; + } +}; + +#endif /* __LIBAA_RE_EXPR */ diff --git a/parser/libapparmor_re/regexp.y b/parser/libapparmor_re/hfa.cc similarity index 60% rename from parser/libapparmor_re/regexp.y rename to parser/libapparmor_re/hfa.cc index aac11572f..c78bfce0d 100644 --- a/parser/libapparmor_re/regexp.y +++ b/parser/libapparmor_re/hfa.cc @@ -1,1110 +1,36 @@ /* - * regexp.y -- Regular Expression Matcher Generator * (C) 2006, 2007 Andreas Gruenbacher + * Copyright (c) 2003-2008 Novell, Inc. (All rights reserved) + * Copyright 2009-2010 Canonical Ltd. * - * Implementation based on the Lexical Analysis chapter of: + * The libapparmor library is licensed under the terms of the GNU + * Lesser General Public License, version 2.1. Please see the file + * COPYING.LGPL. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see . + * + * + * Base of implementation based on the Lexical Analysis chapter of: * Alfred V. Aho, Ravi Sethi, Jeffrey D. Ullman: * Compilers: Principles, Techniques, and Tools (The "Dragon Book"), * Addison-Wesley, 1986. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- * - * See http://www.gnu.org for more details. */ -%{ - /* #define DEBUG_TREE */ +#include +#include +#include +#include +#include +#include +#include +#include - #include - #include - #include - #include - #include - #include - #include - #include - - using namespace std; - - typedef unsigned char uchar; - typedef set Chars; - - ostream& operator<<(ostream& os, uchar c); - - /* Compute the union of two sets. */ - template - set operator+(const set& a, const set& b) - { - set c(a); - c.insert(b.begin(), b.end()); - return c; - } - - /** - * When creating DFAs from regex trees, a DFA state is constructed from - * a set of important nodes in the syntax tree. This includes AcceptNodes, - * which indicate that when a match ends in a particular state, the - * regular expressions that the AcceptNode belongs to match. - */ - class ImportantNode; - typedef set NodeSet; - - /** - * Out-edges from a state to another: we store the follow-set of Nodes - * for each input character that is not a default match in - * cases (i.e., following a CharNode or CharSetNode), and default - * matches in otherwise as well as in all matching explicit cases - * (i.e., following an AnyCharNode or NotCharSetNode). This avoids - * enumerating all the explicit tranitions for default matches. - */ - typedef struct NodeCases { - typedef map::iterator iterator; - iterator begin() { return cases.begin(); } - iterator end() { return cases.end(); } - - NodeCases() : otherwise(0) { } - map cases; - NodeSet *otherwise; - } NodeCases; - - - /* An abstract node in the syntax tree. 
*/ - class Node { - public: - Node() : - nullable(false) { child[0] = child[1] = 0; } - Node(Node *left) : - nullable(false) { child[0] = left; child[1] = 0; } - Node(Node *left, Node *right) : - nullable(false) { child[0] = left; child[1] = right; } - virtual ~Node() - { - if (child[0]) - child[0]->release(); - if (child[1]) - child[1]->release(); - } - - /** - * See the "Dragon Book" for an explanation of nullable, firstpos, - * lastpos, and followpos. - */ - virtual void compute_nullable() { } - virtual void compute_firstpos() = 0; - virtual void compute_lastpos() = 0; - virtual void compute_followpos() { } - virtual int eq(Node *other) = 0; - virtual ostream& dump(ostream& os) = 0; - - bool nullable; - NodeSet firstpos, lastpos, followpos; - /* child 0 is left, child 1 is right */ - Node *child[2]; - - unsigned int label; /* unique number for debug etc */ - /** - * We indirectly release Nodes through a virtual function because - * accept and Eps Nodes are shared, and must be treated specially. - * We could use full reference counting here but the indirect release - * is sufficient and has less overhead - */ - virtual void release(void) { - delete this; - } - }; - - class InnerNode : public Node { - public: - InnerNode() : Node() { }; - InnerNode(Node *left) : Node(left) {}; - InnerNode(Node *left, Node *right) : Node(left, right) { }; - }; - - class OneChildNode : public InnerNode { - public: - OneChildNode(Node *left) : InnerNode(left) { }; - }; - - class TwoChildNode : public InnerNode { - public: - TwoChildNode(Node *left, Node *right) : InnerNode(left, right) { }; - }; - - class LeafNode : public Node { - public: - LeafNode() : Node() { }; - - }; - - /* Match nothing (//). */ - class EpsNode : public LeafNode { - public: - EpsNode() : LeafNode() - { - nullable = true; - label = 0; - } - void release(void) - { - /* don't delete Eps nodes because there is a single static instance - * shared by all trees. 
Look for epsnode in the code - */ - } - - void compute_firstpos() - { - } - void compute_lastpos() - { - } - int eq(Node *other) { - if (dynamic_cast(other)) - return 1; - return 0; - } - ostream& dump(ostream& os) - { - return os << "[]"; - } - }; - - /** - * Leaf nodes in the syntax tree are important to us: they describe the - * characters that the regular expression matches. We also consider - * AcceptNodes import: they indicate when a regular expression matches. - */ - class ImportantNode : public LeafNode { - public: - ImportantNode() : LeafNode() { } - void compute_firstpos() - { - firstpos.insert(this); - } - void compute_lastpos() { - lastpos.insert(this); - } - virtual void follow(NodeCases& cases) = 0; - }; - - /* common base class for all the different classes that contain - * character information. - */ - class CNode : public ImportantNode { - public: - CNode() : ImportantNode() { } - - }; - - /* Match one specific character (/c/). */ - class CharNode : public CNode { - public: - CharNode(uchar c) : c(c) { } - void follow(NodeCases& cases) - { - NodeSet **x = &cases.cases[c]; - if (!*x) { - if (cases.otherwise) - *x = new NodeSet(*cases.otherwise); - else - *x = new NodeSet; - } - (*x)->insert(followpos.begin(), followpos.end()); - } - int eq(Node *other) { - CharNode *o = dynamic_cast(other); - if (o) { - return c == o->c; - } - return 0; - } - ostream& dump(ostream& os) - { - return os << c; - } - - uchar c; - }; - - /* Match a set of characters (/[abc]/). 
*/ - class CharSetNode : public CNode { - public: - CharSetNode(Chars& chars) : chars(chars) { } - void follow(NodeCases& cases) - { - for (Chars::iterator i = chars.begin(); i != chars.end(); i++) { - NodeSet **x = &cases.cases[*i]; - if (!*x) { - if (cases.otherwise) - *x = new NodeSet(*cases.otherwise); - else - *x = new NodeSet; - } - (*x)->insert(followpos.begin(), followpos.end()); - } - } - int eq(Node *other) { - CharSetNode *o = dynamic_cast(other); - if (!o || chars.size() != o->chars.size()) - return 0; - - for (Chars::iterator i = chars.begin(), j = o->chars.begin(); - i != chars.end() && j != o->chars.end(); - i++, j++) { - if (*i != *j) - return 0; - } - return 1; - } - ostream& dump(ostream& os) - { - os << '['; - for (Chars::iterator i = chars.begin(); i != chars.end(); i++) - os << *i; - return os << ']'; - } - - Chars chars; - }; - - /* Match all except one character (/[^abc]/). */ - class NotCharSetNode : public CNode { - public: - NotCharSetNode(Chars& chars) : chars(chars) { } - void follow(NodeCases& cases) - { - if (!cases.otherwise) - cases.otherwise = new NodeSet; - for (Chars::iterator j = chars.begin(); j != chars.end(); j++) { - NodeSet **x = &cases.cases[*j]; - if (!*x) - *x = new NodeSet(*cases.otherwise); - } - /** - * Note: Add to the nonmatching characters after copying away the - * old otherwise state for the matching characters. 
- */ - cases.otherwise->insert(followpos.begin(), followpos.end()); - for (NodeCases::iterator i = cases.begin(); i != cases.end(); i++) { - if (chars.find(i->first) == chars.end()) - i->second->insert(followpos.begin(), followpos.end()); - } - } - int eq(Node *other) { - NotCharSetNode *o = dynamic_cast(other); - if (!o || chars.size() != o->chars.size()) - return 0; - - for (Chars::iterator i = chars.begin(), j = o->chars.begin(); - i != chars.end() && j != o->chars.end(); - i++, j++) { - if (*i != *j) - return 0; - } - return 1; - } - ostream& dump(ostream& os) - { - os << "[^"; - for (Chars::iterator i = chars.begin(); i != chars.end(); i++) - os << *i; - return os << ']'; - } - - Chars chars; - }; - - /* Match any character (/./). */ - class AnyCharNode : public CNode { - public: - AnyCharNode() { } - void follow(NodeCases& cases) - { - if (!cases.otherwise) - cases.otherwise = new NodeSet; - cases.otherwise->insert(followpos.begin(), followpos.end()); - for (NodeCases::iterator i = cases.begin(); i != cases.end(); i++) - i->second->insert(followpos.begin(), followpos.end()); - } - int eq(Node *other) { - if (dynamic_cast(other)) - return 1; - return 0; - } - ostream& dump(ostream& os) { - return os << "."; - } - }; - - /** - * Indicate that a regular expression matches. An AcceptNode itself - * doesn't match anything, so it will never generate any transitions. - */ - class AcceptNode : public ImportantNode { - public: - AcceptNode() {} - void release(void) - { - /* don't delete AcceptNode via release as they are shared, - * and will be deleted when the table the are stored in is deleted - */ - } - - void follow(NodeCases& cases __attribute__((unused))) - { - /* Nothing to follow. */ - } - /* requires accept nodes to be common by pointer */ - int eq(Node *other) { - if (dynamic_cast(other)) - return (this == other); - return 0; - } - }; - - /* Match a node zero or more times. (This is a unary operator.) 
*/ - class StarNode : public OneChildNode { - public: - StarNode(Node *left) : - OneChildNode(left) - { - nullable = true; - } - void compute_firstpos() - { - firstpos = child[0]->firstpos; - } - void compute_lastpos() - { - lastpos = child[0]->lastpos; - } - void compute_followpos() - { - NodeSet from = child[0]->lastpos, to = child[0]->firstpos; - for(NodeSet::iterator i = from.begin(); i != from.end(); i++) { - (*i)->followpos.insert(to.begin(), to.end()); - } - } - int eq(Node *other) { - if (dynamic_cast(other)) - return child[0]->eq(other->child[0]); - return 0; - } - ostream& dump(ostream& os) - { - os << '('; - child[0]->dump(os); - return os << ")*"; - } - }; - - /* Match a node one or more times. (This is a unary operator.) */ - class PlusNode : public OneChildNode { - public: - PlusNode(Node *left) : - OneChildNode(left) { } - void compute_nullable() - { - nullable = child[0]->nullable; - } - void compute_firstpos() - { - firstpos = child[0]->firstpos; - } - void compute_lastpos() - { - lastpos = child[0]->lastpos; - } - void compute_followpos() - { - NodeSet from = child[0]->lastpos, to = child[0]->firstpos; - for(NodeSet::iterator i = from.begin(); i != from.end(); i++) { - (*i)->followpos.insert(to.begin(), to.end()); - } - } - int eq(Node *other) { - if (dynamic_cast(other)) - return child[0]->eq(other->child[0]); - return 0; - } - ostream& dump(ostream& os) - { - os << '('; - child[0]->dump(os); - return os << ")+"; - } - }; - - /* Match a pair of consecutive nodes. 
*/ - class CatNode : public TwoChildNode { - public: - CatNode(Node *left, Node *right) : - TwoChildNode(left, right) { } - void compute_nullable() - { - nullable = child[0]->nullable && child[1]->nullable; - } - void compute_firstpos() - { - if (child[0]->nullable) - firstpos = child[0]->firstpos + child[1]->firstpos; - else - firstpos = child[0]->firstpos; - } - void compute_lastpos() - { - if (child[1]->nullable) - lastpos = child[0]->lastpos + child[1]->lastpos; - else - lastpos = child[1]->lastpos; - } - void compute_followpos() - { - NodeSet from = child[0]->lastpos, to = child[1]->firstpos; - for(NodeSet::iterator i = from.begin(); i != from.end(); i++) { - (*i)->followpos.insert(to.begin(), to.end()); - } - } - int eq(Node *other) { - if (dynamic_cast(other)) { - if (!child[0]->eq(other->child[0])) - return 0; - return child[1]->eq(other->child[1]); - } - return 0; - } - ostream& dump(ostream& os) - { - child[0]->dump(os); - child[1]->dump(os); - return os; - //return os << ' '; - } - }; - - /* Match one of two alternative nodes. */ - class AltNode : public TwoChildNode { - public: - AltNode(Node *left, Node *right) : - TwoChildNode(left, right) { } - void compute_nullable() - { - nullable = child[0]->nullable || child[1]->nullable; - } - void compute_lastpos() - { - lastpos = child[0]->lastpos + child[1]->lastpos; - } - void compute_firstpos() - { - firstpos = child[0]->firstpos + child[1]->firstpos; - } - int eq(Node *other) { - if (dynamic_cast(other)) { - if (!child[0]->eq(other->child[0])) - return 0; - return child[1]->eq(other->child[1]); - } - return 0; - } - ostream& dump(ostream& os) - { - os << '('; - child[0]->dump(os); - os << '|'; - child[1]->dump(os); - os << ')'; - return os; - } - }; - -/* Use a single static EpsNode as it carries no node specific information */ -static EpsNode epsnode; - -/* - * Normalize the regex parse tree for factoring and cancelations. 
Normalization - * reorganizes internal (alt and cat) nodes into a fixed "normalized" form that - * simplifies factoring code, in that it produces a canonicalized form for - * the direction being normalized so that the factoring code does not have - * to consider as many cases. - * - * left normalization (dir == 0) uses these rules - * (E | a) -> (a | E) - * (a | b) | c -> a | (b | c) - * (ab)c -> a(bc) - * - * right normalization (dir == 1) uses the same rules but reversed - * (a | E) -> (E | a) - * a | (b | c) -> (a | b) | c - * a(bc) -> (ab)c - * - * Note: This is written iteratively for a given node (the top node stays - * fixed and the children are rotated) instead of recursively. - * For a given node under examination rotate over nodes from - * dir to !dir. Until no dir direction node meets the criterial. - * Then recurse to the children (which will have a different node type) - * to make sure they are normalized. - * Normalization of a child node is guarenteed to not affect the - * normalization of the parent. - * - * For cat nodes the depth first traverse order is guarenteed to be - * maintained. This is not necessary for altnodes. - * - * Eg. For left normalization - * - * |1 |1 - * / \ / \ - * |2 T -> a |2 - * / \ / \ - * |3 c b |3 - * / \ / \ - * a b c T - * - */ -static void rotate_node(Node *t, int dir) { - // (a | b) | c -> a | (b | c) - // (ab)c -> a(bc) - Node *left = t->child[dir]; - t->child[dir] = left->child[dir]; - left->child[dir] = left->child[!dir]; - left->child[!dir] = t->child[!dir]; - t->child[!dir] = left; -} - -void normalize_tree(Node *t, int dir) -{ - if (dynamic_cast(t)) - return; - - for (;;) { - if ((&epsnode == t->child[dir]) && - (&epsnode != t->child[!dir]) && - dynamic_cast(t)) { - // (E | a) -> (a | E) - // Ea -> aE - Node *c = t->child[dir]; - t->child[dir] = t->child[!dir]; - t->child[!dir] = c; - // Don't break here as 'a' may be a tree that - // can be pulled up. 
- } else if ((dynamic_cast(t) && - dynamic_cast(t->child[dir])) || - (dynamic_cast(t) && - dynamic_cast(t->child[dir]))) { - // (a | b) | c -> a | (b | c) - // (ab)c -> a(bc) - rotate_node(t, dir); - } else if (dynamic_cast(t) && - dynamic_cast(t->child[dir]) && - dynamic_cast(t->child[!dir])) { - // [a] | b -> b | [a] - Node *c = t->child[dir]; - t->child[dir] = t->child[!dir]; - t->child[!dir] = c; - } else { - break; - } - } - if (t->child[dir]) - normalize_tree(t->child[dir], dir); - if (t->child[!dir]) - normalize_tree(t->child[!dir], dir); -} - -//charset conversion is disabled for now, -//it hinders tree optimization in some cases, so it need to be either -//done post optimization, or have extra factoring rules added -#if 0 -static Node *merge_charset(Node *a, Node *b) -{ - if (dynamic_cast(a) && - dynamic_cast(b)) { - Chars chars; - chars.insert(dynamic_cast(a)->c); - chars.insert(dynamic_cast(b)->c); - CharSetNode *n = new CharSetNode(chars); - return n; - } else if (dynamic_cast(a) && - dynamic_cast(b)) { - Chars *chars = &dynamic_cast(b)->chars; - chars->insert(dynamic_cast(a)->c); - return b; - } else if (dynamic_cast(a) && - dynamic_cast(b)) { - Chars *from = &dynamic_cast(a)->chars; - Chars *to = &dynamic_cast(b)->chars; - for (Chars::iterator i = from->begin(); i != from->end(); i++) - to->insert(*i); - return b; - } - - //return ???; -} - -static Node *alt_to_charsets(Node *t, int dir) -{ -/* - Node *first = NULL; - Node *p = t; - Node *i = t; - for (;dynamic_cast(i);) { - if (dynamic_cast(i->child[dir]) || - dynamic_cast(i->child[dir])) { - if (!first) { - first = i; - p = i; - i = i->child[!dir]; - } else { - first->child[dir] = merge_charset(first->child[dir], - i->child[dir]); - p->child[!dir] = i->child[!dir]; - Node *tmp = i; - i = tmp->child[!dir]; - tmp->child[!dir] = NULL; - tmp->release(); - } - } else { - p = i; - i = i->child[!dir]; - } - } - // last altnode of chain check other dir as well - if (first && (dynamic_cast(i) || - 
dynamic_cast(i))) { - - } -*/ - -/* - if (dynamic_cast(t->child[dir]) || - dynamic_cast(t->child[dir])) - char_test = true; - (char_test && - (dynamic_cast(i->child[dir]) || - dynamic_cast(i->child[dir])))) { -*/ - return t; -} -#endif - -static Node *basic_alt_factor(Node *t, int dir) -{ - if (!dynamic_cast(t)) - return t; - - if (t->child[dir]->eq(t->child[!dir])) { - // (a | a) -> a - Node *tmp = t->child[dir]; - t->child[dir] = NULL; - t->release(); - return tmp; - } - - // (ab) | (ac) -> a(b|c) - if (dynamic_cast(t->child[dir]) && - dynamic_cast(t->child[!dir]) && - t->child[dir]->child[dir]->eq(t->child[!dir]->child[dir])) { - // (ab) | (ac) -> a(b|c) - Node *left = t->child[dir]; - Node *right = t->child[!dir]; - t->child[dir] = left->child[!dir]; - t->child[!dir] = right->child[!dir]; - right->child[!dir] = NULL; - right->release(); - left->child[!dir] = t; - return left; - } - - // a | (ab) -> a (E | b) -> a (b | E) - if (dynamic_cast(t->child[!dir]) && - t->child[dir]->eq(t->child[!dir]->child[dir])) { - Node *c = t->child[!dir]; - t->child[dir]->release(); - t->child[dir] = c->child[!dir]; - t->child[!dir] = &epsnode; - c->child[!dir] = t; - return c; - } - - // ab | (a) -> a (b | E) - if (dynamic_cast(t->child[dir]) && - t->child[dir]->child[dir]->eq(t->child[!dir])) { - Node *c = t->child[dir]; - t->child[!dir]->release(); - t->child[dir] = c->child[!dir]; - t->child[!dir] = &epsnode; - c->child[!dir] = t; - return c; - } - - return t; -} - -static Node *basic_simplify(Node *t, int dir) -{ - if (dynamic_cast(t) && - &epsnode == t->child[!dir]) { - // aE -> a - Node *tmp = t->child[dir]; - t->child[dir] = NULL; - t->release(); - return tmp; - } - - return basic_alt_factor(t, dir); -} - -/* - * assumes a normalized tree. 
reductions shown for left normalization - * aE -> a - * (a | a) -> a - ** factoring patterns - * a | (a | b) -> (a | b) - * a | (ab) -> a (E | b) -> a (b | E) - * (ab) | (ac) -> a(b|c) - * - * returns t - if no simplifications were made - * a new root node - if simplifications were made - */ -Node *simplify_tree_base(Node *t, int dir, bool &mod) -{ - if (dynamic_cast(t)) - return t; - - for (int i=0; i < 2; i++) { - if (t->child[i]) { - Node *c = simplify_tree_base(t->child[i], dir, mod); - if (c != t->child[i]) { - t->child[i] = c; - mod = true; - } - } - } - - // only iterate on loop if modification made - for (;; mod = true) { - - Node *tmp = basic_simplify(t, dir); - if (tmp != t) { - t = tmp; - continue; - } - - - /* all tests after this must meet 2 alt node condition */ - if (!dynamic_cast(t) || - !dynamic_cast(t->child[!dir])) - break; - - // a | (a | b) -> (a | b) - // a | (b | (c | a)) -> (b | (c | a)) - Node *p = t; - Node *i = t->child[!dir]; - for (;dynamic_cast(i); p = i, i = i->child[!dir]) { - if (t->child[dir]->eq(i->child[dir])) { - Node *tmp = t->child[!dir]; - t->child[!dir] = NULL; - t->release(); - t = tmp; - continue; - } - } - // last altnode of chain check other dir as well - if (t->child[dir]->eq(p->child[!dir])) { - Node *tmp = t->child[!dir]; - t->child[!dir] = NULL; - t->release(); - t = tmp; - continue; - } - - //exact match didn't work, try factoring front - //a | (ac | (ad | () -> (a (E | c)) | (...) - //ab | (ac | (...)) -> (a (b | c)) | (...) - //ab | (a | (...)) -> (a (b | E)) | (...) 
- Node *pp; - int count = 0; - Node *subject = t->child[dir]; - Node *a = subject; - if (dynamic_cast(subject)) - a = subject->child[dir]; - - for (pp = p = t, i = t->child[!dir]; - dynamic_cast(i); ) { - if ((dynamic_cast(i->child[dir]) && - a->eq(i->child[dir]->child[dir])) || - (a->eq(i->child[dir]))) { - // extract matching alt node - p->child[!dir] = i->child[!dir]; - i->child[!dir] = subject; - subject = basic_simplify(i, dir); - if (dynamic_cast(subject)) - a = subject->child[dir]; - else - a = subject; - - i = p->child[!dir]; - count++; - } else { - pp = p; p = i; i = i->child[!dir]; - } - } - - // last altnode in chain check other dir as well - if ((dynamic_cast(i) && - a->eq(i->child[dir])) || - (a->eq(i))) { - count++; - if (t == p) { - t->child[dir] = subject; - t = basic_simplify(t, dir); - } else { - t->child[dir] = p->child[dir]; - p->child[dir] = subject; - pp->child[!dir] = basic_simplify(p, dir); - } - } else { - t->child[dir] = i; - p->child[!dir] = subject; - } - - if (count == 0) - break; - } - return t; -} - -int debug_tree(Node *t) -{ - int nodes = 1; - - if (!dynamic_cast(t)) { - if (t->child[0]) - nodes += debug_tree(t->child[0]); - if (t->child[1]) - nodes += debug_tree(t->child[1]); - } - return nodes; -} - -struct node_counts { - int charnode; - int charset; - int notcharset; - int alt; - int plus; - int star; - int any; - int cat; -}; - - -static void count_tree_nodes(Node *t, struct node_counts *counts) -{ - if (dynamic_cast(t)) { - counts->alt++; - count_tree_nodes(t->child[0], counts); - count_tree_nodes(t->child[1], counts); - } else if (dynamic_cast(t)) { - counts->cat++; - count_tree_nodes(t->child[0], counts); - count_tree_nodes(t->child[1], counts); - } else if (dynamic_cast(t)) { - counts->plus++; - count_tree_nodes(t->child[0], counts); - } else if (dynamic_cast(t)) { - counts->star++; - count_tree_nodes(t->child[0], counts); - } else if (dynamic_cast(t)) { - counts->charnode++; - } else if (dynamic_cast(t)) { - counts->any++; 
- } else if (dynamic_cast(t)) { - counts->charset++; - } else if (dynamic_cast(t)) { - counts->notcharset++; - } -} - -#include "stdio.h" -#include "stdint.h" -#include "apparmor_re.h" - -Node *simplify_tree(Node *t, dfaflags_t flags) -{ - bool update; - - if (flags & DFA_DUMP_TREE_STATS) { - struct node_counts counts = { 0, 0, 0, 0, 0, 0, 0, 0 }; - count_tree_nodes(t, &counts); - fprintf(stderr, "expr tree: c %d, [] %d, [^] %d, | %d, + %d, * %d, . %d, cat %d\n", counts.charnode, counts.charset, counts.notcharset, counts.alt, counts.plus, counts.star, counts.any, counts.cat); - } - do { - update = false; - //default to right normalize first as this reduces the number - //of trailing nodes which might follow an internal * - //or **, which is where state explosion can happen - //eg. in one test this makes the difference between - // the dfa having about 7 thousands states, - // and it having about 1.25 million states - int dir = 1; - if (flags & DFA_CONTROL_TREE_LEFT) - dir = 0; - for (int count = 0; count < 2; count++) { - bool modified; - do { - modified = false; - if (flags & DFA_CONTROL_TREE_NORMAL) - normalize_tree(t, dir); - t = simplify_tree_base(t, dir, modified); - if (modified) - update = true; - } while (modified); - if (flags & DFA_CONTROL_TREE_LEFT) - dir++; - else - dir--; - } - } while(update); - if (flags & DFA_DUMP_TREE_STATS) { - struct node_counts counts = { 0, 0, 0, 0, 0, 0, 0, 0 }; - count_tree_nodes(t, &counts); - fprintf(stderr, "simplified expr tree: c %d, [] %d, [^] %d, | %d, + %d, * %d, . 
%d, cat %d\n", counts.charnode, counts.charset, counts.notcharset, counts.alt, counts.plus, counts.star, counts.any, counts.cat); - } - return t; -} - - -%} - -%union { - char c; - Node *node; - Chars *cset; -} - -%{ - void regexp_error(Node **, const char *, const char *); -# define YYLEX_PARAM &text - int regexp_lex(YYSTYPE *, const char **); - - static inline Chars* - insert_char(Chars* cset, uchar a) - { - cset->insert(a); - return cset; - } - - static inline Chars* - insert_char_range(Chars* cset, uchar a, uchar b) - { - if (a > b) - swap(a, b); - for (uchar i = a; i <= b; i++) - cset->insert(i); - return cset; - } -%} - -%pure-parser -/* %error-verbose */ -%parse-param {Node **root} -%parse-param {const char *text} -%name-prefix = "regexp_" - -%token CHAR -%type regex_char cset_char1 cset_char cset_charN -%type charset cset_chars -%type regexp expr terms0 terms qterm term - -/** - * Note: destroy all nodes upon failure, but *not* the start symbol once - * parsing succeeds! - */ -%destructor { $$->release(); } expr terms0 terms qterm term - -%% - -/* FIXME: Does not parse "[--]", "[---]", "[^^-x]". I don't actually know - which precise grammer Perl regexps use, and rediscovering that - is proving to be painful. */ - -regexp : /* empty */ { *root = $$ = &epsnode; } - | expr { *root = $$ = $1; } - ; - -expr : terms - | expr '|' terms0 { $$ = new AltNode($1, $3); } - | '|' terms0 { $$ = new AltNode(&epsnode, $2); } - ; - -terms0 : /* empty */ { $$ = &epsnode; } - | terms - ; - -terms : qterm - | terms qterm { $$ = new CatNode($1, $2); } - ; - -qterm : term - | term '*' { $$ = new StarNode($1); } - | term '+' { $$ = new PlusNode($1); } - ; - -term : '.' 
{ $$ = new AnyCharNode; } - | regex_char { $$ = new CharNode($1); } - | '[' charset ']' { $$ = new CharSetNode(*$2); - delete $2; } - | '[' '^' charset ']' - { $$ = new NotCharSetNode(*$3); - delete $3; } - | '[' '^' '^' cset_chars ']' - { $4->insert('^'); - $$ = new NotCharSetNode(*$4); - delete $4; } - | '(' regexp ')' { $$ = $2; } - ; - -regex_char : CHAR - | '^' { $$ = '^'; } - | '-' { $$ = '-'; } - | ']' { $$ = ']'; } - ; - -charset : cset_char1 cset_chars - { $$ = insert_char($2, $1); } - | cset_char1 '-' cset_charN cset_chars - { $$ = insert_char_range($4, $1, $3); } - ; - -cset_chars : /* nothing */ { $$ = new Chars; } - | cset_chars cset_charN - { $$ = insert_char($1, $2); } - | cset_chars cset_charN '-' cset_charN - { $$ = insert_char_range($1, $2, $4); } - ; - -cset_char1 : cset_char - | ']' { $$ = ']'; } - | '-' { $$ = '-'; } - ; - -cset_charN : cset_char - | '^' { $$ = '^'; } - ; - -cset_char : CHAR - | '[' { $$ = '['; } - | '*' { $$ = '*'; } - | '+' { $$ = '+'; } - | '.' { $$ = '.'; } - | '|' { $$ = '|'; } - | '(' { $$ = '('; } - | ')' { $$ = ')'; } - ; - -%% #include #include @@ -1114,264 +40,12 @@ cset_char : CHAR #include #include +#include "expr-tree.h" +#include "parse.h" #include "../immunix.h" -/* Traverse the syntax tree depth-first in an iterator-like manner. 
*/ -class depth_first_traversal { - stack pos; - void push_left(Node *node) - { - pos.push(node); - while (dynamic_cast(node)) { - pos.push(node->child[0]); - node = node->child[0]; - } - } -public: - depth_first_traversal(Node *node) { - push_left(node); - } - Node *operator*() - { - return pos.top(); - } - Node* operator->() - { - return pos.top(); - } - operator bool() - { - return !pos.empty(); - } - void operator++(int) - { - Node *last = pos.top(); - pos.pop(); - - if (!pos.empty()) { - /* no need to dynamic cast, as we just popped a node so the top node - * must be an inner node */ - InnerNode *node = (InnerNode *)(pos.top()); - - if (node->child[1] && node->child[1] != last) { - push_left(node->child[1]); - } - } - } -}; - -ostream& operator<<(ostream& os, Node& node) -{ - node.dump(os); - return os; -} - -ostream& operator<<(ostream& os, uchar c) -{ - const char *search = "\a\033\f\n\r\t|*+[](). ", - *replace = "aefnrt|*+[](). ", *s; - - if ((s = strchr(search, c)) && *s != '\0') - os << '\\' << replace[s - search]; - else if (c < 32 || c >= 127) - os << '\\' << '0' << char('0' + (c >> 6)) - << char('0' + ((c >> 3) & 7)) << char('0' + (c & 7)); - else - os << (char)c; - return os; -} - -int -octdigit(char c) -{ - if (c >= '0' && c <= '7') - return c - '0'; - return -1; -} - -int -hexdigit(char c) -{ - if (c >= '0' && c <= '9') - return c - '0'; - else if (c >= 'A' && c <= 'F') - return 10 + c - 'A'; - else if (c >= 'a' && c <= 'f') - return 10 + c - 'A'; - else - return -1; -} - -int -regexp_lex(YYSTYPE *val, const char **pos) -{ - int c; - - val->c = **pos; - switch(*(*pos)++) { - case '\0': - (*pos)--; - return 0; - - case '*': case '+': case '.': case '|': case '^': case '-': - case '[': case ']': case '(' : case ')': - return *(*pos - 1); - - case '\\': - val->c = **pos; - switch(*(*pos)++) { - case '\0': - (*pos)--; - /* fall through */ - case '\\': - val->c = '\\'; - break; - - case '0': - val->c = 0; - if ((c = octdigit(**pos)) >= 0) { - val->c = c; 
- (*pos)++; - } - if ((c = octdigit(**pos)) >= 0) { - val->c = (val->c << 3) + c; - (*pos)++; - } - if ((c = octdigit(**pos)) >= 0) { - val->c = (val->c << 3) + c; - (*pos)++; - } - break; - - case 'x': - val->c = 0; - if ((c = hexdigit(**pos)) >= 0) { - val->c = c; - (*pos)++; - } - if ((c = hexdigit(**pos)) >= 0) { - val->c = (val->c << 4) + c; - (*pos)++; - } - break; - - case 'a': - val->c = '\a'; - break; - - case 'e': - val->c = 033 /* ESC */; - break; - - case 'f': - val->c = '\f'; - break; - - case 'n': - val->c = '\n'; - break; - - case 'r': - val->c = '\r'; - break; - - case 't': - val->c = '\t'; - break; - } - } - return CHAR; -} - -void -regexp_error(Node ** __attribute__((unused)), - const char *text __attribute__((unused)), - const char *error __attribute__((unused))) -{ - /* We don't want the library to print error messages. */ -} - -/** - * Assign a consecutive number to each node. This is only needed for - * pretty-printing the debug output. - * - * The epsnode is labeled 0. Start labeling at 1 - */ -void label_nodes(Node *root) -{ - int nodes = 1; - for (depth_first_traversal i(root); i; i++) - i->label = nodes++; -} - -/** - * Text-dump a state (for debugging). - */ -ostream& operator<<(ostream& os, const NodeSet& state) -{ - os << '{'; - if (!state.empty()) { - NodeSet::iterator i = state.begin(); - for(;;) { - os << (*i)->label; - if (++i == state.end()) - break; - os << ','; - } - } - os << '}'; - return os; -} - -/** - * Text-dump the syntax tree (for debugging). - */ -void dump_syntax_tree(ostream& os, Node *node) { - for (depth_first_traversal i(node); i; i++) { - os << i->label << '\t'; - if ((*i)->child[0] == 0) - os << **i << '\t' << (*i)->followpos << endl; - else { - if ((*i)->child[1] == 0) - os << (*i)->child[0]->label << **i; - else - os << (*i)->child[0]->label << **i - << (*i)->child[1]->label; - os << '\t' << (*i)->firstpos - << (*i)->lastpos << endl; - } - } - os << endl; -} - -/* Comparison operator for sets of . 
- * Compare set hashes, and if the sets have the same hash - * do compare pointer comparison on set of , the pointer comparison - * allows us to determine which Sets of we have seen already from - * new ones when constructing the DFA. - */ -struct deref_less_than { - bool operator()(pair const & lhs, pair const & rhs) const - { - if (lhs.first == rhs.first) - return *(lhs.second) < *(rhs.second); - else - return lhs.first < rhs.first; - } -}; - -unsigned long hash_NodeSet(const NodeSet *ns) -{ - unsigned long hash = 5381; - - for (NodeSet::iterator i = ns->begin(); i != ns->end(); i++) { - hash = ((hash << 5) + hash) + (unsigned long) *i; - } - - return hash; -} class State; /** @@ -2536,7 +1210,7 @@ SecondIterator second_iterator(Iter iter) */ #include "flex-tables.h" -#include "regexp.h" +#define YYTH_REGEX_MAGIC 0x1B5E783D static inline size_t pad64(size_t i) { @@ -2638,7 +1312,7 @@ void TransitionTable::flex_table(ostream& os, const char *name) /* Write the actual flex parser table. 
*/ size_t hsize = pad64(sizeof(th) + sizeof(th_version) + strlen(name) + 1); - th.th_magic = htonl(YYTH_REGEXP_MAGIC); + th.th_magic = htonl(YYTH_REGEX_MAGIC); th.th_hsize = htonl(hsize); th.th_ssize = htonl(hsize + flex_table_size(accept.begin(), accept.end()) + @@ -2871,7 +1545,7 @@ extern "C" int aare_add_rule_vec(aare_ruleset_t *rules, int deny, assert(perms != 0); - if (regexp_parse(&tree, rulev[0])) + if (regex_parse(&tree, rulev[0])) return 0; for (int i = 1; i < count; i++) { Node *subtree = NULL; @@ -2879,7 +1553,7 @@ extern "C" int aare_add_rule_vec(aare_ruleset_t *rules, int deny, if (!node) return 0; tree = new CatNode(tree, node); - if (regexp_parse(&subtree, rulev[i])) + if (regex_parse(&subtree, rulev[i])) return 0; tree = new CatNode(tree, subtree); } diff --git a/parser/libapparmor_re/parse.h b/parser/libapparmor_re/parse.h new file mode 100644 index 000000000..42ad8435b --- /dev/null +++ b/parser/libapparmor_re/parse.h @@ -0,0 +1,27 @@ +/* + * (C) 2006, 2007 Andreas Gruenbacher + * Copyright (c) 2003-2008 Novell, Inc. (All rights reserved) + * Copyright 2009-2010 Canonical Ltd. + * + * The libapparmor library is licensed under the terms of the GNU + * Lesser General Public License, version 2.1. Please see the file + * COPYING.LGPL. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see . 
+ *
+ *
+ * Parsing of regular expression into expression trees as implemented in
+ * expr-tree
+ */
+#ifndef __LIBAA_RE_PARSE_H
+#define __LIBAA_RE_PARSE_H
+
+int regex_parse(Node **tree, const char *rule);	/* parse rule into *tree; the caller in this patch treats a non-zero return as failure */
+
+#endif /* __LIBAA_RE_PARSE_H */
diff --git a/parser/libapparmor_re/parse.y b/parser/libapparmor_re/parse.y
new file mode 100644
index 000000000..3f9ef30f2
--- /dev/null
+++ b/parser/libapparmor_re/parse.y
@@ -0,0 +1,266 @@
+/*
+ * (C) 2006, 2007 Andreas Gruenbacher
+ * Copyright (c) 2003-2008 Novell, Inc. (All rights reserved)
+ * Copyright 2009-2010 Canonical Ltd.
+ *
+ * The libapparmor library is licensed under the terms of the GNU
+ * Lesser General Public License, version 2.1. Please see the file
+ * COPYING.LGPL.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * Parsing of regular expression into expression trees as implemented in
+ * expr-tree
+ */
+
+%{
+/* #define DEBUG_TREE */
+  #include "expr-tree.h"
+
+%}
+
+%union {
+  char c;		/* a single character: CHAR tokens and the *_char rules */
+  Node *node;		/* an expression (sub)tree */
+  Chars *cset;		/* the characters collected for a [...] set */
+}
+
+%{
+  void regex_error(Node **, const char *, const char *);
+# define YYLEX_PARAM &text
+  int regex_lex(YYSTYPE *, const char **);
+
+  static inline Chars*		/* Add the character a to cset and hand cset back. */
+  insert_char(Chars* cset, uchar a)
+  {
+    cset->insert(a);
+    return cset;
+  }
+
+  static inline Chars*		/* Add every character in [a, b] (bounds in either order) to cset. */
+  insert_char_range(Chars* cset, uchar a, uchar b)
+  {
+    if (a > b)
+      swap(a, b);
+    for (unsigned int i = a; i <= b; i++)	/* wider counter: a uchar would wrap and never exceed a maximal b */
+      cset->insert(static_cast<uchar>(i));
+    return cset;
+  }
+%}
+
+%pure-parser
+/* %error-verbose */
+%parse-param {Node **root}
+%parse-param {const char *text}
+%name-prefix = "regex_"
+
+%token <c> CHAR
+%type <c> regex_char cset_char1 cset_char cset_charN
+%type <cset> charset cset_chars
+%type <node> regex expr terms0 terms qterm term
+
+/**
+ * Note: destroy all nodes upon failure, but *not* the start symbol once
+ * parsing succeeds!
+ */
+%destructor { $$->release(); } expr terms0 terms qterm term
+
+%%
+
+/* FIXME: Does not parse "[--]", "[---]", "[^^-x]". I don't actually know
+          which precise grammar Perl regexs use, and rediscovering that
+          is proving to be painful. */
+
+regex       : /* empty */       { *root = $$ = &epsnode; }
+            | expr              { *root = $$ = $1; }
+            ;
+
+expr        : terms
+            | expr '|' terms0   { $$ = new AltNode($1, $3); }
+            | '|' terms0        { $$ = new AltNode(&epsnode, $2); }
+            ;
+
+terms0      : /* empty */       { $$ = &epsnode; }
+            | terms
+            ;
+
+terms       : qterm
+            | terms qterm       { $$ = new CatNode($1, $2); }
+            ;
+
+qterm       : term
+            | term '*'          { $$ = new StarNode($1); }
+            | term '+'          { $$ = new PlusNode($1); }
+            ;
+
+term        : '.'
{ $$ = new AnyCharNode; }
+            | regex_char        { $$ = new CharNode($1); }
+            | '[' charset ']'   { $$ = new CharSetNode(*$2);
+                                  delete $2; }
+            | '[' '^' charset ']'
+                                { $$ = new NotCharSetNode(*$3);
+                                  delete $3; }
+            | '[' '^' '^' cset_chars ']'
+                                { $4->insert('^');
+                                  $$ = new NotCharSetNode(*$4);
+                                  delete $4; }
+            | '(' regex ')'     { $$ = $2; }
+            ;
+
+regex_char  : CHAR
+            | '^'               { $$ = '^'; }
+            | '-'               { $$ = '-'; }
+            | ']'               { $$ = ']'; }
+            ;
+
+charset     : cset_char1 cset_chars
+                                { $$ = insert_char($2, $1); }
+            | cset_char1 '-' cset_charN cset_chars
+                                { $$ = insert_char_range($4, $1, $3); }
+            ;
+
+cset_chars  : /* nothing */     { $$ = new Chars; }
+            | cset_chars cset_charN
+                                { $$ = insert_char($1, $2); }
+            | cset_chars cset_charN '-' cset_charN
+                                { $$ = insert_char_range($1, $2, $4); }
+            ;
+
+cset_char1  : cset_char
+            | ']'               { $$ = ']'; }
+            | '-'               { $$ = '-'; }
+            ;
+
+cset_charN  : cset_char
+            | '^'               { $$ = '^'; }
+            ;
+
+cset_char   : CHAR
+            | '['               { $$ = '['; }
+            | '*'               { $$ = '*'; }
+            | '+'               { $$ = '+'; }
+            | '.'               { $$ = '.'; }
+            | '|'               { $$ = '|'; }
+            | '('               { $$ = '('; }
+            | ')'               { $$ = ')'; }
+            ;
+
+%%
+
+
+int	/* Value of the octal digit c, or -1 if c is not an octal digit. */
+octdigit(char c)
+{
+	if (c >= '0' && c <= '7')
+		return c - '0';
+	return -1;
+}
+
+int	/* Value of the hex digit c (either case), or -1 if c is not a hex digit. */
+hexdigit(char c)
+{
+	if (c >= '0' && c <= '9')
+		return c - '0';
+	else if (c >= 'A' && c <= 'F')
+		return 10 + c - 'A';
+	else if (c >= 'a' && c <= 'f')
+		return 10 + c - 'a';	/* lowercase digits are offset from 'a', not 'A' */
+	else
+		return -1;
+}
+
+int	/* Lexer for the generated parser: consumes one token at *pos and leaves its character in val->c. */
+regex_lex(YYSTYPE *val, const char **pos)	/* Returns 0 at end of input, the metacharacter itself, or CHAR. */
+{
+	int c;
+
+	val->c = **pos;
+	switch(*(*pos)++) {
+	case '\0':
+		(*pos)--;	/* stay on the terminator so a repeated call sees it again */
+		return 0;
+
+	case '*': case '+': case '.': case '|': case '^': case '-':
+	case '[': case ']': case '(' : case ')':
+		return *(*pos - 1);	/* metacharacters are their own token code */
+
+	case '\\':
+		val->c = **pos;	/* default: an unrecognised escape stands for the escaped character */
+		switch(*(*pos)++) {
+		case '\0':
+			(*pos)--;	/* trailing backslash: do not step past the terminator */
+			/* fall through */
+		case '\\':
+			val->c = '\\';
+			break;
+
+		case '0':	/* \0 followed by up to three octal digits */
+			val->c = 0;
+			if ((c = octdigit(**pos)) >= 0) {
+				val->c = c;
+				(*pos)++;
+			}
+			if ((c = octdigit(**pos)) >= 0) {
+				val->c = (val->c << 3) + c;
+				(*pos)++;
+			}
+			if ((c = octdigit(**pos)) >= 0) {
+				val->c = (val->c << 3) + c;
+				(*pos)++;
+			}
+			break;
+
+		case 'x':	/* \x followed by up to two hex digits */
+			val->c = 0;
+			if ((c = hexdigit(**pos)) >= 0) {
+				val->c = c;
+				(*pos)++;
+			}
+			if ((c = hexdigit(**pos)) >= 0) {
+				val->c = (val->c << 4) + c;
+				(*pos)++;
+			}
+			break;
+
+		case 'a':
+			val->c = '\a';
+			break;
+
+		case 'e':
+			val->c = 033 /* ESC */;
+			break;
+
+		case 'f':
+			val->c = '\f';
+			break;
+
+		case 'n':
+			val->c = '\n';
+			break;
+
+		case 'r':
+			val->c = '\r';
+			break;
+
+		case 't':
+			val->c = '\t';
+			break;
+		}
+	}
+	return CHAR;
+}
+
+void	/* Error hook required by the generated parser; intentionally does nothing. */
+regex_error(Node ** __attribute__((unused)),
+	    const char *text __attribute__((unused)),
+	    const char *error __attribute__((unused)))
+{
+	/* We don't want the library to print error messages. */
+}
diff --git a/parser/libapparmor_re/regexp.h b/parser/libapparmor_re/regexp.h
deleted file mode 100644
index 728efbe92..000000000
--- a/parser/libapparmor_re/regexp.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef __REGEXP_H
-#define __REGEXP_H
-
-/**
- * Flex file format, but without state compression and with negative
- * match results in the YYTD_ID_DEF table instead.
- */
-#define YYTH_REGEXP_MAGIC 0x1B5E783D
-
-#endif /* __REGEXP_H */