/*
 * regexp.y -- Regular Expression Matcher Generator
 * (C) 2006, 2007 Andreas Gruenbacher <agruen@suse.de>
 *
 * Implementation based on the Lexical Analysis chapter of:
 *   Alfred V. Aho, Ravi Sethi, Jeffrey D. Ullman:
 *   Compilers: Principles, Techniques, and Tools (The "Dragon Book"),
 *   Addison-Wesley, 1986.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * See http://www.gnu.org for more details.
 */

%{
    /* #define DEBUG_TREE */

    /* NOTE(review): the extraction stripped the original header names from
     * these #include lines; the set below covers every standard-library
     * name used in this file (list, vector, stack, set, map, streams). */
    #include <list>
    #include <vector>
    #include <stack>
    #include <set>
    #include <map>
    #include <ostream>
    #include <iostream>
    #include <fstream>

    using namespace std;

    typedef unsigned char uchar;
    typedef set<uchar> Chars;

    ostream& operator<<(ostream& os, uchar c);

    /* Compute the union of two sets. */
    template<class T>
    set<T> operator+(const set<T>& a, const set<T>& b)
    {
	set<T> c(a);
	c.insert(b.begin(), b.end());
	return c;
    }

    /**
     * When creating DFAs from regex trees, a DFA state is constructed from
     * a set of important nodes in the syntax tree. This includes AcceptNodes,
     * which indicate that when a match ends in a particular state, the
     * regular expressions that the AcceptNode belongs to match.
     */
    class ImportantNode;
    typedef set<ImportantNode *> NodeSet;

    /**
     * Out-edges from a state to another: we store the follow-set of Nodes
     * for each input character that is not a default match in
     * cases (i.e., following a CharNode or CharSetNode), and default
     * matches in otherwise as well as in all matching explicit cases
     * (i.e., following an AnyCharNode or NotCharSetNode). This avoids
     * enumerating all the explicit transitions for default matches.
     */
    typedef struct NodeCases {
	typedef map<uchar, NodeSet *>::iterator iterator;
	iterator begin() { return cases.begin(); }
	iterator end() { return cases.end(); }

	NodeCases() : otherwise(0) { }
	map<uchar, NodeSet *> cases;
	NodeSet *otherwise;
    } NodeCases;

    /* An abstract node in the syntax tree.
*/ class Node { public: Node() : nullable(false) { child[0] = child[1] = 0; } Node(Node *left) : nullable(false) { child[0] = left; child[1] = 0; } Node(Node *left, Node *right) : nullable(false) { child[0] = left; child[1] = right; } virtual ~Node() { if (child[0]) child[0]->release(); if (child[1]) child[1]->release(); } /** * See the "Dragon Book" for an explanation of nullable, firstpos, * lastpos, and followpos. */ virtual void compute_nullable() { } virtual void compute_firstpos() = 0; virtual void compute_lastpos() = 0; virtual void compute_followpos() { } virtual int eq(Node *other) = 0; virtual ostream& dump(ostream& os) = 0; bool nullable; NodeSet firstpos, lastpos, followpos; /* child 0 is left, child 1 is right */ Node *child[2]; unsigned int label; /* unique number for debug etc */ /** * We indirectly release Nodes through a virtual function because * accept and Eps Nodes are shared, and must be treated specially. * We could use full reference counting here but the indirect release * is sufficient and has less overhead */ virtual void release(void) { delete this; } }; class InnerNode : public Node { public: InnerNode() : Node() { }; InnerNode(Node *left) : Node(left) {}; InnerNode(Node *left, Node *right) : Node(left, right) { }; }; class OneChildNode : public InnerNode { public: OneChildNode(Node *left) : InnerNode(left) { }; }; class TwoChildNode : public InnerNode { public: TwoChildNode(Node *left, Node *right) : InnerNode(left, right) { }; }; class LeafNode : public Node { public: LeafNode() : Node() { }; }; /* Match nothing (//). */ class EpsNode : public LeafNode { public: EpsNode() : LeafNode() { nullable = true; label = 0; } void release(void) { /* don't delete Eps nodes because there is a single static instance * shared by all trees. 
Look for epsnode in the code */ } void compute_firstpos() { } void compute_lastpos() { } int eq(Node *other) { if (dynamic_cast(other)) return 1; return 0; } ostream& dump(ostream& os) { return os << "[]"; } }; /** * Leaf nodes in the syntax tree are important to us: they describe the * characters that the regular expression matches. We also consider * AcceptNodes import: they indicate when a regular expression matches. */ class ImportantNode : public LeafNode { public: ImportantNode() : LeafNode() { } void compute_firstpos() { firstpos.insert(this); } void compute_lastpos() { lastpos.insert(this); } virtual void follow(NodeCases& cases) = 0; }; /* common base class for all the different classes that contain * character information. */ class CNode : public ImportantNode { public: CNode() : ImportantNode() { } }; /* Match one specific character (/c/). */ class CharNode : public CNode { public: CharNode(uchar c) : c(c) { } void follow(NodeCases& cases) { NodeSet **x = &cases.cases[c]; if (!*x) { if (cases.otherwise) *x = new NodeSet(*cases.otherwise); else *x = new NodeSet; } (*x)->insert(followpos.begin(), followpos.end()); } int eq(Node *other) { CharNode *o = dynamic_cast(other); if (o) { return c == o->c; } return 0; } ostream& dump(ostream& os) { return os << c; } uchar c; }; /* Match a set of characters (/[abc]/). 
*/ class CharSetNode : public CNode { public: CharSetNode(Chars& chars) : chars(chars) { } void follow(NodeCases& cases) { for (Chars::iterator i = chars.begin(); i != chars.end(); i++) { NodeSet **x = &cases.cases[*i]; if (!*x) { if (cases.otherwise) *x = new NodeSet(*cases.otherwise); else *x = new NodeSet; } (*x)->insert(followpos.begin(), followpos.end()); } } int eq(Node *other) { CharSetNode *o = dynamic_cast(other); if (!o || chars.size() != o->chars.size()) return 0; for (Chars::iterator i = chars.begin(), j = o->chars.begin(); i != chars.end() && j != o->chars.end(); i++, j++) { if (*i != *j) return 0; } return 1; } ostream& dump(ostream& os) { os << '['; for (Chars::iterator i = chars.begin(); i != chars.end(); i++) os << *i; return os << ']'; } Chars chars; }; /* Match all except one character (/[^abc]/). */ class NotCharSetNode : public CNode { public: NotCharSetNode(Chars& chars) : chars(chars) { } void follow(NodeCases& cases) { if (!cases.otherwise) cases.otherwise = new NodeSet; for (Chars::iterator j = chars.begin(); j != chars.end(); j++) { NodeSet **x = &cases.cases[*j]; if (!*x) *x = new NodeSet(*cases.otherwise); } /** * Note: Add to the nonmatching characters after copying away the * old otherwise state for the matching characters. */ cases.otherwise->insert(followpos.begin(), followpos.end()); for (NodeCases::iterator i = cases.begin(); i != cases.end(); i++) { if (chars.find(i->first) == chars.end()) i->second->insert(followpos.begin(), followpos.end()); } } int eq(Node *other) { NotCharSetNode *o = dynamic_cast(other); if (!o || chars.size() != o->chars.size()) return 0; for (Chars::iterator i = chars.begin(), j = o->chars.begin(); i != chars.end() && j != o->chars.end(); i++, j++) { if (*i != *j) return 0; } return 1; } ostream& dump(ostream& os) { os << "[^"; for (Chars::iterator i = chars.begin(); i != chars.end(); i++) os << *i; return os << ']'; } Chars chars; }; /* Match any character (/./). 
*/ class AnyCharNode : public CNode { public: AnyCharNode() { } void follow(NodeCases& cases) { if (!cases.otherwise) cases.otherwise = new NodeSet; cases.otherwise->insert(followpos.begin(), followpos.end()); for (NodeCases::iterator i = cases.begin(); i != cases.end(); i++) i->second->insert(followpos.begin(), followpos.end()); } int eq(Node *other) { if (dynamic_cast(other)) return 1; return 0; } ostream& dump(ostream& os) { return os << "."; } }; /** * Indicate that a regular expression matches. An AcceptNode itself * doesn't match anything, so it will never generate any transitions. */ class AcceptNode : public ImportantNode { public: AcceptNode() {} void release(void) { /* don't delete AcceptNode via release as they are shared, * and will be deleted when the table the are stored in is deleted */ } void follow(NodeCases& cases) { /* Nothing to follow. */ } /* requires accept nodes to be common by pointer */ int eq(Node *other) { if (dynamic_cast(other)) return (this == other); return 0; } }; /* Match a node zero or more times. (This is a unary operator.) */ class StarNode : public OneChildNode { public: StarNode(Node *left) : OneChildNode(left) { nullable = true; } void compute_firstpos() { firstpos = child[0]->firstpos; } void compute_lastpos() { lastpos = child[0]->lastpos; } void compute_followpos() { NodeSet from = child[0]->lastpos, to = child[0]->firstpos; for(NodeSet::iterator i = from.begin(); i != from.end(); i++) { (*i)->followpos.insert(to.begin(), to.end()); } } int eq(Node *other) { if (dynamic_cast(other)) return child[0]->eq(other->child[0]); return 0; } ostream& dump(ostream& os) { os << '('; child[0]->dump(os); return os << ")*"; } }; /* Match a node one or more times. (This is a unary operator.) 
*/ class PlusNode : public OneChildNode { public: PlusNode(Node *left) : OneChildNode(left) { } void compute_nullable() { nullable = child[0]->nullable; } void compute_firstpos() { firstpos = child[0]->firstpos; } void compute_lastpos() { lastpos = child[0]->lastpos; } void compute_followpos() { NodeSet from = child[0]->lastpos, to = child[0]->firstpos; for(NodeSet::iterator i = from.begin(); i != from.end(); i++) { (*i)->followpos.insert(to.begin(), to.end()); } } int eq(Node *other) { if (dynamic_cast(other)) return child[0]->eq(other->child[0]); return 0; } ostream& dump(ostream& os) { os << '('; child[0]->dump(os); return os << ")+"; } }; /* Match a pair of consecutive nodes. */ class CatNode : public TwoChildNode { public: CatNode(Node *left, Node *right) : TwoChildNode(left, right) { } void compute_nullable() { nullable = child[0]->nullable && child[1]->nullable; } void compute_firstpos() { if (child[0]->nullable) firstpos = child[0]->firstpos + child[1]->firstpos; else firstpos = child[0]->firstpos; } void compute_lastpos() { if (child[1]->nullable) lastpos = child[0]->lastpos + child[1]->lastpos; else lastpos = child[1]->lastpos; } void compute_followpos() { NodeSet from = child[0]->lastpos, to = child[1]->firstpos; for(NodeSet::iterator i = from.begin(); i != from.end(); i++) { (*i)->followpos.insert(to.begin(), to.end()); } } int eq(Node *other) { if (dynamic_cast(other)) { if (!child[0]->eq(other->child[0])) return 0; return child[1]->eq(other->child[1]); } return 0; } ostream& dump(ostream& os) { child[0]->dump(os); child[1]->dump(os); return os; //return os << ' '; } }; /* Match one of two alternative nodes. 
*/ class AltNode : public TwoChildNode { public: AltNode(Node *left, Node *right) : TwoChildNode(left, right) { } void compute_nullable() { nullable = child[0]->nullable || child[1]->nullable; } void compute_lastpos() { lastpos = child[0]->lastpos + child[1]->lastpos; } void compute_firstpos() { firstpos = child[0]->firstpos + child[1]->firstpos; } int eq(Node *other) { if (dynamic_cast(other)) { if (!child[0]->eq(other->child[0])) return 0; return child[1]->eq(other->child[1]); } return 0; } ostream& dump(ostream& os) { os << '('; child[0]->dump(os); os << '|'; child[1]->dump(os); os << ')'; return os; } }; /* Use a single static EpsNode as it carries no node specific information */ static EpsNode epsnode; /* * Normalize the regex parse tree for factoring and cancelations. Normalization * reorganizes internal (alt and cat) nodes into a fixed "normalized" form that * simplifies factoring code, in that it produces a canonicalized form for * the direction being normalized so that the factoring code does not have * to consider as many cases. * * left normalization (dir == 0) uses these rules * (E | a) -> (a | E) * (a | b) | c -> a | (b | c) * (ab)c -> a(bc) * * right normalization (dir == 1) uses the same rules but reversed * (a | E) -> (E | a) * a | (b | c) -> (a | b) | c * a(bc) -> (ab)c * * Note: This is written iteratively for a given node (the top node stays * fixed and the children are rotated) instead of recursively. * For a given node under examination rotate over nodes from * dir to !dir. Until no dir direction node meets the criterial. * Then recurse to the children (which will have a different node type) * to make sure they are normalized. * Normalization of a child node is guarenteed to not affect the * normalization of the parent. * * For cat nodes the depth first traverse order is guarenteed to be * maintained. This is not necessary for altnodes. * * Eg. 
For left normalization * * |1 |1 * / \ / \ * |2 T -> a |2 * / \ / \ * |3 c b |3 * / \ / \ * a b c T * */ static void rotate_node(Node *t, int dir) { // (a | b) | c -> a | (b | c) // (ab)c -> a(bc) Node *left = t->child[dir]; t->child[dir] = left->child[dir]; left->child[dir] = left->child[!dir]; left->child[!dir] = t->child[!dir]; t->child[!dir] = left; } void normalize_tree(Node *t, int dir) { if (dynamic_cast(t)) return; for (;;) { if ((&epsnode == t->child[dir]) && (&epsnode != t->child[!dir]) && dynamic_cast(t)) { // (E | a) -> (a | E) // Ea -> aE Node *c = t->child[dir]; t->child[dir] = t->child[!dir]; t->child[!dir] = c; // Don't break here as 'a' may be a tree that // can be pulled up. } else if ((dynamic_cast(t) && dynamic_cast(t->child[dir])) || (dynamic_cast(t) && dynamic_cast(t->child[dir]))) { // (a | b) | c -> a | (b | c) // (ab)c -> a(bc) rotate_node(t, dir); } else if (dynamic_cast(t) && dynamic_cast(t->child[dir]) && dynamic_cast(t->child[!dir])) { // [a] | b -> b | [a] Node *c = t->child[dir]; t->child[dir] = t->child[!dir]; t->child[!dir] = c; } else { break; } } if (t->child[dir]) normalize_tree(t->child[dir], dir); if (t->child[!dir]) normalize_tree(t->child[!dir], dir); } //charset conversion is disabled for now, //it hinders tree optimization in some cases, so it need to be either //done post optimization, or have extra factoring rules added #if 0 static Node *merge_charset(Node *a, Node *b) { if (dynamic_cast(a) && dynamic_cast(b)) { Chars chars; chars.insert(dynamic_cast(a)->c); chars.insert(dynamic_cast(b)->c); CharSetNode *n = new CharSetNode(chars); return n; } else if (dynamic_cast(a) && dynamic_cast(b)) { Chars *chars = &dynamic_cast(b)->chars; chars->insert(dynamic_cast(a)->c); return b; } else if (dynamic_cast(a) && dynamic_cast(b)) { Chars *from = &dynamic_cast(a)->chars; Chars *to = &dynamic_cast(b)->chars; for (Chars::iterator i = from->begin(); i != from->end(); i++) to->insert(*i); return b; } //return ???; } static Node 
*alt_to_charsets(Node *t, int dir) { /* Node *first = NULL; Node *p = t; Node *i = t; for (;dynamic_cast(i);) { if (dynamic_cast(i->child[dir]) || dynamic_cast(i->child[dir])) { if (!first) { first = i; p = i; i = i->child[!dir]; } else { first->child[dir] = merge_charset(first->child[dir], i->child[dir]); p->child[!dir] = i->child[!dir]; Node *tmp = i; i = tmp->child[!dir]; tmp->child[!dir] = NULL; tmp->release(); } } else { p = i; i = i->child[!dir]; } } // last altnode of chain check other dir as well if (first && (dynamic_cast(i) || dynamic_cast(i))) { } */ /* if (dynamic_cast(t->child[dir]) || dynamic_cast(t->child[dir])) char_test = true; (char_test && (dynamic_cast(i->child[dir]) || dynamic_cast(i->child[dir])))) { */ return t; } #endif static Node *basic_alt_factor(Node *t, int dir) { if (!dynamic_cast(t)) return t; if (t->child[dir]->eq(t->child[!dir])) { // (a | a) -> a Node *tmp = t->child[dir]; t->child[dir] = NULL; t->release(); return tmp; } // (ab) | (ac) -> a(b|c) if (dynamic_cast(t->child[dir]) && dynamic_cast(t->child[!dir]) && t->child[dir]->child[dir]->eq(t->child[!dir]->child[dir])) { // (ab) | (ac) -> a(b|c) Node *left = t->child[dir]; Node *right = t->child[!dir]; t->child[dir] = left->child[!dir]; t->child[!dir] = right->child[!dir]; right->child[!dir] = NULL; right->release(); left->child[!dir] = t; return left; } // a | (ab) -> a (E | b) -> a (b | E) if (dynamic_cast(t->child[!dir]) && t->child[dir]->eq(t->child[!dir]->child[dir])) { Node *c = t->child[!dir]; t->child[dir]->release(); t->child[dir] = c->child[!dir]; t->child[!dir] = &epsnode; c->child[!dir] = t; return c; } // ab | (a) -> a (b | E) if (dynamic_cast(t->child[dir]) && t->child[dir]->child[dir]->eq(t->child[!dir])) { Node *c = t->child[dir]; t->child[!dir]->release(); t->child[dir] = c->child[!dir]; t->child[!dir] = &epsnode; c->child[!dir] = t; return c; } return t; } static Node *basic_simplify(Node *t, int dir) { if (dynamic_cast(t) && &epsnode == t->child[!dir]) { // aE 
-> a Node *tmp = t->child[dir]; t->child[dir] = NULL; t->release(); return tmp; } return basic_alt_factor(t, dir); } /* * assumes a normalized tree. reductions shown for left normalization * aE -> a * (a | a) -> a ** factoring patterns * a | (a | b) -> (a | b) * a | (ab) -> a (E | b) -> a (b | E) * (ab) | (ac) -> a(b|c) * * returns t - if no simplifications were made * a new root node - if simplifications were made */ Node *simplify_tree_base(Node *t, int dir, bool &mod) { if (dynamic_cast(t)) return t; for (int i=0; i < 2; i++) { if (t->child[i]) { Node *c = simplify_tree_base(t->child[i], dir, mod); if (c != t->child[i]) { t->child[i] = c; mod = true; } } } // only iterate on loop if modification made for (;; mod = true) { Node *tmp = basic_simplify(t, dir); if (tmp != t) { t = tmp; continue; } /* all tests after this must meet 2 alt node condition */ if (!dynamic_cast(t) || !dynamic_cast(t->child[!dir])) break; // a | (a | b) -> (a | b) // a | (b | (c | a)) -> (b | (c | a)) Node *p = t; Node *i = t->child[!dir]; for (;dynamic_cast(i); p = i, i = i->child[!dir]) { if (t->child[dir]->eq(i->child[dir])) { Node *tmp = t->child[!dir]; t->child[!dir] = NULL; t->release(); t = tmp; continue; } } // last altnode of chain check other dir as well if (t->child[dir]->eq(p->child[!dir])) { Node *tmp = t->child[!dir]; t->child[!dir] = NULL; t->release(); t = tmp; continue; } //exact match didn't work, try factoring front //a | (ac | (ad | () -> (a (E | c)) | (...) //ab | (ac | (...)) -> (a (b | c)) | (...) //ab | (a | (...)) -> (a (b | E)) | (...) 
Node *pp; int count = 0; Node *subject = t->child[dir]; Node *a = subject; if (dynamic_cast(subject)) a = subject->child[dir]; for (pp = p = t, i = t->child[!dir]; dynamic_cast(i); ) { if ((dynamic_cast(i->child[dir]) && a->eq(i->child[dir]->child[dir])) || (a->eq(i->child[dir]))) { // extract matching alt node p->child[!dir] = i->child[!dir]; i->child[!dir] = subject; subject = basic_simplify(i, dir); if (dynamic_cast(subject)) a = subject->child[dir]; else a = subject; i = p->child[!dir]; count++; } else { pp = p; p = i; i = i->child[!dir]; } } // last altnode in chain check other dir as well if ((dynamic_cast(i) && a->eq(i->child[dir])) || (a->eq(i))) { count++; if (t == p) { t->child[dir] = subject; t = basic_simplify(t, dir); } else { t->child[dir] = p->child[dir]; p->child[dir] = subject; pp->child[!dir] = basic_simplify(p, dir); } } else { t->child[dir] = i; p->child[!dir] = subject; } if (count == 0) break; } return t; } int debug_tree(Node *t) { int nodes = 1; if (!dynamic_cast(t)) { if (t->child[0]) nodes += debug_tree(t->child[0]); if (t->child[1]) nodes += debug_tree(t->child[1]); } return nodes; } struct node_counts { int charnode; int charset; int notcharset; int alt; int plus; int star; int any; int cat; }; static void count_tree_nodes(Node *t, struct node_counts *counts) { if (dynamic_cast(t)) { counts->alt++; count_tree_nodes(t->child[0], counts); count_tree_nodes(t->child[1], counts); } else if (dynamic_cast(t)) { counts->cat++; count_tree_nodes(t->child[0], counts); count_tree_nodes(t->child[1], counts); } else if (dynamic_cast(t)) { counts->plus++; count_tree_nodes(t->child[0], counts); } else if (dynamic_cast(t)) { counts->star++; count_tree_nodes(t->child[0], counts); } else if (dynamic_cast(t)) { counts->charnode++; } else if (dynamic_cast(t)) { counts->any++; } else if (dynamic_cast(t)) { counts->charset++; } else if (dynamic_cast(t)) { counts->notcharset++; } } #include "stdio.h" #include "stdint.h" #include "apparmor_re.h" Node 
*simplify_tree(Node *t, dfaflags_t flags) { bool update; if (flags & DFA_DUMP_TREE_STATS) { struct node_counts counts = { }; count_tree_nodes(t, &counts); fprintf(stderr, "expr tree: c %d, [] %d, [^] %d, | %d, + %d, * %d, . %d, cat %d\n", counts.charnode, counts.charset, counts.notcharset, counts.alt, counts.plus, counts.star, counts.any, counts.cat); } do { update = false; //default to right normalize first as this reduces the number //of trailing nodes which might follow an internal * //or **, which is where state explosion can happen //eg. in one test this makes the difference between // the dfa having about 7 thousands states, // and it having about 1.25 million states int dir = 1; if (flags & DFA_CONTROL_TREE_LEFT) dir = 0; for (int count = 0; count < 2; count++) { bool modified; do { modified = false; if (flags & DFA_CONTROL_TREE_NORMAL) normalize_tree(t, dir); t = simplify_tree_base(t, dir, modified); if (modified) update = true; } while (modified); if (flags & DFA_CONTROL_TREE_LEFT) dir++; else dir--; } } while(update); if (flags & DFA_DUMP_TREE_STATS) { struct node_counts counts = { }; count_tree_nodes(t, &counts); fprintf(stderr, "simplified expr tree: c %d, [] %d, [^] %d, | %d, + %d, * %d, . 
%d, cat %d\n", counts.charnode, counts.charset, counts.notcharset, counts.alt, counts.plus, counts.star, counts.any, counts.cat); } return t; } %} %union { char c; Node *node; Chars *cset; } %{ void regexp_error(Node **, const char *, const char *); # define YYLEX_PARAM &text int regexp_lex(YYSTYPE *, const char **); static inline Chars* insert_char(Chars* cset, uchar a) { cset->insert(a); return cset; } static inline Chars* insert_char_range(Chars* cset, uchar a, uchar b) { if (a > b) swap(a, b); for (uchar i = a; i <= b; i++) cset->insert(i); return cset; } %} %pure-parser /* %error-verbose */ %parse-param {Node **root} %parse-param {const char *text} %name-prefix = "regexp_" %token CHAR %type regex_char cset_char1 cset_char cset_charN %type charset cset_chars %type regexp expr terms0 terms qterm term /** * Note: destroy all nodes upon failure, but *not* the start symbol once * parsing succeeds! */ %destructor { $$->release(); } expr terms0 terms qterm term %% /* FIXME: Does not parse "[--]", "[---]", "[^^-x]". I don't actually know which precise grammer Perl regexps use, and rediscovering that is proving to be painful. */ regexp : /* empty */ { *root = $$ = &epsnode; } | expr { *root = $$ = $1; } ; expr : terms | expr '|' terms0 { $$ = new AltNode($1, $3); } | '|' terms0 { $$ = new AltNode(&epsnode, $2); } ; terms0 : /* empty */ { $$ = &epsnode; } | terms ; terms : qterm | terms qterm { $$ = new CatNode($1, $2); } ; qterm : term | term '*' { $$ = new StarNode($1); } | term '+' { $$ = new PlusNode($1); } ; term : '.' 
{ $$ = new AnyCharNode; } | regex_char { $$ = new CharNode($1); } | '[' charset ']' { $$ = new CharSetNode(*$2); delete $2; } | '[' '^' charset ']' { $$ = new NotCharSetNode(*$3); delete $3; } | '[' '^' '^' cset_chars ']' { $4->insert('^'); $$ = new NotCharSetNode(*$4); delete $4; } | '(' regexp ')' { $$ = $2; } ; regex_char : CHAR | '^' { $$ = '^'; } | '-' { $$ = '-'; } | ']' { $$ = ']'; } ; charset : cset_char1 cset_chars { $$ = insert_char($2, $1); } | cset_char1 '-' cset_charN cset_chars { $$ = insert_char_range($4, $1, $3); } ; cset_chars : /* nothing */ { $$ = new Chars; } | cset_chars cset_charN { $$ = insert_char($1, $2); } | cset_chars cset_charN '-' cset_charN { $$ = insert_char_range($1, $2, $4); } ; cset_char1 : cset_char | ']' { $$ = ']'; } | '-' { $$ = '-'; } ; cset_charN : cset_char | '^' { $$ = '^'; } ; cset_char : CHAR | '[' { $$ = '['; } | '*' { $$ = '*'; } | '+' { $$ = '+'; } | '.' { $$ = '.'; } | '|' { $$ = '|'; } | '(' { $$ = '('; } | ')' { $$ = ')'; } ; %% #include #include #include #include #include #include #include "../immunix.h" /* Traverse the syntax tree depth-first in an iterator-like manner. */ class depth_first_traversal { stack pos; void push_left(Node *node) { pos.push(node); while (dynamic_cast(node)) { pos.push(node->child[0]); node = node->child[0]; } } public: depth_first_traversal(Node *node) { push_left(node); } Node *operator*() { return pos.top(); } Node* operator->() { return pos.top(); } operator bool() { return !pos.empty(); } void operator++(int) { Node *last = pos.top(); pos.pop(); if (!pos.empty()) { /* no need to dynamic cast, as we just popped a node so the top node * must be an inner node */ InnerNode *node = (InnerNode *)(pos.top()); if (node->child[1] && node->child[1] != last) { push_left(node->child[1]); } } } }; ostream& operator<<(ostream& os, Node& node) { node.dump(os); return os; } ostream& operator<<(ostream& os, uchar c) { const char *search = "\a\033\f\n\r\t|*+[](). ", *replace = "aefnrt|*+[](). 
", *s; if ((s = strchr(search, c)) && *s != '\0') os << '\\' << replace[s - search]; else if (c < 32 || c >= 127) os << '\\' << '0' << char('0' + (c >> 6)) << char('0' + ((c >> 3) & 7)) << char('0' + (c & 7)); else os << (char)c; return os; } int octdigit(char c) { if (c >= '0' && c <= '7') return c - '0'; return -1; } int hexdigit(char c) { if (c >= '0' && c <= '9') return c - '0'; else if (c >= 'A' && c <= 'F') return 10 + c - 'A'; else if (c >= 'a' && c <= 'f') return 10 + c - 'A'; else return -1; } int regexp_lex(YYSTYPE *val, const char **pos) { int c; val->c = **pos; switch(*(*pos)++) { case '\0': (*pos)--; return 0; case '*': case '+': case '.': case '|': case '^': case '-': case '[': case ']': case '(' : case ')': return *(*pos - 1); case '\\': val->c = **pos; switch(*(*pos)++) { case '\0': (*pos)--; /* fall through */ case '\\': val->c = '\\'; break; case '0': val->c = 0; if ((c = octdigit(**pos)) >= 0) { val->c = c; (*pos)++; } if ((c = octdigit(**pos)) >= 0) { val->c = (val->c << 3) + c; (*pos)++; } if ((c = octdigit(**pos)) >= 0) { val->c = (val->c << 3) + c; (*pos)++; } break; case 'x': val->c = 0; if ((c = hexdigit(**pos)) >= 0) { val->c = c; (*pos)++; } if ((c = hexdigit(**pos)) >= 0) { val->c = (val->c << 4) + c; (*pos)++; } break; case 'a': val->c = '\a'; break; case 'e': val->c = 033 /* ESC */; break; case 'f': val->c = '\f'; break; case 'n': val->c = '\n'; break; case 'r': val->c = '\r'; break; case 't': val->c = '\t'; break; } } return CHAR; } void regexp_error(Node **, const char *text, const char *error) { /* We don't want the library to print error messages. */ } /** * Assign a consecutive number to each node. This is only needed for * pretty-printing the debug output. * * The epsnode is labeled 0. Start labeling at 1 */ void label_nodes(Node *root) { int nodes = 1; for (depth_first_traversal i(root); i; i++) i->label = nodes++; } /** * Text-dump a state (for debugging). 
*/ ostream& operator<<(ostream& os, const NodeSet& state) { os << '{'; if (!state.empty()) { NodeSet::iterator i = state.begin(); for(;;) { os << (*i)->label; if (++i == state.end()) break; os << ','; } } os << '}'; return os; } /** * Text-dump the syntax tree (for debugging). */ void dump_syntax_tree(ostream& os, Node *node) { for (depth_first_traversal i(node); i; i++) { os << i->label << '\t'; if ((*i)->child[0] == 0) os << **i << '\t' << (*i)->followpos << endl; else { if ((*i)->child[1] == 0) os << (*i)->child[0]->label << **i; else os << (*i)->child[0]->label << **i << (*i)->child[1]->label; os << '\t' << (*i)->firstpos << (*i)->lastpos << endl; } } os << endl; } /* Comparison operator for sets of . * Compare set hashes, and if the sets have the same hash * do compare pointer comparison on set of , the pointer comparison * allows us to determine which Sets of we have seen already from * new ones when constructing the DFA. */ struct deref_less_than { bool operator()(pair const & lhs, pair const & rhs) const { if (lhs.first == rhs.first) return *(lhs.second) < *(rhs.second); else return lhs.first < rhs.first; } }; unsigned long hash_NodeSet(const NodeSet *ns) { unsigned long hash = 5381; for (NodeSet::iterator i = ns->begin(); i != ns->end(); i++) { hash = ((hash << 5) + hash) + (unsigned long) *i; } return hash; } class State; /** * State cases are identical to NodesCases except they map to State * * instead of NodeSet. * Out-edges from a state to another: we store the follow State * for each input character that is not a default match in cases and * default matches in otherwise as well as in all matching explicit cases * This avoids enumerating all the explicit tranitions for default matches. 
*/ typedef struct Cases { typedef map::iterator iterator; iterator begin() { return cases.begin(); } iterator end() { return cases.end(); } Cases() : otherwise(0) { } map cases; State *otherwise; } Cases; /* * State - DFA individual state information * audit: the audit permission mask for the state * accept: the accept permissions for the state * cases: set of transitions from this state */ class State { public: State() : label (0), audit(0), accept(0), cases() { } int label; uint32_t audit, accept; Cases cases; }; ostream& operator<<(ostream& os, const State& state) { /* currently just dump the state ptr */ os << '{'; os << state.label; os << '}'; return os; } typedef list Partition; typedef map, State *, deref_less_than > NodeMap; /* Transitions in the DFA. */ class DFA { public: DFA(Node *root, dfaflags_t flags); virtual ~DFA(); void remove_unreachable(dfaflags_t flags); bool same_mappings(map &partition_map, State *s1, State *s2); size_t hash_trans(State *s); void minimize(dfaflags_t flags); void dump(ostream& os); void dump_dot_graph(ostream& os); map equivalence_classes(dfaflags_t flags); void apply_equivalence_classes(map& eq); Node *root; State *nonmatching, *start; Partition states; }; uint32_t accept_perms(NodeSet *state, uint32_t *audit_ctl, int *error); /* macro to help out with DFA creation, not done as inlined fn as nearly * every line uses a different map or variable that would have to be passed */ #define update_for_nodes(NODES, TARGET) \ do { \ pair index = make_pair(hash_NodeSet(NODES), NODES); \ map, State *, deref_less_than>::iterator x = nodemap.find(index); \ if (x == nodemap.end()) { \ /* set of nodes isn't known so create new state, and nodes to \ * state mapping \ */ \ TARGET = new State(); \ (TARGET)->label = nodemap.size(); \ states.push_back(TARGET); \ nodemap.insert(make_pair(index, TARGET)); \ work_queue.push_back(NODES); \ proto_sum += (NODES)->size(); \ if ((NODES)->size() > proto_max) \ proto_max = (NODES)->size(); \ } else { \ /* 
set of nodes already has a mapping so free this one */ \ match_count++; \ delete (NODES); \ TARGET = x->second; \ } \ } while (0) /** * Construct a DFA from a syntax tree. */ /* Subset construction (Dragon Book sec. 3.9): a DFA state is a set of tree positions (NodeSet); followpos of a proto-state's nodes yields its out-transitions. */ DFA::DFA(Node *root, dfaflags_t flags) : root(root) { int i, match_count; i = match_count = 0; /* NOTE(review): the traversal loops below declare their own 'i', shadowing the outer progress counter 'i' -- presumably intentional, but confirm */ if (flags & DFA_DUMP_PROGRESS) fprintf(stderr, "Creating dfa:\r"); for (depth_first_traversal i(root); i; i++) { (*i)->compute_nullable(); (*i)->compute_firstpos(); (*i)->compute_lastpos(); } if (flags & DFA_DUMP_PROGRESS) fprintf(stderr, "Creating dfa: followpos\r"); for (depth_first_traversal i(root); i; i++) { (*i)->compute_followpos(); } /* nodemap deduplicates proto-states: identical NodeSets map to a single State */ NodeMap nodemap; nonmatching = new State; states.push_back(nonmatching); NodeSet *emptynode = new NodeSet; nodemap.insert(make_pair(make_pair(hash_NodeSet(emptynode), emptynode), nonmatching)); /* there is no nodemapping for the nonmatching state */ unsigned int proto_max = 0; unsigned int proto_sum = 0; start = new State; start->label = 1; states.push_back(start); NodeSet *first = new NodeSet(root->firstpos); nodemap.insert(make_pair(make_pair(hash_NodeSet(first), first), start)); /* the work_queue contains the proto-states (set of nodes that is * the precurser of a state) that need to be computed * * TODO: currently the work_queue is treated in a breadth first * search manner. Test using the work_queue in a depth first * manner, this may help reduce the number of entries on the * work_queue at any given time, thus reducing peak memory use. */ list work_queue; work_queue.push_back(first); while (!work_queue.empty()) { if (i % 1000 == 0 && (flags & DFA_DUMP_PROGRESS)) fprintf(stderr, "\033[2KCreating dfa: queue %ld\tstates %ld\teliminated duplicates %d\r", work_queue.size(), states.size(), match_count); i++; int error; NodeSet *nodes = work_queue.front(); work_queue.pop_front(); State *from = nodemap[make_pair(hash_NodeSet(nodes), nodes)]; /* Compute permissions associated with the State. 
*/ /* accept/audit perms for this state: union over its AcceptNodes */ from->accept = accept_perms(nodes, &from->audit, &error); if (error) { /* TODO!!!!!!!!!!!!! * permission error checking here */ } /* Compute possible transitions for `nodes`. This is done by * iterating over all the nodes in nodes and combining the * transitions. * * The resultant transition set is a mapping of characters to * sets of nodes. */ NodeCases cases; for (NodeSet::iterator i = nodes->begin(); i != nodes->end(); i++) (*i)->follow(cases); /* Now for each set of nodes in the computed transitions, make * sure that there is a state that maps to it, and add the * matching case to the state. */ /* check the default transition first */ if (cases.otherwise) { State *target; update_for_nodes(cases.otherwise, target); from->cases.otherwise = target; } /* For each transition from *from, check if the set of nodes it * transitions to already has been mapped to a state */ for (NodeCases::iterator j = cases.begin(); j != cases.end(); j++) { State *target; update_for_nodes(j->second, target); /* Don't insert transition that the default transition * already covers */ if (target != from->cases.otherwise) from->cases.cases[j->first] = target; } } /* for (NodeSet *nodes ... */ /* cleanup Sets of nodes used computing the DFA as they are no longer * needed. */ for (depth_first_traversal i(root); i; i++) { (*i)->firstpos.clear(); (*i)->lastpos.clear(); (*i)->followpos.clear(); } for (NodeMap::iterator i = nodemap.begin(); i != nodemap.end(); i++) delete i->first.second; nodemap.clear(); if (flags & (DFA_DUMP_STATS)) fprintf(stderr, "\033[2KCreated dfa: states %ld,\teliminated duplicates %d,\tprotostate sets: longest %u, avg %u\n", states.size(), match_count, proto_max, (unsigned int) (proto_sum/states.size())); /* TODO Dump dfa with NODE mapping - or node to dfa mapping */ // ?????? 
} /* Destructor: frees only the States; the Node tree (root) is not released here. */ DFA::~DFA() { for (Partition::iterator i = states.begin(); i != states.end(); i++) delete *i; } /* MatchFlag: an AcceptNode carrying one permission bitmask plus its audit bits. */ class MatchFlag : public AcceptNode { public: MatchFlag(uint32_t flag, uint32_t audit) : flag(flag), audit(audit) {} ostream& dump(ostream& os) { return os << '<' << flag << '>'; } uint32_t flag; uint32_t audit; }; /* ExactMatchFlag: MatchFlag for rules with no wildcards (exact-match x perms override). */ class ExactMatchFlag : public MatchFlag { public: ExactMatchFlag(uint32_t flag, uint32_t audit) : MatchFlag(flag, audit) {} }; /* DenyMatchFlag: MatchFlag for deny rules; 'audit' slot holds the quiet mask. */ class DenyMatchFlag : public MatchFlag { public: DenyMatchFlag(uint32_t flag, uint32_t quiet) : MatchFlag(flag, quiet) {} }; /* Remove dead or unreachable states */ void DFA::remove_unreachable(dfaflags_t flags) { set reachable; list work_queue; /* find the set of reachable states */ /* nonmatching is always retained even if nothing transitions to it */ reachable.insert(nonmatching); work_queue.push_back(start); while (!work_queue.empty()) { State *from = work_queue.front(); work_queue.pop_front(); reachable.insert(from); if (from->cases.otherwise && (reachable.find(from->cases.otherwise) == reachable.end())) work_queue.push_back(from->cases.otherwise); for (Cases::iterator j = from->cases.begin(); j != from->cases.end(); j++) { if (reachable.find(j->second) == reachable.end()) work_queue.push_back(j->second); } } /* walk the set of states and remove any that aren't reachable */ if (reachable.size() < states.size()) { int count = 0; Partition::iterator i; Partition::iterator next; for (i = states.begin(); i != states.end(); i = next) { next = i; next++; if (reachable.find(*i) == reachable.end()) { if (flags & DFA_DUMP_UNREACHABLE) { cerr << "unreachable: "<< **i; if (*i == start) cerr << " <=="; if ((*i)->accept) { cerr << " (0x" << hex << (*i)->accept << " " << (*i)->audit << dec << ')'; } cerr << endl; } State *current = *i; states.erase(i); delete(current); count++; } } if (count && (flags & DFA_DUMP_STATS)) cerr << "DFA: states " << states.size() << " removed " << count << " unreachable states\n"; } } /* test if two states have the same transitions under partition_map */ bool DFA::same_mappings(map 
&partition_map, State *s1, State *s2) { /* states stay in the same partition only if their default transitions and every labeled transition land in the same partitions; a transition to nonmatching counts as "no transition" */ if (s1->cases.otherwise && s1->cases.otherwise != nonmatching) { if (!s2->cases.otherwise || s2->cases.otherwise == nonmatching) return false; Partition *p1 = partition_map.find(s1->cases.otherwise)->second; Partition *p2 = partition_map.find(s2->cases.otherwise)->second; if (p1 != p2) return false; } else if (s2->cases.otherwise && s2->cases.otherwise != nonmatching) { return false; } if (s1->cases.cases.size() != s2->cases.cases.size()) return false; for (Cases::iterator j1 = s1->cases.begin(); j1 != s1->cases.end(); j1++){ Cases::iterator j2 = s2->cases.cases.find(j1->first); if (j2 == s2->cases.end()) return false; Partition *p1 = partition_map.find(j1->second)->second; Partition *p2 = partition_map.find(j2->second)->second; if (p1 != p2) return false; } return true; } /* Do simple djb2 hashing against a States transition cases * this provides a rough initial guess at state equivalence as if a state * has a different number of transitions or has transitions on different * cases they will never be equivalent. 
* Note: this only hashes based off of the alphabet (not destination) * as different destinations could end up being equiv */ size_t DFA::hash_trans(State *s) { unsigned long hash = 5381; /* fold in each transition's input char and the out-degree of its target (not the target identity) */ for (Cases::iterator j = s->cases.begin(); j != s->cases.end(); j++){ hash = ((hash << 5) + hash) + j->first; State *k = j->second; hash = ((hash << 5) + hash) + k->cases.cases.size(); } if (s->cases.otherwise && s->cases.otherwise != nonmatching) { hash = ((hash << 5) + hash) + 5381; State *k = s->cases.otherwise; hash = ((hash << 5) + hash) + k->cases.cases.size(); } hash = (hash << 8) | s->cases.cases.size(); return hash; } /* minimize the number of dfa states */ void DFA::minimize(dfaflags_t flags) { map , Partition *> perm_map; list partitions; map partition_map; /* Set up the initial partitions * minimium of - 1 non accepting, and 1 accepting * if trans hashing is used the accepting and non-accepting partitions * can be further split based on the number and type of transitions * a state makes. * If permission hashing is enabled the accepting partitions can * be further divided by permissions. This can result in not * obtaining a truely minimized dfa but comes close, and can speedup * minimization. 
*/ int accept_count = 0; int final_accept = 0; /* initial partitioning: key each state by (perm_hash, trans_hash) */ for (Partition::iterator i = states.begin(); i != states.end(); i++) { uint64_t perm_hash = 0; if (flags & DFA_CONTROL_MINIMIZE_HASH_PERMS) { /* make every unique perm create a new partition */ perm_hash = ((uint64_t)(*i)->audit)<<32 | (uint64_t)(*i)->accept; } else if ((*i)->audit || (*i)->accept) { /* combine all perms together into a single parition */ perm_hash = 1; } /* else not an accept state so 0 for perm_hash */ size_t trans_hash = 0; if (flags & DFA_CONTROL_MINIMIZE_HASH_TRANS) trans_hash = hash_trans(*i); pair group = make_pair(perm_hash, trans_hash); map , Partition *>::iterator p = perm_map.find(group); if (p == perm_map.end()) { Partition *part = new Partition(); part->push_back(*i); perm_map.insert(make_pair(group, part)); partitions.push_back(part); partition_map.insert(make_pair(*i, part)); if (perm_hash) accept_count++; } else { partition_map.insert(make_pair(*i, p->second)); p->second->push_back(*i); } if ((flags & DFA_DUMP_PROGRESS) && (partitions.size() % 1000 == 0)) cerr << "\033[2KMinimize dfa: partitions " << partitions.size() << "\tinit " << partitions.size() << " (accept " << accept_count << ")\r"; } /* perm_map is no longer needed so free the memory it is using. * Don't remove - doing it manually here helps reduce peak memory usage. */ perm_map.clear(); int init_count = partitions.size(); if (flags & DFA_DUMP_PROGRESS) cerr << "\033[2KMinimize dfa: partitions " << partitions.size() << "\tinit " << init_count << " (accept " << accept_count << ")\r"; /* Now do repartitioning until each partition contains the set of * states that are the same. This will happen when the partition * splitting stables. With a worse case of 1 state per partition * ie. already minimized. 
*/ /* refinement loop: split states out of each partition around its first state until no partition splits (fixed point) */ Partition *new_part; int new_part_count; do { new_part_count = 0; for (list ::iterator p = partitions.begin(); p != partitions.end(); p++) { new_part = NULL; State *rep = *((*p)->begin()); Partition::iterator next; for (Partition::iterator s = ++(*p)->begin(); s != (*p)->end(); ) { if (same_mappings(partition_map, rep, *s)) { ++s; continue; } if (!new_part) { new_part = new Partition; list ::iterator tmp = p; partitions.insert(++tmp, new_part); new_part_count++; } new_part->push_back(*s); s = (*p)->erase(s); } /* remapping partition_map for new_part entries * Do not do this above as it messes up same_mappings */ if (new_part) { for (Partition::iterator m = new_part->begin(); m != new_part->end(); m++) { partition_map.erase(*m); partition_map.insert(make_pair(*m, new_part)); } } if ((flags & DFA_DUMP_PROGRESS) && (partitions.size() % 100 == 0)) cerr << "\033[2KMinimize dfa: partitions " << partitions.size() << "\tinit " << init_count << " (accept " << accept_count << ")\r"; } } while(new_part_count); if (partitions.size() == states.size()) { if (flags & DFA_DUMP_STATS) cerr << "\033[2KDfa minimization no states removed: partitions " << partitions.size() << "\tinit " << init_count << " (accept " << accept_count << ")\n"; goto out; } /* Remap the dfa so it uses the representative states * Use the first state of a partition as the representative state * At this point all states with in a partion have transitions * to states within the same partitions, however this can slow * down compressed dfa compression as there are more states, */ for (list ::iterator p = partitions.begin(); p != partitions.end(); p++) { /* representative state for this partition */ State *rep = *((*p)->begin()); /* update representative state's transitions */ if (rep->cases.otherwise) { map ::iterator z = partition_map.find(rep->cases.otherwise); Partition *partition = partition_map.find(rep->cases.otherwise)->second; rep->cases.otherwise = *partition->begin(); } for (Cases::iterator c = 
rep->cases.begin(); c != rep->cases.end(); c++) { Partition *partition = partition_map.find(c->second)->second; c->second = *partition->begin(); } //if ((*p)->size() > 1) //cerr << rep->label << ": "; /* clear the state label for all non representative states, * and accumulate permissions */ /* label == -1 marks a state for deletion below */ for (Partition::iterator i = ++(*p)->begin(); i != (*p)->end(); i++) { //cerr << " " << (*i)->label; (*i)->label = -1; rep->accept |= (*i)->accept; rep->audit |= (*i)->audit; } if (rep->accept || rep->audit) final_accept++; //if ((*p)->size() > 1) //cerr << "\n"; } if (flags & DFA_DUMP_STATS) cerr << "\033[2KMinimized dfa: final partitions " << partitions.size() << " (accept " << final_accept << ")" << "\tinit " << init_count << " (accept " << accept_count << ")\n"; /* make sure nonmatching and start state are up to date with the * mappings */ { Partition *partition = partition_map.find(nonmatching)->second; if (*partition->begin() != nonmatching) { nonmatching = *partition->begin(); } partition = partition_map.find(start)->second; if (*partition->begin() != start) { start = *partition->begin(); } } /* Now that the states have been remapped, remove all states * that are not the representive states for their partition, they * will have a label == -1 */ for (Partition::iterator i = states.begin(); i != states.end(); ) { if ((*i)->label == -1) { State *s = *i; i = states.erase(i); delete(s); } else i++; } out: /* Cleanup */ while (!partitions.empty()) { Partition *p = partitions.front(); partitions.pop_front(); delete(p); } } /** * text-dump the DFA (for debugging). 
*/ void DFA::dump(ostream& os) { /* first list start/accept states with their perms, then all transitions */ for (Partition::iterator i = states.begin(); i != states.end(); i++) { if (*i == start || (*i)->accept) { os << **i; if (*i == start) os << " <=="; if ((*i)->accept) { os << " (0x" << hex << (*i)->accept << " " << (*i)->audit << dec << ')'; } os << endl; } } os << endl; for (Partition::iterator i = states.begin(); i != states.end(); i++) { if ((*i)->cases.otherwise) os << **i << " -> " << (*i)->cases.otherwise << endl; for (Cases::iterator j = (*i)->cases.begin(); j != (*i)->cases.end(); j++) { os << **i << " -> " << j->second << ": " << j->first << endl; } } os << endl; } /** * Create a dot (graphviz) graph from the DFA (for debugging). */ void DFA::dump_dot_graph(ostream& os) { os << "digraph \"dfa\" {" << endl; for (Partition::iterator i = states.begin(); i != states.end(); i++) { if (*i == nonmatching) continue; os << "\t\"" << **i << "\" [" << endl; if (*i == start) { os << "\t\tstyle=bold" << endl; } uint32_t perms = (*i)->accept; if (perms) { os << "\t\tlabel=\"" << **i << "\\n(" << perms << ")\"" << endl; } os << "\t]" << endl; } for (Partition::iterator i = states.begin(); i != states.end(); i++) { Cases& cases = (*i)->cases; Chars excluded; /* edges to nonmatching are omitted; their chars become the [^...] label on the otherwise edge */ for (Cases::iterator j = cases.begin(); j != cases.end(); j++) { if (j->second == nonmatching) excluded.insert(j->first); else { os << "\t\"" << **i << "\" -> \""; os << j->second << "\" [" << endl; os << "\t\tlabel=\"" << j->first << "\"" << endl; os << "\t]" << endl; } } if (cases.otherwise && cases.otherwise != nonmatching) { os << "\t\"" << **i << "\" -> \"" << cases.otherwise << "\" [" << endl; if (!excluded.empty()) { os << "\t\tlabel=\"[^"; for (Chars::iterator i = excluded.begin(); i != excluded.end(); i++) { os << *i; } os << "]\"" << endl; } os << "\t]" << endl; } } os << '}' << endl; } /** * Compute character equivalence classes in the DFA to save space in the * transition table. 
*/ map DFA::equivalence_classes(dfaflags_t flags) { map classes; uchar next_class = 1; for (Partition::iterator i = states.begin(); i != states.end(); i++) { Cases& cases = (*i)->cases; /* Group edges to the same next state together */ map node_sets; for (Cases::iterator j = cases.begin(); j != cases.end(); j++) node_sets[j->second].insert(j->first); for (map::iterator j = node_sets.begin(); j != node_sets.end(); j++) { /* Group edges to the same next state together by class */ map node_classes; bool class_used = false; for (Chars::iterator k = j->second.begin(); k != j->second.end(); k++) { pair::iterator, bool> x = classes.insert(make_pair(*k, next_class)); if (x.second) class_used = true; pair::iterator, bool> y = node_classes.insert(make_pair(x.first->second, Chars())); y.first->second.insert(*k); } if (class_used) { next_class++; class_used = false; } for (map::iterator k = node_classes.begin(); k != node_classes.end(); k++) { /** * If any other characters are in the same class, move * the characters in this class into their own new class */ map::iterator l; for (l = classes.begin(); l != classes.end(); l++) { if (l->second == k->first && k->second.find(l->first) == k->second.end()) { class_used = true; break; } } if (class_used) { for (Chars::iterator l = k->second.begin(); l != k->second.end(); l++) { classes[*l] = next_class; } next_class++; class_used = false; } } } } if (flags & DFA_DUMP_EQUIV_STATS) fprintf(stderr, "Equiv class reduces to %d classes\n", next_class - 1); return classes; } /** * Text-dump the equivalence classes (for debugging). 
*/ void dump_equivalence_classes(ostream& os, map& eq) { /* invert char -> class into class -> set of chars for display */ map rev; for (map::iterator i = eq.begin(); i != eq.end(); i++) { Chars& chars = rev.insert(make_pair(i->second, Chars())).first->second; chars.insert(i->first); } os << "(eq):" << endl; for (map::iterator i = rev.begin(); i != rev.end(); i++) { os << (int)i->first << ':'; Chars& chars = i->second; for (Chars::iterator j = chars.begin(); j != chars.end(); j++) { os << ' ' << *j; } os << endl; } } /** * Replace characters with classes (which are also represented as * characters) in the DFA transition table. */ void DFA::apply_equivalence_classes(map& eq) { /** * Note: We only transform the transition table; the nodes continue to * contain the original characters. */ for (Partition::iterator i = states.begin(); i != states.end(); i++) { map tmp; tmp.swap((*i)->cases.cases); for (Cases::iterator j = tmp.begin(); j != tmp.end(); j++) (*i)->cases.cases.insert(make_pair(eq[j->first], j->second)); } } /** * Flip the children of all cat nodes. This causes strings to be matched * back-forth. 
*/ void flip_tree(Node *node) { for (depth_first_traversal i(node); i; i++) { if (CatNode *cat = dynamic_cast(*i)) { swap(cat->child[0], cat->child[1]); } } } /* TransitionTable: comb-compressed (accept/default/base/next/check) transition arrays in the style of flex-generated scanners. */ class TransitionTable { typedef vector > DefaultBase; typedef vector > NextCheck; public: TransitionTable(DFA& dfa, map& eq, dfaflags_t flags); void dump(ostream& os); void flex_table(ostream& os, const char *name); void init_free_list(vector > &free_list, size_t prev, size_t start); bool fits_in(vector > &free_list, size_t base, Cases& cases); void insert_state(vector > &free_list, State *state, DFA& dfa); private: vector accept; vector accept2; DefaultBase default_base; NextCheck next_check; map num; map& eq; uchar max_eq; size_t first_free; }; /* Link the free slots [start, free_list.size()) into a doubly-linked list threaded through (first=prev, second=next) index pairs; the last entry's next is 0 (list terminator). */ void TransitionTable::init_free_list(vector > &free_list, size_t prev, size_t start) { for (size_t i = start; i < free_list.size(); i++) { if (prev) free_list[prev].second = i; free_list[i].first = prev; prev = i; } free_list[free_list.size() -1].second = 0; } /** * new Construct the transition table. */ TransitionTable::TransitionTable(DFA& dfa, map& eq, dfaflags_t flags) : eq(eq) { if (flags & DFA_DUMP_TRANS_PROGRESS) fprintf(stderr, "Compressing trans table:\r"); /* max_eq: highest equivalence-class value in use, or 255 when classes are disabled */ if (eq.empty()) max_eq = 255; else { max_eq = 0; for(map::iterator i = eq.begin(); i != eq.end(); i++) { if (i->second > max_eq) max_eq = i->second; } } /* Do initial setup adding up all the transitions and sorting by * transition count. 
*/ size_t optimal = 2; multimap order; vector > free_list; for (Partition::iterator i = dfa.states.begin(); i != dfa.states.end(); i++) { if (*i == dfa.start || *i == dfa.nonmatching) continue; optimal += (*i)->cases.cases.size(); if (flags & DFA_CONTROL_TRANS_HIGH) { size_t range = 0; if ((*i)->cases.cases.size()) range = (*i)->cases.cases.rbegin()->first - (*i)->cases.begin()->first; size_t ord = ((256 - (*i)->cases.cases.size()) << 8) | (256 - range); /* reverse sort by entry count, most entries first */ order.insert(make_pair(ord, *i)); } } /* Insert the dummy nonmatching transition by hand */ next_check.push_back(make_pair(dfa.nonmatching, dfa.nonmatching)); default_base.push_back(make_pair(dfa.nonmatching, 0)); num.insert(make_pair(dfa.nonmatching, num.size())); accept.resize(dfa.states.size()); accept2.resize(dfa.states.size()); next_check.resize(optimal); free_list.resize(optimal); accept[0] = 0; accept2[0] = 0; first_free = 1; init_free_list(free_list, 0, 1); insert_state(free_list, dfa.start, dfa); accept[1] = 0; accept2[1] = 0; num.insert(make_pair(dfa.start, num.size())); int count = 2; /* insert remaining states either in DFA order, or (TRANS_HIGH) densest-first using 'order' */ if (!(flags & DFA_CONTROL_TRANS_HIGH)) { for (Partition::iterator i = dfa.states.begin(); i != dfa.states.end(); i++) { if (*i != dfa.nonmatching && *i != dfa.start) { insert_state(free_list, *i, dfa); accept[num.size()] = (*i)->accept; accept2[num.size()] = (*i)->audit; num.insert(make_pair(*i, num.size())); } if (flags & (DFA_DUMP_TRANS_PROGRESS)) { count++; if (count % 100 == 0) fprintf(stderr, "\033[2KCompressing trans table: insert state: %d/%ld\r", count, dfa.states.size()); } } } else { for (multimap ::iterator i = order.begin(); i != order.end(); i++) { if (i->second != dfa.nonmatching && i->second != dfa.start) { insert_state(free_list, i->second, dfa); accept[num.size()] = i->second->accept; accept2[num.size()] = i->second->audit; num.insert(make_pair(i->second, num.size())); } if (flags & (DFA_DUMP_TRANS_PROGRESS)) { count++; if (count % 100 == 0) 
fprintf(stderr, "\033[2KCompressing trans table: insert state: %d/%ld\r", count, dfa.states.size()); } } } if (flags & (DFA_DUMP_TRANS_STATS | DFA_DUMP_TRANS_PROGRESS)) { ssize_t size = 4 * next_check.size() + 6 * dfa.states.size(); fprintf(stderr, "\033[2KCompressed trans table: states %ld, next/check %ld, optimal next/check %ld avg/state %.2f, compression %ld/%ld = %.2f %%\n", dfa.states.size(), next_check.size(), optimal, (float)next_check.size()/(float)dfa.states.size(), size, 512 * dfa.states.size(), 100.0 - ((float) size * 100.0 / (float)(512 * dfa.states.size()))); } } /** * Does fit into position of the transition table? */ /* true if every labeled transition of 'cases', based at 'pos', lands on a slot that is free (or past the current end, which will be grown). */ bool TransitionTable::fits_in(vector > &free_list, size_t pos, Cases& cases) { size_t c, base = pos - cases.begin()->first; for (Cases::iterator i = cases.begin(); i != cases.end(); i++) { c = base + i->first; /* if it overflows the next_check array it fits in as we will * resize */ if (c >= next_check.size()) return true; if (next_check[c].second) return false; } return true; } /** * Insert of into the transition table. */ void TransitionTable::insert_state(vector > &free_list, State *from, DFA& dfa) { State *default_state = dfa.nonmatching; size_t base = 0; int resize; Cases& cases = from->cases; size_t c = cases.begin()->first; size_t prev = 0; size_t x = first_free; if (cases.otherwise) default_state = cases.otherwise; if (cases.cases.empty()) goto do_insert; repeat: resize = 0; /* get the first free entry that won't underflow */ while (x && (x < c)) { prev = x; x = free_list[x].second; } /* try inserting until we succeed. 
*/ while (x && !fits_in(free_list, x, cases)) { prev = x; x = free_list[x].second; } if (!x) { resize = 256 - cases.begin()->first; x = free_list.size(); /* set prev to last free */ } else if (x + 255 - cases.begin()->first >= next_check.size()) { resize = (255 - cases.begin()->first - (next_check.size() - 1 - x)); for (size_t y = x; y; y = free_list[y].second) prev = y; } if (resize) { /* expand next_check and free_list */ size_t old_size = free_list.size(); next_check.resize(next_check.size() + resize); free_list.resize(free_list.size() + resize); init_free_list(free_list, prev, old_size); if (!first_free) first_free = old_size;; /* NOTE(review): stray extra ';' */ if (x == old_size) goto repeat; } base = x - c; /* claim the slots: write (next, check) pairs and unlink each used slot from the free list */ for (Cases::iterator j = cases.begin(); j != cases.end(); j++) { next_check[base + j->first] = make_pair(j->second, from); size_t prev = free_list[base + j->first].first; size_t next = free_list[base + j->first].second; if (prev) free_list[prev].second = next; if (next) free_list[next].first = prev; if (base + j->first == first_free) first_free = next; } do_insert: default_base.push_back(make_pair(default_state, base)); } /** * Text-dump the transition table (for debugging). 
*/ void TransitionTable::dump(ostream& os) { /* st: reverse of num (table index -> State) for labeling the dump */ map st; for (map::iterator i = num.begin(); i != num.end(); i++) { st.insert(make_pair(i->second, i->first)); } os << "size=" << default_base.size() << " (accept, default, base): {state} -> {default state}" << endl; for (size_t i = 0; i < default_base.size(); i++) { os << i << ": "; os << "(" << accept[i] << ", " << num[default_base[i].first] << ", " << default_base[i].second << ")"; if (st[i]) os << " " << *st[i]; if (default_base[i].first) os << " -> " << *default_base[i].first; os << endl; } os << "size=" << next_check.size() << " (next, check): {check state} -> {next state} : offset from base" << endl; for (size_t i = 0; i < next_check.size(); i++) { if (!next_check[i].second) continue; os << i << ": "; if (next_check[i].second) { os << "(" << num[next_check[i].first] << ", " << num[next_check[i].second] << ")" << " " << *next_check[i].second << " -> " << *next_check[i].first << ": "; size_t offs = i - default_base[num[next_check[i].second]].second; if (eq.size()) os << offs; else os << (uchar)offs; } os << endl; } } #if 0 template class FirstIterator { public: FirstIterator(Iter pos) : pos(pos) { } typename Iter::value_type::first_type operator*() { return pos->first; } bool operator!=(FirstIterator& i) { return pos != i.pos; } void operator++() { ++pos; } ssize_t operator-(FirstIterator i) { return pos - i.pos; } private: Iter pos; }; template FirstIterator first_iterator(Iter iter) { return FirstIterator(iter); } template class SecondIterator { public: SecondIterator(Iter pos) : pos(pos) { } typename Iter::value_type::second_type operator*() { return pos->second; } bool operator!=(SecondIterator& i) { return pos != i.pos; } void operator++() { ++pos; } ssize_t operator-(SecondIterator i) { return pos - i.pos; } private: Iter pos; }; template SecondIterator second_iterator(Iter iter) { return SecondIterator(iter); } #endif /** * Create a flex-style binary dump of the DFA tables. 
The table format * was partly reverse engineered from the flex sources and from * examining the tables that flex creates with its --tables-file option. * (Only the -Cf and -Ce formats are currently supported.) */ #include "flex-tables.h" #include "regexp.h" /* pad64/fill64: the flex table format aligns each section to 8 bytes */ static inline size_t pad64(size_t i) { return (i + (size_t)7) & ~(size_t)7; } string fill64(size_t i) { const char zeroes[8] = { }; string fill(zeroes, (i & 7) ? 8 - (i & 7) : 0); return fill; } template size_t flex_table_size(Iter pos, Iter end) { return pad64(sizeof(struct table_header) + sizeof(*pos) * (end - pos)); } /* serialize one table section: header then each element big-endian, width taken from sizeof(*pos) */ template void write_flex_table(ostream& os, int id, Iter pos, Iter end) { struct table_header td = { }; size_t size = end - pos; td.td_id = htons(id); td.td_flags = htons(sizeof(*pos)); td.td_lolen = htonl(size); os.write((char *)&td, sizeof(td)); for (; pos != end; ++pos) { switch(sizeof(*pos)) { case 4: os.put((char)(*pos >> 24)); os.put((char)(*pos >> 16)); case 2: os.put((char)(*pos >> 8)); case 1: os.put((char)*pos); } } os << fill64(sizeof(td) + sizeof(*pos) * size); } void TransitionTable::flex_table(ostream& os, const char *name) { const char th_version[] = "notflex"; struct table_set_header th = { }; /** * Change the following two data types to adjust the maximum flex * table size. */ typedef uint16_t state_t; typedef uint32_t trans_t; if (default_base.size() >= (state_t)-1) { cerr << "Too many states (" << default_base.size() << ") for " "type state_t" << endl; exit(1); } if (next_check.size() >= (trans_t)-1) { cerr << "Too many transitions (" << next_check.size() << ") for " "type trans_t" << endl; exit(1); } /** * Create copies of the data structures so that we can dump the tables * using the generic write_flex_table() routine. 
*/ vector equiv_vec; if (eq.size()) { equiv_vec.resize(256); for (map::iterator i = eq.begin(); i != eq.end(); i++) { equiv_vec[i->first] = i->second; } } vector default_vec; vector base_vec; for (DefaultBase::iterator i = default_base.begin(); i != default_base.end(); i++) { default_vec.push_back(num[i->first]); base_vec.push_back(i->second); } vector next_vec; vector check_vec; for (NextCheck::iterator i = next_check.begin(); i != next_check.end(); i++) { next_vec.push_back(num[i->first]); check_vec.push_back(num[i->second]); } /* Write the actual flex parser table. */ size_t hsize = pad64(sizeof(th) + sizeof(th_version) + strlen(name) + 1); th.th_magic = htonl(YYTH_REGEXP_MAGIC); th.th_hsize = htonl(hsize); th.th_ssize = htonl(hsize + flex_table_size(accept.begin(), accept.end()) + flex_table_size(accept2.begin(), accept2.end()) + (eq.size() ? flex_table_size(equiv_vec.begin(), equiv_vec.end()) : 0) + flex_table_size(base_vec.begin(), base_vec.end()) + flex_table_size(default_vec.begin(), default_vec.end()) + flex_table_size(next_vec.begin(), next_vec.end()) + flex_table_size(check_vec.begin(), check_vec.end())); os.write((char *)&th, sizeof(th)); os << th_version << (char)0 << name << (char)0; os << fill64(sizeof(th) + sizeof(th_version) + strlen(name) + 1); write_flex_table(os, YYTD_ID_ACCEPT, accept.begin(), accept.end()); write_flex_table(os, YYTD_ID_ACCEPT2, accept2.begin(), accept2.end()); if (eq.size()) write_flex_table(os, YYTD_ID_EC, equiv_vec.begin(), equiv_vec.end()); write_flex_table(os, YYTD_ID_BASE, base_vec.begin(), base_vec.end()); write_flex_table(os, YYTD_ID_DEF, default_vec.begin(), default_vec.end()); write_flex_table(os, YYTD_ID_NXT, next_vec.begin(), next_vec.end()); write_flex_table(os, YYTD_ID_CHK, check_vec.begin(), check_vec.end()); } #if 0 typedef set AcceptNodes; map dominance(DFA& dfa) { map is_dominated; for (States::iterator i = dfa.states.begin(); i != dfa.states.end(); i++) { AcceptNodes set1; for (State::iterator j = 
(*i)->begin(); j != (*i)->end(); j++) { if (AcceptNode *accept = dynamic_cast(*j)) set1.insert(accept); } for (AcceptNodes::iterator j = set1.begin(); j != set1.end(); j++) { pair::iterator, bool> x = is_dominated.insert(make_pair(*j, set1)); if (!x.second) { AcceptNodes &set2(x.first->second), set3; for (AcceptNodes::iterator l = set2.begin(); l != set2.end(); l++) { if (set1.find(*l) != set1.end()) set3.insert(*l); } set3.swap(set2); } } } return is_dominated; } #endif /* In-order dump of the expression tree (debugging). */ void dump_regexp_rec(ostream& os, Node *tree) { if (tree->child[0]) dump_regexp_rec(os, tree->child[0]); os << *tree; if (tree->child[1]) dump_regexp_rec(os, tree->child[1]); } void dump_regexp(ostream& os, Node *tree) { dump_regexp_rec(os, tree); os << endl; } #include #include struct aare_ruleset { int reverse; Node *root; }; /* C API: allocate an empty ruleset; 'reverse' selects back-to-front matching (see flip_tree). */ extern "C" aare_ruleset_t *aare_new_ruleset(int reverse) { aare_ruleset_t *container = (aare_ruleset_t *) malloc(sizeof(aare_ruleset_t)); if (!container) return NULL; container->root = NULL; container->reverse = reverse; return container; } extern "C" void aare_delete_ruleset(aare_ruleset_t *rules) { if (rules) { if (rules->root) rules->root->release(); free(rules); } } /* true if both perms carry exec-type bits and those bits conflict */ static inline int diff_qualifiers(uint32_t perm1, uint32_t perm2) { return ((perm1 & AA_EXEC_TYPE) && (perm2 & AA_EXEC_TYPE) && (perm1 & AA_EXEC_TYPE) != (perm2 & AA_EXEC_TYPE)); } /** * Compute the permission flags that this state corresponds to. If we * have any exact matches, then they override the execute and safe * execute flags. 
*/ uint32_t accept_perms(NodeSet *state, uint32_t *audit_ctl, int *error) { uint32_t perms = 0, exact_match_perms = 0, audit = 0, exact_audit = 0, quiet = 0, deny = 0; if (error) *error = 0; /* accumulate perms/audit separately for exact matches, denies, and everything else */ for (NodeSet::iterator i = state->begin(); i != state->end(); i++) { MatchFlag *match; if (!(match= dynamic_cast(*i))) continue; if (dynamic_cast(match)) { /* exact match only ever happens with x */ if (!is_merged_x_consistent(exact_match_perms, match->flag) && error) *error = 1;; /* NOTE(review): stray extra ';' */ exact_match_perms |= match->flag; exact_audit |= match->audit; } else if (dynamic_cast(match)) { deny |= match->flag; quiet |= match->audit; } else { if (!is_merged_x_consistent(perms, match->flag) && error) *error = 1; perms |= match->flag; audit |= match->audit; } } //if (audit || quiet) //fprintf(stderr, "perms: 0x%x, audit: 0x%x exact: 0x%x eaud: 0x%x deny: 0x%x quiet: 0x%x\n", perms, audit, exact_match_perms, exact_audit, deny, quiet); /* exact-match exec types override wildcard-derived exec types, per user/other halves */ perms |= exact_match_perms & ~(AA_USER_EXEC_TYPE | AA_OTHER_EXEC_TYPE); if (exact_match_perms & AA_USER_EXEC_TYPE) { perms = (exact_match_perms & AA_USER_EXEC_TYPE) | (perms & ~AA_USER_EXEC_TYPE); audit = (exact_audit & AA_USER_EXEC_TYPE) | (audit & ~ AA_USER_EXEC_TYPE); } if (exact_match_perms & AA_OTHER_EXEC_TYPE) { perms = (exact_match_perms & AA_OTHER_EXEC_TYPE) | (perms & ~AA_OTHER_EXEC_TYPE); audit = (exact_audit & AA_OTHER_EXEC_TYPE) | (audit & ~AA_OTHER_EXEC_TYPE); } if (perms & AA_USER_EXEC & deny) perms &= ~AA_USER_EXEC_TYPE; if (perms & AA_OTHER_EXEC & deny) perms &= ~AA_OTHER_EXEC_TYPE; perms &= ~deny; if (audit_ctl) *audit_ctl = PACK_AUDIT_CTL(audit, quiet & deny); // if (perms & AA_ERROR_BIT) { // fprintf(stderr, "error bit 0x%x\n", perms); // exit(255); //} //if (perms & AA_EXEC_BITS) //fprintf(stderr, "accept perm: 0x%x\n", perms); /* if (perms & ~AA_VALID_PERMS) yyerror(_("Internal error accumulated invalid perm 0x%llx\n"), perms); */ //if (perms & AA_CHANGE_HAT) // fprintf(stderr, "change_hat 0x%x\n", perms); return perms; } extern "C" int 
aare_add_rule(aare_ruleset_t *rules, char *rule, int deny, uint32_t perms, uint32_t audit, dfaflags_t flags) { /* single-rule convenience wrapper around the vector form */ return aare_add_rule_vec(rules, deny, perms, audit, 1, &rule, flags); } #define FLAGS_WIDTH 2 #define MATCH_FLAGS_SIZE (sizeof(uint32_t) * 8 - 1) /* caches of MatchFlag nodes, indexed by [audit-bit][perm-bit or exec index], shared across rules */ MatchFlag *match_flags[FLAGS_WIDTH][MATCH_FLAGS_SIZE]; DenyMatchFlag *deny_flags[FLAGS_WIDTH][MATCH_FLAGS_SIZE]; #define EXEC_MATCH_FLAGS_SIZE ((AA_EXEC_COUNT << 2) * 2) MatchFlag *exec_match_flags[FLAGS_WIDTH][EXEC_MATCH_FLAGS_SIZE]; /* mods + unsafe + ix *u::o*/ ExactMatchFlag *exact_match_flags[FLAGS_WIDTH][EXEC_MATCH_FLAGS_SIZE];/* mods + unsafe +ix *u::o*/ extern "C" void aare_reset_matchflags(void) { uint32_t i, j; #define RESET_FLAGS(group, size) { \ for (i = 0; i < FLAGS_WIDTH; i++) { \ for (j = 0; j < size; j++) { \ if ((group)[i][j]) delete (group)[i][j]; \ (group)[i][j] = NULL; \ } \ } \ } RESET_FLAGS(match_flags,MATCH_FLAGS_SIZE); RESET_FLAGS(deny_flags,MATCH_FLAGS_SIZE); RESET_FLAGS(exec_match_flags,EXEC_MATCH_FLAGS_SIZE); RESET_FLAGS(exact_match_flags,EXEC_MATCH_FLAGS_SIZE); #undef RESET_FLAGS } extern "C" int aare_add_rule_vec(aare_ruleset_t *rules, int deny, uint32_t perms, uint32_t audit, int count, char **rulev, dfaflags_t flags) { Node *tree = NULL, *accept; int exact_match; assert(perms != 0); /* NOTE(review): on regexp_parse() failure below, any partially built 'tree' is leaked */ if (regexp_parse(&tree, rulev[0])) return 0; /* join the rule vector parts with a literal \0 separator (CharNode(0)) */ for (int i = 1; i < count; i++) { Node *subtree = NULL; Node *node = new CharNode(0); if (!node) return 0; tree = new CatNode(tree, node); if (regexp_parse(&subtree, rulev[i])) return 0; tree = new CatNode(tree, subtree); } /* * Check if we have an expression with or without wildcards. This * determines how exec modifiers are merged in accept_perms() based * on how we split permission bitmasks here. 
*/ exact_match = 1; for (depth_first_traversal i(tree); i; i++) { if (dynamic_cast(*i) || dynamic_cast(*i) || dynamic_cast(*i) || dynamic_cast(*i) || dynamic_cast(*i)) exact_match = 0; } if (rules->reverse) flip_tree(tree); /* 0x3f == 4 bits x mods + 1 bit unsafe mask + 1 bit ix, after shift */ #define EXTRACT_X_INDEX(perm, shift) (((perm) >> (shift + 8)) & 0x3f) //if (perms & ALL_AA_EXEC_TYPE && (!perms & AA_EXEC_BITS)) // fprintf(stderr, "adding X rule without MAY_EXEC: 0x%x %s\n", perms, rulev[0]); //if (perms & ALL_EXEC_TYPE) // fprintf(stderr, "adding X rule %s 0x%x\n", rulev[0], perms); //if (audit) //fprintf(stderr, "adding rule with audit bits set: 0x%x %s\n", audit, rulev[0]); //if (perms & AA_CHANGE_HAT) // fprintf(stderr, "adding change_hat rule %s\n", rulev[0]); /* the permissions set is assumed to be non-empty if any audit * bits are specified */ /* build the accept node chain: one (cached) MatchFlag per set perm bit, OR-ed together with AltNodes */ accept = NULL; for (unsigned int n = 0; perms && n < (sizeof(perms) * 8) ; n++) { uint32_t mask = 1 << n; if (perms & mask) { int ai = audit & mask ? 
1 : 0; perms &= ~mask; Node *flag; if (mask & ALL_AA_EXEC_TYPE) /* these cases are covered by EXEC_BITS */ continue; if (deny) { if (deny_flags[ai][n]) { flag = deny_flags[ai][n]; } else { //fprintf(stderr, "Adding deny ai %d mask 0x%x audit 0x%x\n", ai, mask, audit & mask); deny_flags[ai][n] = new DenyMatchFlag(mask, audit&mask); flag = deny_flags[ai][n]; } } else if (mask & AA_EXEC_BITS) { uint32_t eperm = 0; uint32_t index = 0; if (mask & AA_USER_EXEC) { eperm = mask | (perms & AA_USER_EXEC_TYPE); index = EXTRACT_X_INDEX(eperm, AA_USER_SHIFT); } else { eperm = mask | (perms & AA_OTHER_EXEC_TYPE); index = EXTRACT_X_INDEX(eperm, AA_OTHER_SHIFT) + (AA_EXEC_COUNT << 2); } //fprintf(stderr, "index %d eperm 0x%x\n", index, eperm); if (exact_match) { if (exact_match_flags[ai][index]) { flag = exact_match_flags[ai][index]; } else { exact_match_flags[ai][index] = new ExactMatchFlag(eperm, audit&mask); flag = exact_match_flags[ai][index]; } } else { if (exec_match_flags[ai][index]) { flag = exec_match_flags[ai][index]; } else { exec_match_flags[ai][index] = new MatchFlag(eperm, audit&mask); flag = exec_match_flags[ai][index]; } } } else { if (match_flags[ai][n]) { flag = match_flags[ai][n]; } else { match_flags[ai][n] = new MatchFlag(mask, audit&mask); flag = match_flags[ai][n]; } } if (accept) accept = new AltNode(accept, flag); else accept = flag; } } if (flags & DFA_DUMP_RULE_EXPR) { cerr << "rule: "; cerr << rulev[0]; for (int i = 1; i < count; i++) { cerr << "\\x00"; cerr << rulev[i]; } cerr << " -> "; tree->dump(cerr); cerr << "\n\n"; } /* graft (tree . accept) into the ruleset as an alternative */ if (rules->root) rules->root = new AltNode(rules->root, new CatNode(tree, accept)); else rules->root = new CatNode(tree, accept); return 1; } /* create a dfa from the ruleset * returns: buffer contain dfa tables, @size set to the size of the tables * else NULL on failure */ extern "C" void *aare_create_dfa(aare_ruleset_t *rules, size_t *size, dfaflags_t flags) { char *buffer = NULL; label_nodes(rules->root); if (flags & 
DFA_DUMP_TREE) { cerr << "\nDFA: Expression Tree\n"; rules->root->dump(cerr); cerr << "\n\n"; } if (flags & DFA_CONTROL_TREE_SIMPLE) { rules->root = simplify_tree(rules->root, flags); if (flags & DFA_DUMP_SIMPLE_TREE) { cerr << "\nDFA: Simplified Expression Tree\n"; rules->root->dump(cerr); cerr << "\n\n"; } } DFA dfa(rules->root, flags); if (flags & DFA_CONTROL_MINIMIZE) dfa.minimize(flags); //if (flags & DFA_CONTROL_REMOVE_UNREACHABLE) // remove_unreachable(flags); if (flags & DFA_DUMP_STATES) dfa.dump(cerr); if (flags & DFA_DUMP_GRAPH) dfa.dump_dot_graph(cerr); map eq; if (flags & DFA_CONTROL_EQUIV) { eq = dfa.equivalence_classes(flags); dfa.apply_equivalence_classes(eq); if (flags & DFA_DUMP_EQUIV) { cerr << "\nDFA equivalence class\n"; dump_equivalence_classes(cerr, eq); } } else if (flags & DFA_DUMP_EQUIV) cerr << "\nDFA did not generate an equivalence class\n"; // TODO: perm verification needs to be moved into dfa creation // if (dfa.verify_perms()) { // *size = 0; // return NULL; // } stringstream stream; TransitionTable transition_table(dfa, eq, flags); if (flags & DFA_DUMP_TRANS_TABLE) transition_table.dump(cerr); transition_table.flex_table(stream, ""); stringbuf *buf = stream.rdbuf(); buf->pubseekpos(0); *size = buf->in_avail(); buffer = (char *)malloc(*size); if (!buffer) return NULL; buf->sgetn(buffer, *size); return buffer; }