Main Page   Class Hierarchy   Compound List   File List   Compound Members   File Members  

cmt_regexp.cxx

Go to the documentation of this file.
00001 
00002 #include "cmt_std.h"
00003 #include "cmt_regexp.h"
00004 #include "cmt_vector.h"
00005 
00006 //----------------------------------------------------------
00007 //
00008 //  Declarations
00009 //
00010 //----------------------------------------------------------
00011 
00012 //----------------------------------------------------------
00013 class cmt_node
00014 {
00015 public:
00016   static cmt_node& null ();
00017   static int node_count ();
00018   
00019   cmt_node ();
00020   virtual ~cmt_node ();
00021   
00022   virtual const cmt_regexp::iterator match (const cmt_string& text, 
00023                                             int pos) const;
00024   virtual bool is_char () const;
00025   virtual bool is_many_node () const;
00026   
00027 private:
00028   static int _node_count;
00029 };
00030 //----------------------------------------------------------
00031 
00032 //----------------------------------------------------------
00033 class cmt_char_node : public cmt_node
00034 {
00035 public:
00036   cmt_char_node (char c);
00037   
00038   const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
00039   
00040   bool is_char () const;
00041   operator char ();
00042   
00043 private:
00044   char _c;
00045 };
00046 //----------------------------------------------------------
00047 
00048 //----------------------------------------------------------
00049 class cmt_string_node : public cmt_node
00050 {
00051 public:
00052   cmt_string_node (const cmt_string& s);
00053   
00054   const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
00055   
00056 private:
00057   cmt_string _s;
00058 };
00059 //----------------------------------------------------------
00060 
00061 //----------------------------------------------------------
00062 class cmt_char_list_node : public cmt_node
00063 {
00064 public:
00065   cmt_char_list_node (cmt_string list);
00066   
00067   const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
00068   
00069 protected:
00070   cmt_string _list;
00071   cmt_string _choices;
00072 };
00073 //----------------------------------------------------------
00074 
00075 //----------------------------------------------------------
00076 class cmt_not_char_list_node : public cmt_char_list_node
00077 {
00078 public:
00079   cmt_not_char_list_node (cmt_string list);
00080   
00081   const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
00082 };
00083 //----------------------------------------------------------
00084 
00085 //----------------------------------------------------------
00086 class cmt_any_node : public cmt_node
00087 {
00088 public:
00089   const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
00090 };
00091 //----------------------------------------------------------
00092 
00093 //----------------------------------------------------------
00094 class cmt_zero_one : public cmt_node
00095 {
00096 public:
00097   cmt_zero_one (cmt_node* n);
00098   ~cmt_zero_one ();
00099   
00100   const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
00101   
00102 protected:
00103   cmt_node* _node;
00104 };
00105 //----------------------------------------------------------
00106 
00107 //----------------------------------------------------------
00108 class cmt_begin_node : public cmt_node
00109 {
00110 public:
00111   const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
00112 };
00113 //----------------------------------------------------------
00114 
00115 //----------------------------------------------------------
00116 class cmt_end_node : public cmt_node
00117 {
00118 public:
00119   const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
00120 };
00121 //----------------------------------------------------------
00122 
00123 //----------------------------------------------------------
00124 class cmt_node_set : public cmt_node
00125 {
00126 public:
00127   cmt_node_set ();
00128   cmt_node_set (cmt_node_set* father);
00129   ~cmt_node_set ();
00130   
00131   cmt_node_set* father ();
00132   void clear ();
00133   void push (cmt_node* n);
00134   cmt_node* pop ();
00135   cmt_node* top () const;
00136   int nodes () const;
00137   const cmt_node* nodeAt (int index) const;
00138   bool parentheses () const;
00139   void set_parentheses (bool value);
00140   virtual void reduce ();
00141   
00142 protected:
00143   cmt_node_set* _father;
00144   cmt_vector<cmt_node*> _nodes;
00145   bool _parentheses;
00146 };
00147 //----------------------------------------------------------
00148 
00149 //----------------------------------------------------------
00150 class cmt_and_node : public cmt_node_set
00151 {
00152 public:
00153   cmt_and_node ();
00154   cmt_and_node (cmt_node_set* father);
00155   
00156   const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
00157   void reduce ();
00158   void fill (cmt_and_node& other, int start_index);
00159 };
00160 //----------------------------------------------------------
00161 
00162 //----------------------------------------------------------
00163 class cmt_or_node : public cmt_node_set
00164 {
00165 public:
00166   cmt_or_node (cmt_node_set* father);
00167   
00168   const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
00169 };
00170 //----------------------------------------------------------
00171 
00172 //----------------------------------------------------------
00173 class cmt_many_node : public cmt_node
00174 {
00175 public:
00176   bool is_many_node () const;
00177   void install (cmt_and_node& other, int index);
00178   void reduce ();
00179   
00180 protected:
00181   cmt_many_node (cmt_node* n);
00182   virtual ~cmt_many_node ();
00183   cmt_node* _node;
00184   cmt_and_node _follower;
00185 };
00186 //----------------------------------------------------------
00187 
00188 //----------------------------------------------------------
00189 class cmt_zero_more : public cmt_many_node
00190 {
00191 public:
00192   cmt_zero_more (cmt_node* n);
00193   
00194   const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
00195 };
00196 //----------------------------------------------------------
00197 
00198 //----------------------------------------------------------
00199 class cmt_one_more : public cmt_many_node
00200 {
00201 public:
00202   cmt_one_more (cmt_node* n);
00203 
00204   const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
00205 };
00206 //----------------------------------------------------------
00207 
00208 //----------------------------------------------------------
00209 cmt_node& cmt_node::null ()
00210 {
00211   static cmt_node null_instance;
00212   
00213   return (null_instance);
00214 }
00215 //----------------------------------------------------------
00216 
00217 
00218 
00219 //----------------------------------------------------------
00220 //
00221 //  Implementation
00222 //
00223 //----------------------------------------------------------
00224 
00225 //----------------------------------------------------------
00226 cmt_node::cmt_node ()
00227 {
00228   _node_count++;
00229 }
00230 
00231 cmt_node::~cmt_node ()
00232 {
00233   _node_count--;
00234 }
00235 
00236 int cmt_node::node_count ()
00237 {
00238   return (_node_count);
00239 }
00240 
00241 const cmt_regexp::iterator cmt_node::match (const cmt_string& /*text*/, 
00242                                             int /*pos*/) const
00243 {
00244   return (cmt_regexp::iterator::null());
00245 }
00246 
00247 bool cmt_node::is_char () const
00248 {
00249   return (false);
00250 }
00251 
00252 bool cmt_node::is_many_node () const
00253 {
00254   return (false);
00255 }
00256 
00257 int cmt_node::_node_count = 0;
00258 //----------------------------------------------------------
00259 
00260 //----------------------------------------------------------
00261 cmt_char_node::cmt_char_node (char c)
00262 {
00263   _c = c;
00264 }
00265 
00266 const cmt_regexp::iterator cmt_char_node::match (const cmt_string& text, 
00267                                                  int pos) const
00268 {
00269   if ((pos < 0) || (pos > text.size ())) 
00270     {
00271       return (cmt_regexp::iterator::null ());
00272     }
00273 
00274   char c = text[pos];
00275 
00276   if (c == _c)
00277     {
00278       return (cmt_regexp::iterator (pos, 1));
00279     }
00280   
00281   return (cmt_regexp::iterator::null ());
00282 }
00283 
00284 bool cmt_char_node::is_char () const
00285 {
00286   return (true);
00287 }
00288 
00289 cmt_char_node::operator char ()
00290 {
00291   return (_c);
00292 }
00293 //----------------------------------------------------------
00294 
00295 //----------------------------------------------------------
00296 cmt_string_node::cmt_string_node (const cmt_string& s)
00297 {
00298   _s = s;
00299 }
00300 
00301 const cmt_regexp::iterator cmt_string_node::match (const cmt_string& text, 
00302                                                   int pos) const
00303 {
00304   if ((pos < 0) || (pos > text.size ())) 
00305     {
00306       return (cmt_regexp::iterator::null ());
00307     }
00308 
00309   int length = _s.size ();
00310   
00311   cmt_string s = text.substr (pos, length);
00312   
00313   if ((length == 0) || (s == _s))
00314     {
00315       return (cmt_regexp::iterator (pos, length));
00316     }
00317   
00318   return (cmt_regexp::iterator::null ());
00319 }
00320 //----------------------------------------------------------
00321 
00322 //----------------------------------------------------------
00323 cmt_char_list_node::cmt_char_list_node (cmt_string list)
00324 {
00325   _list = list;
00326   
00327   _choices = "";
00328   
00329   char c;
00330   int i;
00331   
00332   for (i = 0; i < list.size (); i++)
00333     {
00334       c = list[i];
00335       
00336       switch (c)
00337         {
00338           case '-':
00339             i++;
00340             {
00341               char c1 = _choices[_choices.size () - 1];
00342               char c2 = list[i];
00343               int j;
00344               int j0 = (c1 < c2) ? c1 : c2;
00345               int j1 = (c1 > c2) ? c1 : c2;
00346               for (j = j0; j <= j1; j++)
00347                 {
00348                   _choices += j;
00349                 }
00350             }
00351             break;
00352           case '\\':
00353             i++;
00354             c = list[i];
00355             switch (c)
00356               {
00357                 case '[':
00358                 case ']':
00359                 case '(':
00360                 case ')':
00361                 case '.':
00362                 case '*':
00363                 case '?':
00364                 case '^':
00365                 case '$':
00366                 case '\\':
00367                   break;
00368                 case 'r':
00369                   c = '\r';
00370                   break;
00371                 case 't':
00372                   c = '\t';
00373                   break;
00374                 case 'n':
00375                   c = '\n';
00376                   break;
00377                 default:
00378                   break;
00379               }
00380           default:
00381             _choices += c;
00382             break;
00383         }
00384     }
00385 }
00386 
00387 const cmt_regexp::iterator cmt_char_list_node::match (const cmt_string& text, 
00388                                                      int pos) const
00389 {
00390   if ((pos < 0) || (pos > text.size ())) 
00391     {
00392       return (cmt_regexp::iterator::null ());
00393     }
00394 
00395   char c = text[pos];
00396 
00397   int i;
00398   
00399   for (i = 0; i < _choices.size (); i++)
00400     {
00401       if (c == _choices[i]) return (cmt_regexp::iterator (pos, 1));
00402     }
00403   
00404   return (cmt_regexp::iterator::null ());
00405 }
00406 //----------------------------------------------------------
00407 
00408 //----------------------------------------------------------
00409 cmt_not_char_list_node::cmt_not_char_list_node (cmt_string list) : 
00410         cmt_char_list_node (list)
00411 {
00412 }
00413 
00414 const cmt_regexp::iterator cmt_not_char_list_node::match (const cmt_string& text, 
00415                                                       int pos) const
00416 {
00417   if ((pos < 0) || (pos > text.size ())) 
00418     {
00419       return (cmt_regexp::iterator::null ());
00420     }
00421 
00422   char c = text[pos];
00423 
00424   int i;
00425 
00426   for (i = 0; i < _choices.size (); i++)
00427     {
00428       if (c == _choices[i]) return (cmt_regexp::iterator::null ());
00429     }
00430   
00431   return (cmt_regexp::iterator (pos, 1));
00432 }
00433 //----------------------------------------------------------
00434 
00435 //----------------------------------------------------------
00436 const cmt_regexp::iterator cmt_any_node::match (const cmt_string& text, 
00437                                             int pos) const
00438 {
00439   if ((pos < 0) | (pos >= text.size ())) 
00440     {
00441       return (cmt_regexp::iterator::null ());
00442     }
00443   
00444   return (cmt_regexp::iterator (pos, 1));
00445 }
00446 //----------------------------------------------------------
00447 
00448 //----------------------------------------------------------
00449 cmt_zero_one::cmt_zero_one (cmt_node* n) : _node (n)
00450 {
00451 }
00452 
00453 cmt_zero_one::~cmt_zero_one ()
00454 {
00455   delete _node;
00456 }
00457 
00458 const cmt_regexp::iterator cmt_zero_one::match (const cmt_string& text, 
00459                                                int pos) const
00460 {
00461   if ((pos < 0) || (pos > text.size ())) 
00462     {
00463       return (cmt_regexp::iterator::null ());
00464     }
00465 
00466   int total = 0;
00467 
00468   if (pos < text.size ())
00469     {
00470       const cmt_regexp::iterator it = _node->match (text, pos);
00471       if (it != cmt_regexp::iterator::null ())
00472         {
00473           total += it._length;
00474           pos += it._length;
00475         }
00476     }
00477   
00478   return (cmt_regexp::iterator (pos, total));
00479 }
00480 //----------------------------------------------------------
00481 
00482 //----------------------------------------------------------
00483 cmt_many_node::cmt_many_node (cmt_node* n) : _node (n)
00484 {
00485 }
00486 
00487 bool cmt_many_node::is_many_node () const
00488 {
00489   return (true);
00490 }
00491 
00492 cmt_many_node::~cmt_many_node ()
00493 {
00494   delete _node;
00495 }
00496 
00497 void cmt_many_node::install (cmt_and_node& other, int start_index)
00498 {
00499   _follower.fill (other, start_index);
00500 }
00501 
00502 void cmt_many_node::reduce ()
00503 {
00504   _follower.reduce ();
00505 }
00506 //----------------------------------------------------------
00507 
00508 
00509 
00510 //----------------------------------------------------------
00511 cmt_zero_more::cmt_zero_more (cmt_node* n) : cmt_many_node (n)
00512 {
00513 }
00514 
00515 const cmt_regexp::iterator cmt_zero_more::match (const cmt_string& text, 
00516                                              int pos) const
00517 {
00518   if ((pos < 0) || (pos > text.size ())) 
00519     {
00520       return (cmt_regexp::iterator::null ());
00521     }
00522 
00523   int total = 0;
00524 
00525   //
00526   // we are at : x*y
00527   //
00528 
00529   int saved_pos = -1;
00530   int saved_total = -1;
00531 
00532   do
00533     {
00534       const cmt_regexp::iterator itx = _node->match (text, pos);
00535       const cmt_regexp::iterator ity = _follower.match (text, pos);
00536 
00537       if ((itx == cmt_regexp::iterator::null ()) &&
00538           (ity == cmt_regexp::iterator::null ())) 
00539         {
00540           //
00541           // There is neither x nor y. We move back to the last 
00542           // succesful match for y.
00543           //
00544           if (saved_pos >= 0)
00545             {
00546               //
00547               // We had once a y.
00548               //
00549               pos = saved_pos;
00550               total = saved_total;
00551             }
00552           else
00553             {
00554               //
00555               // We never had any y !
00556               //
00557               return (cmt_regexp::iterator::null ());
00558             }
00559 
00560           break;
00561         }
00562 
00563       if (itx == cmt_regexp::iterator::null ())
00564         {
00565           //
00566           // There is a y but no x anymore, fine, we can quit.
00567           //
00568           total += ity._length;
00569           pos += ity._length;
00570           break;
00571         }
00572 
00573       if (ity != cmt_regexp::iterator::null ())
00574         {
00575           //
00576           //  We have both x and y. We save the current pos and total,
00577           // and then skip this x.
00578           //
00579           saved_total = total + ity._length;
00580           saved_pos = pos + ity._length;
00581         }
00582       total += itx._length;
00583       pos += itx._length;
00584     } while (true);
00585   
00586   return (cmt_regexp::iterator (pos, total));
00587 }
00588 //----------------------------------------------------------
00589 
00590 //----------------------------------------------------------
00591 cmt_one_more::cmt_one_more (cmt_node* n) : cmt_many_node (n)
00592 {
00593 }
00594 
00595 const cmt_regexp::iterator cmt_one_more::match (const cmt_string& text, 
00596                                             int pos) const
00597 {
00598   if ((pos < 0) || (pos > text.size ())) 
00599     {
00600       return (cmt_regexp::iterator::null ());
00601     }
00602 
00603   int total = 0;
00604 
00605   //
00606   // we are at : x+y
00607   //
00608 
00609   int saved_pos = -1;
00610   int saved_total = -1;
00611   bool at_least_one = false;
00612 
00613   do
00614     {
00615       const cmt_regexp::iterator itx = _node->match (text, pos);
00616       const cmt_regexp::iterator ity = _follower.match (text, pos);
00617 
00618       if ((itx == cmt_regexp::iterator::null ()) &&
00619           (ity == cmt_regexp::iterator::null ())) 
00620         {
00621           //
00622           // There is neither x nor y. We move back to the last 
00623           // succesful match for y.
00624           //
00625           if (saved_pos >= 0)
00626             {
00627               //
00628               // We had once a y.
00629               //
00630               pos = saved_pos;
00631               total = saved_total;
00632             }
00633           else
00634             {
00635               //
00636               // We never had any y !
00637               //
00638               return (cmt_regexp::iterator::null ());
00639             }
00640 
00641           break;
00642         }
00643 
00644       if (itx == cmt_regexp::iterator::null ())
00645         {
00646           //
00647           // There is a y but no x anymore, fine, we can quit.
00648           //
00649           total += ity._length;
00650           pos += ity._length;
00651           break;
00652         }
00653 
00654       if (ity != cmt_regexp::iterator::null ())
00655         {
00656           //
00657           //  We have both x and y. We save the current pos and total,
00658           // and then skip this x.
00659           //
00660           saved_total = total + ity._length;
00661           saved_pos = pos + ity._length;
00662         }
00663 
00664       total += itx._length;
00665       pos += itx._length;
00666 
00667       at_least_one = true;
00668     } while (true);
00669   
00670   if (!at_least_one) return (cmt_regexp::iterator::null ());
00671   
00672   return (cmt_regexp::iterator (pos, total));
00673 }
00674 //----------------------------------------------------------
00675 
00676 
00677 
00678 //----------------------------------------------------------
00679 const cmt_regexp::iterator cmt_begin_node::match (const cmt_string& /*text*/, 
00680                                                  int pos) const
00681 {
00682   if (pos == 0) return (cmt_regexp::iterator (pos, 0));
00683   return (cmt_regexp::iterator::null ());
00684 }
00685 //----------------------------------------------------------
00686 
00687 //----------------------------------------------------------
00688 const cmt_regexp::iterator cmt_end_node::match (const cmt_string& text,
00689                                                 int pos) const
00690 {
00691   if (pos == text.size ()) return (cmt_regexp::iterator (pos, 0));
00692   return (cmt_regexp::iterator::null ());
00693 }
00694 //----------------------------------------------------------
00695 
00696 //----------------------------------------------------------
00697 cmt_node_set::cmt_node_set () : _father (0)
00698 {
00699   _parentheses = false;
00700 }
00701 
00702 cmt_node_set::cmt_node_set (cmt_node_set* father) : _father (father)
00703 {
00704   if (father != 0) father->push (this);
00705   _parentheses = false;
00706 }
00707 
00708 cmt_node_set::~cmt_node_set ()
00709 {
00710   clear ();
00711 }
00712 
00713 cmt_node_set* cmt_node_set::father ()
00714 {
00715   return (_father);
00716 }
00717 
00718 void cmt_node_set::clear ()
00719 {
00720   int i;
00721   
00722   for (i = 0; i < _nodes.size (); i++)
00723     {
00724       cmt_node* n = _nodes[i];
00725       delete n;
00726     }
00727   _nodes.clear ();
00728 }
00729 
00730 void cmt_node_set::push (cmt_node* n)
00731 {
00732   _nodes.push_back (n);
00733 }
00734 
00735 cmt_node* cmt_node_set::pop ()
00736 {
00737   if (_nodes.size () == 0) return (&cmt_node::null ());
00738   
00739   int index = _nodes.size () - 1;
00740   
00741   cmt_node* n = _nodes[index];
00742   _nodes.erase (index);
00743   
00744   return (n);
00745 }
00746 
00747 cmt_node* cmt_node_set::top () const
00748 {
00749   if (_nodes.size () == 0) return (&cmt_node::null ());
00750   
00751   int index = _nodes.size () - 1;
00752   
00753   cmt_node* n = _nodes[index];
00754   
00755   return (n);
00756 }
00757 
00758 int cmt_node_set::nodes () const
00759 {
00760   return (_nodes.size ());
00761 }
00762 
00763 const cmt_node* cmt_node_set::nodeAt (int index) const
00764 {
00765   return (_nodes[index]);
00766 }
00767 
00768 bool cmt_node_set::parentheses () const
00769 {
00770   return (_parentheses);
00771 }
00772 
00773 void cmt_node_set::set_parentheses (bool value)
00774 {
00775   _parentheses = value;
00776 }
00777 
00778 void cmt_node_set::reduce ()
00779 {
00780 }
00781 //----------------------------------------------------------
00782 
00783 //----------------------------------------------------------
00784 cmt_and_node::cmt_and_node () : cmt_node_set ()
00785 {
00786 }
00787 
00788 cmt_and_node::cmt_and_node (cmt_node_set* father) : cmt_node_set (father)
00789 {
00790 }
00791 
00792 const cmt_regexp::iterator cmt_and_node::match (const cmt_string& text, 
00793                                                 int pos) const
00794 {
00795   if ((pos < 0) || (pos > text.size ())) 
00796     {
00797       return (cmt_regexp::iterator::null ());
00798     }
00799 
00800   if (_nodes.size () == 0) return (cmt_regexp::iterator (pos, 0));
00801 
00802   int i;
00803   int total = 0;
00804   int p = pos;
00805   
00806   for (i = 0; i < _nodes.size (); i++)
00807     {
00808       cmt_node* n = _nodes[i];
00809       
00810       const cmt_regexp::iterator it = n->match (text, p);
00811       
00812       if (it == cmt_regexp::iterator::null ()) return (it);
00813       
00814       total += it._length;
00815       p += it._length;
00816     }
00817 
00818     // All nodes match
00819   
00820   return (cmt_regexp::iterator (pos, total));
00821 }
00822 
00823 void cmt_and_node::reduce ()
00824 {
00825   if (_nodes.size () < 2) return;
00826   
00827   char c = ' ';
00828   cmt_string s = "";
00829   cmt_vector<cmt_node*> new_nodes;
00830 
00831   //
00832   // We loop once too much in order to finish the possibly accumulated
00833   // string at the end.
00834   //
00835   for (int i = 0; i <= _nodes.size (); i++)
00836     {
00837       cmt_node* n = 0;
00838 
00839       if (i < _nodes.size ()) n = _nodes[i];
00840 
00841       if ((i >= _nodes.size ()) || (!n->is_char ()))
00842         {
00843           if (s.size () == 1)
00844             {
00845               //
00846               // Too bad there was only one char node to consider
00847               // let's put it back as a char node !
00848               //
00849               new_nodes.push_back (new cmt_char_node (c));
00850               s = "";
00851             }
00852           else if (s.size () > 1)
00853             {
00854               //
00855               // We have real reduction here sonce there was several
00856               // consecutive char nodes.
00857               //
00858               new_nodes.push_back (new cmt_string_node (s));
00859               s = "";
00860             }
00861 
00862           if (i >= _nodes.size ()) break;
00863         }
00864 
00865       if (n->is_char ())
00866         {
00867           //
00868           // We are now trying to compact those char nodes.
00869           //
00870           cmt_char_node& cn = *((cmt_char_node*) n);
00871           c = (char) cn;
00872           s += c;
00873           delete n;
00874           _nodes[i] = 0;
00875         }
00876       else if (n->is_many_node ())
00877         {
00878           cmt_many_node& mn = *((cmt_many_node*) n);
00879           mn.install (*this, i + 1);
00880           mn.reduce ();
00881           new_nodes.push_back (n);
00882           break;
00883         }
00884       else
00885         {
00886           new_nodes.push_back (n);
00887         }
00888     }
00889   
00890   _nodes = new_nodes;
00891 }
00892 
00893 void cmt_and_node::fill (cmt_and_node& other, int start_index)
00894 {
00895   if ((start_index < 0) || (start_index > other.nodes ())) return;
00896 
00897   for (int i = start_index; i < other.nodes (); i++)
00898     {
00899       cmt_node* n = other._nodes[i];
00900       push (n);
00901     }
00902 }
00903 //----------------------------------------------------------
00904 
00905 //----------------------------------------------------------
00906 cmt_or_node::cmt_or_node (cmt_node_set* father) : cmt_node_set (father)
00907 {
00908 }
00909 
00910 const cmt_regexp::iterator cmt_or_node::match (const cmt_string& text, 
00911                                                int pos) const
00912 {
00913   if ((pos < 0) || (pos >= text.size ())) 
00914     {
00915       return (cmt_regexp::iterator::null ());
00916     }
00917 
00918   if (_nodes.size () == 0) return (cmt_regexp::iterator (pos, 0));
00919   
00920   int i;
00921   
00922   for (i = 0; i < _nodes.size (); i++)
00923     {
00924       const cmt_node* n = _nodes[i];
00925       
00926       const cmt_regexp::iterator it = n->match (text, pos);
00927       
00928         //        at least one or-ed expression matches
00929       if (it != cmt_regexp::iterator::null ()) return (it);
00930     }
00931   
00932   return (cmt_regexp::iterator::null ());
00933 }
00934 //----------------------------------------------------------
00935 
00936 
00937 
00938 
00939 
00940 
00941 
00942 
00943 
00944 
00945 
00946 
00947 
00948 //----------------------------------------------------------
00949 cmt_regexp::cmt_regexp (const cmt_string& expression)
00950 {
00951   _root = 0;
00952 
00953     //
00954     // The root is the cmt_or_node which will be returned. It is
00955     // the top of the hierarchy.
00956     //
00957     //  top is the running cmt_and_node.
00958     //
00959   cmt_node_set* or_root = 0;
00960   cmt_node_set* top_and = 0;
00961   
00962     // abcd
00963     // ab|cd
00964     // a|b|cd
00965     // a|b*|cd
00966     // a|b*|cd?e
00967     //
00968     // exp     : and
00969     //         | exp '|' and
00970     //
00971     // and     : unary 
00972     //         | unary and
00973     //
00974     // unary   : primary '*'
00975     //         | primary '?'
00976     //
00977     // primary : '[' opt_begin opt_chars opt_end ']'
00978     //         | '^'
00979     //         | '$'
00980     //         | char
00981     //         | '(' exp ')'
00982     //
00983   
00984   {
00985       //
00986       // First we build an cmt_or_node (corresponding to the
00987       // first grammatical rule)
00988       //
00989       //  Then cmt_and_nodes are pushed into it.
00990       //  and standard nodes are pushed into the running (top_and) cmt_and_node
00991       //
00992     or_root = new cmt_or_node (0);
00993     top_and = new cmt_and_node (or_root);
00994   }
00995   
00996   int i;
00997   
00998   for (i = 0; i < expression.size (); i++)
00999     {
01000       char c = expression[i];
01001       switch (c)
01002         {
01003           case '[':
01004           {
01005               //
01006               // The case is 
01007               //
01008               //  exp   : '['     char ... ']'
01009               //  exp   : '[' '^' char ... ']'
01010               //
01011 
01012             if (i >= expression.size ()) 
01013               {
01014                   // syntax error : unbalanced '['
01015                 delete or_root;
01016                 return;
01017               }
01018             i++;
01019             
01020             int i0 = i;
01021             
01022             bool done = false;
01023             bool has_not = false;
01024             
01025             cmt_string choices = "";
01026             
01027             for (; i < expression.size (); i++)
01028               {
01029                 c = expression[i];
01030                 switch (c)
01031                   {
01032                     case ']':
01033                       done = true;
01034                       break;
01035                     case '^':
01036                       if (i == i0) has_not = true;
01037                       else choices += c;
01038                       break;
01039                     case '\\':
01040                       choices += c;
01041                       if (i >= expression.size ())
01042                         {
01043                             // syntax error : unbalanced '[' and unfinished
01044                             // escape sequence
01045                           delete or_root;
01046                           return;
01047                         }
01048                       i++;
01049                       c = expression[i];
01050                       choices += c;
01051                       break;
01052                     default:
01053                       choices += c;
01054                       break;
01055                   }
01056                 if (done) break;
01057               }
01058             
01059             if (!done)
01060               {
01061                   // syntax error : unbalanced '['
01062                 delete or_root;
01063                 return;
01064               }
01065             if (has_not)
01066               top_and->push (new cmt_not_char_list_node (choices));
01067             else        
01068               top_and->push (new cmt_char_list_node (choices));
01069           }
01070           break;
01071           case '*':
01072           {
01073               //
01074               //  exp : exp '*'
01075               //
01076             if (top_and->nodes () == 0)
01077               {
01078                   // Syntax error : '*' is not preceded by an expression
01079                 delete or_root;
01080                 return;
01081               }
01082             
01083             cmt_node* n = top_and->pop ();
01084             top_and->push (new cmt_zero_more (n));
01085           }
01086           break;
01087           case '+':
01088           {
01089               //
01090               //  exp : exp '+'
01091               //
01092             if (top_and->nodes () == 0)
01093               {
01094                   // Syntax error : '+' is not preceded by an expression
01095                 delete or_root;
01096                 return;
01097               }
01098             
01099             cmt_node* n = top_and->pop ();
01100             top_and->push (new cmt_one_more (n));
01101           }
01102           break;
01103           case '?':
01104           {
01105               //
01106               //  exp : exp '?'
01107               //
01108             if (top_and->nodes () == 0)
01109               {
01110                   // Syntax error : '?' is not preceded by an expression
01111                 delete or_root;
01112                 return;
01113               }
01114             
01115             cmt_node* n = top_and->pop ();
01116             top_and->push (new cmt_zero_one (n));
01117           }
01118           break;
01119           case '.':
01120               //
01121               //  exp : '.'
01122               //
01123             top_and->push (new cmt_any_node ());
01124             break;
01125           case '(':
01126           {
01127               //
01128               //  exp : '(' exp ')'
01129               //
01130             if (top_and->parentheses ())
01131               {
01132                   // This should never happen.
01133                 delete or_root;
01134                 return;
01135               }
01136             
01137             top_and->set_parentheses (true);
01138             
01139               //
01140               // A new complete expression is started.
01141               //  -> do as for top_and parsing.
01142               //
01143             
01144             top_and = new cmt_and_node (new cmt_or_node (top_and));
01145           }
01146           break;
01147           case ')':
01148           {
01149               //
01150               //  exp : '(' exp ')'
01151               //
01152             
01153               // top_and is the cmt_and_node into which new nodes are pushed.
01154             cmt_node_set* or_node = top_and->father ();
01155             if (or_node == 0) 
01156               {
01157                   // This should never happen : top_and should always be
01158                   // at least an cmt_and_node hanging at an cmt_or_node
01159                 delete or_root;
01160                 return;
01161               }
01162             
01163               //
01164               // The last cmt_and_node was empty, thus we had either '()' or '(...|)'
01165               //
01166             
01167             if (top_and->nodes () == 0) 
01168               {
01169                 delete (or_node->pop ());
01170               }
01171             else
01172               {
01173                 top_and->reduce ();
01174               }
01175             
01176             top_and = or_node->father ();
01177             
01178             if (top_and == 0)
01179               {
01180                   // Syntax error : too many ')'
01181                 delete or_root;
01182                 return;
01183               }
01184             
01185               //
01186               // top_and is now the previous running cmt_and_node where the '(' 
01187               // was originally met its top_and node contains the parenthesized 
01188               // sub expression  If this one is empty, (due to an empty '()' 
01189               // expression) then it may simply be discarded.
01190               //
01191             
01192             if (!top_and->parentheses ())
01193               {
01194                   // Syntax error : too many ')'
01195                 delete or_root;
01196                 return;
01197               }
01198             
01199             top_and->set_parentheses (false);
01200             
01201             cmt_node* unique = 0;
01202             if (or_node->nodes () == 1)
01203               {
01204                 cmt_node_set* and_node = (cmt_node_set*) or_node->top ();
01205                 if (and_node->nodes () == 1)
01206                   {
01207                     unique = and_node->pop ();
01208                     delete (or_node->pop ());
01209                   }
01210                 else if (and_node->nodes () == 0)
01211                   {
01212                     delete (or_node->pop ());
01213                   }
01214               }
01215             
01216             if (or_node->nodes () == 0) delete (top_and->pop ());
01217             if (unique != 0) top_and->push (unique);
01218           }
01219           
01220           break;
01221           case '|':
01222           {
01223               //
01224               //  exp : exp '|' exp
01225               //
01226 
01227             cmt_node_set* or_node = top_and->father ();
01228             
01229             top_and->reduce ();
01230             
01231               //
01232               // or is the father cmt_or_node, which only contains cmt_and_nodes
01233               //
01234             
01235             const cmt_node_set* and_node = (cmt_node_set*) or_node->top ();
01236             if (and_node->nodes () == 0)
01237               {
01238                   // the previous node was empty.
01239                   // we may discard it
01240                 or_node->pop ();
01241               }
01242             
01243             top_and = new cmt_and_node (or_node);
01244           }
01245           break;
01246           case '^':
01247               //
01248               //  exp : '^'
01249               //
01250             top_and->push (new cmt_begin_node ());
01251             break;
01252           case '$':
01253               //
01254               //  exp : '$'
01255               //
01256             top_and->push (new cmt_end_node ());
01257             break;
01258           case '\\':
01259             if (i >= expression.size ())
01260               {
01261                 delete or_root;
01262                 return;
01263               }
01264             i++;
01265             c = expression[i];
01266             switch (c)
01267               {
01268                 case '[':
01269                 case ']':
01270                 case '(':
01271                 case ')':
01272                 case '.':
01273                 case '*':
01274                 case '?':
01275                 case '^':
01276                 case '$':
01277                 case '\\':
01278                   break;
01279                 case 'r':
01280                   c = '\r';
01281                   break;
01282                 case 't':
01283                   c = '\t';
01284                   break;
01285                 case 'n':
01286                   c = '\n';
01287                   break;
01288                 default:
01289                   break;
01290               }
01291           default:
01292             top_and->push (new cmt_char_node (c));
01293             break;
01294         }
01295     }
01296   
01297   if (or_root != 0)
01298     {
01299       cmt_node_set* and_node = (cmt_node_set*) or_root->top ();
01300       
01301       if (or_root->nodes () == 1)
01302         {
01303             //
01304             // Check whether there is at least one non-empty
01305             // cmt_and_node
01306             //
01307           if (and_node->nodes () == 0)
01308             {
01309               delete or_root;
01310               return;
01311             }
01312         }
01313       
01314       if (and_node != 0)
01315         {
01316           and_node->reduce ();
01317           
01318           if (and_node->parentheses ())
01319             {
01320               delete or_root;
01321               return;
01322             }
01323         }
01324     }
01325   
01326   _root = or_root;
01327 }
01328 
01329 cmt_regexp::~cmt_regexp ()
01330 {
01331   if (_root != 0)
01332     {
01333       delete _root;
01334     }
01335 }
01336 
01337 bool cmt_regexp::is_valid () const
01338 {
01339   if (_root != 0) return (true);
01340   else return (false);
01341 }
01342 
01343 cmt_regexp::iterator cmt_regexp::begin (const cmt_string& text, int pos)
01344 {
01345   if (_root != 0)
01346     {
01347       int i;
01348       
01349       for (i = pos; i < text.size (); i++)
01350         {
01351           cmt_regexp::iterator it = _root->match (text, i);
01352           if (it != end ()) return (it);
01353         }
01354     }
01355   
01356   return (end ());
01357 }
01358 
01359 cmt_regexp::iterator cmt_regexp::end ()
01360 {
01361   return (cmt_regexp::iterator::null ());
01362 }
01363 
01364 cmt_regexp::iterator cmt_regexp::begin (const cmt_string& text, int pos) const
01365 {
01366   if (_root != 0)
01367     {
01368       int i;
01369       
01370       for (i = pos; i < text.size (); i++)
01371         {
01372           cmt_regexp::iterator it = _root->match (text, i);
01373           if (it != end ()) return (it);
01374         }
01375     }
01376   
01377   return (end ());
01378 }
01379 
01380 cmt_regexp::iterator cmt_regexp::end () const
01381 {
01382   return (cmt_regexp::iterator::null ());
01383 }
01384 
01385 bool cmt_regexp::match (const cmt_string& text) const
01386 {
01387   iterator it = begin (text);
01388   if (it == end ()) return (false);
01389   else return (true);
01390 }
01391 //----------------------------------------------------------
01392 
01393 //----------------------------------------------------------
01394 const cmt_regexp::iterator cmt_regexp::iterator::null ()
01395 {
01396   static const iterator null_instance (-1, -1);
01397   
01398   return (null_instance);
01399 }
01400 
01401 cmt_regexp::iterator::iterator ()
01402 {
01403   _pos = 0;
01404   _length = 0;
01405 }
01406 
01407 cmt_regexp::iterator::iterator (int pos, int length)
01408 {
01409   _pos = pos;
01410   _length = length;
01411 }
01412 
01413 cmt_regexp::iterator::iterator (const iterator& other)
01414 {
01415   _pos = other._pos;
01416   _length = other._length;
01417 }
01418 
01419 int cmt_regexp::iterator::operator != (const iterator& other) const
01420 {
01421   return ((this->_pos != other._pos) ||
01422           (this->_length != other._length));
01423 }
01424 
01425 int cmt_regexp::iterator::operator == (const iterator& other) const
01426 {
01427   return ((this->_pos == other._pos) &&
01428           (this->_length == other._length));
01429 }
01430 //----------------------------------------------------------
01431 

Generated at Thu Apr 11 16:49:42 2002 for CMT by doxygen1.2.3 written by Dimitri van Heesch, © 1997-2000