Main?Page | Class?Hierarchy | Class?List | File?List | Class?Members | File?Members

cmt_regexp.cxx

Go to the documentation of this file.
00001 //-----------------------------------------------------------
00002 // Copyright Christian Arnault LAL-Orsay CNRS
00003 // 
00004 // See the complete license in cmt_license.txt "http://www.cecill.info". 
00005 //-----------------------------------------------------------
00006 
00007 #include "cmt_std.h"
00008 #include "cmt_regexp.h"
00009 #include "cmt_vector.h"
00010 #include "cmt_system.h"
00011 
00012 //----------------------------------------------------------
00013 //
00014 //  Declarations
00015 //
00016 //----------------------------------------------------------
00017 
00018 static int tab_level = 0;
00019 static void tab ()
00020 {
00021   for (int i = 0; i < tab_level; i++)
00022     {
00023       cout << "  ";
00024     }
00025 }
00026 
00027 //----------------------------------------------------------
00028 class cmt_regexp_node
00029 {
00030 public:
00031   static cmt_regexp_node& null ();
00032   static int node_count ();
00033   
00034   cmt_regexp_node ();
00035   virtual ~cmt_regexp_node ();
00036   
00037   virtual const cmt_regexp::iterator match (const cmt_string& text, 
00038                                             int pos) const;
00039   virtual bool is_char () const;
00040   virtual bool is_many_node () const;
00041 
00042   virtual void dump () const;
00043   
00044 private:
00045   static int _node_count;
00046 };
00047 //----------------------------------------------------------
00048 
00049 //----------------------------------------------------------
00050 class cmt_char_node : public cmt_regexp_node
00051 {
00052 public:
00053   cmt_char_node (char c);
00054   
00055   const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
00056   
00057   bool is_char () const;
00058   operator char ();
00059 
00060   void dump () const;
00061   
00062 private:
00063   char _c;
00064 };
00065 //----------------------------------------------------------
00066 
00067 //----------------------------------------------------------
00068 class cmt_string_node : public cmt_regexp_node
00069 {
00070 public:
00071   cmt_string_node (const cmt_string& s);
00072   
00073   const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
00074 
00075   void dump () const;
00076     
00077 private:
00078   cmt_string _s;
00079 };
00080 //----------------------------------------------------------
00081 
00082 //----------------------------------------------------------
00083 class cmt_char_list_node : public cmt_regexp_node
00084 {
00085 public:
00086   cmt_char_list_node (cmt_string list);
00087   
00088   const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
00089 
00090   void dump () const;
00091   
00092 protected:
00093   cmt_string _list;
00094   cmt_string _choices;
00095 };
00096 //----------------------------------------------------------
00097 
00098 //----------------------------------------------------------
00099 class cmt_not_char_list_node : public cmt_char_list_node
00100 {
00101 public:
00102   cmt_not_char_list_node (cmt_string list);
00103   
00104   const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
00105 
00106   void dump () const;
00107 };
00108 //----------------------------------------------------------
00109 
00110 //----------------------------------------------------------
00111 class cmt_any_node : public cmt_regexp_node
00112 {
00113 public:
00114   const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
00115 
00116   void dump () const;
00117 };
00118 //----------------------------------------------------------
00119 
00120 //----------------------------------------------------------
00121 class cmt_zero_one : public cmt_regexp_node
00122 {
00123 public:
00124   cmt_zero_one (cmt_regexp_node* n);
00125   ~cmt_zero_one ();
00126   
00127   const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
00128 
00129   void dump () const;
00130   
00131 protected:
00132   cmt_regexp_node* _node;
00133 };
00134 //----------------------------------------------------------
00135 
00136 
00137 
00138 //----------------------------------------------------------
00139 class cmt_begin_node : public cmt_regexp_node
00140 {
00141 public:
00142   const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
00143 
00144   void dump () const;
00145 };
00146 //----------------------------------------------------------
00147 
00148 //----------------------------------------------------------
00149 class cmt_end_node : public cmt_regexp_node
00150 {
00151 public:
00152   const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
00153 
00154   void dump () const;
00155 };
00156 //----------------------------------------------------------
00157 
00158 //----------------------------------------------------------
00159 class cmt_regexp_node_set : public cmt_regexp_node
00160 {
00161 public:
00162   cmt_regexp_node_set ();
00163   cmt_regexp_node_set (cmt_regexp_node_set* father);
00164   ~cmt_regexp_node_set ();
00165   
00166   cmt_regexp_node_set* father ();
00167   void clear ();
00168   void push (cmt_regexp_node* n);
00169   cmt_regexp_node* pop ();
00170   cmt_regexp_node* top () const;
00171   int nodes () const;
00172   const cmt_regexp_node* nodeAt (int index) const;
00173   bool parentheses () const;
00174   void set_parentheses (bool value);
00175   virtual void reduce ();
00176 
00177   virtual void dump () const;
00178   void dump (const cmt_string& title) const;
00179   
00180 protected:
00181   cmt_regexp_node_set* _father;
00182   cmt_vector _nodes;
00183   bool _parentheses;
00184 };
00185 //----------------------------------------------------------
00186 
00187 //----------------------------------------------------------
00188 class cmt_and_node : public cmt_regexp_node_set
00189 {
00190 public:
00191   cmt_and_node ();
00192   cmt_and_node (cmt_regexp_node_set* father);
00193   
00194   const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
00195   void reduce ();
00196   void fill (cmt_and_node& other, int start_index);
00197 
00198   void dump () const;
00199 };
00200 //----------------------------------------------------------
00201 
00202 //----------------------------------------------------------
00203 class cmt_or_node : public cmt_regexp_node_set
00204 {
00205 public:
00206   cmt_or_node (cmt_regexp_node_set* father);
00207   
00208   const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
00209 
00210   void dump () const;
00211 };
00212 //----------------------------------------------------------
00213 
00214 //----------------------------------------------------------
00215 cmt_regexp_node& cmt_regexp_node::null ()
00216 {
00217   static cmt_regexp_node null_instance;
00218   
00219   return (null_instance);
00220 }
00221 //----------------------------------------------------------
00222 
00223 //----------------------------------------------------------
00224 class cmt_many_node : public cmt_regexp_node
00225 {
00226 public:
00227   bool is_many_node () const;
00228   void install (cmt_and_node& other, int index);
00229   void reduce ();
00230 
00231   void dump () const;
00232   
00233 protected:
00234   cmt_many_node (cmt_regexp_node* n);
00235   virtual ~cmt_many_node ();
00236   cmt_regexp_node* _node;
00237   cmt_and_node _follower;
00238 };
00239 //----------------------------------------------------------
00240 
00241 //----------------------------------------------------------
00242 class cmt_zero_more : public cmt_many_node
00243 {
00244 public:
00245   cmt_zero_more (cmt_regexp_node* n);
00246   
00247   const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
00248 
00249   void dump () const;
00250 };
00251 //----------------------------------------------------------
00252 
00253 //----------------------------------------------------------
00254 class cmt_one_more : public cmt_many_node
00255 {
00256 public:
00257   cmt_one_more (cmt_regexp_node* n);
00258 
00259   const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
00260 
00261   void dump () const;
00262 };
00263 //----------------------------------------------------------
00264 
00265 
00266 //----------------------------------------------------------
00267 //
00268 //  Implementation
00269 //
00270 //----------------------------------------------------------
00271 
00272 //----------------------------------------------------------
00273 cmt_regexp_node::cmt_regexp_node ()
00274 {
00275   _node_count++;
00276 }
00277 
00278 cmt_regexp_node::~cmt_regexp_node ()
00279 {
00280   _node_count--;
00281 }
00282 
00283 int cmt_regexp_node::node_count ()
00284 {
00285   return (_node_count);
00286 }
00287 
00288 const cmt_regexp::iterator cmt_regexp_node::match (const cmt_string& /*text*/, 
00289                                             int /*pos*/) const
00290 {
00291   return (cmt_regexp::iterator::null());
00292 }
00293 
00294 bool cmt_regexp_node::is_char () const
00295 {
00296   return (false);
00297 }
00298 
00299 bool cmt_regexp_node::is_many_node () const
00300 {
00301   return (false);
00302 }
00303 
00304 void cmt_regexp_node::dump () const
00305 {
00306 }
00307 
00308 int cmt_regexp_node::_node_count = 0;
00309 //----------------------------------------------------------
00310 
00311 //----------------------------------------------------------
00312 cmt_char_node::cmt_char_node (char c)
00313 {
00314   _c = c;
00315 }
00316 
00317 const cmt_regexp::iterator cmt_char_node::match (const cmt_string& text, 
00318                                                  int pos) const
00319 {
00320   if ((pos < 0) || (pos > text.size ())) 
00321     {
00322       return (cmt_regexp::iterator::null ());
00323     }
00324 
00325   char c = text[pos];
00326 
00327   if (c == _c)
00328     {
00329       return (cmt_regexp::iterator (pos, 1));
00330     }
00331   
00332   return (cmt_regexp::iterator::null ());
00333 }
00334 
00335 bool cmt_char_node::is_char () const
00336 {
00337   return (true);
00338 }
00339 
00340 cmt_char_node::operator char ()
00341 {
00342   return (_c);
00343 }
00344 
00345 void cmt_char_node::dump () const
00346 {
00347   tab (); cout << "char>(" << this << ") c=" << _c << endl;
00348 }
00349 
00350 //----------------------------------------------------------
00351 
00352 //----------------------------------------------------------
00353 cmt_string_node::cmt_string_node (const cmt_string& s)
00354 {
00355   _s = s;
00356 }
00357 
00358 const cmt_regexp::iterator cmt_string_node::match (const cmt_string& text, 
00359                                                   int pos) const
00360 {
00361   if ((pos < 0) || (pos > text.size ())) 
00362     {
00363       return (cmt_regexp::iterator::null ());
00364     }
00365 
00366   int length = _s.size ();
00367   
00368   cmt_string s = text.substr (pos, length);
00369   
00370   if ((length == 0) || (s == _s))
00371     {
00372       return (cmt_regexp::iterator (pos, length));
00373     }
00374   
00375   return (cmt_regexp::iterator::null ());
00376 }
00377 
00378 void cmt_string_node::dump () const
00379 {
00380   tab (); cout << "string (" << this << ") s=[" << _s << "]" << endl;
00381 }
00382 
00383 //----------------------------------------------------------
00384 
00385 //----------------------------------------------------------
00386 cmt_char_list_node::cmt_char_list_node (cmt_string list)
00387 {
00388   _list = list;
00389   
00390   _choices = "";
00391   
00392   char c;
00393   int i;
00394   
00395   for (i = 0; i < list.size (); i++)
00396     {
00397       c = list[i];
00398       
00399       switch (c)
00400         {
00401           case '-':
00402             i++;
00403             {
00404               char c1 = _choices[_choices.size () - 1];
00405               char c2 = list[i];
00406               int j;
00407               int j0 = (c1 < c2) ? c1 : c2;
00408               int j1 = (c1 > c2) ? c1 : c2;
00409               for (j = j0; j <= j1; j++)
00410                 {
00411                   _choices += j;
00412                 }
00413             }
00414             break;
00415           case '\\':
00416             i++;
00417             c = list[i];
00418             switch (c)
00419               {
00420                 case '[':
00421                 case ']':
00422                 case '(':
00423                 case ')':
00424                 case '.':
00425                 case '*':
00426                 case '?':
00427                 case '^':
00428                 case '$':
00429                 case '\\':
00430                   c = '\\';
00431                   break;
00432                 case 'r':
00433                   c = '\r';
00434                   break;
00435                 case 't':
00436                   c = '\t';
00437                   break;
00438                 case 'n':
00439                   c = '\n';
00440                   break;
00441                 default:
00442                   break;
00443               }
00444           default:
00445             _choices += c;
00446             break;
00447         }
00448     }
00449 }
00450 
00451 const cmt_regexp::iterator cmt_char_list_node::match (const cmt_string& text, 
00452                                                      int pos) const
00453 {
00454   if ((pos < 0) || (pos > text.size ())) 
00455     {
00456       return (cmt_regexp::iterator::null ());
00457     }
00458 
00459   char c = text[pos];
00460 
00461   int i;
00462   
00463   for (i = 0; i < _choices.size (); i++)
00464     {
00465       if (c == _choices[i]) return (cmt_regexp::iterator (pos, 1));
00466     }
00467   
00468   return (cmt_regexp::iterator::null ());
00469 }
00470 
00471 void cmt_char_list_node::dump () const
00472 {
00473   tab (); cout << "char_list (" << this << ") list=[" << _list << "] choices=[" << _choices << "]" << endl;
00474 }
00475 
00476 //----------------------------------------------------------
00477 
00478 //----------------------------------------------------------
00479 cmt_not_char_list_node::cmt_not_char_list_node (cmt_string list) : 
00480         cmt_char_list_node (list)
00481 {
00482 }
00483 
00484 const cmt_regexp::iterator cmt_not_char_list_node::match (const cmt_string& text, 
00485                                                       int pos) const
00486 {
00487   if ((pos < 0) || (pos > text.size ())) 
00488     {
00489       return (cmt_regexp::iterator::null ());
00490     }
00491 
00492   char c = text[pos];
00493 
00494   int i;
00495 
00496   for (i = 0; i < _choices.size (); i++)
00497     {
00498       if (c == _choices[i]) return (cmt_regexp::iterator::null ());
00499     }
00500   
00501   return (cmt_regexp::iterator (pos, 1));
00502 }
00503 
00504 void cmt_not_char_list_node::dump () const
00505 {
00506   tab (); cout << "not_char_list (" << this << ") list=[" << _list << "] choices=[" << _choices << "]" << endl;
00507 }
00508 
00509 
00510 //----------------------------------------------------------
00511 
00512 //----------------------------------------------------------
00513 const cmt_regexp::iterator cmt_any_node::match (const cmt_string& text, 
00514                                             int pos) const
00515 {
00516   if ((pos < 0) | (pos >= text.size ())) 
00517     {
00518       return (cmt_regexp::iterator::null ());
00519     }
00520   
00521   return (cmt_regexp::iterator (pos, 1));
00522 }
00523 
00524 void cmt_any_node::dump () const
00525 {
00526   tab (); cout << "any (" << this << ") " << endl;
00527 }
00528 
00529 //----------------------------------------------------------
00530 
00531 //----------------------------------------------------------
00532 cmt_zero_one::cmt_zero_one (cmt_regexp_node* n) : _node (n)
00533 {
00534 }
00535 
00536 cmt_zero_one::~cmt_zero_one ()
00537 {
00538   delete _node;
00539 }
00540 
00541 const cmt_regexp::iterator cmt_zero_one::match (const cmt_string& text, 
00542                                                int pos) const
00543 {
00544   if ((pos < 0) || (pos > text.size ())) 
00545     {
00546       return (cmt_regexp::iterator::null ());
00547     }
00548 
00549   int total = 0;
00550 
00551   if (pos < text.size ())
00552     {
00553       const cmt_regexp::iterator it = _node->match (text, pos);
00554       if (it != cmt_regexp::iterator::null ())
00555         {
00556           total += it._length;
00557           pos += it._length;
00558         }
00559     }
00560   
00561   return (cmt_regexp::iterator (pos, total));
00562 }
00563 
00564 void cmt_zero_one::dump () const
00565 {
00566   tab (); cout << "zero_one (" << this << ") " << endl;
00567   if (_node != 0)
00568     {
00569       tab_level++;
00570       _node->dump ();
00571       tab_level--;
00572     }
00573 }
00574 
00575 //----------------------------------------------------------
00576 
00577 //----------------------------------------------------------
00578 cmt_many_node::cmt_many_node (cmt_regexp_node* n) : _node (n)
00579 {
00580 }
00581 
00582 bool cmt_many_node::is_many_node () const
00583 {
00584   return (true);
00585 }
00586 
00587 cmt_many_node::~cmt_many_node ()
00588 {
00589   delete _node;
00590 }
00591 
00592 void cmt_many_node::install (cmt_and_node& other, int start_index)
00593 {
00594   _follower.fill (other, start_index);
00595 }
00596 
00597 void cmt_many_node::reduce ()
00598 {
00599   _follower.reduce ();
00600 }
00601 
00602 void cmt_many_node::dump () const
00603 {
00604   tab (); cout << "many (" << this << ") " << endl;
00605   if (_node != 0) 
00606     {
00607       tab_level++;
00608       _node->dump ();
00609       tab_level--;
00610     }
00611   tab_level++;
00612   _follower.dump ();
00613   tab_level--;
00614 }
00615 
00616 //----------------------------------------------------------
00617 
00618 
00619 
00620 //----------------------------------------------------------
00621 cmt_zero_more::cmt_zero_more (cmt_regexp_node* n) : cmt_many_node (n)
00622 {
00623 }
00624 
00625 const cmt_regexp::iterator cmt_zero_more::match (const cmt_string& text, 
00626                                              int pos) const
00627 {
00628   if ((pos < 0) || (pos > text.size ())) 
00629     {
00630       return (cmt_regexp::iterator::null ());
00631     }
00632 
00633   int total = 0;
00634 
00635   //
00636   // we are at : x*y
00637   //
00638 
00639   int saved_pos = -1;
00640   int saved_total = -1;
00641 
00642   do
00643     {
00644       const cmt_regexp::iterator itx = _node->match (text, pos);
00645       const cmt_regexp::iterator ity = _follower.match (text, pos);
00646 
00647       if ((itx == cmt_regexp::iterator::null ()) &&
00648           (ity == cmt_regexp::iterator::null ())) 
00649         {
00650           //
00651           // There is neither x nor y. We move back to the last 
00652           // succesful match for y.
00653           //
00654           if (saved_pos >= 0)
00655             {
00656               //
00657               // We had once a y.
00658               //
00659               pos = saved_pos;
00660               total = saved_total;
00661             }
00662           else
00663             {
00664               //
00665               // We never had any y !
00666               //
00667               return (cmt_regexp::iterator::null ());
00668             }
00669 
00670           break;
00671         }
00672 
00673       if (itx == cmt_regexp::iterator::null ())
00674         {
00675           //
00676           // There is a y but no x anymore, fine, we can quit.
00677           //
00678           total += ity._length;
00679           pos += ity._length;
00680           break;
00681         }
00682 
00683       if (ity != cmt_regexp::iterator::null ())
00684         {
00685           //
00686           //  We have both x and y. We save the current pos and total,
00687           // and then skip this x.
00688           //
00689           saved_total = total + ity._length;
00690           saved_pos = pos + ity._length;
00691         }
00692       total += itx._length;
00693       pos += itx._length;
00694     } while (true);
00695   
00696   return (cmt_regexp::iterator (pos, total));
00697 }
00698 
00699 void cmt_zero_more::dump () const
00700 {
00701   tab (); cout << "zero_more (" << this << ") " << endl;
00702   if (_node != 0)
00703     {
00704       tab_level++;
00705       _node->dump ();
00706       tab_level--;
00707     }
00708   _follower.dump ();
00709 }
00710 
00711 //----------------------------------------------------------
00712 
00713 //----------------------------------------------------------
00714 cmt_one_more::cmt_one_more (cmt_regexp_node* n) : cmt_many_node (n)
00715 {
00716 }
00717 
00718 const cmt_regexp::iterator cmt_one_more::match (const cmt_string& text, 
00719                                             int pos) const
00720 {
00721   if ((pos < 0) || (pos > text.size ())) 
00722     {
00723       return (cmt_regexp::iterator::null ());
00724     }
00725 
00726   int total = 0;
00727 
00728   //
00729   // we are at : x+y
00730   //
00731 
00732   int saved_pos = -1;
00733   int saved_total = -1;
00734   bool at_least_one = false;
00735 
00736   do
00737     {
00738       const cmt_regexp::iterator itx = _node->match (text, pos);
00739       const cmt_regexp::iterator ity = _follower.match (text, pos);
00740 
00741       if ((itx == cmt_regexp::iterator::null ()) &&
00742           (ity == cmt_regexp::iterator::null ())) 
00743         {
00744           //
00745           // There is neither x nor y. We move back to the last 
00746           // succesful match for y.
00747           //
00748           if (saved_pos >= 0)
00749             {
00750               //
00751               // We had once a y.
00752               //
00753               pos = saved_pos;
00754               total = saved_total;
00755             }
00756           else
00757             {
00758               //
00759               // We never had any y !
00760               //
00761               return (cmt_regexp::iterator::null ());
00762             }
00763 
00764           break;
00765         }
00766 
00767       if (itx == cmt_regexp::iterator::null ())
00768         {
00769           //
00770           // There is a y but no x anymore, fine, we can quit.
00771           //
00772           total += ity._length;
00773           pos += ity._length;
00774           break;
00775         }
00776 
00777       if (ity != cmt_regexp::iterator::null ())
00778         {
00779           //
00780           //  We have both x and y. We save the current pos and total,
00781           // and then skip this x.
00782           //
00783           saved_total = total + ity._length;
00784           saved_pos = pos + ity._length;
00785         }
00786 
00787       total += itx._length;
00788       pos += itx._length;
00789 
00790       at_least_one = true;
00791     } while (true);
00792   
00793   if (!at_least_one) return (cmt_regexp::iterator::null ());
00794   
00795   return (cmt_regexp::iterator (pos, total));
00796 }
00797 
00798 void cmt_one_more::dump () const
00799 {
00800   tab (); cout << "one_more (" << this << ") " << endl;
00801   if (_node != 0)
00802     {
00803       tab_level++;
00804       _node->dump ();
00805       tab_level--;
00806     }
00807   tab_level++;
00808   _follower.dump ();
00809   tab_level--;
00810 }
00811 
00812 //----------------------------------------------------------
00813 
00814 
00815 
00816 //----------------------------------------------------------
00817 const cmt_regexp::iterator cmt_begin_node::match (const cmt_string& /*text*/, 
00818                                                  int pos) const
00819 {
00820   if (pos == 0) return (cmt_regexp::iterator (pos, 0));
00821   return (cmt_regexp::iterator::null ());
00822 }
00823 
00824 void cmt_begin_node::dump () const
00825 {
00826   tab (); cout << "begin (" << this << ") " << endl;
00827 }
00828 //----------------------------------------------------------
00829 
00830 //----------------------------------------------------------
00831 const cmt_regexp::iterator cmt_end_node::match (const cmt_string& text,
00832                                                 int pos) const
00833 {
00834   if (pos == text.size ()) return (cmt_regexp::iterator (pos, 0));
00835   return (cmt_regexp::iterator::null ());
00836 }
00837 
00838 void cmt_end_node::dump () const
00839 {
00840   tab (); cout << "end (" << this << ") " << endl;
00841 }
00842 //----------------------------------------------------------
00843 
00844 //----------------------------------------------------------
00845 cmt_regexp_node_set::cmt_regexp_node_set () : _father (0)
00846 {
00847   _parentheses = false;
00848 }
00849 
00850 cmt_regexp_node_set::cmt_regexp_node_set (cmt_regexp_node_set* father) : _father (father)
00851 {
00852   if (father != 0) father->push (this);
00853   _parentheses = false;
00854 }
00855 
00856 cmt_regexp_node_set::~cmt_regexp_node_set ()
00857 {
00858   clear ();
00859 }
00860 
00861 cmt_regexp_node_set* cmt_regexp_node_set::father ()
00862 {
00863   return (_father);
00864 }
00865 
00866 void cmt_regexp_node_set::clear ()
00867 {
00868   int i;
00869   
00870   for (i = 0; i < _nodes.size (); i++)
00871     {
00872       cmt_regexp_node* n = _nodes[i];
00873       delete n;
00874     }
00875   _nodes.clear ();
00876 }
00877 
00878 void cmt_regexp_node_set::push (cmt_regexp_node* n)
00879 {
00880   _nodes.push_back (n);
00881 }
00882 
00883 cmt_regexp_node* cmt_regexp_node_set::pop ()
00884 {
00885   if (_nodes.size () == 0) return (&cmt_regexp_node::null ());
00886   
00887   int index = _nodes.size () - 1;
00888   
00889   cmt_regexp_node* n = _nodes[index];
00890   _nodes.erase (index);
00891   
00892   return (n);
00893 }
00894 
00895 cmt_regexp_node* cmt_regexp_node_set::top () const
00896 {
00897   if (_nodes.size () == 0) return (&cmt_regexp_node::null ());
00898   
00899   int index = _nodes.size () - 1;
00900   
00901   cmt_regexp_node* n = _nodes[index];
00902   
00903   return (n);
00904 }
00905 
00906 int cmt_regexp_node_set::nodes () const
00907 {
00908   return (_nodes.size ());
00909 }
00910 
00911 const cmt_regexp_node* cmt_regexp_node_set::nodeAt (int index) const
00912 {
00913   return (_nodes[index]);
00914 }
00915 
00916 bool cmt_regexp_node_set::parentheses () const
00917 {
00918   return (_parentheses);
00919 }
00920 
00921 void cmt_regexp_node_set::set_parentheses (bool value)
00922 {
00923   _parentheses = value;
00924 }
00925 
00926 void cmt_regexp_node_set::reduce ()
00927 {
00928 }
00929 
00930 void cmt_regexp_node_set::dump () const
00931 {
00932   tab (); cout << "regexp_node_set (" << this << ") " << endl;
00933 }
00934 
00935 void cmt_regexp_node_set::dump (const cmt_string& title) const
00936 {
00937   tab (); cout << "Set (" << this << ") father=" << _father << " pars=" << _parentheses << endl;
00938   for (int i = 0; i < _nodes.size (); i++)
00939     {
00940       cmt_regexp_node* n = _nodes[i];
00941       if (n != 0)
00942         {
00943           if (i > 0)
00944             {
00945               tab (); cout << title << endl;
00946             }
00947           tab_level++;
00948           n->dump ();
00949           tab_level--;
00950         }
00951     }
00952   tab (); cout << "EndSet (" << this << ")" << endl;
00953 }
00954 //----------------------------------------------------------
00955 
00956 //----------------------------------------------------------
00957 cmt_and_node::cmt_and_node () : cmt_regexp_node_set ()
00958 {
00959 }
00960 
00961 cmt_and_node::cmt_and_node (cmt_regexp_node_set* father) : cmt_regexp_node_set (father)
00962 {
00963 }
00964 
00965 const cmt_regexp::iterator cmt_and_node::match (const cmt_string& text, 
00966                                                 int pos) const
00967 {
00968   if ((pos < 0) || (pos > text.size ())) 
00969     {
00970       return (cmt_regexp::iterator::null ());
00971     }
00972 
00973   if (_nodes.size () == 0) return (cmt_regexp::iterator (pos, 0));
00974 
00975   int i;
00976   int total = 0;
00977   int p = pos;
00978   
00979   bool dbg = CmtSystem::testenv ("CMTTESTREGEXP");
00980   if (dbg) {tab (); cout << "match and (" << this << ") pos=" << pos << endl;}
00981 
00982   for (i = 0; i < _nodes.size (); i++)
00983     {
00984       cmt_regexp_node* n = _nodes[i];
00985 
00986       if (dbg) tab_level++;
00987       const cmt_regexp::iterator it = n->match (text, p);
00988       if (dbg) tab_level--;
00989 
00990       if (dbg) {tab (); cout << "  -> it(" << n << ") p=" << it._pos << " l=" << it._length << endl;}
00991       
00992       if (it == cmt_regexp::iterator::null ()) return (it);
00993       
00994       total += it._length;
00995       p += it._length;
00996     }
00997 
00998     // All nodes match
00999   
01000   return (cmt_regexp::iterator (pos, total));
01001 }
01002 
01003 void cmt_and_node::reduce ()
01004 {
01005   if (_nodes.size () < 2) return;
01006   
01007   char c = ' ';
01008   cmt_string s = "";
01009   cmt_vector new_nodes;
01010 
01011   //
01012   // We loop once too much in order to finish the possibly accumulated
01013   // string at the end.
01014   //
01015   for (int i = 0; i <= _nodes.size (); i++)
01016     {
01017       cmt_regexp_node* n = 0;
01018 
01019       if (i < _nodes.size ()) n = _nodes[i];
01020 
01021       if ((i >= _nodes.size ()) || (!n->is_char ()))
01022         {
01023           if (s.size () == 1)
01024             {
01025               //
01026               // Too bad there was only one char node to consider
01027               // let's put it back as a char node !
01028               //
01029               new_nodes.push_back (new cmt_char_node (c));
01030               s = "";
01031             }
01032           else if (s.size () > 1)
01033             {
01034               //
01035               // We have real reduction here sonce there was several
01036               // consecutive char nodes.
01037               //
01038               new_nodes.push_back (new cmt_string_node (s));
01039               s = "";
01040             }
01041 
01042           if (i >= _nodes.size ()) break;
01043         }
01044 
01045       if (n->is_char ())
01046         {
01047           //
01048           // We are now trying to compact those char nodes.
01049           //
01050           cmt_char_node& cn = *((cmt_char_node*) n);
01051           c = (char) cn;
01052           s += c;
01053           delete n;
01054           _nodes[i] = 0;
01055         }
01056       else if (n->is_many_node ())
01057         {
01058           cmt_many_node& mn = *((cmt_many_node*) n);
01059           mn.install (*this, i + 1);
01060           mn.reduce ();
01061           new_nodes.push_back (n);
01062           break;
01063         }
01064       else
01065         {
01066           new_nodes.push_back (n);
01067         }
01068     }
01069   
01070   _nodes = new_nodes;
01071 }
01072 
01073 void cmt_and_node::fill (cmt_and_node& other, int start_index)
01074 {
01075   if ((start_index < 0) || (start_index > other.nodes ())) return;
01076 
01077   for (int i = start_index; i < other.nodes (); i++)
01078     {
01079       cmt_regexp_node* n = other._nodes[i];
01080       push (n);
01081     }
01082 }
01083 
01084 void cmt_and_node::dump () const
01085 {
01086   cmt_regexp_node_set::dump ("and");
01087 }
01088 
01089 //----------------------------------------------------------
01090 
01091 //----------------------------------------------------------
01092 cmt_or_node::cmt_or_node (cmt_regexp_node_set* father) : cmt_regexp_node_set (father)
01093 {
01094 }
01095 
01096 const cmt_regexp::iterator cmt_or_node::match (const cmt_string& text, 
01097                                                int pos) const
01098 {
01099   if ((pos < 0) || (pos >= text.size ())) 
01100     {
01101       return (cmt_regexp::iterator::null ());
01102     }
01103 
01104   if (_nodes.size () == 0) return (cmt_regexp::iterator (pos, 0));
01105   
01106   bool dbg = CmtSystem::testenv ("CMTTESTREGEXP");
01107   if (dbg) {tab (); cout << "match or (" << this << ") pos=" << pos << endl;}
01108 
01109   int i;
01110 
01111   int longest = 0;
01112 
01113   cmt_regexp::iterator result = cmt_regexp::iterator::null ();
01114   
01115   for (i = 0; i < _nodes.size (); i++)
01116     {
01117       const cmt_regexp_node* n = _nodes[i];
01118       
01119       if (dbg) tab_level++;
01120       const cmt_regexp::iterator it = n->match (text, pos);
01121       if (dbg) tab_level--;
01122 
01123       if (it._length > longest)
01124         {
01125           longest = it._length;
01126           result = it;
01127         }
01128 
01129         //        at least one or-ed expression matches
01130       // if (it != cmt_regexp::iterator::null ()) return (it);
01131     }
01132   
01133   return (result);
01134 }
01135 
01136 void cmt_or_node::dump () const
01137 {
01138   cmt_regexp_node_set::dump ("or");
01139 }
01140 
01141 //----------------------------------------------------------
01142 
01143 
01144 
01145 
01146 
01147 
01148 
01149 
01150 
01151 
01152 
01153 
01154 
01155 //----------------------------------------------------------
01156 cmt_regexp::cmt_regexp ()
01157 {
01158   _root = 0;
01159 }
01160 
01161 //----------------------------------------------------------
01162 cmt_regexp::cmt_regexp (const cmt_string& expression)
01163 {
01164   _root = 0;
01165   set (expression);
01166 }
01167 
01168 //----------------------------------------------------------
01169 void cmt_regexp::set (const cmt_string& expression)
01170 {
01171   if (_root != 0)
01172     {
01173       delete _root;
01174       _root = 0;
01175     }
01176 
01177     //
01178     // The root is the cmt_or_node which will be returned. It is
01179     // the top of the hierarchy.
01180     //
01181     //  top is the running cmt_and_node.
01182     //
01183   cmt_regexp_node_set* or_root = 0;
01184   cmt_regexp_node_set* top_and = 0;
01185   
01186     // abcd
01187     // ab|cd
01188     // a|b|cd
01189     // a|b*|cd
01190     // a|b*|cd?e
01191     //
01192     // exp     : and
01193     //         | exp '|' and
01194     //
01195     // and     : unary 
01196     //         | unary and
01197     //
01198     // unary   : primary '*'
01199     //         | primary '?'
01200     //
01201     // primary : '[' opt_begin opt_chars opt_end ']'
01202     //         | '^'
01203     //         | '$'
01204     //         | char
01205     //         | '(' exp ')'
01206     //
01207   
01208   {
01209       //
01210       // First we build an cmt_or_node (corresponding to the
01211       // first grammatical rule)
01212       //
01213       //  Then cmt_and_nodes are pushed into it.
01214       //  and standard nodes are pushed into the running (top_and) cmt_and_node
01215       //
01216     or_root = new cmt_or_node (0);
01217     top_and = new cmt_and_node (or_root);
01218   }
01219   
01220   int i;
01221   
01222   for (i = 0; i < expression.size (); i++)
01223     {
01224       char c = expression[i];
01225       switch (c)
01226         {
01227           case '[':
01228           {
01229               //
01230               // The case is 
01231               //
01232               //  exp   : '['     char ... ']'
01233               //  exp   : '[' '^' char ... ']'
01234               //
01235 
01236             if (i >= expression.size ()) 
01237               {
01238                   // syntax error : unbalanced '['
01239                 delete or_root;
01240                 return;
01241               }
01242             i++;
01243             
01244             int i0 = i;
01245             
01246             bool done = false;
01247             bool has_not = false;
01248             
01249             cmt_string choices = "";
01250             
01251             for (; i < expression.size (); i++)
01252               {
01253                 c = expression[i];
01254                 switch (c)
01255                   {
01256                     case ']':
01257                       done = true;
01258                       break;
01259                     case '^':
01260                       if (i == i0) has_not = true;
01261                       else choices += c;
01262                       break;
01263                     case '\\':
01264                       choices += c;
01265                       if (i >= expression.size ())
01266                         {
01267                             // syntax error : unbalanced '[' and unfinished
01268                             // escape sequence
01269                           delete or_root;
01270                           return;
01271                         }
01272                       i++;
01273                       c = expression[i];
01274                       choices += c;
01275                       break;
01276                     default:
01277                       choices += c;
01278                       break;
01279                   }
01280                 if (done) break;
01281               }
01282             
01283             if (!done)
01284               {
01285                   // syntax error : unbalanced '['
01286                 delete or_root;
01287                 return;
01288               }
01289             if (has_not)
01290               top_and->push (new cmt_not_char_list_node (choices));
01291             else        
01292               top_and->push (new cmt_char_list_node (choices));
01293           }
01294           break;
01295           case '*':
01296           {
01297               //
01298               //  exp : exp '*'
01299               //
01300             if (top_and->nodes () == 0)
01301               {
01302                   // Syntax error : '*' is not preceded by an expression
01303                 delete or_root;
01304                 return;
01305               }
01306             
01307             cmt_regexp_node* n = top_and->pop ();
01308             top_and->push (new cmt_zero_more (n));
01309           }
01310           break;
01311           case '+':
01312           {
01313               //
01314               //  exp : exp '+'
01315               //
01316             if (top_and->nodes () == 0)
01317               {
01318                   // Syntax error : '+' is not preceded by an expression
01319                 delete or_root;
01320                 return;
01321               }
01322             
01323             cmt_regexp_node* n = top_and->pop ();
01324             top_and->push (new cmt_one_more (n));
01325           }
01326           break;
01327           case '?':
01328           {
01329               //
01330               //  exp : exp '?'
01331               //
01332             if (top_and->nodes () == 0)
01333               {
01334                   // Syntax error : '?' is not preceded by an expression
01335                 delete or_root;
01336                 return;
01337               }
01338             
01339             cmt_regexp_node* n = top_and->pop ();
01340             top_and->push (new cmt_zero_one (n));
01341           }
01342           break;
01343           case '.':
01344               //
01345               //  exp : '.'
01346               //
01347             top_and->push (new cmt_any_node ());
01348             break;
01349           case '(':
01350           {
01351               //
01352               //  exp : '(' exp ')'
01353               //
01354             if (top_and->parentheses ())
01355               {
01356                   // This should never happen.
01357                 delete or_root;
01358                 return;
01359               }
01360             
01361             top_and->set_parentheses (true);
01362             
01363               //
01364               // A new complete expression is started.
01365               //  -> do as for top_and parsing.
01366               //
01367             
01368             top_and = new cmt_and_node (new cmt_or_node (top_and));
01369           }
01370           break;
01371           case ')':
01372           {
01373               //
01374               //  exp : '(' exp ')'
01375               //
01376             
01377               // top_and is the cmt_and_node into which new nodes are pushed.
01378             cmt_regexp_node_set* or_node = top_and->father ();
01379             if (or_node == 0) 
01380               {
01381                   // This should never happen : top_and should always be
01382                   // at least an cmt_and_node hanging at an cmt_or_node
01383                 delete or_root;
01384                 return;
01385               }
01386             
01387               //
01388               // The last cmt_and_node was empty, thus we had either '()' or '(...|)'
01389               //
01390             
01391             if (top_and->nodes () == 0) 
01392               {
01393                 delete (or_node->pop ());
01394               }
01395             else
01396               {
01397                 top_and->reduce ();
01398               }
01399             
01400             top_and = or_node->father ();
01401             
01402             if (top_and == 0)
01403               {
01404                   // Syntax error : too many ')'
01405                 delete or_root;
01406                 return;
01407               }
01408             
01409               //
01410               // top_and is now the previous running cmt_and_node where the '(' 
01411               // was originally met its top_and node contains the parenthesized 
01412               // sub expression  If this one is empty, (due to an empty '()' 
01413               // expression) then it may simply be discarded.
01414               //
01415             
01416             if (!top_and->parentheses ())
01417               {
01418                   // Syntax error : too many ')'
01419                 delete or_root;
01420                 return;
01421               }
01422             
01423             top_and->set_parentheses (false);
01424             
01425             cmt_regexp_node* unique = 0;
01426             if (or_node->nodes () == 1)
01427               {
01428                 cmt_regexp_node_set* and_node = (cmt_regexp_node_set*) or_node->top ();
01429                 if (and_node->nodes () == 1)
01430                   {
01431                     unique = and_node->pop ();
01432                     delete (or_node->pop ());
01433                   }
01434                 else if (and_node->nodes () == 0)
01435                   {
01436                     delete (or_node->pop ());
01437                   }
01438               }
01439             
01440             if (or_node->nodes () == 0) delete (top_and->pop ());
01441             if (unique != 0) top_and->push (unique);
01442           }
01443           
01444           break;
01445           case '|':
01446           {
01447               //
01448               //  exp : exp '|' exp
01449               //
01450 
01451             cmt_regexp_node_set* or_node = top_and->father ();
01452             
01453             top_and->reduce ();
01454             
01455               //
01456               // or is the father cmt_or_node, which only contains cmt_and_nodes
01457               //
01458             
01459             const cmt_regexp_node_set* and_node = (cmt_regexp_node_set*) or_node->top ();
01460             if (and_node->nodes () == 0)
01461               {
01462                   // the previous node was empty.
01463                   // we may discard it
01464                 or_node->pop ();
01465               }
01466             
01467             top_and = new cmt_and_node (or_node);
01468           }
01469           break;
01470           case '^':
01471               //
01472               //  exp : '^'
01473               //
01474             top_and->push (new cmt_begin_node ());
01475             break;
01476           case '$':
01477               //
01478               //  exp : '$'
01479               //
01480             top_and->push (new cmt_end_node ());
01481             break;
01482           case '\\':
01483             if (i >= expression.size ())
01484               {
01485                 delete or_root;
01486                 return;
01487               }
01488             i++;
01489             c = expression[i];
01490             switch (c)
01491               {
01492                 case '[':
01493                 case ']':
01494                 case '(':
01495                 case ')':
01496                 case '.':
01497                 case '*':
01498                 case '?':
01499                 case '^':
01500                 case '$':
01501                 case '\\':
01502                   break;
01503                 case 'r':
01504                   c = '\r';
01505                   break;
01506                 case 't':
01507                   c = '\t';
01508                   break;
01509                 case 'n':
01510                   c = '\n';
01511                   break;
01512                 default:
01513                   break;
01514               }
01515           default:
01516             top_and->push (new cmt_char_node (c));
01517             break;
01518         }
01519     }
01520   
01521   if (or_root != 0)
01522     {
01523       cmt_regexp_node_set* and_node = (cmt_regexp_node_set*) or_root->top ();
01524       
01525       if (or_root->nodes () == 1)
01526         {
01527             //
01528             // Check whether there is at least one non-empty
01529             // cmt_and_node
01530             //
01531           if (and_node->nodes () == 0)
01532             {
01533               delete or_root;
01534               return;
01535             }
01536         }
01537       
01538       if (and_node != 0)
01539         {
01540           and_node->reduce ();
01541           
01542           if (and_node->parentheses ())
01543             {
01544               delete or_root;
01545               return;
01546             }
01547         }
01548     }
01549   
01550   _root = or_root;
01551 
01552   bool dbg = CmtSystem::testenv ("CMTTESTREGEXP");
01553 
01554   if (dbg)
01555     {
01556       if (_root != 0)
01557         {
01558           _root->dump ();
01559         }
01560     }
01561 }
01562 
01563 cmt_regexp::~cmt_regexp ()
01564 {
01565   if (_root != 0)
01566     {
01567       delete _root;
01568     }
01569 }
01570 
01571 bool cmt_regexp::is_valid () const
01572 {
01573   if (_root != 0) return (true);
01574   else return (false);
01575 }
01576 
01577 cmt_regexp::iterator cmt_regexp::begin (const cmt_string& text, int pos)
01578 {
01579   if (_root != 0)
01580     {
01581       int i;
01582       
01583       for (i = pos; i < text.size (); i++)
01584         {
01585           cmt_regexp::iterator it = _root->match (text, i);
01586           if (it != end ()) return (it);
01587         }
01588     }
01589   
01590   return (end ());
01591 }
01592 
01593 cmt_regexp::iterator cmt_regexp::end ()
01594 {
01595   return (cmt_regexp::iterator::null ());
01596 }
01597 
01598 cmt_regexp::iterator cmt_regexp::begin (const cmt_string& text, int pos) const
01599 {
01600   if (_root != 0)
01601     {
01602       int i;
01603       
01604       for (i = pos; i < text.size (); i++)
01605         {
01606           cmt_regexp::iterator it = _root->match (text, i);
01607           if (it != end ()) return (it);
01608         }
01609     }
01610   
01611   return (end ());
01612 }
01613 
01614 cmt_regexp::iterator cmt_regexp::end () const
01615 {
01616   return (cmt_regexp::iterator::null ());
01617 }
01618 
01619 bool cmt_regexp::match (const cmt_string& text) const
01620 {
01621   iterator it = begin (text);
01622   if (it == end ()) return (false);
01623   else return (true);
01624 }
01625 //----------------------------------------------------------
01626 
01627 //----------------------------------------------------------
01628 const cmt_regexp::iterator cmt_regexp::iterator::null ()
01629 {
01630   static const iterator null_instance (-1, -1);
01631   
01632   return (null_instance);
01633 }
01634 
01635 cmt_regexp::iterator::iterator ()
01636 {
01637   _pos = 0;
01638   _length = 0;
01639 }
01640 
01641 cmt_regexp::iterator::iterator (int pos, int length)
01642 {
01643   _pos = pos;
01644   _length = length;
01645 }
01646 
01647 cmt_regexp::iterator::iterator (const iterator& other)
01648 {
01649   _pos = other._pos;
01650   _length = other._length;
01651 }
01652 
01653 int cmt_regexp::iterator::operator != (const iterator& other) const
01654 {
01655   return ((this->_pos != other._pos) ||
01656           (this->_length != other._length));
01657 }
01658 
01659 int cmt_regexp::iterator::operator == (const iterator& other) const
01660 {
01661   return ((this->_pos == other._pos) &&
01662           (this->_length == other._length));
01663 }
01664 
01665 int cmt_regexp::iterator::operator < (const iterator& other) const
01666 {
01667   if (_pos == -1) return (0);
01668   if (other._pos == -1) return (0);
01669 
01670   return (_pos < other._pos);
01671 }
01672 
01673 cmt_string cmt_regexp::iterator::operator () (const cmt_string& text) const
01674 {
01675   if (_pos == -1) return ("");
01676   if (_length <= 0) return ("");
01677 
01678   return (text.substr (_pos, _length));
01679 }
01680 
01681 //----------------------------------------------------------
01682 

Generated on Mon May 2 10:25:05 2005 for CMT by 1.3.5