00001
00002
00003
00004
00005
00006
00007 #include "cmt_std.h"
00008 #include "cmt_regexp.h"
00009 #include "cmt_vector.h"
00010 #include "cmt_system.h"
00011
00012
00013
00014
00015
00016
00017
00018 static int tab_level = 0;
00019 static void tab ()
00020 {
00021 for (int i = 0; i < tab_level; i++)
00022 {
00023 cout << " ";
00024 }
00025 }
00026
00027
00028 class cmt_regexp_node
00029 {
00030 public:
00031 static cmt_regexp_node& null ();
00032 static int node_count ();
00033
00034 cmt_regexp_node ();
00035 virtual ~cmt_regexp_node ();
00036
00037 virtual const cmt_regexp::iterator match (const cmt_string& text,
00038 int pos) const;
00039 virtual bool is_char () const;
00040 virtual bool is_many_node () const;
00041
00042 virtual void dump () const;
00043
00044 private:
00045 static int _node_count;
00046 };
00047
00048
00049
00050 class cmt_char_node : public cmt_regexp_node
00051 {
00052 public:
00053 cmt_char_node (char c);
00054
00055 const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
00056
00057 bool is_char () const;
00058 operator char ();
00059
00060 void dump () const;
00061
00062 private:
00063 char _c;
00064 };
00065
00066
00067
00068 class cmt_string_node : public cmt_regexp_node
00069 {
00070 public:
00071 cmt_string_node (const cmt_string& s);
00072
00073 const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
00074
00075 void dump () const;
00076
00077 private:
00078 cmt_string _s;
00079 };
00080
00081
00082
00083 class cmt_char_list_node : public cmt_regexp_node
00084 {
00085 public:
00086 cmt_char_list_node (cmt_string list);
00087
00088 const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
00089
00090 void dump () const;
00091
00092 protected:
00093 cmt_string _list;
00094 cmt_string _choices;
00095 };
00096
00097
00098
00099 class cmt_not_char_list_node : public cmt_char_list_node
00100 {
00101 public:
00102 cmt_not_char_list_node (cmt_string list);
00103
00104 const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
00105
00106 void dump () const;
00107 };
00108
00109
00110
00111 class cmt_any_node : public cmt_regexp_node
00112 {
00113 public:
00114 const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
00115
00116 void dump () const;
00117 };
00118
00119
00120
00121 class cmt_zero_one : public cmt_regexp_node
00122 {
00123 public:
00124 cmt_zero_one (cmt_regexp_node* n);
00125 ~cmt_zero_one ();
00126
00127 const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
00128
00129 void dump () const;
00130
00131 protected:
00132 cmt_regexp_node* _node;
00133 };
00134
00135
00136
00137
00138
00139 class cmt_begin_node : public cmt_regexp_node
00140 {
00141 public:
00142 const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
00143
00144 void dump () const;
00145 };
00146
00147
00148
00149 class cmt_end_node : public cmt_regexp_node
00150 {
00151 public:
00152 const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
00153
00154 void dump () const;
00155 };
00156
00157
00158
00159 class cmt_regexp_node_set : public cmt_regexp_node
00160 {
00161 public:
00162 cmt_regexp_node_set ();
00163 cmt_regexp_node_set (cmt_regexp_node_set* father);
00164 ~cmt_regexp_node_set ();
00165
00166 cmt_regexp_node_set* father ();
00167 void clear ();
00168 void push (cmt_regexp_node* n);
00169 cmt_regexp_node* pop ();
00170 cmt_regexp_node* top () const;
00171 int nodes () const;
00172 const cmt_regexp_node* nodeAt (int index) const;
00173 bool parentheses () const;
00174 void set_parentheses (bool value);
00175 virtual void reduce ();
00176
00177 virtual void dump () const;
00178 void dump (const cmt_string& title) const;
00179
00180 protected:
00181 cmt_regexp_node_set* _father;
00182 cmt_vector _nodes;
00183 bool _parentheses;
00184 };
00185
00186
00187
00188 class cmt_and_node : public cmt_regexp_node_set
00189 {
00190 public:
00191 cmt_and_node ();
00192 cmt_and_node (cmt_regexp_node_set* father);
00193
00194 const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
00195 void reduce ();
00196 void fill (cmt_and_node& other, int start_index);
00197
00198 void dump () const;
00199 };
00200
00201
00202
00203 class cmt_or_node : public cmt_regexp_node_set
00204 {
00205 public:
00206 cmt_or_node (cmt_regexp_node_set* father);
00207
00208 const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
00209
00210 void dump () const;
00211 };
00212
00213
00214
00215 cmt_regexp_node& cmt_regexp_node::null ()
00216 {
00217 static cmt_regexp_node null_instance;
00218
00219 return (null_instance);
00220 }
00221
00222
00223
00224 class cmt_many_node : public cmt_regexp_node
00225 {
00226 public:
00227 bool is_many_node () const;
00228 void install (cmt_and_node& other, int index);
00229 void reduce ();
00230
00231 void dump () const;
00232
00233 protected:
00234 cmt_many_node (cmt_regexp_node* n);
00235 virtual ~cmt_many_node ();
00236 cmt_regexp_node* _node;
00237 cmt_and_node _follower;
00238 };
00239
00240
00241
00242 class cmt_zero_more : public cmt_many_node
00243 {
00244 public:
00245 cmt_zero_more (cmt_regexp_node* n);
00246
00247 const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
00248
00249 void dump () const;
00250 };
00251
00252
00253
00254 class cmt_one_more : public cmt_many_node
00255 {
00256 public:
00257 cmt_one_more (cmt_regexp_node* n);
00258
00259 const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
00260
00261 void dump () const;
00262 };
00263
00264
00265
00266
00267
00268
00269
00270
00271
00272
00273 cmt_regexp_node::cmt_regexp_node ()
00274 {
00275 _node_count++;
00276 }
00277
00278 cmt_regexp_node::~cmt_regexp_node ()
00279 {
00280 _node_count--;
00281 }
00282
00283 int cmt_regexp_node::node_count ()
00284 {
00285 return (_node_count);
00286 }
00287
00288 const cmt_regexp::iterator cmt_regexp_node::match (const cmt_string& ,
00289 int ) const
00290 {
00291 return (cmt_regexp::iterator::null());
00292 }
00293
00294 bool cmt_regexp_node::is_char () const
00295 {
00296 return (false);
00297 }
00298
00299 bool cmt_regexp_node::is_many_node () const
00300 {
00301 return (false);
00302 }
00303
00304 void cmt_regexp_node::dump () const
00305 {
00306 }
00307
00308 int cmt_regexp_node::_node_count = 0;
00309
00310
00311
00312 cmt_char_node::cmt_char_node (char c)
00313 {
00314 _c = c;
00315 }
00316
00317 const cmt_regexp::iterator cmt_char_node::match (const cmt_string& text,
00318 int pos) const
00319 {
00320 if ((pos < 0) || (pos > text.size ()))
00321 {
00322 return (cmt_regexp::iterator::null ());
00323 }
00324
00325 char c = text[pos];
00326
00327 if (c == _c)
00328 {
00329 return (cmt_regexp::iterator (pos, 1));
00330 }
00331
00332 return (cmt_regexp::iterator::null ());
00333 }
00334
00335 bool cmt_char_node::is_char () const
00336 {
00337 return (true);
00338 }
00339
00340 cmt_char_node::operator char ()
00341 {
00342 return (_c);
00343 }
00344
00345 void cmt_char_node::dump () const
00346 {
00347 tab (); cout << "char>(" << this << ") c=" << _c << endl;
00348 }
00349
00350
00351
00352
00353 cmt_string_node::cmt_string_node (const cmt_string& s)
00354 {
00355 _s = s;
00356 }
00357
00358 const cmt_regexp::iterator cmt_string_node::match (const cmt_string& text,
00359 int pos) const
00360 {
00361 if ((pos < 0) || (pos > text.size ()))
00362 {
00363 return (cmt_regexp::iterator::null ());
00364 }
00365
00366 int length = _s.size ();
00367
00368 cmt_string s = text.substr (pos, length);
00369
00370 if ((length == 0) || (s == _s))
00371 {
00372 return (cmt_regexp::iterator (pos, length));
00373 }
00374
00375 return (cmt_regexp::iterator::null ());
00376 }
00377
00378 void cmt_string_node::dump () const
00379 {
00380 tab (); cout << "string (" << this << ") s=[" << _s << "]" << endl;
00381 }
00382
00383
00384
00385
00386 cmt_char_list_node::cmt_char_list_node (cmt_string list)
00387 {
00388 _list = list;
00389
00390 _choices = "";
00391
00392 char c;
00393 int i;
00394
00395 for (i = 0; i < list.size (); i++)
00396 {
00397 c = list[i];
00398
00399 switch (c)
00400 {
00401 case '-':
00402 i++;
00403 {
00404 char c1 = _choices[_choices.size () - 1];
00405 char c2 = list[i];
00406 int j;
00407 int j0 = (c1 < c2) ? c1 : c2;
00408 int j1 = (c1 > c2) ? c1 : c2;
00409 for (j = j0; j <= j1; j++)
00410 {
00411 _choices += j;
00412 }
00413 }
00414 break;
00415 case '\\':
00416 i++;
00417 c = list[i];
00418 switch (c)
00419 {
00420 case '[':
00421 case ']':
00422 case '(':
00423 case ')':
00424 case '.':
00425 case '*':
00426 case '?':
00427 case '^':
00428 case '$':
00429 case '\\':
00430 c = '\\';
00431 break;
00432 case 'r':
00433 c = '\r';
00434 break;
00435 case 't':
00436 c = '\t';
00437 break;
00438 case 'n':
00439 c = '\n';
00440 break;
00441 default:
00442 break;
00443 }
00444 default:
00445 _choices += c;
00446 break;
00447 }
00448 }
00449 }
00450
00451 const cmt_regexp::iterator cmt_char_list_node::match (const cmt_string& text,
00452 int pos) const
00453 {
00454 if ((pos < 0) || (pos > text.size ()))
00455 {
00456 return (cmt_regexp::iterator::null ());
00457 }
00458
00459 char c = text[pos];
00460
00461 int i;
00462
00463 for (i = 0; i < _choices.size (); i++)
00464 {
00465 if (c == _choices[i]) return (cmt_regexp::iterator (pos, 1));
00466 }
00467
00468 return (cmt_regexp::iterator::null ());
00469 }
00470
00471 void cmt_char_list_node::dump () const
00472 {
00473 tab (); cout << "char_list (" << this << ") list=[" << _list << "] choices=[" << _choices << "]" << endl;
00474 }
00475
00476
00477
00478
00479 cmt_not_char_list_node::cmt_not_char_list_node (cmt_string list) :
00480 cmt_char_list_node (list)
00481 {
00482 }
00483
00484 const cmt_regexp::iterator cmt_not_char_list_node::match (const cmt_string& text,
00485 int pos) const
00486 {
00487 if ((pos < 0) || (pos > text.size ()))
00488 {
00489 return (cmt_regexp::iterator::null ());
00490 }
00491
00492 char c = text[pos];
00493
00494 int i;
00495
00496 for (i = 0; i < _choices.size (); i++)
00497 {
00498 if (c == _choices[i]) return (cmt_regexp::iterator::null ());
00499 }
00500
00501 return (cmt_regexp::iterator (pos, 1));
00502 }
00503
00504 void cmt_not_char_list_node::dump () const
00505 {
00506 tab (); cout << "not_char_list (" << this << ") list=[" << _list << "] choices=[" << _choices << "]" << endl;
00507 }
00508
00509
00510
00511
00512
00513 const cmt_regexp::iterator cmt_any_node::match (const cmt_string& text,
00514 int pos) const
00515 {
00516 if ((pos < 0) | (pos >= text.size ()))
00517 {
00518 return (cmt_regexp::iterator::null ());
00519 }
00520
00521 return (cmt_regexp::iterator (pos, 1));
00522 }
00523
00524 void cmt_any_node::dump () const
00525 {
00526 tab (); cout << "any (" << this << ") " << endl;
00527 }
00528
00529
00530
00531
00532 cmt_zero_one::cmt_zero_one (cmt_regexp_node* n) : _node (n)
00533 {
00534 }
00535
00536 cmt_zero_one::~cmt_zero_one ()
00537 {
00538 delete _node;
00539 }
00540
00541 const cmt_regexp::iterator cmt_zero_one::match (const cmt_string& text,
00542 int pos) const
00543 {
00544 if ((pos < 0) || (pos > text.size ()))
00545 {
00546 return (cmt_regexp::iterator::null ());
00547 }
00548
00549 int total = 0;
00550
00551 if (pos < text.size ())
00552 {
00553 const cmt_regexp::iterator it = _node->match (text, pos);
00554 if (it != cmt_regexp::iterator::null ())
00555 {
00556 total += it._length;
00557 pos += it._length;
00558 }
00559 }
00560
00561 return (cmt_regexp::iterator (pos, total));
00562 }
00563
00564 void cmt_zero_one::dump () const
00565 {
00566 tab (); cout << "zero_one (" << this << ") " << endl;
00567 if (_node != 0)
00568 {
00569 tab_level++;
00570 _node->dump ();
00571 tab_level--;
00572 }
00573 }
00574
00575
00576
00577
00578 cmt_many_node::cmt_many_node (cmt_regexp_node* n) : _node (n)
00579 {
00580 }
00581
00582 bool cmt_many_node::is_many_node () const
00583 {
00584 return (true);
00585 }
00586
00587 cmt_many_node::~cmt_many_node ()
00588 {
00589 delete _node;
00590 }
00591
00592 void cmt_many_node::install (cmt_and_node& other, int start_index)
00593 {
00594 _follower.fill (other, start_index);
00595 }
00596
00597 void cmt_many_node::reduce ()
00598 {
00599 _follower.reduce ();
00600 }
00601
00602 void cmt_many_node::dump () const
00603 {
00604 tab (); cout << "many (" << this << ") " << endl;
00605 if (_node != 0)
00606 {
00607 tab_level++;
00608 _node->dump ();
00609 tab_level--;
00610 }
00611 tab_level++;
00612 _follower.dump ();
00613 tab_level--;
00614 }
00615
00616
00617
00618
00619
00620
00621 cmt_zero_more::cmt_zero_more (cmt_regexp_node* n) : cmt_many_node (n)
00622 {
00623 }
00624
00625 const cmt_regexp::iterator cmt_zero_more::match (const cmt_string& text,
00626 int pos) const
00627 {
00628 if ((pos < 0) || (pos > text.size ()))
00629 {
00630 return (cmt_regexp::iterator::null ());
00631 }
00632
00633 int total = 0;
00634
00635
00636
00637
00638
00639 int saved_pos = -1;
00640 int saved_total = -1;
00641
00642 do
00643 {
00644 const cmt_regexp::iterator itx = _node->match (text, pos);
00645 const cmt_regexp::iterator ity = _follower.match (text, pos);
00646
00647 if ((itx == cmt_regexp::iterator::null ()) &&
00648 (ity == cmt_regexp::iterator::null ()))
00649 {
00650
00651
00652
00653
00654 if (saved_pos >= 0)
00655 {
00656
00657
00658
00659 pos = saved_pos;
00660 total = saved_total;
00661 }
00662 else
00663 {
00664
00665
00666
00667 return (cmt_regexp::iterator::null ());
00668 }
00669
00670 break;
00671 }
00672
00673 if (itx == cmt_regexp::iterator::null ())
00674 {
00675
00676
00677
00678 total += ity._length;
00679 pos += ity._length;
00680 break;
00681 }
00682
00683 if (ity != cmt_regexp::iterator::null ())
00684 {
00685
00686
00687
00688
00689 saved_total = total + ity._length;
00690 saved_pos = pos + ity._length;
00691 }
00692 total += itx._length;
00693 pos += itx._length;
00694 } while (true);
00695
00696 return (cmt_regexp::iterator (pos, total));
00697 }
00698
00699 void cmt_zero_more::dump () const
00700 {
00701 tab (); cout << "zero_more (" << this << ") " << endl;
00702 if (_node != 0)
00703 {
00704 tab_level++;
00705 _node->dump ();
00706 tab_level--;
00707 }
00708 _follower.dump ();
00709 }
00710
00711
00712
00713
00714 cmt_one_more::cmt_one_more (cmt_regexp_node* n) : cmt_many_node (n)
00715 {
00716 }
00717
00718 const cmt_regexp::iterator cmt_one_more::match (const cmt_string& text,
00719 int pos) const
00720 {
00721 if ((pos < 0) || (pos > text.size ()))
00722 {
00723 return (cmt_regexp::iterator::null ());
00724 }
00725
00726 int total = 0;
00727
00728
00729
00730
00731
00732 int saved_pos = -1;
00733 int saved_total = -1;
00734 bool at_least_one = false;
00735
00736 do
00737 {
00738 const cmt_regexp::iterator itx = _node->match (text, pos);
00739 const cmt_regexp::iterator ity = _follower.match (text, pos);
00740
00741 if ((itx == cmt_regexp::iterator::null ()) &&
00742 (ity == cmt_regexp::iterator::null ()))
00743 {
00744
00745
00746
00747
00748 if (saved_pos >= 0)
00749 {
00750
00751
00752
00753 pos = saved_pos;
00754 total = saved_total;
00755 }
00756 else
00757 {
00758
00759
00760
00761 return (cmt_regexp::iterator::null ());
00762 }
00763
00764 break;
00765 }
00766
00767 if (itx == cmt_regexp::iterator::null ())
00768 {
00769
00770
00771
00772 total += ity._length;
00773 pos += ity._length;
00774 break;
00775 }
00776
00777 if (ity != cmt_regexp::iterator::null ())
00778 {
00779
00780
00781
00782
00783 saved_total = total + ity._length;
00784 saved_pos = pos + ity._length;
00785 }
00786
00787 total += itx._length;
00788 pos += itx._length;
00789
00790 at_least_one = true;
00791 } while (true);
00792
00793 if (!at_least_one) return (cmt_regexp::iterator::null ());
00794
00795 return (cmt_regexp::iterator (pos, total));
00796 }
00797
00798 void cmt_one_more::dump () const
00799 {
00800 tab (); cout << "one_more (" << this << ") " << endl;
00801 if (_node != 0)
00802 {
00803 tab_level++;
00804 _node->dump ();
00805 tab_level--;
00806 }
00807 tab_level++;
00808 _follower.dump ();
00809 tab_level--;
00810 }
00811
00812
00813
00814
00815
00816
00817 const cmt_regexp::iterator cmt_begin_node::match (const cmt_string& ,
00818 int pos) const
00819 {
00820 if (pos == 0) return (cmt_regexp::iterator (pos, 0));
00821 return (cmt_regexp::iterator::null ());
00822 }
00823
00824 void cmt_begin_node::dump () const
00825 {
00826 tab (); cout << "begin (" << this << ") " << endl;
00827 }
00828
00829
00830
00831 const cmt_regexp::iterator cmt_end_node::match (const cmt_string& text,
00832 int pos) const
00833 {
00834 if (pos == text.size ()) return (cmt_regexp::iterator (pos, 0));
00835 return (cmt_regexp::iterator::null ());
00836 }
00837
00838 void cmt_end_node::dump () const
00839 {
00840 tab (); cout << "end (" << this << ") " << endl;
00841 }
00842
00843
00844
00845 cmt_regexp_node_set::cmt_regexp_node_set () : _father (0)
00846 {
00847 _parentheses = false;
00848 }
00849
00850 cmt_regexp_node_set::cmt_regexp_node_set (cmt_regexp_node_set* father) : _father (father)
00851 {
00852 if (father != 0) father->push (this);
00853 _parentheses = false;
00854 }
00855
00856 cmt_regexp_node_set::~cmt_regexp_node_set ()
00857 {
00858 clear ();
00859 }
00860
00861 cmt_regexp_node_set* cmt_regexp_node_set::father ()
00862 {
00863 return (_father);
00864 }
00865
00866 void cmt_regexp_node_set::clear ()
00867 {
00868 int i;
00869
00870 for (i = 0; i < _nodes.size (); i++)
00871 {
00872 cmt_regexp_node* n = _nodes[i];
00873 delete n;
00874 }
00875 _nodes.clear ();
00876 }
00877
00878 void cmt_regexp_node_set::push (cmt_regexp_node* n)
00879 {
00880 _nodes.push_back (n);
00881 }
00882
00883 cmt_regexp_node* cmt_regexp_node_set::pop ()
00884 {
00885 if (_nodes.size () == 0) return (&cmt_regexp_node::null ());
00886
00887 int index = _nodes.size () - 1;
00888
00889 cmt_regexp_node* n = _nodes[index];
00890 _nodes.erase (index);
00891
00892 return (n);
00893 }
00894
00895 cmt_regexp_node* cmt_regexp_node_set::top () const
00896 {
00897 if (_nodes.size () == 0) return (&cmt_regexp_node::null ());
00898
00899 int index = _nodes.size () - 1;
00900
00901 cmt_regexp_node* n = _nodes[index];
00902
00903 return (n);
00904 }
00905
00906 int cmt_regexp_node_set::nodes () const
00907 {
00908 return (_nodes.size ());
00909 }
00910
00911 const cmt_regexp_node* cmt_regexp_node_set::nodeAt (int index) const
00912 {
00913 return (_nodes[index]);
00914 }
00915
00916 bool cmt_regexp_node_set::parentheses () const
00917 {
00918 return (_parentheses);
00919 }
00920
00921 void cmt_regexp_node_set::set_parentheses (bool value)
00922 {
00923 _parentheses = value;
00924 }
00925
00926 void cmt_regexp_node_set::reduce ()
00927 {
00928 }
00929
00930 void cmt_regexp_node_set::dump () const
00931 {
00932 tab (); cout << "regexp_node_set (" << this << ") " << endl;
00933 }
00934
00935 void cmt_regexp_node_set::dump (const cmt_string& title) const
00936 {
00937 tab (); cout << "Set (" << this << ") father=" << _father << " pars=" << _parentheses << endl;
00938 for (int i = 0; i < _nodes.size (); i++)
00939 {
00940 cmt_regexp_node* n = _nodes[i];
00941 if (n != 0)
00942 {
00943 if (i > 0)
00944 {
00945 tab (); cout << title << endl;
00946 }
00947 tab_level++;
00948 n->dump ();
00949 tab_level--;
00950 }
00951 }
00952 tab (); cout << "EndSet (" << this << ")" << endl;
00953 }
00954
00955
00956
00957 cmt_and_node::cmt_and_node () : cmt_regexp_node_set ()
00958 {
00959 }
00960
00961 cmt_and_node::cmt_and_node (cmt_regexp_node_set* father) : cmt_regexp_node_set (father)
00962 {
00963 }
00964
00965 const cmt_regexp::iterator cmt_and_node::match (const cmt_string& text,
00966 int pos) const
00967 {
00968 if ((pos < 0) || (pos > text.size ()))
00969 {
00970 return (cmt_regexp::iterator::null ());
00971 }
00972
00973 if (_nodes.size () == 0) return (cmt_regexp::iterator (pos, 0));
00974
00975 int i;
00976 int total = 0;
00977 int p = pos;
00978
00979 bool dbg = CmtSystem::testenv ("CMTTESTREGEXP");
00980 if (dbg) {tab (); cout << "match and (" << this << ") pos=" << pos << endl;}
00981
00982 for (i = 0; i < _nodes.size (); i++)
00983 {
00984 cmt_regexp_node* n = _nodes[i];
00985
00986 if (dbg) tab_level++;
00987 const cmt_regexp::iterator it = n->match (text, p);
00988 if (dbg) tab_level--;
00989
00990 if (dbg) {tab (); cout << " -> it(" << n << ") p=" << it._pos << " l=" << it._length << endl;}
00991
00992 if (it == cmt_regexp::iterator::null ()) return (it);
00993
00994 total += it._length;
00995 p += it._length;
00996 }
00997
00998
00999
01000 return (cmt_regexp::iterator (pos, total));
01001 }
01002
01003 void cmt_and_node::reduce ()
01004 {
01005 if (_nodes.size () < 2) return;
01006
01007 char c = ' ';
01008 cmt_string s = "";
01009 cmt_vector new_nodes;
01010
01011
01012
01013
01014
01015 for (int i = 0; i <= _nodes.size (); i++)
01016 {
01017 cmt_regexp_node* n = 0;
01018
01019 if (i < _nodes.size ()) n = _nodes[i];
01020
01021 if ((i >= _nodes.size ()) || (!n->is_char ()))
01022 {
01023 if (s.size () == 1)
01024 {
01025
01026
01027
01028
01029 new_nodes.push_back (new cmt_char_node (c));
01030 s = "";
01031 }
01032 else if (s.size () > 1)
01033 {
01034
01035
01036
01037
01038 new_nodes.push_back (new cmt_string_node (s));
01039 s = "";
01040 }
01041
01042 if (i >= _nodes.size ()) break;
01043 }
01044
01045 if (n->is_char ())
01046 {
01047
01048
01049
01050 cmt_char_node& cn = *((cmt_char_node*) n);
01051 c = (char) cn;
01052 s += c;
01053 delete n;
01054 _nodes[i] = 0;
01055 }
01056 else if (n->is_many_node ())
01057 {
01058 cmt_many_node& mn = *((cmt_many_node*) n);
01059 mn.install (*this, i + 1);
01060 mn.reduce ();
01061 new_nodes.push_back (n);
01062 break;
01063 }
01064 else
01065 {
01066 new_nodes.push_back (n);
01067 }
01068 }
01069
01070 _nodes = new_nodes;
01071 }
01072
01073 void cmt_and_node::fill (cmt_and_node& other, int start_index)
01074 {
01075 if ((start_index < 0) || (start_index > other.nodes ())) return;
01076
01077 for (int i = start_index; i < other.nodes (); i++)
01078 {
01079 cmt_regexp_node* n = other._nodes[i];
01080 push (n);
01081 }
01082 }
01083
01084 void cmt_and_node::dump () const
01085 {
01086 cmt_regexp_node_set::dump ("and");
01087 }
01088
01089
01090
01091
01092 cmt_or_node::cmt_or_node (cmt_regexp_node_set* father) : cmt_regexp_node_set (father)
01093 {
01094 }
01095
01096 const cmt_regexp::iterator cmt_or_node::match (const cmt_string& text,
01097 int pos) const
01098 {
01099 if ((pos < 0) || (pos >= text.size ()))
01100 {
01101 return (cmt_regexp::iterator::null ());
01102 }
01103
01104 if (_nodes.size () == 0) return (cmt_regexp::iterator (pos, 0));
01105
01106 bool dbg = CmtSystem::testenv ("CMTTESTREGEXP");
01107 if (dbg) {tab (); cout << "match or (" << this << ") pos=" << pos << endl;}
01108
01109 int i;
01110
01111 int longest = 0;
01112
01113 cmt_regexp::iterator result = cmt_regexp::iterator::null ();
01114
01115 for (i = 0; i < _nodes.size (); i++)
01116 {
01117 const cmt_regexp_node* n = _nodes[i];
01118
01119 if (dbg) tab_level++;
01120 const cmt_regexp::iterator it = n->match (text, pos);
01121 if (dbg) tab_level--;
01122
01123 if (it._length > longest)
01124 {
01125 longest = it._length;
01126 result = it;
01127 }
01128
01129
01130
01131 }
01132
01133 return (result);
01134 }
01135
01136 void cmt_or_node::dump () const
01137 {
01138 cmt_regexp_node_set::dump ("or");
01139 }
01140
01141
01142
01143
01144
01145
01146
01147
01148
01149
01150
01151
01152
01153
01154
01155
01156 cmt_regexp::cmt_regexp ()
01157 {
01158 _root = 0;
01159 }
01160
01161
01162 cmt_regexp::cmt_regexp (const cmt_string& expression)
01163 {
01164 _root = 0;
01165 set (expression);
01166 }
01167
01168
01169 void cmt_regexp::set (const cmt_string& expression)
01170 {
01171 if (_root != 0)
01172 {
01173 delete _root;
01174 _root = 0;
01175 }
01176
01177
01178
01179
01180
01181
01182
01183 cmt_regexp_node_set* or_root = 0;
01184 cmt_regexp_node_set* top_and = 0;
01185
01186
01187
01188
01189
01190
01191
01192
01193
01194
01195
01196
01197
01198
01199
01200
01201
01202
01203
01204
01205
01206
01207
01208 {
01209
01210
01211
01212
01213
01214
01215
01216 or_root = new cmt_or_node (0);
01217 top_and = new cmt_and_node (or_root);
01218 }
01219
01220 int i;
01221
01222 for (i = 0; i < expression.size (); i++)
01223 {
01224 char c = expression[i];
01225 switch (c)
01226 {
01227 case '[':
01228 {
01229
01230
01231
01232
01233
01234
01235
01236 if (i >= expression.size ())
01237 {
01238
01239 delete or_root;
01240 return;
01241 }
01242 i++;
01243
01244 int i0 = i;
01245
01246 bool done = false;
01247 bool has_not = false;
01248
01249 cmt_string choices = "";
01250
01251 for (; i < expression.size (); i++)
01252 {
01253 c = expression[i];
01254 switch (c)
01255 {
01256 case ']':
01257 done = true;
01258 break;
01259 case '^':
01260 if (i == i0) has_not = true;
01261 else choices += c;
01262 break;
01263 case '\\':
01264 choices += c;
01265 if (i >= expression.size ())
01266 {
01267
01268
01269 delete or_root;
01270 return;
01271 }
01272 i++;
01273 c = expression[i];
01274 choices += c;
01275 break;
01276 default:
01277 choices += c;
01278 break;
01279 }
01280 if (done) break;
01281 }
01282
01283 if (!done)
01284 {
01285
01286 delete or_root;
01287 return;
01288 }
01289 if (has_not)
01290 top_and->push (new cmt_not_char_list_node (choices));
01291 else
01292 top_and->push (new cmt_char_list_node (choices));
01293 }
01294 break;
01295 case '*':
01296 {
01297
01298
01299
01300 if (top_and->nodes () == 0)
01301 {
01302
01303 delete or_root;
01304 return;
01305 }
01306
01307 cmt_regexp_node* n = top_and->pop ();
01308 top_and->push (new cmt_zero_more (n));
01309 }
01310 break;
01311 case '+':
01312 {
01313
01314
01315
01316 if (top_and->nodes () == 0)
01317 {
01318
01319 delete or_root;
01320 return;
01321 }
01322
01323 cmt_regexp_node* n = top_and->pop ();
01324 top_and->push (new cmt_one_more (n));
01325 }
01326 break;
01327 case '?':
01328 {
01329
01330
01331
01332 if (top_and->nodes () == 0)
01333 {
01334
01335 delete or_root;
01336 return;
01337 }
01338
01339 cmt_regexp_node* n = top_and->pop ();
01340 top_and->push (new cmt_zero_one (n));
01341 }
01342 break;
01343 case '.':
01344
01345
01346
01347 top_and->push (new cmt_any_node ());
01348 break;
01349 case '(':
01350 {
01351
01352
01353
01354 if (top_and->parentheses ())
01355 {
01356
01357 delete or_root;
01358 return;
01359 }
01360
01361 top_and->set_parentheses (true);
01362
01363
01364
01365
01366
01367
01368 top_and = new cmt_and_node (new cmt_or_node (top_and));
01369 }
01370 break;
01371 case ')':
01372 {
01373
01374
01375
01376
01377
01378 cmt_regexp_node_set* or_node = top_and->father ();
01379 if (or_node == 0)
01380 {
01381
01382
01383 delete or_root;
01384 return;
01385 }
01386
01387
01388
01389
01390
01391 if (top_and->nodes () == 0)
01392 {
01393 delete (or_node->pop ());
01394 }
01395 else
01396 {
01397 top_and->reduce ();
01398 }
01399
01400 top_and = or_node->father ();
01401
01402 if (top_and == 0)
01403 {
01404
01405 delete or_root;
01406 return;
01407 }
01408
01409
01410
01411
01412
01413
01414
01415
01416 if (!top_and->parentheses ())
01417 {
01418
01419 delete or_root;
01420 return;
01421 }
01422
01423 top_and->set_parentheses (false);
01424
01425 cmt_regexp_node* unique = 0;
01426 if (or_node->nodes () == 1)
01427 {
01428 cmt_regexp_node_set* and_node = (cmt_regexp_node_set*) or_node->top ();
01429 if (and_node->nodes () == 1)
01430 {
01431 unique = and_node->pop ();
01432 delete (or_node->pop ());
01433 }
01434 else if (and_node->nodes () == 0)
01435 {
01436 delete (or_node->pop ());
01437 }
01438 }
01439
01440 if (or_node->nodes () == 0) delete (top_and->pop ());
01441 if (unique != 0) top_and->push (unique);
01442 }
01443
01444 break;
01445 case '|':
01446 {
01447
01448
01449
01450
01451 cmt_regexp_node_set* or_node = top_and->father ();
01452
01453 top_and->reduce ();
01454
01455
01456
01457
01458
01459 const cmt_regexp_node_set* and_node = (cmt_regexp_node_set*) or_node->top ();
01460 if (and_node->nodes () == 0)
01461 {
01462
01463
01464 or_node->pop ();
01465 }
01466
01467 top_and = new cmt_and_node (or_node);
01468 }
01469 break;
01470 case '^':
01471
01472
01473
01474 top_and->push (new cmt_begin_node ());
01475 break;
01476 case '$':
01477
01478
01479
01480 top_and->push (new cmt_end_node ());
01481 break;
01482 case '\\':
01483 if (i >= expression.size ())
01484 {
01485 delete or_root;
01486 return;
01487 }
01488 i++;
01489 c = expression[i];
01490 switch (c)
01491 {
01492 case '[':
01493 case ']':
01494 case '(':
01495 case ')':
01496 case '.':
01497 case '*':
01498 case '?':
01499 case '^':
01500 case '$':
01501 case '\\':
01502 break;
01503 case 'r':
01504 c = '\r';
01505 break;
01506 case 't':
01507 c = '\t';
01508 break;
01509 case 'n':
01510 c = '\n';
01511 break;
01512 default:
01513 break;
01514 }
01515 default:
01516 top_and->push (new cmt_char_node (c));
01517 break;
01518 }
01519 }
01520
01521 if (or_root != 0)
01522 {
01523 cmt_regexp_node_set* and_node = (cmt_regexp_node_set*) or_root->top ();
01524
01525 if (or_root->nodes () == 1)
01526 {
01527
01528
01529
01530
01531 if (and_node->nodes () == 0)
01532 {
01533 delete or_root;
01534 return;
01535 }
01536 }
01537
01538 if (and_node != 0)
01539 {
01540 and_node->reduce ();
01541
01542 if (and_node->parentheses ())
01543 {
01544 delete or_root;
01545 return;
01546 }
01547 }
01548 }
01549
01550 _root = or_root;
01551
01552 bool dbg = CmtSystem::testenv ("CMTTESTREGEXP");
01553
01554 if (dbg)
01555 {
01556 if (_root != 0)
01557 {
01558 _root->dump ();
01559 }
01560 }
01561 }
01562
01563 cmt_regexp::~cmt_regexp ()
01564 {
01565 if (_root != 0)
01566 {
01567 delete _root;
01568 }
01569 }
01570
01571 bool cmt_regexp::is_valid () const
01572 {
01573 if (_root != 0) return (true);
01574 else return (false);
01575 }
01576
01577 cmt_regexp::iterator cmt_regexp::begin (const cmt_string& text, int pos)
01578 {
01579 if (_root != 0)
01580 {
01581 int i;
01582
01583 for (i = pos; i < text.size (); i++)
01584 {
01585 cmt_regexp::iterator it = _root->match (text, i);
01586 if (it != end ()) return (it);
01587 }
01588 }
01589
01590 return (end ());
01591 }
01592
01593 cmt_regexp::iterator cmt_regexp::end ()
01594 {
01595 return (cmt_regexp::iterator::null ());
01596 }
01597
01598 cmt_regexp::iterator cmt_regexp::begin (const cmt_string& text, int pos) const
01599 {
01600 if (_root != 0)
01601 {
01602 int i;
01603
01604 for (i = pos; i < text.size (); i++)
01605 {
01606 cmt_regexp::iterator it = _root->match (text, i);
01607 if (it != end ()) return (it);
01608 }
01609 }
01610
01611 return (end ());
01612 }
01613
01614 cmt_regexp::iterator cmt_regexp::end () const
01615 {
01616 return (cmt_regexp::iterator::null ());
01617 }
01618
01619 bool cmt_regexp::match (const cmt_string& text) const
01620 {
01621 iterator it = begin (text);
01622 if (it == end ()) return (false);
01623 else return (true);
01624 }
01625
01626
01627
01628 const cmt_regexp::iterator cmt_regexp::iterator::null ()
01629 {
01630 static const iterator null_instance (-1, -1);
01631
01632 return (null_instance);
01633 }
01634
01635 cmt_regexp::iterator::iterator ()
01636 {
01637 _pos = 0;
01638 _length = 0;
01639 }
01640
01641 cmt_regexp::iterator::iterator (int pos, int length)
01642 {
01643 _pos = pos;
01644 _length = length;
01645 }
01646
01647 cmt_regexp::iterator::iterator (const iterator& other)
01648 {
01649 _pos = other._pos;
01650 _length = other._length;
01651 }
01652
01653 int cmt_regexp::iterator::operator != (const iterator& other) const
01654 {
01655 return ((this->_pos != other._pos) ||
01656 (this->_length != other._length));
01657 }
01658
01659 int cmt_regexp::iterator::operator == (const iterator& other) const
01660 {
01661 return ((this->_pos == other._pos) &&
01662 (this->_length == other._length));
01663 }
01664
01665 int cmt_regexp::iterator::operator < (const iterator& other) const
01666 {
01667 if (_pos == -1) return (0);
01668 if (other._pos == -1) return (0);
01669
01670 return (_pos < other._pos);
01671 }
01672
01673 cmt_string cmt_regexp::iterator::operator () (const cmt_string& text) const
01674 {
01675 if (_pos == -1) return ("");
01676 if (_length <= 0) return ("");
01677
01678 return (text.substr (_pos, _length));
01679 }
01680
01681
01682