tokenizer.cpp

Detailed Description

  1
  2//-----------------------------------------------------------------------------
  3// Copyright (c) 2012 GarageGames, LLC
  4//
  5// Permission is hereby granted, free of charge, to any person obtaining a copy
  6// of this software and associated documentation files (the "Software"), to
  7// deal in the Software without restriction, including without limitation the
  8// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  9// sell copies of the Software, and to permit persons to whom the Software is
 10// furnished to do so, subject to the following conditions:
 11//
 12// The above copyright notice and this permission notice shall be included in
 13// all copies or substantial portions of the Software.
 14//
 15// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 18// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 20// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 21// IN THE SOFTWARE.
 22//-----------------------------------------------------------------------------
 23
 24#include "core/tokenizer.h"
 25#include "platform/platform.h"
 26#include "core/stream/fileStream.h"
 27#include "core/strings/stringFunctions.h"
 28#include "core/util/safeDelete.h"
 29
 30Tokenizer::Tokenizer()
 31{
 32   dMemset(mFileName, 0, sizeof(mFileName));
 33
 34   mpBuffer    = NULL;
 35   mBufferSize = 0;
 36
 37   mStartPos   = 0;
 38   mCurrPos    = 0;
 39
 40   mTokenIsQuoted = false;
 41
 42   dMemset(mCurrTokenBuffer, 0, sizeof(mCurrTokenBuffer));
 43   mTokenIsCurrent = false;
 44
 45   mSingleTokens = NULL;
 46
 47   VECTOR_SET_ASSOCIATION(mLinePositions);
 48}
 49
 50Tokenizer::~Tokenizer()
 51{
 52   clear();
 53}
 54
 55bool Tokenizer::openFile(const char* pFileName)
 56{
 57   AssertFatal(mFileName[0] == '\0', "Reuse of Tokenizers not allowed!");
 58
 59   FileStream* pStream = new FileStream;
 60   if (pStream->open(pFileName, Torque::FS::File::Read) == false)
 61   {
 62      delete pStream;
 63      return false;
 64   }
 65   dStrcpy(mFileName, pFileName, 1024);
 66
 67   mBufferSize = pStream->getStreamSize();
 68   mpBuffer    = new char[mBufferSize];
 69   pStream->read(mBufferSize, mpBuffer);
 70   pStream->close();
 71   delete pStream;
 72
 73   reset();
 74
 75   buildLinePositions();
 76
 77   return true;
 78}
 79
 80bool Tokenizer::openFile(Stream* pStream)
 81{
 82   mBufferSize = pStream->getStreamSize();
 83   mpBuffer    = new char[mBufferSize];
 84   pStream->read(mBufferSize, mpBuffer);
 85
 86   reset();
 87
 88   buildLinePositions();
 89
 90   return true;
 91}
 92
 93void Tokenizer::setBuffer(const char* buffer, U32 bufferSize)
 94{
 95   if (mpBuffer)
 96   {
 97      SAFE_DELETE_ARRAY(mpBuffer);
 98      mBufferSize = 0;
 99   }
100
101   mBufferSize = bufferSize;
102   mpBuffer    = new char[mBufferSize + 1];
103   dStrcpy(mpBuffer, buffer, mBufferSize + 1);
104
105   reset();
106
107   buildLinePositions();
108}
109
110void Tokenizer::setSingleTokens(const char* singleTokens)
111{
112   if (mSingleTokens)
113      SAFE_DELETE(mSingleTokens);
114
115   if (singleTokens)
116      mSingleTokens = dStrdup(singleTokens);
117}
118
119bool Tokenizer::reset()
120{
121   mStartPos   = 0;
122   mCurrPos    = 0;
123
124   mTokenIsQuoted = false;
125
126   dMemset(mCurrTokenBuffer, 0, sizeof(mCurrTokenBuffer));
127   mTokenIsCurrent = false;
128
129   return true;
130}
131
132bool Tokenizer::clear()
133{
134   // Delete our buffer
135   if (mpBuffer)
136      SAFE_DELETE_ARRAY(mpBuffer);
137
138   // Reset the buffer size
139   mBufferSize = 0;
140
141   // Reset our active data
142   reset();
143
144   // Clear our line positions
145   mLinePositions.clear();
146
147   // Reset our file name
148   dMemset(mFileName, 0, 1024);
149
150   // Wipe the single tokens
151   setSingleTokens(NULL);
152
153   return true;
154}
155
156bool Tokenizer::setCurrentPos(U32 pos)
157{
158   mCurrPos    = pos;
159   mTokenIsCurrent = false;
160
161   return advanceToken(true);
162}
163
164void Tokenizer::buildLinePositions()
165{
166   if (mBufferSize == 0)
167      return;
168
169   // We can safely assume that the first line is at position 0
170   mLinePositions.push_back(0);
171
172   U32 currPos = 0;
173   while (currPos + 1 < mBufferSize)
174   {
175      // Windows line ending
176      if (mpBuffer[currPos] == '\r' && mpBuffer[currPos + 1] == '\n')
177      {
178         currPos += 2;
179
180         mLinePositions.push_back(currPos);
181      }
182      // Not sure if this ever happens but just in case
183      else if (mpBuffer[currPos] == '\n' && mpBuffer[currPos + 1] == '\r')
184      {
185         currPos += 2;
186
187         mLinePositions.push_back(currPos);
188      }
189      // Unix line endings should only have a single line break character
190      else if (mpBuffer[currPos] == '\n' || mpBuffer[currPos] == '\r')
191      {
192         currPos++;
193
194         mLinePositions.push_back(currPos);
195      }
196      else
197         currPos++;
198   }
199}
200
201U32 Tokenizer::getLinePosition(const U32 pos, U32 lowIndex, S32 highIndex)
202{
203   // If we have one or less lines then
204   // the result is easy
205   if (mLinePositions.size() <= 1)
206      return 0;
207
208   // Now that we know we have at least one position
209   // we can do a quick test against the last line
210   if (pos >= mLinePositions.last())
211      return mLinePositions.size() - 1;
212
213   // If this is the beginning of the search
214   // set a good starting point (the middle)
215   if (highIndex < 0)
216      highIndex = mLinePositions.size() - 1;
217
218   // Just in case bad values got handed in
219   if (lowIndex > highIndex)
220      lowIndex = highIndex;
221
222   // Compute our test index (middle)
223   U32 testIndex = (lowIndex + highIndex) / 2;
224
225   // Make sure that our test indices are valid
226   if (testIndex >= mLinePositions.size() ||
227       testIndex + 1 >= mLinePositions.size())
228      return mLinePositions.size() - 1;
229
230   // See if we are already at the right line
231   if (pos >= mLinePositions[testIndex] && pos < mLinePositions[testIndex + 1])
232      return testIndex;
233
234   if (pos < mLinePositions[testIndex])
235      highIndex = testIndex;
236   else
237      lowIndex = testIndex;
238
239   return getLinePosition(pos, lowIndex, highIndex);
240}
241
242U32 Tokenizer::getCurrentLine()
243{
244   // Binary search for the line number whose
245   // position is equal to or lower than the
246   // current position
247   return getLinePosition(mStartPos);
248}
249
250U32 Tokenizer::getTokenLineOffset()
251{
252   U32 lineNumber = getCurrentLine();
253
254   if (lineNumber >= mLinePositions.size())
255      return 0;
256
257   U32 linePosition = mLinePositions[lineNumber];
258
259   if (linePosition >= mStartPos)
260      return 0;
261
262   return mStartPos - linePosition;
263}
264
265bool Tokenizer::advanceToken(const bool crossLine, const bool assertAvail)
266{
267   if (mTokenIsCurrent == true)
268   {
269      AssertFatal(mCurrTokenBuffer[0] != '\0', "No token, but marked as current?");
270      mTokenIsCurrent = false;
271      return true;
272   }
273
274   U32 currPosition = 0;
275   mCurrTokenBuffer[0] = '\0';
276
277   mTokenIsQuoted = false;
278
279   // Store the beginning of the previous advance
280   // and the beginning of the current advance
281   mStartPos = mCurrPos;
282
283   while (mCurrPos < mBufferSize)
284   {
285      char c = mpBuffer[mCurrPos];
286
287      bool cont = true;
288
289      if (mSingleTokens && dStrchr(mSingleTokens, c))
290      {
291         if (currPosition == 0)
292         {
293            mCurrTokenBuffer[currPosition++] = c;
294            mCurrPos++;
295            cont = false;
296            break;
297         }
298         else
299         {
300            // End of token
301            cont = false;
302         }
303      }
304      else
305      {
306         switch (c)
307         {
308           case ' ':
309           case '\t':
310            if (currPosition == 0)
311            {
312               // Token hasn't started yet...
313               mCurrPos++;
314            }
315            else
316            {
317               // End of token
318               mCurrPos++;
319               cont = false;
320            }
321            break;
322
323           case '\r':
324           case '\n':
325            if (crossLine == true)
326            {
327               // Windows line ending
328               if (mpBuffer[mCurrPos] == '\r' && mpBuffer[mCurrPos + 1] == '\n')
329                  mCurrPos += 2;
330               // Not sure if this ever happens but just in case
331               else if (mpBuffer[mCurrPos] == '\n' && mpBuffer[mCurrPos + 1] == '\r')
332                  mCurrPos += 2;
333               // Unix line endings should only have a single line break character
334               else
335                  mCurrPos++;
336            }
337            else
338            {
339               cont = false;
340               break;
341            }
342            break;
343
344           default:
345            if (c == '\"' || c == '\'')
346            {
347               // Quoted token
348               U32 startLine = getCurrentLine();
349               mCurrPos++;
350
351               // Store the beginning of the token
352               mStartPos = mCurrPos;
353
354               while (mpBuffer[mCurrPos] != c)
355               {
356                  AssertISV(mCurrPos < mBufferSize,
357                            avar("End of file before quote closed.  Quote started: (%s: %d)",
358                                 getFileName(), startLine));
359                  AssertISV((mpBuffer[mCurrPos] != '\n' && mpBuffer[mCurrPos] != '\r'),
360                            avar("End of line reached before end of quote.  Quote started: (%s: %d)",
361                                 getFileName(), startLine));
362
363                  mCurrTokenBuffer[currPosition++] = mpBuffer[mCurrPos++];
364               }
365
366               mTokenIsQuoted = true;
367
368               mCurrPos++;
369               cont = false;
370            }
371            else if (c == '/' && mpBuffer[mCurrPos+1] == '/')
372            {
373               // Line quote...
374               if (currPosition == 0)
375               {
376                  // continue to end of line, then let crossLine determine on the next pass
377                  while (mCurrPos < mBufferSize && (mpBuffer[mCurrPos] != '\n' && mpBuffer[mCurrPos] != '\r'))
378                     mCurrPos++;
379               }
380               else
381               {
382                  // This is the end of the token.  Continue to EOL
383                  while (mCurrPos < mBufferSize && (mpBuffer[mCurrPos] != '\n' && mpBuffer[mCurrPos] != '\r'))
384                     mCurrPos++;
385                  cont = false;
386               }
387            }
388            else if (c == '/' && mpBuffer[mCurrPos+1] == '*')
389            {
390               // Block quote...
391               if (currPosition == 0)
392               {
393                  // continue to end of block, then let crossLine determine on the next pass
394                  while (mCurrPos < mBufferSize - 1 && (mpBuffer[mCurrPos] != '*' || mpBuffer[mCurrPos + 1] != '/'))
395                     mCurrPos++;
396
397                  if (mCurrPos < mBufferSize - 1)
398                     mCurrPos += 2;
399               }
400               else
401               {
402                  // This is the end of the token.  Continue to EOL
403                  while (mCurrPos < mBufferSize - 1 && (mpBuffer[mCurrPos] != '*' || mpBuffer[mCurrPos + 1] != '/'))
404                     mCurrPos++;
405
406                  if (mCurrPos < mBufferSize - 1)
407                     mCurrPos += 2;
408
409                  cont = false;
410               }
411            }
412            else
413            {
414               // If this is the first non-token character then store the
415               // beginning of the token
416               if (currPosition == 0)
417                  mStartPos = mCurrPos;
418
419               mCurrTokenBuffer[currPosition++] = c;
420               mCurrPos++;
421            }
422            break;
423         }
424      }
425
426      if (cont == false)
427         break;
428   }
429
430   mCurrTokenBuffer[currPosition] = '\0';
431
432   if (assertAvail == true)
433      AssertISV(currPosition != 0, avar("Error parsing: %s at or around line: %d", getFileName(), getCurrentLine()));
434
435   if (mCurrPos == mBufferSize)
436      return false;
437
438   return true;
439}
440
441bool Tokenizer::regressToken(const bool crossLine)
442{
443   if (mTokenIsCurrent == true)
444   {
445      AssertFatal(mCurrTokenBuffer[0] != '\0', "No token, but marked as current?");
446      mTokenIsCurrent = false;
447      return true;
448   }
449
450   U32 currPosition = 0;
451   mCurrTokenBuffer[0] = '\0';
452
453   mTokenIsQuoted = false;
454
455   // Store the beginning of the previous advance
456   // and the beginning of the current advance
457   mCurrPos = mStartPos;
458
459   // Back up to the first character of the previous token
460   mStartPos--;
461
462   while (mStartPos > 0)
463   {
464      char c = mpBuffer[mStartPos];
465
466      bool cont = true;
467
468      if (mSingleTokens && dStrchr(mSingleTokens, c))
469      {
470         if (currPosition == 0)
471         {
472            mCurrTokenBuffer[currPosition++] = c;
473            mStartPos--;
474            cont = false;
475            break;
476         }
477         else
478         {
479            // End of token
480            cont = false;
481         }
482      }
483      else
484      {
485         switch (c)
486         {
487           case ' ':
488           case '\t':
489            if (currPosition == 0)
490            {
491               // Token hasn't started yet...
492               mStartPos--;
493            }
494            else
495            {
496               // End of token
497               mStartPos--;
498               cont = false;
499            }
500            break;
501
502           case '\r':
503           case '\n':
504            if (crossLine == true && currPosition == 0)
505            {
506               // Windows line ending
507               if (mStartPos > 0 && mpBuffer[mStartPos] == '\r' && mpBuffer[mStartPos - 1] == '\n')
508                  mStartPos -= 2;
509               // Not sure if this ever happens but just in case
510               else if (mStartPos > 0 && mpBuffer[mStartPos] == '\n' && mpBuffer[mStartPos - 1] == '\r')
511                  mStartPos -= 2;
512               // Unix line endings should only have a single line break character
513               else
514                  mStartPos--;
515            }
516            else
517            {
518               cont = false;
519               break;
520            }
521            break;
522
523           default:
524            if (c == '\"' || c == '\'')
525            {
526               // Quoted token
527               U32 endLine = getCurrentLine();
528               mStartPos--;
529
530               while (mpBuffer[mStartPos] != c)
531               {
532                  AssertISV(mStartPos < 0,
533                            avar("Beginning of file reached before finding begin quote.  Quote ended: (%s: %d)",
534                                 getFileName(), endLine));
535
536                  mCurrTokenBuffer[currPosition++] = mpBuffer[mStartPos--];
537               }
538
539               mTokenIsQuoted = true;
540
541               mStartPos--;
542               cont = false;
543            }
544            else if (c == '/' && mStartPos > 0 && mpBuffer[mStartPos - 1] == '/')
545            {
546               // Line quote...
547               // Clear out anything saved already
548               currPosition = 0;
549
550               mStartPos -= 2;
551            }
552            else
553            {
554               mCurrTokenBuffer[currPosition++] = c;
555               mStartPos--;
556            }
557            break;
558         }
559      }
560
561      if (cont == false)
562         break;
563   }
564
565   mCurrTokenBuffer[currPosition] = '\0';
566
567   // Reveres the token
568   for (U32 i = 0; i < currPosition / 2; i++)
569   {
570      char c = mCurrTokenBuffer[i];
571      mCurrTokenBuffer[i] = mCurrTokenBuffer[currPosition - i - 1];
572      mCurrTokenBuffer[currPosition - i - 1] = c;
573   }
574
575   mStartPos++;
576
577   if (mStartPos == mCurrPos)
578      return false;
579
580   return true;
581}
582
583bool Tokenizer::tokenAvailable()
584{
585   // Note: this implies that when advanceToken(false) fails, it must cap the
586   //        token buffer.
587   //
588   return mCurrTokenBuffer[0] != '\0';
589}
590
591const char* Tokenizer::getToken() const
592{
593   return mCurrTokenBuffer;
594}
595
596const char* Tokenizer::getNextToken()
597{
598   advanceToken(true);
599
600   return getToken();
601}
602
603bool Tokenizer::tokenICmp(const char* pCmp) const
604{
605   return dStricmp(mCurrTokenBuffer, pCmp) == 0;
606}
607
608bool Tokenizer::findToken(U32 start, const char* pCmp)
609{
610   // Move to the start
611   setCurrentPos(start);
612
613   // In case the first token is what we are looking for
614   if (tokenICmp(pCmp))
615      return true;
616
617   // Loop through the file and see if the token exists
618   while (advanceToken(true))
619   {
620      if (tokenICmp(pCmp))
621         return true;
622   }
623
624   return false;
625}
626
627bool Tokenizer::findToken(const char* pCmp)
628{
629   return findToken(0, pCmp);
630}
631
632bool Tokenizer::endOfFile()
633{
634   if (mCurrPos < mBufferSize)
635      return false;
636   else
637      return true;
638}
639