tokenizer.cpp
Engine/source/core/tokenizer.cpp
Detailed Description
//-----------------------------------------------------------------------------
// Copyright (c) 2012 GarageGames, LLC
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
22//----------------------------------------------------------------------------- 23 24#include "core/tokenizer.h" 25#include "platform/platform.h" 26#include "core/stream/fileStream.h" 27#include "core/strings/stringFunctions.h" 28#include "core/util/safeDelete.h" 29 30Tokenizer::Tokenizer() 31{ 32 dMemset(mFileName, 0, sizeof(mFileName)); 33 34 mpBuffer = NULL; 35 mBufferSize = 0; 36 37 mStartPos = 0; 38 mCurrPos = 0; 39 40 mTokenIsQuoted = false; 41 42 dMemset(mCurrTokenBuffer, 0, sizeof(mCurrTokenBuffer)); 43 mTokenIsCurrent = false; 44 45 mSingleTokens = NULL; 46 47 VECTOR_SET_ASSOCIATION(mLinePositions); 48} 49 50Tokenizer::~Tokenizer() 51{ 52 clear(); 53} 54 55bool Tokenizer::openFile(const char* pFileName) 56{ 57 AssertFatal(mFileName[0] == '\0', "Reuse of Tokenizers not allowed!"); 58 59 FileStream* pStream = new FileStream; 60 if (pStream->open(pFileName, Torque::FS::File::Read) == false) 61 { 62 delete pStream; 63 return false; 64 } 65 dStrcpy(mFileName, pFileName, 1024); 66 67 mBufferSize = pStream->getStreamSize(); 68 mpBuffer = new char[mBufferSize]; 69 pStream->read(mBufferSize, mpBuffer); 70 pStream->close(); 71 delete pStream; 72 73 reset(); 74 75 buildLinePositions(); 76 77 return true; 78} 79 80bool Tokenizer::openFile(Stream* pStream) 81{ 82 mBufferSize = pStream->getStreamSize(); 83 mpBuffer = new char[mBufferSize]; 84 pStream->read(mBufferSize, mpBuffer); 85 86 reset(); 87 88 buildLinePositions(); 89 90 return true; 91} 92 93void Tokenizer::setBuffer(const char* buffer, U32 bufferSize) 94{ 95 if (mpBuffer) 96 { 97 SAFE_DELETE_ARRAY(mpBuffer); 98 mBufferSize = 0; 99 } 100 101 mBufferSize = bufferSize; 102 mpBuffer = new char[mBufferSize + 1]; 103 dStrcpy(mpBuffer, buffer, mBufferSize + 1); 104 105 reset(); 106 107 buildLinePositions(); 108} 109 110void Tokenizer::setSingleTokens(const char* singleTokens) 111{ 112 if (mSingleTokens) 113 SAFE_DELETE(mSingleTokens); 114 115 if (singleTokens) 116 mSingleTokens = dStrdup(singleTokens); 117} 118 
119bool Tokenizer::reset() 120{ 121 mStartPos = 0; 122 mCurrPos = 0; 123 124 mTokenIsQuoted = false; 125 126 dMemset(mCurrTokenBuffer, 0, sizeof(mCurrTokenBuffer)); 127 mTokenIsCurrent = false; 128 129 return true; 130} 131 132bool Tokenizer::clear() 133{ 134 // Delete our buffer 135 if (mpBuffer) 136 SAFE_DELETE_ARRAY(mpBuffer); 137 138 // Reset the buffer size 139 mBufferSize = 0; 140 141 // Reset our active data 142 reset(); 143 144 // Clear our line positions 145 mLinePositions.clear(); 146 147 // Reset our file name 148 dMemset(mFileName, 0, 1024); 149 150 // Wipe the single tokens 151 setSingleTokens(NULL); 152 153 return true; 154} 155 156bool Tokenizer::setCurrentPos(U32 pos) 157{ 158 mCurrPos = pos; 159 mTokenIsCurrent = false; 160 161 return advanceToken(true); 162} 163 164void Tokenizer::buildLinePositions() 165{ 166 if (mBufferSize == 0) 167 return; 168 169 // We can safely assume that the first line is at position 0 170 mLinePositions.push_back(0); 171 172 U32 currPos = 0; 173 while (currPos + 1 < mBufferSize) 174 { 175 // Windows line ending 176 if (mpBuffer[currPos] == '\r' && mpBuffer[currPos + 1] == '\n') 177 { 178 currPos += 2; 179 180 mLinePositions.push_back(currPos); 181 } 182 // Not sure if this ever happens but just in case 183 else if (mpBuffer[currPos] == '\n' && mpBuffer[currPos + 1] == '\r') 184 { 185 currPos += 2; 186 187 mLinePositions.push_back(currPos); 188 } 189 // Unix line endings should only have a single line break character 190 else if (mpBuffer[currPos] == '\n' || mpBuffer[currPos] == '\r') 191 { 192 currPos++; 193 194 mLinePositions.push_back(currPos); 195 } 196 else 197 currPos++; 198 } 199} 200 201U32 Tokenizer::getLinePosition(const U32 pos, U32 lowIndex, S32 highIndex) 202{ 203 // If we have one or less lines then 204 // the result is easy 205 if (mLinePositions.size() <= 1) 206 return 0; 207 208 // Now that we know we have at least one position 209 // we can do a quick test against the last line 210 if (pos >= 
mLinePositions.last()) 211 return mLinePositions.size() - 1; 212 213 // If this is the beginning of the search 214 // set a good starting point (the middle) 215 if (highIndex < 0) 216 highIndex = mLinePositions.size() - 1; 217 218 // Just in case bad values got handed in 219 if (lowIndex > highIndex) 220 lowIndex = highIndex; 221 222 // Compute our test index (middle) 223 U32 testIndex = (lowIndex + highIndex) / 2; 224 225 // Make sure that our test indices are valid 226 if (testIndex >= mLinePositions.size() || 227 testIndex + 1 >= mLinePositions.size()) 228 return mLinePositions.size() - 1; 229 230 // See if we are already at the right line 231 if (pos >= mLinePositions[testIndex] && pos < mLinePositions[testIndex + 1]) 232 return testIndex; 233 234 if (pos < mLinePositions[testIndex]) 235 highIndex = testIndex; 236 else 237 lowIndex = testIndex; 238 239 return getLinePosition(pos, lowIndex, highIndex); 240} 241 242U32 Tokenizer::getCurrentLine() 243{ 244 // Binary search for the line number whose 245 // position is equal to or lower than the 246 // current position 247 return getLinePosition(mStartPos); 248} 249 250U32 Tokenizer::getTokenLineOffset() 251{ 252 U32 lineNumber = getCurrentLine(); 253 254 if (lineNumber >= mLinePositions.size()) 255 return 0; 256 257 U32 linePosition = mLinePositions[lineNumber]; 258 259 if (linePosition >= mStartPos) 260 return 0; 261 262 return mStartPos - linePosition; 263} 264 265bool Tokenizer::advanceToken(const bool crossLine, const bool assertAvail) 266{ 267 if (mTokenIsCurrent == true) 268 { 269 AssertFatal(mCurrTokenBuffer[0] != '\0', "No token, but marked as current?"); 270 mTokenIsCurrent = false; 271 return true; 272 } 273 274 U32 currPosition = 0; 275 mCurrTokenBuffer[0] = '\0'; 276 277 mTokenIsQuoted = false; 278 279 // Store the beginning of the previous advance 280 // and the beginning of the current advance 281 mStartPos = mCurrPos; 282 283 while (mCurrPos < mBufferSize) 284 { 285 char c = mpBuffer[mCurrPos]; 286 
287 bool cont = true; 288 289 if (mSingleTokens && dStrchr(mSingleTokens, c)) 290 { 291 if (currPosition == 0) 292 { 293 mCurrTokenBuffer[currPosition++] = c; 294 mCurrPos++; 295 cont = false; 296 break; 297 } 298 else 299 { 300 // End of token 301 cont = false; 302 } 303 } 304 else 305 { 306 switch (c) 307 { 308 case ' ': 309 case '\t': 310 if (currPosition == 0) 311 { 312 // Token hasn't started yet... 313 mCurrPos++; 314 } 315 else 316 { 317 // End of token 318 mCurrPos++; 319 cont = false; 320 } 321 break; 322 323 case '\r': 324 case '\n': 325 if (crossLine == true) 326 { 327 // Windows line ending 328 if (mpBuffer[mCurrPos] == '\r' && mpBuffer[mCurrPos + 1] == '\n') 329 mCurrPos += 2; 330 // Not sure if this ever happens but just in case 331 else if (mpBuffer[mCurrPos] == '\n' && mpBuffer[mCurrPos + 1] == '\r') 332 mCurrPos += 2; 333 // Unix line endings should only have a single line break character 334 else 335 mCurrPos++; 336 } 337 else 338 { 339 cont = false; 340 break; 341 } 342 break; 343 344 default: 345 if (c == '\"' || c == '\'') 346 { 347 // Quoted token 348 U32 startLine = getCurrentLine(); 349 mCurrPos++; 350 351 // Store the beginning of the token 352 mStartPos = mCurrPos; 353 354 while (mpBuffer[mCurrPos] != c) 355 { 356 AssertISV(mCurrPos < mBufferSize, 357 avar("End of file before quote closed. Quote started: (%s: %d)", 358 getFileName(), startLine)); 359 AssertISV((mpBuffer[mCurrPos] != '\n' && mpBuffer[mCurrPos] != '\r'), 360 avar("End of line reached before end of quote. Quote started: (%s: %d)", 361 getFileName(), startLine)); 362 363 mCurrTokenBuffer[currPosition++] = mpBuffer[mCurrPos++]; 364 } 365 366 mTokenIsQuoted = true; 367 368 mCurrPos++; 369 cont = false; 370 } 371 else if (c == '/' && mpBuffer[mCurrPos+1] == '/') 372 { 373 // Line quote... 
374 if (currPosition == 0) 375 { 376 // continue to end of line, then let crossLine determine on the next pass 377 while (mCurrPos < mBufferSize && (mpBuffer[mCurrPos] != '\n' && mpBuffer[mCurrPos] != '\r')) 378 mCurrPos++; 379 } 380 else 381 { 382 // This is the end of the token. Continue to EOL 383 while (mCurrPos < mBufferSize && (mpBuffer[mCurrPos] != '\n' && mpBuffer[mCurrPos] != '\r')) 384 mCurrPos++; 385 cont = false; 386 } 387 } 388 else if (c == '/' && mpBuffer[mCurrPos+1] == '*') 389 { 390 // Block quote... 391 if (currPosition == 0) 392 { 393 // continue to end of block, then let crossLine determine on the next pass 394 while (mCurrPos < mBufferSize - 1 && (mpBuffer[mCurrPos] != '*' || mpBuffer[mCurrPos + 1] != '/')) 395 mCurrPos++; 396 397 if (mCurrPos < mBufferSize - 1) 398 mCurrPos += 2; 399 } 400 else 401 { 402 // This is the end of the token. Continue to EOL 403 while (mCurrPos < mBufferSize - 1 && (mpBuffer[mCurrPos] != '*' || mpBuffer[mCurrPos + 1] != '/')) 404 mCurrPos++; 405 406 if (mCurrPos < mBufferSize - 1) 407 mCurrPos += 2; 408 409 cont = false; 410 } 411 } 412 else 413 { 414 // If this is the first non-token character then store the 415 // beginning of the token 416 if (currPosition == 0) 417 mStartPos = mCurrPos; 418 419 mCurrTokenBuffer[currPosition++] = c; 420 mCurrPos++; 421 } 422 break; 423 } 424 } 425 426 if (cont == false) 427 break; 428 } 429 430 mCurrTokenBuffer[currPosition] = '\0'; 431 432 if (assertAvail == true) 433 AssertISV(currPosition != 0, avar("Error parsing: %s at or around line: %d", getFileName(), getCurrentLine())); 434 435 if (mCurrPos == mBufferSize) 436 return false; 437 438 return true; 439} 440 441bool Tokenizer::regressToken(const bool crossLine) 442{ 443 if (mTokenIsCurrent == true) 444 { 445 AssertFatal(mCurrTokenBuffer[0] != '\0', "No token, but marked as current?"); 446 mTokenIsCurrent = false; 447 return true; 448 } 449 450 U32 currPosition = 0; 451 mCurrTokenBuffer[0] = '\0'; 452 453 mTokenIsQuoted = 
false; 454 455 // Store the beginning of the previous advance 456 // and the beginning of the current advance 457 mCurrPos = mStartPos; 458 459 // Back up to the first character of the previous token 460 mStartPos--; 461 462 while (mStartPos > 0) 463 { 464 char c = mpBuffer[mStartPos]; 465 466 bool cont = true; 467 468 if (mSingleTokens && dStrchr(mSingleTokens, c)) 469 { 470 if (currPosition == 0) 471 { 472 mCurrTokenBuffer[currPosition++] = c; 473 mStartPos--; 474 cont = false; 475 break; 476 } 477 else 478 { 479 // End of token 480 cont = false; 481 } 482 } 483 else 484 { 485 switch (c) 486 { 487 case ' ': 488 case '\t': 489 if (currPosition == 0) 490 { 491 // Token hasn't started yet... 492 mStartPos--; 493 } 494 else 495 { 496 // End of token 497 mStartPos--; 498 cont = false; 499 } 500 break; 501 502 case '\r': 503 case '\n': 504 if (crossLine == true && currPosition == 0) 505 { 506 // Windows line ending 507 if (mStartPos > 0 && mpBuffer[mStartPos] == '\r' && mpBuffer[mStartPos - 1] == '\n') 508 mStartPos -= 2; 509 // Not sure if this ever happens but just in case 510 else if (mStartPos > 0 && mpBuffer[mStartPos] == '\n' && mpBuffer[mStartPos - 1] == '\r') 511 mStartPos -= 2; 512 // Unix line endings should only have a single line break character 513 else 514 mStartPos--; 515 } 516 else 517 { 518 cont = false; 519 break; 520 } 521 break; 522 523 default: 524 if (c == '\"' || c == '\'') 525 { 526 // Quoted token 527 U32 endLine = getCurrentLine(); 528 mStartPos--; 529 530 while (mpBuffer[mStartPos] != c) 531 { 532 AssertISV(mStartPos < 0, 533 avar("Beginning of file reached before finding begin quote. Quote ended: (%s: %d)", 534 getFileName(), endLine)); 535 536 mCurrTokenBuffer[currPosition++] = mpBuffer[mStartPos--]; 537 } 538 539 mTokenIsQuoted = true; 540 541 mStartPos--; 542 cont = false; 543 } 544 else if (c == '/' && mStartPos > 0 && mpBuffer[mStartPos - 1] == '/') 545 { 546 // Line quote... 
547 // Clear out anything saved already 548 currPosition = 0; 549 550 mStartPos -= 2; 551 } 552 else 553 { 554 mCurrTokenBuffer[currPosition++] = c; 555 mStartPos--; 556 } 557 break; 558 } 559 } 560 561 if (cont == false) 562 break; 563 } 564 565 mCurrTokenBuffer[currPosition] = '\0'; 566 567 // Reveres the token 568 for (U32 i = 0; i < currPosition / 2; i++) 569 { 570 char c = mCurrTokenBuffer[i]; 571 mCurrTokenBuffer[i] = mCurrTokenBuffer[currPosition - i - 1]; 572 mCurrTokenBuffer[currPosition - i - 1] = c; 573 } 574 575 mStartPos++; 576 577 if (mStartPos == mCurrPos) 578 return false; 579 580 return true; 581} 582 583bool Tokenizer::tokenAvailable() 584{ 585 // Note: this implies that when advanceToken(false) fails, it must cap the 586 // token buffer. 587 // 588 return mCurrTokenBuffer[0] != '\0'; 589} 590 591const char* Tokenizer::getToken() const 592{ 593 return mCurrTokenBuffer; 594} 595 596const char* Tokenizer::getNextToken() 597{ 598 advanceToken(true); 599 600 return getToken(); 601} 602 603bool Tokenizer::tokenICmp(const char* pCmp) const 604{ 605 return dStricmp(mCurrTokenBuffer, pCmp) == 0; 606} 607 608bool Tokenizer::findToken(U32 start, const char* pCmp) 609{ 610 // Move to the start 611 setCurrentPos(start); 612 613 // In case the first token is what we are looking for 614 if (tokenICmp(pCmp)) 615 return true; 616 617 // Loop through the file and see if the token exists 618 while (advanceToken(true)) 619 { 620 if (tokenICmp(pCmp)) 621 return true; 622 } 623 624 return false; 625} 626 627bool Tokenizer::findToken(const char* pCmp) 628{ 629 return findToken(0, pCmp); 630} 631 632bool Tokenizer::endOfFile() 633{ 634 if (mCurrPos < mBufferSize) 635 return false; 636 else 637 return true; 638} 639