VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/script/scriptlex.cpp@ 108014

Last change on this file since 108014 was 108014, checked in by vboxsync, 4 weeks ago

Runtime/common/script/scriptlex.cpp: Fix C string literal scanning and add some helper APIs to create tokens for errors and identifiers, bugref:10733

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 37.7 KB
Line 
1/* $Id: scriptlex.cpp 108014 2025-02-01 19:20:09Z vboxsync $ */
2/** @file
3 * IPRT - RTScript* lexer API.
4 */
5
6/*
7 * Copyright (C) 2022-2024 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * The contents of this file may alternatively be used under the terms
26 * of the Common Development and Distribution License Version 1.0
27 * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
28 * in the VirtualBox distribution, in which case the provisions of the
29 * CDDL are applicable instead of those of the GPL.
30 *
31 * You may elect to license modified versions of this file under the
32 * terms and conditions of either the GPL or the CDDL or both.
33 *
34 * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
35 */
36
37
38/*********************************************************************************************************************************
39* Header Files *
40*********************************************************************************************************************************/
41#define LOG_GROUP RTLOGGROUP_DEFAULT /// @todo
42#include <iprt/script.h>
43
44#include <iprt/assert.h>
45#include <iprt/ctype.h>
46#include <iprt/err.h>
47#include <iprt/file.h>
48#include <iprt/log.h>
49#include <iprt/mem.h>
50#include <iprt/string.h>
51
52
53/*********************************************************************************************************************************
54* Structures and Typedefs *
55*********************************************************************************************************************************/
56
57/**
58 * Internal lexer state.
59 */
60typedef struct RTSCRIPTLEXINT
61{
62 /** Magic. */
63 uint32_t u32Magic;
64 /** Source position. */
65 RTSCRIPTPOS Pos;
66 /** Current and next token buffer. */
67 RTSCRIPTLEXTOKEN aToks[2];
68 /** Pointer to the current token. */
69 PRTSCRIPTLEXTOKEN pTokCur;
70 /** Pointer to the next token. */
71 PRTSCRIPTLEXTOKEN pTokNext;
72 /** The lexer config. */
73 PCRTSCRIPTLEXCFG pCfg;
74 /** The input reader. */
75 PFNRTSCRIPTLEXRDR pfnReader;
76 /** The destructor callback. */
77 PFNRTSCRIPTLEXDTOR pfnDtor;
78 /** Opaque user data for the reader. */
79 void *pvUser;
80 /** Identifier string cache. */
81 RTSTRCACHE hStrCacheId;
82 /** String literal string cache. */
83 RTSTRCACHE hStrCacheStringLit;
84 /** Status code from the reader. */
85 int rcRdr;
86 /** Internal error info. */
87 RTERRINFOSTATIC ErrInfo;
88 /** Lexer flags. */
89 uint32_t fFlags;
90 /** Maximum numebr of bytes allocated for temporary storage for literal strings. */
91 size_t cchStrLitMax;
92 /** Pointer to the string buffer for holding the literal string. */
93 char *pszStrLit;
94 /** Pointer to the current input character. */
95 const char *pchCur;
96 /** Offset to start reading the next chunk from. */
97 size_t offBufRead;
98 /** Size of the input buffer. */
99 size_t cchBuf;
100 /** The cached part of the input, variable in size. */
101 char achBuf[1];
102} RTSCRIPTLEXINT;
103/** Pointer to the internal lexer state. */
104typedef RTSCRIPTLEXINT *PRTSCRIPTLEXINT;
105
106
/** @name RTSCRIPT_LEX_INT_F_XXX - Internal lexer state flags.
 * @{ */
/** The identifier string cache is destroyed together with the lexer. */
#define RTSCRIPT_LEX_INT_F_STR_CACHE_ID_FREE            RT_BIT_32(0)
/** The string literal string cache is destroyed together with the lexer. */
#define RTSCRIPT_LEX_INT_F_STR_CACHE_STR_LIT_FREE       RT_BIT_32(1)
/** The reader signalled the end of the input stream. */
#define RTSCRIPT_LEX_INT_F_EOS                          RT_BIT_32(2)
/** @} */
113
114
115/*********************************************************************************************************************************
116* Global Variables *
117*********************************************************************************************************************************/
118
/** Whitespace characters used when the config doesn't supply its own set. */
static const char *g_szWsDef = " \t";

/** Newline sequences used as the default, NULL terminated. */
static const char *g_aszNlDef[] =
{
    "\n",
    "\r\n",
    NULL
};

/** Characters which may appear in an identifier by default. */
static const char *g_aszIdeCharSetDef = "_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
130
131
132/*********************************************************************************************************************************
133* Internal Functions *
134*********************************************************************************************************************************/
135
136
137/**
138 * Locates the given character in the string, consuming it if found.
139 *
140 * @returns Flag whether the character was found in the string.
141 * @param pThis The lexer state.
142 * @param ch The character to check for.
143 * @param psz The string to check.
144 */
145DECLINLINE(bool) rtScriptLexLocateChInStrConsume(PRTSCRIPTLEXINT pThis, char ch, const char *psz)
146{
147 while ( *psz != '\0'
148 && *psz != ch)
149 psz++;
150
151 if (*psz != '\0')
152 RTScriptLexConsumeCh(pThis);
153
154 return *psz != '\0';
155}
156
157
158/**
159 * Matches the input against the given string starting with the given character, consuming it
160 * if found.
161 *
162 * @returns Flag whether there was a match.
163 * @param pThis The lexer state.
164 * @param ch The character to check start matching.
165 * @param psz The string to match against.
166 * @param pszExclude When the string matched but the input continues
167 * with one of the characters in this string there will
168 * be no match.
169 */
170DECLINLINE(bool) rtScriptLexMatchStrConsume(PRTSCRIPTLEXINT pThis, char ch, const char *psz,
171 const char *pszExclude)
172{
173 bool fMatch = false;
174 if (*psz == ch)
175 {
176 unsigned offPeek = 1;
177
178 psz++;
179 while ( *psz != '\0'
180 && *psz == RTScriptLexPeekCh(pThis, offPeek))
181 {
182 offPeek++;
183 psz++;
184 }
185
186 if (*psz == '\0')
187 {
188 if (pszExclude)
189 {
190 ch = RTScriptLexPeekCh(pThis, offPeek);
191 fMatch = strchr(pszExclude, ch) == NULL;
192 }
193 else
194 fMatch = true;
195 }
196
197 if (fMatch)
198 {
199 /* Match, consume everything. */
200 while (offPeek-- > 0)
201 RTScriptLexConsumeCh(pThis);
202 }
203 }
204
205 return fMatch;
206}
207
208
209/**
210 * Tries to locate a string with the given starting character (+ peeking ahead) in the
211 * given string array (exact match) and consumes the entire substring.
212 *
213 * @returns Flag whether there was a match.
214 * @param pThis The lexer state.
215 * @param ch The character to check for.
216 * @param papsz Pointer to the string array to check for.
217 * @param pidx Where to store the index of the matching substring if found,
218 * optional.
219 */
220DECLINLINE(bool) rtScriptLexLocateSubStrInStrArrayMatchConsume(PRTSCRIPTLEXINT pThis, char ch,
221 const char **papsz, unsigned *pidx)
222{
223 unsigned int idx = 0;
224
225 while ( papsz[idx] != NULL
226 && !rtScriptLexMatchStrConsume(pThis, ch, papsz[idx], NULL))
227 idx++;
228
229 if ( papsz[idx] != NULL
230 && pidx)
231 *pidx = idx;
232
233 return papsz[idx] != NULL;
234}
235
236
237/**
238 * Tries to get an exact match starting with the given character, consuming it when found.
239 *
240 * @returns Flag whether there was a match.
241 * @param pThis The lexer state.
242 * @param ch The character to check for.
243 * @param ppMatch Where to store the exact match on success.
244 */
245DECLINLINE(bool) rtScriptLexLocateExactMatchConsume(PRTSCRIPTLEXINT pThis, char ch, PCRTSCRIPTLEXTOKMATCH *ppMatch)
246{
247 PCRTSCRIPTLEXTOKMATCH pTokMatch = pThis->pCfg->paTokMatches;
248
249 if (pTokMatch)
250 {
251 while ( pTokMatch->pszMatch != NULL
252 && !rtScriptLexMatchStrConsume(pThis, ch, pTokMatch->pszMatch,
253 pTokMatch->fMaybeIdentifier
254 ? g_aszIdeCharSetDef
255 : NULL))
256 pTokMatch++;
257
258 if (pTokMatch->pszMatch != NULL)
259 {
260 *ppMatch = pTokMatch;
261 return true;
262 }
263 }
264
265 return false;
266}
267
268
269DECLINLINE(bool) rtScriptLexIsNewlineConsume(PRTSCRIPTLEXINT pThis, char ch)
270{
271 const char **papszNl = pThis->pCfg->pszWhitespace ? pThis->pCfg->papszNewline : g_aszNlDef;
272
273 bool fMatched = rtScriptLexLocateSubStrInStrArrayMatchConsume(pThis, ch, papszNl, NULL);
274 if (fMatched)
275 {
276 pThis->Pos.iLine++;
277 pThis->Pos.iCh = 1;
278 }
279
280 return fMatched;
281}
282
283
284/**
285 * Checks whether the character is the beginning of a multi line comment, skipping the whole
286 * comment if necessary.
287 *
288 * @returns Flag whether a multi line comment was detected and consumed.
289 * @param hScriptLex The lexer state.
290 * @param ch The character to check for.
291 */
292DECLINLINE(bool) rtScriptLexIsMultiLineCommentConsume(PRTSCRIPTLEXINT pThis, char ch)
293{
294 const char **papszCommentMultiStart = pThis->pCfg->papszCommentMultiStart;
295 unsigned idxComment = 0;
296
297 if ( papszCommentMultiStart
298 && rtScriptLexLocateSubStrInStrArrayMatchConsume(pThis, ch, papszCommentMultiStart,
299 &idxComment))
300 {
301 /* Look for the matching closing lexeme in the input consuming everything along the way. */
302 const char *pszClosing = pThis->pCfg->papszCommentMultiEnd[idxComment];
303
304 for (;;)
305 {
306 char chTmp = RTScriptLexGetCh(pThis);
307
308 /* Check for new lines explicetly to advance the position information. */
309 if (rtScriptLexIsNewlineConsume(pThis, chTmp))
310 continue;
311
312 /** @todo Not quite correct when there is an end of stream before the closing lexeme.
313 * But doesn't hurt at the moment. */
314 if ( chTmp == '\0'
315 || rtScriptLexMatchStrConsume(pThis, chTmp, pszClosing, NULL))
316 break;
317
318 RTScriptLexConsumeCh(pThis);
319 }
320
321 return true;
322 }
323
324 return false;
325}
326
327
328/**
329 * Checks whether the character is the beginning of a single line comment, skipping the whole
330 * comment if necessary.
331 *
332 * @returns Flag whether a single line comment was detected and consumed.
333 * @param hScriptLex The lexer state.
334 * @param ch The character to check for.
335 */
336DECLINLINE(bool) rtScriptLexIsSingleLineCommentConsume(PRTSCRIPTLEXINT pThis, char ch)
337{
338 const char **papszCommentSingleStart = pThis->pCfg->papszCommentSingleStart;
339
340 if ( papszCommentSingleStart
341 && rtScriptLexLocateSubStrInStrArrayMatchConsume(pThis, ch, papszCommentSingleStart,
342 NULL))
343 {
344 for (;;)
345 {
346 char chTmp = RTScriptLexGetCh(pThis);
347
348 if ( chTmp == '\0'
349 || rtScriptLexIsNewlineConsume(pThis, chTmp))
350 break;
351
352 RTScriptLexConsumeCh(pThis);
353 }
354
355 return true;
356 }
357
358 return false;
359}
360
361
362/**
363 * Fills the input buffer with source data.
364 *
365 * @returns IPRT status code.
366 * @param pThis The lexer state.
367 */
368static int rtScriptLexFillBuffer(PRTSCRIPTLEXINT pThis)
369{
370 int rc = VINF_SUCCESS;
371 size_t cchToRead = pThis->cchBuf;
372 char *pchRead = &pThis->achBuf[0];
373
374 AssertReturn(!(pThis->fFlags & RTSCRIPT_LEX_INT_F_EOS), VERR_INVALID_STATE);
375
376 /* If there is input left to process move it to the front and fill the remainder. */
377 if (pThis->pchCur != NULL)
378 {
379 cchToRead = pThis->pchCur - &pThis->achBuf[0];
380 /* Move the rest to the front. */
381 memmove(&pThis->achBuf[0], pThis->pchCur, pThis->cchBuf - cchToRead);
382 pchRead = (char *)pThis->pchCur + 1;
383 memset(pchRead, 0, cchToRead);
384 }
385
386 if (cchToRead)
387 {
388 pThis->pchCur = &pThis->achBuf[0];
389
390 size_t cchRead = 0;
391 rc = pThis->pfnReader(pThis, pThis->offBufRead, pchRead, cchToRead, &cchRead, pThis->pvUser);
392 if (RT_SUCCESS(rc))
393 {
394 pThis->offBufRead += cchRead;
395 if (rc == VINF_EOF)
396 pThis->fFlags |= RTSCRIPT_LEX_INT_F_EOS;
397 rc = VINF_SUCCESS;
398 }
399 else
400 pThis->rcRdr = rc;
401 }
402 else
403 rc = VERR_BUFFER_OVERFLOW; /** @todo */
404
405 return rc;
406}
407
408
409/**
410 * Produce an end of stream token.
411 *
412 * @returns nothing.
413 * @param pThis The lexer state.
414 * @param pTok The token to fill.
415 */
416static void rtScriptLexProduceTokEos(PRTSCRIPTLEXINT pThis, PRTSCRIPTLEXTOKEN pTok)
417{
418 pTok->enmType = RTSCRIPTLEXTOKTYPE_EOS;
419 pTok->PosStart = pThis->Pos;
420 pTok->PosEnd = pThis->Pos;
421}
422
423
424RTDECL(int) RTScriptLexProduceTokError(RTSCRIPTLEX hScriptLex, PRTSCRIPTLEXTOKEN pTok,
425 int rc, const char *pszMsg, ...)
426{
427 PRTSCRIPTLEXINT pThis = hScriptLex;
428
429 va_list va;
430 va_start(va, pszMsg);
431
432 pTok->enmType = RTSCRIPTLEXTOKTYPE_ERROR;
433 pTok->PosEnd = pThis->Pos;
434 pTok->Type.Error.pErr = &pThis->ErrInfo.Core;
435
436 RTErrInfoInitStatic(&pThis->ErrInfo);
437 RTErrInfoSetV(&pThis->ErrInfo.Core, rc, pszMsg, va);
438 va_end(va);
439
440 return rc;
441}
442
443
444RTDECL(int) RTScriptLexProduceTokIde(RTSCRIPTLEX hScriptLex, PRTSCRIPTLEXTOKEN pTok, const char *pszIde, size_t cchIde)
445{
446 PRTSCRIPTLEXINT pThis = hScriptLex;
447
448 /* Insert into string cache. */
449 pTok->enmType = RTSCRIPTLEXTOKTYPE_IDENTIFIER;
450 pTok->Type.Id.pszIde = RTStrCacheEnterN(pThis->hStrCacheId, pszIde, cchIde);
451 if (RT_UNLIKELY(!pTok->Type.Id.pszIde))
452 return RTScriptLexProduceTokError(hScriptLex, pTok, VERR_NO_STR_MEMORY, "Lexer: Out of memory inserting identifier into string cache");
453
454 pTok->PosEnd = pThis->Pos;
455 return VINF_SUCCESS;
456}
457
458
459/**
460 * Create the token from the exact match.
461 *
462 * @returns nothing.
463 * @param pThis The lexer state.
464 * @param pTok The token to fill.
465 * @param pMatch The matched string.
466 */
467static void rtScriptLexProduceTokFromExactMatch(PRTSCRIPTLEXINT pThis, PRTSCRIPTLEXTOKEN pTok,
468 PCRTSCRIPTLEXTOKMATCH pMatch)
469{
470 pTok->enmType = pMatch->enmTokType;
471 pTok->PosEnd = pThis->Pos;
472
473 switch (pTok->enmType)
474 {
475 case RTSCRIPTLEXTOKTYPE_OPERATOR:
476 pTok->Type.Operator.pOp = pMatch;
477 break;
478 case RTSCRIPTLEXTOKTYPE_KEYWORD:
479 pTok->Type.Keyword.pKeyword = pMatch;
480 break;
481 case RTSCRIPTLEXTOKTYPE_PUNCTUATOR:
482 pTok->Type.Punctuator.pPunctuator = pMatch;
483 break;
484 default:
485 RTScriptLexProduceTokError(pThis, pTok, VERR_INVALID_PARAMETER,
486 "Lexer: The match contains an invalid token type: %d\n",
487 pTok->enmType);
488 }
489}
490
491
492/**
493 * Goes through the rules trying to find a matching one.
494 *
495 * @returns Flag whether a matching rule was found.
496 * @param pThis The lexer state.
497 * @param ch The character to check.
498 * @param pTok The token to fill.
499 */
500static bool rtScriptLexProduceTokFromRules(PRTSCRIPTLEXINT pThis, char ch, PRTSCRIPTLEXTOKEN pTok)
501{
502 PCRTSCRIPTLEXRULE pRule = pThis->pCfg->paRules;
503
504 if (pRule)
505 {
506 while (pRule->pfnProd != NULL)
507 {
508 if ( ch >= pRule->chStart
509 && ch <= pRule->chEnd)
510 {
511 if (pRule->fFlags & RTSCRIPT_LEX_RULE_CONSUME)
512 RTScriptLexConsumeCh(pThis);
513 int rc = pRule->pfnProd(pThis, ch, pTok, pRule->pvUser);
514 AssertRC(rc);
515 return true;
516 }
517
518 pRule++;
519 }
520 }
521
522 return false;
523}
524
525
526/**
527 * Fills in the given token from the scanned input at the current location.
528 *
529 * @returns IPRT status code.
530 * @param pThis The lexer state.
531 * @param pTok The token to fill.
532 */
533static int rtScriptLexProduceToken(PRTSCRIPTLEXINT pThis, PRTSCRIPTLEXTOKEN pTok)
534{
535 RTScriptLexSkipWhitespace(pThis);
536
537 pTok->PosStart = pThis->Pos;
538
539 char ch = RTScriptLexGetCh(pThis);
540 PCRTSCRIPTLEXTOKMATCH pMatch = NULL;
541 if (ch == '\0')
542 rtScriptLexProduceTokEos(pThis, pTok);
543 else if (rtScriptLexLocateExactMatchConsume(pThis, ch, &pMatch))
544 rtScriptLexProduceTokFromExactMatch(pThis, pTok, pMatch);
545 else if (!rtScriptLexProduceTokFromRules(pThis, ch, pTok))
546 {
547 if (pThis->pCfg->pfnProdDef)
548 pThis->rcRdr = pThis->pCfg->pfnProdDef(pThis, ch, pTok, pThis->pCfg->pvProdDefUser);
549 else
550 RTScriptLexProduceTokError(pThis, pTok, VERR_INVALID_PARAMETER,
551 "Lexer: Invalid character found in input: %c\n",
552 ch);
553 }
554
555 return pThis->rcRdr;
556}
557
558
559/**
560 * Populates the lexer for the initial use.
561 *
562 * @returns IPRT status code.
563 * @param pThis The lexer state.
564 */
565static int rtScriptLexPopulate(PRTSCRIPTLEXINT pThis)
566{
567 int rc = rtScriptLexFillBuffer(pThis);
568 if (RT_SUCCESS(rc))
569 {
570 rc = rtScriptLexProduceToken(pThis, pThis->pTokCur);
571 if (RT_SUCCESS(rc))
572 rc = rtScriptLexProduceToken(pThis, pThis->pTokNext);
573 }
574
575 return rc;
576}
577
578
579
580RTDECL(int) RTScriptLexCreateFromReader(PRTSCRIPTLEX phScriptLex, PFNRTSCRIPTLEXRDR pfnReader,
581 PFNRTSCRIPTLEXDTOR pfnDtor, void *pvUser,
582 size_t cchBuf, PRTSTRCACHE phStrCacheId, PRTSTRCACHE phStrCacheStringLit,
583 PCRTSCRIPTLEXCFG pCfg)
584{
585 AssertPtrReturn(phScriptLex, VERR_INVALID_POINTER);
586 AssertPtrReturn(pfnReader, VERR_INVALID_POINTER);
587 AssertPtrReturn(pCfg, VERR_INVALID_POINTER);
588
589 if (!cchBuf)
590 cchBuf = _16K;
591 int rc = VINF_SUCCESS;
592 PRTSCRIPTLEXINT pThis = (PRTSCRIPTLEXINT)RTMemAllocZ(RT_UOFFSETOF_DYN(RTSCRIPTLEXINT, achBuf[cchBuf]));
593 if (RT_LIKELY(pThis))
594 {
595 pThis->u32Magic = 0xfefecafe; /** @todo */
596 pThis->Pos.iLine = 1;
597 pThis->Pos.iCh = 1;
598 pThis->pTokCur = &pThis->aToks[0];
599 pThis->pTokNext = &pThis->aToks[1];
600 pThis->pCfg = pCfg;
601 pThis->pfnReader = pfnReader;
602 pThis->pfnDtor = pfnDtor;
603 pThis->pvUser = pvUser;
604 pThis->fFlags = 0;
605 pThis->cchStrLitMax = 0;
606 pThis->pszStrLit = NULL;
607 pThis->cchBuf = cchBuf;
608 pThis->offBufRead = 0;
609 pThis->pchCur = NULL;
610 pThis->hStrCacheId = NULL;
611 pThis->hStrCacheStringLit = NULL;
612
613 rc = RTStrCacheCreate(&pThis->hStrCacheId, "LEX-Ide");
614 if (RT_SUCCESS(rc))
615 {
616 rc = RTStrCacheCreate(&pThis->hStrCacheStringLit, "LEX-StrLit");
617 if (RT_SUCCESS(rc))
618 {
619 rc = rtScriptLexPopulate(pThis);
620 if (RT_SUCCESS(rc))
621 {
622 *phScriptLex = pThis;
623
624 if (phStrCacheId)
625 *phStrCacheId = pThis->hStrCacheId;
626 else
627 pThis->fFlags |= RTSCRIPT_LEX_INT_F_STR_CACHE_ID_FREE;
628
629 if (phStrCacheStringLit)
630 *phStrCacheStringLit = pThis->hStrCacheStringLit;
631 else
632 pThis->fFlags |= RTSCRIPT_LEX_INT_F_STR_CACHE_STR_LIT_FREE;
633
634 return VINF_SUCCESS;
635 }
636
637 RTStrCacheDestroy(pThis->hStrCacheStringLit);
638 }
639
640 RTStrCacheDestroy(pThis->hStrCacheId);
641 }
642
643 RTMemFree(pThis);
644 }
645 else
646 rc = VERR_NO_MEMORY;
647
648 return rc;
649}
650
651
652/**
653 * @callback_method_impl{FNRTSCRIPTLEXRDR, Worker to read from a string.}
654 */
655static DECLCALLBACK(int) rtScriptLexReaderStr(RTSCRIPTLEX hScriptLex, size_t offBuf, char *pchCur,
656 size_t cchBuf, size_t *pcchRead, void *pvUser)
657{
658 RT_NOREF(hScriptLex);
659
660 const char *psz = (const char *)pvUser;
661 size_t cch = strlen(psz);
662 size_t cchCopy = RT_MIN(cchBuf, cch - offBuf);
663 int rc = VINF_SUCCESS;
664
665 *pcchRead = cchCopy;
666
667 if (cchCopy)
668 memcpy(pchCur, &psz[offBuf], cchCopy * sizeof(char));
669 else
670 rc = VINF_EOF;
671
672 return rc;
673}
674
675
676RTDECL(int) RTScriptLexCreateFromString(PRTSCRIPTLEX phScriptLex, const char *pszSrc, PRTSTRCACHE phStrCacheId,
677 PRTSTRCACHE phStrCacheStringLit, PCRTSCRIPTLEXCFG pCfg)
678{
679 return RTScriptLexCreateFromReader(phScriptLex, rtScriptLexReaderStr, NULL, (void *)pszSrc, 0,
680 phStrCacheId, phStrCacheStringLit, pCfg);
681}
682
683
684/**
685 * @callback_method_impl{FNRTSCRIPTLEXRDR, Worker to read from a file.}
686 */
687static DECLCALLBACK(int) rtScriptLexReaderFile(RTSCRIPTLEX hScriptLex, size_t offBuf, char *pchCur,
688 size_t cchBuf, size_t *pcchRead, void *pvUser)
689{
690 RT_NOREF(hScriptLex);
691
692 RTFILE hFile = (RTFILE)pvUser;
693 return RTFileReadAt(hFile, offBuf, pchCur, cchBuf, pcchRead);
694}
695
696
697/**
698 * @callback_method_impl{FNRTSCRIPTLEXDTOR, Destructor for the file variant.}
699 */
700static DECLCALLBACK(void) rtScriptLexDtorFile(RTSCRIPTLEX hScriptLex, void *pvUser)
701{
702 RT_NOREF(hScriptLex);
703
704 RTFILE hFile = (RTFILE)pvUser;
705 RTFileClose(hFile);
706}
707
708
709RTDECL(int) RTScriptLexCreateFromFile(PRTSCRIPTLEX phScriptLex, const char *pszFilename, PRTSTRCACHE phStrCacheId,
710 PRTSTRCACHE phStrCacheStringLit, PCRTSCRIPTLEXCFG pCfg)
711{
712 RTFILE hFile;
713 int rc = RTFileOpen(&hFile, pszFilename, RTFILE_O_READ | RTFILE_O_DENY_WRITE | RTFILE_O_OPEN);
714 if (RT_SUCCESS(rc))
715 {
716 rc = RTScriptLexCreateFromReader(phScriptLex, rtScriptLexReaderFile, rtScriptLexDtorFile, (void *)hFile, 0,
717 phStrCacheId, phStrCacheStringLit, pCfg);
718 if (RT_FAILURE(rc))
719 RTFileClose(hFile);
720 }
721
722 return rc;
723}
724
725
726RTDECL(void) RTScriptLexDestroy(RTSCRIPTLEX hScriptLex)
727{
728 PRTSCRIPTLEXINT pThis = hScriptLex;
729 AssertPtrReturnVoid(pThis);
730
731 if (pThis->pfnDtor)
732 pThis->pfnDtor(pThis, pThis->pvUser);
733
734 if (pThis->fFlags & RTSCRIPT_LEX_INT_F_STR_CACHE_ID_FREE)
735 RTStrCacheDestroy(pThis->hStrCacheId);
736 if (pThis->fFlags & RTSCRIPT_LEX_INT_F_STR_CACHE_STR_LIT_FREE)
737 RTStrCacheDestroy(pThis->hStrCacheStringLit);
738
739 if (pThis->pszStrLit)
740 RTStrFree(pThis->pszStrLit);
741
742 RTMemFree(pThis);
743}
744
745
746RTDECL(int) RTScriptLexQueryToken(RTSCRIPTLEX hScriptLex, PCRTSCRIPTLEXTOKEN *ppToken)
747{
748 PRTSCRIPTLEXINT pThis = hScriptLex;
749 AssertPtrReturn(pThis, VERR_INVALID_HANDLE);
750 AssertPtrReturn(ppToken, VERR_INVALID_POINTER);
751
752 if (RT_SUCCESS(pThis->rcRdr))
753 *ppToken = pThis->pTokCur;
754
755 return pThis->rcRdr;
756}
757
758
759RTDECL(RTSCRIPTLEXTOKTYPE) RTScriptLexGetTokenType(RTSCRIPTLEX hScriptLex)
760{
761 PRTSCRIPTLEXINT pThis = hScriptLex;
762 AssertPtrReturn(pThis, RTSCRIPTLEXTOKTYPE_INVALID);
763
764 if (RT_SUCCESS(pThis->rcRdr))
765 return pThis->pTokCur->enmType;
766
767 return RTSCRIPTLEXTOKTYPE_INVALID;
768}
769
770
771RTDECL(RTSCRIPTLEXTOKTYPE) RTScriptLexPeekNextTokenType(RTSCRIPTLEX hScriptLex)
772{
773 PRTSCRIPTLEXINT pThis = hScriptLex;
774 AssertPtrReturn(pThis, RTSCRIPTLEXTOKTYPE_INVALID);
775
776 if (RT_SUCCESS(pThis->rcRdr))
777 return pThis->pTokNext->enmType;
778
779 return RTSCRIPTLEXTOKTYPE_INVALID;
780}
781
782
783RTDECL(PCRTSCRIPTLEXTOKEN) RTScriptLexConsumeToken(RTSCRIPTLEX hScriptLex)
784{
785 PRTSCRIPTLEXINT pThis = hScriptLex;
786 AssertPtrReturn(pThis, NULL);
787
788 /*
789 * Stop token production as soon as the current token indicates the
790 * end of the stream or an error
791 */
792 if ( pThis->pTokCur->enmType != RTSCRIPTLEXTOKTYPE_EOS
793 && pThis->pTokCur->enmType != RTSCRIPTLEXTOKTYPE_ERROR)
794 {
795 PRTSCRIPTLEXTOKEN pTokTmp = pThis->pTokCur;
796
797 /* Switch next token to current token and read in the next token. */
798 pThis->pTokCur = pThis->pTokNext;
799 pThis->pTokNext = pTokTmp;
800 if ( pThis->pTokCur->enmType != RTSCRIPTLEXTOKTYPE_EOS
801 && pThis->pTokCur->enmType != RTSCRIPTLEXTOKTYPE_ERROR)
802 rtScriptLexProduceToken(pThis, pThis->pTokNext);
803 else
804 pThis->pTokNext = pThis->pTokCur;
805 }
806
807 return pThis->pTokCur;
808}
809
810
811RTDECL(char) RTScriptLexConsumeCh(RTSCRIPTLEX hScriptLex)
812{
813 return RTScriptLexConsumeChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_DEFAULT);
814}
815
816
817RTDECL(char) RTScriptLexConsumeChEx(RTSCRIPTLEX hScriptLex, uint32_t fFlags)
818{
819 PRTSCRIPTLEXINT pThis = hScriptLex;
820 AssertPtrReturn(pThis, '\0');
821
822 pThis->pchCur++;
823 pThis->Pos.iCh++;
824 if (pThis->pchCur == &pThis->achBuf[pThis->cchBuf])
825 rtScriptLexFillBuffer(pThis);
826
827 return RTScriptLexGetChEx(pThis, fFlags);
828}
829
830
831RTDECL(char) RTScriptLexPeekCh(RTSCRIPTLEX hScriptLex, unsigned idx)
832{
833 return RTScriptLexPeekChEx(hScriptLex, idx, RTSCRIPT_LEX_CONV_F_DEFAULT);
834}
835
836
837RTDECL(char) RTScriptLexPeekChEx(RTSCRIPTLEX hScriptLex, unsigned idx, uint32_t fFlags)
838{
839 PRTSCRIPTLEXINT pThis = hScriptLex;
840 AssertPtrReturn(pThis, '\0');
841
842 /* Just return the character if it is in the current buffer. */
843 char ch = '\0';
844 if (RT_LIKELY(pThis->pchCur + idx < &pThis->achBuf[pThis->cchBuf]))
845 ch = pThis->pchCur[idx];
846 else
847 {
848 /* Slow path, read data into temporary buffer to read character from and dismiss. */
849 /** @todo */
850 AssertReleaseFailed();
851 }
852
853 if ( (pThis->pCfg->fFlags & RTSCRIPT_LEX_CFG_F_CASE_INSENSITIVE)
854 && !(fFlags & RTSCRIPT_LEX_CONV_F_NOTHING))
855 ch = RT_C_TO_LOWER(ch);
856
857 return ch;
858}
859
860
861RTDECL(char) RTScriptLexGetCh(RTSCRIPTLEX hScriptLex)
862{
863 return RTScriptLexPeekCh(hScriptLex, 0);
864}
865
866
867RTDECL(char) RTScriptLexGetChEx(RTSCRIPTLEX hScriptLex, uint32_t fFlags)
868{
869 return RTScriptLexPeekChEx(hScriptLex, 0, fFlags);
870}
871
872
873RTDECL(void) RTScriptLexSkipWhitespace(RTSCRIPTLEX hScriptLex)
874{
875 PRTSCRIPTLEXINT pThis = hScriptLex;
876 AssertPtrReturnVoid(pThis);
877
878 for (;;)
879 {
880 char ch = RTScriptLexGetCh(hScriptLex);
881
882 if (ch == '\0')
883 break;
884
885 /* Check for whitespace. */
886 const char *pszWs = pThis->pCfg->pszWhitespace ? pThis->pCfg->pszWhitespace : g_szWsDef;
887
888 if ( rtScriptLexLocateChInStrConsume(pThis, ch, pszWs)
889 || rtScriptLexIsNewlineConsume(pThis, ch)
890 || rtScriptLexIsMultiLineCommentConsume(pThis, ch)
891 || rtScriptLexIsSingleLineCommentConsume(pThis, ch))
892 continue;
893
894 /* All white space skipped, next is some real content. */
895 break;
896 }
897}
898
899
900RTDECL(int) RTScriptLexScanNumber(RTSCRIPTLEX hScriptLex, uint8_t uBase, bool fAllowReal,
901 PRTSCRIPTLEXTOKEN pTok)
902{
903 RT_NOREF(uBase, fAllowReal, pTok);
904 PRTSCRIPTLEXINT pThis = hScriptLex;
905 AssertPtrReturn(pThis, VERR_INVALID_POINTER);
906 AssertReturn(!fAllowReal, VERR_NOT_IMPLEMENTED);
907 AssertReturn(!uBase, VERR_NOT_IMPLEMENTED);
908
909 /** @todo r=aeichner Quick and dirty to have something working for the disassembler testcase.
910 * Among others it misses overflow handling. */
911 uBase = 10;
912 char ch = RTScriptLexGetCh(hScriptLex);
913 pTok->Type.Number.enmType = ch == '-'
914 ? RTSCRIPTLEXTOKNUMTYPE_INTEGER
915 : RTSCRIPTLEXTOKNUMTYPE_NATURAL;
916 if (ch == '-' || ch == '+')
917 ch = RTScriptLexConsumeCh(hScriptLex);
918
919 if (ch == '0')
920 {
921 /* Some hex prefix? */
922 char chNext = RTScriptLexPeekCh(hScriptLex, 1);
923 if (chNext == 'x')
924 {
925 uBase = 16;
926 RTScriptLexConsumeCh(hScriptLex);
927 }
928 else if (chNext >= '0' && chNext <= '9') /* Octal stuff. */
929 AssertFailedReturn(VERR_NOT_IMPLEMENTED);
930
931 ch = RTScriptLexConsumeCh(hScriptLex);
932 }
933
934 uint64_t u64 = 0;
935 for (;;)
936 {
937 if ( (ch < '0' || ch > '9')
938 && (ch < 'a' || ch > 'f' || uBase == 10))
939 {
940 if (pTok->Type.Number.enmType == RTSCRIPTLEXTOKNUMTYPE_INTEGER)
941 pTok->Type.Number.Type.i64 = -(int64_t)u64;
942 else
943 pTok->Type.Number.Type.u64 = u64;
944 pTok->enmType = RTSCRIPTLEXTOKTYPE_NUMBER;
945 pTok->PosEnd = pThis->Pos;
946 return VINF_SUCCESS;
947 }
948
949 if (ch >= '0' && ch <= '9')
950 u64 = (u64 * uBase) + (ch - '0');
951 else if (ch >= 'a' && ch <= 'f')
952 {
953 Assert(uBase == 16);
954 u64 = (u64 << 4) + 10 + (ch - 'a');
955 }
956
957 ch = RTScriptLexConsumeCh(hScriptLex);
958 }
959}
960
961
962RTDECL(int) RTScriptLexScanIdentifier(RTSCRIPTLEX hScriptLex, char ch,
963 PRTSCRIPTLEXTOKEN pTok, void *pvUser)
964{
965 PRTSCRIPTLEXINT pThis = hScriptLex;
966 AssertPtrReturn(pThis, VERR_INVALID_POINTER);
967
968 const char *pszCharSet = pvUser ? (const char *)pvUser : g_aszIdeCharSetDef;
969 char aszIde[513]; RT_ZERO(aszIde);
970 unsigned idx = 0;
971 aszIde[idx++] = ch;
972
973 ch = RTScriptLexGetCh(hScriptLex);
974 while ( idx < sizeof(aszIde) - 1
975 && rtScriptLexLocateChInStrConsume(hScriptLex, ch, pszCharSet))
976 {
977 aszIde[idx++] = ch;
978 ch = RTScriptLexGetCh(hScriptLex);
979 }
980
981 if ( idx == sizeof(aszIde) - 1
982 && rtScriptLexLocateChInStrConsume(hScriptLex, ch, pszCharSet))
983 return RTScriptLexProduceTokError(hScriptLex, pTok, VERR_BUFFER_OVERFLOW, "Lexer: Identifier exceeds the allowed length");
984
985 /* Insert into string cache. */
986 pTok->enmType = RTSCRIPTLEXTOKTYPE_IDENTIFIER;
987 pTok->Type.Id.pszIde = RTStrCacheEnterN(pThis->hStrCacheId, &aszIde[0], idx);
988 if (RT_UNLIKELY(!pTok->Type.Id.pszIde))
989 return RTScriptLexProduceTokError(hScriptLex, pTok, VERR_NO_STR_MEMORY, "Lexer: Out of memory inserting identifier into string cache");
990
991 pTok->PosEnd = pThis->Pos;
992 return VINF_SUCCESS;
993}
994
995
996/**
997 * Adds the given character to the string literal add the given position, assuring the string
998 * is always zero terminated.
999 *
1000 * @returns IPRT status code.
1001 * @param pThis The lexer state.
1002 * @param ch The character to add.
1003 * @param idx At which position to add the character in the string.
1004 */
1005static int rtScriptLexScanStringLiteralChAdd(PRTSCRIPTLEXINT pThis, char ch, uint32_t idx)
1006{
1007 int rc = VINF_SUCCESS;
1008
1009 if ( !pThis->cchStrLitMax
1010 || idx >= pThis->cchStrLitMax - 1)
1011 {
1012 /* Increase memory. */
1013 size_t cchMaxNew = pThis->cchStrLitMax + 64;
1014 char *pszNew = NULL;
1015 rc = RTStrRealloc(&pszNew, cchMaxNew * sizeof(char));
1016 if (RT_SUCCESS(rc))
1017 {
1018 pThis->pszStrLit = pszNew;
1019 pThis->cchStrLitMax = cchMaxNew;
1020 }
1021 }
1022
1023 if (RT_SUCCESS(rc))
1024 {
1025 pThis->pszStrLit[idx] = ch;
1026 pThis->pszStrLit[idx + 1] = '\0';
1027 }
1028
1029 return rc;
1030}
1031
1032
1033RTDECL(int) RTScriptLexScanStringLiteralC(RTSCRIPTLEX hScriptLex, char ch,
1034 PRTSCRIPTLEXTOKEN pTok, void *pvUser)
1035{
1036 RT_NOREF(ch, pvUser);
1037 PRTSCRIPTLEXINT pThis = hScriptLex;
1038 AssertPtrReturn(pThis, VERR_INVALID_POINTER);
1039
1040 uint32_t idxChCur = 0;
1041 int rc = rtScriptLexScanStringLiteralChAdd(pThis, '\0', idxChCur);
1042 if (RT_FAILURE(rc))
1043 return RTScriptLexProduceTokError(hScriptLex, pTok, rc, "Lexer: Error adding character to string literal");
1044
1045 ch = RTScriptLexGetCh(hScriptLex);
1046 for (;;)
1047 {
1048 if (ch == '\0')
1049 return RTScriptLexProduceTokError(hScriptLex, pTok, VERR_EOF, "Lexer: End of stream before closing string literal terminal");
1050 else if (ch == '\"')
1051 {
1052 RTScriptLexConsumeCh(hScriptLex);
1053
1054 /* End of string, add it to the string literal cache and build the token. */
1055 pTok->enmType = RTSCRIPTLEXTOKTYPE_STRINGLIT;
1056 pTok->Type.StringLit.cchString = idxChCur;
1057 pTok->Type.StringLit.pszString = RTStrCacheEnterN(pThis->hStrCacheStringLit, pThis->pszStrLit, idxChCur);
1058 if (RT_UNLIKELY(!pTok->Type.StringLit.pszString))
1059 return RTScriptLexProduceTokError(hScriptLex, pTok, VERR_NO_STR_MEMORY, "Lexer: Error adding string literal to the cache");
1060 else
1061 break;
1062 }
1063 else if (ch == '\\')
1064 {
1065 /* Start of escape sequence. */
1066 RTScriptLexConsumeCh(hScriptLex);
1067 ch = RTScriptLexGetCh(hScriptLex);
1068 switch (ch)
1069 {
1070 case 'a': /* Alert (Bell) */
1071 ch = 0x07;
1072 break;
1073 case 'b': /* Backspace */
1074 ch = 0x08;
1075 break;
1076 case 'e': /* Escape character */
1077 ch = 0x1b;
1078 break;
1079 case 'f': /* Formfeed */
1080 ch = 0x0c;
1081 break;
1082 case 'n': /* Newline (line freed) */
1083 ch = 0x0a;
1084 break;
1085 case 'r': /* Carriage return */
1086 ch = 0x0d;
1087 break;
1088 case 't': /* Horizontal tab */
1089 ch = 0x09;
1090 break;
1091 case 'v': /* Vertical tab */
1092 ch = 0x0b;
1093 break;
1094 case '\\':
1095 case '\'':
1096 case '\"':
1097 case '\?':
1098 /* Can be added as is. */
1099 break;
1100 case 'x': /* Hexdecimal byte. */
1101 case '0': /* Octal */
1102 case '1':
1103 case '2':
1104 case '3':
1105 case '4':
1106 case '5':
1107 case '6':
1108 case '7':
1109 case '8':
1110 case '9':
1111 case 'u': /* Unicode point below 10000 */
1112 case 'U': /* Unicode point */
1113 default:
1114 /* Not supported for now. */
1115 return RTScriptLexProduceTokError(hScriptLex, pTok, VERR_NOT_SUPPORTED, "Lexer: Invalid/unsupported escape sequence");
1116 }
1117 }
1118
1119 rc = rtScriptLexScanStringLiteralChAdd(pThis, ch, idxChCur);
1120 if (RT_SUCCESS(rc))
1121 idxChCur++;
1122 else
1123 return RTScriptLexProduceTokError(hScriptLex, pTok, rc, "Lexer: Error adding character to string literal");
1124
1125 ch = RTScriptLexConsumeChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1126 }
1127
1128 pTok->PosEnd = pThis->Pos;
1129 return VINF_SUCCESS;
1130}
1131
1132
1133RTDECL(int) RTScriptLexScanStringLiteralPascal(RTSCRIPTLEX hScriptLex, char ch,
1134 PRTSCRIPTLEXTOKEN pTok, void *pvUser)
1135{
1136 RT_NOREF(ch, pvUser);
1137 PRTSCRIPTLEXINT pThis = hScriptLex;
1138 AssertPtrReturn(pThis, VERR_INVALID_POINTER);
1139
1140 uint32_t idxChCur = 0;
1141 int rc = rtScriptLexScanStringLiteralChAdd(pThis, '\0', idxChCur);
1142 if (RT_FAILURE(rc))
1143 return RTScriptLexProduceTokError(hScriptLex, pTok, rc, "Lexer: Error adding character to string literal");
1144
1145 ch = RTScriptLexGetChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1146 for (;;)
1147 {
1148 if (ch == '\0')
1149 return RTScriptLexProduceTokError(hScriptLex, pTok, VERR_EOF, "Lexer: End of stream before closing string literal terminal");
1150 else if (ch == '\'')
1151 {
1152 /*
1153 * Check whether there is a second ' coming afterwards used for
1154 * escaping ' characters.
1155 */
1156 ch = RTScriptLexConsumeChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1157 if (ch != '\'')
1158 {
1159 /* End of string, add it to the string literal cache and build the token. */
1160 pTok->enmType = RTSCRIPTLEXTOKTYPE_STRINGLIT;
1161 pTok->Type.StringLit.cchString = idxChCur;
1162 pTok->Type.StringLit.pszString = RTStrCacheEnterN(pThis->hStrCacheStringLit, pThis->pszStrLit, idxChCur);
1163 if (RT_UNLIKELY(!pTok->Type.StringLit.pszString))
1164 return RTScriptLexProduceTokError(hScriptLex, pTok, VERR_NO_STR_MEMORY, "Lexer: Error adding string literal to the cache");
1165 else
1166 break;
1167 }
1168 /* else: Fall through and add the character to the string literal..*/
1169 }
1170
1171 rc = rtScriptLexScanStringLiteralChAdd(pThis, ch, idxChCur);
1172 if (RT_SUCCESS(rc))
1173 idxChCur++;
1174 else
1175 return RTScriptLexProduceTokError(hScriptLex, pTok, rc, "Lexer: Error adding character to string literal");
1176 ch = RTScriptLexConsumeChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1177 }
1178
1179 pTok->PosEnd = pThis->Pos;
1180 return VINF_SUCCESS;
1181}
1182
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette