VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/script/scriptlex.cpp@ 108194

Last change on this file since 108194 was 108194, checked in by vboxsync, 3 months ago

Runtime/common/script/scriptlex.cpp: Fix re-allocating memory for a string literal, bugref:10733

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 38.6 KB
Line 
1/* $Id: scriptlex.cpp 108194 2025-02-13 14:35:47Z vboxsync $ */
2/** @file
3 * IPRT - RTScript* lexer API.
4 */
5
6/*
7 * Copyright (C) 2022-2024 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * The contents of this file may alternatively be used under the terms
26 * of the Common Development and Distribution License Version 1.0
27 * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
28 * in the VirtualBox distribution, in which case the provisions of the
29 * CDDL are applicable instead of those of the GPL.
30 *
31 * You may elect to license modified versions of this file under the
32 * terms and conditions of either the GPL or the CDDL or both.
33 *
34 * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
35 */
36
37
38/*********************************************************************************************************************************
39* Header Files *
40*********************************************************************************************************************************/
41#define LOG_GROUP RTLOGGROUP_DEFAULT /// @todo
42#include <iprt/script.h>
43
44#include <iprt/assert.h>
45#include <iprt/ctype.h>
46#include <iprt/err.h>
47#include <iprt/file.h>
48#include <iprt/log.h>
49#include <iprt/mem.h>
50#include <iprt/string.h>
51
52
53/*********************************************************************************************************************************
54* Structures and Typedefs *
55*********************************************************************************************************************************/
56
/**
 * Internal lexer state.
 *
 * The structure is allocated with the input buffer appended to it, see
 * RTScriptLexCreateFromReader().
 */
typedef struct RTSCRIPTLEXINT
{
    /** Magic value identifying the structure. */
    uint32_t                u32Magic;
    /** Current source position (line and character, both starting at 1). */
    RTSCRIPTPOS             Pos;
    /** Storage for the current and the next token. */
    RTSCRIPTLEXTOKEN        aToks[2];
    /** Pointer to the current token (points into aToks). */
    PRTSCRIPTLEXTOKEN       pTokCur;
    /** Pointer to the next (lookahead) token (points into aToks). */
    PRTSCRIPTLEXTOKEN       pTokNext;
    /** The lexer config. */
    PCRTSCRIPTLEXCFG        pCfg;
    /** The input reader callback. */
    PFNRTSCRIPTLEXRDR       pfnReader;
    /** The destructor callback, optional. */
    PFNRTSCRIPTLEXDTOR      pfnDtor;
    /** Opaque user data passed to the reader and destructor callbacks. */
    void                    *pvUser;
    /** Identifier string cache. */
    RTSTRCACHE              hStrCacheId;
    /** String literal string cache. */
    RTSTRCACHE              hStrCacheStringLit;
    /** Last status code from the reader or the default token producer. */
    int                     rcRdr;
    /** Internal error info, referenced by error tokens. */
    RTERRINFOSTATIC         ErrInfo;
    /** Lexer flags, combination of RTSCRIPT_LEX_INT_F_XXX. */
    uint32_t                fFlags;
    /** Maximum number of bytes allocated for temporary storage for literal strings. */
    size_t                  cchStrLitMax;
    /** Pointer to the string buffer for holding the literal string being scanned. */
    char                    *pszStrLit;
    /** Pointer to the current input character inside achBuf. */
    const char              *pchCur;
    /** Offset into the input to start reading the next chunk from. */
    size_t                  offBufRead;
    /** Size of the input buffer in characters. */
    size_t                  cchBuf;
    /** The cached part of the input, variable in size (cchBuf characters are allocated). */
    char                    achBuf[1];
} RTSCRIPTLEXINT;
/** Pointer to the internal lexer state. */
typedef RTSCRIPTLEXINT *PRTSCRIPTLEXINT;
105
106
/** Destroy the identifier string cache when the lexer is destroyed
 * (set when the creator did not request the cache handle). */
#define RTSCRIPT_LEX_INT_F_STR_CACHE_ID_FREE                    RT_BIT_32(0)
/** Destroy the string literal string cache when the lexer is destroyed
 * (set when the creator did not request the cache handle). */
#define RTSCRIPT_LEX_INT_F_STR_CACHE_STR_LIT_FREE               RT_BIT_32(1)
/** End of stream reached, the reader returned VINF_EOF. */
#define RTSCRIPT_LEX_INT_F_EOS                                  RT_BIT_32(2)
113
114
115/*********************************************************************************************************************************
116* Global Variables *
117*********************************************************************************************************************************/
118
/** Default set of whitespace characters, used when the config supplies none. */
static const char *g_szWsDef = " \t";
/** Default set of newline sequences, NULL terminated. */
static const char *g_aszNlDef[] =
{
    "\n",
    "\r\n",
    NULL
};
/** Default set of characters allowed for identifiers; also used as the exclusion
 * set when matching tokens which might be the prefix of an identifier. */
static const char *g_aszIdeCharSetDef = "_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
130
131
132/*********************************************************************************************************************************
133* Internal Functions *
134*********************************************************************************************************************************/
135
136
137/**
138 * Locates the given character in the string, consuming it if found.
139 *
140 * @returns Flag whether the character was found in the string.
141 * @param pThis The lexer state.
142 * @param ch The character to check for.
143 * @param psz The string to check.
144 */
145DECLINLINE(bool) rtScriptLexLocateChInStrConsume(PRTSCRIPTLEXINT pThis, char ch, const char *psz)
146{
147 while ( *psz != '\0'
148 && *psz != ch)
149 psz++;
150
151 if (*psz != '\0')
152 RTScriptLexConsumeCh(pThis);
153
154 return *psz != '\0';
155}
156
157
158/**
159 * Matches the input against the given string starting with the given character, consuming it
160 * if found.
161 *
162 * @returns Flag whether there was a match.
163 * @param pThis The lexer state.
164 * @param ch The character to check start matching.
165 * @param psz The string to match against.
166 * @param pszExclude When the string matched but the input continues
167 * with one of the characters in this string there will
168 * be no match.
169 */
170DECLINLINE(bool) rtScriptLexMatchStrConsume(PRTSCRIPTLEXINT pThis, char ch, const char *psz,
171 const char *pszExclude)
172{
173 bool fMatch = false;
174 if (*psz == ch)
175 {
176 unsigned offPeek = 1;
177
178 psz++;
179 while ( *psz != '\0'
180 && *psz == RTScriptLexPeekCh(pThis, offPeek))
181 {
182 offPeek++;
183 psz++;
184 }
185
186 if (*psz == '\0')
187 {
188 if (pszExclude)
189 {
190 ch = RTScriptLexPeekCh(pThis, offPeek);
191 fMatch = strchr(pszExclude, ch) == NULL;
192 }
193 else
194 fMatch = true;
195 }
196
197 if (fMatch)
198 {
199 /* Match, consume everything. */
200 while (offPeek-- > 0)
201 RTScriptLexConsumeCh(pThis);
202 }
203 }
204
205 return fMatch;
206}
207
208
209/**
210 * Tries to locate a string with the given starting character (+ peeking ahead) in the
211 * given string array (exact match) and consumes the entire substring.
212 *
213 * @returns Flag whether there was a match.
214 * @param pThis The lexer state.
215 * @param ch The character to check for.
216 * @param papsz Pointer to the string array to check for.
217 * @param pidx Where to store the index of the matching substring if found,
218 * optional.
219 */
220DECLINLINE(bool) rtScriptLexLocateSubStrInStrArrayMatchConsume(PRTSCRIPTLEXINT pThis, char ch,
221 const char **papsz, unsigned *pidx)
222{
223 unsigned int idx = 0;
224
225 while ( papsz[idx] != NULL
226 && !rtScriptLexMatchStrConsume(pThis, ch, papsz[idx], NULL))
227 idx++;
228
229 if ( papsz[idx] != NULL
230 && pidx)
231 *pidx = idx;
232
233 return papsz[idx] != NULL;
234}
235
236
237/**
238 * Tries to get an exact match starting with the given character, consuming it when found.
239 *
240 * @returns Flag whether there was a match.
241 * @param pThis The lexer state.
242 * @param ch The character to check for.
243 * @param ppMatch Where to store the exact match on success.
244 */
245DECLINLINE(bool) rtScriptLexLocateExactMatchConsume(PRTSCRIPTLEXINT pThis, char ch, PCRTSCRIPTLEXTOKMATCH *ppMatch)
246{
247 PCRTSCRIPTLEXTOKMATCH pTokMatch = pThis->pCfg->paTokMatches;
248
249 if (pTokMatch)
250 {
251 while ( pTokMatch->pszMatch != NULL
252 && !rtScriptLexMatchStrConsume(pThis, ch, pTokMatch->pszMatch,
253 pTokMatch->fMaybeIdentifier
254 ? g_aszIdeCharSetDef
255 : NULL))
256 pTokMatch++;
257
258 if (pTokMatch->pszMatch != NULL)
259 {
260 *ppMatch = pTokMatch;
261 return true;
262 }
263 }
264
265 return false;
266}
267
268
269DECLINLINE(bool) rtScriptLexIsNewlineConsume(PRTSCRIPTLEXINT pThis, char ch)
270{
271 const char **papszNl = pThis->pCfg->pszWhitespace ? pThis->pCfg->papszNewline : g_aszNlDef;
272
273 bool fMatched = rtScriptLexLocateSubStrInStrArrayMatchConsume(pThis, ch, papszNl, NULL);
274 if (fMatched)
275 {
276 pThis->Pos.iLine++;
277 pThis->Pos.iCh = 1;
278 }
279
280 return fMatched;
281}
282
283
284/**
285 * Checks whether the character is the beginning of a multi line comment, skipping the whole
286 * comment if necessary.
287 *
288 * @returns Flag whether a multi line comment was detected and consumed.
289 * @param hScriptLex The lexer state.
290 * @param ch The character to check for.
291 */
292DECLINLINE(bool) rtScriptLexIsMultiLineCommentConsume(PRTSCRIPTLEXINT pThis, char ch)
293{
294 const char **papszCommentMultiStart = pThis->pCfg->papszCommentMultiStart;
295 unsigned idxComment = 0;
296
297 if ( papszCommentMultiStart
298 && rtScriptLexLocateSubStrInStrArrayMatchConsume(pThis, ch, papszCommentMultiStart,
299 &idxComment))
300 {
301 /* Look for the matching closing lexeme in the input consuming everything along the way. */
302 const char *pszClosing = pThis->pCfg->papszCommentMultiEnd[idxComment];
303
304 for (;;)
305 {
306 char chTmp = RTScriptLexGetCh(pThis);
307
308 /* Check for new lines explicetly to advance the position information. */
309 if (rtScriptLexIsNewlineConsume(pThis, chTmp))
310 continue;
311
312 /** @todo Not quite correct when there is an end of stream before the closing lexeme.
313 * But doesn't hurt at the moment. */
314 if ( chTmp == '\0'
315 || rtScriptLexMatchStrConsume(pThis, chTmp, pszClosing, NULL))
316 break;
317
318 RTScriptLexConsumeCh(pThis);
319 }
320
321 return true;
322 }
323
324 return false;
325}
326
327
328/**
329 * Checks whether the character is the beginning of a single line comment, skipping the whole
330 * comment if necessary.
331 *
332 * @returns Flag whether a single line comment was detected and consumed.
333 * @param hScriptLex The lexer state.
334 * @param ch The character to check for.
335 */
336DECLINLINE(bool) rtScriptLexIsSingleLineCommentConsume(PRTSCRIPTLEXINT pThis, char ch)
337{
338 const char **papszCommentSingleStart = pThis->pCfg->papszCommentSingleStart;
339
340 if ( papszCommentSingleStart
341 && rtScriptLexLocateSubStrInStrArrayMatchConsume(pThis, ch, papszCommentSingleStart,
342 NULL))
343 {
344 for (;;)
345 {
346 char chTmp = RTScriptLexGetCh(pThis);
347
348 if ( chTmp == '\0'
349 || rtScriptLexIsNewlineConsume(pThis, chTmp))
350 break;
351
352 RTScriptLexConsumeCh(pThis);
353 }
354
355 return true;
356 }
357
358 return false;
359}
360
361
362/**
363 * Fills the input buffer with source data.
364 *
365 * @returns IPRT status code.
366 * @param pThis The lexer state.
367 */
368static int rtScriptLexFillBuffer(PRTSCRIPTLEXINT pThis)
369{
370 int rc = VINF_SUCCESS;
371 size_t cchToRead = pThis->cchBuf;
372 char *pchRead = &pThis->achBuf[0];
373
374 AssertReturn(!(pThis->fFlags & RTSCRIPT_LEX_INT_F_EOS), VERR_INVALID_STATE);
375
376 /* If there is input left to process move it to the front and fill the remainder. */
377 if ( pThis->pchCur != NULL
378 && pThis->pchCur != &pThis->achBuf[pThis->cchBuf])
379 {
380 cchToRead = pThis->pchCur - &pThis->achBuf[0];
381 /* Move the rest to the front. */
382 memmove(&pThis->achBuf[0], pThis->pchCur, pThis->cchBuf - cchToRead);
383 pchRead = (char *)pThis->pchCur + 1;
384 }
385
386 if (cchToRead)
387 {
388 pThis->pchCur = &pThis->achBuf[0];
389
390 size_t cchRead = 0;
391 rc = pThis->pfnReader(pThis, pThis->offBufRead, pchRead, cchToRead, &cchRead, pThis->pvUser);
392 if (RT_SUCCESS(rc))
393 {
394 pThis->offBufRead += cchRead;
395 if (rc == VINF_EOF)
396 pThis->fFlags |= RTSCRIPT_LEX_INT_F_EOS;
397 if (cchRead < cchToRead)
398 memset(pchRead + cchRead, 0, cchToRead - cchRead);
399 rc = VINF_SUCCESS;
400 }
401 else
402 pThis->rcRdr = rc;
403 }
404 else
405 rc = VERR_BUFFER_OVERFLOW; /** @todo */
406
407 return rc;
408}
409
410
411/**
412 * Produce an end of stream token.
413 *
414 * @returns nothing.
415 * @param pThis The lexer state.
416 * @param pTok The token to fill.
417 */
418static void rtScriptLexProduceTokEos(PRTSCRIPTLEXINT pThis, PRTSCRIPTLEXTOKEN pTok)
419{
420 pTok->enmType = RTSCRIPTLEXTOKTYPE_EOS;
421 pTok->PosStart = pThis->Pos;
422 pTok->PosEnd = pThis->Pos;
423}
424
425
426RTDECL(int) RTScriptLexProduceTokError(RTSCRIPTLEX hScriptLex, PRTSCRIPTLEXTOKEN pTok,
427 int rc, const char *pszMsg, ...)
428{
429 PRTSCRIPTLEXINT pThis = hScriptLex;
430
431 va_list va;
432 va_start(va, pszMsg);
433
434 pTok->enmType = RTSCRIPTLEXTOKTYPE_ERROR;
435 pTok->PosEnd = pThis->Pos;
436 pTok->Type.Error.pErr = &pThis->ErrInfo.Core;
437
438 RTErrInfoInitStatic(&pThis->ErrInfo);
439 RTErrInfoSetV(&pThis->ErrInfo.Core, rc, pszMsg, va);
440 va_end(va);
441
442 return rc;
443}
444
445
446RTDECL(int) RTScriptLexProduceTokIde(RTSCRIPTLEX hScriptLex, PRTSCRIPTLEXTOKEN pTok, const char *pszIde, size_t cchIde)
447{
448 PRTSCRIPTLEXINT pThis = hScriptLex;
449
450 /* Insert into string cache. */
451 pTok->enmType = RTSCRIPTLEXTOKTYPE_IDENTIFIER;
452 pTok->Type.Id.pszIde = RTStrCacheEnterN(pThis->hStrCacheId, pszIde, cchIde);
453 if (RT_UNLIKELY(!pTok->Type.Id.pszIde))
454 return RTScriptLexProduceTokError(hScriptLex, pTok, VERR_NO_STR_MEMORY, "Lexer: Out of memory inserting identifier into string cache");
455
456 pTok->PosEnd = pThis->Pos;
457 return VINF_SUCCESS;
458}
459
460
461/**
462 * Create the token from the exact match.
463 *
464 * @returns nothing.
465 * @param pThis The lexer state.
466 * @param pTok The token to fill.
467 * @param pMatch The matched string.
468 */
469static void rtScriptLexProduceTokFromExactMatch(PRTSCRIPTLEXINT pThis, PRTSCRIPTLEXTOKEN pTok,
470 PCRTSCRIPTLEXTOKMATCH pMatch)
471{
472 pTok->enmType = pMatch->enmTokType;
473 pTok->PosEnd = pThis->Pos;
474
475 switch (pTok->enmType)
476 {
477 case RTSCRIPTLEXTOKTYPE_OPERATOR:
478 pTok->Type.Operator.pOp = pMatch;
479 break;
480 case RTSCRIPTLEXTOKTYPE_KEYWORD:
481 pTok->Type.Keyword.pKeyword = pMatch;
482 break;
483 case RTSCRIPTLEXTOKTYPE_PUNCTUATOR:
484 pTok->Type.Punctuator.pPunctuator = pMatch;
485 break;
486 default:
487 RTScriptLexProduceTokError(pThis, pTok, VERR_INVALID_PARAMETER,
488 "Lexer: The match contains an invalid token type: %d\n",
489 pTok->enmType);
490 }
491}
492
493
494/**
495 * Goes through the rules trying to find a matching one.
496 *
497 * @returns Flag whether a matching rule was found.
498 * @param pThis The lexer state.
499 * @param ch The character to check.
500 * @param pTok The token to fill.
501 */
502static bool rtScriptLexProduceTokFromRules(PRTSCRIPTLEXINT pThis, char ch, PRTSCRIPTLEXTOKEN pTok)
503{
504 PCRTSCRIPTLEXRULE pRule = pThis->pCfg->paRules;
505
506 if (pRule)
507 {
508 while (pRule->pfnProd != NULL)
509 {
510 if ( ch >= pRule->chStart
511 && ch <= pRule->chEnd)
512 {
513 if (pRule->fFlags & RTSCRIPT_LEX_RULE_CONSUME)
514 RTScriptLexConsumeCh(pThis);
515 int rc = pRule->pfnProd(pThis, ch, pTok, pRule->pvUser);
516 AssertRC(rc);
517 return true;
518 }
519
520 pRule++;
521 }
522 }
523
524 return false;
525}
526
527
528/**
529 * Fills in the given token from the scanned input at the current location.
530 *
531 * @returns IPRT status code.
532 * @param pThis The lexer state.
533 * @param pTok The token to fill.
534 */
535static int rtScriptLexProduceToken(PRTSCRIPTLEXINT pThis, PRTSCRIPTLEXTOKEN pTok)
536{
537 RTScriptLexSkipWhitespace(pThis);
538
539 pTok->PosStart = pThis->Pos;
540
541 char ch = RTScriptLexGetCh(pThis);
542 PCRTSCRIPTLEXTOKMATCH pMatch = NULL;
543 if (ch == '\0')
544 rtScriptLexProduceTokEos(pThis, pTok);
545 else if (rtScriptLexLocateExactMatchConsume(pThis, ch, &pMatch))
546 rtScriptLexProduceTokFromExactMatch(pThis, pTok, pMatch);
547 else if (!rtScriptLexProduceTokFromRules(pThis, ch, pTok))
548 {
549 if (pThis->pCfg->pfnProdDef)
550 pThis->rcRdr = pThis->pCfg->pfnProdDef(pThis, ch, pTok, pThis->pCfg->pvProdDefUser);
551 else
552 RTScriptLexProduceTokError(pThis, pTok, VERR_INVALID_PARAMETER,
553 "Lexer: Invalid character found in input: %c\n",
554 ch);
555 }
556
557 return pThis->rcRdr;
558}
559
560
561/**
562 * Populates the lexer for the initial use.
563 *
564 * @returns IPRT status code.
565 * @param pThis The lexer state.
566 */
567static int rtScriptLexPopulate(PRTSCRIPTLEXINT pThis)
568{
569 int rc = rtScriptLexFillBuffer(pThis);
570 if (RT_SUCCESS(rc))
571 {
572 rc = rtScriptLexProduceToken(pThis, pThis->pTokCur);
573 if (RT_SUCCESS(rc))
574 rc = rtScriptLexProduceToken(pThis, pThis->pTokNext);
575 }
576
577 return rc;
578}
579
580
581
582RTDECL(int) RTScriptLexCreateFromReader(PRTSCRIPTLEX phScriptLex, PFNRTSCRIPTLEXRDR pfnReader,
583 PFNRTSCRIPTLEXDTOR pfnDtor, void *pvUser,
584 size_t cchBuf, PRTSTRCACHE phStrCacheId, PRTSTRCACHE phStrCacheStringLit,
585 PCRTSCRIPTLEXCFG pCfg)
586{
587 AssertPtrReturn(phScriptLex, VERR_INVALID_POINTER);
588 AssertPtrReturn(pfnReader, VERR_INVALID_POINTER);
589 AssertPtrReturn(pCfg, VERR_INVALID_POINTER);
590
591 /* Case insensitivity with internal lower or upper case conversion is mutually exclusive. */
592 AssertReturn( (pCfg->fFlags & (RTSCRIPT_LEX_CFG_F_CASE_INSENSITIVE_LOWER | RTSCRIPT_LEX_CFG_F_CASE_INSENSITIVE_UPPER))
593 != (RTSCRIPT_LEX_CFG_F_CASE_INSENSITIVE_LOWER | RTSCRIPT_LEX_CFG_F_CASE_INSENSITIVE_UPPER), VERR_INVALID_PARAMETER);
594
595 if (!cchBuf)
596 cchBuf = _16K;
597 int rc = VINF_SUCCESS;
598 PRTSCRIPTLEXINT pThis = (PRTSCRIPTLEXINT)RTMemAllocZ(RT_UOFFSETOF_DYN(RTSCRIPTLEXINT, achBuf[cchBuf]));
599 if (RT_LIKELY(pThis))
600 {
601 pThis->u32Magic = 0xfefecafe; /** @todo */
602 pThis->Pos.iLine = 1;
603 pThis->Pos.iCh = 1;
604 pThis->pTokCur = &pThis->aToks[0];
605 pThis->pTokNext = &pThis->aToks[1];
606 pThis->pCfg = pCfg;
607 pThis->pfnReader = pfnReader;
608 pThis->pfnDtor = pfnDtor;
609 pThis->pvUser = pvUser;
610 pThis->fFlags = 0;
611 pThis->cchStrLitMax = 0;
612 pThis->pszStrLit = NULL;
613 pThis->cchBuf = cchBuf;
614 pThis->offBufRead = 0;
615 pThis->pchCur = NULL;
616 pThis->hStrCacheId = NULL;
617 pThis->hStrCacheStringLit = NULL;
618
619 rc = RTStrCacheCreate(&pThis->hStrCacheId, "LEX-Ide");
620 if (RT_SUCCESS(rc))
621 {
622 rc = RTStrCacheCreate(&pThis->hStrCacheStringLit, "LEX-StrLit");
623 if (RT_SUCCESS(rc))
624 {
625 rc = rtScriptLexPopulate(pThis);
626 if (RT_SUCCESS(rc))
627 {
628 *phScriptLex = pThis;
629
630 if (phStrCacheId)
631 *phStrCacheId = pThis->hStrCacheId;
632 else
633 pThis->fFlags |= RTSCRIPT_LEX_INT_F_STR_CACHE_ID_FREE;
634
635 if (phStrCacheStringLit)
636 *phStrCacheStringLit = pThis->hStrCacheStringLit;
637 else
638 pThis->fFlags |= RTSCRIPT_LEX_INT_F_STR_CACHE_STR_LIT_FREE;
639
640 return VINF_SUCCESS;
641 }
642
643 RTStrCacheDestroy(pThis->hStrCacheStringLit);
644 }
645
646 RTStrCacheDestroy(pThis->hStrCacheId);
647 }
648
649 RTMemFree(pThis);
650 }
651 else
652 rc = VERR_NO_MEMORY;
653
654 return rc;
655}
656
657
658/**
659 * @callback_method_impl{FNRTSCRIPTLEXRDR, Worker to read from a string.}
660 */
661static DECLCALLBACK(int) rtScriptLexReaderStr(RTSCRIPTLEX hScriptLex, size_t offBuf, char *pchCur,
662 size_t cchBuf, size_t *pcchRead, void *pvUser)
663{
664 RT_NOREF(hScriptLex);
665
666 const char *psz = (const char *)pvUser;
667 size_t cch = strlen(psz);
668 size_t cchCopy = RT_MIN(cchBuf, cch - offBuf);
669 int rc = VINF_SUCCESS;
670
671 *pcchRead = cchCopy;
672
673 if (cchCopy)
674 memcpy(pchCur, &psz[offBuf], cchCopy * sizeof(char));
675 else
676 rc = VINF_EOF;
677
678 return rc;
679}
680
681
682RTDECL(int) RTScriptLexCreateFromString(PRTSCRIPTLEX phScriptLex, const char *pszSrc, PRTSTRCACHE phStrCacheId,
683 PRTSTRCACHE phStrCacheStringLit, PCRTSCRIPTLEXCFG pCfg)
684{
685 return RTScriptLexCreateFromReader(phScriptLex, rtScriptLexReaderStr, NULL, (void *)pszSrc, 0,
686 phStrCacheId, phStrCacheStringLit, pCfg);
687}
688
689
690/**
691 * @callback_method_impl{FNRTSCRIPTLEXRDR, Worker to read from a file.}
692 */
693static DECLCALLBACK(int) rtScriptLexReaderFile(RTSCRIPTLEX hScriptLex, size_t offBuf, char *pchCur,
694 size_t cchBuf, size_t *pcchRead, void *pvUser)
695{
696 RT_NOREF(hScriptLex);
697
698 RTFILE hFile = (RTFILE)pvUser;
699 return RTFileReadAt(hFile, offBuf, pchCur, cchBuf, pcchRead);
700}
701
702
703/**
704 * @callback_method_impl{FNRTSCRIPTLEXDTOR, Destructor for the file variant.}
705 */
706static DECLCALLBACK(void) rtScriptLexDtorFile(RTSCRIPTLEX hScriptLex, void *pvUser)
707{
708 RT_NOREF(hScriptLex);
709
710 RTFILE hFile = (RTFILE)pvUser;
711 RTFileClose(hFile);
712}
713
714
715RTDECL(int) RTScriptLexCreateFromFile(PRTSCRIPTLEX phScriptLex, const char *pszFilename, PRTSTRCACHE phStrCacheId,
716 PRTSTRCACHE phStrCacheStringLit, PCRTSCRIPTLEXCFG pCfg)
717{
718 RTFILE hFile;
719 int rc = RTFileOpen(&hFile, pszFilename, RTFILE_O_READ | RTFILE_O_DENY_WRITE | RTFILE_O_OPEN);
720 if (RT_SUCCESS(rc))
721 {
722 rc = RTScriptLexCreateFromReader(phScriptLex, rtScriptLexReaderFile, rtScriptLexDtorFile, (void *)hFile, 0,
723 phStrCacheId, phStrCacheStringLit, pCfg);
724 if (RT_FAILURE(rc))
725 RTFileClose(hFile);
726 }
727
728 return rc;
729}
730
731
732RTDECL(void) RTScriptLexDestroy(RTSCRIPTLEX hScriptLex)
733{
734 PRTSCRIPTLEXINT pThis = hScriptLex;
735 AssertPtrReturnVoid(pThis);
736
737 if (pThis->pfnDtor)
738 pThis->pfnDtor(pThis, pThis->pvUser);
739
740 if (pThis->fFlags & RTSCRIPT_LEX_INT_F_STR_CACHE_ID_FREE)
741 RTStrCacheDestroy(pThis->hStrCacheId);
742 if (pThis->fFlags & RTSCRIPT_LEX_INT_F_STR_CACHE_STR_LIT_FREE)
743 RTStrCacheDestroy(pThis->hStrCacheStringLit);
744
745 if (pThis->pszStrLit)
746 RTStrFree(pThis->pszStrLit);
747
748 RTMemFree(pThis);
749}
750
751
752RTDECL(int) RTScriptLexQueryToken(RTSCRIPTLEX hScriptLex, PCRTSCRIPTLEXTOKEN *ppToken)
753{
754 PRTSCRIPTLEXINT pThis = hScriptLex;
755 AssertPtrReturn(pThis, VERR_INVALID_HANDLE);
756 AssertPtrReturn(ppToken, VERR_INVALID_POINTER);
757
758 if (RT_SUCCESS(pThis->rcRdr))
759 *ppToken = pThis->pTokCur;
760
761 return pThis->rcRdr;
762}
763
764
765RTDECL(RTSCRIPTLEXTOKTYPE) RTScriptLexGetTokenType(RTSCRIPTLEX hScriptLex)
766{
767 PRTSCRIPTLEXINT pThis = hScriptLex;
768 AssertPtrReturn(pThis, RTSCRIPTLEXTOKTYPE_INVALID);
769
770 if (RT_SUCCESS(pThis->rcRdr))
771 return pThis->pTokCur->enmType;
772
773 return RTSCRIPTLEXTOKTYPE_INVALID;
774}
775
776
777RTDECL(RTSCRIPTLEXTOKTYPE) RTScriptLexPeekNextTokenType(RTSCRIPTLEX hScriptLex)
778{
779 PRTSCRIPTLEXINT pThis = hScriptLex;
780 AssertPtrReturn(pThis, RTSCRIPTLEXTOKTYPE_INVALID);
781
782 if (RT_SUCCESS(pThis->rcRdr))
783 return pThis->pTokNext->enmType;
784
785 return RTSCRIPTLEXTOKTYPE_INVALID;
786}
787
788
789RTDECL(PCRTSCRIPTLEXTOKEN) RTScriptLexConsumeToken(RTSCRIPTLEX hScriptLex)
790{
791 PRTSCRIPTLEXINT pThis = hScriptLex;
792 AssertPtrReturn(pThis, NULL);
793
794 /*
795 * Stop token production as soon as the current token indicates the
796 * end of the stream or an error
797 */
798 if ( pThis->pTokCur->enmType != RTSCRIPTLEXTOKTYPE_EOS
799 && pThis->pTokCur->enmType != RTSCRIPTLEXTOKTYPE_ERROR)
800 {
801 PRTSCRIPTLEXTOKEN pTokTmp = pThis->pTokCur;
802
803 /* Switch next token to current token and read in the next token. */
804 pThis->pTokCur = pThis->pTokNext;
805 pThis->pTokNext = pTokTmp;
806 if ( pThis->pTokCur->enmType != RTSCRIPTLEXTOKTYPE_EOS
807 && pThis->pTokCur->enmType != RTSCRIPTLEXTOKTYPE_ERROR)
808 rtScriptLexProduceToken(pThis, pThis->pTokNext);
809 else
810 pThis->pTokNext = pThis->pTokCur;
811 }
812
813 return pThis->pTokCur;
814}
815
816
817RTDECL(char) RTScriptLexConsumeCh(RTSCRIPTLEX hScriptLex)
818{
819 return RTScriptLexConsumeChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_DEFAULT);
820}
821
822
823RTDECL(char) RTScriptLexConsumeChEx(RTSCRIPTLEX hScriptLex, uint32_t fFlags)
824{
825 PRTSCRIPTLEXINT pThis = hScriptLex;
826 AssertPtrReturn(pThis, '\0');
827
828 pThis->pchCur++;
829 pThis->Pos.iCh++;
830 if (pThis->pchCur == &pThis->achBuf[pThis->cchBuf])
831 rtScriptLexFillBuffer(pThis);
832
833 return RTScriptLexGetChEx(pThis, fFlags);
834}
835
836
837RTDECL(char) RTScriptLexPeekCh(RTSCRIPTLEX hScriptLex, unsigned idx)
838{
839 return RTScriptLexPeekChEx(hScriptLex, idx, RTSCRIPT_LEX_CONV_F_DEFAULT);
840}
841
842
843RTDECL(char) RTScriptLexPeekChEx(RTSCRIPTLEX hScriptLex, unsigned idx, uint32_t fFlags)
844{
845 PRTSCRIPTLEXINT pThis = hScriptLex;
846 AssertPtrReturn(pThis, '\0');
847
848 /* Just return the character if it is in the current buffer. */
849 char ch = '\0';
850 if (RT_LIKELY(pThis->pchCur + idx < &pThis->achBuf[pThis->cchBuf]))
851 ch = pThis->pchCur[idx];
852 else
853 {
854 /* Slow path, read data into temporary buffer to read character from and dismiss. */
855 /** @todo */
856 AssertReleaseFailed();
857 }
858
859 if (!(fFlags & RTSCRIPT_LEX_CONV_F_NOTHING))
860 {
861 if (pThis->pCfg->fFlags & RTSCRIPT_LEX_CFG_F_CASE_INSENSITIVE_LOWER)
862 ch = RT_C_TO_LOWER(ch);
863 else if (pThis->pCfg->fFlags & RTSCRIPT_LEX_CFG_F_CASE_INSENSITIVE_UPPER)
864 ch = RT_C_TO_UPPER(ch);
865 }
866
867 return ch;
868}
869
870
871RTDECL(char) RTScriptLexGetCh(RTSCRIPTLEX hScriptLex)
872{
873 return RTScriptLexPeekCh(hScriptLex, 0);
874}
875
876
877RTDECL(char) RTScriptLexGetChEx(RTSCRIPTLEX hScriptLex, uint32_t fFlags)
878{
879 return RTScriptLexPeekChEx(hScriptLex, 0, fFlags);
880}
881
882
883RTDECL(void) RTScriptLexSkipWhitespace(RTSCRIPTLEX hScriptLex)
884{
885 PRTSCRIPTLEXINT pThis = hScriptLex;
886 AssertPtrReturnVoid(pThis);
887
888 for (;;)
889 {
890 char ch = RTScriptLexGetCh(hScriptLex);
891
892 if (ch == '\0')
893 break;
894
895 /* Check for whitespace. */
896 const char *pszWs = pThis->pCfg->pszWhitespace ? pThis->pCfg->pszWhitespace : g_szWsDef;
897
898 if ( rtScriptLexLocateChInStrConsume(pThis, ch, pszWs)
899 || rtScriptLexIsNewlineConsume(pThis, ch)
900 || rtScriptLexIsMultiLineCommentConsume(pThis, ch)
901 || rtScriptLexIsSingleLineCommentConsume(pThis, ch))
902 continue;
903
904 /* All white space skipped, next is some real content. */
905 break;
906 }
907}
908
909
910RTDECL(int) RTScriptLexScanNumber(RTSCRIPTLEX hScriptLex, uint8_t uBase, bool fAllowReal,
911 PRTSCRIPTLEXTOKEN pTok)
912{
913 RT_NOREF(uBase, fAllowReal, pTok);
914 PRTSCRIPTLEXINT pThis = hScriptLex;
915 AssertPtrReturn(pThis, VERR_INVALID_POINTER);
916 AssertReturn(!fAllowReal, VERR_NOT_IMPLEMENTED);
917 AssertReturn(!uBase, VERR_NOT_IMPLEMENTED);
918
919 /** @todo r=aeichner Quick and dirty to have something working for the disassembler testcase.
920 * Among others it misses overflow handling. */
921 uBase = 10;
922 char ch = RTScriptLexGetCh(hScriptLex);
923 pTok->Type.Number.enmType = ch == '-'
924 ? RTSCRIPTLEXTOKNUMTYPE_INTEGER
925 : RTSCRIPTLEXTOKNUMTYPE_NATURAL;
926 if (ch == '-' || ch == '+')
927 ch = RTScriptLexConsumeCh(hScriptLex);
928
929 if (ch == '0')
930 {
931 /* Some hex prefix? */
932 char chNext = RTScriptLexPeekCh(hScriptLex, 1);
933 if (chNext == 'x' || chNext == 'X')
934 {
935 uBase = 16;
936 RTScriptLexConsumeCh(hScriptLex);
937 }
938 else if (chNext >= '0' && chNext <= '9') /* Octal stuff. */
939 AssertFailedReturn(VERR_NOT_IMPLEMENTED);
940
941 ch = RTScriptLexConsumeCh(hScriptLex);
942 }
943
944 uint64_t u64 = 0;
945 for (;;)
946 {
947 if ( (ch < '0' || ch > '9')
948 && ( ( !(ch >= 'a' && ch <= 'f')
949 && !(ch >= 'A' && ch <= 'F'))
950 || uBase == 10))
951 {
952 if (pTok->Type.Number.enmType == RTSCRIPTLEXTOKNUMTYPE_INTEGER)
953 pTok->Type.Number.Type.i64 = -(int64_t)u64;
954 else
955 pTok->Type.Number.Type.u64 = u64;
956 pTok->enmType = RTSCRIPTLEXTOKTYPE_NUMBER;
957 pTok->PosEnd = pThis->Pos;
958 return VINF_SUCCESS;
959 }
960
961 if (ch >= '0' && ch <= '9')
962 u64 = (u64 * uBase) + (ch - '0');
963 else if (ch >= 'a' && ch <= 'f')
964 {
965 Assert(uBase == 16);
966 u64 = (u64 << 4) + 10 + (ch - 'a');
967 }
968 else if (ch >= 'A' && ch <= 'F')
969 {
970 Assert(uBase == 16);
971 u64 = (u64 << 4) + 10 + (ch - 'A');
972 }
973
974 ch = RTScriptLexConsumeCh(hScriptLex);
975 }
976}
977
978
979RTDECL(int) RTScriptLexScanIdentifier(RTSCRIPTLEX hScriptLex, char ch,
980 PRTSCRIPTLEXTOKEN pTok, void *pvUser)
981{
982 PRTSCRIPTLEXINT pThis = hScriptLex;
983 AssertPtrReturn(pThis, VERR_INVALID_POINTER);
984
985 const char *pszCharSet = pvUser ? (const char *)pvUser : g_aszIdeCharSetDef;
986 char aszIde[513]; RT_ZERO(aszIde);
987 unsigned idx = 0;
988 aszIde[idx++] = ch;
989
990 ch = RTScriptLexGetCh(hScriptLex);
991 while ( idx < sizeof(aszIde) - 1
992 && rtScriptLexLocateChInStrConsume(hScriptLex, ch, pszCharSet))
993 {
994 aszIde[idx++] = ch;
995 ch = RTScriptLexGetCh(hScriptLex);
996 }
997
998 if ( idx == sizeof(aszIde) - 1
999 && rtScriptLexLocateChInStrConsume(hScriptLex, ch, pszCharSet))
1000 return RTScriptLexProduceTokError(hScriptLex, pTok, VERR_BUFFER_OVERFLOW, "Lexer: Identifier exceeds the allowed length");
1001
1002 /* Insert into string cache. */
1003 pTok->enmType = RTSCRIPTLEXTOKTYPE_IDENTIFIER;
1004 pTok->Type.Id.pszIde = RTStrCacheEnterN(pThis->hStrCacheId, &aszIde[0], idx);
1005 if (RT_UNLIKELY(!pTok->Type.Id.pszIde))
1006 return RTScriptLexProduceTokError(hScriptLex, pTok, VERR_NO_STR_MEMORY, "Lexer: Out of memory inserting identifier into string cache");
1007
1008 pTok->PosEnd = pThis->Pos;
1009 return VINF_SUCCESS;
1010}
1011
1012
1013/**
1014 * Adds the given character to the string literal add the given position, assuring the string
1015 * is always zero terminated.
1016 *
1017 * @returns IPRT status code.
1018 * @param pThis The lexer state.
1019 * @param ch The character to add.
1020 * @param idx At which position to add the character in the string.
1021 */
1022static int rtScriptLexScanStringLiteralChAdd(PRTSCRIPTLEXINT pThis, char ch, uint32_t idx)
1023{
1024 int rc = VINF_SUCCESS;
1025
1026 if ( !pThis->cchStrLitMax
1027 || idx >= pThis->cchStrLitMax - 1)
1028 {
1029 /* Increase memory. */
1030 size_t cchMaxNew = pThis->cchStrLitMax + 64;
1031 char *pszNew = pThis->pszStrLit;
1032 rc = RTStrRealloc(&pszNew, cchMaxNew * sizeof(char));
1033 if (RT_SUCCESS(rc))
1034 {
1035 pThis->pszStrLit = pszNew;
1036 pThis->cchStrLitMax = cchMaxNew;
1037 }
1038 }
1039
1040 if (RT_SUCCESS(rc))
1041 {
1042 pThis->pszStrLit[idx] = ch;
1043 pThis->pszStrLit[idx + 1] = '\0';
1044 }
1045
1046 return rc;
1047}
1048
1049
1050RTDECL(int) RTScriptLexScanStringLiteralC(RTSCRIPTLEX hScriptLex, char ch,
1051 PRTSCRIPTLEXTOKEN pTok, void *pvUser)
1052{
1053 RT_NOREF(ch, pvUser);
1054 PRTSCRIPTLEXINT pThis = hScriptLex;
1055 AssertPtrReturn(pThis, VERR_INVALID_POINTER);
1056
1057 uint32_t idxChCur = 0;
1058 int rc = rtScriptLexScanStringLiteralChAdd(pThis, '\0', idxChCur);
1059 if (RT_FAILURE(rc))
1060 return RTScriptLexProduceTokError(hScriptLex, pTok, rc, "Lexer: Error adding character to string literal");
1061
1062 ch = RTScriptLexGetChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1063 for (;;)
1064 {
1065 if (ch == '\0')
1066 return RTScriptLexProduceTokError(hScriptLex, pTok, VERR_EOF, "Lexer: End of stream before closing string literal terminal");
1067 else if (ch == '\"')
1068 {
1069 RTScriptLexConsumeCh(hScriptLex);
1070
1071 /* End of string, add it to the string literal cache and build the token. */
1072 pTok->enmType = RTSCRIPTLEXTOKTYPE_STRINGLIT;
1073 pTok->Type.StringLit.cchString = idxChCur;
1074 pTok->Type.StringLit.pszString = RTStrCacheEnterN(pThis->hStrCacheStringLit, pThis->pszStrLit, idxChCur);
1075 if (RT_UNLIKELY(!pTok->Type.StringLit.pszString))
1076 return RTScriptLexProduceTokError(hScriptLex, pTok, VERR_NO_STR_MEMORY, "Lexer: Error adding string literal to the cache");
1077 else
1078 break;
1079 }
1080 else if (ch == '\\')
1081 {
1082 /* Start of escape sequence. */
1083 RTScriptLexConsumeChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1084 ch = RTScriptLexGetChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1085 switch (ch)
1086 {
1087 case 'a': /* Alert (Bell) */
1088 ch = 0x07;
1089 break;
1090 case 'b': /* Backspace */
1091 ch = 0x08;
1092 break;
1093 case 'e': /* Escape character */
1094 ch = 0x1b;
1095 break;
1096 case 'f': /* Formfeed */
1097 ch = 0x0c;
1098 break;
1099 case 'n': /* Newline (line freed) */
1100 ch = 0x0a;
1101 break;
1102 case 'r': /* Carriage return */
1103 ch = 0x0d;
1104 break;
1105 case 't': /* Horizontal tab */
1106 ch = 0x09;
1107 break;
1108 case 'v': /* Vertical tab */
1109 ch = 0x0b;
1110 break;
1111 case '\\':
1112 case '\'':
1113 case '\"':
1114 case '\?':
1115 /* Can be added as is. */
1116 break;
1117 case 'x': /* Hexdecimal byte. */
1118 case '0': /* Octal */
1119 case '1':
1120 case '2':
1121 case '3':
1122 case '4':
1123 case '5':
1124 case '6':
1125 case '7':
1126 case '8':
1127 case '9':
1128 case 'u': /* Unicode point below 10000 */
1129 case 'U': /* Unicode point */
1130 default:
1131 /* Not supported for now. */
1132 return RTScriptLexProduceTokError(hScriptLex, pTok, VERR_NOT_SUPPORTED, "Lexer: Invalid/unsupported escape sequence");
1133 }
1134 }
1135
1136 rc = rtScriptLexScanStringLiteralChAdd(pThis, ch, idxChCur);
1137 if (RT_SUCCESS(rc))
1138 idxChCur++;
1139 else
1140 return RTScriptLexProduceTokError(hScriptLex, pTok, rc, "Lexer: Error adding character to string literal");
1141
1142 ch = RTScriptLexConsumeChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1143 }
1144
1145 pTok->PosEnd = pThis->Pos;
1146 return VINF_SUCCESS;
1147}
1148
1149
1150RTDECL(int) RTScriptLexScanStringLiteralPascal(RTSCRIPTLEX hScriptLex, char ch,
1151 PRTSCRIPTLEXTOKEN pTok, void *pvUser)
1152{
1153 RT_NOREF(ch, pvUser);
1154 PRTSCRIPTLEXINT pThis = hScriptLex;
1155 AssertPtrReturn(pThis, VERR_INVALID_POINTER);
1156
1157 uint32_t idxChCur = 0;
1158 int rc = rtScriptLexScanStringLiteralChAdd(pThis, '\0', idxChCur);
1159 if (RT_FAILURE(rc))
1160 return RTScriptLexProduceTokError(hScriptLex, pTok, rc, "Lexer: Error adding character to string literal");
1161
1162 ch = RTScriptLexGetChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1163 for (;;)
1164 {
1165 if (ch == '\0')
1166 return RTScriptLexProduceTokError(hScriptLex, pTok, VERR_EOF, "Lexer: End of stream before closing string literal terminal");
1167 else if (ch == '\'')
1168 {
1169 /*
1170 * Check whether there is a second ' coming afterwards used for
1171 * escaping ' characters.
1172 */
1173 ch = RTScriptLexConsumeChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1174 if (ch != '\'')
1175 {
1176 /* End of string, add it to the string literal cache and build the token. */
1177 pTok->enmType = RTSCRIPTLEXTOKTYPE_STRINGLIT;
1178 pTok->Type.StringLit.cchString = idxChCur;
1179 pTok->Type.StringLit.pszString = RTStrCacheEnterN(pThis->hStrCacheStringLit, pThis->pszStrLit, idxChCur);
1180 if (RT_UNLIKELY(!pTok->Type.StringLit.pszString))
1181 return RTScriptLexProduceTokError(hScriptLex, pTok, VERR_NO_STR_MEMORY, "Lexer: Error adding string literal to the cache");
1182 else
1183 break;
1184 }
1185 /* else: Fall through and add the character to the string literal..*/
1186 }
1187
1188 rc = rtScriptLexScanStringLiteralChAdd(pThis, ch, idxChCur);
1189 if (RT_SUCCESS(rc))
1190 idxChCur++;
1191 else
1192 return RTScriptLexProduceTokError(hScriptLex, pTok, rc, "Lexer: Error adding character to string literal");
1193 ch = RTScriptLexConsumeChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1194 }
1195
1196 pTok->PosEnd = pThis->Pos;
1197 return VINF_SUCCESS;
1198}
1199
Note: See TracBrowser for help on using the repository browser.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette