VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/script/scriptlex.cpp@ 107455

Last change on this file since 107455 was 106061, checked in by vboxsync, 5 months ago

Copyright year updates by scm.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 37.4 KB
Line 
1/* $Id: scriptlex.cpp 106061 2024-09-16 14:03:52Z vboxsync $ */
2/** @file
3 * IPRT - RTScript* lexer API.
4 */
5
6/*
7 * Copyright (C) 2022-2024 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * The contents of this file may alternatively be used under the terms
26 * of the Common Development and Distribution License Version 1.0
27 * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
28 * in the VirtualBox distribution, in which case the provisions of the
29 * CDDL are applicable instead of those of the GPL.
30 *
31 * You may elect to license modified versions of this file under the
32 * terms and conditions of either the GPL or the CDDL or both.
33 *
34 * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
35 */
36
37
38/*********************************************************************************************************************************
39* Header Files *
40*********************************************************************************************************************************/
41#define LOG_GROUP RTLOGGROUP_DEFAULT /// @todo
42#include <iprt/script.h>
43
44#include <iprt/assert.h>
45#include <iprt/ctype.h>
46#include <iprt/err.h>
47#include <iprt/file.h>
48#include <iprt/log.h>
49#include <iprt/mem.h>
50#include <iprt/string.h>
51
52
53/*********************************************************************************************************************************
54* Structures and Typedefs *
55*********************************************************************************************************************************/
56
57/**
58 * Internal lexer state, allocated together with the trailing input buffer.
59 */
60typedef struct RTSCRIPTLEXINT
61{
62 /** Magic value, set when the lexer instance is created. */
63 uint32_t u32Magic;
64 /** Current source position (line and character), starts at 1/1. */
65 RTSCRIPTPOS Pos;
66 /** Storage for the current and the next token. */
67 RTSCRIPTLEXTOKEN aToks[2];
68 /** Pointer to the current token (one of aToks). */
69 PRTSCRIPTLEXTOKEN pTokCur;
70 /** Pointer to the next (lookahead) token (one of aToks). */
71 PRTSCRIPTLEXTOKEN pTokNext;
72 /** The lexer config. */
73 PCRTSCRIPTLEXCFG pCfg;
74 /** The input reader callback. */
75 PFNRTSCRIPTLEXRDR pfnReader;
76 /** The destructor callback, optional. */
77 PFNRTSCRIPTLEXDTOR pfnDtor;
78 /** Opaque user data passed to the reader and the destructor callback. */
79 void *pvUser;
80 /** Identifier string cache. */
81 RTSTRCACHE hStrCacheId;
82 /** String literal string cache. */
83 RTSTRCACHE hStrCacheStringLit;
84 /** Status code from the reader (also receives the default production callback status). */
85 int rcRdr;
86 /** Internal error info, referenced by error tokens. */
87 RTERRINFOSTATIC ErrInfo;
88 /** Lexer flags, RTSCRIPT_LEX_INT_F_XXX. */
89 uint32_t fFlags;
90 /** Maximum number of bytes allocated for temporary storage for literal strings. */
91 size_t cchStrLitMax;
92 /** Pointer to the string buffer for holding the literal string, NULL until first used. */
93 char *pszStrLit;
94 /** Pointer to the current input character in achBuf, NULL before the first buffer fill. */
95 const char *pchCur;
96 /** Offset into the input to start reading the next chunk from. */
97 size_t offBufRead;
98 /** Size of the input buffer in bytes. */
99 size_t cchBuf;
100 /** The cached part of the input, variable in size (cchBuf bytes are allocated on creation). */
101 char achBuf[1];
102} RTSCRIPTLEXINT;
103/** Pointer to the internal lexer state. */
104typedef RTSCRIPTLEXINT *PRTSCRIPTLEXINT;
105
106
107/** Destroy the identifier string cache on lexer destruction (set when the creator did not request the handle). */
108#define RTSCRIPT_LEX_INT_F_STR_CACHE_ID_FREE RT_BIT_32(0)
109/** Destroy the string literal string cache on lexer destruction (set when the creator did not request the handle). */
110#define RTSCRIPT_LEX_INT_F_STR_CACHE_STR_LIT_FREE RT_BIT_32(1)
111/** End of stream reached, set when the reader returns VINF_EOF. */
112#define RTSCRIPT_LEX_INT_F_EOS RT_BIT_32(2)
113
114
115/*********************************************************************************************************************************
116* Global Variables *
117*********************************************************************************************************************************/
118
119/** Default set of whitespace characters, used when the config provides none. */
120static const char *g_szWsDef = " \t";
121/** Default set of newline sequences, NULL terminated. */
122static const char *g_aszNlDef[] =
123{
124 "\n",
125 "\r\n",
126 NULL
127};
128/** Default set of characters allowed for identifiers (also used to reject keyword matches continuing as identifier). */
129static const char *g_aszIdeCharSetDef = "_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
130
131
132/*********************************************************************************************************************************
133* Internal Functions *
134*********************************************************************************************************************************/
135
136
137/**
138 * Locates the given character in the string, consuming it if found.
139 *
140 * @returns Flag whether the character was found in the string.
141 * @param pThis The lexer state.
142 * @param ch The character to check for.
143 * @param psz The string to check.
144 */
145DECLINLINE(bool) rtScriptLexLocateChInStrConsume(PRTSCRIPTLEXINT pThis, char ch, const char *psz)
146{
147 while ( *psz != '\0'
148 && *psz != ch)
149 psz++;
150
151 if (*psz != '\0')
152 RTScriptLexConsumeCh(pThis);
153
154 return *psz != '\0';
155}
156
157
158/**
159 * Matches the input against the given string starting with the given character, consuming it
160 * if found.
161 *
162 * @returns Flag whether there was a match.
163 * @param pThis The lexer state.
164 * @param ch The character to check start matching.
165 * @param psz The string to match against.
166 * @param pszExclude When the string matched but the input continues
167 * with one of the characters in this string there will
168 * be no match.
169 */
170DECLINLINE(bool) rtScriptLexMatchStrConsume(PRTSCRIPTLEXINT pThis, char ch, const char *psz,
171 const char *pszExclude)
172{
173 bool fMatch = false;
174 if (*psz == ch)
175 {
176 unsigned offPeek = 1;
177
178 psz++;
179 while ( *psz != '\0'
180 && *psz == RTScriptLexPeekCh(pThis, offPeek))
181 {
182 offPeek++;
183 psz++;
184 }
185
186 if (*psz == '\0')
187 {
188 if (pszExclude)
189 {
190 ch = RTScriptLexPeekCh(pThis, offPeek);
191 fMatch = strchr(pszExclude, ch) == NULL;
192 }
193 else
194 fMatch = true;
195 }
196
197 if (fMatch)
198 {
199 /* Match, consume everything. */
200 while (offPeek-- > 0)
201 RTScriptLexConsumeCh(pThis);
202 }
203 }
204
205 return fMatch;
206}
207
208
209/**
210 * Tries to locate a string with the given starting character (+ peeking ahead) in the
211 * given string array (exact match) and consumes the entire substring.
212 *
213 * @returns Flag whether there was a match.
214 * @param pThis The lexer state.
215 * @param ch The character to check for.
216 * @param papsz Pointer to the string array to check for.
217 * @param pidx Where to store the index of the matching substring if found,
218 * optional.
219 */
220DECLINLINE(bool) rtScriptLexLocateSubStrInStrArrayMatchConsume(PRTSCRIPTLEXINT pThis, char ch,
221 const char **papsz, unsigned *pidx)
222{
223 unsigned int idx = 0;
224
225 while ( papsz[idx] != NULL
226 && !rtScriptLexMatchStrConsume(pThis, ch, papsz[idx], NULL))
227 idx++;
228
229 if ( papsz[idx] != NULL
230 && pidx)
231 *pidx = idx;
232
233 return papsz[idx] != NULL;
234}
235
236
237/**
238 * Tries to get an exact match starting with the given character, consuming it when found.
239 *
240 * @returns Flag whether there was a match.
241 * @param pThis The lexer state.
242 * @param ch The character to check for.
243 * @param ppMatch Where to store the exact match on success.
244 */
245DECLINLINE(bool) rtScriptLexLocateExactMatchConsume(PRTSCRIPTLEXINT pThis, char ch, PCRTSCRIPTLEXTOKMATCH *ppMatch)
246{
247 PCRTSCRIPTLEXTOKMATCH pTokMatch = pThis->pCfg->paTokMatches;
248
249 if (pTokMatch)
250 {
251 while ( pTokMatch->pszMatch != NULL
252 && !rtScriptLexMatchStrConsume(pThis, ch, pTokMatch->pszMatch,
253 pTokMatch->fMaybeIdentifier
254 ? g_aszIdeCharSetDef
255 : NULL))
256 pTokMatch++;
257
258 if (pTokMatch->pszMatch != NULL)
259 {
260 *ppMatch = pTokMatch;
261 return true;
262 }
263 }
264
265 return false;
266}
267
268
269DECLINLINE(bool) rtScriptLexIsNewlineConsume(PRTSCRIPTLEXINT pThis, char ch)
270{
271 const char **papszNl = pThis->pCfg->pszWhitespace ? pThis->pCfg->papszNewline : g_aszNlDef;
272
273 bool fMatched = rtScriptLexLocateSubStrInStrArrayMatchConsume(pThis, ch, papszNl, NULL);
274 if (fMatched)
275 {
276 pThis->Pos.iLine++;
277 pThis->Pos.iCh = 1;
278 }
279
280 return fMatched;
281}
282
283
284/**
285 * Checks whether the character is the beginning of a multi line comment, skipping the whole
286 * comment if necessary.
287 *
288 * @returns Flag whether a multi line comment was detected and consumed.
289 * @param hScriptLex The lexer state.
290 * @param ch The character to check for.
291 */
292DECLINLINE(bool) rtScriptLexIsMultiLineCommentConsume(PRTSCRIPTLEXINT pThis, char ch)
293{
294 const char **papszCommentMultiStart = pThis->pCfg->papszCommentMultiStart;
295 unsigned idxComment = 0;
296
297 if ( papszCommentMultiStart
298 && rtScriptLexLocateSubStrInStrArrayMatchConsume(pThis, ch, papszCommentMultiStart,
299 &idxComment))
300 {
301 /* Look for the matching closing lexeme in the input consuming everything along the way. */
302 const char *pszClosing = pThis->pCfg->papszCommentMultiEnd[idxComment];
303
304 for (;;)
305 {
306 char chTmp = RTScriptLexGetCh(pThis);
307
308 /* Check for new lines explicetly to advance the position information. */
309 if (rtScriptLexIsNewlineConsume(pThis, chTmp))
310 continue;
311
312 /** @todo Not quite correct when there is an end of stream before the closing lexeme.
313 * But doesn't hurt at the moment. */
314 if ( chTmp == '\0'
315 || rtScriptLexMatchStrConsume(pThis, chTmp, pszClosing, NULL))
316 break;
317
318 RTScriptLexConsumeCh(pThis);
319 }
320
321 return true;
322 }
323
324 return false;
325}
326
327
328/**
329 * Checks whether the character is the beginning of a single line comment, skipping the whole
330 * comment if necessary.
331 *
332 * @returns Flag whether a single line comment was detected and consumed.
333 * @param hScriptLex The lexer state.
334 * @param ch The character to check for.
335 */
336DECLINLINE(bool) rtScriptLexIsSingleLineCommentConsume(PRTSCRIPTLEXINT pThis, char ch)
337{
338 const char **papszCommentSingleStart = pThis->pCfg->papszCommentSingleStart;
339
340 if ( papszCommentSingleStart
341 && rtScriptLexLocateSubStrInStrArrayMatchConsume(pThis, ch, papszCommentSingleStart,
342 NULL))
343 {
344 for (;;)
345 {
346 char chTmp = RTScriptLexGetCh(pThis);
347
348 if ( chTmp == '\0'
349 || rtScriptLexIsNewlineConsume(pThis, chTmp))
350 break;
351
352 RTScriptLexConsumeCh(pThis);
353 }
354
355 return true;
356 }
357
358 return false;
359}
360
361
362/**
363 * Fills the input buffer with source data.
364 *
365 * @returns IPRT status code.
366 * @param pThis The lexer state.
367 */
368static int rtScriptLexFillBuffer(PRTSCRIPTLEXINT pThis)
369{
370 int rc = VINF_SUCCESS;
371 size_t cchToRead = pThis->cchBuf;
372 char *pchRead = &pThis->achBuf[0];
373
374 AssertReturn(!(pThis->fFlags & RTSCRIPT_LEX_INT_F_EOS), VERR_INVALID_STATE);
375
376 /* If there is input left to process move it to the front and fill the remainder. */
377 if (pThis->pchCur != NULL)
378 {
379 cchToRead = pThis->pchCur - &pThis->achBuf[0];
380 /* Move the rest to the front. */
381 memmove(&pThis->achBuf[0], pThis->pchCur, pThis->cchBuf - cchToRead);
382 pchRead = (char *)pThis->pchCur + 1;
383 memset(pchRead, 0, cchToRead);
384 }
385
386 if (cchToRead)
387 {
388 pThis->pchCur = &pThis->achBuf[0];
389
390 size_t cchRead = 0;
391 rc = pThis->pfnReader(pThis, pThis->offBufRead, pchRead, cchToRead, &cchRead, pThis->pvUser);
392 if (RT_SUCCESS(rc))
393 {
394 pThis->offBufRead += cchRead;
395 if (rc == VINF_EOF)
396 pThis->fFlags |= RTSCRIPT_LEX_INT_F_EOS;
397 rc = VINF_SUCCESS;
398 }
399 else
400 pThis->rcRdr = rc;
401 }
402 else
403 rc = VERR_BUFFER_OVERFLOW; /** @todo */
404
405 return rc;
406}
407
408
409/**
410 * Produce an end of stream token.
411 *
412 * @returns nothing.
413 * @param pThis The lexer state.
414 * @param pTok The token to fill.
415 */
416static void rtScriptLexProduceTokEos(PRTSCRIPTLEXINT pThis, PRTSCRIPTLEXTOKEN pTok)
417{
418 pTok->enmType = RTSCRIPTLEXTOKTYPE_EOS;
419 pTok->PosStart = pThis->Pos;
420 pTok->PosEnd = pThis->Pos;
421}
422
423
424/**
425 * Produce an error token with the given error message.
426 *
427 * @returns IPRT status code.
428 * @param pThis The lexer state.
429 * @param pTok The token to fill.
430 * @param rc The status code to use in the message.
431 * @param pszMsg The format string for the error message.
432 * @param ... Arguments to the format string.
433 */
434static int rtScriptLexProduceTokError(PRTSCRIPTLEXINT pThis, PRTSCRIPTLEXTOKEN pTok,
435 int rc, const char *pszMsg, ...)
436{
437 va_list va;
438 va_start(va, pszMsg);
439
440 pTok->enmType = RTSCRIPTLEXTOKTYPE_ERROR;
441 pTok->PosEnd = pThis->Pos;
442 pTok->Type.Error.pErr = &pThis->ErrInfo.Core;
443
444 RTErrInfoInitStatic(&pThis->ErrInfo);
445 RTErrInfoSetV(&pThis->ErrInfo.Core, rc, pszMsg, va);
446 va_end(va);
447
448 return rc;
449}
450
451
452/**
453 * Create the token from the exact match.
454 *
455 * @returns nothing.
456 * @param pThis The lexer state.
457 * @param pTok The token to fill.
458 * @param pMatch The matched string.
459 */
460static void rtScriptLexProduceTokFromExactMatch(PRTSCRIPTLEXINT pThis, PRTSCRIPTLEXTOKEN pTok,
461 PCRTSCRIPTLEXTOKMATCH pMatch)
462{
463 pTok->enmType = pMatch->enmTokType;
464 pTok->PosEnd = pThis->Pos;
465
466 switch (pTok->enmType)
467 {
468 case RTSCRIPTLEXTOKTYPE_OPERATOR:
469 pTok->Type.Operator.pOp = pMatch;
470 break;
471 case RTSCRIPTLEXTOKTYPE_KEYWORD:
472 pTok->Type.Keyword.pKeyword = pMatch;
473 break;
474 case RTSCRIPTLEXTOKTYPE_PUNCTUATOR:
475 pTok->Type.Punctuator.pPunctuator = pMatch;
476 break;
477 default:
478 rtScriptLexProduceTokError(pThis, pTok, VERR_INVALID_PARAMETER,
479 "Lexer: The match contains an invalid token type: %d\n",
480 pTok->enmType);
481 }
482}
483
484
485/**
486 * Goes through the rules trying to find a matching one.
487 *
488 * @returns Flag whether a matching rule was found.
489 * @param pThis The lexer state.
490 * @param ch The character to check.
491 * @param pTok The token to fill.
492 */
493static bool rtScriptLexProduceTokFromRules(PRTSCRIPTLEXINT pThis, char ch, PRTSCRIPTLEXTOKEN pTok)
494{
495 PCRTSCRIPTLEXRULE pRule = pThis->pCfg->paRules;
496
497 if (pRule)
498 {
499 while (pRule->pfnProd != NULL)
500 {
501 if ( ch >= pRule->chStart
502 && ch <= pRule->chEnd)
503 {
504 if (pRule->fFlags & RTSCRIPT_LEX_RULE_CONSUME)
505 RTScriptLexConsumeCh(pThis);
506 int rc = pRule->pfnProd(pThis, ch, pTok, pRule->pvUser);
507 AssertRC(rc);
508 return true;
509 }
510
511 pRule++;
512 }
513 }
514
515 return false;
516}
517
518
519/**
520 * Fills in the given token from the scanned input at the current location.
521 *
522 * @returns IPRT status code.
523 * @param pThis The lexer state.
524 * @param pTok The token to fill.
525 */
526static int rtScriptLexProduceToken(PRTSCRIPTLEXINT pThis, PRTSCRIPTLEXTOKEN pTok)
527{
528 RTScriptLexSkipWhitespace(pThis);
529
530 pTok->PosStart = pThis->Pos;
531
532 char ch = RTScriptLexGetCh(pThis);
533 PCRTSCRIPTLEXTOKMATCH pMatch = NULL;
534 if (ch == '\0')
535 rtScriptLexProduceTokEos(pThis, pTok);
536 else if (rtScriptLexLocateExactMatchConsume(pThis, ch, &pMatch))
537 rtScriptLexProduceTokFromExactMatch(pThis, pTok, pMatch);
538 else if (!rtScriptLexProduceTokFromRules(pThis, ch, pTok))
539 {
540 if (pThis->pCfg->pfnProdDef)
541 pThis->rcRdr = pThis->pCfg->pfnProdDef(pThis, ch, pTok, pThis->pCfg->pvProdDefUser);
542 else
543 rtScriptLexProduceTokError(pThis, pTok, VERR_INVALID_PARAMETER,
544 "Lexer: Invalid character found in input: %c\n",
545 ch);
546 }
547
548 return pThis->rcRdr;
549}
550
551
552/**
553 * Populates the lexer for the initial use.
554 *
555 * @returns IPRT status code.
556 * @param pThis The lexer state.
557 */
558static int rtScriptLexPopulate(PRTSCRIPTLEXINT pThis)
559{
560 int rc = rtScriptLexFillBuffer(pThis);
561 if (RT_SUCCESS(rc))
562 {
563 rc = rtScriptLexProduceToken(pThis, pThis->pTokCur);
564 if (RT_SUCCESS(rc))
565 rc = rtScriptLexProduceToken(pThis, pThis->pTokNext);
566 }
567
568 return rc;
569}
570
571
572
573RTDECL(int) RTScriptLexCreateFromReader(PRTSCRIPTLEX phScriptLex, PFNRTSCRIPTLEXRDR pfnReader,
574 PFNRTSCRIPTLEXDTOR pfnDtor, void *pvUser,
575 size_t cchBuf, PRTSTRCACHE phStrCacheId, PRTSTRCACHE phStrCacheStringLit,
576 PCRTSCRIPTLEXCFG pCfg)
577{
578 AssertPtrReturn(phScriptLex, VERR_INVALID_POINTER);
579 AssertPtrReturn(pfnReader, VERR_INVALID_POINTER);
580 AssertPtrReturn(pCfg, VERR_INVALID_POINTER);
581
582 if (!cchBuf)
583 cchBuf = _16K;
584 int rc = VINF_SUCCESS;
585 PRTSCRIPTLEXINT pThis = (PRTSCRIPTLEXINT)RTMemAllocZ(RT_UOFFSETOF_DYN(RTSCRIPTLEXINT, achBuf[cchBuf]));
586 if (RT_LIKELY(pThis))
587 {
588 pThis->u32Magic = 0xfefecafe; /** @todo */
589 pThis->Pos.iLine = 1;
590 pThis->Pos.iCh = 1;
591 pThis->pTokCur = &pThis->aToks[0];
592 pThis->pTokNext = &pThis->aToks[1];
593 pThis->pCfg = pCfg;
594 pThis->pfnReader = pfnReader;
595 pThis->pfnDtor = pfnDtor;
596 pThis->pvUser = pvUser;
597 pThis->fFlags = 0;
598 pThis->cchStrLitMax = 0;
599 pThis->pszStrLit = NULL;
600 pThis->cchBuf = cchBuf;
601 pThis->offBufRead = 0;
602 pThis->pchCur = NULL;
603 pThis->hStrCacheId = NULL;
604 pThis->hStrCacheStringLit = NULL;
605
606 rc = RTStrCacheCreate(&pThis->hStrCacheId, "LEX-Ide");
607 if (RT_SUCCESS(rc))
608 {
609 rc = RTStrCacheCreate(&pThis->hStrCacheStringLit, "LEX-StrLit");
610 if (RT_SUCCESS(rc))
611 {
612 rc = rtScriptLexPopulate(pThis);
613 if (RT_SUCCESS(rc))
614 {
615 *phScriptLex = pThis;
616
617 if (phStrCacheId)
618 *phStrCacheId = pThis->hStrCacheId;
619 else
620 pThis->fFlags |= RTSCRIPT_LEX_INT_F_STR_CACHE_ID_FREE;
621
622 if (phStrCacheStringLit)
623 *phStrCacheStringLit = pThis->hStrCacheStringLit;
624 else
625 pThis->fFlags |= RTSCRIPT_LEX_INT_F_STR_CACHE_STR_LIT_FREE;
626
627 return VINF_SUCCESS;
628 }
629
630 RTStrCacheDestroy(pThis->hStrCacheStringLit);
631 }
632
633 RTStrCacheDestroy(pThis->hStrCacheId);
634 }
635
636 RTMemFree(pThis);
637 }
638 else
639 rc = VERR_NO_MEMORY;
640
641 return rc;
642}
643
644
645/**
646 * @callback_method_impl{FNRTSCRIPTLEXRDR, Worker to read from a string.}
647 */
648static DECLCALLBACK(int) rtScriptLexReaderStr(RTSCRIPTLEX hScriptLex, size_t offBuf, char *pchCur,
649 size_t cchBuf, size_t *pcchRead, void *pvUser)
650{
651 RT_NOREF(hScriptLex);
652
653 const char *psz = (const char *)pvUser;
654 size_t cch = strlen(psz);
655 size_t cchCopy = RT_MIN(cchBuf, cch - offBuf);
656 int rc = VINF_SUCCESS;
657
658 *pcchRead = cchCopy;
659
660 if (cchCopy)
661 memcpy(pchCur, &psz[offBuf], cchCopy * sizeof(char));
662 else
663 rc = VINF_EOF;
664
665 return rc;
666}
667
668
669RTDECL(int) RTScriptLexCreateFromString(PRTSCRIPTLEX phScriptLex, const char *pszSrc, PRTSTRCACHE phStrCacheId,
670 PRTSTRCACHE phStrCacheStringLit, PCRTSCRIPTLEXCFG pCfg)
671{
672 return RTScriptLexCreateFromReader(phScriptLex, rtScriptLexReaderStr, NULL, (void *)pszSrc, 0,
673 phStrCacheId, phStrCacheStringLit, pCfg);
674}
675
676
677/**
678 * @callback_method_impl{FNRTSCRIPTLEXRDR, Worker to read from a file.}
679 */
680static DECLCALLBACK(int) rtScriptLexReaderFile(RTSCRIPTLEX hScriptLex, size_t offBuf, char *pchCur,
681 size_t cchBuf, size_t *pcchRead, void *pvUser)
682{
683 RT_NOREF(hScriptLex);
684
685 RTFILE hFile = (RTFILE)pvUser;
686 return RTFileReadAt(hFile, offBuf, pchCur, cchBuf, pcchRead);
687}
688
689
690/**
691 * @callback_method_impl{FNRTSCRIPTLEXDTOR, Destructor for the file variant.}
692 */
693static DECLCALLBACK(void) rtScriptLexDtorFile(RTSCRIPTLEX hScriptLex, void *pvUser)
694{
695 RT_NOREF(hScriptLex);
696
697 RTFILE hFile = (RTFILE)pvUser;
698 RTFileClose(hFile);
699}
700
701
702RTDECL(int) RTScriptLexCreateFromFile(PRTSCRIPTLEX phScriptLex, const char *pszFilename, PRTSTRCACHE phStrCacheId,
703 PRTSTRCACHE phStrCacheStringLit, PCRTSCRIPTLEXCFG pCfg)
704{
705 RTFILE hFile;
706 int rc = RTFileOpen(&hFile, pszFilename, RTFILE_O_READ | RTFILE_O_DENY_WRITE | RTFILE_O_OPEN);
707 if (RT_SUCCESS(rc))
708 {
709 rc = RTScriptLexCreateFromReader(phScriptLex, rtScriptLexReaderFile, rtScriptLexDtorFile, (void *)hFile, 0,
710 phStrCacheId, phStrCacheStringLit, pCfg);
711 if (RT_FAILURE(rc))
712 RTFileClose(hFile);
713 }
714
715 return rc;
716}
717
718
719RTDECL(void) RTScriptLexDestroy(RTSCRIPTLEX hScriptLex)
720{
721 PRTSCRIPTLEXINT pThis = hScriptLex;
722 AssertPtrReturnVoid(pThis);
723
724 if (pThis->pfnDtor)
725 pThis->pfnDtor(pThis, pThis->pvUser);
726
727 if (pThis->fFlags & RTSCRIPT_LEX_INT_F_STR_CACHE_ID_FREE)
728 RTStrCacheDestroy(pThis->hStrCacheId);
729 if (pThis->fFlags & RTSCRIPT_LEX_INT_F_STR_CACHE_STR_LIT_FREE)
730 RTStrCacheDestroy(pThis->hStrCacheStringLit);
731
732 if (pThis->pszStrLit)
733 RTStrFree(pThis->pszStrLit);
734
735 RTMemFree(pThis);
736}
737
738
739RTDECL(int) RTScriptLexQueryToken(RTSCRIPTLEX hScriptLex, PCRTSCRIPTLEXTOKEN *ppToken)
740{
741 PRTSCRIPTLEXINT pThis = hScriptLex;
742 AssertPtrReturn(pThis, VERR_INVALID_HANDLE);
743 AssertPtrReturn(ppToken, VERR_INVALID_POINTER);
744
745 if (RT_SUCCESS(pThis->rcRdr))
746 *ppToken = pThis->pTokCur;
747
748 return pThis->rcRdr;
749}
750
751
752RTDECL(RTSCRIPTLEXTOKTYPE) RTScriptLexGetTokenType(RTSCRIPTLEX hScriptLex)
753{
754 PRTSCRIPTLEXINT pThis = hScriptLex;
755 AssertPtrReturn(pThis, RTSCRIPTLEXTOKTYPE_INVALID);
756
757 if (RT_SUCCESS(pThis->rcRdr))
758 return pThis->pTokCur->enmType;
759
760 return RTSCRIPTLEXTOKTYPE_INVALID;
761}
762
763
764RTDECL(RTSCRIPTLEXTOKTYPE) RTScriptLexPeekNextTokenType(RTSCRIPTLEX hScriptLex)
765{
766 PRTSCRIPTLEXINT pThis = hScriptLex;
767 AssertPtrReturn(pThis, RTSCRIPTLEXTOKTYPE_INVALID);
768
769 if (RT_SUCCESS(pThis->rcRdr))
770 return pThis->pTokNext->enmType;
771
772 return RTSCRIPTLEXTOKTYPE_INVALID;
773}
774
775
776RTDECL(PCRTSCRIPTLEXTOKEN) RTScriptLexConsumeToken(RTSCRIPTLEX hScriptLex)
777{
778 PRTSCRIPTLEXINT pThis = hScriptLex;
779 AssertPtrReturn(pThis, NULL);
780
781 /*
782 * Stop token production as soon as the current token indicates the
783 * end of the stream or an error
784 */
785 if ( pThis->pTokCur->enmType != RTSCRIPTLEXTOKTYPE_EOS
786 && pThis->pTokCur->enmType != RTSCRIPTLEXTOKTYPE_ERROR)
787 {
788 PRTSCRIPTLEXTOKEN pTokTmp = pThis->pTokCur;
789
790 /* Switch next token to current token and read in the next token. */
791 pThis->pTokCur = pThis->pTokNext;
792 pThis->pTokNext = pTokTmp;
793 if ( pThis->pTokCur->enmType != RTSCRIPTLEXTOKTYPE_EOS
794 && pThis->pTokCur->enmType != RTSCRIPTLEXTOKTYPE_ERROR)
795 rtScriptLexProduceToken(pThis, pThis->pTokNext);
796 else
797 pThis->pTokNext = pThis->pTokCur;
798 }
799
800 return pThis->pTokCur;
801}
802
803
804RTDECL(char) RTScriptLexConsumeCh(RTSCRIPTLEX hScriptLex)
805{
806 return RTScriptLexConsumeChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_DEFAULT);
807}
808
809
810RTDECL(char) RTScriptLexConsumeChEx(RTSCRIPTLEX hScriptLex, uint32_t fFlags)
811{
812 PRTSCRIPTLEXINT pThis = hScriptLex;
813 AssertPtrReturn(pThis, '\0');
814
815 pThis->pchCur++;
816 pThis->Pos.iCh++;
817 if (pThis->pchCur == &pThis->achBuf[pThis->cchBuf])
818 rtScriptLexFillBuffer(pThis);
819
820 return RTScriptLexGetChEx(pThis, fFlags);
821}
822
823
824RTDECL(char) RTScriptLexPeekCh(RTSCRIPTLEX hScriptLex, unsigned idx)
825{
826 return RTScriptLexPeekChEx(hScriptLex, idx, RTSCRIPT_LEX_CONV_F_DEFAULT);
827}
828
829
830RTDECL(char) RTScriptLexPeekChEx(RTSCRIPTLEX hScriptLex, unsigned idx, uint32_t fFlags)
831{
832 PRTSCRIPTLEXINT pThis = hScriptLex;
833 AssertPtrReturn(pThis, '\0');
834
835 /* Just return the character if it is in the current buffer. */
836 char ch = '\0';
837 if (RT_LIKELY(pThis->pchCur + idx < &pThis->achBuf[pThis->cchBuf]))
838 ch = pThis->pchCur[idx];
839 else
840 {
841 /* Slow path, read data into temporary buffer to read character from and dismiss. */
842 /** @todo */
843 AssertReleaseFailed();
844 }
845
846 if ( (pThis->pCfg->fFlags & RTSCRIPT_LEX_CFG_F_CASE_INSENSITIVE)
847 && !(fFlags & RTSCRIPT_LEX_CONV_F_NOTHING))
848 ch = RT_C_TO_LOWER(ch);
849
850 return ch;
851}
852
853
854RTDECL(char) RTScriptLexGetCh(RTSCRIPTLEX hScriptLex)
855{
856 return RTScriptLexPeekCh(hScriptLex, 0);
857}
858
859
860RTDECL(char) RTScriptLexGetChEx(RTSCRIPTLEX hScriptLex, uint32_t fFlags)
861{
862 return RTScriptLexPeekChEx(hScriptLex, 0, fFlags);
863}
864
865
866RTDECL(void) RTScriptLexSkipWhitespace(RTSCRIPTLEX hScriptLex)
867{
868 PRTSCRIPTLEXINT pThis = hScriptLex;
869 AssertPtrReturnVoid(pThis);
870
871 for (;;)
872 {
873 char ch = RTScriptLexGetCh(hScriptLex);
874
875 if (ch == '\0')
876 break;
877
878 /* Check for whitespace. */
879 const char *pszWs = pThis->pCfg->pszWhitespace ? pThis->pCfg->pszWhitespace : g_szWsDef;
880
881 if ( rtScriptLexLocateChInStrConsume(pThis, ch, pszWs)
882 || rtScriptLexIsNewlineConsume(pThis, ch)
883 || rtScriptLexIsMultiLineCommentConsume(pThis, ch)
884 || rtScriptLexIsSingleLineCommentConsume(pThis, ch))
885 continue;
886
887 /* All white space skipped, next is some real content. */
888 break;
889 }
890}
891
892
893RTDECL(int) RTScriptLexScanNumber(RTSCRIPTLEX hScriptLex, uint8_t uBase, bool fAllowReal,
894 PRTSCRIPTLEXTOKEN pTok)
895{
896 RT_NOREF(uBase, fAllowReal, pTok);
897 PRTSCRIPTLEXINT pThis = hScriptLex;
898 AssertPtrReturn(pThis, VERR_INVALID_POINTER);
899 AssertReturn(!fAllowReal, VERR_NOT_IMPLEMENTED);
900 AssertReturn(!uBase, VERR_NOT_IMPLEMENTED);
901
902 /** @todo r=aeichner Quick and dirty to have something working for the disassembler testcase.
903 * Among others it misses overflow handling. */
904 uBase = 10;
905 char ch = RTScriptLexGetCh(hScriptLex);
906 pTok->Type.Number.enmType = ch == '-'
907 ? RTSCRIPTLEXTOKNUMTYPE_INTEGER
908 : RTSCRIPTLEXTOKNUMTYPE_NATURAL;
909 if (ch == '-' || ch == '+')
910 ch = RTScriptLexConsumeCh(hScriptLex);
911
912 if (ch == '0')
913 {
914 /* Some hex prefix? */
915 char chNext = RTScriptLexPeekCh(hScriptLex, 1);
916 if (chNext == 'x')
917 {
918 uBase = 16;
919 RTScriptLexConsumeCh(hScriptLex);
920 }
921 else if (chNext >= '0' && chNext <= '9') /* Octal stuff. */
922 AssertFailedReturn(VERR_NOT_IMPLEMENTED);
923
924 ch = RTScriptLexConsumeCh(hScriptLex);
925 }
926
927 uint64_t u64 = 0;
928 for (;;)
929 {
930 if ( (ch < '0' || ch > '9')
931 && (ch < 'a' || ch > 'f' || uBase == 10))
932 {
933 if (pTok->Type.Number.enmType == RTSCRIPTLEXTOKNUMTYPE_INTEGER)
934 pTok->Type.Number.Type.i64 = -(int64_t)u64;
935 else
936 pTok->Type.Number.Type.u64 = u64;
937 pTok->enmType = RTSCRIPTLEXTOKTYPE_NUMBER;
938 pTok->PosEnd = pThis->Pos;
939 return VINF_SUCCESS;
940 }
941
942 if (ch >= '0' && ch <= '9')
943 u64 = (u64 * uBase) + (ch - '0');
944 else if (ch >= 'a' && ch <= 'f')
945 {
946 Assert(uBase == 16);
947 u64 = (u64 << 4) + 10 + (ch - 'a');
948 }
949
950 ch = RTScriptLexConsumeCh(hScriptLex);
951 }
952}
953
954
955RTDECL(int) RTScriptLexScanIdentifier(RTSCRIPTLEX hScriptLex, char ch,
956 PRTSCRIPTLEXTOKEN pTok, void *pvUser)
957{
958 PRTSCRIPTLEXINT pThis = hScriptLex;
959 AssertPtrReturn(pThis, VERR_INVALID_POINTER);
960
961 const char *pszCharSet = pvUser ? (const char *)pvUser : g_aszIdeCharSetDef;
962 char aszIde[513]; RT_ZERO(aszIde);
963 unsigned idx = 0;
964 aszIde[idx++] = ch;
965
966 ch = RTScriptLexGetCh(hScriptLex);
967 while ( idx < sizeof(aszIde) - 1
968 && rtScriptLexLocateChInStrConsume(hScriptLex, ch, pszCharSet))
969 {
970 aszIde[idx++] = ch;
971 ch = RTScriptLexGetCh(hScriptLex);
972 }
973
974 if ( idx == sizeof(aszIde) - 1
975 && rtScriptLexLocateChInStrConsume(hScriptLex, ch, pszCharSet))
976 return rtScriptLexProduceTokError(hScriptLex, pTok, VERR_BUFFER_OVERFLOW, "Lexer: Identifier exceeds the allowed length");
977
978 /* Insert into string cache. */
979 pTok->enmType = RTSCRIPTLEXTOKTYPE_IDENTIFIER;
980 pTok->Type.Id.pszIde = RTStrCacheEnterN(pThis->hStrCacheId, &aszIde[0], idx);
981 if (RT_UNLIKELY(!pTok->Type.Id.pszIde))
982 return rtScriptLexProduceTokError(hScriptLex, pTok, VERR_NO_STR_MEMORY, "Lexer: Out of memory inserting identifier into string cache");
983
984 pTok->PosEnd = pThis->Pos;
985 return VINF_SUCCESS;
986}
987
988
989/**
990 * Adds the given character to the string literal add the given position, assuring the string
991 * is always zero terminated.
992 *
993 * @returns IPRT status code.
994 * @param pThis The lexer state.
995 * @param ch The character to add.
996 * @param idx At which position to add the character in the string.
997 */
998static int rtScriptLexScanStringLiteralChAdd(PRTSCRIPTLEXINT pThis, char ch, uint32_t idx)
999{
1000 int rc = VINF_SUCCESS;
1001
1002 if ( !pThis->cchStrLitMax
1003 || idx >= pThis->cchStrLitMax - 1)
1004 {
1005 /* Increase memory. */
1006 size_t cchMaxNew = pThis->cchStrLitMax + 64;
1007 char *pszNew = NULL;
1008 rc = RTStrRealloc(&pszNew, cchMaxNew * sizeof(char));
1009 if (RT_SUCCESS(rc))
1010 {
1011 pThis->pszStrLit = pszNew;
1012 pThis->cchStrLitMax = cchMaxNew;
1013 }
1014 }
1015
1016 if (RT_SUCCESS(rc))
1017 {
1018 pThis->pszStrLit[idx] = ch;
1019 pThis->pszStrLit[idx + 1] = '\0';
1020 }
1021
1022 return rc;
1023}
1024
1025
1026RTDECL(int) RTScriptLexScanStringLiteralC(RTSCRIPTLEX hScriptLex, char ch,
1027 PRTSCRIPTLEXTOKEN pTok, void *pvUser)
1028{
1029 RT_NOREF(ch, pvUser);
1030 PRTSCRIPTLEXINT pThis = hScriptLex;
1031 AssertPtrReturn(pThis, VERR_INVALID_POINTER);
1032
1033 uint32_t idxChCur = 0;
1034 int rc = rtScriptLexScanStringLiteralChAdd(pThis, '\0', idxChCur);
1035 if (RT_FAILURE(rc))
1036 return rtScriptLexProduceTokError(hScriptLex, pTok, rc, "Lexer: Error adding character to string literal");
1037
1038 ch = RTScriptLexGetCh(hScriptLex);
1039 for (;;)
1040 {
1041 if (ch == '\0')
1042 return rtScriptLexProduceTokError(hScriptLex, pTok, VERR_EOF, "Lexer: End of stream before closing string literal terminal");
1043 else if (ch == '\"')
1044 {
1045 /* End of string, add it to the string literal cache and build the token. */
1046 pTok->enmType = RTSCRIPTLEXTOKTYPE_STRINGLIT;
1047 pTok->Type.StringLit.cchString = idxChCur;
1048 pTok->Type.StringLit.pszString = RTStrCacheEnterN(pThis->hStrCacheStringLit, pThis->pszStrLit, idxChCur);
1049 if (RT_UNLIKELY(!pTok->Type.StringLit.pszString))
1050 return rtScriptLexProduceTokError(hScriptLex, pTok, VERR_NO_STR_MEMORY, "Lexer: Error adding string literal to the cache");
1051 else
1052 break;
1053 }
1054 else if (ch == '\\')
1055 {
1056 /* Start of escape sequence. */
1057 RTScriptLexConsumeCh(hScriptLex);
1058 ch = RTScriptLexGetCh(hScriptLex);
1059 switch (ch)
1060 {
1061 case 'a': /* Alert (Bell) */
1062 ch = 0x07;
1063 break;
1064 case 'b': /* Backspace */
1065 ch = 0x08;
1066 break;
1067 case 'e': /* Escape character */
1068 ch = 0x1b;
1069 break;
1070 case 'f': /* Formfeed */
1071 ch = 0x0c;
1072 break;
1073 case 'n': /* Newline (line freed) */
1074 ch = 0x0a;
1075 break;
1076 case 'r': /* Carriage return */
1077 ch = 0x0d;
1078 break;
1079 case 't': /* Horizontal tab */
1080 ch = 0x09;
1081 break;
1082 case 'v': /* Vertical tab */
1083 ch = 0x0b;
1084 break;
1085 case '\\':
1086 case '\'':
1087 case '\"':
1088 case '\?':
1089 /* Can be added as is. */
1090 break;
1091 case 'x': /* Hexdecimal byte. */
1092 case '0': /* Octal */
1093 case '1':
1094 case '2':
1095 case '3':
1096 case '4':
1097 case '5':
1098 case '6':
1099 case '7':
1100 case '8':
1101 case '9':
1102 case 'u': /* Unicode point below 10000 */
1103 case 'U': /* Unicode point */
1104 default:
1105 /* Not supported for now. */
1106 return rtScriptLexProduceTokError(hScriptLex, pTok, VERR_NOT_SUPPORTED, "Lexer: Invalid/unsupported escape sequence");
1107 }
1108 }
1109
1110 rc = rtScriptLexScanStringLiteralChAdd(pThis, ch, idxChCur);
1111 if (RT_SUCCESS(rc))
1112 idxChCur++;
1113 else
1114 return rtScriptLexProduceTokError(hScriptLex, pTok, rc, "Lexer: Error adding character to string literal");
1115
1116 ch = RTScriptLexConsumeChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1117 }
1118
1119 pTok->PosEnd = pThis->Pos;
1120 return VINF_SUCCESS;
1121}
1122
1123
1124RTDECL(int) RTScriptLexScanStringLiteralPascal(RTSCRIPTLEX hScriptLex, char ch,
1125 PRTSCRIPTLEXTOKEN pTok, void *pvUser)
1126{
1127 RT_NOREF(ch, pvUser);
1128 PRTSCRIPTLEXINT pThis = hScriptLex;
1129 AssertPtrReturn(pThis, VERR_INVALID_POINTER);
1130
1131 uint32_t idxChCur = 0;
1132 int rc = rtScriptLexScanStringLiteralChAdd(pThis, '\0', idxChCur);
1133 if (RT_FAILURE(rc))
1134 return rtScriptLexProduceTokError(hScriptLex, pTok, rc, "Lexer: Error adding character to string literal");
1135
1136 ch = RTScriptLexGetChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1137 for (;;)
1138 {
1139 if (ch == '\0')
1140 return rtScriptLexProduceTokError(hScriptLex, pTok, VERR_EOF, "Lexer: End of stream before closing string literal terminal");
1141 else if (ch == '\'')
1142 {
1143 /*
1144 * Check whether there is a second ' coming afterwards used for
1145 * escaping ' characters.
1146 */
1147 ch = RTScriptLexConsumeChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1148 if (ch != '\'')
1149 {
1150 /* End of string, add it to the string literal cache and build the token. */
1151 pTok->enmType = RTSCRIPTLEXTOKTYPE_STRINGLIT;
1152 pTok->Type.StringLit.cchString = idxChCur;
1153 pTok->Type.StringLit.pszString = RTStrCacheEnterN(pThis->hStrCacheStringLit, pThis->pszStrLit, idxChCur);
1154 if (RT_UNLIKELY(!pTok->Type.StringLit.pszString))
1155 return rtScriptLexProduceTokError(hScriptLex, pTok, VERR_NO_STR_MEMORY, "Lexer: Error adding string literal to the cache");
1156 else
1157 break;
1158 }
1159 /* else: Fall through and add the character to the string literal..*/
1160 }
1161
1162 rc = rtScriptLexScanStringLiteralChAdd(pThis, ch, idxChCur);
1163 if (RT_SUCCESS(rc))
1164 idxChCur++;
1165 else
1166 return rtScriptLexProduceTokError(hScriptLex, pTok, rc, "Lexer: Error adding character to string literal");
1167 ch = RTScriptLexConsumeChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1168 }
1169
1170 pTok->PosEnd = pThis->Pos;
1171 return VINF_SUCCESS;
1172}
1173
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette