VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/script/scriptlex.cpp@ 105746

Last change on this file since 105746 was 105746, checked in by vboxsync, 8 months ago

Runtime/script: Add a simple lexer API to turn a stream of characters into tokens for a defined configuration, bugref:10394

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 37.1 KB
Line 
1/* $Id: scriptlex.cpp 105746 2024-08-21 07:35:33Z vboxsync $ */
2/** @file
3 * IPRT - RTScript* lexer API.
4 */
5
6/*
7 * Copyright (C) 2017 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27
28/*********************************************************************************************************************************
29* Header Files *
30*********************************************************************************************************************************/
31#define LOG_GROUP RTLOGGROUP_DEFAULT // @todo
32#include <iprt/script.h>
33
34#include <iprt/assert.h>
35#include <iprt/ctype.h>
36#include <iprt/err.h>
37#include <iprt/file.h>
38#include <iprt/log.h>
39#include <iprt/mem.h>
40#include <iprt/string.h>
41
42
43/*********************************************************************************************************************************
44* Structures and Typedefs *
45*********************************************************************************************************************************/
46
47/**
48 * Internal lexer state.
49 */
50typedef struct RTSCRIPTLEXINT
51{
52 /** Magic. */
53 uint32_t u32Magic;
54 /** Source position. */
55 RTSCRIPTPOS Pos;
56 /** Current and next token buffer. */
57 RTSCRIPTLEXTOKEN aToks[2];
58 /** Pointer to the current token. */
59 PRTSCRIPTLEXTOKEN pTokCur;
60 /** Pointer to the next token. */
61 PRTSCRIPTLEXTOKEN pTokNext;
62 /** The lexer config. */
63 PCRTSCRIPTLEXCFG pCfg;
64 /** The input reader. */
65 PFNRTSCRIPTLEXRDR pfnReader;
66 /** The destructor callback. */
67 PFNRTSCRIPTLEXDTOR pfnDtor;
68 /** Opaque user data for the reader. */
69 void *pvUser;
70 /** Identifier string cache. */
71 RTSTRCACHE hStrCacheId;
72 /** String literal string cache. */
73 RTSTRCACHE hStrCacheStringLit;
74 /** Status code from the reader. */
75 int rcRdr;
76 /** Internal error info. */
77 RTERRINFOSTATIC ErrInfo;
78 /** Lexer flags. */
79 uint32_t fFlags;
80 /** Maximum numebr of bytes allocated for temporary storage for literal strings. */
81 size_t cchStrLitMax;
82 /** Pointer to the string buffer for holding the literal string. */
83 char *pszStrLit;
84 /** Pointer to the current input character. */
85 const char *pchCur;
86 /** Offset to start reading the next chunk from. */
87 size_t offBufRead;
88 /** Size of the input buffer. */
89 size_t cchBuf;
90 /** The cached part of the input, variable in size. */
91 char achBuf[1];
92} RTSCRIPTLEXINT;
93/** Pointer to the internal lexer state. */
94typedef RTSCRIPTLEXINT *PRTSCRIPTLEXINT;
95
96
/** The lexer owns the identifier string cache and destroys it on destruction. */
#define RTSCRIPT_LEX_INT_F_STR_CACHE_ID_FREE        RT_BIT_32(0)
/** The lexer owns the string literal string cache and destroys it on destruction. */
#define RTSCRIPT_LEX_INT_F_STR_CACHE_STR_LIT_FREE   RT_BIT_32(1)
/** The reader signalled the end of the input stream. */
#define RTSCRIPT_LEX_INT_F_EOS                      RT_BIT_32(2)
103
104
105/*********************************************************************************************************************************
106* Global Variables *
107*********************************************************************************************************************************/
108
/** Whitespace characters used when the config doesn't specify any. */
static const char *g_szWsDef = " \t";
/** Newline sequences used by default, NULL terminated. */
static const char *g_aszNlDef[] =
{
    "\n",
    "\r\n",
    NULL
};
/** Characters which may appear in an identifier by default. */
static const char *g_aszIdeCharSetDef = "_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
120
121
122/*********************************************************************************************************************************
123* Internal Functions *
124*********************************************************************************************************************************/
125
126
127/**
128 * Locates the given character in the string, consuming it if found.
129 *
130 * @returns Flag whether the character was found in the string.
131 * @param pThis The lexer state.
132 * @param ch The character to check for.
133 * @param psz The string to check.
134 */
135DECLINLINE(bool) rtScriptLexLocateChInStrConsume(PRTSCRIPTLEXINT pThis, char ch, const char *psz)
136{
137 while ( *psz != '\0'
138 && *psz != ch)
139 psz++;
140
141 if (*psz != '\0')
142 RTScriptLexConsumeCh(pThis);
143
144 return *psz != '\0';
145}
146
147
148/**
149 * Matches the input against the given string starting with the given character, consuming it
150 * if found.
151 *
152 * @returns Flag whether there was a match.
153 * @param pThis The lexer state.
154 * @param ch The character to check start matching.
155 * @param psz The string to match against.
156 * @param pszExclude When the string matched but the input continues
157 * with one of the characters in this string the
158 * match will not
159 */
160DECLINLINE(bool) rtScriptLexMatchStrConsume(PRTSCRIPTLEXINT pThis, char ch, const char *psz,
161 const char *pszExclude)
162{
163 bool fMatch = false;
164 if (*psz == ch)
165 {
166 unsigned offPeek = 1;
167
168 psz++;
169 while ( *psz != '\0'
170 && *psz == RTScriptLexPeekCh(pThis, offPeek))
171 {
172 offPeek++;
173 psz++;
174 }
175
176 if (*psz == '\0')
177 {
178 if (pszExclude)
179 {
180 ch = RTScriptLexPeekCh(pThis, offPeek);
181 fMatch = strchr(pszExclude, ch) == NULL;
182 }
183 else
184 fMatch = true;
185 }
186
187 if (fMatch)
188 {
189 /* Match, consume everything. */
190 while (offPeek-- > 0)
191 RTScriptLexConsumeCh(pThis);
192 }
193 }
194
195 return fMatch;
196}
197
198
199/**
200 * Tries to locate a string with the given starting character (+ peeking ahead) in the
201 * given string array (exact match) and consumes the entire substring.
202 *
203 * @returns Flag whether there was a match.
204 * @param pThis The lexer state.
205 * @param ch The character to check for.
206 * @param papsz Pointer to the string array to check for.
207 * @param pidx Where to store the index of the matching substring if found,
208 * optional.
209 */
210DECLINLINE(bool) rtScriptLexLocateSubStrInStrArrayMatchConsume(PRTSCRIPTLEXINT pThis, char ch,
211 const char **papsz, unsigned *pidx)
212{
213 unsigned int idx = 0;
214
215 while ( papsz[idx] != NULL
216 && !rtScriptLexMatchStrConsume(pThis, ch, papsz[idx], NULL))
217 idx++;
218
219 if ( papsz[idx] != NULL
220 && pidx)
221 *pidx = idx;
222
223 return papsz[idx] != NULL;
224}
225
226
227/**
228 * Tries to get an exact match starting with the given character, consuming it when found.
229 *
230 * @returns Flag whether there was a match.
231 * @param pThis The lexer state.
232 * @param ch The character to check for.
233 * @param ppMatch Where to store the exact match on success.
234 */
235DECLINLINE(bool) rtScriptLexLocateExactMatchConsume(PRTSCRIPTLEXINT pThis, char ch, PCRTSCRIPTLEXTOKMATCH *ppMatch)
236{
237 PCRTSCRIPTLEXTOKMATCH pTokMatch = pThis->pCfg->paTokMatches;
238
239 if (pTokMatch)
240 {
241 while ( pTokMatch->pszMatch != NULL
242 && !rtScriptLexMatchStrConsume(pThis, ch, pTokMatch->pszMatch,
243 pTokMatch->fMaybeIdentifier
244 ? g_aszIdeCharSetDef
245 : NULL))
246 pTokMatch++;
247
248 if (pTokMatch->pszMatch != NULL)
249 {
250 *ppMatch = pTokMatch;
251 return true;
252 }
253 }
254
255 return false;
256}
257
258
259DECLINLINE(bool) rtScriptLexIsNewlineConsume(PRTSCRIPTLEXINT pThis, char ch)
260{
261 const char **papszNl = pThis->pCfg->pszWhitespace ? pThis->pCfg->papszNewline : g_aszNlDef;
262
263 bool fMatched = rtScriptLexLocateSubStrInStrArrayMatchConsume(pThis, ch, papszNl, NULL);
264 if (fMatched)
265 {
266 pThis->Pos.iLine++;
267 pThis->Pos.iCh = 1;
268 }
269
270 return fMatched;
271}
272
273
274/**
275 * Checks whether the character is the beginning of a multi line comment, skipping the whole
276 * comment if necessary.
277 *
278 * @returns Flag whether a multi line comment was detected and consumed.
279 * @param hScriptLex The lexer state.
280 * @param ch The character to check for.
281 */
282DECLINLINE(bool) rtScriptLexIsMultiLineCommentConsume(PRTSCRIPTLEXINT pThis, char ch)
283{
284 const char **papszCommentMultiStart = pThis->pCfg->papszCommentMultiStart;
285 unsigned idxComment = 0;
286
287 if ( papszCommentMultiStart
288 && rtScriptLexLocateSubStrInStrArrayMatchConsume(pThis, ch, papszCommentMultiStart,
289 &idxComment))
290 {
291 /* Look for the matching closing lexeme in the input consuming everything along the way. */
292 const char *pszClosing = pThis->pCfg->papszCommentMultiEnd[idxComment];
293
294 for (;;)
295 {
296 char chTmp = RTScriptLexGetCh(pThis);
297
298 /* Check for new lines explicetly to advance the position information. */
299 if (rtScriptLexIsNewlineConsume(pThis, chTmp))
300 continue;
301
302 /** @todo: Not quite correct when there is an end of stream before the closing lexeme.
303 * But doesn't hurt at the moment. */
304 if ( chTmp == '\0'
305 || rtScriptLexMatchStrConsume(pThis, chTmp, pszClosing, NULL))
306 break;
307
308 RTScriptLexConsumeCh(pThis);
309 }
310
311 return true;
312 }
313
314 return false;
315}
316
317
318/**
319 * Checks whether the character is the beginning of a single line comment, skipping the whole
320 * comment if necessary.
321 *
322 * @returns Flag whether a single line comment was detected and consumed.
323 * @param hScriptLex The lexer state.
324 * @param ch The character to check for.
325 */
326DECLINLINE(bool) rtScriptLexIsSingleLineCommentConsume(PRTSCRIPTLEXINT pThis, char ch)
327{
328 const char **papszCommentSingleStart = pThis->pCfg->papszCommentSingleStart;
329
330 if ( papszCommentSingleStart
331 && rtScriptLexLocateSubStrInStrArrayMatchConsume(pThis, ch, papszCommentSingleStart,
332 NULL))
333 {
334 for (;;)
335 {
336 char chTmp = RTScriptLexGetCh(pThis);
337
338 if ( chTmp == '\0'
339 || rtScriptLexIsNewlineConsume(pThis, chTmp))
340 break;
341
342 RTScriptLexConsumeCh(pThis);
343 }
344
345 return true;
346 }
347
348 return false;
349}
350
351
352/**
353 * Fills the input buffer with source data.
354 *
355 * @returns IPRT status code.
356 * @param pThis The lexer state.
357 */
358static int rtScriptLexFillBuffer(PRTSCRIPTLEXINT pThis)
359{
360 int rc = VINF_SUCCESS;
361 size_t cchToRead = pThis->cchBuf;
362 char *pchRead = &pThis->achBuf[0];
363
364 AssertReturn(!(pThis->fFlags & RTSCRIPT_LEX_INT_F_EOS), VERR_INVALID_STATE);
365
366 /* If there is input left to process move it to the front and fill the remainder. */
367 if (pThis->pchCur != NULL)
368 {
369 cchToRead = pThis->pchCur - &pThis->achBuf[0];
370 /* Move the rest to the front. */
371 memmove(&pThis->achBuf[0], pThis->pchCur, pThis->cchBuf - cchToRead);
372 pchRead = (char *)pThis->pchCur + 1;
373 memset(pchRead, 0, cchToRead);
374 }
375
376 if (cchToRead)
377 {
378 pThis->pchCur = &pThis->achBuf[0];
379
380 size_t cchRead = 0;
381 rc = pThis->pfnReader(pThis, pThis->offBufRead, pchRead, cchToRead, &cchRead, pThis->pvUser);
382 if (RT_SUCCESS(rc))
383 {
384 pThis->offBufRead += cchRead;
385 if (rc == VINF_EOF)
386 pThis->fFlags |= RTSCRIPT_LEX_INT_F_EOS;
387 rc = VINF_SUCCESS;
388 }
389 else
390 pThis->rcRdr = rc;
391 }
392 else
393 rc = VERR_BUFFER_OVERFLOW; /** @todo */
394
395 return rc;
396}
397
398
399/**
400 * Produce an end of stream token.
401 *
402 * @returns nothing.
403 * @param pThis The lexer state.
404 * @param pTok The token to fill.
405 */
406static void rtScriptLexProduceTokEos(PRTSCRIPTLEXINT pThis, PRTSCRIPTLEXTOKEN pTok)
407{
408 pTok->enmType = RTSCRIPTLEXTOKTYPE_EOS;
409 pTok->PosStart = pThis->Pos;
410 pTok->PosEnd = pThis->Pos;
411}
412
413
414/**
415 * Produce an error token with the given error message.
416 *
417 * @returns IPRT status code.
418 * @param pThis The lexer state.
419 * @param pTok The token to fill.
420 * @param rc The status code to use in the message.
421 * @param pszMsg The format string for the error message.
422 * @param ... Arguments to the format string.
423 */
424static int rtScriptLexProduceTokError(PRTSCRIPTLEXINT pThis, PRTSCRIPTLEXTOKEN pTok,
425 int rc, const char *pszMsg, ...)
426{
427 va_list va;
428 va_start(va, pszMsg);
429
430 pTok->enmType = RTSCRIPTLEXTOKTYPE_ERROR;
431 pTok->PosEnd = pThis->Pos;
432 pTok->Type.Error.pErr = &pThis->ErrInfo.Core;
433
434 RTErrInfoInitStatic(&pThis->ErrInfo);
435 RTErrInfoSetV(&pThis->ErrInfo.Core, rc, pszMsg, va);
436 va_end(va);
437
438 return rc;
439}
440
441
442/**
443 * Create the token from the exact match.
444 *
445 * @returns nothing.
446 * @param pThis The lexer state.
447 * @param pTok The token to fill.
448 * @param pMatch The matched string.
449 */
450static void rtScriptLexProduceTokFromExactMatch(PRTSCRIPTLEXINT pThis, PRTSCRIPTLEXTOKEN pTok,
451 PCRTSCRIPTLEXTOKMATCH pMatch)
452{
453 pTok->enmType = pMatch->enmTokType;
454 pTok->PosEnd = pThis->Pos;
455
456 switch (pTok->enmType)
457 {
458 case RTSCRIPTLEXTOKTYPE_OPERATOR:
459 pTok->Type.Operator.pOp = pMatch;
460 break;
461 case RTSCRIPTLEXTOKTYPE_KEYWORD:
462 pTok->Type.Keyword.pKeyword = pMatch;
463 break;
464 case RTSCRIPTLEXTOKTYPE_PUNCTUATOR:
465 pTok->Type.Punctuator.pPunctuator = pMatch;
466 break;
467 default:
468 rtScriptLexProduceTokError(pThis, pTok, VERR_INVALID_PARAMETER,
469 "Lexer: The match contains an invalid token type: %d\n",
470 pTok->enmType);
471 }
472}
473
474
475/**
476 * Goes through the rules trying to find a matching one.
477 *
478 * @returns Flag whether a matching rule was found.
479 * @param pThis The lexer state.
480 * @param ch The character to check.
481 * @param pTok The token to fill.
482 */
483static bool rtScriptLexProduceTokFromRules(PRTSCRIPTLEXINT pThis, char ch, PRTSCRIPTLEXTOKEN pTok)
484{
485 PCRTSCRIPTLEXRULE pRule = pThis->pCfg->paRules;
486
487 if (pRule)
488 {
489 while (pRule->pfnProd != NULL)
490 {
491 if ( ch >= pRule->chStart
492 && ch <= pRule->chEnd)
493 {
494 if (pRule->fFlags & RTSCRIPT_LEX_RULE_CONSUME)
495 RTScriptLexConsumeCh(pThis);
496 int rc = pRule->pfnProd(pThis, ch, pTok, pRule->pvUser);
497 AssertRC(rc);
498 return true;
499 }
500
501 pRule++;
502 }
503 }
504
505 return false;
506}
507
508
509/**
510 * Fills in the given token from the scanned input at the current location.
511 *
512 * @returns IPRT status code.
513 * @param pThis The lexer state.
514 * @param pTok The token to fill.
515 */
516static int rtScriptLexProduceToken(PRTSCRIPTLEXINT pThis, PRTSCRIPTLEXTOKEN pTok)
517{
518 RTScriptLexSkipWhitespace(pThis);
519
520 pTok->PosStart = pThis->Pos;
521
522 char ch = RTScriptLexGetCh(pThis);
523 PCRTSCRIPTLEXTOKMATCH pMatch = NULL;
524 if (ch == '\0')
525 rtScriptLexProduceTokEos(pThis, pTok);
526 else if (rtScriptLexLocateExactMatchConsume(pThis, ch, &pMatch))
527 rtScriptLexProduceTokFromExactMatch(pThis, pTok, pMatch);
528 else if (!rtScriptLexProduceTokFromRules(pThis, ch, pTok))
529 {
530 if (pThis->pCfg->pfnProdDef)
531 pThis->rcRdr = pThis->pCfg->pfnProdDef(pThis, ch, pTok, pThis->pCfg->pvProdDefUser);
532 else
533 rtScriptLexProduceTokError(pThis, pTok, VERR_INVALID_PARAMETER,
534 "Lexer: Invalid character found in input: %c\n",
535 ch);
536 }
537
538 return pThis->rcRdr;
539}
540
541
542/**
543 * Populates the lexer for the initial use.
544 *
545 * @returns IPRT status code.
546 * @param pThis The lexer state.
547 */
548static int rtScriptLexPopulate(PRTSCRIPTLEXINT pThis)
549{
550 int rc = rtScriptLexFillBuffer(pThis);
551 if (RT_SUCCESS(rc))
552 {
553 rc = rtScriptLexProduceToken(pThis, pThis->pTokCur);
554 if (RT_SUCCESS(rc))
555 rc = rtScriptLexProduceToken(pThis, pThis->pTokNext);
556 }
557
558 return rc;
559}
560
561
562
563RTDECL(int) RTScriptLexCreateFromReader(PRTSCRIPTLEX phScriptLex, PFNRTSCRIPTLEXRDR pfnReader,
564 PFNRTSCRIPTLEXDTOR pfnDtor, void *pvUser,
565 size_t cchBuf, PRTSTRCACHE phStrCacheId, PRTSTRCACHE phStrCacheStringLit,
566 PCRTSCRIPTLEXCFG pCfg)
567{
568 AssertPtrReturn(phScriptLex, VERR_INVALID_POINTER);
569 AssertPtrReturn(pfnReader, VERR_INVALID_POINTER);
570 AssertPtrReturn(pCfg, VERR_INVALID_POINTER);
571
572 if (!cchBuf)
573 cchBuf = _16K;
574 int rc = VINF_SUCCESS;
575 PRTSCRIPTLEXINT pThis = (PRTSCRIPTLEXINT)RTMemAllocZ(RT_OFFSETOF(RTSCRIPTLEXINT, achBuf[cchBuf]));
576 if (RT_LIKELY(pThis))
577 {
578 pThis->u32Magic = 0xfefecafe; /**@todo */
579 pThis->Pos.iLine = 1;
580 pThis->Pos.iCh = 1;
581 pThis->pTokCur = &pThis->aToks[0];
582 pThis->pTokNext = &pThis->aToks[1];
583 pThis->pCfg = pCfg;
584 pThis->pfnReader = pfnReader;
585 pThis->pfnDtor = pfnDtor;
586 pThis->pvUser = pvUser;
587 pThis->fFlags = 0;
588 pThis->cchStrLitMax = 0;
589 pThis->pszStrLit = NULL;
590 pThis->cchBuf = cchBuf;
591 pThis->offBufRead = 0;
592 pThis->pchCur = NULL;
593 pThis->hStrCacheId = NULL;
594 pThis->hStrCacheStringLit = NULL;
595
596 rc = RTStrCacheCreate(&pThis->hStrCacheId, "LEX-Ide");
597 if (RT_SUCCESS(rc))
598 {
599 rc = RTStrCacheCreate(&pThis->hStrCacheStringLit, "LEX-StrLit");
600 if (RT_SUCCESS(rc))
601 {
602 rc = rtScriptLexPopulate(pThis);
603 if (RT_SUCCESS(rc))
604 {
605 *phScriptLex = pThis;
606
607 if (phStrCacheId)
608 *phStrCacheId = pThis->hStrCacheId;
609 else
610 pThis->fFlags |= RTSCRIPT_LEX_INT_F_STR_CACHE_ID_FREE;
611
612 if (phStrCacheStringLit)
613 *phStrCacheStringLit = pThis->hStrCacheStringLit;
614 else
615 pThis->fFlags |= RTSCRIPT_LEX_INT_F_STR_CACHE_STR_LIT_FREE;
616
617 return VINF_SUCCESS;
618 }
619
620 RTStrCacheDestroy(pThis->hStrCacheStringLit);
621 }
622
623 RTStrCacheDestroy(pThis->hStrCacheId);
624 }
625
626 RTMemFree(pThis);
627 }
628 else
629 rc = VERR_NO_MEMORY;
630
631 return rc;
632}
633
634
635/**
636 * @callback_method_impl{FNRTSCRIPTLEXRDR, Worker to read from a string.}
637 */
638static DECLCALLBACK(int) rtScriptLexReaderStr(RTSCRIPTLEX hScriptLex, size_t offBuf, char *pchCur,
639 size_t cchBuf, size_t *pcchRead, void *pvUser)
640{
641 RT_NOREF(hScriptLex);
642
643 const char *psz = (const char *)pvUser;
644 size_t cch = strlen(psz);
645 size_t cchCopy = RT_MIN(cchBuf, cch - offBuf);
646 int rc = VINF_SUCCESS;
647
648 *pcchRead = cchCopy;
649
650 if (cchCopy)
651 memcpy(pchCur, &psz[offBuf], cchCopy * sizeof(char));
652 else
653 rc = VINF_EOF;
654
655 return rc;
656}
657
658
659RTDECL(int) RTScriptLexCreateFromString(PRTSCRIPTLEX phScriptLex, const char *pszSrc, PRTSTRCACHE phStrCacheId,
660 PRTSTRCACHE phStrCacheStringLit, PCRTSCRIPTLEXCFG pCfg)
661{
662 return RTScriptLexCreateFromReader(phScriptLex, rtScriptLexReaderStr, NULL, (void *)pszSrc, 0,
663 phStrCacheId, phStrCacheStringLit, pCfg);
664}
665
666
667/**
668 * @callback_method_impl{FNRTSCRIPTLEXRDR, Worker to read from a file.}
669 */
670static DECLCALLBACK(int) rtScriptLexReaderFile(RTSCRIPTLEX hScriptLex, size_t offBuf, char *pchCur,
671 size_t cchBuf, size_t *pcchRead, void *pvUser)
672{
673 RT_NOREF(hScriptLex);
674
675 RTFILE hFile = (RTFILE)pvUser;
676 return RTFileReadAt(hFile, offBuf, pchCur, cchBuf, pcchRead);
677}
678
679
680/**
681 * @callback_method_impl{FNRTSCRIPTLEXDTOR, Destructor for the file variant.}
682 */
683static DECLCALLBACK(void) rtScriptLexDtorFile(RTSCRIPTLEX hScriptLex, void *pvUser)
684{
685 RT_NOREF(hScriptLex);
686
687 RTFILE hFile = (RTFILE)pvUser;
688 RTFileClose(hFile);
689}
690
691
692RTDECL(int) RTScriptLexCreateFromFile(PRTSCRIPTLEX phScriptLex, const char *pszFilename, PRTSTRCACHE phStrCacheId,
693 PRTSTRCACHE phStrCacheStringLit, PCRTSCRIPTLEXCFG pCfg)
694{
695 RTFILE hFile;
696 int rc = RTFileOpen(&hFile, pszFilename, RTFILE_O_READ | RTFILE_O_DENY_WRITE | RTFILE_O_OPEN);
697 if (RT_SUCCESS(rc))
698 {
699 rc = RTScriptLexCreateFromReader(phScriptLex, rtScriptLexReaderFile, rtScriptLexDtorFile, (void *)hFile, 0,
700 phStrCacheId, phStrCacheStringLit, pCfg);
701 if (RT_FAILURE(rc))
702 RTFileClose(hFile);
703 }
704
705 return rc;
706}
707
708
709RTDECL(void) RTScriptLexDestroy(RTSCRIPTLEX hScriptLex)
710{
711 PRTSCRIPTLEXINT pThis = hScriptLex;
712 AssertPtrReturnVoid(pThis);
713
714 if (pThis->pfnDtor)
715 pThis->pfnDtor(pThis, pThis->pvUser);
716
717 if (pThis->fFlags & RTSCRIPT_LEX_INT_F_STR_CACHE_ID_FREE)
718 RTStrCacheDestroy(pThis->hStrCacheId);
719 if (pThis->fFlags & RTSCRIPT_LEX_INT_F_STR_CACHE_STR_LIT_FREE)
720 RTStrCacheDestroy(pThis->hStrCacheStringLit);
721
722 if (pThis->pszStrLit)
723 RTStrFree(pThis->pszStrLit);
724
725 RTMemFree(pThis);
726}
727
728
729RTDECL(int) RTScriptLexQueryToken(RTSCRIPTLEX hScriptLex, PCRTSCRIPTLEXTOKEN *ppToken)
730{
731 PRTSCRIPTLEXINT pThis = hScriptLex;
732 AssertPtrReturn(pThis, VERR_INVALID_HANDLE);
733 AssertPtrReturn(ppToken, VERR_INVALID_POINTER);
734
735 if (RT_SUCCESS(pThis->rcRdr))
736 *ppToken = pThis->pTokCur;
737
738 return pThis->rcRdr;
739}
740
741
742RTDECL(RTSCRIPTLEXTOKTYPE) RTScriptLexGetTokenType(RTSCRIPTLEX hScriptLex)
743{
744 PRTSCRIPTLEXINT pThis = hScriptLex;
745 AssertPtrReturn(pThis, RTSCRIPTLEXTOKTYPE_INVALID);
746
747 if (RT_SUCCESS(pThis->rcRdr))
748 return pThis->pTokCur->enmType;
749
750 return RTSCRIPTLEXTOKTYPE_INVALID;
751}
752
753
754RTDECL(RTSCRIPTLEXTOKTYPE) RTScriptLexPeekNextTokenType(RTSCRIPTLEX hScriptLex)
755{
756 PRTSCRIPTLEXINT pThis = hScriptLex;
757 AssertPtrReturn(pThis, RTSCRIPTLEXTOKTYPE_INVALID);
758
759 if (RT_SUCCESS(pThis->rcRdr))
760 return pThis->pTokNext->enmType;
761
762 return RTSCRIPTLEXTOKTYPE_INVALID;
763}
764
765
766RTDECL(PCRTSCRIPTLEXTOKEN) RTScriptLexConsumeToken(RTSCRIPTLEX hScriptLex)
767{
768 PRTSCRIPTLEXINT pThis = hScriptLex;
769 AssertPtrReturn(pThis, NULL);
770
771 /*
772 * Stop token production as soon as the current token indicates the
773 * end of the stream or an error
774 */
775 if ( pThis->pTokCur->enmType != RTSCRIPTLEXTOKTYPE_EOS
776 && pThis->pTokCur->enmType != RTSCRIPTLEXTOKTYPE_ERROR)
777 {
778 PRTSCRIPTLEXTOKEN pTokTmp = pThis->pTokCur;
779
780 /* Switch next token to current token and read in the next token. */
781 pThis->pTokCur = pThis->pTokNext;
782 pThis->pTokNext = pTokTmp;
783 if ( pThis->pTokCur->enmType != RTSCRIPTLEXTOKTYPE_EOS
784 && pThis->pTokCur->enmType != RTSCRIPTLEXTOKTYPE_ERROR)
785 rtScriptLexProduceToken(pThis, pThis->pTokNext);
786 else
787 pThis->pTokNext = pThis->pTokCur;
788 }
789
790 return pThis->pTokCur;
791}
792
793
794RTDECL(char) RTScriptLexConsumeCh(RTSCRIPTLEX hScriptLex)
795{
796 return RTScriptLexConsumeChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_DEFAULT);
797}
798
799
800RTDECL(char) RTScriptLexConsumeChEx(RTSCRIPTLEX hScriptLex, uint32_t fFlags)
801{
802 PRTSCRIPTLEXINT pThis = hScriptLex;
803 AssertPtrReturn(pThis, '\0');
804
805 pThis->pchCur++;
806 pThis->Pos.iCh++;
807 if (pThis->pchCur == &pThis->achBuf[pThis->cchBuf])
808 rtScriptLexFillBuffer(pThis);
809
810 return RTScriptLexGetChEx(pThis, fFlags);
811}
812
813
814RTDECL(char) RTScriptLexPeekCh(RTSCRIPTLEX hScriptLex, unsigned idx)
815{
816 return RTScriptLexPeekChEx(hScriptLex, idx, RTSCRIPT_LEX_CONV_F_DEFAULT);
817}
818
819
820RTDECL(char) RTScriptLexPeekChEx(RTSCRIPTLEX hScriptLex, unsigned idx, uint32_t fFlags)
821{
822 PRTSCRIPTLEXINT pThis = hScriptLex;
823 AssertPtrReturn(pThis, '\0');
824
825 /* Just return the character if it is in the current buffer. */
826 char ch = '\0';
827 if (RT_LIKELY(pThis->pchCur + idx < &pThis->achBuf[pThis->cchBuf]))
828 ch = pThis->pchCur[idx];
829 else
830 {
831 /* Slow path, read data into temporary buffer to read character from and dismiss. */
832 /** @todo */
833 AssertReleaseFailed();
834 }
835
836 if ( (pThis->pCfg->fFlags & RTSCRIPT_LEX_CFG_F_CASE_INSENSITIVE)
837 && !(fFlags & RTSCRIPT_LEX_CONV_F_NOTHING))
838 ch = RT_C_TO_LOWER(ch);
839
840 return ch;
841}
842
843
844RTDECL(char) RTScriptLexGetCh(RTSCRIPTLEX hScriptLex)
845{
846 return RTScriptLexPeekCh(hScriptLex, 0);
847}
848
849
850RTDECL(char) RTScriptLexGetChEx(RTSCRIPTLEX hScriptLex, uint32_t fFlags)
851{
852 return RTScriptLexPeekChEx(hScriptLex, 0, fFlags);
853}
854
855
856RTDECL(void) RTScriptLexSkipWhitespace(RTSCRIPTLEX hScriptLex)
857{
858 PRTSCRIPTLEXINT pThis = hScriptLex;
859 AssertPtrReturnVoid(pThis);
860
861 for (;;)
862 {
863 char ch = RTScriptLexGetCh(hScriptLex);
864
865 if (ch == '\0')
866 break;
867
868 /* Check for whitespace. */
869 const char *pszWs = pThis->pCfg->pszWhitespace ? pThis->pCfg->pszWhitespace : g_szWsDef;
870
871 if ( rtScriptLexLocateChInStrConsume(pThis, ch, pszWs)
872 || rtScriptLexIsNewlineConsume(pThis, ch)
873 || rtScriptLexIsMultiLineCommentConsume(pThis, ch)
874 || rtScriptLexIsSingleLineCommentConsume(pThis, ch))
875 continue;
876
877 /* All white space skipped, next is some real content. */
878 break;
879 }
880}
881
882
883RTDECL(int) RTScriptLexScanNumber(RTSCRIPTLEX hScriptLex, uint8_t uBase, bool fAllowReal,
884 PRTSCRIPTLEXTOKEN pTok)
885{
886 RT_NOREF(uBase, fAllowReal, pTok);
887 PRTSCRIPTLEXINT pThis = hScriptLex;
888 AssertPtrReturn(pThis, VERR_INVALID_POINTER);
889 AssertReturn(!fAllowReal, VERR_NOT_IMPLEMENTED);
890 AssertReturn(!uBase, VERR_NOT_IMPLEMENTED);
891
892 /** @todo r=aeichner Quick and dirty to have something working for the disassembler testcase.
893 * Among others it misses overflow handling. */
894 uBase = 10;
895 char ch = RTScriptLexGetCh(hScriptLex);
896 pTok->Type.Number.enmType = ch == '-'
897 ? RTSCRIPTLEXTOKNUMTYPE_INTEGER
898 : RTSCRIPTLEXTOKNUMTYPE_NATURAL;
899 if (ch == '-' || ch == '+')
900 ch = RTScriptLexConsumeCh(hScriptLex);
901
902 if (ch == '0')
903 {
904 /* Some hex prefix? */
905 if (RTScriptLexPeekCh(hScriptLex, 1) == 'x')
906 {
907 uBase = 16;
908 RTScriptLexConsumeCh(hScriptLex);
909 }
910 else /* Octal stuff. */
911 AssertFailedReturn(VERR_NOT_IMPLEMENTED);
912
913 ch = RTScriptLexConsumeCh(hScriptLex);
914 }
915
916 uint64_t u64 = 0;
917 for (;;)
918 {
919 if ( (ch < '0' || ch > '9')
920 && (ch < 'a' || ch > 'f' || uBase == 10))
921 {
922 if (pTok->Type.Number.enmType == RTSCRIPTLEXTOKNUMTYPE_INTEGER)
923 pTok->Type.Number.Type.i64 = -u64;
924 else
925 pTok->Type.Number.Type.u64 = u64;
926 pTok->enmType = RTSCRIPTLEXTOKTYPE_NUMBER;
927 pTok->PosEnd = pThis->Pos;
928 return VINF_SUCCESS;
929 }
930
931 if (ch >= '0' && ch <= '9')
932 u64 = (u64 * uBase) + (ch - '0');
933 else if (ch >= 'a' && ch <= 'f')
934 {
935 Assert(uBase == 16);
936 u64 = (u64 << 4) + 10 + (ch - 'a');
937 }
938
939 ch = RTScriptLexConsumeCh(hScriptLex);
940 }
941
942 return VINF_SUCCESS;
943}
944
945
946RTDECL(DECLCALLBACK(int)) RTScriptLexScanIdentifier(RTSCRIPTLEX hScriptLex, char ch,
947 PRTSCRIPTLEXTOKEN pTok, void *pvUser)
948{
949 PRTSCRIPTLEXINT pThis = hScriptLex;
950 AssertPtrReturn(pThis, VERR_INVALID_POINTER);
951
952 const char *pszCharSet = pvUser ? (const char *)pvUser : g_aszIdeCharSetDef;
953 char aszIde[513]; RT_ZERO(aszIde);
954 unsigned idx = 0;
955 aszIde[idx++] = ch;
956
957 ch = RTScriptLexGetCh(hScriptLex);
958 while ( idx < sizeof(aszIde) - 1
959 && rtScriptLexLocateChInStrConsume(hScriptLex, ch, pszCharSet))
960 {
961 aszIde[idx++] = ch;
962 ch = RTScriptLexGetCh(hScriptLex);
963 }
964
965 if ( idx == sizeof(aszIde) - 1
966 && rtScriptLexLocateChInStrConsume(hScriptLex, ch, pszCharSet))
967 return rtScriptLexProduceTokError(hScriptLex, pTok, VERR_BUFFER_OVERFLOW, "Lexer: Identifier exceeds the allowed length");
968
969 /* Insert into string cache. */
970 pTok->enmType = RTSCRIPTLEXTOKTYPE_IDENTIFIER;
971 pTok->Type.Id.pszIde = RTStrCacheEnterN(pThis->hStrCacheId, &aszIde[0], idx);
972 if (RT_UNLIKELY(!pTok->Type.Id.pszIde))
973 return rtScriptLexProduceTokError(hScriptLex, pTok, VERR_NO_STR_MEMORY, "Lexer: Out of memory inserting identifier into string cache");
974
975 pTok->PosEnd = pThis->Pos;
976 return VINF_SUCCESS;
977}
978
979
980/**
981 * Adds the given character to the string literal add the given position, assuring the string
982 * is always zero terminated.
983 *
984 * @returns IPRT status code.
985 * @param pThis The lexer state.
986 * @param ch The character to add.
987 * @param idx At which position to add the character in the string.
988 */
989static int rtScriptLexScanStringLiteralChAdd(PRTSCRIPTLEXINT pThis, char ch, uint32_t idx)
990{
991 int rc = VINF_SUCCESS;
992
993 if ( !pThis->cchStrLitMax
994 || idx >= pThis->cchStrLitMax - 1)
995 {
996 /* Increase memory. */
997 size_t cchMaxNew = pThis->cchStrLitMax + 64;
998 char *pszNew = NULL;
999 rc = RTStrRealloc(&pszNew, cchMaxNew * sizeof(char));
1000 if (RT_SUCCESS(rc))
1001 {
1002 pThis->pszStrLit = pszNew;
1003 pThis->cchStrLitMax = cchMaxNew;
1004 }
1005 }
1006
1007 if (RT_SUCCESS(rc))
1008 {
1009 pThis->pszStrLit[idx] = ch;
1010 pThis->pszStrLit[idx + 1] = '\0';
1011 }
1012
1013 return rc;
1014}
1015
1016
1017RTDECL(DECLCALLBACK(int)) RTScriptLexScanStringLiteralC(RTSCRIPTLEX hScriptLex, char ch,
1018 PRTSCRIPTLEXTOKEN pTok, void *pvUser)
1019{
1020 RT_NOREF(ch, pvUser);
1021 PRTSCRIPTLEXINT pThis = hScriptLex;
1022 AssertPtrReturn(pThis, VERR_INVALID_POINTER);
1023
1024 uint32_t idxChCur = 0;
1025 int rc = rtScriptLexScanStringLiteralChAdd(pThis, '\0', idxChCur);
1026 if (RT_FAILURE(rc))
1027 return rtScriptLexProduceTokError(hScriptLex, pTok, rc, "Lexer: Error adding character to string literal");
1028
1029 ch = RTScriptLexGetCh(hScriptLex);
1030 for (;;)
1031 {
1032 if (ch == '\0')
1033 return rtScriptLexProduceTokError(hScriptLex, pTok, VERR_EOF, "Lexer: End of stream before closing string literal terminal");
1034 else if (ch == '\"')
1035 {
1036 /* End of string, add it to the string literal cache and build the token. */
1037 pTok->enmType = RTSCRIPTLEXTOKTYPE_STRINGLIT;
1038 pTok->Type.StringLit.cchString = idxChCur;
1039 pTok->Type.StringLit.pszString = RTStrCacheEnterN(pThis->hStrCacheStringLit, pThis->pszStrLit, idxChCur);
1040 if (RT_UNLIKELY(!pTok->Type.StringLit.pszString))
1041 return rtScriptLexProduceTokError(hScriptLex, pTok, VERR_NO_STR_MEMORY, "Lexer: Error adding string literal to the cache");
1042 else
1043 break;
1044 }
1045 else if (ch == '\\')
1046 {
1047 /* Start of escape sequence. */
1048 RTScriptLexConsumeCh(hScriptLex);
1049 ch = RTScriptLexGetCh(hScriptLex);
1050 switch (ch)
1051 {
1052 case 'a': /* Alert (Bell) */
1053 ch = 0x07;
1054 break;
1055 case 'b': /* Backspace */
1056 ch = 0x08;
1057 break;
1058 case 'e': /* Escape character */
1059 ch = 0x1b;
1060 break;
1061 case 'f': /* Formfeed */
1062 ch = 0x0c;
1063 break;
1064 case 'n': /* Newline (line freed) */
1065 ch = 0x0a;
1066 break;
1067 case 'r': /* Carriage return */
1068 ch = 0x0d;
1069 break;
1070 case 't': /* Horizontal tab */
1071 ch = 0x09;
1072 break;
1073 case 'v': /* Vertical tab */
1074 ch = 0x0b;
1075 break;
1076 case '\\':
1077 case '\'':
1078 case '\"':
1079 case '\?':
1080 /* Can be added as is. */
1081 break;
1082 case 'x': /* Hexdecimal byte. */
1083 case '0': /* Octal */
1084 case '1':
1085 case '2':
1086 case '3':
1087 case '4':
1088 case '5':
1089 case '6':
1090 case '7':
1091 case '8':
1092 case '9':
1093 case 'u': /* Unicode point below 10000 */
1094 case 'U': /* Unicode point */
1095 default:
1096 /* Not supported for now. */
1097 return rtScriptLexProduceTokError(hScriptLex, pTok, VERR_NOT_SUPPORTED, "Lexer: Invalid/unsupported escape sequence");
1098 }
1099 }
1100
1101 rc = rtScriptLexScanStringLiteralChAdd(pThis, ch, idxChCur);
1102 if (RT_SUCCESS(rc))
1103 idxChCur++;
1104 else
1105 return rtScriptLexProduceTokError(hScriptLex, pTok, rc, "Lexer: Error adding character to string literal");
1106
1107 ch = RTScriptLexConsumeChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1108 }
1109
1110 pTok->PosEnd = pThis->Pos;
1111 return VINF_SUCCESS;
1112}
1113
1114
1115RTDECL(DECLCALLBACK(int)) RTScriptLexScanStringLiteralPascal(RTSCRIPTLEX hScriptLex, char ch,
1116 PRTSCRIPTLEXTOKEN pTok, void *pvUser)
1117{
1118 RT_NOREF(ch, pvUser);
1119 PRTSCRIPTLEXINT pThis = hScriptLex;
1120 AssertPtrReturn(pThis, VERR_INVALID_POINTER);
1121
1122 uint32_t idxChCur = 0;
1123 int rc = rtScriptLexScanStringLiteralChAdd(pThis, '\0', idxChCur);
1124 if (RT_FAILURE(rc))
1125 return rtScriptLexProduceTokError(hScriptLex, pTok, rc, "Lexer: Error adding character to string literal");
1126
1127 ch = RTScriptLexGetChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1128 for (;;)
1129 {
1130 if (ch == '\0')
1131 return rtScriptLexProduceTokError(hScriptLex, pTok, VERR_EOF, "Lexer: End of stream before closing string literal terminal");
1132 else if (ch == '\'')
1133 {
1134 /*
1135 * Check whether there is a second ' coming afterwards used for
1136 * escaping ' characters.
1137 */
1138 ch = RTScriptLexConsumeChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1139 if (ch != '\'')
1140 {
1141 /* End of string, add it to the string literal cache and build the token. */
1142 pTok->enmType = RTSCRIPTLEXTOKTYPE_STRINGLIT;
1143 pTok->Type.StringLit.cchString = idxChCur;
1144 pTok->Type.StringLit.pszString = RTStrCacheEnterN(pThis->hStrCacheStringLit, pThis->pszStrLit, idxChCur);
1145 if (RT_UNLIKELY(!pTok->Type.StringLit.pszString))
1146 return rtScriptLexProduceTokError(hScriptLex, pTok, VERR_NO_STR_MEMORY, "Lexer: Error adding string literal to the cache");
1147 else
1148 break;
1149 }
1150 /* else: Fall through and add the character to the string literal..*/
1151 }
1152
1153 rc = rtScriptLexScanStringLiteralChAdd(pThis, ch, idxChCur);
1154 if (RT_SUCCESS(rc))
1155 idxChCur++;
1156 else
1157 return rtScriptLexProduceTokError(hScriptLex, pTok, rc, "Lexer: Error adding character to string literal");
1158 ch = RTScriptLexConsumeChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1159 }
1160
1161 pTok->PosEnd = pThis->Pos;
1162 return VINF_SUCCESS;
1163}
1164
Note: See TracBrowser for help on using the repository browser.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette