scriptlex.cpp@ 107455

Last change on this file since 107455 was 106061, checked in by vboxsync, 5 months ago
Copyright year updates by scm.
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 37.4 KB

Line
1	/* $Id: scriptlex.cpp 106061 2024-09-16 14:03:52Z vboxsync $ */
2	/** @file
3	* IPRT - RTScript* lexer API.
4	*/
5
6	/*
7	* Copyright (C) 2022-2024 Oracle and/or its affiliates.
8	*
9	* This file is part of VirtualBox base platform packages, as
10	* available from https://www.virtualbox.org.
11	*
12	* This program is free software; you can redistribute it and/or
13	* modify it under the terms of the GNU General Public License
14	* as published by the Free Software Foundation, in version 3 of the
15	* License.
16	*
17	* This program is distributed in the hope that it will be useful, but
18	* WITHOUT ANY WARRANTY; without even the implied warranty of
19	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20	* General Public License for more details.
21	*
22	* You should have received a copy of the GNU General Public License
23	* along with this program; if not, see <https://www.gnu.org/licenses>.
24	*
25	* The contents of this file may alternatively be used under the terms
26	* of the Common Development and Distribution License Version 1.0
27	* (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
28	* in the VirtualBox distribution, in which case the provisions of the
29	* CDDL are applicable instead of those of the GPL.
30	*
31	* You may elect to license modified versions of this file under the
32	* terms and conditions of either the GPL or the CDDL or both.
33	*
34	* SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
35	*/
36
37
38	/*********************************************************************************************************************************
39	* Header Files *
40	*********************************************************************************************************************************/
41	#define LOG_GROUP RTLOGGROUP_DEFAULT /// @todo
42	#include <iprt/script.h>
43
44	#include <iprt/assert.h>
45	#include <iprt/ctype.h>
46	#include <iprt/err.h>
47	#include <iprt/file.h>
48	#include <iprt/log.h>
49	#include <iprt/mem.h>
50	#include <iprt/string.h>
51
52
53	/*********************************************************************************************************************************
54	* Structures and Typedefs *
55	*********************************************************************************************************************************/
56
57	/**
58	* Internal lexer state.
59	*/
60	typedef struct RTSCRIPTLEXINT
61	{
62	/** Magic. */
63	uint32_t u32Magic;
64	/** Source position. */
65	RTSCRIPTPOS Pos;
66	/** Current and next token buffer. */
67	RTSCRIPTLEXTOKEN aToks[2];
68	/** Pointer to the current token. */
69	PRTSCRIPTLEXTOKEN pTokCur;
70	/** Pointer to the next token. */
71	PRTSCRIPTLEXTOKEN pTokNext;
72	/** The lexer config. */
73	PCRTSCRIPTLEXCFG pCfg;
74	/** The input reader. */
75	PFNRTSCRIPTLEXRDR pfnReader;
76	/** The destructor callback. */
77	PFNRTSCRIPTLEXDTOR pfnDtor;
78	/** Opaque user data for the reader. */
79	void *pvUser;
80	/** Identifier string cache. */
81	RTSTRCACHE hStrCacheId;
82	/** String literal string cache. */
83	RTSTRCACHE hStrCacheStringLit;
84	/** Status code from the reader. */
85	int rcRdr;
86	/** Internal error info. */
87	RTERRINFOSTATIC ErrInfo;
88	/** Lexer flags. */
89	uint32_t fFlags;
90	/** Maximum numebr of bytes allocated for temporary storage for literal strings. */
91	size_t cchStrLitMax;
92	/** Pointer to the string buffer for holding the literal string. */
93	char *pszStrLit;
94	/** Pointer to the current input character. */
95	const char *pchCur;
96	/** Offset to start reading the next chunk from. */
97	size_t offBufRead;
98	/** Size of the input buffer. */
99	size_t cchBuf;
100	/** The cached part of the input, variable in size. */
101	char achBuf[1];
102	} RTSCRIPTLEXINT;
103	/** Pointer to the internal lexer state. */
104	typedef RTSCRIPTLEXINT *PRTSCRIPTLEXINT;
105
106
/** @name RTSCRIPT_LEX_INT_F_XXX - Internal lexer flags (RTSCRIPTLEXINT::fFlags).
 * @{ */
/** The identifier string cache is destroyed together with the lexer. */
#define RTSCRIPT_LEX_INT_F_STR_CACHE_ID_FREE                RT_BIT_32(0)
/** The string literal string cache is destroyed together with the lexer. */
#define RTSCRIPT_LEX_INT_F_STR_CACHE_STR_LIT_FREE           RT_BIT_32(1)
/** The reader signalled the end of the input stream. */
#define RTSCRIPT_LEX_INT_F_EOS                              RT_BIT_32(2)
/** @} */
113
114
115	/*********************************************************************************************************************************
116	* Global Variables *
117	*********************************************************************************************************************************/
118
/** Whitespace characters used when the config does not supply its own set. */
static const char *g_szWsDef = " \t";

/** Newline sequences used when the config does not supply its own set (NULL terminated). */
static const char *g_aszNlDef[] =
{
    "\n",
    "\r\n",
    NULL
};

/** Characters making up an identifier when the caller does not supply a set. */
static const char *g_aszIdeCharSetDef = "_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
130
131
132	/*********************************************************************************************************************************
133	* Internal Functions *
134	*********************************************************************************************************************************/
135
136
137	/**
138	* Locates the given character in the string, consuming it if found.
139	*
140	* @returns Flag whether the character was found in the string.
141	* @param pThis The lexer state.
142	* @param ch The character to check for.
143	* @param psz The string to check.
144	*/
145	DECLINLINE(bool) rtScriptLexLocateChInStrConsume(PRTSCRIPTLEXINT pThis, char ch, const char *psz)
146	{
147	while ( *psz != '\0'
148	&& *psz != ch)
149	psz++;
150
151	if (*psz != '\0')
152	RTScriptLexConsumeCh(pThis);
153
154	return *psz != '\0';
155	}
156
157
158	/**
159	* Matches the input against the given string starting with the given character, consuming it
160	* if found.
161	*
162	* @returns Flag whether there was a match.
163	* @param pThis The lexer state.
164	* @param ch The character to check start matching.
165	* @param psz The string to match against.
166	* @param pszExclude When the string matched but the input continues
167	* with one of the characters in this string there will
168	* be no match.
169	*/
170	DECLINLINE(bool) rtScriptLexMatchStrConsume(PRTSCRIPTLEXINT pThis, char ch, const char *psz,
171	const char *pszExclude)
172	{
173	bool fMatch = false;
174	if (*psz == ch)
175	{
176	unsigned offPeek = 1;
177
178	psz++;
179	while ( *psz != '\0'
180	&& *psz == RTScriptLexPeekCh(pThis, offPeek))
181	{
182	offPeek++;
183	psz++;
184	}
185
186	if (*psz == '\0')
187	{
188	if (pszExclude)
189	{
190	ch = RTScriptLexPeekCh(pThis, offPeek);
191	fMatch = strchr(pszExclude, ch) == NULL;
192	}
193	else
194	fMatch = true;
195	}
196
197	if (fMatch)
198	{
199	/* Match, consume everything. */
200	while (offPeek-- > 0)
201	RTScriptLexConsumeCh(pThis);
202	}
203	}
204
205	return fMatch;
206	}
207
208
209	/**
210	* Tries to locate a string with the given starting character (+ peeking ahead) in the
211	* given string array (exact match) and consumes the entire substring.
212	*
213	* @returns Flag whether there was a match.
214	* @param pThis The lexer state.
215	* @param ch The character to check for.
216	* @param papsz Pointer to the string array to check for.
217	* @param pidx Where to store the index of the matching substring if found,
218	* optional.
219	*/
220	DECLINLINE(bool) rtScriptLexLocateSubStrInStrArrayMatchConsume(PRTSCRIPTLEXINT pThis, char ch,
221	const char *papsz, unsigned pidx)
222	{
223	unsigned int idx = 0;
224
225	while ( papsz[idx] != NULL
226	&& !rtScriptLexMatchStrConsume(pThis, ch, papsz[idx], NULL))
227	idx++;
228
229	if ( papsz[idx] != NULL
230	&& pidx)
231	*pidx = idx;
232
233	return papsz[idx] != NULL;
234	}
235
236
237	/**
238	* Tries to get an exact match starting with the given character, consuming it when found.
239	*
240	* @returns Flag whether there was a match.
241	* @param pThis The lexer state.
242	* @param ch The character to check for.
243	* @param ppMatch Where to store the exact match on success.
244	*/
245	DECLINLINE(bool) rtScriptLexLocateExactMatchConsume(PRTSCRIPTLEXINT pThis, char ch, PCRTSCRIPTLEXTOKMATCH *ppMatch)
246	{
247	PCRTSCRIPTLEXTOKMATCH pTokMatch = pThis->pCfg->paTokMatches;
248
249	if (pTokMatch)
250	{
251	while ( pTokMatch->pszMatch != NULL
252	&& !rtScriptLexMatchStrConsume(pThis, ch, pTokMatch->pszMatch,
253	pTokMatch->fMaybeIdentifier
254	? g_aszIdeCharSetDef
255	: NULL))
256	pTokMatch++;
257
258	if (pTokMatch->pszMatch != NULL)
259	{
260	*ppMatch = pTokMatch;
261	return true;
262	}
263	}
264
265	return false;
266	}
267
268
269	DECLINLINE(bool) rtScriptLexIsNewlineConsume(PRTSCRIPTLEXINT pThis, char ch)
270	{
271	const char **papszNl = pThis->pCfg->pszWhitespace ? pThis->pCfg->papszNewline : g_aszNlDef;
272
273	bool fMatched = rtScriptLexLocateSubStrInStrArrayMatchConsume(pThis, ch, papszNl, NULL);
274	if (fMatched)
275	{
276	pThis->Pos.iLine++;
277	pThis->Pos.iCh = 1;
278	}
279
280	return fMatched;
281	}
282
283
284	/**
285	* Checks whether the character is the beginning of a multi line comment, skipping the whole
286	* comment if necessary.
287	*
288	* @returns Flag whether a multi line comment was detected and consumed.
289	* @param hScriptLex The lexer state.
290	* @param ch The character to check for.
291	*/
292	DECLINLINE(bool) rtScriptLexIsMultiLineCommentConsume(PRTSCRIPTLEXINT pThis, char ch)
293	{
294	const char **papszCommentMultiStart = pThis->pCfg->papszCommentMultiStart;
295	unsigned idxComment = 0;
296
297	if ( papszCommentMultiStart
298	&& rtScriptLexLocateSubStrInStrArrayMatchConsume(pThis, ch, papszCommentMultiStart,
299	&idxComment))
300	{
301	/* Look for the matching closing lexeme in the input consuming everything along the way. */
302	const char *pszClosing = pThis->pCfg->papszCommentMultiEnd[idxComment];
303
304	for (;;)
305	{
306	char chTmp = RTScriptLexGetCh(pThis);
307
308	/* Check for new lines explicetly to advance the position information. */
309	if (rtScriptLexIsNewlineConsume(pThis, chTmp))
310	continue;
311
312	/** @todo Not quite correct when there is an end of stream before the closing lexeme.
313	* But doesn't hurt at the moment. */
314	if ( chTmp == '\0'
315	\|\| rtScriptLexMatchStrConsume(pThis, chTmp, pszClosing, NULL))
316	break;
317
318	RTScriptLexConsumeCh(pThis);
319	}
320
321	return true;
322	}
323
324	return false;
325	}
326
327
328	/**
329	* Checks whether the character is the beginning of a single line comment, skipping the whole
330	* comment if necessary.
331	*
332	* @returns Flag whether a single line comment was detected and consumed.
333	* @param hScriptLex The lexer state.
334	* @param ch The character to check for.
335	*/
336	DECLINLINE(bool) rtScriptLexIsSingleLineCommentConsume(PRTSCRIPTLEXINT pThis, char ch)
337	{
338	const char **papszCommentSingleStart = pThis->pCfg->papszCommentSingleStart;
339
340	if ( papszCommentSingleStart
341	&& rtScriptLexLocateSubStrInStrArrayMatchConsume(pThis, ch, papszCommentSingleStart,
342	NULL))
343	{
344	for (;;)
345	{
346	char chTmp = RTScriptLexGetCh(pThis);
347
348	if ( chTmp == '\0'
349	\|\| rtScriptLexIsNewlineConsume(pThis, chTmp))
350	break;
351
352	RTScriptLexConsumeCh(pThis);
353	}
354
355	return true;
356	}
357
358	return false;
359	}
360
361
362	/**
363	* Fills the input buffer with source data.
364	*
365	* @returns IPRT status code.
366	* @param pThis The lexer state.
367	*/
368	static int rtScriptLexFillBuffer(PRTSCRIPTLEXINT pThis)
369	{
370	int rc = VINF_SUCCESS;
371	size_t cchToRead = pThis->cchBuf;
372	char *pchRead = &pThis->achBuf[0];
373
374	AssertReturn(!(pThis->fFlags & RTSCRIPT_LEX_INT_F_EOS), VERR_INVALID_STATE);
375
376	/* If there is input left to process move it to the front and fill the remainder. */
377	if (pThis->pchCur != NULL)
378	{
379	cchToRead = pThis->pchCur - &pThis->achBuf[0];
380	/* Move the rest to the front. */
381	memmove(&pThis->achBuf[0], pThis->pchCur, pThis->cchBuf - cchToRead);
382	pchRead = (char *)pThis->pchCur + 1;
383	memset(pchRead, 0, cchToRead);
384	}
385
386	if (cchToRead)
387	{
388	pThis->pchCur = &pThis->achBuf[0];
389
390	size_t cchRead = 0;
391	rc = pThis->pfnReader(pThis, pThis->offBufRead, pchRead, cchToRead, &cchRead, pThis->pvUser);
392	if (RT_SUCCESS(rc))
393	{
394	pThis->offBufRead += cchRead;
395	if (rc == VINF_EOF)
396	pThis->fFlags \|= RTSCRIPT_LEX_INT_F_EOS;
397	rc = VINF_SUCCESS;
398	}
399	else
400	pThis->rcRdr = rc;
401	}
402	else
403	rc = VERR_BUFFER_OVERFLOW; /** @todo */
404
405	return rc;
406	}
407
408
409	/**
410	* Produce an end of stream token.
411	*
412	* @returns nothing.
413	* @param pThis The lexer state.
414	* @param pTok The token to fill.
415	*/
416	static void rtScriptLexProduceTokEos(PRTSCRIPTLEXINT pThis, PRTSCRIPTLEXTOKEN pTok)
417	{
418	pTok->enmType = RTSCRIPTLEXTOKTYPE_EOS;
419	pTok->PosStart = pThis->Pos;
420	pTok->PosEnd = pThis->Pos;
421	}
422
423
424	/**
425	* Produce an error token with the given error message.
426	*
427	* @returns IPRT status code.
428	* @param pThis The lexer state.
429	* @param pTok The token to fill.
430	* @param rc The status code to use in the message.
431	* @param pszMsg The format string for the error message.
432	* @param ... Arguments to the format string.
433	*/
434	static int rtScriptLexProduceTokError(PRTSCRIPTLEXINT pThis, PRTSCRIPTLEXTOKEN pTok,
435	int rc, const char *pszMsg, ...)
436	{
437	va_list va;
438	va_start(va, pszMsg);
439
440	pTok->enmType = RTSCRIPTLEXTOKTYPE_ERROR;
441	pTok->PosEnd = pThis->Pos;
442	pTok->Type.Error.pErr = &pThis->ErrInfo.Core;
443
444	RTErrInfoInitStatic(&pThis->ErrInfo);
445	RTErrInfoSetV(&pThis->ErrInfo.Core, rc, pszMsg, va);
446	va_end(va);
447
448	return rc;
449	}
450
451
452	/**
453	* Create the token from the exact match.
454	*
455	* @returns nothing.
456	* @param pThis The lexer state.
457	* @param pTok The token to fill.
458	* @param pMatch The matched string.
459	*/
460	static void rtScriptLexProduceTokFromExactMatch(PRTSCRIPTLEXINT pThis, PRTSCRIPTLEXTOKEN pTok,
461	PCRTSCRIPTLEXTOKMATCH pMatch)
462	{
463	pTok->enmType = pMatch->enmTokType;
464	pTok->PosEnd = pThis->Pos;
465
466	switch (pTok->enmType)
467	{
468	case RTSCRIPTLEXTOKTYPE_OPERATOR:
469	pTok->Type.Operator.pOp = pMatch;
470	break;
471	case RTSCRIPTLEXTOKTYPE_KEYWORD:
472	pTok->Type.Keyword.pKeyword = pMatch;
473	break;
474	case RTSCRIPTLEXTOKTYPE_PUNCTUATOR:
475	pTok->Type.Punctuator.pPunctuator = pMatch;
476	break;
477	default:
478	rtScriptLexProduceTokError(pThis, pTok, VERR_INVALID_PARAMETER,
479	"Lexer: The match contains an invalid token type: %d\n",
480	pTok->enmType);
481	}
482	}
483
484
485	/**
486	* Goes through the rules trying to find a matching one.
487	*
488	* @returns Flag whether a matching rule was found.
489	* @param pThis The lexer state.
490	* @param ch The character to check.
491	* @param pTok The token to fill.
492	*/
493	static bool rtScriptLexProduceTokFromRules(PRTSCRIPTLEXINT pThis, char ch, PRTSCRIPTLEXTOKEN pTok)
494	{
495	PCRTSCRIPTLEXRULE pRule = pThis->pCfg->paRules;
496
497	if (pRule)
498	{
499	while (pRule->pfnProd != NULL)
500	{
501	if ( ch >= pRule->chStart
502	&& ch <= pRule->chEnd)
503	{
504	if (pRule->fFlags & RTSCRIPT_LEX_RULE_CONSUME)
505	RTScriptLexConsumeCh(pThis);
506	int rc = pRule->pfnProd(pThis, ch, pTok, pRule->pvUser);
507	AssertRC(rc);
508	return true;
509	}
510
511	pRule++;
512	}
513	}
514
515	return false;
516	}
517
518
519	/**
520	* Fills in the given token from the scanned input at the current location.
521	*
522	* @returns IPRT status code.
523	* @param pThis The lexer state.
524	* @param pTok The token to fill.
525	*/
526	static int rtScriptLexProduceToken(PRTSCRIPTLEXINT pThis, PRTSCRIPTLEXTOKEN pTok)
527	{
528	RTScriptLexSkipWhitespace(pThis);
529
530	pTok->PosStart = pThis->Pos;
531
532	char ch = RTScriptLexGetCh(pThis);
533	PCRTSCRIPTLEXTOKMATCH pMatch = NULL;
534	if (ch == '\0')
535	rtScriptLexProduceTokEos(pThis, pTok);
536	else if (rtScriptLexLocateExactMatchConsume(pThis, ch, &pMatch))
537	rtScriptLexProduceTokFromExactMatch(pThis, pTok, pMatch);
538	else if (!rtScriptLexProduceTokFromRules(pThis, ch, pTok))
539	{
540	if (pThis->pCfg->pfnProdDef)
541	pThis->rcRdr = pThis->pCfg->pfnProdDef(pThis, ch, pTok, pThis->pCfg->pvProdDefUser);
542	else
543	rtScriptLexProduceTokError(pThis, pTok, VERR_INVALID_PARAMETER,
544	"Lexer: Invalid character found in input: %c\n",
545	ch);
546	}
547
548	return pThis->rcRdr;
549	}
550
551
552	/**
553	* Populates the lexer for the initial use.
554	*
555	* @returns IPRT status code.
556	* @param pThis The lexer state.
557	*/
558	static int rtScriptLexPopulate(PRTSCRIPTLEXINT pThis)
559	{
560	int rc = rtScriptLexFillBuffer(pThis);
561	if (RT_SUCCESS(rc))
562	{
563	rc = rtScriptLexProduceToken(pThis, pThis->pTokCur);
564	if (RT_SUCCESS(rc))
565	rc = rtScriptLexProduceToken(pThis, pThis->pTokNext);
566	}
567
568	return rc;
569	}
570
571
572
573	RTDECL(int) RTScriptLexCreateFromReader(PRTSCRIPTLEX phScriptLex, PFNRTSCRIPTLEXRDR pfnReader,
574	PFNRTSCRIPTLEXDTOR pfnDtor, void *pvUser,
575	size_t cchBuf, PRTSTRCACHE phStrCacheId, PRTSTRCACHE phStrCacheStringLit,
576	PCRTSCRIPTLEXCFG pCfg)
577	{
578	AssertPtrReturn(phScriptLex, VERR_INVALID_POINTER);
579	AssertPtrReturn(pfnReader, VERR_INVALID_POINTER);
580	AssertPtrReturn(pCfg, VERR_INVALID_POINTER);
581
582	if (!cchBuf)
583	cchBuf = _16K;
584	int rc = VINF_SUCCESS;
585	PRTSCRIPTLEXINT pThis = (PRTSCRIPTLEXINT)RTMemAllocZ(RT_UOFFSETOF_DYN(RTSCRIPTLEXINT, achBuf[cchBuf]));
586	if (RT_LIKELY(pThis))
587	{
588	pThis->u32Magic = 0xfefecafe; /** @todo */
589	pThis->Pos.iLine = 1;
590	pThis->Pos.iCh = 1;
591	pThis->pTokCur = &pThis->aToks[0];
592	pThis->pTokNext = &pThis->aToks[1];
593	pThis->pCfg = pCfg;
594	pThis->pfnReader = pfnReader;
595	pThis->pfnDtor = pfnDtor;
596	pThis->pvUser = pvUser;
597	pThis->fFlags = 0;
598	pThis->cchStrLitMax = 0;
599	pThis->pszStrLit = NULL;
600	pThis->cchBuf = cchBuf;
601	pThis->offBufRead = 0;
602	pThis->pchCur = NULL;
603	pThis->hStrCacheId = NULL;
604	pThis->hStrCacheStringLit = NULL;
605
606	rc = RTStrCacheCreate(&pThis->hStrCacheId, "LEX-Ide");
607	if (RT_SUCCESS(rc))
608	{
609	rc = RTStrCacheCreate(&pThis->hStrCacheStringLit, "LEX-StrLit");
610	if (RT_SUCCESS(rc))
611	{
612	rc = rtScriptLexPopulate(pThis);
613	if (RT_SUCCESS(rc))
614	{
615	*phScriptLex = pThis;
616
617	if (phStrCacheId)
618	*phStrCacheId = pThis->hStrCacheId;
619	else
620	pThis->fFlags \|= RTSCRIPT_LEX_INT_F_STR_CACHE_ID_FREE;
621
622	if (phStrCacheStringLit)
623	*phStrCacheStringLit = pThis->hStrCacheStringLit;
624	else
625	pThis->fFlags \|= RTSCRIPT_LEX_INT_F_STR_CACHE_STR_LIT_FREE;
626
627	return VINF_SUCCESS;
628	}
629
630	RTStrCacheDestroy(pThis->hStrCacheStringLit);
631	}
632
633	RTStrCacheDestroy(pThis->hStrCacheId);
634	}
635
636	RTMemFree(pThis);
637	}
638	else
639	rc = VERR_NO_MEMORY;
640
641	return rc;
642	}
643
644
645	/**
646	* @callback_method_impl{FNRTSCRIPTLEXRDR, Worker to read from a string.}
647	*/
648	static DECLCALLBACK(int) rtScriptLexReaderStr(RTSCRIPTLEX hScriptLex, size_t offBuf, char *pchCur,
649	size_t cchBuf, size_t pcchRead, void pvUser)
650	{
651	RT_NOREF(hScriptLex);
652
653	const char psz = (const char )pvUser;
654	size_t cch = strlen(psz);
655	size_t cchCopy = RT_MIN(cchBuf, cch - offBuf);
656	int rc = VINF_SUCCESS;
657
658	*pcchRead = cchCopy;
659
660	if (cchCopy)
661	memcpy(pchCur, &psz[offBuf], cchCopy * sizeof(char));
662	else
663	rc = VINF_EOF;
664
665	return rc;
666	}
667
668
669	RTDECL(int) RTScriptLexCreateFromString(PRTSCRIPTLEX phScriptLex, const char *pszSrc, PRTSTRCACHE phStrCacheId,
670	PRTSTRCACHE phStrCacheStringLit, PCRTSCRIPTLEXCFG pCfg)
671	{
672	return RTScriptLexCreateFromReader(phScriptLex, rtScriptLexReaderStr, NULL, (void *)pszSrc, 0,
673	phStrCacheId, phStrCacheStringLit, pCfg);
674	}
675
676
677	/**
678	* @callback_method_impl{FNRTSCRIPTLEXRDR, Worker to read from a file.}
679	*/
680	static DECLCALLBACK(int) rtScriptLexReaderFile(RTSCRIPTLEX hScriptLex, size_t offBuf, char *pchCur,
681	size_t cchBuf, size_t pcchRead, void pvUser)
682	{
683	RT_NOREF(hScriptLex);
684
685	RTFILE hFile = (RTFILE)pvUser;
686	return RTFileReadAt(hFile, offBuf, pchCur, cchBuf, pcchRead);
687	}
688
689
690	/**
691	* @callback_method_impl{FNRTSCRIPTLEXDTOR, Destructor for the file variant.}
692	*/
693	static DECLCALLBACK(void) rtScriptLexDtorFile(RTSCRIPTLEX hScriptLex, void *pvUser)
694	{
695	RT_NOREF(hScriptLex);
696
697	RTFILE hFile = (RTFILE)pvUser;
698	RTFileClose(hFile);
699	}
700
701
702	RTDECL(int) RTScriptLexCreateFromFile(PRTSCRIPTLEX phScriptLex, const char *pszFilename, PRTSTRCACHE phStrCacheId,
703	PRTSTRCACHE phStrCacheStringLit, PCRTSCRIPTLEXCFG pCfg)
704	{
705	RTFILE hFile;
706	int rc = RTFileOpen(&hFile, pszFilename, RTFILE_O_READ \| RTFILE_O_DENY_WRITE \| RTFILE_O_OPEN);
707	if (RT_SUCCESS(rc))
708	{
709	rc = RTScriptLexCreateFromReader(phScriptLex, rtScriptLexReaderFile, rtScriptLexDtorFile, (void *)hFile, 0,
710	phStrCacheId, phStrCacheStringLit, pCfg);
711	if (RT_FAILURE(rc))
712	RTFileClose(hFile);
713	}
714
715	return rc;
716	}
717
718
719	RTDECL(void) RTScriptLexDestroy(RTSCRIPTLEX hScriptLex)
720	{
721	PRTSCRIPTLEXINT pThis = hScriptLex;
722	AssertPtrReturnVoid(pThis);
723
724	if (pThis->pfnDtor)
725	pThis->pfnDtor(pThis, pThis->pvUser);
726
727	if (pThis->fFlags & RTSCRIPT_LEX_INT_F_STR_CACHE_ID_FREE)
728	RTStrCacheDestroy(pThis->hStrCacheId);
729	if (pThis->fFlags & RTSCRIPT_LEX_INT_F_STR_CACHE_STR_LIT_FREE)
730	RTStrCacheDestroy(pThis->hStrCacheStringLit);
731
732	if (pThis->pszStrLit)
733	RTStrFree(pThis->pszStrLit);
734
735	RTMemFree(pThis);
736	}
737
738
739	RTDECL(int) RTScriptLexQueryToken(RTSCRIPTLEX hScriptLex, PCRTSCRIPTLEXTOKEN *ppToken)
740	{
741	PRTSCRIPTLEXINT pThis = hScriptLex;
742	AssertPtrReturn(pThis, VERR_INVALID_HANDLE);
743	AssertPtrReturn(ppToken, VERR_INVALID_POINTER);
744
745	if (RT_SUCCESS(pThis->rcRdr))
746	*ppToken = pThis->pTokCur;
747
748	return pThis->rcRdr;
749	}
750
751
752	RTDECL(RTSCRIPTLEXTOKTYPE) RTScriptLexGetTokenType(RTSCRIPTLEX hScriptLex)
753	{
754	PRTSCRIPTLEXINT pThis = hScriptLex;
755	AssertPtrReturn(pThis, RTSCRIPTLEXTOKTYPE_INVALID);
756
757	if (RT_SUCCESS(pThis->rcRdr))
758	return pThis->pTokCur->enmType;
759
760	return RTSCRIPTLEXTOKTYPE_INVALID;
761	}
762
763
764	RTDECL(RTSCRIPTLEXTOKTYPE) RTScriptLexPeekNextTokenType(RTSCRIPTLEX hScriptLex)
765	{
766	PRTSCRIPTLEXINT pThis = hScriptLex;
767	AssertPtrReturn(pThis, RTSCRIPTLEXTOKTYPE_INVALID);
768
769	if (RT_SUCCESS(pThis->rcRdr))
770	return pThis->pTokNext->enmType;
771
772	return RTSCRIPTLEXTOKTYPE_INVALID;
773	}
774
775
776	RTDECL(PCRTSCRIPTLEXTOKEN) RTScriptLexConsumeToken(RTSCRIPTLEX hScriptLex)
777	{
778	PRTSCRIPTLEXINT pThis = hScriptLex;
779	AssertPtrReturn(pThis, NULL);
780
781	/*
782	* Stop token production as soon as the current token indicates the
783	* end of the stream or an error
784	*/
785	if ( pThis->pTokCur->enmType != RTSCRIPTLEXTOKTYPE_EOS
786	&& pThis->pTokCur->enmType != RTSCRIPTLEXTOKTYPE_ERROR)
787	{
788	PRTSCRIPTLEXTOKEN pTokTmp = pThis->pTokCur;
789
790	/* Switch next token to current token and read in the next token. */
791	pThis->pTokCur = pThis->pTokNext;
792	pThis->pTokNext = pTokTmp;
793	if ( pThis->pTokCur->enmType != RTSCRIPTLEXTOKTYPE_EOS
794	&& pThis->pTokCur->enmType != RTSCRIPTLEXTOKTYPE_ERROR)
795	rtScriptLexProduceToken(pThis, pThis->pTokNext);
796	else
797	pThis->pTokNext = pThis->pTokCur;
798	}
799
800	return pThis->pTokCur;
801	}
802
803
804	RTDECL(char) RTScriptLexConsumeCh(RTSCRIPTLEX hScriptLex)
805	{
806	return RTScriptLexConsumeChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_DEFAULT);
807	}
808
809
810	RTDECL(char) RTScriptLexConsumeChEx(RTSCRIPTLEX hScriptLex, uint32_t fFlags)
811	{
812	PRTSCRIPTLEXINT pThis = hScriptLex;
813	AssertPtrReturn(pThis, '\0');
814
815	pThis->pchCur++;
816	pThis->Pos.iCh++;
817	if (pThis->pchCur == &pThis->achBuf[pThis->cchBuf])
818	rtScriptLexFillBuffer(pThis);
819
820	return RTScriptLexGetChEx(pThis, fFlags);
821	}
822
823
824	RTDECL(char) RTScriptLexPeekCh(RTSCRIPTLEX hScriptLex, unsigned idx)
825	{
826	return RTScriptLexPeekChEx(hScriptLex, idx, RTSCRIPT_LEX_CONV_F_DEFAULT);
827	}
828
829
830	RTDECL(char) RTScriptLexPeekChEx(RTSCRIPTLEX hScriptLex, unsigned idx, uint32_t fFlags)
831	{
832	PRTSCRIPTLEXINT pThis = hScriptLex;
833	AssertPtrReturn(pThis, '\0');
834
835	/* Just return the character if it is in the current buffer. */
836	char ch = '\0';
837	if (RT_LIKELY(pThis->pchCur + idx < &pThis->achBuf[pThis->cchBuf]))
838	ch = pThis->pchCur[idx];
839	else
840	{
841	/* Slow path, read data into temporary buffer to read character from and dismiss. */
842	/** @todo */
843	AssertReleaseFailed();
844	}
845
846	if ( (pThis->pCfg->fFlags & RTSCRIPT_LEX_CFG_F_CASE_INSENSITIVE)
847	&& !(fFlags & RTSCRIPT_LEX_CONV_F_NOTHING))
848	ch = RT_C_TO_LOWER(ch);
849
850	return ch;
851	}
852
853
854	RTDECL(char) RTScriptLexGetCh(RTSCRIPTLEX hScriptLex)
855	{
856	return RTScriptLexPeekCh(hScriptLex, 0);
857	}
858
859
860	RTDECL(char) RTScriptLexGetChEx(RTSCRIPTLEX hScriptLex, uint32_t fFlags)
861	{
862	return RTScriptLexPeekChEx(hScriptLex, 0, fFlags);
863	}
864
865
866	RTDECL(void) RTScriptLexSkipWhitespace(RTSCRIPTLEX hScriptLex)
867	{
868	PRTSCRIPTLEXINT pThis = hScriptLex;
869	AssertPtrReturnVoid(pThis);
870
871	for (;;)
872	{
873	char ch = RTScriptLexGetCh(hScriptLex);
874
875	if (ch == '\0')
876	break;
877
878	/* Check for whitespace. */
879	const char *pszWs = pThis->pCfg->pszWhitespace ? pThis->pCfg->pszWhitespace : g_szWsDef;
880
881	if ( rtScriptLexLocateChInStrConsume(pThis, ch, pszWs)
882	\|\| rtScriptLexIsNewlineConsume(pThis, ch)
883	\|\| rtScriptLexIsMultiLineCommentConsume(pThis, ch)
884	\|\| rtScriptLexIsSingleLineCommentConsume(pThis, ch))
885	continue;
886
887	/* All white space skipped, next is some real content. */
888	break;
889	}
890	}
891
892
893	RTDECL(int) RTScriptLexScanNumber(RTSCRIPTLEX hScriptLex, uint8_t uBase, bool fAllowReal,
894	PRTSCRIPTLEXTOKEN pTok)
895	{
896	RT_NOREF(uBase, fAllowReal, pTok);
897	PRTSCRIPTLEXINT pThis = hScriptLex;
898	AssertPtrReturn(pThis, VERR_INVALID_POINTER);
899	AssertReturn(!fAllowReal, VERR_NOT_IMPLEMENTED);
900	AssertReturn(!uBase, VERR_NOT_IMPLEMENTED);
901
902	/** @todo r=aeichner Quick and dirty to have something working for the disassembler testcase.
903	* Among others it misses overflow handling. */
904	uBase = 10;
905	char ch = RTScriptLexGetCh(hScriptLex);
906	pTok->Type.Number.enmType = ch == '-'
907	? RTSCRIPTLEXTOKNUMTYPE_INTEGER
908	: RTSCRIPTLEXTOKNUMTYPE_NATURAL;
909	if (ch == '-' \|\| ch == '+')
910	ch = RTScriptLexConsumeCh(hScriptLex);
911
912	if (ch == '0')
913	{
914	/* Some hex prefix? */
915	char chNext = RTScriptLexPeekCh(hScriptLex, 1);
916	if (chNext == 'x')
917	{
918	uBase = 16;
919	RTScriptLexConsumeCh(hScriptLex);
920	}
921	else if (chNext >= '0' && chNext <= '9') /* Octal stuff. */
922	AssertFailedReturn(VERR_NOT_IMPLEMENTED);
923
924	ch = RTScriptLexConsumeCh(hScriptLex);
925	}
926
927	uint64_t u64 = 0;
928	for (;;)
929	{
930	if ( (ch < '0' \|\| ch > '9')
931	&& (ch < 'a' \|\| ch > 'f' \|\| uBase == 10))
932	{
933	if (pTok->Type.Number.enmType == RTSCRIPTLEXTOKNUMTYPE_INTEGER)
934	pTok->Type.Number.Type.i64 = -(int64_t)u64;
935	else
936	pTok->Type.Number.Type.u64 = u64;
937	pTok->enmType = RTSCRIPTLEXTOKTYPE_NUMBER;
938	pTok->PosEnd = pThis->Pos;
939	return VINF_SUCCESS;
940	}
941
942	if (ch >= '0' && ch <= '9')
943	u64 = (u64 * uBase) + (ch - '0');
944	else if (ch >= 'a' && ch <= 'f')
945	{
946	Assert(uBase == 16);
947	u64 = (u64 << 4) + 10 + (ch - 'a');
948	}
949
950	ch = RTScriptLexConsumeCh(hScriptLex);
951	}
952	}
953
954
955	RTDECL(int) RTScriptLexScanIdentifier(RTSCRIPTLEX hScriptLex, char ch,
956	PRTSCRIPTLEXTOKEN pTok, void *pvUser)
957	{
958	PRTSCRIPTLEXINT pThis = hScriptLex;
959	AssertPtrReturn(pThis, VERR_INVALID_POINTER);
960
961	const char pszCharSet = pvUser ? (const char )pvUser : g_aszIdeCharSetDef;
962	char aszIde[513]; RT_ZERO(aszIde);
963	unsigned idx = 0;
964	aszIde[idx++] = ch;
965
966	ch = RTScriptLexGetCh(hScriptLex);
967	while ( idx < sizeof(aszIde) - 1
968	&& rtScriptLexLocateChInStrConsume(hScriptLex, ch, pszCharSet))
969	{
970	aszIde[idx++] = ch;
971	ch = RTScriptLexGetCh(hScriptLex);
972	}
973
974	if ( idx == sizeof(aszIde) - 1
975	&& rtScriptLexLocateChInStrConsume(hScriptLex, ch, pszCharSet))
976	return rtScriptLexProduceTokError(hScriptLex, pTok, VERR_BUFFER_OVERFLOW, "Lexer: Identifier exceeds the allowed length");
977
978	/* Insert into string cache. */
979	pTok->enmType = RTSCRIPTLEXTOKTYPE_IDENTIFIER;
980	pTok->Type.Id.pszIde = RTStrCacheEnterN(pThis->hStrCacheId, &aszIde[0], idx);
981	if (RT_UNLIKELY(!pTok->Type.Id.pszIde))
982	return rtScriptLexProduceTokError(hScriptLex, pTok, VERR_NO_STR_MEMORY, "Lexer: Out of memory inserting identifier into string cache");
983
984	pTok->PosEnd = pThis->Pos;
985	return VINF_SUCCESS;
986	}
987
988
989	/**
990	* Adds the given character to the string literal add the given position, assuring the string
991	* is always zero terminated.
992	*
993	* @returns IPRT status code.
994	* @param pThis The lexer state.
995	* @param ch The character to add.
996	* @param idx At which position to add the character in the string.
997	*/
998	static int rtScriptLexScanStringLiteralChAdd(PRTSCRIPTLEXINT pThis, char ch, uint32_t idx)
999	{
1000	int rc = VINF_SUCCESS;
1001
1002	if ( !pThis->cchStrLitMax
1003	\|\| idx >= pThis->cchStrLitMax - 1)
1004	{
1005	/* Increase memory. */
1006	size_t cchMaxNew = pThis->cchStrLitMax + 64;
1007	char *pszNew = NULL;
1008	rc = RTStrRealloc(&pszNew, cchMaxNew * sizeof(char));
1009	if (RT_SUCCESS(rc))
1010	{
1011	pThis->pszStrLit = pszNew;
1012	pThis->cchStrLitMax = cchMaxNew;
1013	}
1014	}
1015
1016	if (RT_SUCCESS(rc))
1017	{
1018	pThis->pszStrLit[idx] = ch;
1019	pThis->pszStrLit[idx + 1] = '\0';
1020	}
1021
1022	return rc;
1023	}
1024
1025
1026	RTDECL(int) RTScriptLexScanStringLiteralC(RTSCRIPTLEX hScriptLex, char ch,
1027	PRTSCRIPTLEXTOKEN pTok, void *pvUser)
1028	{
1029	RT_NOREF(ch, pvUser);
1030	PRTSCRIPTLEXINT pThis = hScriptLex;
1031	AssertPtrReturn(pThis, VERR_INVALID_POINTER);
1032
1033	uint32_t idxChCur = 0;
1034	int rc = rtScriptLexScanStringLiteralChAdd(pThis, '\0', idxChCur);
1035	if (RT_FAILURE(rc))
1036	return rtScriptLexProduceTokError(hScriptLex, pTok, rc, "Lexer: Error adding character to string literal");
1037
1038	ch = RTScriptLexGetCh(hScriptLex);
1039	for (;;)
1040	{
1041	if (ch == '\0')
1042	return rtScriptLexProduceTokError(hScriptLex, pTok, VERR_EOF, "Lexer: End of stream before closing string literal terminal");
1043	else if (ch == '\"')
1044	{
1045	/* End of string, add it to the string literal cache and build the token. */
1046	pTok->enmType = RTSCRIPTLEXTOKTYPE_STRINGLIT;
1047	pTok->Type.StringLit.cchString = idxChCur;
1048	pTok->Type.StringLit.pszString = RTStrCacheEnterN(pThis->hStrCacheStringLit, pThis->pszStrLit, idxChCur);
1049	if (RT_UNLIKELY(!pTok->Type.StringLit.pszString))
1050	return rtScriptLexProduceTokError(hScriptLex, pTok, VERR_NO_STR_MEMORY, "Lexer: Error adding string literal to the cache");
1051	else
1052	break;
1053	}
1054	else if (ch == '\\')
1055	{
1056	/* Start of escape sequence. */
1057	RTScriptLexConsumeCh(hScriptLex);
1058	ch = RTScriptLexGetCh(hScriptLex);
1059	switch (ch)
1060	{
1061	case 'a': /* Alert (Bell) */
1062	ch = 0x07;
1063	break;
1064	case 'b': /* Backspace */
1065	ch = 0x08;
1066	break;
1067	case 'e': /* Escape character */
1068	ch = 0x1b;
1069	break;
1070	case 'f': /* Formfeed */
1071	ch = 0x0c;
1072	break;
1073	case 'n': /* Newline (line freed) */
1074	ch = 0x0a;
1075	break;
1076	case 'r': /* Carriage return */
1077	ch = 0x0d;
1078	break;
1079	case 't': /* Horizontal tab */
1080	ch = 0x09;
1081	break;
1082	case 'v': /* Vertical tab */
1083	ch = 0x0b;
1084	break;
1085	case '\\':
1086	case '\'':
1087	case '\"':
1088	case '\?':
1089	/* Can be added as is. */
1090	break;
1091	case 'x': /* Hexdecimal byte. */
1092	case '0': /* Octal */
1093	case '1':
1094	case '2':
1095	case '3':
1096	case '4':
1097	case '5':
1098	case '6':
1099	case '7':
1100	case '8':
1101	case '9':
1102	case 'u': /* Unicode point below 10000 */
1103	case 'U': /* Unicode point */
1104	default:
1105	/* Not supported for now. */
1106	return rtScriptLexProduceTokError(hScriptLex, pTok, VERR_NOT_SUPPORTED, "Lexer: Invalid/unsupported escape sequence");
1107	}
1108	}
1109
1110	rc = rtScriptLexScanStringLiteralChAdd(pThis, ch, idxChCur);
1111	if (RT_SUCCESS(rc))
1112	idxChCur++;
1113	else
1114	return rtScriptLexProduceTokError(hScriptLex, pTok, rc, "Lexer: Error adding character to string literal");
1115
1116	ch = RTScriptLexConsumeChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1117	}
1118
1119	pTok->PosEnd = pThis->Pos;
1120	return VINF_SUCCESS;
1121	}
1122
1123
1124	RTDECL(int) RTScriptLexScanStringLiteralPascal(RTSCRIPTLEX hScriptLex, char ch,
1125	PRTSCRIPTLEXTOKEN pTok, void *pvUser)
1126	{
1127	RT_NOREF(ch, pvUser);
1128	PRTSCRIPTLEXINT pThis = hScriptLex;
1129	AssertPtrReturn(pThis, VERR_INVALID_POINTER);
1130
1131	uint32_t idxChCur = 0;
1132	int rc = rtScriptLexScanStringLiteralChAdd(pThis, '\0', idxChCur);
1133	if (RT_FAILURE(rc))
1134	return rtScriptLexProduceTokError(hScriptLex, pTok, rc, "Lexer: Error adding character to string literal");
1135
1136	ch = RTScriptLexGetChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1137	for (;;)
1138	{
1139	if (ch == '\0')
1140	return rtScriptLexProduceTokError(hScriptLex, pTok, VERR_EOF, "Lexer: End of stream before closing string literal terminal");
1141	else if (ch == '\'')
1142	{
1143	/*
1144	* Check whether there is a second ' coming afterwards used for
1145	* escaping ' characters.
1146	*/
1147	ch = RTScriptLexConsumeChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1148	if (ch != '\'')
1149	{
1150	/* End of string, add it to the string literal cache and build the token. */
1151	pTok->enmType = RTSCRIPTLEXTOKTYPE_STRINGLIT;
1152	pTok->Type.StringLit.cchString = idxChCur;
1153	pTok->Type.StringLit.pszString = RTStrCacheEnterN(pThis->hStrCacheStringLit, pThis->pszStrLit, idxChCur);
1154	if (RT_UNLIKELY(!pTok->Type.StringLit.pszString))
1155	return rtScriptLexProduceTokError(hScriptLex, pTok, VERR_NO_STR_MEMORY, "Lexer: Error adding string literal to the cache");
1156	else
1157	break;
1158	}
1159	/* else: Fall through and add the character to the string literal..*/
1160	}
1161
1162	rc = rtScriptLexScanStringLiteralChAdd(pThis, ch, idxChCur);
1163	if (RT_SUCCESS(rc))
1164	idxChCur++;
1165	else
1166	return rtScriptLexProduceTokError(hScriptLex, pTok, rc, "Lexer: Error adding character to string literal");
1167	ch = RTScriptLexConsumeChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1168	}
1169
1170	pTok->PosEnd = pThis->Pos;
1171	return VINF_SUCCESS;
1172	}
1173

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/VBox/Runtime/common/script/scriptlex.cpp@ 107455

Download in other formats: