scriptlex.cpp@ 108194

Last change on this file since 108194 was 108194, checked in by vboxsync, 3 months ago
Runtime/comon/script/scriptlex.cpp: Fix re-allocating memory for a string literal, bugref:10733
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 38.6 KB

Line
1	/* $Id: scriptlex.cpp 108194 2025-02-13 14:35:47Z vboxsync $ */
2	/** @file
3	* IPRT - RTScript* lexer API.
4	*/
5
6	/*
7	* Copyright (C) 2022-2024 Oracle and/or its affiliates.
8	*
9	* This file is part of VirtualBox base platform packages, as
10	* available from https://www.virtualbox.org.
11	*
12	* This program is free software; you can redistribute it and/or
13	* modify it under the terms of the GNU General Public License
14	* as published by the Free Software Foundation, in version 3 of the
15	* License.
16	*
17	* This program is distributed in the hope that it will be useful, but
18	* WITHOUT ANY WARRANTY; without even the implied warranty of
19	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20	* General Public License for more details.
21	*
22	* You should have received a copy of the GNU General Public License
23	* along with this program; if not, see <https://www.gnu.org/licenses>.
24	*
25	* The contents of this file may alternatively be used under the terms
26	* of the Common Development and Distribution License Version 1.0
27	* (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
28	* in the VirtualBox distribution, in which case the provisions of the
29	* CDDL are applicable instead of those of the GPL.
30	*
31	* You may elect to license modified versions of this file under the
32	* terms and conditions of either the GPL or the CDDL or both.
33	*
34	* SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
35	*/
36
37
38	/*********************************************************************************************************************************
39	* Header Files *
40	*********************************************************************************************************************************/
41	#define LOG_GROUP RTLOGGROUP_DEFAULT /// @todo
42	#include <iprt/script.h>
43
44	#include <iprt/assert.h>
45	#include <iprt/ctype.h>
46	#include <iprt/err.h>
47	#include <iprt/file.h>
48	#include <iprt/log.h>
49	#include <iprt/mem.h>
50	#include <iprt/string.h>
51
52
53	/*********************************************************************************************************************************
54	* Structures and Typedefs *
55	*********************************************************************************************************************************/
56
57	/**
58	* Internal lexer state.
59	*/
60	typedef struct RTSCRIPTLEXINT
61	{
62	/** Magic. */
63	uint32_t u32Magic;
64	/** Source position. */
65	RTSCRIPTPOS Pos;
66	/** Current and next token buffer. */
67	RTSCRIPTLEXTOKEN aToks[2];
68	/** Pointer to the current token. */
69	PRTSCRIPTLEXTOKEN pTokCur;
70	/** Pointer to the next token. */
71	PRTSCRIPTLEXTOKEN pTokNext;
72	/** The lexer config. */
73	PCRTSCRIPTLEXCFG pCfg;
74	/** The input reader. */
75	PFNRTSCRIPTLEXRDR pfnReader;
76	/** The destructor callback. */
77	PFNRTSCRIPTLEXDTOR pfnDtor;
78	/** Opaque user data for the reader. */
79	void *pvUser;
80	/** Identifier string cache. */
81	RTSTRCACHE hStrCacheId;
82	/** String literal string cache. */
83	RTSTRCACHE hStrCacheStringLit;
84	/** Status code from the reader. */
85	int rcRdr;
86	/** Internal error info. */
87	RTERRINFOSTATIC ErrInfo;
88	/** Lexer flags. */
89	uint32_t fFlags;
90	/** Maximum numebr of bytes allocated for temporary storage for literal strings. */
91	size_t cchStrLitMax;
92	/** Pointer to the string buffer for holding the literal string. */
93	char *pszStrLit;
94	/** Pointer to the current input character. */
95	const char *pchCur;
96	/** Offset to start reading the next chunk from. */
97	size_t offBufRead;
98	/** Size of the input buffer. */
99	size_t cchBuf;
100	/** The cached part of the input, variable in size. */
101	char achBuf[1];
102	} RTSCRIPTLEXINT;
103	/** Pointer to the internal lexer state. */
104	typedef RTSCRIPTLEXINT *PRTSCRIPTLEXINT;
105
106
/** The lexer owns the identifier string cache and destroys it on destruction. */
#define RTSCRIPT_LEX_INT_F_STR_CACHE_ID_FREE        RT_BIT_32(0)
/** The lexer owns the string literal string cache and destroys it on destruction. */
#define RTSCRIPT_LEX_INT_F_STR_CACHE_STR_LIT_FREE   RT_BIT_32(1)
/** The reader signalled the end of the input stream. */
#define RTSCRIPT_LEX_INT_F_EOS                      RT_BIT_32(2)
113
114
115	/*********************************************************************************************************************************
116	* Global Variables *
117	*********************************************************************************************************************************/
118
/** Whitespace characters used when the config does not supply its own set. */
static const char * const g_szWsDef = " \t";
/**
 * Newline sequences used when the config does not supply its own set,
 * terminated by a NULL entry.
 *
 * The element type stays a plain 'const char *' because the table is handed
 * out through a 'const char **'.
 */
static const char *g_aszNlDef[] =
{
    "\n",
    "\r\n",
    NULL
};
/** Characters allowed in identifiers when no custom set is supplied. */
static const char * const g_aszIdeCharSetDef = "_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
130
131
132	/*********************************************************************************************************************************
133	* Internal Functions *
134	*********************************************************************************************************************************/
135
136
137	/**
138	* Locates the given character in the string, consuming it if found.
139	*
140	* @returns Flag whether the character was found in the string.
141	* @param pThis The lexer state.
142	* @param ch The character to check for.
143	* @param psz The string to check.
144	*/
145	DECLINLINE(bool) rtScriptLexLocateChInStrConsume(PRTSCRIPTLEXINT pThis, char ch, const char *psz)
146	{
147	while ( *psz != '\0'
148	&& *psz != ch)
149	psz++;
150
151	if (*psz != '\0')
152	RTScriptLexConsumeCh(pThis);
153
154	return *psz != '\0';
155	}
156
157
158	/**
159	* Matches the input against the given string starting with the given character, consuming it
160	* if found.
161	*
162	* @returns Flag whether there was a match.
163	* @param pThis The lexer state.
164	* @param ch The character to check start matching.
165	* @param psz The string to match against.
166	* @param pszExclude When the string matched but the input continues
167	* with one of the characters in this string there will
168	* be no match.
169	*/
170	DECLINLINE(bool) rtScriptLexMatchStrConsume(PRTSCRIPTLEXINT pThis, char ch, const char *psz,
171	const char *pszExclude)
172	{
173	bool fMatch = false;
174	if (*psz == ch)
175	{
176	unsigned offPeek = 1;
177
178	psz++;
179	while ( *psz != '\0'
180	&& *psz == RTScriptLexPeekCh(pThis, offPeek))
181	{
182	offPeek++;
183	psz++;
184	}
185
186	if (*psz == '\0')
187	{
188	if (pszExclude)
189	{
190	ch = RTScriptLexPeekCh(pThis, offPeek);
191	fMatch = strchr(pszExclude, ch) == NULL;
192	}
193	else
194	fMatch = true;
195	}
196
197	if (fMatch)
198	{
199	/* Match, consume everything. */
200	while (offPeek-- > 0)
201	RTScriptLexConsumeCh(pThis);
202	}
203	}
204
205	return fMatch;
206	}
207
208
209	/**
210	* Tries to locate a string with the given starting character (+ peeking ahead) in the
211	* given string array (exact match) and consumes the entire substring.
212	*
213	* @returns Flag whether there was a match.
214	* @param pThis The lexer state.
215	* @param ch The character to check for.
216	* @param papsz Pointer to the string array to check for.
217	* @param pidx Where to store the index of the matching substring if found,
218	* optional.
219	*/
220	DECLINLINE(bool) rtScriptLexLocateSubStrInStrArrayMatchConsume(PRTSCRIPTLEXINT pThis, char ch,
221	const char *papsz, unsigned pidx)
222	{
223	unsigned int idx = 0;
224
225	while ( papsz[idx] != NULL
226	&& !rtScriptLexMatchStrConsume(pThis, ch, papsz[idx], NULL))
227	idx++;
228
229	if ( papsz[idx] != NULL
230	&& pidx)
231	*pidx = idx;
232
233	return papsz[idx] != NULL;
234	}
235
236
237	/**
238	* Tries to get an exact match starting with the given character, consuming it when found.
239	*
240	* @returns Flag whether there was a match.
241	* @param pThis The lexer state.
242	* @param ch The character to check for.
243	* @param ppMatch Where to store the exact match on success.
244	*/
245	DECLINLINE(bool) rtScriptLexLocateExactMatchConsume(PRTSCRIPTLEXINT pThis, char ch, PCRTSCRIPTLEXTOKMATCH *ppMatch)
246	{
247	PCRTSCRIPTLEXTOKMATCH pTokMatch = pThis->pCfg->paTokMatches;
248
249	if (pTokMatch)
250	{
251	while ( pTokMatch->pszMatch != NULL
252	&& !rtScriptLexMatchStrConsume(pThis, ch, pTokMatch->pszMatch,
253	pTokMatch->fMaybeIdentifier
254	? g_aszIdeCharSetDef
255	: NULL))
256	pTokMatch++;
257
258	if (pTokMatch->pszMatch != NULL)
259	{
260	*ppMatch = pTokMatch;
261	return true;
262	}
263	}
264
265	return false;
266	}
267
268
269	DECLINLINE(bool) rtScriptLexIsNewlineConsume(PRTSCRIPTLEXINT pThis, char ch)
270	{
271	const char **papszNl = pThis->pCfg->pszWhitespace ? pThis->pCfg->papszNewline : g_aszNlDef;
272
273	bool fMatched = rtScriptLexLocateSubStrInStrArrayMatchConsume(pThis, ch, papszNl, NULL);
274	if (fMatched)
275	{
276	pThis->Pos.iLine++;
277	pThis->Pos.iCh = 1;
278	}
279
280	return fMatched;
281	}
282
283
284	/**
285	* Checks whether the character is the beginning of a multi line comment, skipping the whole
286	* comment if necessary.
287	*
288	* @returns Flag whether a multi line comment was detected and consumed.
289	* @param hScriptLex The lexer state.
290	* @param ch The character to check for.
291	*/
292	DECLINLINE(bool) rtScriptLexIsMultiLineCommentConsume(PRTSCRIPTLEXINT pThis, char ch)
293	{
294	const char **papszCommentMultiStart = pThis->pCfg->papszCommentMultiStart;
295	unsigned idxComment = 0;
296
297	if ( papszCommentMultiStart
298	&& rtScriptLexLocateSubStrInStrArrayMatchConsume(pThis, ch, papszCommentMultiStart,
299	&idxComment))
300	{
301	/* Look for the matching closing lexeme in the input consuming everything along the way. */
302	const char *pszClosing = pThis->pCfg->papszCommentMultiEnd[idxComment];
303
304	for (;;)
305	{
306	char chTmp = RTScriptLexGetCh(pThis);
307
308	/* Check for new lines explicetly to advance the position information. */
309	if (rtScriptLexIsNewlineConsume(pThis, chTmp))
310	continue;
311
312	/** @todo Not quite correct when there is an end of stream before the closing lexeme.
313	* But doesn't hurt at the moment. */
314	if ( chTmp == '\0'
315	\|\| rtScriptLexMatchStrConsume(pThis, chTmp, pszClosing, NULL))
316	break;
317
318	RTScriptLexConsumeCh(pThis);
319	}
320
321	return true;
322	}
323
324	return false;
325	}
326
327
328	/**
329	* Checks whether the character is the beginning of a single line comment, skipping the whole
330	* comment if necessary.
331	*
332	* @returns Flag whether a single line comment was detected and consumed.
333	* @param hScriptLex The lexer state.
334	* @param ch The character to check for.
335	*/
336	DECLINLINE(bool) rtScriptLexIsSingleLineCommentConsume(PRTSCRIPTLEXINT pThis, char ch)
337	{
338	const char **papszCommentSingleStart = pThis->pCfg->papszCommentSingleStart;
339
340	if ( papszCommentSingleStart
341	&& rtScriptLexLocateSubStrInStrArrayMatchConsume(pThis, ch, papszCommentSingleStart,
342	NULL))
343	{
344	for (;;)
345	{
346	char chTmp = RTScriptLexGetCh(pThis);
347
348	if ( chTmp == '\0'
349	\|\| rtScriptLexIsNewlineConsume(pThis, chTmp))
350	break;
351
352	RTScriptLexConsumeCh(pThis);
353	}
354
355	return true;
356	}
357
358	return false;
359	}
360
361
362	/**
363	* Fills the input buffer with source data.
364	*
365	* @returns IPRT status code.
366	* @param pThis The lexer state.
367	*/
368	static int rtScriptLexFillBuffer(PRTSCRIPTLEXINT pThis)
369	{
370	int rc = VINF_SUCCESS;
371	size_t cchToRead = pThis->cchBuf;
372	char *pchRead = &pThis->achBuf[0];
373
374	AssertReturn(!(pThis->fFlags & RTSCRIPT_LEX_INT_F_EOS), VERR_INVALID_STATE);
375
376	/* If there is input left to process move it to the front and fill the remainder. */
377	if ( pThis->pchCur != NULL
378	&& pThis->pchCur != &pThis->achBuf[pThis->cchBuf])
379	{
380	cchToRead = pThis->pchCur - &pThis->achBuf[0];
381	/* Move the rest to the front. */
382	memmove(&pThis->achBuf[0], pThis->pchCur, pThis->cchBuf - cchToRead);
383	pchRead = (char *)pThis->pchCur + 1;
384	}
385
386	if (cchToRead)
387	{
388	pThis->pchCur = &pThis->achBuf[0];
389
390	size_t cchRead = 0;
391	rc = pThis->pfnReader(pThis, pThis->offBufRead, pchRead, cchToRead, &cchRead, pThis->pvUser);
392	if (RT_SUCCESS(rc))
393	{
394	pThis->offBufRead += cchRead;
395	if (rc == VINF_EOF)
396	pThis->fFlags \|= RTSCRIPT_LEX_INT_F_EOS;
397	if (cchRead < cchToRead)
398	memset(pchRead + cchRead, 0, cchToRead - cchRead);
399	rc = VINF_SUCCESS;
400	}
401	else
402	pThis->rcRdr = rc;
403	}
404	else
405	rc = VERR_BUFFER_OVERFLOW; /** @todo */
406
407	return rc;
408	}
409
410
411	/**
412	* Produce an end of stream token.
413	*
414	* @returns nothing.
415	* @param pThis The lexer state.
416	* @param pTok The token to fill.
417	*/
418	static void rtScriptLexProduceTokEos(PRTSCRIPTLEXINT pThis, PRTSCRIPTLEXTOKEN pTok)
419	{
420	pTok->enmType = RTSCRIPTLEXTOKTYPE_EOS;
421	pTok->PosStart = pThis->Pos;
422	pTok->PosEnd = pThis->Pos;
423	}
424
425
426	RTDECL(int) RTScriptLexProduceTokError(RTSCRIPTLEX hScriptLex, PRTSCRIPTLEXTOKEN pTok,
427	int rc, const char *pszMsg, ...)
428	{
429	PRTSCRIPTLEXINT pThis = hScriptLex;
430
431	va_list va;
432	va_start(va, pszMsg);
433
434	pTok->enmType = RTSCRIPTLEXTOKTYPE_ERROR;
435	pTok->PosEnd = pThis->Pos;
436	pTok->Type.Error.pErr = &pThis->ErrInfo.Core;
437
438	RTErrInfoInitStatic(&pThis->ErrInfo);
439	RTErrInfoSetV(&pThis->ErrInfo.Core, rc, pszMsg, va);
440	va_end(va);
441
442	return rc;
443	}
444
445
446	RTDECL(int) RTScriptLexProduceTokIde(RTSCRIPTLEX hScriptLex, PRTSCRIPTLEXTOKEN pTok, const char *pszIde, size_t cchIde)
447	{
448	PRTSCRIPTLEXINT pThis = hScriptLex;
449
450	/* Insert into string cache. */
451	pTok->enmType = RTSCRIPTLEXTOKTYPE_IDENTIFIER;
452	pTok->Type.Id.pszIde = RTStrCacheEnterN(pThis->hStrCacheId, pszIde, cchIde);
453	if (RT_UNLIKELY(!pTok->Type.Id.pszIde))
454	return RTScriptLexProduceTokError(hScriptLex, pTok, VERR_NO_STR_MEMORY, "Lexer: Out of memory inserting identifier into string cache");
455
456	pTok->PosEnd = pThis->Pos;
457	return VINF_SUCCESS;
458	}
459
460
461	/**
462	* Create the token from the exact match.
463	*
464	* @returns nothing.
465	* @param pThis The lexer state.
466	* @param pTok The token to fill.
467	* @param pMatch The matched string.
468	*/
469	static void rtScriptLexProduceTokFromExactMatch(PRTSCRIPTLEXINT pThis, PRTSCRIPTLEXTOKEN pTok,
470	PCRTSCRIPTLEXTOKMATCH pMatch)
471	{
472	pTok->enmType = pMatch->enmTokType;
473	pTok->PosEnd = pThis->Pos;
474
475	switch (pTok->enmType)
476	{
477	case RTSCRIPTLEXTOKTYPE_OPERATOR:
478	pTok->Type.Operator.pOp = pMatch;
479	break;
480	case RTSCRIPTLEXTOKTYPE_KEYWORD:
481	pTok->Type.Keyword.pKeyword = pMatch;
482	break;
483	case RTSCRIPTLEXTOKTYPE_PUNCTUATOR:
484	pTok->Type.Punctuator.pPunctuator = pMatch;
485	break;
486	default:
487	RTScriptLexProduceTokError(pThis, pTok, VERR_INVALID_PARAMETER,
488	"Lexer: The match contains an invalid token type: %d\n",
489	pTok->enmType);
490	}
491	}
492
493
494	/**
495	* Goes through the rules trying to find a matching one.
496	*
497	* @returns Flag whether a matching rule was found.
498	* @param pThis The lexer state.
499	* @param ch The character to check.
500	* @param pTok The token to fill.
501	*/
502	static bool rtScriptLexProduceTokFromRules(PRTSCRIPTLEXINT pThis, char ch, PRTSCRIPTLEXTOKEN pTok)
503	{
504	PCRTSCRIPTLEXRULE pRule = pThis->pCfg->paRules;
505
506	if (pRule)
507	{
508	while (pRule->pfnProd != NULL)
509	{
510	if ( ch >= pRule->chStart
511	&& ch <= pRule->chEnd)
512	{
513	if (pRule->fFlags & RTSCRIPT_LEX_RULE_CONSUME)
514	RTScriptLexConsumeCh(pThis);
515	int rc = pRule->pfnProd(pThis, ch, pTok, pRule->pvUser);
516	AssertRC(rc);
517	return true;
518	}
519
520	pRule++;
521	}
522	}
523
524	return false;
525	}
526
527
528	/**
529	* Fills in the given token from the scanned input at the current location.
530	*
531	* @returns IPRT status code.
532	* @param pThis The lexer state.
533	* @param pTok The token to fill.
534	*/
535	static int rtScriptLexProduceToken(PRTSCRIPTLEXINT pThis, PRTSCRIPTLEXTOKEN pTok)
536	{
537	RTScriptLexSkipWhitespace(pThis);
538
539	pTok->PosStart = pThis->Pos;
540
541	char ch = RTScriptLexGetCh(pThis);
542	PCRTSCRIPTLEXTOKMATCH pMatch = NULL;
543	if (ch == '\0')
544	rtScriptLexProduceTokEos(pThis, pTok);
545	else if (rtScriptLexLocateExactMatchConsume(pThis, ch, &pMatch))
546	rtScriptLexProduceTokFromExactMatch(pThis, pTok, pMatch);
547	else if (!rtScriptLexProduceTokFromRules(pThis, ch, pTok))
548	{
549	if (pThis->pCfg->pfnProdDef)
550	pThis->rcRdr = pThis->pCfg->pfnProdDef(pThis, ch, pTok, pThis->pCfg->pvProdDefUser);
551	else
552	RTScriptLexProduceTokError(pThis, pTok, VERR_INVALID_PARAMETER,
553	"Lexer: Invalid character found in input: %c\n",
554	ch);
555	}
556
557	return pThis->rcRdr;
558	}
559
560
561	/**
562	* Populates the lexer for the initial use.
563	*
564	* @returns IPRT status code.
565	* @param pThis The lexer state.
566	*/
567	static int rtScriptLexPopulate(PRTSCRIPTLEXINT pThis)
568	{
569	int rc = rtScriptLexFillBuffer(pThis);
570	if (RT_SUCCESS(rc))
571	{
572	rc = rtScriptLexProduceToken(pThis, pThis->pTokCur);
573	if (RT_SUCCESS(rc))
574	rc = rtScriptLexProduceToken(pThis, pThis->pTokNext);
575	}
576
577	return rc;
578	}
579
580
581
582	RTDECL(int) RTScriptLexCreateFromReader(PRTSCRIPTLEX phScriptLex, PFNRTSCRIPTLEXRDR pfnReader,
583	PFNRTSCRIPTLEXDTOR pfnDtor, void *pvUser,
584	size_t cchBuf, PRTSTRCACHE phStrCacheId, PRTSTRCACHE phStrCacheStringLit,
585	PCRTSCRIPTLEXCFG pCfg)
586	{
587	AssertPtrReturn(phScriptLex, VERR_INVALID_POINTER);
588	AssertPtrReturn(pfnReader, VERR_INVALID_POINTER);
589	AssertPtrReturn(pCfg, VERR_INVALID_POINTER);
590
591	/* Case insensitivity with internal lower or upper case conversion is mutually exclusive. */
592	AssertReturn( (pCfg->fFlags & (RTSCRIPT_LEX_CFG_F_CASE_INSENSITIVE_LOWER \| RTSCRIPT_LEX_CFG_F_CASE_INSENSITIVE_UPPER))
593	!= (RTSCRIPT_LEX_CFG_F_CASE_INSENSITIVE_LOWER \| RTSCRIPT_LEX_CFG_F_CASE_INSENSITIVE_UPPER), VERR_INVALID_PARAMETER);
594
595	if (!cchBuf)
596	cchBuf = _16K;
597	int rc = VINF_SUCCESS;
598	PRTSCRIPTLEXINT pThis = (PRTSCRIPTLEXINT)RTMemAllocZ(RT_UOFFSETOF_DYN(RTSCRIPTLEXINT, achBuf[cchBuf]));
599	if (RT_LIKELY(pThis))
600	{
601	pThis->u32Magic = 0xfefecafe; /** @todo */
602	pThis->Pos.iLine = 1;
603	pThis->Pos.iCh = 1;
604	pThis->pTokCur = &pThis->aToks[0];
605	pThis->pTokNext = &pThis->aToks[1];
606	pThis->pCfg = pCfg;
607	pThis->pfnReader = pfnReader;
608	pThis->pfnDtor = pfnDtor;
609	pThis->pvUser = pvUser;
610	pThis->fFlags = 0;
611	pThis->cchStrLitMax = 0;
612	pThis->pszStrLit = NULL;
613	pThis->cchBuf = cchBuf;
614	pThis->offBufRead = 0;
615	pThis->pchCur = NULL;
616	pThis->hStrCacheId = NULL;
617	pThis->hStrCacheStringLit = NULL;
618
619	rc = RTStrCacheCreate(&pThis->hStrCacheId, "LEX-Ide");
620	if (RT_SUCCESS(rc))
621	{
622	rc = RTStrCacheCreate(&pThis->hStrCacheStringLit, "LEX-StrLit");
623	if (RT_SUCCESS(rc))
624	{
625	rc = rtScriptLexPopulate(pThis);
626	if (RT_SUCCESS(rc))
627	{
628	*phScriptLex = pThis;
629
630	if (phStrCacheId)
631	*phStrCacheId = pThis->hStrCacheId;
632	else
633	pThis->fFlags \|= RTSCRIPT_LEX_INT_F_STR_CACHE_ID_FREE;
634
635	if (phStrCacheStringLit)
636	*phStrCacheStringLit = pThis->hStrCacheStringLit;
637	else
638	pThis->fFlags \|= RTSCRIPT_LEX_INT_F_STR_CACHE_STR_LIT_FREE;
639
640	return VINF_SUCCESS;
641	}
642
643	RTStrCacheDestroy(pThis->hStrCacheStringLit);
644	}
645
646	RTStrCacheDestroy(pThis->hStrCacheId);
647	}
648
649	RTMemFree(pThis);
650	}
651	else
652	rc = VERR_NO_MEMORY;
653
654	return rc;
655	}
656
657
658	/**
659	* @callback_method_impl{FNRTSCRIPTLEXRDR, Worker to read from a string.}
660	*/
661	static DECLCALLBACK(int) rtScriptLexReaderStr(RTSCRIPTLEX hScriptLex, size_t offBuf, char *pchCur,
662	size_t cchBuf, size_t pcchRead, void pvUser)
663	{
664	RT_NOREF(hScriptLex);
665
666	const char psz = (const char )pvUser;
667	size_t cch = strlen(psz);
668	size_t cchCopy = RT_MIN(cchBuf, cch - offBuf);
669	int rc = VINF_SUCCESS;
670
671	*pcchRead = cchCopy;
672
673	if (cchCopy)
674	memcpy(pchCur, &psz[offBuf], cchCopy * sizeof(char));
675	else
676	rc = VINF_EOF;
677
678	return rc;
679	}
680
681
682	RTDECL(int) RTScriptLexCreateFromString(PRTSCRIPTLEX phScriptLex, const char *pszSrc, PRTSTRCACHE phStrCacheId,
683	PRTSTRCACHE phStrCacheStringLit, PCRTSCRIPTLEXCFG pCfg)
684	{
685	return RTScriptLexCreateFromReader(phScriptLex, rtScriptLexReaderStr, NULL, (void *)pszSrc, 0,
686	phStrCacheId, phStrCacheStringLit, pCfg);
687	}
688
689
690	/**
691	* @callback_method_impl{FNRTSCRIPTLEXRDR, Worker to read from a file.}
692	*/
693	static DECLCALLBACK(int) rtScriptLexReaderFile(RTSCRIPTLEX hScriptLex, size_t offBuf, char *pchCur,
694	size_t cchBuf, size_t pcchRead, void pvUser)
695	{
696	RT_NOREF(hScriptLex);
697
698	RTFILE hFile = (RTFILE)pvUser;
699	return RTFileReadAt(hFile, offBuf, pchCur, cchBuf, pcchRead);
700	}
701
702
703	/**
704	* @callback_method_impl{FNRTSCRIPTLEXDTOR, Destructor for the file variant.}
705	*/
706	static DECLCALLBACK(void) rtScriptLexDtorFile(RTSCRIPTLEX hScriptLex, void *pvUser)
707	{
708	RT_NOREF(hScriptLex);
709
710	RTFILE hFile = (RTFILE)pvUser;
711	RTFileClose(hFile);
712	}
713
714
715	RTDECL(int) RTScriptLexCreateFromFile(PRTSCRIPTLEX phScriptLex, const char *pszFilename, PRTSTRCACHE phStrCacheId,
716	PRTSTRCACHE phStrCacheStringLit, PCRTSCRIPTLEXCFG pCfg)
717	{
718	RTFILE hFile;
719	int rc = RTFileOpen(&hFile, pszFilename, RTFILE_O_READ \| RTFILE_O_DENY_WRITE \| RTFILE_O_OPEN);
720	if (RT_SUCCESS(rc))
721	{
722	rc = RTScriptLexCreateFromReader(phScriptLex, rtScriptLexReaderFile, rtScriptLexDtorFile, (void *)hFile, 0,
723	phStrCacheId, phStrCacheStringLit, pCfg);
724	if (RT_FAILURE(rc))
725	RTFileClose(hFile);
726	}
727
728	return rc;
729	}
730
731
732	RTDECL(void) RTScriptLexDestroy(RTSCRIPTLEX hScriptLex)
733	{
734	PRTSCRIPTLEXINT pThis = hScriptLex;
735	AssertPtrReturnVoid(pThis);
736
737	if (pThis->pfnDtor)
738	pThis->pfnDtor(pThis, pThis->pvUser);
739
740	if (pThis->fFlags & RTSCRIPT_LEX_INT_F_STR_CACHE_ID_FREE)
741	RTStrCacheDestroy(pThis->hStrCacheId);
742	if (pThis->fFlags & RTSCRIPT_LEX_INT_F_STR_CACHE_STR_LIT_FREE)
743	RTStrCacheDestroy(pThis->hStrCacheStringLit);
744
745	if (pThis->pszStrLit)
746	RTStrFree(pThis->pszStrLit);
747
748	RTMemFree(pThis);
749	}
750
751
752	RTDECL(int) RTScriptLexQueryToken(RTSCRIPTLEX hScriptLex, PCRTSCRIPTLEXTOKEN *ppToken)
753	{
754	PRTSCRIPTLEXINT pThis = hScriptLex;
755	AssertPtrReturn(pThis, VERR_INVALID_HANDLE);
756	AssertPtrReturn(ppToken, VERR_INVALID_POINTER);
757
758	if (RT_SUCCESS(pThis->rcRdr))
759	*ppToken = pThis->pTokCur;
760
761	return pThis->rcRdr;
762	}
763
764
765	RTDECL(RTSCRIPTLEXTOKTYPE) RTScriptLexGetTokenType(RTSCRIPTLEX hScriptLex)
766	{
767	PRTSCRIPTLEXINT pThis = hScriptLex;
768	AssertPtrReturn(pThis, RTSCRIPTLEXTOKTYPE_INVALID);
769
770	if (RT_SUCCESS(pThis->rcRdr))
771	return pThis->pTokCur->enmType;
772
773	return RTSCRIPTLEXTOKTYPE_INVALID;
774	}
775
776
777	RTDECL(RTSCRIPTLEXTOKTYPE) RTScriptLexPeekNextTokenType(RTSCRIPTLEX hScriptLex)
778	{
779	PRTSCRIPTLEXINT pThis = hScriptLex;
780	AssertPtrReturn(pThis, RTSCRIPTLEXTOKTYPE_INVALID);
781
782	if (RT_SUCCESS(pThis->rcRdr))
783	return pThis->pTokNext->enmType;
784
785	return RTSCRIPTLEXTOKTYPE_INVALID;
786	}
787
788
789	RTDECL(PCRTSCRIPTLEXTOKEN) RTScriptLexConsumeToken(RTSCRIPTLEX hScriptLex)
790	{
791	PRTSCRIPTLEXINT pThis = hScriptLex;
792	AssertPtrReturn(pThis, NULL);
793
794	/*
795	* Stop token production as soon as the current token indicates the
796	* end of the stream or an error
797	*/
798	if ( pThis->pTokCur->enmType != RTSCRIPTLEXTOKTYPE_EOS
799	&& pThis->pTokCur->enmType != RTSCRIPTLEXTOKTYPE_ERROR)
800	{
801	PRTSCRIPTLEXTOKEN pTokTmp = pThis->pTokCur;
802
803	/* Switch next token to current token and read in the next token. */
804	pThis->pTokCur = pThis->pTokNext;
805	pThis->pTokNext = pTokTmp;
806	if ( pThis->pTokCur->enmType != RTSCRIPTLEXTOKTYPE_EOS
807	&& pThis->pTokCur->enmType != RTSCRIPTLEXTOKTYPE_ERROR)
808	rtScriptLexProduceToken(pThis, pThis->pTokNext);
809	else
810	pThis->pTokNext = pThis->pTokCur;
811	}
812
813	return pThis->pTokCur;
814	}
815
816
817	RTDECL(char) RTScriptLexConsumeCh(RTSCRIPTLEX hScriptLex)
818	{
819	return RTScriptLexConsumeChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_DEFAULT);
820	}
821
822
823	RTDECL(char) RTScriptLexConsumeChEx(RTSCRIPTLEX hScriptLex, uint32_t fFlags)
824	{
825	PRTSCRIPTLEXINT pThis = hScriptLex;
826	AssertPtrReturn(pThis, '\0');
827
828	pThis->pchCur++;
829	pThis->Pos.iCh++;
830	if (pThis->pchCur == &pThis->achBuf[pThis->cchBuf])
831	rtScriptLexFillBuffer(pThis);
832
833	return RTScriptLexGetChEx(pThis, fFlags);
834	}
835
836
837	RTDECL(char) RTScriptLexPeekCh(RTSCRIPTLEX hScriptLex, unsigned idx)
838	{
839	return RTScriptLexPeekChEx(hScriptLex, idx, RTSCRIPT_LEX_CONV_F_DEFAULT);
840	}
841
842
843	RTDECL(char) RTScriptLexPeekChEx(RTSCRIPTLEX hScriptLex, unsigned idx, uint32_t fFlags)
844	{
845	PRTSCRIPTLEXINT pThis = hScriptLex;
846	AssertPtrReturn(pThis, '\0');
847
848	/* Just return the character if it is in the current buffer. */
849	char ch = '\0';
850	if (RT_LIKELY(pThis->pchCur + idx < &pThis->achBuf[pThis->cchBuf]))
851	ch = pThis->pchCur[idx];
852	else
853	{
854	/* Slow path, read data into temporary buffer to read character from and dismiss. */
855	/** @todo */
856	AssertReleaseFailed();
857	}
858
859	if (!(fFlags & RTSCRIPT_LEX_CONV_F_NOTHING))
860	{
861	if (pThis->pCfg->fFlags & RTSCRIPT_LEX_CFG_F_CASE_INSENSITIVE_LOWER)
862	ch = RT_C_TO_LOWER(ch);
863	else if (pThis->pCfg->fFlags & RTSCRIPT_LEX_CFG_F_CASE_INSENSITIVE_UPPER)
864	ch = RT_C_TO_UPPER(ch);
865	}
866
867	return ch;
868	}
869
870
871	RTDECL(char) RTScriptLexGetCh(RTSCRIPTLEX hScriptLex)
872	{
873	return RTScriptLexPeekCh(hScriptLex, 0);
874	}
875
876
877	RTDECL(char) RTScriptLexGetChEx(RTSCRIPTLEX hScriptLex, uint32_t fFlags)
878	{
879	return RTScriptLexPeekChEx(hScriptLex, 0, fFlags);
880	}
881
882
883	RTDECL(void) RTScriptLexSkipWhitespace(RTSCRIPTLEX hScriptLex)
884	{
885	PRTSCRIPTLEXINT pThis = hScriptLex;
886	AssertPtrReturnVoid(pThis);
887
888	for (;;)
889	{
890	char ch = RTScriptLexGetCh(hScriptLex);
891
892	if (ch == '\0')
893	break;
894
895	/* Check for whitespace. */
896	const char *pszWs = pThis->pCfg->pszWhitespace ? pThis->pCfg->pszWhitespace : g_szWsDef;
897
898	if ( rtScriptLexLocateChInStrConsume(pThis, ch, pszWs)
899	\|\| rtScriptLexIsNewlineConsume(pThis, ch)
900	\|\| rtScriptLexIsMultiLineCommentConsume(pThis, ch)
901	\|\| rtScriptLexIsSingleLineCommentConsume(pThis, ch))
902	continue;
903
904	/* All white space skipped, next is some real content. */
905	break;
906	}
907	}
908
909
910	RTDECL(int) RTScriptLexScanNumber(RTSCRIPTLEX hScriptLex, uint8_t uBase, bool fAllowReal,
911	PRTSCRIPTLEXTOKEN pTok)
912	{
913	RT_NOREF(uBase, fAllowReal, pTok);
914	PRTSCRIPTLEXINT pThis = hScriptLex;
915	AssertPtrReturn(pThis, VERR_INVALID_POINTER);
916	AssertReturn(!fAllowReal, VERR_NOT_IMPLEMENTED);
917	AssertReturn(!uBase, VERR_NOT_IMPLEMENTED);
918
919	/** @todo r=aeichner Quick and dirty to have something working for the disassembler testcase.
920	* Among others it misses overflow handling. */
921	uBase = 10;
922	char ch = RTScriptLexGetCh(hScriptLex);
923	pTok->Type.Number.enmType = ch == '-'
924	? RTSCRIPTLEXTOKNUMTYPE_INTEGER
925	: RTSCRIPTLEXTOKNUMTYPE_NATURAL;
926	if (ch == '-' \|\| ch == '+')
927	ch = RTScriptLexConsumeCh(hScriptLex);
928
929	if (ch == '0')
930	{
931	/* Some hex prefix? */
932	char chNext = RTScriptLexPeekCh(hScriptLex, 1);
933	if (chNext == 'x' \|\| chNext == 'X')
934	{
935	uBase = 16;
936	RTScriptLexConsumeCh(hScriptLex);
937	}
938	else if (chNext >= '0' && chNext <= '9') /* Octal stuff. */
939	AssertFailedReturn(VERR_NOT_IMPLEMENTED);
940
941	ch = RTScriptLexConsumeCh(hScriptLex);
942	}
943
944	uint64_t u64 = 0;
945	for (;;)
946	{
947	if ( (ch < '0' \|\| ch > '9')
948	&& ( ( !(ch >= 'a' && ch <= 'f')
949	&& !(ch >= 'A' && ch <= 'F'))
950	\|\| uBase == 10))
951	{
952	if (pTok->Type.Number.enmType == RTSCRIPTLEXTOKNUMTYPE_INTEGER)
953	pTok->Type.Number.Type.i64 = -(int64_t)u64;
954	else
955	pTok->Type.Number.Type.u64 = u64;
956	pTok->enmType = RTSCRIPTLEXTOKTYPE_NUMBER;
957	pTok->PosEnd = pThis->Pos;
958	return VINF_SUCCESS;
959	}
960
961	if (ch >= '0' && ch <= '9')
962	u64 = (u64 * uBase) + (ch - '0');
963	else if (ch >= 'a' && ch <= 'f')
964	{
965	Assert(uBase == 16);
966	u64 = (u64 << 4) + 10 + (ch - 'a');
967	}
968	else if (ch >= 'A' && ch <= 'F')
969	{
970	Assert(uBase == 16);
971	u64 = (u64 << 4) + 10 + (ch - 'A');
972	}
973
974	ch = RTScriptLexConsumeCh(hScriptLex);
975	}
976	}
977
978
979	RTDECL(int) RTScriptLexScanIdentifier(RTSCRIPTLEX hScriptLex, char ch,
980	PRTSCRIPTLEXTOKEN pTok, void *pvUser)
981	{
982	PRTSCRIPTLEXINT pThis = hScriptLex;
983	AssertPtrReturn(pThis, VERR_INVALID_POINTER);
984
985	const char pszCharSet = pvUser ? (const char )pvUser : g_aszIdeCharSetDef;
986	char aszIde[513]; RT_ZERO(aszIde);
987	unsigned idx = 0;
988	aszIde[idx++] = ch;
989
990	ch = RTScriptLexGetCh(hScriptLex);
991	while ( idx < sizeof(aszIde) - 1
992	&& rtScriptLexLocateChInStrConsume(hScriptLex, ch, pszCharSet))
993	{
994	aszIde[idx++] = ch;
995	ch = RTScriptLexGetCh(hScriptLex);
996	}
997
998	if ( idx == sizeof(aszIde) - 1
999	&& rtScriptLexLocateChInStrConsume(hScriptLex, ch, pszCharSet))
1000	return RTScriptLexProduceTokError(hScriptLex, pTok, VERR_BUFFER_OVERFLOW, "Lexer: Identifier exceeds the allowed length");
1001
1002	/* Insert into string cache. */
1003	pTok->enmType = RTSCRIPTLEXTOKTYPE_IDENTIFIER;
1004	pTok->Type.Id.pszIde = RTStrCacheEnterN(pThis->hStrCacheId, &aszIde[0], idx);
1005	if (RT_UNLIKELY(!pTok->Type.Id.pszIde))
1006	return RTScriptLexProduceTokError(hScriptLex, pTok, VERR_NO_STR_MEMORY, "Lexer: Out of memory inserting identifier into string cache");
1007
1008	pTok->PosEnd = pThis->Pos;
1009	return VINF_SUCCESS;
1010	}
1011
1012
1013	/**
1014	* Adds the given character to the string literal add the given position, assuring the string
1015	* is always zero terminated.
1016	*
1017	* @returns IPRT status code.
1018	* @param pThis The lexer state.
1019	* @param ch The character to add.
1020	* @param idx At which position to add the character in the string.
1021	*/
1022	static int rtScriptLexScanStringLiteralChAdd(PRTSCRIPTLEXINT pThis, char ch, uint32_t idx)
1023	{
1024	int rc = VINF_SUCCESS;
1025
1026	if ( !pThis->cchStrLitMax
1027	\|\| idx >= pThis->cchStrLitMax - 1)
1028	{
1029	/* Increase memory. */
1030	size_t cchMaxNew = pThis->cchStrLitMax + 64;
1031	char *pszNew = pThis->pszStrLit;
1032	rc = RTStrRealloc(&pszNew, cchMaxNew * sizeof(char));
1033	if (RT_SUCCESS(rc))
1034	{
1035	pThis->pszStrLit = pszNew;
1036	pThis->cchStrLitMax = cchMaxNew;
1037	}
1038	}
1039
1040	if (RT_SUCCESS(rc))
1041	{
1042	pThis->pszStrLit[idx] = ch;
1043	pThis->pszStrLit[idx + 1] = '\0';
1044	}
1045
1046	return rc;
1047	}
1048
1049
1050	RTDECL(int) RTScriptLexScanStringLiteralC(RTSCRIPTLEX hScriptLex, char ch,
1051	PRTSCRIPTLEXTOKEN pTok, void *pvUser)
1052	{
1053	RT_NOREF(ch, pvUser);
1054	PRTSCRIPTLEXINT pThis = hScriptLex;
1055	AssertPtrReturn(pThis, VERR_INVALID_POINTER);
1056
1057	uint32_t idxChCur = 0;
1058	int rc = rtScriptLexScanStringLiteralChAdd(pThis, '\0', idxChCur);
1059	if (RT_FAILURE(rc))
1060	return RTScriptLexProduceTokError(hScriptLex, pTok, rc, "Lexer: Error adding character to string literal");
1061
1062	ch = RTScriptLexGetChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1063	for (;;)
1064	{
1065	if (ch == '\0')
1066	return RTScriptLexProduceTokError(hScriptLex, pTok, VERR_EOF, "Lexer: End of stream before closing string literal terminal");
1067	else if (ch == '\"')
1068	{
1069	RTScriptLexConsumeCh(hScriptLex);
1070
1071	/* End of string, add it to the string literal cache and build the token. */
1072	pTok->enmType = RTSCRIPTLEXTOKTYPE_STRINGLIT;
1073	pTok->Type.StringLit.cchString = idxChCur;
1074	pTok->Type.StringLit.pszString = RTStrCacheEnterN(pThis->hStrCacheStringLit, pThis->pszStrLit, idxChCur);
1075	if (RT_UNLIKELY(!pTok->Type.StringLit.pszString))
1076	return RTScriptLexProduceTokError(hScriptLex, pTok, VERR_NO_STR_MEMORY, "Lexer: Error adding string literal to the cache");
1077	else
1078	break;
1079	}
1080	else if (ch == '\\')
1081	{
1082	/* Start of escape sequence. */
1083	RTScriptLexConsumeChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1084	ch = RTScriptLexGetChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1085	switch (ch)
1086	{
1087	case 'a': /* Alert (Bell) */
1088	ch = 0x07;
1089	break;
1090	case 'b': /* Backspace */
1091	ch = 0x08;
1092	break;
1093	case 'e': /* Escape character */
1094	ch = 0x1b;
1095	break;
1096	case 'f': /* Formfeed */
1097	ch = 0x0c;
1098	break;
1099	case 'n': /* Newline (line freed) */
1100	ch = 0x0a;
1101	break;
1102	case 'r': /* Carriage return */
1103	ch = 0x0d;
1104	break;
1105	case 't': /* Horizontal tab */
1106	ch = 0x09;
1107	break;
1108	case 'v': /* Vertical tab */
1109	ch = 0x0b;
1110	break;
1111	case '\\':
1112	case '\'':
1113	case '\"':
1114	case '\?':
1115	/* Can be added as is. */
1116	break;
1117	case 'x': /* Hexdecimal byte. */
1118	case '0': /* Octal */
1119	case '1':
1120	case '2':
1121	case '3':
1122	case '4':
1123	case '5':
1124	case '6':
1125	case '7':
1126	case '8':
1127	case '9':
1128	case 'u': /* Unicode point below 10000 */
1129	case 'U': /* Unicode point */
1130	default:
1131	/* Not supported for now. */
1132	return RTScriptLexProduceTokError(hScriptLex, pTok, VERR_NOT_SUPPORTED, "Lexer: Invalid/unsupported escape sequence");
1133	}
1134	}
1135
1136	rc = rtScriptLexScanStringLiteralChAdd(pThis, ch, idxChCur);
1137	if (RT_SUCCESS(rc))
1138	idxChCur++;
1139	else
1140	return RTScriptLexProduceTokError(hScriptLex, pTok, rc, "Lexer: Error adding character to string literal");
1141
1142	ch = RTScriptLexConsumeChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1143	}
1144
1145	pTok->PosEnd = pThis->Pos;
1146	return VINF_SUCCESS;
1147	}
1148
1149
1150	RTDECL(int) RTScriptLexScanStringLiteralPascal(RTSCRIPTLEX hScriptLex, char ch,
1151	PRTSCRIPTLEXTOKEN pTok, void *pvUser)
1152	{
1153	RT_NOREF(ch, pvUser);
1154	PRTSCRIPTLEXINT pThis = hScriptLex;
1155	AssertPtrReturn(pThis, VERR_INVALID_POINTER);
1156
1157	uint32_t idxChCur = 0;
1158	int rc = rtScriptLexScanStringLiteralChAdd(pThis, '\0', idxChCur);
1159	if (RT_FAILURE(rc))
1160	return RTScriptLexProduceTokError(hScriptLex, pTok, rc, "Lexer: Error adding character to string literal");
1161
1162	ch = RTScriptLexGetChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1163	for (;;)
1164	{
1165	if (ch == '\0')
1166	return RTScriptLexProduceTokError(hScriptLex, pTok, VERR_EOF, "Lexer: End of stream before closing string literal terminal");
1167	else if (ch == '\'')
1168	{
1169	/*
1170	* Check whether there is a second ' coming afterwards used for
1171	* escaping ' characters.
1172	*/
1173	ch = RTScriptLexConsumeChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1174	if (ch != '\'')
1175	{
1176	/* End of string, add it to the string literal cache and build the token. */
1177	pTok->enmType = RTSCRIPTLEXTOKTYPE_STRINGLIT;
1178	pTok->Type.StringLit.cchString = idxChCur;
1179	pTok->Type.StringLit.pszString = RTStrCacheEnterN(pThis->hStrCacheStringLit, pThis->pszStrLit, idxChCur);
1180	if (RT_UNLIKELY(!pTok->Type.StringLit.pszString))
1181	return RTScriptLexProduceTokError(hScriptLex, pTok, VERR_NO_STR_MEMORY, "Lexer: Error adding string literal to the cache");
1182	else
1183	break;
1184	}
1185	/* else: Fall through and add the character to the string literal..*/
1186	}
1187
1188	rc = rtScriptLexScanStringLiteralChAdd(pThis, ch, idxChCur);
1189	if (RT_SUCCESS(rc))
1190	idxChCur++;
1191	else
1192	return RTScriptLexProduceTokError(hScriptLex, pTok, rc, "Lexer: Error adding character to string literal");
1193	ch = RTScriptLexConsumeChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1194	}
1195
1196	pTok->PosEnd = pThis->Pos;
1197	return VINF_SUCCESS;
1198	}
1199

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/VBox/Runtime/common/script/scriptlex.cpp@ 108194

Download in other formats: