scriptlex.cpp@ 108014

Last change on this file since 108014 was 108014, checked in by vboxsync, 4 weeks ago
Runtime/common/script/scriptlex.cpp: Fix C string literal scanning and add some helper APIs to create tokens for errors and identifiers, bugref:10733
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 37.7 KB

Line
1	/* $Id: scriptlex.cpp 108014 2025-02-01 19:20:09Z vboxsync $ */
2	/** @file
3	* IPRT - RTScript* lexer API.
4	*/
5
6	/*
7	* Copyright (C) 2022-2024 Oracle and/or its affiliates.
8	*
9	* This file is part of VirtualBox base platform packages, as
10	* available from https://www.virtualbox.org.
11	*
12	* This program is free software; you can redistribute it and/or
13	* modify it under the terms of the GNU General Public License
14	* as published by the Free Software Foundation, in version 3 of the
15	* License.
16	*
17	* This program is distributed in the hope that it will be useful, but
18	* WITHOUT ANY WARRANTY; without even the implied warranty of
19	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20	* General Public License for more details.
21	*
22	* You should have received a copy of the GNU General Public License
23	* along with this program; if not, see <https://www.gnu.org/licenses>.
24	*
25	* The contents of this file may alternatively be used under the terms
26	* of the Common Development and Distribution License Version 1.0
27	* (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
28	* in the VirtualBox distribution, in which case the provisions of the
29	* CDDL are applicable instead of those of the GPL.
30	*
31	* You may elect to license modified versions of this file under the
32	* terms and conditions of either the GPL or the CDDL or both.
33	*
34	* SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
35	*/
36
37
38	/*********************************************************************************************************************************
39	* Header Files *
40	*********************************************************************************************************************************/
41	#define LOG_GROUP RTLOGGROUP_DEFAULT /// @todo
42	#include <iprt/script.h>
43
44	#include <iprt/assert.h>
45	#include <iprt/ctype.h>
46	#include <iprt/err.h>
47	#include <iprt/file.h>
48	#include <iprt/log.h>
49	#include <iprt/mem.h>
50	#include <iprt/string.h>
51
52
53	/*********************************************************************************************************************************
54	* Structures and Typedefs *
55	*********************************************************************************************************************************/
56
57	/**
58	* Internal lexer state.
59	*/
60	typedef struct RTSCRIPTLEXINT
61	{
62	/** Magic. */
63	uint32_t u32Magic;
64	/** Source position. */
65	RTSCRIPTPOS Pos;
66	/** Current and next token buffer. */
67	RTSCRIPTLEXTOKEN aToks[2];
68	/** Pointer to the current token. */
69	PRTSCRIPTLEXTOKEN pTokCur;
70	/** Pointer to the next token. */
71	PRTSCRIPTLEXTOKEN pTokNext;
72	/** The lexer config. */
73	PCRTSCRIPTLEXCFG pCfg;
74	/** The input reader. */
75	PFNRTSCRIPTLEXRDR pfnReader;
76	/** The destructor callback. */
77	PFNRTSCRIPTLEXDTOR pfnDtor;
78	/** Opaque user data for the reader. */
79	void *pvUser;
80	/** Identifier string cache. */
81	RTSTRCACHE hStrCacheId;
82	/** String literal string cache. */
83	RTSTRCACHE hStrCacheStringLit;
84	/** Status code from the reader. */
85	int rcRdr;
86	/** Internal error info. */
87	RTERRINFOSTATIC ErrInfo;
88	/** Lexer flags. */
89	uint32_t fFlags;
90	/** Maximum numebr of bytes allocated for temporary storage for literal strings. */
91	size_t cchStrLitMax;
92	/** Pointer to the string buffer for holding the literal string. */
93	char *pszStrLit;
94	/** Pointer to the current input character. */
95	const char *pchCur;
96	/** Offset to start reading the next chunk from. */
97	size_t offBufRead;
98	/** Size of the input buffer. */
99	size_t cchBuf;
100	/** The cached part of the input, variable in size. */
101	char achBuf[1];
102	} RTSCRIPTLEXINT;
103	/** Pointer to the internal lexer state. */
104	typedef RTSCRIPTLEXINT *PRTSCRIPTLEXINT;
105
106
/** @name RTSCRIPT_LEX_INT_F_XXX - Internal lexer flags (RTSCRIPTLEXINT::fFlags).
 * @{ */
/** The lexer owns the identifier string cache and destroys it on destruction. */
#define RTSCRIPT_LEX_INT_F_STR_CACHE_ID_FREE        RT_BIT_32(0)
/** The lexer owns the string literal string cache and destroys it on destruction. */
#define RTSCRIPT_LEX_INT_F_STR_CACHE_STR_LIT_FREE   RT_BIT_32(1)
/** The reader signalled the end of the input stream. */
#define RTSCRIPT_LEX_INT_F_EOS                      RT_BIT_32(2)
/** @} */
113
114
115	/*********************************************************************************************************************************
116	* Global Variables *
117	*********************************************************************************************************************************/
118
/** Whitespace characters used when the lexer config does not provide its own set. */
static const char *g_szWsDef = " \t";

/** Default newline sequences, NULL terminated. */
static const char *g_aszNlDef[] = { "\n", "\r\n", NULL };

/** Default set of characters an identifier may consist of. */
static const char *g_aszIdeCharSetDef = "_"
                                        "abcdefghijklmnopqrstuvwxyz"
                                        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                                        "0123456789";
130
131
132	/*********************************************************************************************************************************
133	* Internal Functions *
134	*********************************************************************************************************************************/
135
136
137	/**
138	* Locates the given character in the string, consuming it if found.
139	*
140	* @returns Flag whether the character was found in the string.
141	* @param pThis The lexer state.
142	* @param ch The character to check for.
143	* @param psz The string to check.
144	*/
145	DECLINLINE(bool) rtScriptLexLocateChInStrConsume(PRTSCRIPTLEXINT pThis, char ch, const char *psz)
146	{
147	while ( *psz != '\0'
148	&& *psz != ch)
149	psz++;
150
151	if (*psz != '\0')
152	RTScriptLexConsumeCh(pThis);
153
154	return *psz != '\0';
155	}
156
157
158	/**
159	* Matches the input against the given string starting with the given character, consuming it
160	* if found.
161	*
162	* @returns Flag whether there was a match.
163	* @param pThis The lexer state.
164	* @param ch The character to check start matching.
165	* @param psz The string to match against.
166	* @param pszExclude When the string matched but the input continues
167	* with one of the characters in this string there will
168	* be no match.
169	*/
170	DECLINLINE(bool) rtScriptLexMatchStrConsume(PRTSCRIPTLEXINT pThis, char ch, const char *psz,
171	const char *pszExclude)
172	{
173	bool fMatch = false;
174	if (*psz == ch)
175	{
176	unsigned offPeek = 1;
177
178	psz++;
179	while ( *psz != '\0'
180	&& *psz == RTScriptLexPeekCh(pThis, offPeek))
181	{
182	offPeek++;
183	psz++;
184	}
185
186	if (*psz == '\0')
187	{
188	if (pszExclude)
189	{
190	ch = RTScriptLexPeekCh(pThis, offPeek);
191	fMatch = strchr(pszExclude, ch) == NULL;
192	}
193	else
194	fMatch = true;
195	}
196
197	if (fMatch)
198	{
199	/* Match, consume everything. */
200	while (offPeek-- > 0)
201	RTScriptLexConsumeCh(pThis);
202	}
203	}
204
205	return fMatch;
206	}
207
208
209	/**
210	* Tries to locate a string with the given starting character (+ peeking ahead) in the
211	* given string array (exact match) and consumes the entire substring.
212	*
213	* @returns Flag whether there was a match.
214	* @param pThis The lexer state.
215	* @param ch The character to check for.
216	* @param papsz Pointer to the string array to check for.
217	* @param pidx Where to store the index of the matching substring if found,
218	* optional.
219	*/
220	DECLINLINE(bool) rtScriptLexLocateSubStrInStrArrayMatchConsume(PRTSCRIPTLEXINT pThis, char ch,
221	const char *papsz, unsigned pidx)
222	{
223	unsigned int idx = 0;
224
225	while ( papsz[idx] != NULL
226	&& !rtScriptLexMatchStrConsume(pThis, ch, papsz[idx], NULL))
227	idx++;
228
229	if ( papsz[idx] != NULL
230	&& pidx)
231	*pidx = idx;
232
233	return papsz[idx] != NULL;
234	}
235
236
237	/**
238	* Tries to get an exact match starting with the given character, consuming it when found.
239	*
240	* @returns Flag whether there was a match.
241	* @param pThis The lexer state.
242	* @param ch The character to check for.
243	* @param ppMatch Where to store the exact match on success.
244	*/
245	DECLINLINE(bool) rtScriptLexLocateExactMatchConsume(PRTSCRIPTLEXINT pThis, char ch, PCRTSCRIPTLEXTOKMATCH *ppMatch)
246	{
247	PCRTSCRIPTLEXTOKMATCH pTokMatch = pThis->pCfg->paTokMatches;
248
249	if (pTokMatch)
250	{
251	while ( pTokMatch->pszMatch != NULL
252	&& !rtScriptLexMatchStrConsume(pThis, ch, pTokMatch->pszMatch,
253	pTokMatch->fMaybeIdentifier
254	? g_aszIdeCharSetDef
255	: NULL))
256	pTokMatch++;
257
258	if (pTokMatch->pszMatch != NULL)
259	{
260	*ppMatch = pTokMatch;
261	return true;
262	}
263	}
264
265	return false;
266	}
267
268
269	DECLINLINE(bool) rtScriptLexIsNewlineConsume(PRTSCRIPTLEXINT pThis, char ch)
270	{
271	const char **papszNl = pThis->pCfg->pszWhitespace ? pThis->pCfg->papszNewline : g_aszNlDef;
272
273	bool fMatched = rtScriptLexLocateSubStrInStrArrayMatchConsume(pThis, ch, papszNl, NULL);
274	if (fMatched)
275	{
276	pThis->Pos.iLine++;
277	pThis->Pos.iCh = 1;
278	}
279
280	return fMatched;
281	}
282
283
284	/**
285	* Checks whether the character is the beginning of a multi line comment, skipping the whole
286	* comment if necessary.
287	*
288	* @returns Flag whether a multi line comment was detected and consumed.
289	* @param hScriptLex The lexer state.
290	* @param ch The character to check for.
291	*/
292	DECLINLINE(bool) rtScriptLexIsMultiLineCommentConsume(PRTSCRIPTLEXINT pThis, char ch)
293	{
294	const char **papszCommentMultiStart = pThis->pCfg->papszCommentMultiStart;
295	unsigned idxComment = 0;
296
297	if ( papszCommentMultiStart
298	&& rtScriptLexLocateSubStrInStrArrayMatchConsume(pThis, ch, papszCommentMultiStart,
299	&idxComment))
300	{
301	/* Look for the matching closing lexeme in the input consuming everything along the way. */
302	const char *pszClosing = pThis->pCfg->papszCommentMultiEnd[idxComment];
303
304	for (;;)
305	{
306	char chTmp = RTScriptLexGetCh(pThis);
307
308	/* Check for new lines explicetly to advance the position information. */
309	if (rtScriptLexIsNewlineConsume(pThis, chTmp))
310	continue;
311
312	/** @todo Not quite correct when there is an end of stream before the closing lexeme.
313	* But doesn't hurt at the moment. */
314	if ( chTmp == '\0'
315	\|\| rtScriptLexMatchStrConsume(pThis, chTmp, pszClosing, NULL))
316	break;
317
318	RTScriptLexConsumeCh(pThis);
319	}
320
321	return true;
322	}
323
324	return false;
325	}
326
327
328	/**
329	* Checks whether the character is the beginning of a single line comment, skipping the whole
330	* comment if necessary.
331	*
332	* @returns Flag whether a single line comment was detected and consumed.
333	* @param hScriptLex The lexer state.
334	* @param ch The character to check for.
335	*/
336	DECLINLINE(bool) rtScriptLexIsSingleLineCommentConsume(PRTSCRIPTLEXINT pThis, char ch)
337	{
338	const char **papszCommentSingleStart = pThis->pCfg->papszCommentSingleStart;
339
340	if ( papszCommentSingleStart
341	&& rtScriptLexLocateSubStrInStrArrayMatchConsume(pThis, ch, papszCommentSingleStart,
342	NULL))
343	{
344	for (;;)
345	{
346	char chTmp = RTScriptLexGetCh(pThis);
347
348	if ( chTmp == '\0'
349	\|\| rtScriptLexIsNewlineConsume(pThis, chTmp))
350	break;
351
352	RTScriptLexConsumeCh(pThis);
353	}
354
355	return true;
356	}
357
358	return false;
359	}
360
361
362	/**
363	* Fills the input buffer with source data.
364	*
365	* @returns IPRT status code.
366	* @param pThis The lexer state.
367	*/
368	static int rtScriptLexFillBuffer(PRTSCRIPTLEXINT pThis)
369	{
370	int rc = VINF_SUCCESS;
371	size_t cchToRead = pThis->cchBuf;
372	char *pchRead = &pThis->achBuf[0];
373
374	AssertReturn(!(pThis->fFlags & RTSCRIPT_LEX_INT_F_EOS), VERR_INVALID_STATE);
375
376	/* If there is input left to process move it to the front and fill the remainder. */
377	if (pThis->pchCur != NULL)
378	{
379	cchToRead = pThis->pchCur - &pThis->achBuf[0];
380	/* Move the rest to the front. */
381	memmove(&pThis->achBuf[0], pThis->pchCur, pThis->cchBuf - cchToRead);
382	pchRead = (char *)pThis->pchCur + 1;
383	memset(pchRead, 0, cchToRead);
384	}
385
386	if (cchToRead)
387	{
388	pThis->pchCur = &pThis->achBuf[0];
389
390	size_t cchRead = 0;
391	rc = pThis->pfnReader(pThis, pThis->offBufRead, pchRead, cchToRead, &cchRead, pThis->pvUser);
392	if (RT_SUCCESS(rc))
393	{
394	pThis->offBufRead += cchRead;
395	if (rc == VINF_EOF)
396	pThis->fFlags \|= RTSCRIPT_LEX_INT_F_EOS;
397	rc = VINF_SUCCESS;
398	}
399	else
400	pThis->rcRdr = rc;
401	}
402	else
403	rc = VERR_BUFFER_OVERFLOW; /** @todo */
404
405	return rc;
406	}
407
408
409	/**
410	* Produce an end of stream token.
411	*
412	* @returns nothing.
413	* @param pThis The lexer state.
414	* @param pTok The token to fill.
415	*/
416	static void rtScriptLexProduceTokEos(PRTSCRIPTLEXINT pThis, PRTSCRIPTLEXTOKEN pTok)
417	{
418	pTok->enmType = RTSCRIPTLEXTOKTYPE_EOS;
419	pTok->PosStart = pThis->Pos;
420	pTok->PosEnd = pThis->Pos;
421	}
422
423
424	RTDECL(int) RTScriptLexProduceTokError(RTSCRIPTLEX hScriptLex, PRTSCRIPTLEXTOKEN pTok,
425	int rc, const char *pszMsg, ...)
426	{
427	PRTSCRIPTLEXINT pThis = hScriptLex;
428
429	va_list va;
430	va_start(va, pszMsg);
431
432	pTok->enmType = RTSCRIPTLEXTOKTYPE_ERROR;
433	pTok->PosEnd = pThis->Pos;
434	pTok->Type.Error.pErr = &pThis->ErrInfo.Core;
435
436	RTErrInfoInitStatic(&pThis->ErrInfo);
437	RTErrInfoSetV(&pThis->ErrInfo.Core, rc, pszMsg, va);
438	va_end(va);
439
440	return rc;
441	}
442
443
444	RTDECL(int) RTScriptLexProduceTokIde(RTSCRIPTLEX hScriptLex, PRTSCRIPTLEXTOKEN pTok, const char *pszIde, size_t cchIde)
445	{
446	PRTSCRIPTLEXINT pThis = hScriptLex;
447
448	/* Insert into string cache. */
449	pTok->enmType = RTSCRIPTLEXTOKTYPE_IDENTIFIER;
450	pTok->Type.Id.pszIde = RTStrCacheEnterN(pThis->hStrCacheId, pszIde, cchIde);
451	if (RT_UNLIKELY(!pTok->Type.Id.pszIde))
452	return RTScriptLexProduceTokError(hScriptLex, pTok, VERR_NO_STR_MEMORY, "Lexer: Out of memory inserting identifier into string cache");
453
454	pTok->PosEnd = pThis->Pos;
455	return VINF_SUCCESS;
456	}
457
458
459	/**
460	* Create the token from the exact match.
461	*
462	* @returns nothing.
463	* @param pThis The lexer state.
464	* @param pTok The token to fill.
465	* @param pMatch The matched string.
466	*/
467	static void rtScriptLexProduceTokFromExactMatch(PRTSCRIPTLEXINT pThis, PRTSCRIPTLEXTOKEN pTok,
468	PCRTSCRIPTLEXTOKMATCH pMatch)
469	{
470	pTok->enmType = pMatch->enmTokType;
471	pTok->PosEnd = pThis->Pos;
472
473	switch (pTok->enmType)
474	{
475	case RTSCRIPTLEXTOKTYPE_OPERATOR:
476	pTok->Type.Operator.pOp = pMatch;
477	break;
478	case RTSCRIPTLEXTOKTYPE_KEYWORD:
479	pTok->Type.Keyword.pKeyword = pMatch;
480	break;
481	case RTSCRIPTLEXTOKTYPE_PUNCTUATOR:
482	pTok->Type.Punctuator.pPunctuator = pMatch;
483	break;
484	default:
485	RTScriptLexProduceTokError(pThis, pTok, VERR_INVALID_PARAMETER,
486	"Lexer: The match contains an invalid token type: %d\n",
487	pTok->enmType);
488	}
489	}
490
491
492	/**
493	* Goes through the rules trying to find a matching one.
494	*
495	* @returns Flag whether a matching rule was found.
496	* @param pThis The lexer state.
497	* @param ch The character to check.
498	* @param pTok The token to fill.
499	*/
500	static bool rtScriptLexProduceTokFromRules(PRTSCRIPTLEXINT pThis, char ch, PRTSCRIPTLEXTOKEN pTok)
501	{
502	PCRTSCRIPTLEXRULE pRule = pThis->pCfg->paRules;
503
504	if (pRule)
505	{
506	while (pRule->pfnProd != NULL)
507	{
508	if ( ch >= pRule->chStart
509	&& ch <= pRule->chEnd)
510	{
511	if (pRule->fFlags & RTSCRIPT_LEX_RULE_CONSUME)
512	RTScriptLexConsumeCh(pThis);
513	int rc = pRule->pfnProd(pThis, ch, pTok, pRule->pvUser);
514	AssertRC(rc);
515	return true;
516	}
517
518	pRule++;
519	}
520	}
521
522	return false;
523	}
524
525
526	/**
527	* Fills in the given token from the scanned input at the current location.
528	*
529	* @returns IPRT status code.
530	* @param pThis The lexer state.
531	* @param pTok The token to fill.
532	*/
533	static int rtScriptLexProduceToken(PRTSCRIPTLEXINT pThis, PRTSCRIPTLEXTOKEN pTok)
534	{
535	RTScriptLexSkipWhitespace(pThis);
536
537	pTok->PosStart = pThis->Pos;
538
539	char ch = RTScriptLexGetCh(pThis);
540	PCRTSCRIPTLEXTOKMATCH pMatch = NULL;
541	if (ch == '\0')
542	rtScriptLexProduceTokEos(pThis, pTok);
543	else if (rtScriptLexLocateExactMatchConsume(pThis, ch, &pMatch))
544	rtScriptLexProduceTokFromExactMatch(pThis, pTok, pMatch);
545	else if (!rtScriptLexProduceTokFromRules(pThis, ch, pTok))
546	{
547	if (pThis->pCfg->pfnProdDef)
548	pThis->rcRdr = pThis->pCfg->pfnProdDef(pThis, ch, pTok, pThis->pCfg->pvProdDefUser);
549	else
550	RTScriptLexProduceTokError(pThis, pTok, VERR_INVALID_PARAMETER,
551	"Lexer: Invalid character found in input: %c\n",
552	ch);
553	}
554
555	return pThis->rcRdr;
556	}
557
558
559	/**
560	* Populates the lexer for the initial use.
561	*
562	* @returns IPRT status code.
563	* @param pThis The lexer state.
564	*/
565	static int rtScriptLexPopulate(PRTSCRIPTLEXINT pThis)
566	{
567	int rc = rtScriptLexFillBuffer(pThis);
568	if (RT_SUCCESS(rc))
569	{
570	rc = rtScriptLexProduceToken(pThis, pThis->pTokCur);
571	if (RT_SUCCESS(rc))
572	rc = rtScriptLexProduceToken(pThis, pThis->pTokNext);
573	}
574
575	return rc;
576	}
577
578
579
580	RTDECL(int) RTScriptLexCreateFromReader(PRTSCRIPTLEX phScriptLex, PFNRTSCRIPTLEXRDR pfnReader,
581	PFNRTSCRIPTLEXDTOR pfnDtor, void *pvUser,
582	size_t cchBuf, PRTSTRCACHE phStrCacheId, PRTSTRCACHE phStrCacheStringLit,
583	PCRTSCRIPTLEXCFG pCfg)
584	{
585	AssertPtrReturn(phScriptLex, VERR_INVALID_POINTER);
586	AssertPtrReturn(pfnReader, VERR_INVALID_POINTER);
587	AssertPtrReturn(pCfg, VERR_INVALID_POINTER);
588
589	if (!cchBuf)
590	cchBuf = _16K;
591	int rc = VINF_SUCCESS;
592	PRTSCRIPTLEXINT pThis = (PRTSCRIPTLEXINT)RTMemAllocZ(RT_UOFFSETOF_DYN(RTSCRIPTLEXINT, achBuf[cchBuf]));
593	if (RT_LIKELY(pThis))
594	{
595	pThis->u32Magic = 0xfefecafe; /** @todo */
596	pThis->Pos.iLine = 1;
597	pThis->Pos.iCh = 1;
598	pThis->pTokCur = &pThis->aToks[0];
599	pThis->pTokNext = &pThis->aToks[1];
600	pThis->pCfg = pCfg;
601	pThis->pfnReader = pfnReader;
602	pThis->pfnDtor = pfnDtor;
603	pThis->pvUser = pvUser;
604	pThis->fFlags = 0;
605	pThis->cchStrLitMax = 0;
606	pThis->pszStrLit = NULL;
607	pThis->cchBuf = cchBuf;
608	pThis->offBufRead = 0;
609	pThis->pchCur = NULL;
610	pThis->hStrCacheId = NULL;
611	pThis->hStrCacheStringLit = NULL;
612
613	rc = RTStrCacheCreate(&pThis->hStrCacheId, "LEX-Ide");
614	if (RT_SUCCESS(rc))
615	{
616	rc = RTStrCacheCreate(&pThis->hStrCacheStringLit, "LEX-StrLit");
617	if (RT_SUCCESS(rc))
618	{
619	rc = rtScriptLexPopulate(pThis);
620	if (RT_SUCCESS(rc))
621	{
622	*phScriptLex = pThis;
623
624	if (phStrCacheId)
625	*phStrCacheId = pThis->hStrCacheId;
626	else
627	pThis->fFlags \|= RTSCRIPT_LEX_INT_F_STR_CACHE_ID_FREE;
628
629	if (phStrCacheStringLit)
630	*phStrCacheStringLit = pThis->hStrCacheStringLit;
631	else
632	pThis->fFlags \|= RTSCRIPT_LEX_INT_F_STR_CACHE_STR_LIT_FREE;
633
634	return VINF_SUCCESS;
635	}
636
637	RTStrCacheDestroy(pThis->hStrCacheStringLit);
638	}
639
640	RTStrCacheDestroy(pThis->hStrCacheId);
641	}
642
643	RTMemFree(pThis);
644	}
645	else
646	rc = VERR_NO_MEMORY;
647
648	return rc;
649	}
650
651
652	/**
653	* @callback_method_impl{FNRTSCRIPTLEXRDR, Worker to read from a string.}
654	*/
655	static DECLCALLBACK(int) rtScriptLexReaderStr(RTSCRIPTLEX hScriptLex, size_t offBuf, char *pchCur,
656	size_t cchBuf, size_t pcchRead, void pvUser)
657	{
658	RT_NOREF(hScriptLex);
659
660	const char psz = (const char )pvUser;
661	size_t cch = strlen(psz);
662	size_t cchCopy = RT_MIN(cchBuf, cch - offBuf);
663	int rc = VINF_SUCCESS;
664
665	*pcchRead = cchCopy;
666
667	if (cchCopy)
668	memcpy(pchCur, &psz[offBuf], cchCopy * sizeof(char));
669	else
670	rc = VINF_EOF;
671
672	return rc;
673	}
674
675
676	RTDECL(int) RTScriptLexCreateFromString(PRTSCRIPTLEX phScriptLex, const char *pszSrc, PRTSTRCACHE phStrCacheId,
677	PRTSTRCACHE phStrCacheStringLit, PCRTSCRIPTLEXCFG pCfg)
678	{
679	return RTScriptLexCreateFromReader(phScriptLex, rtScriptLexReaderStr, NULL, (void *)pszSrc, 0,
680	phStrCacheId, phStrCacheStringLit, pCfg);
681	}
682
683
684	/**
685	* @callback_method_impl{FNRTSCRIPTLEXRDR, Worker to read from a file.}
686	*/
687	static DECLCALLBACK(int) rtScriptLexReaderFile(RTSCRIPTLEX hScriptLex, size_t offBuf, char *pchCur,
688	size_t cchBuf, size_t pcchRead, void pvUser)
689	{
690	RT_NOREF(hScriptLex);
691
692	RTFILE hFile = (RTFILE)pvUser;
693	return RTFileReadAt(hFile, offBuf, pchCur, cchBuf, pcchRead);
694	}
695
696
697	/**
698	* @callback_method_impl{FNRTSCRIPTLEXDTOR, Destructor for the file variant.}
699	*/
700	static DECLCALLBACK(void) rtScriptLexDtorFile(RTSCRIPTLEX hScriptLex, void *pvUser)
701	{
702	RT_NOREF(hScriptLex);
703
704	RTFILE hFile = (RTFILE)pvUser;
705	RTFileClose(hFile);
706	}
707
708
709	RTDECL(int) RTScriptLexCreateFromFile(PRTSCRIPTLEX phScriptLex, const char *pszFilename, PRTSTRCACHE phStrCacheId,
710	PRTSTRCACHE phStrCacheStringLit, PCRTSCRIPTLEXCFG pCfg)
711	{
712	RTFILE hFile;
713	int rc = RTFileOpen(&hFile, pszFilename, RTFILE_O_READ \| RTFILE_O_DENY_WRITE \| RTFILE_O_OPEN);
714	if (RT_SUCCESS(rc))
715	{
716	rc = RTScriptLexCreateFromReader(phScriptLex, rtScriptLexReaderFile, rtScriptLexDtorFile, (void *)hFile, 0,
717	phStrCacheId, phStrCacheStringLit, pCfg);
718	if (RT_FAILURE(rc))
719	RTFileClose(hFile);
720	}
721
722	return rc;
723	}
724
725
726	RTDECL(void) RTScriptLexDestroy(RTSCRIPTLEX hScriptLex)
727	{
728	PRTSCRIPTLEXINT pThis = hScriptLex;
729	AssertPtrReturnVoid(pThis);
730
731	if (pThis->pfnDtor)
732	pThis->pfnDtor(pThis, pThis->pvUser);
733
734	if (pThis->fFlags & RTSCRIPT_LEX_INT_F_STR_CACHE_ID_FREE)
735	RTStrCacheDestroy(pThis->hStrCacheId);
736	if (pThis->fFlags & RTSCRIPT_LEX_INT_F_STR_CACHE_STR_LIT_FREE)
737	RTStrCacheDestroy(pThis->hStrCacheStringLit);
738
739	if (pThis->pszStrLit)
740	RTStrFree(pThis->pszStrLit);
741
742	RTMemFree(pThis);
743	}
744
745
746	RTDECL(int) RTScriptLexQueryToken(RTSCRIPTLEX hScriptLex, PCRTSCRIPTLEXTOKEN *ppToken)
747	{
748	PRTSCRIPTLEXINT pThis = hScriptLex;
749	AssertPtrReturn(pThis, VERR_INVALID_HANDLE);
750	AssertPtrReturn(ppToken, VERR_INVALID_POINTER);
751
752	if (RT_SUCCESS(pThis->rcRdr))
753	*ppToken = pThis->pTokCur;
754
755	return pThis->rcRdr;
756	}
757
758
759	RTDECL(RTSCRIPTLEXTOKTYPE) RTScriptLexGetTokenType(RTSCRIPTLEX hScriptLex)
760	{
761	PRTSCRIPTLEXINT pThis = hScriptLex;
762	AssertPtrReturn(pThis, RTSCRIPTLEXTOKTYPE_INVALID);
763
764	if (RT_SUCCESS(pThis->rcRdr))
765	return pThis->pTokCur->enmType;
766
767	return RTSCRIPTLEXTOKTYPE_INVALID;
768	}
769
770
771	RTDECL(RTSCRIPTLEXTOKTYPE) RTScriptLexPeekNextTokenType(RTSCRIPTLEX hScriptLex)
772	{
773	PRTSCRIPTLEXINT pThis = hScriptLex;
774	AssertPtrReturn(pThis, RTSCRIPTLEXTOKTYPE_INVALID);
775
776	if (RT_SUCCESS(pThis->rcRdr))
777	return pThis->pTokNext->enmType;
778
779	return RTSCRIPTLEXTOKTYPE_INVALID;
780	}
781
782
783	RTDECL(PCRTSCRIPTLEXTOKEN) RTScriptLexConsumeToken(RTSCRIPTLEX hScriptLex)
784	{
785	PRTSCRIPTLEXINT pThis = hScriptLex;
786	AssertPtrReturn(pThis, NULL);
787
788	/*
789	* Stop token production as soon as the current token indicates the
790	* end of the stream or an error
791	*/
792	if ( pThis->pTokCur->enmType != RTSCRIPTLEXTOKTYPE_EOS
793	&& pThis->pTokCur->enmType != RTSCRIPTLEXTOKTYPE_ERROR)
794	{
795	PRTSCRIPTLEXTOKEN pTokTmp = pThis->pTokCur;
796
797	/* Switch next token to current token and read in the next token. */
798	pThis->pTokCur = pThis->pTokNext;
799	pThis->pTokNext = pTokTmp;
800	if ( pThis->pTokCur->enmType != RTSCRIPTLEXTOKTYPE_EOS
801	&& pThis->pTokCur->enmType != RTSCRIPTLEXTOKTYPE_ERROR)
802	rtScriptLexProduceToken(pThis, pThis->pTokNext);
803	else
804	pThis->pTokNext = pThis->pTokCur;
805	}
806
807	return pThis->pTokCur;
808	}
809
810
811	RTDECL(char) RTScriptLexConsumeCh(RTSCRIPTLEX hScriptLex)
812	{
813	return RTScriptLexConsumeChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_DEFAULT);
814	}
815
816
817	RTDECL(char) RTScriptLexConsumeChEx(RTSCRIPTLEX hScriptLex, uint32_t fFlags)
818	{
819	PRTSCRIPTLEXINT pThis = hScriptLex;
820	AssertPtrReturn(pThis, '\0');
821
822	pThis->pchCur++;
823	pThis->Pos.iCh++;
824	if (pThis->pchCur == &pThis->achBuf[pThis->cchBuf])
825	rtScriptLexFillBuffer(pThis);
826
827	return RTScriptLexGetChEx(pThis, fFlags);
828	}
829
830
831	RTDECL(char) RTScriptLexPeekCh(RTSCRIPTLEX hScriptLex, unsigned idx)
832	{
833	return RTScriptLexPeekChEx(hScriptLex, idx, RTSCRIPT_LEX_CONV_F_DEFAULT);
834	}
835
836
837	RTDECL(char) RTScriptLexPeekChEx(RTSCRIPTLEX hScriptLex, unsigned idx, uint32_t fFlags)
838	{
839	PRTSCRIPTLEXINT pThis = hScriptLex;
840	AssertPtrReturn(pThis, '\0');
841
842	/* Just return the character if it is in the current buffer. */
843	char ch = '\0';
844	if (RT_LIKELY(pThis->pchCur + idx < &pThis->achBuf[pThis->cchBuf]))
845	ch = pThis->pchCur[idx];
846	else
847	{
848	/* Slow path, read data into temporary buffer to read character from and dismiss. */
849	/** @todo */
850	AssertReleaseFailed();
851	}
852
853	if ( (pThis->pCfg->fFlags & RTSCRIPT_LEX_CFG_F_CASE_INSENSITIVE)
854	&& !(fFlags & RTSCRIPT_LEX_CONV_F_NOTHING))
855	ch = RT_C_TO_LOWER(ch);
856
857	return ch;
858	}
859
860
861	RTDECL(char) RTScriptLexGetCh(RTSCRIPTLEX hScriptLex)
862	{
863	return RTScriptLexPeekCh(hScriptLex, 0);
864	}
865
866
867	RTDECL(char) RTScriptLexGetChEx(RTSCRIPTLEX hScriptLex, uint32_t fFlags)
868	{
869	return RTScriptLexPeekChEx(hScriptLex, 0, fFlags);
870	}
871
872
873	RTDECL(void) RTScriptLexSkipWhitespace(RTSCRIPTLEX hScriptLex)
874	{
875	PRTSCRIPTLEXINT pThis = hScriptLex;
876	AssertPtrReturnVoid(pThis);
877
878	for (;;)
879	{
880	char ch = RTScriptLexGetCh(hScriptLex);
881
882	if (ch == '\0')
883	break;
884
885	/* Check for whitespace. */
886	const char *pszWs = pThis->pCfg->pszWhitespace ? pThis->pCfg->pszWhitespace : g_szWsDef;
887
888	if ( rtScriptLexLocateChInStrConsume(pThis, ch, pszWs)
889	\|\| rtScriptLexIsNewlineConsume(pThis, ch)
890	\|\| rtScriptLexIsMultiLineCommentConsume(pThis, ch)
891	\|\| rtScriptLexIsSingleLineCommentConsume(pThis, ch))
892	continue;
893
894	/* All white space skipped, next is some real content. */
895	break;
896	}
897	}
898
899
900	RTDECL(int) RTScriptLexScanNumber(RTSCRIPTLEX hScriptLex, uint8_t uBase, bool fAllowReal,
901	PRTSCRIPTLEXTOKEN pTok)
902	{
903	RT_NOREF(uBase, fAllowReal, pTok);
904	PRTSCRIPTLEXINT pThis = hScriptLex;
905	AssertPtrReturn(pThis, VERR_INVALID_POINTER);
906	AssertReturn(!fAllowReal, VERR_NOT_IMPLEMENTED);
907	AssertReturn(!uBase, VERR_NOT_IMPLEMENTED);
908
909	/** @todo r=aeichner Quick and dirty to have something working for the disassembler testcase.
910	* Among others it misses overflow handling. */
911	uBase = 10;
912	char ch = RTScriptLexGetCh(hScriptLex);
913	pTok->Type.Number.enmType = ch == '-'
914	? RTSCRIPTLEXTOKNUMTYPE_INTEGER
915	: RTSCRIPTLEXTOKNUMTYPE_NATURAL;
916	if (ch == '-' \|\| ch == '+')
917	ch = RTScriptLexConsumeCh(hScriptLex);
918
919	if (ch == '0')
920	{
921	/* Some hex prefix? */
922	char chNext = RTScriptLexPeekCh(hScriptLex, 1);
923	if (chNext == 'x')
924	{
925	uBase = 16;
926	RTScriptLexConsumeCh(hScriptLex);
927	}
928	else if (chNext >= '0' && chNext <= '9') /* Octal stuff. */
929	AssertFailedReturn(VERR_NOT_IMPLEMENTED);
930
931	ch = RTScriptLexConsumeCh(hScriptLex);
932	}
933
934	uint64_t u64 = 0;
935	for (;;)
936	{
937	if ( (ch < '0' \|\| ch > '9')
938	&& (ch < 'a' \|\| ch > 'f' \|\| uBase == 10))
939	{
940	if (pTok->Type.Number.enmType == RTSCRIPTLEXTOKNUMTYPE_INTEGER)
941	pTok->Type.Number.Type.i64 = -(int64_t)u64;
942	else
943	pTok->Type.Number.Type.u64 = u64;
944	pTok->enmType = RTSCRIPTLEXTOKTYPE_NUMBER;
945	pTok->PosEnd = pThis->Pos;
946	return VINF_SUCCESS;
947	}
948
949	if (ch >= '0' && ch <= '9')
950	u64 = (u64 * uBase) + (ch - '0');
951	else if (ch >= 'a' && ch <= 'f')
952	{
953	Assert(uBase == 16);
954	u64 = (u64 << 4) + 10 + (ch - 'a');
955	}
956
957	ch = RTScriptLexConsumeCh(hScriptLex);
958	}
959	}
960
961
962	RTDECL(int) RTScriptLexScanIdentifier(RTSCRIPTLEX hScriptLex, char ch,
963	PRTSCRIPTLEXTOKEN pTok, void *pvUser)
964	{
965	PRTSCRIPTLEXINT pThis = hScriptLex;
966	AssertPtrReturn(pThis, VERR_INVALID_POINTER);
967
968	const char pszCharSet = pvUser ? (const char )pvUser : g_aszIdeCharSetDef;
969	char aszIde[513]; RT_ZERO(aszIde);
970	unsigned idx = 0;
971	aszIde[idx++] = ch;
972
973	ch = RTScriptLexGetCh(hScriptLex);
974	while ( idx < sizeof(aszIde) - 1
975	&& rtScriptLexLocateChInStrConsume(hScriptLex, ch, pszCharSet))
976	{
977	aszIde[idx++] = ch;
978	ch = RTScriptLexGetCh(hScriptLex);
979	}
980
981	if ( idx == sizeof(aszIde) - 1
982	&& rtScriptLexLocateChInStrConsume(hScriptLex, ch, pszCharSet))
983	return RTScriptLexProduceTokError(hScriptLex, pTok, VERR_BUFFER_OVERFLOW, "Lexer: Identifier exceeds the allowed length");
984
985	/* Insert into string cache. */
986	pTok->enmType = RTSCRIPTLEXTOKTYPE_IDENTIFIER;
987	pTok->Type.Id.pszIde = RTStrCacheEnterN(pThis->hStrCacheId, &aszIde[0], idx);
988	if (RT_UNLIKELY(!pTok->Type.Id.pszIde))
989	return RTScriptLexProduceTokError(hScriptLex, pTok, VERR_NO_STR_MEMORY, "Lexer: Out of memory inserting identifier into string cache");
990
991	pTok->PosEnd = pThis->Pos;
992	return VINF_SUCCESS;
993	}
994
995
996	/**
997	* Adds the given character to the string literal add the given position, assuring the string
998	* is always zero terminated.
999	*
1000	* @returns IPRT status code.
1001	* @param pThis The lexer state.
1002	* @param ch The character to add.
1003	* @param idx At which position to add the character in the string.
1004	*/
1005	static int rtScriptLexScanStringLiteralChAdd(PRTSCRIPTLEXINT pThis, char ch, uint32_t idx)
1006	{
1007	int rc = VINF_SUCCESS;
1008
1009	if ( !pThis->cchStrLitMax
1010	\|\| idx >= pThis->cchStrLitMax - 1)
1011	{
1012	/* Increase memory. */
1013	size_t cchMaxNew = pThis->cchStrLitMax + 64;
1014	char *pszNew = NULL;
1015	rc = RTStrRealloc(&pszNew, cchMaxNew * sizeof(char));
1016	if (RT_SUCCESS(rc))
1017	{
1018	pThis->pszStrLit = pszNew;
1019	pThis->cchStrLitMax = cchMaxNew;
1020	}
1021	}
1022
1023	if (RT_SUCCESS(rc))
1024	{
1025	pThis->pszStrLit[idx] = ch;
1026	pThis->pszStrLit[idx + 1] = '\0';
1027	}
1028
1029	return rc;
1030	}
1031
1032
1033	RTDECL(int) RTScriptLexScanStringLiteralC(RTSCRIPTLEX hScriptLex, char ch,
1034	PRTSCRIPTLEXTOKEN pTok, void *pvUser)
1035	{
1036	RT_NOREF(ch, pvUser);
1037	PRTSCRIPTLEXINT pThis = hScriptLex;
1038	AssertPtrReturn(pThis, VERR_INVALID_POINTER);
1039
1040	uint32_t idxChCur = 0;
1041	int rc = rtScriptLexScanStringLiteralChAdd(pThis, '\0', idxChCur);
1042	if (RT_FAILURE(rc))
1043	return RTScriptLexProduceTokError(hScriptLex, pTok, rc, "Lexer: Error adding character to string literal");
1044
1045	ch = RTScriptLexGetCh(hScriptLex);
1046	for (;;)
1047	{
1048	if (ch == '\0')
1049	return RTScriptLexProduceTokError(hScriptLex, pTok, VERR_EOF, "Lexer: End of stream before closing string literal terminal");
1050	else if (ch == '\"')
1051	{
1052	RTScriptLexConsumeCh(hScriptLex);
1053
1054	/* End of string, add it to the string literal cache and build the token. */
1055	pTok->enmType = RTSCRIPTLEXTOKTYPE_STRINGLIT;
1056	pTok->Type.StringLit.cchString = idxChCur;
1057	pTok->Type.StringLit.pszString = RTStrCacheEnterN(pThis->hStrCacheStringLit, pThis->pszStrLit, idxChCur);
1058	if (RT_UNLIKELY(!pTok->Type.StringLit.pszString))
1059	return RTScriptLexProduceTokError(hScriptLex, pTok, VERR_NO_STR_MEMORY, "Lexer: Error adding string literal to the cache");
1060	else
1061	break;
1062	}
1063	else if (ch == '\\')
1064	{
1065	/* Start of escape sequence. */
1066	RTScriptLexConsumeCh(hScriptLex);
1067	ch = RTScriptLexGetCh(hScriptLex);
1068	switch (ch)
1069	{
1070	case 'a': /* Alert (Bell) */
1071	ch = 0x07;
1072	break;
1073	case 'b': /* Backspace */
1074	ch = 0x08;
1075	break;
1076	case 'e': /* Escape character */
1077	ch = 0x1b;
1078	break;
1079	case 'f': /* Formfeed */
1080	ch = 0x0c;
1081	break;
1082	case 'n': /* Newline (line freed) */
1083	ch = 0x0a;
1084	break;
1085	case 'r': /* Carriage return */
1086	ch = 0x0d;
1087	break;
1088	case 't': /* Horizontal tab */
1089	ch = 0x09;
1090	break;
1091	case 'v': /* Vertical tab */
1092	ch = 0x0b;
1093	break;
1094	case '\\':
1095	case '\'':
1096	case '\"':
1097	case '\?':
1098	/* Can be added as is. */
1099	break;
1100	case 'x': /* Hexdecimal byte. */
1101	case '0': /* Octal */
1102	case '1':
1103	case '2':
1104	case '3':
1105	case '4':
1106	case '5':
1107	case '6':
1108	case '7':
1109	case '8':
1110	case '9':
1111	case 'u': /* Unicode point below 10000 */
1112	case 'U': /* Unicode point */
1113	default:
1114	/* Not supported for now. */
1115	return RTScriptLexProduceTokError(hScriptLex, pTok, VERR_NOT_SUPPORTED, "Lexer: Invalid/unsupported escape sequence");
1116	}
1117	}
1118
1119	rc = rtScriptLexScanStringLiteralChAdd(pThis, ch, idxChCur);
1120	if (RT_SUCCESS(rc))
1121	idxChCur++;
1122	else
1123	return RTScriptLexProduceTokError(hScriptLex, pTok, rc, "Lexer: Error adding character to string literal");
1124
1125	ch = RTScriptLexConsumeChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1126	}
1127
1128	pTok->PosEnd = pThis->Pos;
1129	return VINF_SUCCESS;
1130	}
1131
1132
1133	RTDECL(int) RTScriptLexScanStringLiteralPascal(RTSCRIPTLEX hScriptLex, char ch,
1134	PRTSCRIPTLEXTOKEN pTok, void *pvUser)
1135	{
1136	RT_NOREF(ch, pvUser);
1137	PRTSCRIPTLEXINT pThis = hScriptLex;
1138	AssertPtrReturn(pThis, VERR_INVALID_POINTER);
1139
1140	uint32_t idxChCur = 0;
1141	int rc = rtScriptLexScanStringLiteralChAdd(pThis, '\0', idxChCur);
1142	if (RT_FAILURE(rc))
1143	return RTScriptLexProduceTokError(hScriptLex, pTok, rc, "Lexer: Error adding character to string literal");
1144
1145	ch = RTScriptLexGetChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1146	for (;;)
1147	{
1148	if (ch == '\0')
1149	return RTScriptLexProduceTokError(hScriptLex, pTok, VERR_EOF, "Lexer: End of stream before closing string literal terminal");
1150	else if (ch == '\'')
1151	{
1152	/*
1153	* Check whether there is a second ' coming afterwards used for
1154	* escaping ' characters.
1155	*/
1156	ch = RTScriptLexConsumeChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1157	if (ch != '\'')
1158	{
1159	/* End of string, add it to the string literal cache and build the token. */
1160	pTok->enmType = RTSCRIPTLEXTOKTYPE_STRINGLIT;
1161	pTok->Type.StringLit.cchString = idxChCur;
1162	pTok->Type.StringLit.pszString = RTStrCacheEnterN(pThis->hStrCacheStringLit, pThis->pszStrLit, idxChCur);
1163	if (RT_UNLIKELY(!pTok->Type.StringLit.pszString))
1164	return RTScriptLexProduceTokError(hScriptLex, pTok, VERR_NO_STR_MEMORY, "Lexer: Error adding string literal to the cache");
1165	else
1166	break;
1167	}
1168	/* else: Fall through and add the character to the string literal..*/
1169	}
1170
1171	rc = rtScriptLexScanStringLiteralChAdd(pThis, ch, idxChCur);
1172	if (RT_SUCCESS(rc))
1173	idxChCur++;
1174	else
1175	return RTScriptLexProduceTokError(hScriptLex, pTok, rc, "Lexer: Error adding character to string literal");
1176	ch = RTScriptLexConsumeChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1177	}
1178
1179	pTok->PosEnd = pThis->Pos;
1180	return VINF_SUCCESS;
1181	}
1182

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/VBox/Runtime/common/script/scriptlex.cpp@ 108014

Download in other formats: