scriptlex.cpp@ 105746

Last change on this file since 105746 was 105746, checked in by vboxsync, 8 months ago
Runtime/script: Add a simple lexer API to turn a stream of characters into tokens for a defined configuration, bugref:10394
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 37.1 KB

Line
1	/* $Id: scriptlex.cpp 105746 2024-08-21 07:35:33Z vboxsync $ */
2	/** @file
3	* IPRT - RTScript* lexer API.
4	*/
5
6	/*
7	* Copyright (C) 2017 Oracle Corporation
8	*
9	* This file is part of VirtualBox Open Source Edition (OSE), as
10	* available from http://www.virtualbox.org. This file is free software;
11	* you can redistribute it and/or modify it under the terms of the GNU
12	* General Public License (GPL) as published by the Free Software
13	* Foundation, in version 2 as it comes in the "COPYING" file of the
14	* VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15	* hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16	*
17	* The contents of this file may alternatively be used under the terms
18	* of the Common Development and Distribution License Version 1.0
19	* (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20	* VirtualBox OSE distribution, in which case the provisions of the
21	* CDDL are applicable instead of those of the GPL.
22	*
23	* You may elect to license modified versions of this file under the
24	* terms and conditions of either the GPL or the CDDL or both.
25	*/
26
27
28	/*********************************************************************************************************************************
29	* Header Files *
30	*********************************************************************************************************************************/
31	#define LOG_GROUP RTLOGGROUP_DEFAULT // @todo
32	#include <iprt/script.h>
33
34	#include <iprt/assert.h>
35	#include <iprt/ctype.h>
36	#include <iprt/err.h>
37	#include <iprt/file.h>
38	#include <iprt/log.h>
39	#include <iprt/mem.h>
40	#include <iprt/string.h>
41
42
43	/*********************************************************************************************************************************
44	* Structures and Typedefs *
45	*********************************************************************************************************************************/
46
/**
 * Internal lexer state.
 *
 * The structure is allocated with a variable sized tail: achBuf is the last
 * member and the allocation covers cchBuf bytes for it.
 */
typedef struct RTSCRIPTLEXINT
{
    /** Magic. */
    uint32_t                    u32Magic;
    /** Source position (line and character), advanced while consuming input. */
    RTSCRIPTPOS                 Pos;
    /** Current and next token buffer. */
    RTSCRIPTLEXTOKEN            aToks[2];
    /** Pointer to the current token (one of aToks). */
    PRTSCRIPTLEXTOKEN           pTokCur;
    /** Pointer to the next token (one of aToks). */
    PRTSCRIPTLEXTOKEN           pTokNext;
    /** The lexer config. */
    PCRTSCRIPTLEXCFG            pCfg;
    /** The input reader. */
    PFNRTSCRIPTLEXRDR           pfnReader;
    /** The destructor callback, optional. */
    PFNRTSCRIPTLEXDTOR          pfnDtor;
    /** Opaque user data for the reader and the destructor. */
    void                        *pvUser;
    /** Identifier string cache. */
    RTSTRCACHE                  hStrCacheId;
    /** String literal string cache. */
    RTSTRCACHE                  hStrCacheStringLit;
    /** Status code from the reader (also receives the status of the default token producer). */
    int                         rcRdr;
    /** Internal error info, referenced by error tokens. */
    RTERRINFOSTATIC             ErrInfo;
    /** Lexer flags, RTSCRIPT_LEX_INT_F_XXX. */
    uint32_t                    fFlags;
    /** Maximum number of bytes allocated for temporary storage for literal strings. */
    size_t                      cchStrLitMax;
    /** Pointer to the string buffer for holding the literal string. */
    char                        *pszStrLit;
    /** Pointer to the current input character, NULL before the first buffer fill. */
    const char                  *pchCur;
    /** Offset to start reading the next chunk from. */
    size_t                      offBufRead;
    /** Size of the input buffer. */
    size_t                      cchBuf;
    /** The cached part of the input, variable in size. */
    char                        achBuf[1];
} RTSCRIPTLEXINT;
/** Pointer to the internal lexer state. */
typedef RTSCRIPTLEXINT *PRTSCRIPTLEXINT;
95
96
/** Destroy the identifier string cache together with the lexer (set when the
 * creator did not ask for the cache handle). */
#define RTSCRIPT_LEX_INT_F_STR_CACHE_ID_FREE        RT_BIT_32(0)
/** Destroy the string literal string cache together with the lexer (set when
 * the creator did not ask for the cache handle). */
#define RTSCRIPT_LEX_INT_F_STR_CACHE_STR_LIT_FREE   RT_BIT_32(1)
/** End of stream reached, the reader must not be called again. */
#define RTSCRIPT_LEX_INT_F_EOS                      RT_BIT_32(2)
103
104
105	/*********************************************************************************************************************************
106	* Global Variables *
107	*********************************************************************************************************************************/
108
/** Default set of white spaces, used when the config doesn't supply one. */
static const char *g_szWsDef = " \t";
/** Default set of newlines (NULL terminated array of newline sequences). */
static const char *g_aszNlDef[] =
{
    "\n",
    "\r\n",
    NULL
};
/** Default set of characters allowed for identifiers; also used as the set of
 * characters which must not follow an exact match flagged as a possible identifier. */
static const char *g_aszIdeCharSetDef = "_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
120
121
122	/*********************************************************************************************************************************
123	* Internal Functions *
124	*********************************************************************************************************************************/
125
126
127	/**
128	* Locates the given character in the string, consuming it if found.
129	*
130	* @returns Flag whether the character was found in the string.
131	* @param pThis The lexer state.
132	* @param ch The character to check for.
133	* @param psz The string to check.
134	*/
135	DECLINLINE(bool) rtScriptLexLocateChInStrConsume(PRTSCRIPTLEXINT pThis, char ch, const char *psz)
136	{
137	while ( *psz != '\0'
138	&& *psz != ch)
139	psz++;
140
141	if (*psz != '\0')
142	RTScriptLexConsumeCh(pThis);
143
144	return *psz != '\0';
145	}
146
147
148	/**
149	* Matches the input against the given string starting with the given character, consuming it
150	* if found.
151	*
152	* @returns Flag whether there was a match.
153	* @param pThis The lexer state.
154	* @param ch The character to check start matching.
155	* @param psz The string to match against.
156	* @param pszExclude When the string matched but the input continues
157	* with one of the characters in this string the
158	* match will not
159	*/
160	DECLINLINE(bool) rtScriptLexMatchStrConsume(PRTSCRIPTLEXINT pThis, char ch, const char *psz,
161	const char *pszExclude)
162	{
163	bool fMatch = false;
164	if (*psz == ch)
165	{
166	unsigned offPeek = 1;
167
168	psz++;
169	while ( *psz != '\0'
170	&& *psz == RTScriptLexPeekCh(pThis, offPeek))
171	{
172	offPeek++;
173	psz++;
174	}
175
176	if (*psz == '\0')
177	{
178	if (pszExclude)
179	{
180	ch = RTScriptLexPeekCh(pThis, offPeek);
181	fMatch = strchr(pszExclude, ch) == NULL;
182	}
183	else
184	fMatch = true;
185	}
186
187	if (fMatch)
188	{
189	/* Match, consume everything. */
190	while (offPeek-- > 0)
191	RTScriptLexConsumeCh(pThis);
192	}
193	}
194
195	return fMatch;
196	}
197
198
199	/**
200	* Tries to locate a string with the given starting character (+ peeking ahead) in the
201	* given string array (exact match) and consumes the entire substring.
202	*
203	* @returns Flag whether there was a match.
204	* @param pThis The lexer state.
205	* @param ch The character to check for.
206	* @param papsz Pointer to the string array to check for.
207	* @param pidx Where to store the index of the matching substring if found,
208	* optional.
209	*/
210	DECLINLINE(bool) rtScriptLexLocateSubStrInStrArrayMatchConsume(PRTSCRIPTLEXINT pThis, char ch,
211	const char *papsz, unsigned pidx)
212	{
213	unsigned int idx = 0;
214
215	while ( papsz[idx] != NULL
216	&& !rtScriptLexMatchStrConsume(pThis, ch, papsz[idx], NULL))
217	idx++;
218
219	if ( papsz[idx] != NULL
220	&& pidx)
221	*pidx = idx;
222
223	return papsz[idx] != NULL;
224	}
225
226
227	/**
228	* Tries to get an exact match starting with the given character, consuming it when found.
229	*
230	* @returns Flag whether there was a match.
231	* @param pThis The lexer state.
232	* @param ch The character to check for.
233	* @param ppMatch Where to store the exact match on success.
234	*/
235	DECLINLINE(bool) rtScriptLexLocateExactMatchConsume(PRTSCRIPTLEXINT pThis, char ch, PCRTSCRIPTLEXTOKMATCH *ppMatch)
236	{
237	PCRTSCRIPTLEXTOKMATCH pTokMatch = pThis->pCfg->paTokMatches;
238
239	if (pTokMatch)
240	{
241	while ( pTokMatch->pszMatch != NULL
242	&& !rtScriptLexMatchStrConsume(pThis, ch, pTokMatch->pszMatch,
243	pTokMatch->fMaybeIdentifier
244	? g_aszIdeCharSetDef
245	: NULL))
246	pTokMatch++;
247
248	if (pTokMatch->pszMatch != NULL)
249	{
250	*ppMatch = pTokMatch;
251	return true;
252	}
253	}
254
255	return false;
256	}
257
258
259	DECLINLINE(bool) rtScriptLexIsNewlineConsume(PRTSCRIPTLEXINT pThis, char ch)
260	{
261	const char **papszNl = pThis->pCfg->pszWhitespace ? pThis->pCfg->papszNewline : g_aszNlDef;
262
263	bool fMatched = rtScriptLexLocateSubStrInStrArrayMatchConsume(pThis, ch, papszNl, NULL);
264	if (fMatched)
265	{
266	pThis->Pos.iLine++;
267	pThis->Pos.iCh = 1;
268	}
269
270	return fMatched;
271	}
272
273
274	/**
275	* Checks whether the character is the beginning of a multi line comment, skipping the whole
276	* comment if necessary.
277	*
278	* @returns Flag whether a multi line comment was detected and consumed.
279	* @param hScriptLex The lexer state.
280	* @param ch The character to check for.
281	*/
282	DECLINLINE(bool) rtScriptLexIsMultiLineCommentConsume(PRTSCRIPTLEXINT pThis, char ch)
283	{
284	const char **papszCommentMultiStart = pThis->pCfg->papszCommentMultiStart;
285	unsigned idxComment = 0;
286
287	if ( papszCommentMultiStart
288	&& rtScriptLexLocateSubStrInStrArrayMatchConsume(pThis, ch, papszCommentMultiStart,
289	&idxComment))
290	{
291	/* Look for the matching closing lexeme in the input consuming everything along the way. */
292	const char *pszClosing = pThis->pCfg->papszCommentMultiEnd[idxComment];
293
294	for (;;)
295	{
296	char chTmp = RTScriptLexGetCh(pThis);
297
298	/* Check for new lines explicetly to advance the position information. */
299	if (rtScriptLexIsNewlineConsume(pThis, chTmp))
300	continue;
301
302	/** @todo: Not quite correct when there is an end of stream before the closing lexeme.
303	* But doesn't hurt at the moment. */
304	if ( chTmp == '\0'
305	\|\| rtScriptLexMatchStrConsume(pThis, chTmp, pszClosing, NULL))
306	break;
307
308	RTScriptLexConsumeCh(pThis);
309	}
310
311	return true;
312	}
313
314	return false;
315	}
316
317
318	/**
319	* Checks whether the character is the beginning of a single line comment, skipping the whole
320	* comment if necessary.
321	*
322	* @returns Flag whether a single line comment was detected and consumed.
323	* @param hScriptLex The lexer state.
324	* @param ch The character to check for.
325	*/
326	DECLINLINE(bool) rtScriptLexIsSingleLineCommentConsume(PRTSCRIPTLEXINT pThis, char ch)
327	{
328	const char **papszCommentSingleStart = pThis->pCfg->papszCommentSingleStart;
329
330	if ( papszCommentSingleStart
331	&& rtScriptLexLocateSubStrInStrArrayMatchConsume(pThis, ch, papszCommentSingleStart,
332	NULL))
333	{
334	for (;;)
335	{
336	char chTmp = RTScriptLexGetCh(pThis);
337
338	if ( chTmp == '\0'
339	\|\| rtScriptLexIsNewlineConsume(pThis, chTmp))
340	break;
341
342	RTScriptLexConsumeCh(pThis);
343	}
344
345	return true;
346	}
347
348	return false;
349	}
350
351
352	/**
353	* Fills the input buffer with source data.
354	*
355	* @returns IPRT status code.
356	* @param pThis The lexer state.
357	*/
358	static int rtScriptLexFillBuffer(PRTSCRIPTLEXINT pThis)
359	{
360	int rc = VINF_SUCCESS;
361	size_t cchToRead = pThis->cchBuf;
362	char *pchRead = &pThis->achBuf[0];
363
364	AssertReturn(!(pThis->fFlags & RTSCRIPT_LEX_INT_F_EOS), VERR_INVALID_STATE);
365
366	/* If there is input left to process move it to the front and fill the remainder. */
367	if (pThis->pchCur != NULL)
368	{
369	cchToRead = pThis->pchCur - &pThis->achBuf[0];
370	/* Move the rest to the front. */
371	memmove(&pThis->achBuf[0], pThis->pchCur, pThis->cchBuf - cchToRead);
372	pchRead = (char *)pThis->pchCur + 1;
373	memset(pchRead, 0, cchToRead);
374	}
375
376	if (cchToRead)
377	{
378	pThis->pchCur = &pThis->achBuf[0];
379
380	size_t cchRead = 0;
381	rc = pThis->pfnReader(pThis, pThis->offBufRead, pchRead, cchToRead, &cchRead, pThis->pvUser);
382	if (RT_SUCCESS(rc))
383	{
384	pThis->offBufRead += cchRead;
385	if (rc == VINF_EOF)
386	pThis->fFlags \|= RTSCRIPT_LEX_INT_F_EOS;
387	rc = VINF_SUCCESS;
388	}
389	else
390	pThis->rcRdr = rc;
391	}
392	else
393	rc = VERR_BUFFER_OVERFLOW; /** @todo */
394
395	return rc;
396	}
397
398
399	/**
400	* Produce an end of stream token.
401	*
402	* @returns nothing.
403	* @param pThis The lexer state.
404	* @param pTok The token to fill.
405	*/
406	static void rtScriptLexProduceTokEos(PRTSCRIPTLEXINT pThis, PRTSCRIPTLEXTOKEN pTok)
407	{
408	pTok->enmType = RTSCRIPTLEXTOKTYPE_EOS;
409	pTok->PosStart = pThis->Pos;
410	pTok->PosEnd = pThis->Pos;
411	}
412
413
414	/**
415	* Produce an error token with the given error message.
416	*
417	* @returns IPRT status code.
418	* @param pThis The lexer state.
419	* @param pTok The token to fill.
420	* @param rc The status code to use in the message.
421	* @param pszMsg The format string for the error message.
422	* @param ... Arguments to the format string.
423	*/
424	static int rtScriptLexProduceTokError(PRTSCRIPTLEXINT pThis, PRTSCRIPTLEXTOKEN pTok,
425	int rc, const char *pszMsg, ...)
426	{
427	va_list va;
428	va_start(va, pszMsg);
429
430	pTok->enmType = RTSCRIPTLEXTOKTYPE_ERROR;
431	pTok->PosEnd = pThis->Pos;
432	pTok->Type.Error.pErr = &pThis->ErrInfo.Core;
433
434	RTErrInfoInitStatic(&pThis->ErrInfo);
435	RTErrInfoSetV(&pThis->ErrInfo.Core, rc, pszMsg, va);
436	va_end(va);
437
438	return rc;
439	}
440
441
442	/**
443	* Create the token from the exact match.
444	*
445	* @returns nothing.
446	* @param pThis The lexer state.
447	* @param pTok The token to fill.
448	* @param pMatch The matched string.
449	*/
450	static void rtScriptLexProduceTokFromExactMatch(PRTSCRIPTLEXINT pThis, PRTSCRIPTLEXTOKEN pTok,
451	PCRTSCRIPTLEXTOKMATCH pMatch)
452	{
453	pTok->enmType = pMatch->enmTokType;
454	pTok->PosEnd = pThis->Pos;
455
456	switch (pTok->enmType)
457	{
458	case RTSCRIPTLEXTOKTYPE_OPERATOR:
459	pTok->Type.Operator.pOp = pMatch;
460	break;
461	case RTSCRIPTLEXTOKTYPE_KEYWORD:
462	pTok->Type.Keyword.pKeyword = pMatch;
463	break;
464	case RTSCRIPTLEXTOKTYPE_PUNCTUATOR:
465	pTok->Type.Punctuator.pPunctuator = pMatch;
466	break;
467	default:
468	rtScriptLexProduceTokError(pThis, pTok, VERR_INVALID_PARAMETER,
469	"Lexer: The match contains an invalid token type: %d\n",
470	pTok->enmType);
471	}
472	}
473
474
475	/**
476	* Goes through the rules trying to find a matching one.
477	*
478	* @returns Flag whether a matching rule was found.
479	* @param pThis The lexer state.
480	* @param ch The character to check.
481	* @param pTok The token to fill.
482	*/
483	static bool rtScriptLexProduceTokFromRules(PRTSCRIPTLEXINT pThis, char ch, PRTSCRIPTLEXTOKEN pTok)
484	{
485	PCRTSCRIPTLEXRULE pRule = pThis->pCfg->paRules;
486
487	if (pRule)
488	{
489	while (pRule->pfnProd != NULL)
490	{
491	if ( ch >= pRule->chStart
492	&& ch <= pRule->chEnd)
493	{
494	if (pRule->fFlags & RTSCRIPT_LEX_RULE_CONSUME)
495	RTScriptLexConsumeCh(pThis);
496	int rc = pRule->pfnProd(pThis, ch, pTok, pRule->pvUser);
497	AssertRC(rc);
498	return true;
499	}
500
501	pRule++;
502	}
503	}
504
505	return false;
506	}
507
508
509	/**
510	* Fills in the given token from the scanned input at the current location.
511	*
512	* @returns IPRT status code.
513	* @param pThis The lexer state.
514	* @param pTok The token to fill.
515	*/
516	static int rtScriptLexProduceToken(PRTSCRIPTLEXINT pThis, PRTSCRIPTLEXTOKEN pTok)
517	{
518	RTScriptLexSkipWhitespace(pThis);
519
520	pTok->PosStart = pThis->Pos;
521
522	char ch = RTScriptLexGetCh(pThis);
523	PCRTSCRIPTLEXTOKMATCH pMatch = NULL;
524	if (ch == '\0')
525	rtScriptLexProduceTokEos(pThis, pTok);
526	else if (rtScriptLexLocateExactMatchConsume(pThis, ch, &pMatch))
527	rtScriptLexProduceTokFromExactMatch(pThis, pTok, pMatch);
528	else if (!rtScriptLexProduceTokFromRules(pThis, ch, pTok))
529	{
530	if (pThis->pCfg->pfnProdDef)
531	pThis->rcRdr = pThis->pCfg->pfnProdDef(pThis, ch, pTok, pThis->pCfg->pvProdDefUser);
532	else
533	rtScriptLexProduceTokError(pThis, pTok, VERR_INVALID_PARAMETER,
534	"Lexer: Invalid character found in input: %c\n",
535	ch);
536	}
537
538	return pThis->rcRdr;
539	}
540
541
542	/**
543	* Populates the lexer for the initial use.
544	*
545	* @returns IPRT status code.
546	* @param pThis The lexer state.
547	*/
548	static int rtScriptLexPopulate(PRTSCRIPTLEXINT pThis)
549	{
550	int rc = rtScriptLexFillBuffer(pThis);
551	if (RT_SUCCESS(rc))
552	{
553	rc = rtScriptLexProduceToken(pThis, pThis->pTokCur);
554	if (RT_SUCCESS(rc))
555	rc = rtScriptLexProduceToken(pThis, pThis->pTokNext);
556	}
557
558	return rc;
559	}
560
561
562
563	RTDECL(int) RTScriptLexCreateFromReader(PRTSCRIPTLEX phScriptLex, PFNRTSCRIPTLEXRDR pfnReader,
564	PFNRTSCRIPTLEXDTOR pfnDtor, void *pvUser,
565	size_t cchBuf, PRTSTRCACHE phStrCacheId, PRTSTRCACHE phStrCacheStringLit,
566	PCRTSCRIPTLEXCFG pCfg)
567	{
568	AssertPtrReturn(phScriptLex, VERR_INVALID_POINTER);
569	AssertPtrReturn(pfnReader, VERR_INVALID_POINTER);
570	AssertPtrReturn(pCfg, VERR_INVALID_POINTER);
571
572	if (!cchBuf)
573	cchBuf = _16K;
574	int rc = VINF_SUCCESS;
575	PRTSCRIPTLEXINT pThis = (PRTSCRIPTLEXINT)RTMemAllocZ(RT_OFFSETOF(RTSCRIPTLEXINT, achBuf[cchBuf]));
576	if (RT_LIKELY(pThis))
577	{
578	pThis->u32Magic = 0xfefecafe; /*@todo /
579	pThis->Pos.iLine = 1;
580	pThis->Pos.iCh = 1;
581	pThis->pTokCur = &pThis->aToks[0];
582	pThis->pTokNext = &pThis->aToks[1];
583	pThis->pCfg = pCfg;
584	pThis->pfnReader = pfnReader;
585	pThis->pfnDtor = pfnDtor;
586	pThis->pvUser = pvUser;
587	pThis->fFlags = 0;
588	pThis->cchStrLitMax = 0;
589	pThis->pszStrLit = NULL;
590	pThis->cchBuf = cchBuf;
591	pThis->offBufRead = 0;
592	pThis->pchCur = NULL;
593	pThis->hStrCacheId = NULL;
594	pThis->hStrCacheStringLit = NULL;
595
596	rc = RTStrCacheCreate(&pThis->hStrCacheId, "LEX-Ide");
597	if (RT_SUCCESS(rc))
598	{
599	rc = RTStrCacheCreate(&pThis->hStrCacheStringLit, "LEX-StrLit");
600	if (RT_SUCCESS(rc))
601	{
602	rc = rtScriptLexPopulate(pThis);
603	if (RT_SUCCESS(rc))
604	{
605	*phScriptLex = pThis;
606
607	if (phStrCacheId)
608	*phStrCacheId = pThis->hStrCacheId;
609	else
610	pThis->fFlags \|= RTSCRIPT_LEX_INT_F_STR_CACHE_ID_FREE;
611
612	if (phStrCacheStringLit)
613	*phStrCacheStringLit = pThis->hStrCacheStringLit;
614	else
615	pThis->fFlags \|= RTSCRIPT_LEX_INT_F_STR_CACHE_STR_LIT_FREE;
616
617	return VINF_SUCCESS;
618	}
619
620	RTStrCacheDestroy(pThis->hStrCacheStringLit);
621	}
622
623	RTStrCacheDestroy(pThis->hStrCacheId);
624	}
625
626	RTMemFree(pThis);
627	}
628	else
629	rc = VERR_NO_MEMORY;
630
631	return rc;
632	}
633
634
635	/**
636	* @callback_method_impl{FNRTSCRIPTLEXRDR, Worker to read from a string.}
637	*/
638	static DECLCALLBACK(int) rtScriptLexReaderStr(RTSCRIPTLEX hScriptLex, size_t offBuf, char *pchCur,
639	size_t cchBuf, size_t pcchRead, void pvUser)
640	{
641	RT_NOREF(hScriptLex);
642
643	const char psz = (const char )pvUser;
644	size_t cch = strlen(psz);
645	size_t cchCopy = RT_MIN(cchBuf, cch - offBuf);
646	int rc = VINF_SUCCESS;
647
648	*pcchRead = cchCopy;
649
650	if (cchCopy)
651	memcpy(pchCur, &psz[offBuf], cchCopy * sizeof(char));
652	else
653	rc = VINF_EOF;
654
655	return rc;
656	}
657
658
659	RTDECL(int) RTScriptLexCreateFromString(PRTSCRIPTLEX phScriptLex, const char *pszSrc, PRTSTRCACHE phStrCacheId,
660	PRTSTRCACHE phStrCacheStringLit, PCRTSCRIPTLEXCFG pCfg)
661	{
662	return RTScriptLexCreateFromReader(phScriptLex, rtScriptLexReaderStr, NULL, (void *)pszSrc, 0,
663	phStrCacheId, phStrCacheStringLit, pCfg);
664	}
665
666
667	/**
668	* @callback_method_impl{FNRTSCRIPTLEXRDR, Worker to read from a file.}
669	*/
670	static DECLCALLBACK(int) rtScriptLexReaderFile(RTSCRIPTLEX hScriptLex, size_t offBuf, char *pchCur,
671	size_t cchBuf, size_t pcchRead, void pvUser)
672	{
673	RT_NOREF(hScriptLex);
674
675	RTFILE hFile = (RTFILE)pvUser;
676	return RTFileReadAt(hFile, offBuf, pchCur, cchBuf, pcchRead);
677	}
678
679
680	/**
681	* @callback_method_impl{FNRTSCRIPTLEXDTOR, Destructor for the file variant.}
682	*/
683	static DECLCALLBACK(void) rtScriptLexDtorFile(RTSCRIPTLEX hScriptLex, void *pvUser)
684	{
685	RT_NOREF(hScriptLex);
686
687	RTFILE hFile = (RTFILE)pvUser;
688	RTFileClose(hFile);
689	}
690
691
692	RTDECL(int) RTScriptLexCreateFromFile(PRTSCRIPTLEX phScriptLex, const char *pszFilename, PRTSTRCACHE phStrCacheId,
693	PRTSTRCACHE phStrCacheStringLit, PCRTSCRIPTLEXCFG pCfg)
694	{
695	RTFILE hFile;
696	int rc = RTFileOpen(&hFile, pszFilename, RTFILE_O_READ \| RTFILE_O_DENY_WRITE \| RTFILE_O_OPEN);
697	if (RT_SUCCESS(rc))
698	{
699	rc = RTScriptLexCreateFromReader(phScriptLex, rtScriptLexReaderFile, rtScriptLexDtorFile, (void *)hFile, 0,
700	phStrCacheId, phStrCacheStringLit, pCfg);
701	if (RT_FAILURE(rc))
702	RTFileClose(hFile);
703	}
704
705	return rc;
706	}
707
708
709	RTDECL(void) RTScriptLexDestroy(RTSCRIPTLEX hScriptLex)
710	{
711	PRTSCRIPTLEXINT pThis = hScriptLex;
712	AssertPtrReturnVoid(pThis);
713
714	if (pThis->pfnDtor)
715	pThis->pfnDtor(pThis, pThis->pvUser);
716
717	if (pThis->fFlags & RTSCRIPT_LEX_INT_F_STR_CACHE_ID_FREE)
718	RTStrCacheDestroy(pThis->hStrCacheId);
719	if (pThis->fFlags & RTSCRIPT_LEX_INT_F_STR_CACHE_STR_LIT_FREE)
720	RTStrCacheDestroy(pThis->hStrCacheStringLit);
721
722	if (pThis->pszStrLit)
723	RTStrFree(pThis->pszStrLit);
724
725	RTMemFree(pThis);
726	}
727
728
729	RTDECL(int) RTScriptLexQueryToken(RTSCRIPTLEX hScriptLex, PCRTSCRIPTLEXTOKEN *ppToken)
730	{
731	PRTSCRIPTLEXINT pThis = hScriptLex;
732	AssertPtrReturn(pThis, VERR_INVALID_HANDLE);
733	AssertPtrReturn(ppToken, VERR_INVALID_POINTER);
734
735	if (RT_SUCCESS(pThis->rcRdr))
736	*ppToken = pThis->pTokCur;
737
738	return pThis->rcRdr;
739	}
740
741
742	RTDECL(RTSCRIPTLEXTOKTYPE) RTScriptLexGetTokenType(RTSCRIPTLEX hScriptLex)
743	{
744	PRTSCRIPTLEXINT pThis = hScriptLex;
745	AssertPtrReturn(pThis, RTSCRIPTLEXTOKTYPE_INVALID);
746
747	if (RT_SUCCESS(pThis->rcRdr))
748	return pThis->pTokCur->enmType;
749
750	return RTSCRIPTLEXTOKTYPE_INVALID;
751	}
752
753
754	RTDECL(RTSCRIPTLEXTOKTYPE) RTScriptLexPeekNextTokenType(RTSCRIPTLEX hScriptLex)
755	{
756	PRTSCRIPTLEXINT pThis = hScriptLex;
757	AssertPtrReturn(pThis, RTSCRIPTLEXTOKTYPE_INVALID);
758
759	if (RT_SUCCESS(pThis->rcRdr))
760	return pThis->pTokNext->enmType;
761
762	return RTSCRIPTLEXTOKTYPE_INVALID;
763	}
764
765
766	RTDECL(PCRTSCRIPTLEXTOKEN) RTScriptLexConsumeToken(RTSCRIPTLEX hScriptLex)
767	{
768	PRTSCRIPTLEXINT pThis = hScriptLex;
769	AssertPtrReturn(pThis, NULL);
770
771	/*
772	* Stop token production as soon as the current token indicates the
773	* end of the stream or an error
774	*/
775	if ( pThis->pTokCur->enmType != RTSCRIPTLEXTOKTYPE_EOS
776	&& pThis->pTokCur->enmType != RTSCRIPTLEXTOKTYPE_ERROR)
777	{
778	PRTSCRIPTLEXTOKEN pTokTmp = pThis->pTokCur;
779
780	/* Switch next token to current token and read in the next token. */
781	pThis->pTokCur = pThis->pTokNext;
782	pThis->pTokNext = pTokTmp;
783	if ( pThis->pTokCur->enmType != RTSCRIPTLEXTOKTYPE_EOS
784	&& pThis->pTokCur->enmType != RTSCRIPTLEXTOKTYPE_ERROR)
785	rtScriptLexProduceToken(pThis, pThis->pTokNext);
786	else
787	pThis->pTokNext = pThis->pTokCur;
788	}
789
790	return pThis->pTokCur;
791	}
792
793
794	RTDECL(char) RTScriptLexConsumeCh(RTSCRIPTLEX hScriptLex)
795	{
796	return RTScriptLexConsumeChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_DEFAULT);
797	}
798
799
800	RTDECL(char) RTScriptLexConsumeChEx(RTSCRIPTLEX hScriptLex, uint32_t fFlags)
801	{
802	PRTSCRIPTLEXINT pThis = hScriptLex;
803	AssertPtrReturn(pThis, '\0');
804
805	pThis->pchCur++;
806	pThis->Pos.iCh++;
807	if (pThis->pchCur == &pThis->achBuf[pThis->cchBuf])
808	rtScriptLexFillBuffer(pThis);
809
810	return RTScriptLexGetChEx(pThis, fFlags);
811	}
812
813
814	RTDECL(char) RTScriptLexPeekCh(RTSCRIPTLEX hScriptLex, unsigned idx)
815	{
816	return RTScriptLexPeekChEx(hScriptLex, idx, RTSCRIPT_LEX_CONV_F_DEFAULT);
817	}
818
819
820	RTDECL(char) RTScriptLexPeekChEx(RTSCRIPTLEX hScriptLex, unsigned idx, uint32_t fFlags)
821	{
822	PRTSCRIPTLEXINT pThis = hScriptLex;
823	AssertPtrReturn(pThis, '\0');
824
825	/* Just return the character if it is in the current buffer. */
826	char ch = '\0';
827	if (RT_LIKELY(pThis->pchCur + idx < &pThis->achBuf[pThis->cchBuf]))
828	ch = pThis->pchCur[idx];
829	else
830	{
831	/* Slow path, read data into temporary buffer to read character from and dismiss. */
832	/** @todo */
833	AssertReleaseFailed();
834	}
835
836	if ( (pThis->pCfg->fFlags & RTSCRIPT_LEX_CFG_F_CASE_INSENSITIVE)
837	&& !(fFlags & RTSCRIPT_LEX_CONV_F_NOTHING))
838	ch = RT_C_TO_LOWER(ch);
839
840	return ch;
841	}
842
843
844	RTDECL(char) RTScriptLexGetCh(RTSCRIPTLEX hScriptLex)
845	{
846	return RTScriptLexPeekCh(hScriptLex, 0);
847	}
848
849
850	RTDECL(char) RTScriptLexGetChEx(RTSCRIPTLEX hScriptLex, uint32_t fFlags)
851	{
852	return RTScriptLexPeekChEx(hScriptLex, 0, fFlags);
853	}
854
855
856	RTDECL(void) RTScriptLexSkipWhitespace(RTSCRIPTLEX hScriptLex)
857	{
858	PRTSCRIPTLEXINT pThis = hScriptLex;
859	AssertPtrReturnVoid(pThis);
860
861	for (;;)
862	{
863	char ch = RTScriptLexGetCh(hScriptLex);
864
865	if (ch == '\0')
866	break;
867
868	/* Check for whitespace. */
869	const char *pszWs = pThis->pCfg->pszWhitespace ? pThis->pCfg->pszWhitespace : g_szWsDef;
870
871	if ( rtScriptLexLocateChInStrConsume(pThis, ch, pszWs)
872	\|\| rtScriptLexIsNewlineConsume(pThis, ch)
873	\|\| rtScriptLexIsMultiLineCommentConsume(pThis, ch)
874	\|\| rtScriptLexIsSingleLineCommentConsume(pThis, ch))
875	continue;
876
877	/* All white space skipped, next is some real content. */
878	break;
879	}
880	}
881
882
883	RTDECL(int) RTScriptLexScanNumber(RTSCRIPTLEX hScriptLex, uint8_t uBase, bool fAllowReal,
884	PRTSCRIPTLEXTOKEN pTok)
885	{
886	RT_NOREF(uBase, fAllowReal, pTok);
887	PRTSCRIPTLEXINT pThis = hScriptLex;
888	AssertPtrReturn(pThis, VERR_INVALID_POINTER);
889	AssertReturn(!fAllowReal, VERR_NOT_IMPLEMENTED);
890	AssertReturn(!uBase, VERR_NOT_IMPLEMENTED);
891
892	/** @todo r=aeichner Quick and dirty to have something working for the disassembler testcase.
893	* Among others it misses overflow handling. */
894	uBase = 10;
895	char ch = RTScriptLexGetCh(hScriptLex);
896	pTok->Type.Number.enmType = ch == '-'
897	? RTSCRIPTLEXTOKNUMTYPE_INTEGER
898	: RTSCRIPTLEXTOKNUMTYPE_NATURAL;
899	if (ch == '-' \|\| ch == '+')
900	ch = RTScriptLexConsumeCh(hScriptLex);
901
902	if (ch == '0')
903	{
904	/* Some hex prefix? */
905	if (RTScriptLexPeekCh(hScriptLex, 1) == 'x')
906	{
907	uBase = 16;
908	RTScriptLexConsumeCh(hScriptLex);
909	}
910	else /* Octal stuff. */
911	AssertFailedReturn(VERR_NOT_IMPLEMENTED);
912
913	ch = RTScriptLexConsumeCh(hScriptLex);
914	}
915
916	uint64_t u64 = 0;
917	for (;;)
918	{
919	if ( (ch < '0' \|\| ch > '9')
920	&& (ch < 'a' \|\| ch > 'f' \|\| uBase == 10))
921	{
922	if (pTok->Type.Number.enmType == RTSCRIPTLEXTOKNUMTYPE_INTEGER)
923	pTok->Type.Number.Type.i64 = -u64;
924	else
925	pTok->Type.Number.Type.u64 = u64;
926	pTok->enmType = RTSCRIPTLEXTOKTYPE_NUMBER;
927	pTok->PosEnd = pThis->Pos;
928	return VINF_SUCCESS;
929	}
930
931	if (ch >= '0' && ch <= '9')
932	u64 = (u64 * uBase) + (ch - '0');
933	else if (ch >= 'a' && ch <= 'f')
934	{
935	Assert(uBase == 16);
936	u64 = (u64 << 4) + 10 + (ch - 'a');
937	}
938
939	ch = RTScriptLexConsumeCh(hScriptLex);
940	}
941
942	return VINF_SUCCESS;
943	}
944
945
946	RTDECL(DECLCALLBACK(int)) RTScriptLexScanIdentifier(RTSCRIPTLEX hScriptLex, char ch,
947	PRTSCRIPTLEXTOKEN pTok, void *pvUser)
948	{
949	PRTSCRIPTLEXINT pThis = hScriptLex;
950	AssertPtrReturn(pThis, VERR_INVALID_POINTER);
951
952	const char pszCharSet = pvUser ? (const char )pvUser : g_aszIdeCharSetDef;
953	char aszIde[513]; RT_ZERO(aszIde);
954	unsigned idx = 0;
955	aszIde[idx++] = ch;
956
957	ch = RTScriptLexGetCh(hScriptLex);
958	while ( idx < sizeof(aszIde) - 1
959	&& rtScriptLexLocateChInStrConsume(hScriptLex, ch, pszCharSet))
960	{
961	aszIde[idx++] = ch;
962	ch = RTScriptLexGetCh(hScriptLex);
963	}
964
965	if ( idx == sizeof(aszIde) - 1
966	&& rtScriptLexLocateChInStrConsume(hScriptLex, ch, pszCharSet))
967	return rtScriptLexProduceTokError(hScriptLex, pTok, VERR_BUFFER_OVERFLOW, "Lexer: Identifier exceeds the allowed length");
968
969	/* Insert into string cache. */
970	pTok->enmType = RTSCRIPTLEXTOKTYPE_IDENTIFIER;
971	pTok->Type.Id.pszIde = RTStrCacheEnterN(pThis->hStrCacheId, &aszIde[0], idx);
972	if (RT_UNLIKELY(!pTok->Type.Id.pszIde))
973	return rtScriptLexProduceTokError(hScriptLex, pTok, VERR_NO_STR_MEMORY, "Lexer: Out of memory inserting identifier into string cache");
974
975	pTok->PosEnd = pThis->Pos;
976	return VINF_SUCCESS;
977	}
978
979
980	/**
981	* Adds the given character to the string literal add the given position, assuring the string
982	* is always zero terminated.
983	*
984	* @returns IPRT status code.
985	* @param pThis The lexer state.
986	* @param ch The character to add.
987	* @param idx At which position to add the character in the string.
988	*/
989	static int rtScriptLexScanStringLiteralChAdd(PRTSCRIPTLEXINT pThis, char ch, uint32_t idx)
990	{
991	int rc = VINF_SUCCESS;
992
993	if ( !pThis->cchStrLitMax
994	\|\| idx >= pThis->cchStrLitMax - 1)
995	{
996	/* Increase memory. */
997	size_t cchMaxNew = pThis->cchStrLitMax + 64;
998	char *pszNew = NULL;
999	rc = RTStrRealloc(&pszNew, cchMaxNew * sizeof(char));
1000	if (RT_SUCCESS(rc))
1001	{
1002	pThis->pszStrLit = pszNew;
1003	pThis->cchStrLitMax = cchMaxNew;
1004	}
1005	}
1006
1007	if (RT_SUCCESS(rc))
1008	{
1009	pThis->pszStrLit[idx] = ch;
1010	pThis->pszStrLit[idx + 1] = '\0';
1011	}
1012
1013	return rc;
1014	}
1015
1016
1017	RTDECL(DECLCALLBACK(int)) RTScriptLexScanStringLiteralC(RTSCRIPTLEX hScriptLex, char ch,
1018	PRTSCRIPTLEXTOKEN pTok, void *pvUser)
1019	{
1020	RT_NOREF(ch, pvUser);
1021	PRTSCRIPTLEXINT pThis = hScriptLex;
1022	AssertPtrReturn(pThis, VERR_INVALID_POINTER);
1023
1024	uint32_t idxChCur = 0;
1025	int rc = rtScriptLexScanStringLiteralChAdd(pThis, '\0', idxChCur);
1026	if (RT_FAILURE(rc))
1027	return rtScriptLexProduceTokError(hScriptLex, pTok, rc, "Lexer: Error adding character to string literal");
1028
1029	ch = RTScriptLexGetCh(hScriptLex);
1030	for (;;)
1031	{
1032	if (ch == '\0')
1033	return rtScriptLexProduceTokError(hScriptLex, pTok, VERR_EOF, "Lexer: End of stream before closing string literal terminal");
1034	else if (ch == '\"')
1035	{
1036	/* End of string, add it to the string literal cache and build the token. */
1037	pTok->enmType = RTSCRIPTLEXTOKTYPE_STRINGLIT;
1038	pTok->Type.StringLit.cchString = idxChCur;
1039	pTok->Type.StringLit.pszString = RTStrCacheEnterN(pThis->hStrCacheStringLit, pThis->pszStrLit, idxChCur);
1040	if (RT_UNLIKELY(!pTok->Type.StringLit.pszString))
1041	return rtScriptLexProduceTokError(hScriptLex, pTok, VERR_NO_STR_MEMORY, "Lexer: Error adding string literal to the cache");
1042	else
1043	break;
1044	}
1045	else if (ch == '\\')
1046	{
1047	/* Start of escape sequence. */
1048	RTScriptLexConsumeCh(hScriptLex);
1049	ch = RTScriptLexGetCh(hScriptLex);
1050	switch (ch)
1051	{
1052	case 'a': /* Alert (Bell) */
1053	ch = 0x07;
1054	break;
1055	case 'b': /* Backspace */
1056	ch = 0x08;
1057	break;
1058	case 'e': /* Escape character */
1059	ch = 0x1b;
1060	break;
1061	case 'f': /* Formfeed */
1062	ch = 0x0c;
1063	break;
1064	case 'n': /* Newline (line freed) */
1065	ch = 0x0a;
1066	break;
1067	case 'r': /* Carriage return */
1068	ch = 0x0d;
1069	break;
1070	case 't': /* Horizontal tab */
1071	ch = 0x09;
1072	break;
1073	case 'v': /* Vertical tab */
1074	ch = 0x0b;
1075	break;
1076	case '\\':
1077	case '\'':
1078	case '\"':
1079	case '\?':
1080	/* Can be added as is. */
1081	break;
1082	case 'x': /* Hexdecimal byte. */
1083	case '0': /* Octal */
1084	case '1':
1085	case '2':
1086	case '3':
1087	case '4':
1088	case '5':
1089	case '6':
1090	case '7':
1091	case '8':
1092	case '9':
1093	case 'u': /* Unicode point below 10000 */
1094	case 'U': /* Unicode point */
1095	default:
1096	/* Not supported for now. */
1097	return rtScriptLexProduceTokError(hScriptLex, pTok, VERR_NOT_SUPPORTED, "Lexer: Invalid/unsupported escape sequence");
1098	}
1099	}
1100
1101	rc = rtScriptLexScanStringLiteralChAdd(pThis, ch, idxChCur);
1102	if (RT_SUCCESS(rc))
1103	idxChCur++;
1104	else
1105	return rtScriptLexProduceTokError(hScriptLex, pTok, rc, "Lexer: Error adding character to string literal");
1106
1107	ch = RTScriptLexConsumeChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1108	}
1109
1110	pTok->PosEnd = pThis->Pos;
1111	return VINF_SUCCESS;
1112	}
1113
1114
1115	RTDECL(DECLCALLBACK(int)) RTScriptLexScanStringLiteralPascal(RTSCRIPTLEX hScriptLex, char ch,
1116	PRTSCRIPTLEXTOKEN pTok, void *pvUser)
1117	{
1118	RT_NOREF(ch, pvUser);
1119	PRTSCRIPTLEXINT pThis = hScriptLex;
1120	AssertPtrReturn(pThis, VERR_INVALID_POINTER);
1121
1122	uint32_t idxChCur = 0;
1123	int rc = rtScriptLexScanStringLiteralChAdd(pThis, '\0', idxChCur);
1124	if (RT_FAILURE(rc))
1125	return rtScriptLexProduceTokError(hScriptLex, pTok, rc, "Lexer: Error adding character to string literal");
1126
1127	ch = RTScriptLexGetChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1128	for (;;)
1129	{
1130	if (ch == '\0')
1131	return rtScriptLexProduceTokError(hScriptLex, pTok, VERR_EOF, "Lexer: End of stream before closing string literal terminal");
1132	else if (ch == '\'')
1133	{
1134	/*
1135	* Check whether there is a second ' coming afterwards used for
1136	* escaping ' characters.
1137	*/
1138	ch = RTScriptLexConsumeChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1139	if (ch != '\'')
1140	{
1141	/* End of string, add it to the string literal cache and build the token. */
1142	pTok->enmType = RTSCRIPTLEXTOKTYPE_STRINGLIT;
1143	pTok->Type.StringLit.cchString = idxChCur;
1144	pTok->Type.StringLit.pszString = RTStrCacheEnterN(pThis->hStrCacheStringLit, pThis->pszStrLit, idxChCur);
1145	if (RT_UNLIKELY(!pTok->Type.StringLit.pszString))
1146	return rtScriptLexProduceTokError(hScriptLex, pTok, VERR_NO_STR_MEMORY, "Lexer: Error adding string literal to the cache");
1147	else
1148	break;
1149	}
1150	/* else: Fall through and add the character to the string literal..*/
1151	}
1152
1153	rc = rtScriptLexScanStringLiteralChAdd(pThis, ch, idxChCur);
1154	if (RT_SUCCESS(rc))
1155	idxChCur++;
1156	else
1157	return rtScriptLexProduceTokError(hScriptLex, pTok, rc, "Lexer: Error adding character to string literal");
1158	ch = RTScriptLexConsumeChEx(hScriptLex, RTSCRIPT_LEX_CONV_F_NOTHING);
1159	}
1160
1161	pTok->PosEnd = pThis->Pos;
1162	return VINF_SUCCESS;
1163	}
1164

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/VBox/Runtime/common/script/scriptlex.cpp@ 105746

Download in other formats: