Newer
Older
/**************************************************************************/
/* */
/* This file is part of Frama-Clang */
/* */
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
/* CEA (Commissariat à l'énergie atomique et aux énergies */
/* alternatives) */
/* */
/* you can redistribute it and/or modify it under the terms of the GNU */
/* Lesser General Public License as published by the Free Software */
/* Foundation, version 2.1. */
/* */
/* It is distributed in the hope that it will be useful, */
/* but WITHOUT ANY WARRANTY; without even the implied warranty of */
/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
/* GNU Lesser General Public License for more details. */
/* */
/* See the GNU Lesser General Public License version 2.1 */
/* for more details (enclosed in the file LICENSE). */
/* */
/**************************************************************************/
//
// Description:
// Definition of the ACSL++ lexer.
//
#ifndef ACSL_LexerH
#define ACSL_LexerH
#include <list>
#include "DescentParse.h"
#include "ACSLToken.h"
extern "C" {
#include "intermediate_format.h"
}
#include "Clang_utils.h"
#include "clang/AST/DeclCXX.h"
#include "clang/Lex/Token.h"
/** @file */
namespace clang {
class ASTContext;
class DeclContext;
class Sema;
class Scope;
class MacroArgs;
class Token;
class MacroInfo;
} // end of namespace clang
/** @file */
namespace Acsl {
/*! @class Lexer
* @brief Builds tokens (Token instances) from a string buffer.
*
* The main method of the class Lexer is readToken. It reads characters from
* the input string buffer and then it returns the token that has been read.
* readToken returns either RRHasToken, in which case lexer.queryToken()
* returns the token, or RRFinished, in which case the input has been completely lexed.
* \n \n
* The lexer is setup by first calling the constructor, and then calling setBuffer to
* set the input. setBuffer can be called repeatedly; each call to setBuffer will
* reset the internal state of the lexer.
* \n\n
* If queryToken() reveals the token to
* be a first kind (enum) token, then a simple static_cast conversion
* gives access to what it is. If queryToken() reveals it to be a second kind
* (elaborate) token, the method getContentToken gives read access to its
* content. \n \n
*
* The lexer uses one of two methods to lex the input. The first method is to
* delegate the lexing to clang. In this case the input buffer is completely
* lexed into a series of clang tokens, which are then converted to ACSL tokens
* as they are requested by calls to readToken(). Note that a single clang token
* is sometimes multiple ACSL tokens and sometimes multiple clang tokens
* constitute a single ACSL token. By using clang, we delegate all of the
* preprocessing macro substitution logic to clang.
* \n\n
* The second lexing method is to lex the input directly with methods in this class.
* This method is used to perform a first pass through the input to check for
* preprocessing directives that are not permitted in ACSL. This method is also
* incompletely implemented, in that it does not implement all preprocessing directives
* (e.g. evaluating #if directives) and does not closely implement the details of
* macro substitution, digraphs, trigraphs, backslash-newlines and likely other details.
*/
class Lexer : public Parser::Base {
public:
/*! @class Error
 *  @brief A single lexing error, meant to be reported into an Acsl::ErrorMessage.
 *
 *  An Error carries the position it refers to (file, line, column) together
 *  with the text of the diagnostic. All fields are public and can be read
 *  directly; str() gives the printable form, which is also what operator<<
 *  writes to a stream.
 */
class Error {
public:
  std::string filePos;   //!< File the error refers to
  unsigned linePos;      //!< Line number, starting with 1
  unsigned columnPos;    //!< Column number, starting with 1 -- FIXME check this
  std::string message;   //!< Text of the diagnostic

  /*! Creates an Error message, copying information from the beginning token location */
  Error(location position, const std::string& msg);

  /*! Creates an Error from an explicit file/line/column triple; every
   *  argument is copied into the corresponding field.
   */
  Error(const std::string& file, unsigned line, unsigned column,
        const std::string& msg)
    : filePos(file),
      linePos(line),
      columnPos(column),
      message(msg)
  {}

  /*! Returns the printable form of the error (defined out of line). */
  std::string str() const;

  /*! Writes err.str() to os and returns os, so insertions can be chained. */
  friend std::ostream& operator<<(std::ostream& os, const Error& err) {
    os << err.str();
    return os;
  }
};
typedef DLexer::AbstractToken AbstractToken;
private:
typedef ReadResult (Lexer::*ReadPointerMethod)(const std::string& buffer,
size_t& position, location loc);
typedef DLexer::Token Token;
typedef Parser::TTextBuffer<char> TextBuffer;
typedef std::list<std::pair<std::string,AbstractToken> > Utf8SymbolSet;
// ---- Input text and current position ----
std::string _buffer; //!< The text material being lexed
size_t _position; //!< The current position within _buffer
bool _hasFinished; //!< set true by the lexer when the end of input has been reached
location _tokenLocation;
//!< The ACSL location of the current token in the source text, updated as the source text is read
//!< (setBuffer() releases the previous value with free_location and builds a new one with makeLocation)
clang::SourceLocation _clangSourceLocation;
//!< Location of the annotation text (the contents of _buffer) as provided by clang (not updated as tokens are lexed)
// ---- Token being built and literal-reading state ----
TextBuffer _currentToken;
//!< text content of _token that the lexer is currently reading.
Token _token; //!< Token that the lexer most recently completed
bool _hasNewlineToken; //!< if the previous token was a comment -- FIXME explain this
char _context; //!< context for comments and for constant reading.
char _extension; //!< additional information for reading a literal extension.
// Unknown is the first enumerator (value 0), which is what assert(_digits) in isDigit() rejects.
enum { Unknown, Dec, Oct, Hex, Bin } _digits;
//!< kind of digits allowed when parsing numeric constant
enum { LeadZero, Digits, FracPart, Exponent, ExpNext, ISuf, FSuf } _litState;
//!< current state of numeric literal parsing
// NOTE(review): presumably the kind of the character literal currently being read -- confirm
DLexer::CharacterLiteralToken::Type _charLitKind;
std::list<Error> _errorList; //!< List of errors produced by the lexer.
// ---- Macro expansion state ----
// macro arguments for clang::TokenLexer
typedef std::vector<std::list<std::pair<unsigned,
DLexer::AbstractToken*> > > MacroTokensStack;
// The AbstractToken pointers stored in this stack are deleted by ~Lexer().
MacroTokensStack _macroTokensStack;
// Not released by ~Lexer() (the destructor only walks _macroTokensStack);
// NOTE(review): presumably non-owning pointers -- confirm
std::vector<clang::MacroArgs*> _currentMacroArgumentsStack;
// NOTE(review): presumably the names of macros already being expanded -- confirm
std::set<std::string> _usedMacros;
// NOTE(review): presumably selects between normal lexing, reading macro arguments and
// replaying macro tokens -- confirm
enum StateLexer { SLStandard, SLMacroArgs, SLMacroTokens} _stateLexer;
// mutable: can be modified from const member functions
mutable std::vector<std::string> _stringsForToken;
// ---- Clang services and diagnostics ----
const clang::Sema* _clangSema;
//!< Pointer to the clang Sema structure, for access to various Clang services
// NOTE(review): the four counters/ids below are undocumented; presumably clang diagnostic
// identifiers for lexer and preprocessor warnings/errors -- confirm
unsigned lexerWarning;
unsigned lexerError;
unsigned ppWarning;
unsigned ppError;
Utf8SymbolSet _AcceptedUtf8Symbols;
//!< utf8-encoded symbols that are parsed in ACSL specifications
// ---- Pending tokens ----
//! Stack of tokens from expanding macros; these are the next tokens to be read
std::stack<DLexer::Token> _token_stack;
std::list<Token> _acslTokens;
//!< next tokens to be read, produced when a clang token represents multiple ACSL tokens
//!< (cleared by setBuffer())
std::list<clang::Token> _clangTokens;
//!< list of clang tokens representing the input
bool _clangTokensSet;
//!< true if lexer used clang to do the actual lexing (so take tokens from _clangTokens)
bool _rawOnly;
//!< If true, then do not expand macros (only valid when NOT using clang)
//! If true, the ACSL preprocessor only does a check for ACSL-specific PP restrictions, and
//! leaves all other error reporting, etc., to the clang preprocessor
bool _ppCheckOnly;
/*! whether the given character could be the first character of an
accepted utf8 symbol.
*/
bool isUtf8SymbolStart(char cid);
/*! Advance position until a non-space (and non-@) character is reached; skips backslash-newlines,
* skips comments, and if skipNewLines is true, skips new line characters as well; the position
* argument and the (beginning) line and character positions in the loc argument are updated,
* and will point to the first non-space character when the function returns (and the character
* at that position is returned). An '@' symbol is space.
*/
// FIXME - is this still used
char skipSpace(const std::string& buffer, size_t& position, location loc, bool skipNewLines=true);
// FIXME - document
void handlePPDirectiveInACSL(const std::string& buffer, size_t& position, size_t start, location loc);
// FIXME - document
// FIXME - is this still used
const std::string getPreprocessorToken(const std::string& buffer, size_t& position, location loc, bool skipNewLines, bool raw);
//! advances to end of line, issuing a warning about any non-white-space, non-comment material present.
// FIXME - is this still used
void complainAboutExcessMaterial(const std::string& buffer, size_t& position, location loc, bool warn);
//! Advances Lexer to just before the end of line, ignoring everything, but advancing past backslash-newlines
// FIXME - is this still used
const std::string skipUpToEndOfLine(const std::string& buffer, size_t& position, location loc);
//! Skips to next preprocessing directive, reading and returning it; used to skip material within conditional if/ifdef/ifndef blocks
const std::string skipToNextPPDirective(const std::string& buffer, size_t& position, location loc);
// FIXME - is this still used?
void skipThroughToken(const std::string& tok, const std::string& buffer, size_t& position, location loc);
// FIXME - is this still used?
const std::string getAndTrimRestOfLine(const std::string& buffer, size_t& position, location loc);
// Utilities for handling macros
/*! Consults clang to see if the given string has a macro definition */
bool isDefinedMacro(const std::string& id);
/*! Gets macro information from clang (defined only if isDefinedMacro is true) */
clang::MacroInfo* getMacro(const std::string& name) const;
/*! Expands an identifier if it is a macro, returning the first token and pushing other tokens
* on to the _token_stack; returns the token itself if it is not a macro.
*/
DLexer::Token expandIfMacro(const DLexer::Token& token, Parser::Base::ReadResult& result);
// FIXME - is this needed?
bool handleMacroExpandedIdentifier(const std::string& identifier,
clang::MacroInfo *macro, ReadResult& parseResult);
/*! Used to read the arguments of a macro */
clang::MacroArgs* readFunctionLikeMacroArgs(const std::string& identifier, clang::MacroInfo *macro);
/*! Converts a clang token into an ACSL token; if the clang token represents multiple ACSL tokens,
* the extra ACSL tokens are pushed onto _acslTokens
*/
DLexer::Token convertClangTokenToACSLToken(const clang::Token& source) const;
//! Converts an ACSL token to a clang token
//!
clang::Token convertToClang(DLexer::Token source) const ;
// Note here that charnum2/linenum2 are the end of a token (or the current position while
// accumulating a token); charnum1/linenum1 are the beginning of the token.
// Error grabs charnum1 -- the beginning of the token.
/*! Consumes one character into the token being accumulated.
 *
 * The character at buffer[position] is appended to _currentToken, then both
 * position and the end column of loc (charnum2) move one step forward, so
 * that they designate the spot just after the stored text.
 *
 * @return the character found at the new position. When that character is
 *         '\0', getMoreCharacters() is asked for more input and, if it
 *         reports success, the character is read again from the buffer.
 */
char advanceChar2(const std::string& buffer, size_t& position, location loc) {
  _currentToken << buffer[position];
  ++position;
  loc->charnum2 += 1;

  const char next = buffer[position];
  const bool atEnd = (next == '\0');
  // Short-circuit: getMoreCharacters() is only called at the end of the text.
  return (atEnd && getMoreCharacters()) ? buffer[position] : next;
}
/*! Advances the position and begin-position of loc, returning the character at the new position,
* without affecting the accumulating token; skips over backslash-newlines. FIXME - what about comments
*/
char advanceChar1NoToken(const std::string& buffer, size_t& position, location loc);
/*! Advances the position and end-position of loc, returning the character at the new position,
* without affecting the accumulating token; skips over backslash-newlines. FIXME - what about comments
*/ // FIXME - more detail, should this be used?
char advanceChar2NoToken(const std::string& buffer, size_t& position, location loc);
/*! Tells whether ch is a valid digit in the base currently selected by
 * _digits (Bin: 0-1, Oct: 0-7, Dec: 0-9, Hex: 0-9, a-f, A-F).
 *
 * _digits must not be Unknown; this is asserted. With assertions disabled,
 * an Unknown base matches none of the early exits below and is therefore
 * treated like Hex.
 */
bool isDigit(char ch) const {
  assert(_digits);

  // Each step accepts the digits added by the next larger base, and stops
  // with a refusal when the selected base does not go any further.
  const bool binary = (ch == '0' || ch == '1');
  if (binary || _digits == Bin) return binary;

  const bool octal = ('0' <= ch && ch <= '7');
  if (octal || _digits == Oct) return octal;

  const bool decimal = (ch == '8' || ch == '9');
  if (decimal || _digits == Dec) return decimal;

  const bool lowerHex = ('a' <= ch && ch <= 'f');
  const bool upperHex = ('A' <= ch && ch <= 'F');
  return lowerHex || upperHex;
}
/*! Stamps intToken with the literal kind matching the current value of
 * _digits: Hex -> THexaDecimal, Bin -> TBit, Oct -> TOctal. For any other
 * value (Dec, Unknown) the token is left untouched.
 */
void setType(DLexer::IntegerLiteralToken *intToken) const {
  switch (_digits) {
    case Hex:
      intToken->setType(DLexer::IntegerLiteralToken::THexaDecimal);
      break;
    case Bin:
      intToken->setType(DLexer::IntegerLiteralToken::TBit);
      break;
    case Oct:
      intToken->setType(DLexer::IntegerLiteralToken::TOctal);
      break;
    default:
      // Dec / Unknown: nothing to record
      break;
  }
}
/*! Returns an identifier or keyword token for the given text (without
* an initial backslash) according to whether the text is a protected token
* (cf. ACSLToken.h)
*/
Token protectedKeywordOrIdentifier(const std::string& textNoBS);
void reparseWithClang(const std::string& text, clang::SourceLocation clangLoc);
protected:
//! Internal utility function to read a token beginning at the given position,
//! called by readToken()
ReadResult readToken(const std::string& buffer, size_t& position, location loc);
/*! Stores token as the lexer's current token.
 *
 * The slot must be free: it is asserted that the type of the token held so
 * far is AbstractToken::TUndefined.
 */
void setToken(const Token& token) {
  assert(_token.getType() == AbstractToken::TUndefined);
  _token = token;
}
/*! Utility function that comments out the remainder of a line within the text, starting
* at the given offset in the input buffer
*/
void removeFromRevision(size_t start);
/*! reads the next token beginning at position and stores it in _token,
* updating position and leaving loc with the new tokens beginning and end positions;
* position will be the character after the lexed token (FIXME - check this)
* FIXME - explain result, here and in the following
*/
ReadResult readMain(const std::string& buffer, size_t& position, location loc);
//! read the end of a CommentToken, depending on the kind of comment
//! (a line one or a delimited one).
ReadResult readEndComment(const std::string& buffer, size_t& position, location loc);
//! read an IdentifierToken from buffer beginning at position, putting the token
//! in _token and updating position and loc
ReadResult readIdentifierToken(const std::string& buffer, size_t& position,
location loc);
//! read a numeric literal token from buffer beginning at position, putting the token
//! in _token and updating position and loc
ReadResult readNumberToken(const std::string& buffer, size_t& position, location loc);
//! read a Character Token from buffer beginning at position, putting the token
//! in _token and updating position and loc
ReadResult readCharLiteral(const std::string& buffer, size_t& position, location loc);
//! read a protected Token (one starting with a backslash)
//! from buffer beginning at position, putting the token
//! in _token and updating position and loc
ReadResult readProtectedToken(const std::string& buffer, size_t& position, location loc);
//! reads one or more chars in buffer, starting at position, interpreting them as a
//! UTF8 character; only a predefined set of UTF8 characters are recognized
ReadResult readUtf8Symbol(const std::string& buffer, size_t& position, location loc);
/*! A common point to call for not-yet-implemented features (issues an error message) */
void notImplemented(const clang::Token& source) const;
// FIXME - want this to be static so it can be reused
/*! When using clang, call this method on new input to lex the entire input into clang preprocessor tokens,
* which are placed in the _clangTokens list
*/
void lexUsingClang(const clang::Sema* _sema, const std::string& input, clang::SourceLocation loc, std::list<clang::Token>& clangTokens);
private:
Lexer(const Lexer& source) = delete;
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
public:
//! creates a lexer
Lexer(const clang::Sema* sema);
/*! Releases the tokens still referenced from _macroTokensStack.
 *
 * Every AbstractToken pointer stored in the stack is deleted; the other
 * members are released by their own destructors.
 */
~Lexer() {
  for (auto& pendingTokens : _macroTokensStack) {
    for (auto& entry : pendingTokens) {
      // entry.second may be null; deleting a null pointer has no effect
      delete entry.second;
    }
  }
}
/*! Reports the value of _hasFinished, i.e. whether the lexer has reached the
 * end of its input and every token has been read.
 */
bool hasFinished() { return _hasFinished; }
/*! Installs a new input text in the lexer.
 *
 * @param buffer          text to lex; copied into both _buffer and _revised
 * @param sourceLocation  clang location of that text; used to rebuild
 *                        _tokenLocation and remembered in _clangSourceLocation
 * @param position        offset in buffer at which lexing starts
 * @param useClang        when true, the text is immediately lexed by clang
 *                        (see initFromClang())
 * @param raw             stored in _rawOnly
 * @return the lexer itself, so that calls can be chained
 *
 * NOTE(review): only the members assigned below are reset; _token,
 * _token_stack, _errorList and _clangTokens are not touched by this method.
 */
Lexer& setBuffer(const std::string& buffer, const clang::SourceLocation& sourceLocation, int position = 0, bool useClang = true, bool raw = false) {
  // Input text and starting offset
  _buffer = buffer;
  _revised = buffer;
  _position = position;

  // Location information: drop the one of the previous input, if any
  if (_tokenLocation) {
    free_location(_tokenLocation);
  }
  _tokenLocation = makeLocation(sourceLocation);
  _clangSourceLocation = sourceLocation;

  // Lexing state
  _clangTokensSet = false;
  _rawOnly = raw;
  _hasFinished = false;
  _acslTokens.clear();

  // Must come last: initFromClang() reads _buffer and _clangSourceLocation
  // and switches _clangTokensSet back on.
  if (useClang) {
    initFromClang();
  }
  return *this;
}
/*! revised input text, in cases where some error recovery is possible */
std::string _revised;
/*! The routine to use to read tokens by means of the lexer; tokens are read from
* buffer beginning at position; the token is stored in _token, with loc holding the
* beginning and end position of the token. On return position will point to the character
* just after the token (FIXME - check this). Lexer skips over initial white space,
* comments, and backslash-newlines
*/ // FIXME - loc has end position or one beyond end?
// FIXME - what about trigraphs, digraphs
ReadResult readToken();
public:
// FIXME - explain
void eatToken(ReadResult& result);
//! returns the stored token; note that tokens have a single owner, so the caller
//! will be the actively owning instance
Token& extractToken() { return _token; }
//! returns the abstract token contained in the stored token (which must be valid)
//!
AbstractToken queryToken() { return _token.getFullToken(); }
//! returns (a pointer to) the token location (does not own the returned value)
//!
location seeTokenLocation() const { return _tokenLocation; }
// FIXME _ explain
const AbstractToken& getContentToken() const { return _token.getContent(); }
// FIXME - explain
void assumeContentToken() { _token.assumeContent(); }
//! returns true if errors have been accumulated by the lexer
//!
bool hasErrors() const { return !_errorList.empty(); }
//! returns (a reference to) the list of lexer errors;
//! error instances can be copied from and deleted using this reference
std::list<Error>& errorList() { return _errorList; }
//! returns true if there is a stored token
//!
bool doesNeedClear() const { return _token.getType(); }
/*! Drops the stored token and prepares the lexer to read the next one: the
 * accumulated text is emptied, the reading context goes back to '\0' and
 * _token is replaced by a default-constructed Token.
 */
void clearToken() {
  _currentToken.clear();
  _context = '\0';
  _token = Token();
}
/*! Hands the current input (_buffer, located at _clangSourceLocation) to
 * lexUsingClang(), which receives _clangTokens as its output list, and then
 * records in _clangTokensSet that clang tokens are available.
 */
// FIXME - does this need to be public?
void initFromClang() {
  lexUsingClang(_clangSema, _buffer, _clangSourceLocation, _clangTokens);
  _clangTokensSet = true;
}
/*! Formats an ACSL location as
 * "file1:line1:char1::file2:line2:char2", i.e. the beginning position
 * followed by the end position. loc must not be null.
 */
std::string str(const location loc) {
  std::ostringstream out;
  // beginning of the range
  out << loc->filename1 << ":" << loc->linenum1 << ":" << loc->charnum1;
  out << "::";
  // end of the range
  out << loc->filename2 << ":" << loc->linenum2 << ":" << loc->charnum2;
  return out.str();
}
/*! returns verbose information about a clang token (including start and end location) */
std::string str(const clang::Token& t) const;
/*! returns verbose information about a clang source location */
std::string str(const clang::SourceLocation& loc) const;
/*! returns the source text for a clang token */
std::string text(const clang::Token& t) const;
/*! makes a ACSL location from a clang source location */
location makeLocation(clang::SourceLocation source) const;
/*! sets _tokenLocation from a pair (beginning and end) clang locations */
void setLocation(const clang::SourceLocation& begin, const clang::SourceLocation& end);
/*! returns true if two clang locations are the same */
bool sameLocation(const clang::SourceLocation& begin, const clang::SourceLocation& end) const;
protected:
// Hook for input that is not entirely held in the buffer: when the buffers do
// not necessarily contain the whole input, getMoreCharacters() should be
// implemented to add material to the buffer when called.
/*! Adds more material to _buffer, resetting _position as needed; returns false if no more
 * material was added. The current implementation never adds anything and
 * always returns false.
 */
// There are not yet any non-vacuous implementations of this function, so its
// uses may not be correct.
bool getMoreCharacters() {
  return false;
}
};
} // end of namespace Acsl
#endif // ACSL_LexerH