From d55f75669ab36327bec881f4f148b1a11efe669a Mon Sep 17 00:00:00 2001 From: Maxime Jacquemin <maxime.jacquemin@cea.fr> Date: Wed, 31 Jan 2024 17:04:27 +0100 Subject: [PATCH] [Lexer] do some refactoring --- src/kernel_internals/parsing/clexer.mll | 862 +++++++++++------------- 1 file changed, 412 insertions(+), 450 deletions(-) diff --git a/src/kernel_internals/parsing/clexer.mll b/src/kernel_internals/parsing/clexer.mll index d3a9f4d5a46..2b345a2670d 100644 --- a/src/kernel_internals/parsing/clexer.mll +++ b/src/kernel_internals/parsing/clexer.mll @@ -53,6 +53,9 @@ module E = Errorloc let currentLoc () = E.currentLoc () +(* Convert char into Int64 *) +let int64_of_char c = Int64.of_int (Char.code c) + let one_line_ghost = ref false let is_oneline_ghost () = !one_line_ghost let enter_oneline_ghost () = one_line_ghost := true @@ -68,274 +71,222 @@ let is_ghost_annot () = !ghost_annot let enter_ghost_annot () = ghost_annot := true let exit_ghost_annot () = ghost_annot := false -let addComment c = Cabshelper.Comments.add (currentLoc()) c - -(* Some debugging support for line numbers *) -let dbgToken (t: token) = - if false then begin - let dprintf fmt = Kernel.debug fmt in - (match t with - IDENT n -> dprintf "IDENT(%s)\n" n - | LBRACE l -> dprintf "LBRACE(%d)\n" (fst l).Filepath.pos_lnum - | RBRACE l -> dprintf "RBRACE(%d)\n" (fst l).Filepath.pos_lnum - | IF l -> dprintf "IF(%d)\n" (fst l).Filepath.pos_lnum - | SWITCH l -> dprintf "SWITCH(%d)\n" (fst l).Filepath.pos_lnum - | RETURN l -> dprintf "RETURN(%d)\n" (fst l).Filepath.pos_lnum - | _ -> ()) ; - t - end else - t +let add_comment c = Cabshelper.Comments.add (currentLoc()) c + + + (* ** Keyword hashtable *) + let lexicon = H.create 211 -let init_lexicon _ = +let add key builder = H.add lexicon key builder +let remove key = H.remove lexicon key +let is_c_keyword s = Hashtbl.mem lexicon s + +(* Search for a keyword, default to variable name, as opposed to type *) +let scan_ident id = + let here = currentLoc () in + try (H.find lexicon id) here + with Not_found -> IDENT id + +let valid key builder = + add key builder + +let unsupported key = + let msg f k = Format.fprintf f "%s is currently unsupported by Frama-C." k in + add key (fun loc -> Kernel.abort ~source:(fst loc) "%a" msg key) + +let warning_C11 key builder = + let warning () = Kernel.(warning ~wkey:wkey_c11 "%s is a C11 keyword" key) in + add key (fun loc -> warning () ; builder loc) + +let thread_keyword () = + let wkey = Kernel.wkey_conditional_feature in + let s = "__thread is a GCC extension, use a GCC-based machdep to enable it" in + let warning () = Kernel.warning ~wkey "%s" s ; IDENT "__thread" in + add "__thread" (fun loc -> if Cil.gccMode () then THREAD loc else warning ()) + +let filename_keyword () = + let convert acc c = int64_of_char c :: acc in + let path (loc : Cil_types.location) = (fst loc).pos_path in + let filename loc = Filepath.Normalized.to_pretty_string (path loc) in + let ints loc = List.rev (String.fold_left convert [] (filename loc)) in + add "__FC_FILENAME__" (fun loc -> CST_STRING (ints loc, loc)) + +let init_lexicon () = H.clear lexicon ; Logic_env.reset_typenames () ; Logic_env.builtin_types_as_typenames () ; - List.iter - (fun (key, builder) -> H.add lexicon key builder) - [ ("auto", fun loc -> AUTO loc); - ("const", fun loc -> CONST loc); - ("__const", fun loc -> CONST loc); - ("__const__", fun loc -> CONST loc); - ("static", fun loc -> STATIC loc); - ("extern", fun loc -> EXTERN loc); - ("long", fun loc -> LONG loc); - ("short", fun loc -> SHORT loc); - ("register", fun loc -> REGISTER loc); - ("signed", fun loc -> SIGNED loc); - ("__signed", fun loc -> SIGNED loc); - ("unsigned", fun loc -> UNSIGNED loc); - ("volatile", fun loc -> VOLATILE loc); - ("__volatile", fun loc -> VOLATILE loc); + valid "auto" (fun loc -> AUTO loc) ; + valid "const" (fun loc -> CONST loc) ; + valid "__const" (fun loc -> CONST loc) ; + valid "__const__" (fun loc -> CONST loc) ; + valid "static" (fun loc -> STATIC loc) ; + valid "extern" (fun loc -> EXTERN loc) ; + valid "long" (fun loc -> LONG loc) ; + valid "short" (fun loc -> SHORT loc) ; + valid "register" (fun loc -> REGISTER loc) ; + valid "signed" (fun loc -> SIGNED loc) ; + valid "__signed" (fun loc -> SIGNED loc) ; + valid "unsigned" (fun loc -> UNSIGNED loc) ; + valid "volatile" (fun loc -> VOLATILE loc) ; + valid "__volatile" (fun loc -> VOLATILE loc) ; (* WW: see /usr/include/sys/cdefs.h for why __signed and __volatile * are accepted GCC-isms *) - ("char", fun loc -> CHAR loc); - ("_Bool", fun loc -> BOOL loc); - ("int", fun loc -> INT loc); - ("float", fun loc -> FLOAT loc); - ("double", fun loc -> DOUBLE loc); - ("void", fun loc -> VOID loc); - ("enum", fun loc -> ENUM loc); - ("struct", fun loc -> STRUCT loc); - ("typedef", fun loc -> TYPEDEF loc); - ("union", fun loc -> UNION loc); - ("break", fun loc -> BREAK loc); - ("continue", fun loc -> CONTINUE loc); - ("goto", fun loc -> GOTO loc); - ("return", fun loc -> dbgToken (RETURN loc)); - ("switch", fun loc -> dbgToken (SWITCH loc)); - ("case", fun loc -> CASE loc); - ("default", fun loc -> DEFAULT loc); - ("while", fun loc -> WHILE loc); - ("do", fun loc -> DO loc); - ("for", fun loc -> FOR loc); - ("if", fun loc -> dbgToken (IF loc)); - ("else", fun _ -> ELSE); + valid "char" (fun loc -> CHAR loc) ; + valid "_Bool" (fun loc -> BOOL loc) ; + valid "int" (fun loc -> INT loc) ; + valid "float" (fun loc -> FLOAT loc) ; + valid "double" (fun loc -> DOUBLE loc) ; + valid "void" (fun loc -> VOID loc) ; + valid "enum" (fun loc -> ENUM loc) ; + valid "struct" (fun loc -> STRUCT loc) ; + valid "typedef" (fun loc -> TYPEDEF loc) ; + valid "union" (fun loc -> UNION loc) ; + valid "break" (fun loc -> BREAK loc) ; + valid "continue" (fun loc -> CONTINUE loc) ; + valid "goto" (fun loc -> GOTO loc) ; + valid "return" (fun loc -> RETURN loc) ; + valid "switch" (fun loc -> SWITCH loc) ; + valid "case" (fun loc -> CASE loc) ; + valid "default" (fun loc -> DEFAULT loc) ; + valid "while" (fun loc -> WHILE loc) ; + valid "do" (fun loc -> DO loc) ; + valid "for" (fun loc -> FOR loc) ; + valid "if" (fun loc -> IF loc) ; + valid "else" (fun _ -> ELSE) ; (*** Implementation specific keywords ***) - ("__signed__", fun loc -> SIGNED loc); - ("__inline__", fun loc -> INLINE loc); - ("inline", fun loc -> INLINE loc); - ("__inline", fun loc -> INLINE loc); - ("_Noreturn", - fun loc -> - Kernel.(warning ~wkey:wkey_c11 "_Noreturn is a C11 keyword"); - NORETURN loc); - ("_Static_assert", - fun loc -> - Kernel.(warning ~wkey:wkey_c11 "_Static_assert is a C11 keyword"); - STATIC_ASSERT loc); - ("__attribute__", fun loc -> ATTRIBUTE loc); - ("__attribute", fun loc -> ATTRIBUTE loc); - ("_Nullable", fun loc -> NOP_ATTRIBUTE loc); - ("__blockattribute__", fun _ -> BLOCKATTRIBUTE); - ("__blockattribute", fun _ -> BLOCKATTRIBUTE); - ("__asm__", fun loc -> ASM loc); - ("asm", fun loc -> ASM loc); - ("__typeof__", fun loc -> TYPEOF loc); - ("__typeof", fun loc -> TYPEOF loc); - ("typeof", fun loc -> TYPEOF loc); - ("__alignof", fun loc -> ALIGNOF loc); - ("__alignof__", fun loc -> ALIGNOF loc); - ("__volatile__", fun loc -> VOLATILE loc); - ("__volatile", fun loc -> VOLATILE loc); - - ("__FUNCTION__", fun loc -> FUNCTION__ loc); - ("__func__", fun loc -> FUNCTION__ loc); (* ISO 6.4.2.2 *) - ("__PRETTY_FUNCTION__", fun loc -> PRETTY_FUNCTION__ loc); - ("__label__", fun _ -> LABEL__); + valid "__signed__" (fun loc -> SIGNED loc) ; + valid "__inline__" (fun loc -> INLINE loc) ; + valid "inline" (fun loc -> INLINE loc) ; + valid "__inline" (fun loc -> INLINE loc) ; + warning_C11 "_Noreturn" (fun loc -> NORETURN loc) ; + warning_C11 "_Static_assert" (fun loc -> STATIC_ASSERT loc) ; + valid "__attribute__" (fun loc -> ATTRIBUTE loc) ; + valid "__attribute" (fun loc -> ATTRIBUTE loc) ; + valid "_Nullable" (fun loc -> NOP_ATTRIBUTE loc) ; + valid "__blockattribute__" (fun _ -> BLOCKATTRIBUTE) ; + valid "__blockattribute" (fun _ -> BLOCKATTRIBUTE) ; + valid "__asm__" (fun loc -> ASM loc) ; + valid "asm" (fun loc -> ASM loc) ; + valid "__typeof__" (fun loc -> TYPEOF loc) ; + valid "__typeof" (fun loc -> TYPEOF loc) ; + valid "typeof" (fun loc -> TYPEOF loc) ; + valid "__alignof" (fun loc -> ALIGNOF loc) ; + valid "__alignof__" (fun loc -> ALIGNOF loc) ; + valid "__volatile__" (fun loc -> VOLATILE loc) ; + valid "__volatile" (fun loc -> VOLATILE loc) ; + valid "__FUNCTION__" (fun loc -> FUNCTION__ loc) ; + valid "__func__" (fun loc -> FUNCTION__ loc) ; (* ISO 6.4.2.2 *) + valid "__PRETTY_FUNCTION__" (fun loc -> PRETTY_FUNCTION__ loc) ; + valid "__label__" (fun _ -> LABEL__) ; (*** weimer: GCC arcana ***) - ("__restrict", fun loc -> RESTRICT loc); - ("restrict", fun loc -> RESTRICT loc); -(* ("__extension__", EXTENSION); *) + valid "__restrict" (fun loc -> RESTRICT loc) ; + valid "restrict" (fun loc -> RESTRICT loc) ; (**** MS VC ***) - ("__int64", fun _ -> INT64 (currentLoc ())); - ("__int32", fun loc -> INT loc); - ("_cdecl", fun _ -> MSATTR ("_cdecl", currentLoc ())); - ("__cdecl", fun _ -> MSATTR ("__cdecl", currentLoc ())); - ("_stdcall", fun _ -> MSATTR ("_stdcall", currentLoc ())); - ("__stdcall", fun _ -> MSATTR ("__stdcall", currentLoc ())); - ("_fastcall", fun _ -> MSATTR ("_fastcall", currentLoc ())); - ("__fastcall", fun _ -> MSATTR ("__fastcall", currentLoc ())); - ("__w64", fun _ -> MSATTR("__w64", currentLoc ())); - ("__declspec", fun loc -> DECLSPEC loc); - ("__forceinline", fun loc -> INLINE loc); (* !! we turn forceinline - * into inline *) - (* weimer: some files produced by 'GCC -E' expect this type to be - * defined *) - ("__builtin_va_list", - fun _ -> NAMED_TYPE "__builtin_va_list"); - ("__builtin_va_arg", fun loc -> BUILTIN_VA_ARG loc); - ("__builtin_types_compatible_p", fun loc -> BUILTIN_TYPES_COMPAT loc); - ("__builtin_offsetof", fun loc -> BUILTIN_OFFSETOF loc); - ("_Thread_local", - fun loc -> - Kernel.(warning ~wkey:wkey_c11 "_Thread_local is a C11 keyword"); - THREAD_LOCAL loc); + valid "__int64" (fun _ -> INT64 (currentLoc ())) ; + valid "__int32" (fun loc -> INT loc) ; + valid "_cdecl" ( fun _ -> MSATTR ("_cdecl", currentLoc ())) ; + valid "__cdecl" (fun _ -> MSATTR ("__cdecl", currentLoc ())) ; + valid "_stdcall" (fun _ -> MSATTR ("_stdcall", currentLoc ())) ; + valid "__stdcall" (fun _ -> MSATTR ("__stdcall", currentLoc ())) ; + valid "_fastcall" (fun _ -> MSATTR ("_fastcall", currentLoc ())) ; + valid "__fastcall" (fun _ -> MSATTR ("__fastcall", currentLoc ())) ; + valid "__w64" (fun _ -> MSATTR("__w64", currentLoc ())) ; + valid "__declspec" (fun loc -> DECLSPEC loc) ; + (* !! we turn forceinline into inline *) + valid "__forceinline" (fun loc -> INLINE loc) ; + (* Some files produced by 'GCC -E' expect this type to be defined *) + valid "__builtin_va_list" (fun _ -> NAMED_TYPE "__builtin_va_list") ; + valid "__builtin_va_arg" (fun loc -> BUILTIN_VA_ARG loc) ; + valid "__builtin_types_compatible_p" (fun loc -> BUILTIN_TYPES_COMPAT loc) ; + valid "__builtin_offsetof" (fun loc -> BUILTIN_OFFSETOF loc) ; + warning_C11 "_Thread_local" (fun loc -> THREAD_LOCAL loc) ; (* We recognize __thread for GCC machdeps *) - ("__thread", - fun loc -> - if Cil.gccMode () then - THREAD loc - else begin - Kernel.( - warning - ~wkey:wkey_conditional_feature - "__thread is a GCC extension, use a GCC-based machdep to enable it"); - IDENT "__thread" - end); - ("__FC_FILENAME__", - (fun loc -> - let filename = - Filepath.Normalized.to_pretty_string (fst loc).pos_path - in - let seq = String.to_seq filename in - let convert_char c = Int64.of_int (Char.code c) in - let l = - Seq.fold_left (fun acc c -> convert_char c :: acc) [] seq - in - CST_STRING (List.rev l,loc))); + thread_keyword () ; + filename_keyword () ; (* The following C11 tokens are not yet supported, so we provide some helpful error messages. Usage of 'fatal' instead of 'error' below prevents duplicate error messages due to parsing errors. *) - ("_Alignas", - fun loc -> - Kernel.abort ~source:(fst loc) - "_Alignas is currently unsupported by Frama-C."); - ("_Alignof", - fun loc -> - Kernel.abort ~source:(fst loc) - "_Alignof is currently unsupported by Frama-C."); - ("_Complex", - fun loc -> - Kernel.abort ~source:(fst loc) - "_Complex is currently unsupported by Frama-C."); - ("_Generic", - fun loc -> - Kernel.(warning ~wkey:wkey_c11 "_Generic is a C11 keyword"); - GENERIC loc); - ("_Imaginary", - fun loc -> - Kernel.abort ~source:(fst loc) - "_Imaginary is currently unsupported by Frama-C."); - ] + unsupported "_Alignas" ; + unsupported "_Alignof" ; + unsupported "_Complex" ; + warning_C11 "_Generic" (fun loc -> GENERIC loc) ; + unsupported "_Imaginary" + -let is_c_keyword s = Hashtbl.mem lexicon s (* Mark an identifier as a type name. The old mapping is preserved and will * be reinstated when we exit this context *) let add_type name = - (* ignore (print_string ("adding type name " ^ name ^ "\n")); *) - H.add lexicon name (fun _ -> NAMED_TYPE name); + add name (fun _ -> NAMED_TYPE name) ; Logic_env.add_typename name -let context : string list list ref = ref [ [] ] - -let typedef_decl = ref false +let remove_type name = + remove name ; Logic_env.remove_typename name -let is_typedef () = !typedef_decl -let set_typedef () = typedef_decl := true -let reset_typedef () = typedef_decl := false +let context : string list list ref = ref [ [] ] let push_context _ = context := [] :: !context let pop_context _ = match !context with - [] -> Kernel.fatal "Empty context stack" - | con::sub -> - (context := sub; - List.iter (fun name -> - (* Format.eprintf - "removing lexicon for %s@." name; *) - H.remove lexicon name; - Logic_env.remove_typename name - ) con) + | [] -> Kernel.fatal "Empty context stack" + | con :: sub -> context := sub ; List.iter remove_type con (* Mark an identifier as a variable name. The old mapping is preserved and * will be reinstated when we exit this context *) let add_identifier name = match !context with - [] -> Kernel.fatal "Empty context stack" + | [] -> Kernel.fatal "Empty context stack" | con :: sub -> - (context := (name::con)::sub; - (*Format.eprintf "adding IDENT for %s@." name;*) - H.add lexicon name (fun _ -> dbgToken (IDENT name)); + context := (name :: con) :: sub ; + add name (fun _ -> IDENT name) ; Logic_env.hide_typename name - ) -(* -** Useful primitives -*) -let scan_ident id = - let here = currentLoc () in - try (H.find lexicon id) here - (* default to variable name, as opposed to type *) - with Not_found -> dbgToken (IDENT id) + +let typedef_decl = ref false +let is_typedef () = !typedef_decl +let set_typedef () = typedef_decl := true +let reset_typedef () = typedef_decl := false + (* ** Buffer processor *) - let init ~filename lexer = - init_lexicon (); - E.startParsing filename lexer - + init_lexicon () ; E.startParsing filename lexer let finish () = - E.finishParsing (); - Logic_env.reset_typenames () + E.finishParsing () ; Logic_env.reset_typenames () (*** escape character management ***) -let scan_escape (char: char) : int64 = - let result = match char with - 'n' -> '\n' - | 'r' -> '\r' - | 't' -> '\t' - | 'b' -> '\b' - | 'f' -> '\012' (* ASCII code 12 *) - | 'v' -> '\011' (* ASCII code 11 *) - | 'a' -> '\007' (* ASCII code 7 *) - | 'e' | 'E' -> '\027' (* ASCII code 27. This is a GCC extension *) - | '\'' -> '\'' - | '"'-> '"' (* '"' *) - | '?' -> '?' - | '(' -> '(' - | '{' -> '{' - | '[' -> '[' - | '%' -> '%' - | '\\' -> '\\' - | other -> - E.parse_error "Unrecognized escape sequence: \\%c" other - in - Int64.of_int (Char.code result) +let scan_escape = function + | 'n' -> int64_of_char '\n' + | 'r' -> int64_of_char '\r' + | 't' -> int64_of_char '\t' + | 'b' -> int64_of_char '\b' + | 'f' -> int64_of_char '\012' (* ASCII code 12 *) + | 'v' -> int64_of_char '\011' (* ASCII code 11 *) + | 'a' -> int64_of_char '\007' (* ASCII code 7 *) + | 'e' | 'E' -> int64_of_char '\027' (* ASCII code 27. This is a GCC extension *) + | '\'' -> int64_of_char '\'' + | '"' -> int64_of_char '"' (* '"' *) + | '?' -> int64_of_char '?' + | '(' -> int64_of_char '(' + | '{' -> int64_of_char '{' + | '[' -> int64_of_char '[' + | '%' -> int64_of_char '%' + | '\\' -> int64_of_char '\\' + | other -> E.parse_error "Unrecognized escape sequence: \\%c" other let scan_hex_escape str = let radix = Int64.of_int 16 in @@ -373,77 +324,49 @@ let lex_simple_escape remainder lexbuf = prefix :: remainder lexbuf let lex_unescaped remainder lexbuf = - let prefix = Int64.of_int (Char.code (Lexing.lexeme_char lexbuf 0)) in + let prefix = int64_of_char (Lexing.lexeme_char lexbuf 0) in prefix :: remainder lexbuf let lex_comment remainder buffer lexbuf = let s = Lexing.lexeme lexbuf in if s = "\n" then E.newline() ; - (match buffer with None -> () | Some b -> Buffer.add_string b s) ; + Option.iter (fun b -> Buffer.add_string b s) buffer ; remainder buffer lexbuf +(* We ignore the last line if it does not contain "\n" as it means that + it directly precedes the "else" and thus contains only whitespaces *) +let rec iter_comments ends_with_n f = function + | [] -> () + | [ _ ] when not ends_with_n -> () + | x :: l -> f x ; iter_comments ends_with_n f l + let do_ghost_else_comments register comments = - let ends_with_n = let last = String.length comments in (* note that comments contains at least a blank *) - '\n' = String.get comments (last - 1) - in + let ends_with_n = '\n' = String.get comments (last - 1) in let comments = String.split_on_char '\n' comments in let comments = List.map String.trim comments in - let process_line s = - E.newline () ; - if register - && Kernel.PrintComments.get () - && not (String.equal "" s) - then - addComment s - in - let rec iter f = function - | [] -> () - (* We ignore the last line if it does not contain "\n" as it means that - it directly precedes the "else" and thus contains only whitespaces *) - | [ _ ] when not ends_with_n -> () - | x :: l -> f x ; iter f l - in - iter process_line comments + let print_comments = Kernel.PrintComments.get () in + let do_comment s = register && print_comments && not (String.equal "" s) in + let process_line s = E.newline () ; if do_comment s then add_comment s in + iter_comments ends_with_n process_line comments let do_lex_comment ?(first_string="") remainder lexbuf = - let buffer = - if Kernel.PrintComments.get () then begin + if Kernel.PrintComments.get () then let b = Buffer.create 80 in - Buffer.add_string b first_string; - Some b - end else None - in remainder buffer lexbuf ; - match buffer with - | Some b -> addComment (Buffer.contents b) - | None -> () - - -(* ISO standard locale-specific function to convert a wide character - * into a sequence of normal characters. Here we work on strings. - * We convert L"Hi" to "H\000i\000" - matth: this seems unused. -let wbtowc wstr = - let len = String.length wstr in - let dest = String.make (len * 2) '\000' in - for i = 0 to len-1 do - dest.[i*2] <- wstr.[i] ; - done ; - dest -*) + let () = Buffer.add_string b first_string in + let () = remainder (Some b) lexbuf in + add_comment (Buffer.contents b) + else remainder None lexbuf + +let do_oneline_ghost ~first_string comment lexbuf initial = + do_lex_comment ~first_string comment lexbuf ; + E.newline () ; + if is_oneline_ghost () + then (exit_oneline_ghost () ; RGHOST) + else initial lexbuf + -(* This function converts the "Hi" in L"Hi" to { L'H', L'i', L'\0' } - matth: this seems unused. -let wstr_to_warray wstr = - let len = String.length wstr in - let res = ref "{ " in - for i = 0 to len-1 do - res := !res ^ (Printf.sprintf "L'%c', " wstr.[i]) - done ; - res := !res ^ "}" ; - !res -*) (* Pragmas get explicit end-of-line tokens. * Elsewhere they are silently discarded as whitespace. *) @@ -451,11 +374,9 @@ let pragmaLine = ref false let annot_char = ref '@' -let () = - Kernel.ReadAnnot.add_set_hook - (fun _ x -> (* prevent the C lexer interpretation of comments *) - annot_char := if x then '@' else '\000') +let prevent _ x = annot_char := if x then '@' else '\000' +let () = Kernel.ReadAnnot.add_set_hook prevent let annot_start_pos = ref Cabshelper.cabslu let buf = Buffer.create 1024 @@ -464,40 +385,39 @@ let save_current_pos () = annot_start_pos := currentLoc () let annot_lex initial rule lexbuf = - try save_current_pos () ; Buffer.clear buf ; - rule lexbuf + try rule lexbuf with Parsing.Parse_error -> - let source = Cil_datatype.Position.of_lexing_pos (Lexing.lexeme_start_p lexbuf) in + let start = Lexing.lexeme_start_p lexbuf in + let source = Cil_datatype.Position.of_lexing_pos start in Kernel.warning ~wkey:Kernel.wkey_annot_error ~source "skipping annotation" ; initial lexbuf let make_annot ~one_line default lexbuf s = let start = snd !annot_start_pos in match Logic_lexer.annot (start, s) with + (* error occured and annotation is discarded. Find a normal token. *) + | None -> default lexbuf | Some (stop, token) -> lexbuf.Lexing.lex_curr_p <- Cil_datatype.Position.to_lexing_pos stop ; if one_line then E.newline () ; - (match token with + match token with | Logic_ptree.Adecl d -> DECL d | Logic_ptree.Aspec -> SPEC (start, s) (* At this point, we only have identified a function spec. Complete - parsing of the annotation will only occur in the cparser.mly rule. - *) + parsing of the annotation will only occur in the cparser.mly rule. *) | Logic_ptree.Acode_annot (loc,a) -> CODE_ANNOT (a, loc) | Logic_ptree.Aloop_annot (loc,a) -> LOOP_ANNOT (a,loc) - | Logic_ptree.Aattribute_annot (loc,a) -> ATTRIBUTE_ANNOT (a, loc)) - | None -> (* error occured and annotation is discarded. Find a normal token. *) - default lexbuf + | Logic_ptree.Aattribute_annot (loc,a) -> ATTRIBUTE_ANNOT (a, loc) let parse_error lexbuf msg = let loc = lexbuf.Lexing.lex_curr_p in let source = Cil_datatype.Position.of_lexing_pos loc in E.parse_error ~source msg -let () = (* Initialize the pointer in Errormsg *) +let () = Lexerhack.add_type := add_type ; Lexerhack.push_context := push_context ; Lexerhack.pop_context := pop_context ; @@ -507,18 +427,18 @@ let () = Lexerhack.reset_typedef := reset_typedef } + + let decdigit = ['0'-'9'] let octdigit = ['0'-'7'] let hexdigit = ['0'-'9' 'a'-'f' 'A'-'F'] let binarydigit = ['0' '1'] let letter = ['a'- 'z' 'A'-'Z'] - - let usuffix = ['u' 'U'] let lsuffix = "l" | "L" | "ll" | "LL" -let intsuffix = lsuffix | usuffix | usuffix lsuffix | lsuffix usuffix - | usuffix ? "i64" +let intsuffix = + lsuffix | usuffix | usuffix lsuffix | lsuffix usuffix | usuffix ? "i64" let hexprefix = '0' ['x' 'X'] let binaryprefix = '0' ['b' 'B'] @@ -530,144 +450,138 @@ let binarynum = binaryprefix binarydigit+ intsuffix? let exponent = ['e' 'E']['+' '-']? decdigit+ let fraction = '.' decdigit+ -let decfloat = (intnum? fraction) +let decfloat = + (intnum? fraction) | (intnum exponent) | (intnum? fraction exponent) | (intnum '.') | (intnum '.' exponent) - let hexfraction = hexdigit* '.' hexdigit+ | hexdigit+ '.' let binexponent = ['p' 'P'] ['+' '-']? decdigit+ -let hexfloat = hexprefix hexfraction binexponent +let hexfloat = + hexprefix hexfraction binexponent | hexprefix hexdigit+ binexponent let floatsuffix = ['f' 'F' 'l' 'L'] let floatnum = (decfloat | hexfloat) floatsuffix? let ident = (letter|'_')(letter|decdigit|'_'|'$')* -let blank = [' ' '\t' '\012' '\r' '\026'(*this is the plain old DOS eof char*)]+ +(* \026 is the dos EOF character *) +let blank = [' ' '\t' '\012' '\r' '\026']+ let escape = '\\' _ let hex_escape = '\\' ['x' 'X'] hexdigit+ let oct_escape = '\\' octdigit octdigit? octdigit? -(* Pragmas that are not parsed by CIL. We lex them as PRAGMA_LINE tokens *) -let no_parse_pragma = - "warning" | "GCC" - (* Solaris-style pragmas: *) - | "ident" | "section" | "option" | "asm" | "use_section" | "weak" - | "redefine_extname" - | "TCS_align" +(* Solaris-style pragmas *) +let solaris_pragmas = + "ident" | "section" | "option" | "asm" | "use_section" | "weak" + | "redefine_extname" | "TCS_align" + (* Embedded world *) - | "global_register" | "location" +let embedded_pragmas = "global_register" | "location" + +(* Pragmas that are not parsed by CIL. We lex them as PRAGMA_LINE tokens *) +let no_parse_pragma = "warning" | "GCC" | solaris_pragmas | embedded_pragmas + +let ghost_comments = "//\n" | ("//" [^'\n''@'] ([^'\n']*("\\\n")?)* '\n') + -let ghost_comments = "//\n" - | ("//" [^'\n''@'] ([^'\n']*("\\\n")?)* '\n') rule initial = parse -| "/*" ("" | "@{" | "@}" as suf) (* Skip special doxygen comments. Use of '@' - instead of '!annot_char' is intentional *) + + (* Skip special doxygen comments. Use of '@' instead of '!annot_char' is + intentional *) + | "/*" ("" | "@{" | "@}" as suf) { do_lex_comment ~first_string:suf comment lexbuf ; initial lexbuf } | "/*" ([^ '*' '\n'] as c) - { if c = !annot_char then begin - annot_lex initial annot_first_token lexbuf - end else - begin - do_lex_comment ~first_string:(String.make 1 c) comment lexbuf ; - initial lexbuf - end + { + let first_string = String.make 1 c in + if c = !annot_char + then annot_lex initial annot_first_token lexbuf + else (do_lex_comment ~first_string comment lexbuf ; initial lexbuf) } -| "//" ("" | "@{" | "@}" as suf) (* See comment for "/*@{" above *) + (* See comment for "/*@{" above *) + | "//" ("" | "@{" | "@}" as suf) { - do_lex_comment ~first_string:suf onelinecomment lexbuf ; - E.newline(); - if is_oneline_ghost () then begin - exit_oneline_ghost (); - RGHOST - end else begin - initial lexbuf - end + do_oneline_ghost ~first_string:suf onelinecomment lexbuf initial } | "//" ([^ '\n'] as c) - { if c = !annot_char then begin - annot_lex initial annot_one_line lexbuf - end else - begin - do_lex_comment ~first_string:(String.make 1 c) onelinecomment lexbuf; - E.newline(); - if is_oneline_ghost () then - begin - exit_oneline_ghost (); - RGHOST - end - else - begin - initial lexbuf - end - end + { + let first_string = String.make 1 c in + if c = !annot_char + then annot_lex initial annot_one_line lexbuf + else do_oneline_ghost ~first_string onelinecomment lexbuf initial } + | "\\ghost" - { if is_ghost_code () || is_oneline_ghost () then begin - GHOST (currentLoc()) - end else begin - parse_error lexbuf "Use of \\ghost out of ghost code" - end + { + if is_ghost_code () || is_oneline_ghost () + then GHOST (currentLoc()) + else parse_error lexbuf "Use of \\ghost out of ghost code" + } + + | blank {initial lexbuf} | '\n' + { + E.newline () ; + if !pragmaLine then (pragmaLine := false ; PRAGMA_EOL) + else if is_oneline_ghost () then (exit_oneline_ghost () ; RGHOST) + else initial lexbuf } -| blank {initial lexbuf} -| '\n' { E.newline (); - if !pragmaLine then - begin - pragmaLine := false; - PRAGMA_EOL - end - else if is_oneline_ghost () then - begin - exit_oneline_ghost (); - RGHOST - end - else begin - initial lexbuf - end } -| '\\' '\r' * '\n' { E.newline (); + | '\\' '\r' * '\n' + { + E.newline () ; initial lexbuf } + | '#' { hash lexbuf } | "%:" { hash lexbuf } | "_Pragma" { PRAGMA (currentLoc ()) } -| '\'' { + + | '\'' + { let start = Lexing.lexeme_start_p lexbuf in let content = chr lexbuf in let last = Lexing.lexeme_end_p lexbuf in CST_CHAR (content, Cil_datatype.Location.of_lexing_loc (start,last)) } -| "L'" { + + | "L'" + { let start = Lexing.lexeme_start_p lexbuf in let content = chr lexbuf in let last = Lexing.lexeme_end_p lexbuf in CST_WCHAR (content, Cil_datatype.Location.of_lexing_loc (start,last)) } -| '"' { + + | '"' + { let start = Lexing.lexeme_start_p lexbuf in let content = str lexbuf in let last = Lexing.lexeme_end_p lexbuf in CST_STRING (content, Cil_datatype.Location.of_lexing_loc (start,last)) } -| "L\"" { + + | "L\"" + { let start = Lexing.lexeme_start_p lexbuf in let content = str lexbuf in let last = Lexing.lexeme_end_p lexbuf in CST_WSTRING(content, Cil_datatype.Location.of_lexing_loc (start,last)) } + | floatnum { CST_FLOAT (Lexing.lexeme lexbuf, currentLoc ()) } -| binarynum { (* GCC Extension for binary numbers *) - CST_INT (Lexing.lexeme lexbuf, currentLoc ())} + + (* GCC Extension for binary numbers *) + | binarynum { CST_INT (Lexing.lexeme lexbuf, currentLoc ()) } + | hexnum { CST_INT (Lexing.lexeme lexbuf, currentLoc ()) } | octnum { CST_INT (Lexing.lexeme lexbuf, currentLoc ()) } | intnum { CST_INT (Lexing.lexeme lexbuf, currentLoc ()) } @@ -697,21 +611,23 @@ rule initial = parse | "->" { ARROW } | '+' { PLUS (currentLoc ()) } | '-' { MINUS (currentLoc ()) } + | '*' - { if is_ghost_code () then might_end_ghost lexbuf - else - STAR (currentLoc ())} + { + if is_ghost_code () + then might_end_ghost lexbuf + else STAR (currentLoc ()) + } + | "/" ([^ '\n'] as c) - { if c = !annot_char then - if is_ghost_code () || is_oneline_ghost () then begin - enter_ghost_annot(); - annot_lex initial annot_first_token lexbuf - end else - parse_error lexbuf "This kind of annotation is valid only inside ghost code" - else begin - lexbuf.Lexing.lex_curr_pos <- lexbuf.Lexing.lex_curr_pos - 1; - SLASH - end } + { + if c = !annot_char then + if is_ghost_code () || is_oneline_ghost () + then (enter_ghost_annot () ; annot_lex initial annot_first_token lexbuf) + else parse_error lexbuf "This kind of annotation is valid only inside ghost code" + else (lexbuf.Lexing.lex_curr_pos <- lexbuf.Lexing.lex_curr_pos - 1 ; SLASH) + } + | '/' { SLASH } | '%' { PERCENT } | '!' { EXCLAM (currentLoc ()) } @@ -721,152 +637,174 @@ rule initial = parse | '|' { PIPE } | '^' { CIRC } | '?' { QUEST } + | ':' - { if Cabshelper.is_attr_test () then begin - Cabshelper.pop_attr_test (); COLON2 - end else COLON + { + if Cabshelper.is_attr_test () + then (Cabshelper.pop_attr_test () ; COLON2) + else COLON } -| '~' {TILDE (currentLoc ())} -| '{' {dbgToken (LBRACE (currentLoc ()))} -| '}' {dbgToken (RBRACE (currentLoc ()))} -| "<%" {dbgToken (LBRACE (currentLoc ()))} -| "%>" {dbgToken (RBRACE (currentLoc ()))} + | '~' { TILDE (currentLoc ()) } + | '{' { LBRACE (currentLoc ()) } + | '}' { RBRACE (currentLoc ()) } + | "<%" { LBRACE (currentLoc ()) } + | "%>" { RBRACE (currentLoc ()) } | '[' { LBRACKET } | ']' { RBRACKET } | "<:" { LBRACKET } | ":>" { RBRACKET } -| '(' {dbgToken (LPAREN (currentLoc ())) } + | '(' { LPAREN (currentLoc ()) } | ')' { RPAREN } -| ';' {dbgToken (SEMICOLON (currentLoc ())) } + | ';' { SEMICOLON (currentLoc ()) } | ',' { COMMA } | '.' { DOT } | "sizeof" { SIZEOF (currentLoc ()) } | "__asm" { ASM (currentLoc ()) } -(* If we see __pragma we eat it and the matching parentheses as well *) -| "__pragma" { let _ = matchingpars 0 lexbuf in - initial lexbuf - } + (* If we see __pragma we consume it and the matching parentheses as well *) + | "__pragma" { let _ = matchingpars 0 lexbuf in initial lexbuf } (* __extension__ is a black. The parser runs into some conflicts if we let it - * pass *) + pass *) | "__extension__" { initial lexbuf } + | ident { scan_ident (Lexing.lexeme lexbuf) } + | eof - { if is_oneline_ghost() then begin - exit_oneline_ghost (); RGHOST - end + { + if is_oneline_ghost() + then (exit_oneline_ghost () ; RGHOST) else EOF } + | _ as c - { if is_ghost_code() && c = '@' then initial lexbuf + { + if is_ghost_code() && c = '@' then initial lexbuf else parse_error lexbuf "Invalid symbol" } + + and might_end_ghost = parse | '/' { exit_ghost_code() ; RGHOST } | "" { STAR (currentLoc()) } + and comment buffer = parse | "*/" { } | eof { parse_error lexbuf "Unterminated C comment" } | _ { lex_comment comment buffer lexbuf } + and onelinecomment buffer = parse - | "*/" { if is_ghost_code () then + | "*/" + { (* end of multiline comment *) - lexbuf.Lexing.lex_curr_pos <- - lexbuf.Lexing.lex_curr_pos - 2 - else - lex_comment onelinecomment buffer lexbuf + if is_ghost_code () + then lexbuf.Lexing.lex_curr_pos <- lexbuf.Lexing.lex_curr_pos - 2 + else lex_comment onelinecomment buffer lexbuf } | '\n' | eof { } | _ { lex_comment onelinecomment buffer lexbuf } + and matchingpars parsopen = parse - '\n' { E.newline (); matchingpars parsopen lexbuf } + | '\n' { E.newline (); matchingpars parsopen lexbuf } | blank { matchingpars parsopen lexbuf } | '(' { matchingpars (parsopen + 1) lexbuf } -| ')' { if parsopen > 1 then - matchingpars (parsopen - 1) lexbuf - } -| "/*" { do_lex_comment comment lexbuf ; - matchingpars parsopen lexbuf } -| '"' { let _ = str lexbuf in - matchingpars parsopen lexbuf } + | ')' { if parsopen > 1 then matchingpars (parsopen - 1) lexbuf } + | "/*" { do_lex_comment comment lexbuf ; matchingpars parsopen lexbuf } + | '"' { let _ = str lexbuf in matchingpars parsopen lexbuf } | _ { matchingpars parsopen lexbuf } + (* # <line number> <file name> ... *) and hash = parse - '\n' { E.newline (); initial lexbuf} + | '\n' { E.newline (); initial lexbuf} | blank { hash lexbuf} -| intnum { (* We are seeing a line number. This is the number for the - * next line *) + + (* We are seeing a line number. This is the number for the next line *) + | intnum + { let s = Lexing.lexeme lexbuf in - let lineno = try - int_of_string s - with Failure _ -> - (* the int is too big. *) - Kernel.warning "Bad line number in preprocessed file: %s" s; - (-1) - in + let msg () = Kernel.warning "Bad line number in preprocessed file: %s" s in + let lineno = try int_of_string s with Failure _ -> msg () ; -1 in E.setCurrentLine (lineno - 1) ; (* A file name may follow *) - file lexbuf } -| "line" { hash lexbuf } (* MSVC line number info *) - (* For pragmas with irregular syntax, like #pragma warning, - * we parse them as a whole line. *) + file lexbuf + } + + (* MSVC line number info *) + | "line" { hash lexbuf } + + (* For pragmas with irregular syntax, like #pragma warning, we parse them as a + whole line. *) | "pragma" blank (no_parse_pragma as pragmaName) - { let here = currentLoc () in + { + let here = currentLoc () in PRAGMA_LINE (pragmaName ^ pragma lexbuf, here) } + | "pragma" { pragmaLine := true ; PRAGMA (currentLoc ()) } | _ { endline lexbuf } + and file = parse - '\n' {E.newline (); initial lexbuf} + | '\n' { E.newline () ; initial lexbuf } | blank { file lexbuf } -(* The //-ending file directive is a GCC extension that provides the CWD of the - preprocessor when the file was preprocessed. *) -| '"' ([^ '\012' '\t' '"']* as d) "//\"" { + + (* The //-ending file directive is a GCC extension that provides the CWD of + the preprocessor when the file was preprocessed. *) + | '"' ([^ '\012' '\t' '"']* as d) "//\"" + { E.setCurrentWorkingDirectory d ; - endline lexbuf } -| '"' (([^ '\012' '\t' '"']|"\\\"")* as f) '"' { + endline lexbuf + } + + | '"' (([^ '\012' '\t' '"']|"\\\"")* as f) '"' + { let unescape = Str.regexp_string "\\\"" in let f = Str.global_replace unescape "\"" f in E.setCurrentFile f ; - endline lexbuf} + endline lexbuf + } | _ { endline lexbuf } + and endline = parse - '\n' { E.newline (); initial lexbuf} + | '\n' { E.newline () ; initial lexbuf } | eof { EOF } | _ { endline lexbuf } + and pragma = parse - '\n' { E.newline (); "" } -| _ { let cur = Lexing.lexeme lexbuf in - cur ^ (pragma lexbuf) } + | '\n' { E.newline () ; "" } + | _ { let cur = Lexing.lexeme lexbuf in cur ^ (pragma lexbuf) } + and str = parse - '"' {[]} (* no nul terminiation in CST_STRING '"' *) + (* no nul terminiation in CST_STRING '"' *) + | '"' { [] } | hex_escape { lex_hex_escape str lexbuf } | oct_escape { lex_oct_escape str lexbuf } | escape { lex_simple_escape str lexbuf } | eof { parse_error lexbuf "unterminated string" } | _ { lex_unescaped str lexbuf } + and chr = parse - '\'' {[]} + | '\'' { [] } | hex_escape { lex_hex_escape chr lexbuf } | oct_escape { lex_oct_escape chr lexbuf } | escape { lex_simple_escape chr lexbuf } | eof { parse_error lexbuf "unterminated char" } | _ { lex_unescaped chr lexbuf } + and annot_first_token = parse - | "ghost" ((blank| '\\'?'\n' | ghost_comments)* as comments) "else" { + | "ghost" ((blank| '\\'?'\n' | ghost_comments)* as comments) "else" + { if is_oneline_ghost () then parse_error lexbuf "nested ghost code" ; Buffer.clear buf ; let loc = currentLoc () in @@ -874,54 +812,78 @@ and annot_first_token = parse enter_ghost_code () ; LGHOST_ELSE (loc) } - | "ghost" { + + | "ghost" + { if is_oneline_ghost () then parse_error lexbuf "nested ghost code" ; Buffer.clear buf ; enter_ghost_code () ; LGHOST } + | ' '|'@'|'\t'|'\r' as c { Buffer.add_char buf c ; annot_first_token lexbuf } | '\n' { E.newline() ; Buffer.add_char buf '\n' ; annot_first_token lexbuf } | "" { annot_token lexbuf } + + and annot_token = parse - | "*/" { if is_ghost_annot () then - parse_error lexbuf "Ghost multi-line annotation not terminated"; + | "*/" + { + if is_ghost_annot () + then parse_error lexbuf "Ghost multi-line annotation not terminated" ; let s = Buffer.contents buf in - make_annot ~one_line:false initial lexbuf s } + make_annot ~one_line:false initial lexbuf s + } + | eof { parse_error lexbuf "Unterminated annotation" } | '\n' { E.newline() ; Buffer.add_char buf '\n' ; annot_token lexbuf } - | _ as c { if is_ghost_annot () && c = !annot_char then - might_end_ghost_annot lexbuf - else (Buffer.add_char buf c; annot_token lexbuf) } + | _ as c + { + if is_ghost_annot () && c = !annot_char + then might_end_ghost_annot lexbuf + else (Buffer.add_char buf c ; annot_token lexbuf) + } + + and might_end_ghost_annot = parse | '/' { exit_ghost_annot (); let s = Buffer.contents buf in make_annot ~one_line:false initial lexbuf s } | "" { Buffer.add_char buf !annot_char; annot_token lexbuf } + + and annot_one_line = parse - | "ghost" ((blank|"\\\n")+ as comments) "else" { + | "ghost" ((blank|"\\\n")+ as comments) "else" + { do_ghost_else_comments false comments ; if is_oneline_ghost () then parse_error lexbuf "nested ghost code" ; - enter_oneline_ghost (); LGHOST_ELSE (currentLoc ()) + enter_oneline_ghost () ; + LGHOST_ELSE (currentLoc ()) } - | "ghost" { + + | "ghost" + { if is_oneline_ghost () then parse_error lexbuf "nested ghost code" ; - enter_oneline_ghost (); LGHOST + enter_oneline_ghost () ; + LGHOST } + | ' '|'@'|'\t'|'\r' as c { Buffer.add_char buf c ; annot_one_line lexbuf } | "" { annot_one_line_logic lexbuf } + + and annot_one_line_logic = parse | '\n' { make_annot ~one_line:true initial lexbuf (Buffer.contents buf) } | eof { parse_error lexbuf "Invalid C file: should end with a newline" } | _ as c { Buffer.add_char buf c ; annot_one_line_logic lexbuf } -{ + +{ (* Catch the exceptions raised by the lexer itself *) let initial lexbuf = try initial lexbuf with Failure _ -> raise Parsing.Parse_error - } (* -- GitLab