From 1e912a3070ee779cbb0a475fe3585c80cde2a40a Mon Sep 17 00:00:00 2001
From: Andre Maroneze <andre.maroneze@cea.fr>
Date: Fri, 16 Jul 2021 15:34:18 +0200
Subject: [PATCH] [Makefile] use OCaml-based, more efficient, tool to check for
 non-UTF-8 files

---
 .gitignore    |  1 +
 Makefile      | 49 ++++++++++++++++++++++++--------
 bin/isutf8.ml | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 116 insertions(+), 12 deletions(-)
 create mode 100644 bin/isutf8.ml

diff --git a/.gitignore b/.gitignore
index f674592c232..90610825c23 100644
--- a/.gitignore
+++ b/.gitignore
@@ -65,6 +65,7 @@ autom4te.cache
 /devel_tools/fc-memuse
 /bin/ocamldep_transitive_closure
 /bin/check_newlines
+/bin/isutf8
 
 #share
 /share/Makefile.config
diff --git a/Makefile b/Makefile
index 4f8cb85784b..62d72ea62cc 100644
--- a/Makefile
+++ b/Makefile
@@ -2151,9 +2151,31 @@ else
 	$(OCAMLC) unix.cma $< -o $@
 endif
 
+check-newlines-clean:
+	$(RM) $(CHECK_NEWLINES) bin/check_newlines.cm* bin/check_newlines.o
+
+clean:: check-newlines-clean
+
+ISUTF8:=./bin/isutf8$(EXE)
+
+$(ISUTF8): bin/isutf8.ml
+	$(PRINT_MAKING) $@
+ifeq ($(OCAMLBEST),opt)
+	$(OCAMLOPT) $< -o $@
+else
+	$(OCAMLC) $< -o $@
+endif
+
+isutf8-clean:
+	$(RM) $(ISUTF8) bin/isutf8.cm* bin/isutf8.o
+
+clean:: isutf8-clean
+
 FILES_WITHOUT_NEWLINE := \
 	VERSION \
-	VERSION_CODENAME \
+	VERSION_CODENAME
+
+BINARY_DISTRIB_FILES := \
 	$(sort $(wildcard ivette/src/dome/doc/template/static/fonts/*)) \
 	$(sort $(wildcard share/*.ico share/*.png share/theme/*/*.png))
 
@@ -2163,6 +2185,13 @@ TESTS_WITHOUT_NEWLINE := \
 	tests/verisec/suite/programs/apps/sendmail/CVE-1999-0047/mime7to8/array_vs_pointer.ods \
 	tests/verisec/suite/programs/apps/sendmail/CVE-1999-0047/mime7to8/data_testing.ods \
 
+BINARY_DISTRIB_TESTS := \
+	tests/misc/oracle/interpreted_automata_dataflow_backward.dot \
+	tests/misc/oracle/interpreted_automata_dataflow_forward.dot \
+	tests/verisec/suite/programs/apps/SpamAssassin/BID-6679/message_write/test \
+	tests/verisec/suite/programs/apps/sendmail/CVE-1999-0047/mime7to8/array_vs_pointer.ods \
+	tests/verisec/suite/programs/apps/sendmail/CVE-1999-0047/mime7to8/data_testing.ods \
+
 # OPEN_SOURCE: set it to 'yes' if you want to check open source headers
 # STRICT_HEADERS: set it to 'yes' if you want to consider warnings as errors
 # The target check-headers does the following checks:
@@ -2175,7 +2204,7 @@ TESTS_WITHOUT_NEWLINE := \
 # because identical headers but with different encodings are not exactly
 # easy to distinguish
 .PHONY: check-headers
-check-headers: $(HDRCK) $(CHECK_NEWLINES)
+check-headers: $(HDRCK) $(CHECK_NEWLINES) $(ISUTF8)
 	$(PRINT) "Checking $(DISTRIB_HEADERS) headers (OPEN_SOURCE=$(OPEN_SOURCE), CURRENT_HEADERS=$(CURRENT_HEADERS))..."
 	$(PRINT) "- HEADER_SPEC_FILE=$(HEADER_SPEC_FILE)"
 	$(PRINT) "- CURRENT_HEADER_DIRS=$(CURRENT_HEADER_DIRS)"
@@ -2185,16 +2214,12 @@ check-headers: $(HDRCK) $(CHECK_NEWLINES)
 	$(file >distrib_tests.tmp) $(foreach O,$(DISTRIB_TESTS),$(file >>distrib_tests.tmp,$O))
 	$(file >header_exceptions.tmp) $(foreach O,$(HEADER_EXCEPTIONS),$(file >>header_exceptions.tmp,$O))
 	echo "Checking that distributed files terminate with a newline..."
-	$(CHECK_NEWLINES) distrib_files.tmp $(FILES_WITHOUT_NEWLINE)
-	$(CHECK_NEWLINES) distrib_tests.tmp $(TESTS_WITHOUT_NEWLINE)
-	@if command -v file >/dev/null 2>/dev/null; then \
-		echo "Checking that distributed files do not use iso-8859..."; \
-		file --mime-encoding -f distrib_files.tmp -f distrib_tests.tmp | \
-			grep "iso-8859" \
-			| $(SED) "s/^/error: invalid encoding in /" \
-			| ( ! grep "error: invalid encoding" ); \
-	else echo "command 'file' not found, skipping encoding checks"; \
-	fi
+	$(CHECK_NEWLINES) distrib_files.tmp $(FILES_WITHOUT_NEWLINE) $(BINARY_DISTRIB_FILES)
+	$(CHECK_NEWLINES) distrib_tests.tmp $(TESTS_WITHOUT_NEWLINE) $(BINARY_DISTRIB_TESTS)
+	echo "Checking that distributed files do not use iso-8859..."
+	$(ISUTF8) distrib_files.tmp $(BINARY_DISTRIB_FILES)
+	$(ISUTF8) distrib_tests.tmp $(BINARY_DISTRIB_TESTS)
+	echo "Checking headers..."
 	$(HDRCK) \
 		$(HDRCK_EXTRA) \
 		$(addprefix -header-dirs ,$(CURRENT_HEADER_DIRS)) \
diff --git a/bin/isutf8.ml b/bin/isutf8.ml
new file mode 100644
index 00000000000..ccfedc46857
--- /dev/null
+++ b/bin/isutf8.ml
@@ -0,0 +1,78 @@
+module StringSet = Set.Make(String)
+
+exception False
+
+(* [is_valid_utf8 filename] returns [true] iff the bytes of [filename] have
+   a well-formed UTF-8 shape: each lead byte (110xxxxx, 1110xxxx, 11110xxx)
+   is followed by exactly 1, 2 or 3 continuation bytes (10xxxxxx), and no
+   sequence is truncated by the end of the file. Overlong encodings,
+   surrogates and code points above U+10FFFF are not rejected.
+   A file that cannot be opened or read is reported and considered valid. *)
+let is_valid_utf8 filename =
+  let buf = Bytes.create 1024 in
+  try
+    let ic = open_in_bin filename in
+    (* Close [ic] on every exit path: this function runs once per listed
+       file, and each channel left open keeps a file descriptor busy. *)
+    Fun.protect ~finally:(fun () -> close_in_noerr ic) @@ fun () ->
+    (* number of continuation bytes still expected in the current sequence *)
+    let extra = ref 0 in
+    try
+      let n_bytes_read = ref (input ic buf 0 1024) in
+      (* [input] returns 0 only when the end of the file is reached *)
+      while !n_bytes_read > 0 do
+        for i = 0 to !n_bytes_read - 1 do
+          let c = Bytes.get_uint8 buf i in
+          if !extra > 0 then begin
+            decr extra;
+            if c lsr 6 <> 2 then raise False
+          end
+          else if c > 127 then begin
+            if c lsr 5 = 6 then extra := 1
+            else if c lsr 4 = 14 then extra := 2
+            else if c lsr 3 = 30 then extra := 3
+            else raise False
+          end
+        done;
+        n_bytes_read := input ic buf 0 1024
+      done;
+      !extra = 0
+    with False -> false
+  with
+  | Sys_error _ ->
+    (* possibly a non-existing file (e.g. with spaces); ignoring *)
+    Format.printf "could not open, ignoring file: %s@." filename;
+    true
+
+(* usage: first argument is a file name containing a list of files
+   (one per line) to be checked; the remaining arguments are filenames
+   to be ignored during checking.
+   Exits with status 1 if some checked file is not valid UTF-8. *)
+let () =
+  if Array.length Sys.argv < 2 then begin
+    Format.printf "usage: %s file_list.txt [ignore1 ignore2 ...]@."
+      Sys.argv.(0);
+    exit 0
+  end;
+  let errors = ref 0 in
+  let file_list_ic = open_in Sys.argv.(1) in
+  (* also contains the list file name itself, which is harmless *)
+  let to_ignore = StringSet.of_list (List.tl (Array.to_list Sys.argv)) in
+  begin
+    try
+      while true do
+        let filename = input_line file_list_ic in
+        if not (StringSet.mem filename to_ignore)
+        && not (is_valid_utf8 filename) then begin
+          incr errors;
+          Format.printf "error: invalid UTF-8 in file: %s@." filename
+        end
+      done
+    with End_of_file ->
+      close_in file_list_ic
+  end;
+  if !errors > 0 then begin
+    Format.printf "Found %d file(s) with errors.@." !errors;
+    exit 1
+  end else
+    exit 0
-- 
GitLab