From 1e912a3070ee779cbb0a475fe3585c80cde2a40a Mon Sep 17 00:00:00 2001
From: Andre Maroneze <andre.maroneze@cea.fr>
Date: Fri, 16 Jul 2021 15:34:18 +0200
Subject: [PATCH] [Makefile] use OCaml-based, more efficient, tool to check for
 non-UTF-8 files

---
 .gitignore    |  1 +
 Makefile      | 49 +++++++++++++++++++++++++---------
 bin/isutf8.ml | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 111 insertions(+), 12 deletions(-)
 create mode 100644 bin/isutf8.ml

diff --git a/.gitignore b/.gitignore
index f674592c232..90610825c23 100644
--- a/.gitignore
+++ b/.gitignore
@@ -65,6 +65,7 @@ autom4te.cache
 /devel_tools/fc-memuse
 /bin/ocamldep_transitive_closure
 /bin/check_newlines
+/bin/isutf8
 
 #share
 /share/Makefile.config
diff --git a/Makefile b/Makefile
index 4f8cb85784b..62d72ea62cc 100644
--- a/Makefile
+++ b/Makefile
@@ -2151,9 +2151,31 @@ else
 	$(OCAMLC) unix.cma $< -o $@
 endif
 
+.PHONY: check-newlines-clean # command target: deletes $(CHECK_NEWLINES) and its .cm*/.o files
+check-newlines-clean:
+	$(RM) $(CHECK_NEWLINES) bin/check_newlines.cm* bin/check_newlines.o
+clean:: check-newlines-clean
+
+ISUTF8:=./bin/isutf8$(EXE)
+# Build the UTF-8 checker from bin/isutf8.ml: with $(OCAMLOPT) when OCAMLBEST is 'opt', with $(OCAMLC) otherwise.
+$(ISUTF8): bin/isutf8.ml
+	$(PRINT_MAKING)	$@
+ifeq ($(OCAMLBEST),opt)
+	$(OCAMLOPT) $< -o $@
+else
+	$(OCAMLC) $< -o $@
+endif
+
+.PHONY: isutf8-clean # command target: deletes $(ISUTF8) and its .cm*/.o files
+isutf8-clean:
+	$(RM) $(ISUTF8) bin/isutf8.cm* bin/isutf8.o
+clean:: isutf8-clean
+
 FILES_WITHOUT_NEWLINE := \
   VERSION \
-  VERSION_CODENAME \
+  VERSION_CODENAME
+# Files passed as ignore-arguments to both $(CHECK_NEWLINES) and $(ISUTF8) in check-headers.
+BINARY_DISTRIB_FILES := \
   $(sort $(wildcard ivette/src/dome/doc/template/static/fonts/*)) \
   $(sort $(wildcard share/*.ico share/*.png share/theme/*/*.png))
 
@@ -2163,6 +2185,13 @@ TESTS_WITHOUT_NEWLINE := \
   tests/verisec/suite/programs/apps/sendmail/CVE-1999-0047/mime7to8/array_vs_pointer.ods \
   tests/verisec/suite/programs/apps/sendmail/CVE-1999-0047/mime7to8/data_testing.ods \
 
+# Test files passed as ignore-arguments to both $(CHECK_NEWLINES) and $(ISUTF8) in check-headers.
+BINARY_DISTRIB_TESTS := \
+  tests/misc/oracle/interpreted_automata_dataflow_backward.dot \
+  tests/misc/oracle/interpreted_automata_dataflow_forward.dot \
+  tests/verisec/suite/programs/apps/SpamAssassin/BID-6679/message_write/test \
+  tests/verisec/suite/programs/apps/sendmail/CVE-1999-0047/mime7to8/array_vs_pointer.ods \
+  tests/verisec/suite/programs/apps/sendmail/CVE-1999-0047/mime7to8/data_testing.ods
 # OPEN_SOURCE: set it to 'yes' if you want to check open source headers
 # STRICT_HEADERS: set it to 'yes' if you want to consider warnings as errors
 # The target check-headers does the following checks:
@@ -2175,7 +2204,7 @@ TESTS_WITHOUT_NEWLINE := \
 # because identical headers but with different encodings are not exactly
 # easy to distinguish
 .PHONY: check-headers
-check-headers: $(HDRCK) $(CHECK_NEWLINES)
+check-headers: $(HDRCK) $(CHECK_NEWLINES) $(ISUTF8)
 	$(PRINT) "Checking $(DISTRIB_HEADERS) headers (OPEN_SOURCE=$(OPEN_SOURCE), CURRENT_HEADERS=$(CURRENT_HEADERS))..."
 	$(PRINT) "- HEADER_SPEC_FILE=$(HEADER_SPEC_FILE)"
 	$(PRINT) "- CURRENT_HEADER_DIRS=$(CURRENT_HEADER_DIRS)"
@@ -2185,16 +2214,12 @@ check-headers: $(HDRCK) $(CHECK_NEWLINES)
 	$(file >distrib_tests.tmp) $(foreach O,$(DISTRIB_TESTS),$(file >>distrib_tests.tmp,$O))
 	$(file >header_exceptions.tmp) $(foreach O,$(HEADER_EXCEPTIONS),$(file >>header_exceptions.tmp,$O))
 	echo "Checking that distributed files terminate with a newline..."
-	$(CHECK_NEWLINES) distrib_files.tmp $(FILES_WITHOUT_NEWLINE)
-	$(CHECK_NEWLINES) distrib_tests.tmp $(TESTS_WITHOUT_NEWLINE)
-	@if command -v file >/dev/null 2>/dev/null; then \
-		echo "Checking that distributed files do not use iso-8859..."; \
-		file --mime-encoding -f distrib_files.tmp -f distrib_tests.tmp | \
-			grep "iso-8859" \
-			| $(SED) "s/^/error: invalid encoding in /" \
-			| ( ! grep "error: invalid encoding" ); \
-	else echo "command 'file' not found, skipping encoding checks"; \
-	fi
+	$(CHECK_NEWLINES) distrib_files.tmp $(FILES_WITHOUT_NEWLINE) $(BINARY_DISTRIB_FILES)
+	$(CHECK_NEWLINES) distrib_tests.tmp $(TESTS_WITHOUT_NEWLINE) $(BINARY_DISTRIB_TESTS)
+	echo "Checking that distributed files are valid UTF-8..."
+	$(ISUTF8) distrib_files.tmp $(BINARY_DISTRIB_FILES)
+	$(ISUTF8) distrib_tests.tmp $(BINARY_DISTRIB_TESTS)
+	echo "Checking headers..."
 	$(HDRCK) \
 		$(HDRCK_EXTRA) \
 		$(addprefix -header-dirs ,$(CURRENT_HEADER_DIRS)) \
diff --git a/bin/isutf8.ml b/bin/isutf8.ml
new file mode 100644
index 00000000000..ccfedc46857
--- /dev/null
+++ b/bin/isutf8.ml
@@ -0,0 +1,89 @@
+module StringSet = Set.Make(String)
+
+(* Raised internally to stop scanning at the first invalid byte. *)
+exception False
+
+(* Size in bytes of the buffer used to read each file. *)
+let buffer_size = 1024
+
+(* [is_valid_utf8 filename] reads [filename] in binary mode and returns
+   [true] iff its bytes have the structure of UTF-8: every byte above 127
+   is either a lead byte (110xxxxx, 1110xxxx or 11110xxx) followed by
+   exactly 1, 2 or 3 continuation bytes (10xxxxxx), and the file does not
+   end in the middle of such a sequence.
+   Only this structure is checked: overlong encodings, surrogates and code
+   points above U+10FFFF are not rejected.
+   If opening or reading the file raises [Sys_error], a message is printed
+   and [true] is returned, i.e. the file is ignored. *)
+let is_valid_utf8 filename =
+  let buf = Bytes.create buffer_size in
+  try
+    let ic = open_in_bin filename in
+    (* Close the channel on every exit path (end of file, invalid byte,
+       read error), so that checking a long list of files does not leak
+       file descriptors. *)
+    Fun.protect ~finally:(fun () -> close_in_noerr ic) (fun () ->
+        (* number of continuation bytes still expected *)
+        let extra = ref 0 in
+        try
+          while true do
+            let n_bytes_read = input ic buf 0 buffer_size in
+            if n_bytes_read = 0 then raise End_of_file;
+            for i = 0 to n_bytes_read - 1 do
+              let c = Bytes.get_uint8 buf i in
+              if !extra > 0 then begin
+                decr extra;
+                if c lsr 6 <> 2 then raise False
+              end
+              else if c > 127 then begin
+                if c lsr 5 = 6 then extra := 1
+                else if c lsr 4 = 14 then extra := 2
+                else if c lsr 3 = 30 then extra := 3
+                else raise False
+              end
+            done
+          done;
+          assert false (* the loop above only exits via an exception *)
+        with
+        | End_of_file ->
+          (* valid only if no multi-byte sequence is left unfinished *)
+          !extra = 0
+        | False ->
+          false)
+  with
+  | Sys_error _ ->
+    (* possibly a non-existing file (e.g. with spaces); ignoring *)
+    Format.printf "could not open, ignoring file: %s@." filename;
+    true
+
+(* usage: first argument is a file name containing a list of files
+   (one per line) to be checked; the remaining arguments are filenames
+   to be ignored during checking.
+   Exits with status 1 if at least one checked file is invalid, and with
+   status 0 otherwise (including when no argument is given). *)
+let () =
+  if Array.length Sys.argv < 2 then begin
+    Format.printf "usage: %s file_list.txt [ignore1 ignore2 ...]@." Sys.argv.(0);
+    exit 0
+  end;
+  let errors = ref 0 in
+  let file_list_ic = open_in Sys.argv.(1) in
+  let to_ignore = StringSet.of_list (List.tl (Array.to_list Sys.argv)) in
+  begin
+    try
+      while true do
+        let filename = input_line file_list_ic in
+        if not (StringSet.mem filename to_ignore)
+        && not (is_valid_utf8 filename) then begin
+          incr errors;
+          Format.printf "error: invalid UTF-8 in file: %s@." filename
+        end
+      done
+    with End_of_file ->
+      close_in file_list_ic
+  end;
+  if !errors > 0 then begin
+    Format.printf "Found %d file(s) with errors.@." !errors;
+    exit 1
+  end else
+    exit 0
-- 
GitLab