diff --git a/Makefile b/Makefile
index d54194c4305fd821607a814895512f6cf094bd5d..9b37922a1720555b0839db0ca6f186064b4dda95 100644
--- a/Makefile
+++ b/Makefile
@@ -285,6 +285,7 @@ DISTRIB_FILES:=\
       share/analysis-scripts/results_display.py                         \
       share/analysis-scripts/script_for_creduce_fatal.sh                \
       share/analysis-scripts/script_for_creduce_non_fatal.sh            \
+      share/analysis-scripts/source_filter.py                           \
       share/analysis-scripts/summary.py                                 \
       share/analysis-scripts/template.mk                                \
       $(wildcard share/emacs/*.el) share/autocomplete_frama-c           \
@@ -1998,6 +1999,7 @@ install:: install-lib-$(OCAMLBEST)
 	  share/analysis-scripts/results_display.py \
 	  share/analysis-scripts/script_for_creduce_fatal.sh \
 	  share/analysis-scripts/script_for_creduce_non_fatal.sh \
+	  share/analysis-scripts/source_filter.py \
 	  share/analysis-scripts/summary.py \
 	  share/analysis-scripts/template.mk \
 	  $(FRAMAC_DATADIR)/analysis-scripts
diff --git a/headers/header_spec.txt b/headers/header_spec.txt
index 1b6649697bff9801fa9a205532acf3c923c33b6f..7eef44ba09bb4562be65708698e210d792abaa1a 100644
--- a/headers/header_spec.txt
+++ b/headers/header_spec.txt
@@ -144,6 +144,7 @@ share/analysis-scripts/README.md: .ignore
 share/analysis-scripts/results_display.py: CEA_LGPL
 share/analysis-scripts/script_for_creduce_fatal.sh: .ignore
 share/analysis-scripts/script_for_creduce_non_fatal.sh: .ignore
+share/analysis-scripts/source_filter.py: CEA_LGPL
 share/analysis-scripts/summary.py: CEA_LGPL
 share/analysis-scripts/template.mk: .ignore
 share/compliance/c11_functions.json: .ignore
diff --git a/share/analysis-scripts/build.py b/share/analysis-scripts/build.py
index 88a0762d0d29a6b5ddf33700c44e972891749c16..9528e62b30d75f83d5686d6d0ef89b16ed1843c9 100755
--- a/share/analysis-scripts/build.py
+++ b/share/analysis-scripts/build.py
@@ -37,6 +37,7 @@ import sys
 import subprocess
 
 import function_finder
+import source_filter
 
 script_dir = os.path.dirname(sys.argv[0])
 
@@ -195,8 +196,7 @@ def copy_fc_stubs():
 # [funcname] in [filename].
 # [has_args] is used to distinguish between main(void) and main(int, char**).
 def find_definitions(funcname, filename):
-    with open(filename, encoding="ascii", errors='ignore') as data:
-        file_content = data.read()
+    file_content = source_filter.open_and_filter(filename, not under_test)
     file_lines = file_content.splitlines(keepends=True)
     newlines = function_finder.compute_newline_offsets(file_lines)
     defs = function_finder.find_definitions_and_declarations(True, False, filename, file_content, file_lines, newlines, funcname)
diff --git a/share/analysis-scripts/build_callgraph.py b/share/analysis-scripts/build_callgraph.py
index 6ab8ab2ab6b9848228dcc16a7c28bbb28c27792d..a1047a70e0eaa9d92044396ed947d5178c2a0822 100755
--- a/share/analysis-scripts/build_callgraph.py
+++ b/share/analysis-scripts/build_callgraph.py
@@ -30,6 +30,7 @@ import re
 import sys
 
 import function_finder
+import source_filter
 
 under_test = os.getenv("PTESTS_TESTING")
 
@@ -79,8 +80,7 @@ class Callgraph:
 def compute(files):
     cg = Callgraph()
     for f in files:
-        with open(f, encoding="ascii", errors='ignore') as data:
-            file_content = data.read()
+        file_content = source_filter.open_and_filter(f, not under_test)
         file_lines = file_content.splitlines(keepends=True)
         newlines = function_finder.compute_newline_offsets(file_lines)
         defs = function_finder.find_definitions_and_declarations(True, False, f, file_content, file_lines, newlines)
diff --git a/share/analysis-scripts/estimate_difficulty.py b/share/analysis-scripts/estimate_difficulty.py
index e7549fefd304cd5d7e27759c2101af0e4bad7fd3..c0bade9d03dfa84f9ba2674bb935ecca68ebd0d6 100755
--- a/share/analysis-scripts/estimate_difficulty.py
+++ b/share/analysis-scripts/estimate_difficulty.py
@@ -36,6 +36,7 @@ import tempfile
 
 import build_callgraph
 import function_finder
+import source_filter
 
 #TODO : avoid relativizing paths when introducing too many ".." ;
 #TODO : accept directory as argument (--full-tree), and then do glob **/*.{c,i} inside
@@ -78,16 +79,16 @@ def get_framac_libc_function_statuses(framac, framac_share):
     return (defined, spec_only)
 
 re_include = re.compile(r'\s*#\s*include\s*("|<)([^">]+)("|>)')
-def grep_includes_in_file(file):
-    with open(file, "r", encoding="utf-8", errors='ignore') as f:
-        i = 0
-        for line in f.readlines():
-            i += 1
-            m = re_include.match(line)
-            if m:
-                kind = m.group(1)
-                header = m.group(2)
-                yield((i,kind,header))
+def grep_includes_in_file(filename):
+    # Yields (line number, opening delimiter, header) for each #include;
+    # lines are numbered from 1 in the text returned by open_and_filter.
+    file_content = source_filter.open_and_filter(filename, not under_test)
+    # file_content is a str, not a file object: split it to iterate lines
+    for i, line in enumerate(file_content.splitlines(), start=1):
+        m = re_include.match(line)
+        if m:
+            kind, header = m.group(1), m.group(2)
+            yield((i,kind,header))
 
 def get_includes(files):
     quote_includes = {}
diff --git a/share/analysis-scripts/source_filter.py b/share/analysis-scripts/source_filter.py
new file mode 100644
index 0000000000000000000000000000000000000000..4156e072aee0fbf7b331d7fe8bbf18b2ae7b1a64
--- /dev/null
+++ b/share/analysis-scripts/source_filter.py
@@ -0,0 +1,90 @@
+#-*- coding: utf-8 -*-
+##########################################################################
+#                                                                        #
+#  This file is part of Frama-C.                                         #
+#                                                                        #
+#  Copyright (C) 2007-2021                                               #
+#    CEA (Commissariat à l'énergie atomique et aux énergies              #
+#         alternatives)                                                  #
+#                                                                        #
+#  you can redistribute it and/or modify it under the terms of the GNU   #
+#  Lesser General Public License as published by the Free Software       #
+#  Foundation, version 2.1.                                              #
+#                                                                        #
+#  It is distributed in the hope that it will be useful,                 #
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of        #
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         #
+#  GNU Lesser General Public License for more details.                   #
+#                                                                        #
+#  See the GNU Lesser General Public License version 2.1                 #
+#  for more details (enclosed in the file licenses/LGPLv2.1).            #
+#                                                                        #
+##########################################################################
+
+# This file provides some functions to open and filter source files
+# before they are used by other scripts. These filters help improve
+# the efficiency of regex-based heuristics.
+
+# These filters require external tools, either in the PATH, or in
+# environment variables (the latter has higher priority than the former).
+# - scc (a fork including option -k), to remove C comments (variable SCC);
+# - astyle, to re-indent lines (variable ASTYLE)
+# If a tool is absent, the filter is equivalent to a no-op.
+
+# These functions receive the contents of a source file as a string
+# (such as returned by read(), or by a previous filter) and return a
+# string containing the filtered output. They abort execution in case
+# of errors when running the filters. Note that an absent tool
+# does _not_ lead to an error.
+
+import os
+from   pathlib import Path
+import shutil
+import subprocess
+import sys
+
+# warnings about missing commands are disabled when PTESTS_TESTING is set
+emit_warns = os.getenv("PTESTS_TESTING") is None
+
+# Looks up an external tool: the path in environment variable env_var_name,
+# if set and non-empty, wins over a PATH search for command.
+# Returns a Path to the binary, or None (with a one-time notice) if absent.
+warned = {}
+def get_command(command, env_var_name):
+    location = os.getenv(env_var_name) or shutil.which(command)
+    if location:
+        return Path(location)
+    # tool is missing: tell the user once per command, unless warnings are off
+    if emit_warns and command not in warned:
+        print(f"info: optional external command '{command}' not found in PATH; consider installing it or setting environment variable {env_var_name}")
+        warned[command] = True
+    return None
+
+def run_and_check(command_and_args, input_data):
+    try:  # feeds input_data (str) to the command's stdin, returns its stdout (str)
+        return subprocess.check_output(command_and_args, input=input_data, stderr=None, encoding="ascii", errors="ignore")
+    except (subprocess.CalledProcessError, OSError) as e:  # non-zero exit, or could not be started
+        sys.exit(f"error running command: {command_and_args}\n{e}")
+
+def filter_with_scc(input_data):
+    # runs input_data through 'scc -k' if scc is available; otherwise, a no-op
+    scc_path = get_command("scc", "SCC")
+    if not scc_path:
+        return input_data
+    return run_and_check([scc_path, "-k"], input_data)
+
+def filter_with_astyle(input_data):
+    # runs input_data through astyle if it is available; otherwise, a no-op
+    astyle_path = get_command("astyle", "ASTYLE")
+    if not astyle_path:
+        return input_data
+    return run_and_check([astyle_path, "--keep-one-line-blocks", "--keep-one-line-statements"], input_data)
+
+def open_and_filter(filename, apply_filters):
+    # ASCII with ignored decoding errors lets us read sources in any
+    # encoding (UTF-8, ISO-8859, etc); filters run as scc, then astyle
+    with open(filename, "r", encoding="ascii", errors='ignore') as source:
+        contents = source.read()
+    if not apply_filters:
+        return contents
+    return filter_with_astyle(filter_with_scc(contents))