From 2476e3bfe37080d61ee809e4a5e7821457bf1444 Mon Sep 17 00:00:00 2001 From: Michele Alberti <michele.alberti@cea.fr> Date: Mon, 19 Feb 2024 17:36:16 +0100 Subject: [PATCH] [stdlib] Define standalone theory for csv dataset robustness. --- src/interpretation.ml | 60 ++++++++++++++++++++++++++++++---- src/language.ml | 20 +++++------- stdlib/dataset.mlw | 29 +++++++++++----- stdlib/dune | 2 +- stdlib/robustness.mlw | 39 ++++++++++++++++++++-- tests/interpretation_dataset.t | 38 ++++----------------- 6 files changed, 126 insertions(+), 62 deletions(-) diff --git a/src/interpretation.ml b/src/interpretation.ml index 92469b0..3120a10 100644 --- a/src/interpretation.ml +++ b/src/interpretation.ml @@ -280,13 +280,10 @@ let caisar_builtins : caisar_env CRE.built_in_theories list = in (* Dataset *) - let read_dataset : _ CRE.builtin = + let read_dataset_csv : _ CRE.builtin = fun engine ls vl ty -> match vl with - | [ - Term { t_node = Tconst (ConstStr dataset); _ }; - Term { t_node = Tapp ({ ls_name = { id_string = "CSV"; _ }; _ }, []); _ }; - ] -> + | [ Term { t_node = Tconst (ConstStr dataset); _ } ] -> let { env; cwd; _ } = CRE.user_env engine in let caisar_op = let filename = Stdlib.Filename.concat cwd dataset in @@ -295,7 +292,49 @@ let caisar_builtins : caisar_env CRE.built_in_theories list = Dataset (ds, dataset) in value_term (term_of_caisar_op engine caisar_op ty) - | [ Term _t1; Term _t2 ] -> reconstruct () + | [ Term _t1 ] -> reconstruct () + | _ -> invalid_arg (error_message ls) + in + let min_label : _ CRE.builtin = + fun engine ls vl _ty -> + match vl with + | [ Term ({ t_node = Tapp (ls1, _); _ } as _t1) ] -> ( + match caisar_op_of_ls engine ls1 with + | Dataset (_, DS_csv csv) -> + (* TODO: test for non-empty csv. *) + let min_label = + List.fold_left csv ~init:None ~f:(fun min_label elt -> + let label = Int.of_string (List.hd_exn elt) in + Some + (match min_label with + | None -> label + | Some min_label -> min min_label label)) + in + let min_label = Option.value_exn min_label in + value_int (BigInt.of_int min_label) + | _ -> assert false) + | [ Term _t1 ] -> reconstruct () + | _ -> invalid_arg (error_message ls) + in + let max_label : _ CRE.builtin = + fun engine ls vl _ty -> + match vl with + | [ Term ({ t_node = Tapp (ls1, _); _ } as _t1) ] -> ( + match caisar_op_of_ls engine ls1 with + | Dataset (_, DS_csv csv) -> + (* TODO: test for non-empty csv. *) + let max_label = + List.fold_left csv ~init:None ~f:(fun max_label elt -> + let label = Int.of_string (List.hd_exn elt) in + Some + (match max_label with + | None -> label + | Some max_label -> max max_label label)) + in + let max_label = Option.value_exn max_label in + value_int (BigInt.of_int max_label) + | _ -> assert false) + | [ Term _t1 ] -> reconstruct () | _ -> invalid_arg (error_message ls) in @@ -316,7 +355,14 @@ let caisar_builtins : caisar_env CRE.built_in_theories list = ([ "read_neural_network" ], None, read_neural_network); ([ Ident.op_infix "@@" ], None, apply_neural_network); ] ); - ([ "dataset" ], "Dataset", [], [ ([ "read_dataset" ], None, read_dataset) ]); + ( [ "dataset" ], + "DatasetCSV", + [], + [ + ([ "read_dataset_csv" ], None, read_dataset_csv); + ([ "min_label" ], None, min_label); + ([ "max_label" ], None, max_label); + ] ); ] let bounded_quant engine vs ~cond : CRE.bounded_quant_result option = diff --git a/src/language.ml b/src/language.ml index 39ea448..971f59c 100644 --- a/src/language.ml +++ b/src/language.ml @@ -160,19 +160,19 @@ let register_ovo_support () = let vectors = Term.Hls.create 10 -let float64_t_ty env = +let ty_float64_t env = let th = Env.read_theory env [ "ieee_float" ] "Float64" in Ty.ty_app (Theory.ns_find_ts th.th_export [ "t" ]) [] -let vector_ty env ty_elt = +let ty_vector env ty_elt = let th = Env.read_theory env [ "vector" ] "Vector" in Ty.ty_app (Theory.ns_find_ts th.th_export [ "vector" ]) [ ty_elt ] let create_vector = Env.Wenv.memoize 13 (fun env -> let h = Hashtbl.create (module Int) in - let ty_elt = float64_t_ty env in - let ty = vector_ty env ty_elt in + let ty_elt = ty_float64_t env in + let ty = ty_vector env ty_elt in Hashtbl.findi_or_add h ~default:(fun length -> let ls = let id = Ident.id_fresh "vector" in @@ -208,7 +208,7 @@ let fresh_nn_ls env name = let create_nn_nnet = Env.Wenv.memoize 13 (fun env -> let h = Hashtbl.create (module String) in - let ty_elt = float64_t_ty env in + let ty_elt = ty_float64_t env in Hashtbl.findi_or_add h ~default:(fun filename -> let ls = fresh_nn_ls env "nnet_nn" in let nn = @@ -230,7 +230,7 @@ let create_nn_nnet = let create_nn_onnx = Env.Wenv.memoize 13 (fun env -> let h = Hashtbl.create (module String) in - let ty_elt = float64_t_ty env in + let ty_elt = ty_float64_t env in Hashtbl.findi_or_add h ~default:(fun filename -> let ls = fresh_nn_ls env "onnx_nn" in let onnx = @@ -269,11 +269,9 @@ let datasets = Term.Hls.create 10 let fresh_dataset_csv_ls env name = let ty = - let ty_elt = vector_ty env (float64_t_ty env) in - let th = Env.read_theory env [ "dataset" ] "Dataset" in - Ty.ty_app - (Theory.ns_find_ts th.th_export [ "dataset" ]) - [ Ty.ty_int; ty_elt ] + let ty_feature = ty_float64_t env in + let th = Env.read_theory env [ "dataset" ] "DatasetCSV" in + Ty.ty_app (Theory.ns_find_ts th.th_export [ "dataset" ]) [ ty_feature ] in let id = Ident.id_fresh name in Term.create_fsymbol id [] ty diff --git a/stdlib/dataset.mlw b/stdlib/dataset.mlw index 06aa7bb..050a3a6 100644 --- a/stdlib/dataset.mlw +++ b/stdlib/dataset.mlw @@ -22,21 +22,32 @@ (** {1 Datasets} *) -(** {2 Generic Datasets} *) +(** {2 CSV Datasets} -theory Dataset +A dataset in CSV format is such that each element is given as: +- first column is the label, +- rest of the columns are the vector features. +*) + +theory DatasetCSV + + use int.Int use vector.Vector - type dataset 'a 'b = vector ('a, 'b) - type format = CSV + type label_ = int + type features 'a = vector 'a + type dataset 'a = vector (label_, features 'a) + + function read_dataset_csv (f: string) : dataset 'a - function read_dataset (f: string) (k: format) : dataset 'a 'b + function min_label (d: dataset 'a) : label_ + function max_label (d: dataset 'a) : label_ - predicate forall_ (d: dataset 'a 'b) (f: 'a -> 'b -> bool) = - Vector.forall_ d (fun e -> let a, b = e in f a b) + predicate forall_ (d: dataset 'a) (f: label_ -> features 'a -> bool) = + Vector.forall_ d (fun e -> let i, a = e in f i a) - function foreach (d: dataset 'a 'b) (f: 'a -> 'b -> 'c) : vector 'c = - Vector.foreach d (fun e -> let a, b = e in f a b) + function foreach (d: dataset 'a) (f: label_ -> features 'a -> 'b) : vector 'b = + Vector.foreach d (fun e -> let i, a = e in f i a) end diff --git a/stdlib/dune b/stdlib/dune index eea0f53..46fd7c7 100644 --- a/stdlib/dune +++ b/stdlib/dune @@ -2,5 +2,5 @@ (section (site (caisar stdlib))) - (files caisar.mlw vector.mlw nn.mlw dataset.mlw) + (files caisar.mlw robustness.mlw vector.mlw nn.mlw dataset.mlw) (package caisar)) diff --git a/stdlib/robustness.mlw b/stdlib/robustness.mlw index c012769..1142f8c 100644 --- a/stdlib/robustness.mlw +++ b/stdlib/robustness.mlw @@ -22,8 +22,43 @@ (** {1 Robustness} *) -(** {2 Dataset Robustness} *) +(** {2 Robustness of CSV Datasets} *) -theory DatasetRobustness +theory RobustDatasetCSV + + use ieee_float.Float64 + use int.Int + use vector.Vector + use nn.NeuralNetwork + use dataset.DatasetCSV + + type elt = features t + + predicate valid_label (min_label: label_) (max_label: label_) (l: label_) = + min_label <= l <= max_label + + predicate valid_elt (e: elt) = + forall v: index. valid_index e v -> (0.0: t) .<= e[v] .<= (1.0: t) + + predicate bounded_by_epsilon (e: elt) (eps: t) = + forall v: index. valid_index e v -> .- eps .<= e[v] .<= eps + + predicate robust_around (advises: elt -> label_ -> bool) (eps: t) (l: label_) (e: elt) = + forall perturbed_elt: elt. + has_length perturbed_elt (length e) -> + valid_elt perturbed_elt -> + let perturbation = perturbed_elt - e in + bounded_by_epsilon perturbation eps -> + advises perturbed_elt l + + predicate advises (valid_label: label_ -> bool) (nn: nn) (e: elt) (l: label_) = + valid_label l -> + forall j: int. valid_label j -> j <> l -> (nn @@ e)[l] .>= (nn @@ e)[j] + + predicate robust (nn: nn) (d: dataset t) (eps: t) = + let min_label = DatasetCSV.min_label d in + let max_label = DatasetCSV.max_label d in + let advises = advises (valid_label min_label max_label) nn in + DatasetCSV.forall_ d (robust_around advises eps) end \ No newline at end of file diff --git a/tests/interpretation_dataset.t b/tests/interpretation_dataset.t index 64786e2..d4e8cbd 100644 --- a/tests/interpretation_dataset.t +++ b/tests/interpretation_dataset.t @@ -1,6 +1,6 @@ Test interpret on dataset $ cat - > dataset.csv << EOF - > 1,0.0,1.0,0.784313725,0.019607843,0.776470588 + > 2,0.0,1.0,0.784313725,0.019607843,0.776470588 > 0,0.941176471,0,0,0.011764706,0.039215686 > EOF @@ -13,39 +13,13 @@ Test interpret on dataset > theory T > use ieee_float.Float64 > use int.Int - > use vector.Vector > use nn.NeuralNetwork - > use dataset.Dataset - > - > type image = vector t - > type label_ = int - > - > predicate valid_image (i: image) = - > forall v: index. valid_index i v -> (0.0: t) .<= i[v] .<= (1.0: t) - > - > predicate valid_label (l: label_) = 0 <= l <= 2 - > - > predicate advises (n: nn) (i: image) (l: label_) = - > valid_label l -> - > forall j: int. valid_label j -> j <> l -> (n@@i)[l] .> (n@@i)[j] - > - > predicate bounded_by_epsilon (i: image) (eps: t) = - > forall v: index. valid_index i v -> .- eps .<= i[v] .<= eps - > - > predicate robust_around (n: nn) (eps: t) (l: label_) (i: image) = - > forall perturbed_image: image. - > has_length perturbed_image (length i) -> - > valid_image perturbed_image -> - > let perturbation = perturbed_image - i in - > bounded_by_epsilon perturbation eps -> - > advises n perturbed_image l - > - > predicate robust (n: nn) (d: dataset label_ image) (eps: t) = - > Dataset.forall_ d (robust_around n eps) + > use dataset.DatasetCSV + > use robustness.RobustDatasetCSV > > goal G: > let nn = read_neural_network "TestNetwork.nnet" NNet in - > let dataset = read_dataset "dataset.csv" CSV in + > let dataset = read_dataset_csv "dataset.csv" in > let eps = (0.0100000000000000002081668171172168513294309377670288085937500000:t) in > robust nn dataset eps > end @@ -71,7 +45,7 @@ Test interpret on dataset x3 <= 0.029607843000000001743021726952065364457666873931884765625 x4 >= 0.7664705880000000082219457908649928867816925048828125 x4 <= 0.7864705880000000259855141848674975335597991943359375 - +y0 -y1 >= 0 + +y0 -y2 >= 0 [DEBUG]{ProverSpec} Prover-tailored specification: x0 >= 0.0 @@ -94,7 +68,7 @@ Test interpret on dataset x3 <= 0.029607843000000001743021726952065364457666873931884765625 x4 >= 0.7664705880000000082219457908649928867816925048828125 x4 <= 0.7864705880000000259855141848674975335597991943359375 - +y2 -y1 >= 0 + +y1 -y2 >= 0 [DEBUG]{ProverSpec} Prover-tailored specification: x0 >= 0.0 -- GitLab