diff options
author | Hugo Herbelin <Hugo.Herbelin@inria.fr> | 2022-07-12 19:15:23 (GMT) |
---|---|---|
committer | Hugo Herbelin <Hugo.Herbelin@inria.fr> | 2022-07-15 15:05:44 (GMT) |
commit | 238f69c13e2768d68694a49c23b1d148b25695ef (patch) | |
tree | daccb3fd1f5bde3314438f7e98d7c420a55da3bd /clib | |
parent | 60c63f4a98f7665ce118e579c4046de4b91081f5 (diff) | |
download | coq-238f69c13e2768d68694a49c23b1d148b25695ef.zip coq-238f69c13e2768d68694a49c23b1d148b25695ef.tar.gz coq-238f69c13e2768d68694a49c23b1d148b25695ef.tar.bz2 |
Adding unicode classes Separator and Control.
Not strictly used by the lexer but useful to classify the ascii
characters too consistently with the non-ascii ones.
Diffstat (limited to 'clib')
-rw-r--r-- | clib/unicode.ml | 29 |
1 files changed, 22 insertions, 7 deletions
diff --git a/clib/unicode.ml b/clib/unicode.ml index 9202143..e1584d2 100644 --- a/clib/unicode.ml +++ b/clib/unicode.ml @@ -10,7 +10,7 @@ (** Unicode utilities *) -type status = Letter | IdentPart | Symbol | IdentSep | Unknown +type status = Letter | IdentPart | Symbol | IdentSep | Separator | Control | Unknown (* The following table stores classes of Unicode characters that are used by the lexer. There are 5 different classes so 3 bits @@ -31,6 +31,8 @@ let mask i = function | IdentPart -> 2 lsl ((i land 7) * 3) (* 010 *) | Symbol -> 3 lsl ((i land 7) * 3) (* 011 *) | IdentSep -> 4 lsl ((i land 7) * 3) (* 100 *) + | Separator -> 5 lsl ((i land 7) * 3) (* 101 *) + | Control -> 6 lsl ((i land 7) * 3) (* 110 *) | Unknown -> 0 lsl ((i land 7) * 3) (* 000 *) (* Helper to reset 3 bits in a word. *) @@ -59,6 +61,8 @@ let lookup x = else if v = 2 then IdentPart else if v = 3 then Symbol else if v = 4 then IdentSep + else if v = 5 then Separator + else if v = 6 then Control else Unknown (* [classify] discriminates between 5 different kinds of @@ -69,6 +73,7 @@ let classify = (* General tables. *) mk_lookup_table_from_unicode_tables_for Symbol [ + Unicodetable.sk; (* Symbol, modifiers. *) Unicodetable.sm; (* Symbol, maths. *) Unicodetable.sc; (* Symbol, currency. *) Unicodetable.so; (* Symbol, modifier. *) @@ -113,10 +118,20 @@ let classify = [(0x02074, 0x02079)]; (* Superscript 4-9. *) single 0x0002E; (* Dot. *) ]; + mk_lookup_table_from_unicode_tables_for Separator + [ + Unicodetable.zs; (* Separator, Space. *) + Unicodetable.zl; (* Separator, Line. *) + Unicodetable.zp; (* Separator, Paragraph. *) + ]; + mk_lookup_table_from_unicode_tables_for Control + [ + Unicodetable.cc; (* Other, Control. *) + ]; mk_lookup_table_from_unicode_tables_for IdentSep [ single 0x005F; (* Underscore. *) - single 0x00A0; (* Non breaking space. *) + single 0x00A0; (* Non breaking space, overrides Sep *) ]; mk_lookup_table_from_unicode_tables_for IdentPart [ @@ -206,7 +221,7 @@ let escaped_if_non_utf8 s = let is_valid_ident_initial = function | Letter | IdentSep -> true - | IdentPart | Symbol | Unknown -> false + | IdentPart | Symbol | Separator | Control | Unknown -> false let initial_refutation j n s = if is_valid_ident_initial (classify n) then None @@ -217,7 +232,7 @@ let initial_refutation j n s = let is_valid_ident_trailing = function | Letter | IdentSep | IdentPart -> true - | Symbol | Unknown -> false + | Symbol | Separator | Control | Unknown -> false let trailing_refutation i j n s = if is_valid_ident_trailing (classify n) then None @@ -228,15 +243,15 @@ let trailing_refutation i j n s = let is_unknown = function | Unknown -> true - | Letter | IdentSep | IdentPart | Symbol -> false + | Letter | IdentSep | IdentPart | Symbol | Separator | Control -> false let is_ident_part = function | IdentPart -> true - | Letter | IdentSep | Symbol | Unknown -> false + | Letter | IdentSep | Symbol | Unknown | Separator | Control -> false let is_ident_sep = function | IdentSep -> true - | Letter | IdentPart | Symbol | Unknown -> false + | Letter | IdentPart | Symbol | Unknown | Separator | Control -> false let ident_refutation s = if s = ".." then None else try |