From 374bea4a03cc98af6db4cf7265098bd6ed932e84 Mon Sep 17 00:00:00 2001 From: Jens Maurer Date: Tue, 8 Jun 2021 17:56:01 +0200 Subject: [PATCH] P1949R7 C++ Identifier Syntax using Unicode Standard Annex 31 --- source/back.tex | 6 ++ source/compatibility.tex | 31 ++++++--- source/intro.tex | 9 +++ source/lex.tex | 115 +++++++++++---------------------- source/std.tex | 1 + source/uax31.tex | 133 +++++++++++++++++++++++++++++++++++++++ 6 files changed, 206 insertions(+), 89 deletions(-) create mode 100644 source/uax31.tex diff --git a/source/back.tex b/source/back.tex index 57195eeda2..0314b1007c 100644 --- a/source/back.tex +++ b/source/back.tex @@ -23,6 +23,12 @@ \chapter{Bibliography} \doccite{Unicode Text Segmentation} [online]. Edited by Mark Davis. Revision 35; issued for Unicode 12.0.0. 2019-02-15 [viewed 2020-02-23]. Available from: \url{http://www.unicode.org/reports/tr29/tr29-35.html} +\item + The Unicode Consortium. Unicode Standard Annex, UAX \#31, + \doccite{Unicode Identifier and Pattern Syntax} [online]. + Edited by Mark Davis. Revision 33; issued for Unicode 13.0.0. + 2020-02-13 [viewed 2021-06-08]. + Available from: \url{https://www.unicode.org/reports/tr31/tr31-33.html} \item IANA Time Zone Database. Available from: \url{https://www.iana.org/time-zones} diff --git a/source/compatibility.tex b/source/compatibility.tex index 09dbc7a1bb..70723677af 100644 --- a/source/compatibility.tex +++ b/source/compatibility.tex @@ -1,15 +1,28 @@ %!TEX root = std.tex \infannex{diff}{Compatibility} -% TODO: Add this once we have differences. - -%\rSec1[diff.cpp20]{\Cpp{} and ISO \CppXX{}} -% -%\pnum -%\indextext{summary!compatibility with ISO \CppXX{}}% -%This subclause lists the differences between \Cpp{} and -%ISO \CppXX{} (ISO/IEC 14882:2020, \doccite{Programming Languages --- \Cpp{}}), -%by the chapters of this document. 
+\rSec1[diff.cpp20]{\Cpp{} and ISO \CppXX{}} + +\rSec2[diff.cpp20.general]{General} + +\pnum +\indextext{summary!compatibility with ISO \CppXX{}}% +Subclause \ref{diff.cpp20} lists the differences between \Cpp{} and +ISO \CppXX{} (ISO/IEC 14882:2020, \doccite{Programming Languages --- \Cpp{}}), +by the chapters of this document. + +\rSec2[diff.cpp20.lex]{\ref{lex}: lexical conventions} + +\diffref{lex.name} +\change +Previously valid identifiers containing characters +not present in UAX \#44 properties XID_Start or XID_Continue, or +not in Normalization Form C, are now rejected. +\rationale +Prevent confusing characters in identifiers. +Requiring normalization of names ensures consistent linker behavior. +\effect +Some identifiers are no longer well-formed. \rSec1[diff.cpp17]{\Cpp{} and ISO \CppXVII{}} diff --git a/source/intro.tex b/source/intro.tex index 9ab77c231e..72fd45d75a 100644 --- a/source/intro.tex +++ b/source/intro.tex @@ -70,6 +70,15 @@ \end{footnote} Language Specification}, Standard Ecma-262, third edition, 1999. +\item +The Unicode Consortium. +Unicode Standard Annex, UAX \#44, \doccite{Unicode Character Database}. +Edited by Ken Whistler and Lauren\c{t}iu Iancu. +Available from: \url{http://www.unicode.org/reports/tr44/} +\item +The Unicode Consortium. +The Unicode Standard, \doccite{Derived Core Properties}. 
+Available from: \url{https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt} \end{itemize} \pnum diff --git a/source/lex.tex b/source/lex.tex index b0ef313853..eeab984043 100644 --- a/source/lex.tex +++ b/source/lex.tex @@ -325,6 +325,7 @@ string-literal\br user-defined-string-literal\br preprocessing-op-or-punc\br + \textnormal{each} universal-character-name \textnormal{that cannot be one of the above}\br \textnormal{each non-whitespace character that cannot be one of the above} \end{bnf} @@ -340,8 +341,12 @@ (\grammarterm{import-keyword}, \grammarterm{module-keyword}, and \grammarterm{export-keyword}), identifiers, preprocessing numbers, character literals (including user-defined character literals), string literals (including user-defined string literals), preprocessing -operators and punctuators, and single non-whitespace characters that do not lexically -match the other preprocessing token categories. If a \tcode{'} or a \tcode{"} character +operators and punctuators, and single \grammarterm{universal-character-name}s and non-whitespace characters that do not lexically +match the other preprocessing token categories. +If a single \grammarterm{universal-character-name} +does not match any of the other preprocessing token categories, +the program is ill-formed. +If a \tcode{'} or a \tcode{"} character matches the last category, the behavior is undefined. 
Preprocessing tokens can be separated by \indextext{whitespace}% @@ -602,8 +607,7 @@ \nontermdef{pp-number}\br digit\br \terminal{.} digit\br - pp-number digit\br - pp-number identifier-nondigit\br + pp-number identifier-continue\br pp-number \terminal{'} digit\br pp-number \terminal{'} nondigit\br pp-number \terminal{e} sign\br @@ -630,15 +634,21 @@ \indextext{identifier|(}% \begin{bnf} \nontermdef{identifier}\br - identifier-nondigit\br - identifier identifier-nondigit\br - identifier digit + identifier-start\br + identifier identifier-continue \end{bnf} \begin{bnf} -\nontermdef{identifier-nondigit}\br +\nontermdef{identifier-start}\br nondigit\br - universal-character-name + universal-character-name \textnormal{of class XID_Start} +\end{bnf} + +\begin{bnf} +\nontermdef{identifier-continue}\br + digit\br + nondigit\br + universal-character-name \textnormal{of class XID_Continue} \end{bnf} \begin{bnf} @@ -657,15 +667,8 @@ \pnum \indextext{name!length of}% \indextext{name}% -An identifier is an arbitrarily long sequence of letters and digits. -Each \grammarterm{universal-character-name} in an identifier shall designate a -character whose encoding in ISO/IEC 10646 falls into one of the ranges -specified in \tref{lex.name.allowed}. -The initial element shall not be a \grammarterm{universal-character-name} -designating a character whose encoding falls into one of the ranges -specified in \tref{lex.name.disallowed}. -Upper- and lower-case letters are -different. All characters are significant. +The character classes XID_Start and XID_Continue +are Derived Core Properties as described by UAX \#44. \begin{footnote} On systems in which linkers cannot accept extended characters, an encoding of the \grammarterm{universal-character-name} can be used in @@ -674,69 +677,21 @@ \tcode{\textbackslash u} in a \grammarterm{universal-character-name}.
Extended characters can produce a long external identifier, but \Cpp{} does not place a translation limit on significant characters for external -identifiers. In \Cpp{}, upper- and lower-case letters are considered -different for all identifiers, including external identifiers. +identifiers. \end{footnote} - -\begin{floattable}{Ranges of characters allowed}{lex.name.allowed} -{lllll} -\topline -\tcode{00A8} & -\tcode{00AA} & -\tcode{00AD} & -\tcode{00AF} & -\tcode{00B2-00B5} \\ -\tcode{00B7-00BA} & -\tcode{00BC-00BE} & -\tcode{00C0-00D6} & -\tcode{00D8-00F6} & -\tcode{00F8-00FF} \\ -\tcode{0100-167F} & -\tcode{1681-180D} & -\tcode{180F-1FFF} &&\\ -\tcode{200B-200D} & -\tcode{202A-202E} & -\tcode{203F-2040} & -\tcode{2054} & -\tcode{2060-206F} \\ -\tcode{2070-218F} & -\tcode{2460-24FF} & -\tcode{2776-2793} & -\tcode{2C00-2DFF} & -\tcode{2E80-2FFF} \\ -\tcode{3004-3007} & -\tcode{3021-302F} & -\tcode{3031-D7FF} && \\ -\tcode{F900-FD3D} & -\tcode{FD40-FDCF} & -\tcode{FDF0-FE44} & -\tcode{FE47-FFFD} & \\ -\tcode{10000-1FFFD} & -\tcode{20000-2FFFD} & -\tcode{30000-3FFFD} & -\tcode{40000-4FFFD} & -\tcode{50000-5FFFD} \\ -\tcode{60000-6FFFD} & -\tcode{70000-7FFFD} & -\tcode{80000-8FFFD} & -\tcode{90000-9FFFD} & -\tcode{A0000-AFFFD} \\ -\tcode{B0000-BFFFD} & -\tcode{C0000-CFFFD} & -\tcode{D0000-DFFFD} & -\tcode{E0000-EFFFD} & -\\ -\end{floattable} - -\begin{floattable}{Ranges of characters disallowed initially (combining characters)}{lex.name.disallowed} -{llll} -\topline -\tcode{0300-036F} & -% FIXME: Unicode v7 adds 1AB0-1AFF -\tcode{1DC0-1DFF} & -\tcode{20D0-20FF} & -\tcode{FE20-FE2F} \\ -\end{floattable} +The program is ill-formed +if an \grammarterm{identifier} does not conform to +Normalization Form C as specified in ISO/IEC 10646. +\begin{note} +Upper- and lower-case letters are considered different for all identifiers. 
+\end{note} +\begin{note} +In translation phase 4, +\grammarterm{identifier} also includes +those \grammarterm{preprocessing-token}s\iref{lex.pptoken} +differentiated as keywords\iref{lex.key} +in the later translation phase 7\iref{lex.token}. +\end{note} \pnum \indextext{\idxcode{import}}% diff --git a/source/std.tex b/source/std.tex index b9c20d4265..5b4a97f572 100644 --- a/source/std.tex +++ b/source/std.tex @@ -146,6 +146,7 @@ \include{limits} \include{compatibility} \include{future} +\include{uax31} %%-------------------------------------------------- %% back matter diff --git a/source/uax31.tex b/source/uax31.tex new file mode 100644 index 0000000000..392e74ec96 --- /dev/null +++ b/source/uax31.tex @@ -0,0 +1,133 @@ +%!TEX root = std.tex +\infannex{uaxid}{Conformance with UAX \#31} + +\rSec1[uaxid.general]{General} + +\pnum +This Annex describes the choices made in application of +UAX \#31 (``Unicode Identifier and Pattern Syntax'') +to \Cpp{} in terms of the requirements from UAX \#31 and +how they do or do not apply to \Cpp{}. +In terms of UAX \#31, +\Cpp{} conforms by meeting the requirements +R1 ``Default Identifiers'' and +R4 ``Equivalent Normalized Identifiers''. +The other requirements, also listed below, +are either alternatives not taken or do not apply to \Cpp{}. + +\rSec1[uaxid.def]{R1 Default identifiers} + +\rSec2[uaxid.def.general]{General} + +\pnum +UAX \#31 specifies a default syntax for identifiers +based on properties from the Unicode Character Database, UAX \#44. +The general syntax is +\begin{codeblock} +<Identifier> := <Start> <Continue>* (<Medial> <Continue>+)* +\end{codeblock} +where \tcode{<Start>} has the XID_Start property, +\tcode{<Continue>} has the XID_Continue property, and +\tcode{<Medial>} is a list of characters permitted between continue characters. +For \Cpp{} we add the character U+005F, LOW LINE, or \tcode{_}, +to the set of permitted \tcode{<Start>} characters, +the \tcode{<Medial>} set is empty, and +the \tcode{<Continue>} characters are unmodified.
+In the grammar used in UAX \#31, this is +\begin{codeblock} +<Identifier> := <Start> <Continue>* +<Start> := XID_Start + U+005F +<Continue> := <Start> + XID_Continue +\end{codeblock} + +\pnum +This is described in the \Cpp{} grammar in \ref{lex.name}, +where \grammarterm{identifier} is formed from +\grammarterm{identifier-start} or +\grammarterm{identifier} followed by \grammarterm{identifier-continue}. + +\rSec2[uaxid.def.rfmt]{R1a Restricted format characters} + +\pnum +If an implementation of UAX \#31 wishes to allow format characters +such as ZERO WIDTH JOINER or ZERO WIDTH NON-JOINER +it must define a profile allowing them, or +describe precisely which combinations are permitted. + +\pnum +\Cpp{} does not allow format characters in identifiers, so this does not apply. + +\rSec2[uaxid.def.stable]{R1b Stable identifiers} + +\pnum +An implementation of UAX \#31 may choose to guarantee +that identifiers are stable across versions of the Unicode Standard. +Once a string qualifies as an identifier it does so in all future versions. + +\pnum +\Cpp{} does not make this guarantee, +except to the extent that UAX \#31 guarantees +the stability of the XID_Start and XID_Continue properties. + +\rSec1[uaxid.immutable]{R2 Immutable identifiers} + +\pnum +An implementation may choose to guarantee that +the set of identifiers will never change +by fixing the set of code points allowed in identifiers forever. + +\pnum +\Cpp{} does not choose to make this guarantee. +As scripts are added to Unicode, +additional characters in those scripts may become available +for use in identifiers. + +\rSec1[uaxid.pattern]{R3 Pattern_White_Space and Pattern_Syntax characters} + +\pnum +UAX \#31 describes how languages that use or interpret patterns of characters, +such as regular expressions or number formats, +may describe that syntax with Unicode properties. + +\pnum +\Cpp{} does not do this as part of the language, +deferring to library components for such usage of patterns. +This requirement does not apply to \Cpp{}.
+ +\rSec1[uaxid.eqn]{R4 Equivalent normalized identifiers} + +\pnum +UAX \#31 requires that implementations describe +how identifiers are compared and considered equivalent. + +\pnum +\Cpp{} requires that identifiers be in Normalization Form C and +therefore identifiers that compare the same under NFC are equivalent. +This is described in \ref{lex.name}. + +\rSec1[uaxid.eqci]{R5 Equivalent case-insensitive identifiers} + +\pnum +\Cpp{} considers case to be significant in identifier comparison, and +does not do any case folding. +This requirement does not apply to \Cpp{}. + +\rSec1[uaxid.filter]{R6 Filtered normalized identifiers} + +\pnum +If any characters are excluded from normalization, +UAX \#31 requires a precise specification of those exclusions. + +\pnum +\Cpp{} does not make any such exclusions. + +\rSec1[uaxid.filterci]{R7 Filtered case-insensitive identifiers} + +\pnum +\Cpp{} identifiers are case sensitive, and +therefore this requirement does not apply. + +\rSec1[uaxid.hashtag]{R8 Hashtag identifiers} + +\pnum +There are no hashtags in \Cpp{}, so this requirement does not apply.