From d3de50ac3cce755d8351d5166c4bed694a23626f Mon Sep 17 00:00:00 2001
From: Jens Maurer <Jens.Maurer@gmx.net>
Date: Sat, 17 Jun 2023 16:50:52 +0200
Subject: [PATCH] P1854R4 Making non-encodable string literals ill-formed

---
 source/lex.tex | 56 +++++++++++---------------------------------------
 1 file changed, 12 insertions(+), 44 deletions(-)

diff --git a/source/lex.tex b/source/lex.tex
index 1ced646a79..29016afc73 100644
--- a/source/lex.tex
+++ b/source/lex.tex
@@ -1436,20 +1436,14 @@
 \indextext{type!\idxcode{char32_t}}%
 \indextext{wide-character}%
 \indextext{type!\idxcode{wchar_t}}%
-A \defnx{non-encodable character literal}{literal!character!non-encodable}
-is a \grammarterm{character-literal}
-whose \grammarterm{c-char-sequence} consists of a single \grammarterm{c-char}
-that is not a \grammarterm{numeric-escape-sequence} and
-that specifies a character
-that either lacks representation in the literal's associated character encoding
-or that cannot be encoded as a single code unit.
 A \defnadj{multicharacter}{literal} is a \grammarterm{character-literal}
 whose \grammarterm{c-char-sequence} consists of
 more than one \grammarterm{c-char}.
-The \grammarterm{encoding-prefix} of
-a non-encodable character literal or a multicharacter literal
-shall be absent.
-Such \grammarterm{character-literal}s are conditionally-supported.
+A multicharacter literal shall not have an \grammarterm{encoding-prefix}.
+If a multicharacter literal contains a \grammarterm{c-char}
+that is not encodable as a single code unit in the ordinary literal encoding,
+the program is ill-formed.
+Multicharacter literals are conditionally-supported.
 
 \pnum
 The kind of a \grammarterm{character-literal},
@@ -1457,21 +1451,6 @@
 are determined by
 its \grammarterm{encoding-prefix} and its \grammarterm{c-char-sequence}
 as defined by \tref{lex.ccon.literal}.
-The special cases for
-non-encodable character literals and multicharacter literals
-take precedence over the base kind.
-\begin{note}
-The associated character encoding for ordinary character literals
-determines encodability,
-but does not determine the value of
-non-encodable ordinary character literals or
-ordinary multicharacter literals.
-The examples in \tref{lex.ccon.literal}
-for non-encodable ordinary character literals assume that
-the specified character lacks representation in
-the ordinary literal encoding or
-that encoding the character would require more than one code unit.
-\end{note}
 
 \begin{floattable}{Character literals}{lex.ccon.literal}
 {l|l|l|l|l}
@@ -1482,15 +1461,10 @@
 none &
 \defnx{ordinary character literal}{literal!character!ordinary} &
 \keyword{char} &
-ordinary &
+ordinary literal &
 \tcode{'v'} \\ \cline{2-3}\cline{5-5}
  &
-non-encodable ordinary character literal &
-\keyword{int} &
-literal &
-\tcode{'\textbackslash U0001F525'} \\ \cline{2-3}\cline{5-5}
- &
-ordinary multicharacter literal &
+multicharacter literal &
 \keyword{int} &
 encoding &
 \tcode{'abcd'} \\ \hline
@@ -1522,8 +1496,7 @@
 the value of a \grammarterm{character-literal} is determined
 using the range of representable values
 of the \grammarterm{character-literal}'s type in translation phase 7.
-A non-encodable character literal or a multicharacter literal
-has an
+A multicharacter literal has an
-\impldef{value of non-encodable character literal or multicharacter literal}
+\impldef{value of multicharacter literal}
 value.
 The value of any other kind of \grammarterm{character-literal}
@@ -1537,12 +1510,10 @@
 \grammarterm{universal-character-name}
 is the code unit value of the specified character
 as encoded in the literal's associated character encoding.
-\begin{note}
 If the specified character lacks
 representation in the literal's associated character encoding or
 if it cannot be encoded as a single code unit,
-then the literal is a non-encodable character literal.
-\end{note}
+then the program is ill-formed.
 \item
 A \grammarterm{character-literal} with
 a \grammarterm{c-char-sequence} consisting of
@@ -1568,7 +1539,7 @@
 $v$ does not exceed the range of representable values of the corresponding unsigned type for the underlying type of the \grammarterm{character-literal}'s type,
 then the value is the unique value of the \grammarterm{character-literal}'s type \tcode{T} that is congruent to $v$ modulo $2^N$, where $N$ is the width of \tcode{T}.
 \item
-Otherwise, the \grammarterm{character-literal} is ill-formed.
+Otherwise, the program is ill-formed.
 \end{itemize}
 \item
 A \grammarterm{character-literal} with
@@ -2006,10 +1977,7 @@
 is encoded to a code unit sequence
 using the \grammarterm{string-literal}'s associated character encoding.
 If a character lacks representation in the associated character encoding,
-then the \grammarterm{string-literal} is conditionally-supported and
-an
-\impldef{code unit sequence for non-representable \grammarterm{string-literal}}
-code unit sequence is encoded.
+then the program is ill-formed.
 \begin{note}
 No character lacks representation in any Unicode encoding form.
 \end{note}
@@ -2050,7 +2018,7 @@
 the \grammarterm{string-literal}'s array element type \tcode{T}
 that is congruent to $v$ modulo $2^N$, where $N$ is the width of \tcode{T}.
 \item
-Otherwise, the \grammarterm{string-literal} is ill-formed.
+Otherwise, the program is ill-formed.
 \end{itemize}
 When encoding a stateful character encoding,
 these sequences should have no effect on encoding state.