Skip to content

bits/unicode.h from GCC 14.1.1 stdlibc++ errors #92586

@KyunLFA

Description

@KyunLFA

Something has regressed from the day before yesterday to today as of 621d0f3 which I compiled from git and passed all Clang and LLD tests which I run every time.

I compiled Mesa successfully. However, the PCSX2 emulator no longer compiles, not crashing the compiler but instead producing a compilation error, not inside PCSX2, but in GCC's stdlib++'s unicode.h, located on Arch Linux bases on /usr/include/c++/14.1.1/bits/unicode.h.

The error reads:

/usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/14.1.1/../../../../include/c++/14.1.1/bits/unicode.h:807:4: error: expression is not assignable
  807 |           ++this;

Version 18.1.5 of LLVM Clang (latest stable I have on CachyOS Linux) does not suffer from this error. Reverting to that version no longer produces the error. Going back to LLVM Clang git 621d0f3 introduces the error again.

Here's the offending excerpt from unicode.h:

inline namespace __v15_1_0
{
#define _GLIBCXX_GET_UNICODE_DATA 150100
#include "unicode-data.h"
#ifdef _GLIBCXX_GET_UNICODE_DATA
# error "Invalid unicode data"
#endif

  // The field width of a code point.
  constexpr int
  __field_width(char32_t __c) noexcept
  {
    if (__c < __width_edges[0]) [[likely]]
      return 1;

    auto* __p = std::upper_bound(__width_edges, std::end(__width_edges), __c);
    return (__p - __width_edges) % 2 + 1;
  }

  // @pre c <= 0x10FFFF
  constexpr _Gcb_property
  __grapheme_cluster_break_property(char32_t __c) noexcept
  {
    constexpr uint32_t __mask = (1 << __gcb_shift_bits) - 1;
    auto* __end = std::end(__gcb_edges);
    auto* __p = std::lower_bound(__gcb_edges, __end,
				 (__c << __gcb_shift_bits) | __mask);
    return _Gcb_property(__p[-1] & __mask);
  }

  constexpr bool
  __is_incb_linker(char32_t __c) noexcept
  {
    const auto __end = std::end(__incb_linkers);
    // Array is small enough that linear search is faster than binary search.
    return std::find(__incb_linkers, __end, __c) != __end;
  }

  // @pre c <= 0x10FFFF
  constexpr _InCB
  __incb_property(char32_t __c) noexcept
  {
    if ((__c << 2) < __incb_edges[0]) [[likely]]
      return _InCB(0);

    constexpr uint32_t __mask = 0x3;
    auto* __end = std::end(__incb_edges);
    auto* __p = std::lower_bound(__incb_edges, __end, (__c << 2) | __mask);
    return _InCB(__p[-1] & __mask);
  }

  constexpr bool
  __is_extended_pictographic(char32_t __c)
  {
    if (__c < __xpicto_edges[0]) [[likely]]
      return 0;

    auto* __p = std::upper_bound(__xpicto_edges, std::end(__xpicto_edges), __c);
    return (__p - __xpicto_edges) % 2;
  }

  struct _Grapheme_cluster_iterator_base
  {
    char32_t _M_c; // First code point in the cluster.
    _Gcb_property _M_prop; // GCB property of _M_c.
    enum class _XPicto : unsigned char { _Init, _Zwj, _Matched, _Failed };
    _XPicto _M_xpicto_seq_state = _XPicto::_Init;
    unsigned char _M_RI_count = 0;
    bool _M_incb_linker_seen = false;

    constexpr void
    _M_reset(char32_t __c, _Gcb_property __p)
    {
      _M_c = __c;
      _M_prop = __p;
      _M_xpicto_seq_state = _XPicto::_Init;
      _M_RI_count = 0;
      _M_incb_linker_seen = false;
    }

    constexpr void
    _M_update_xpicto_seq_state(char32_t __c, _Gcb_property __p)
    {
      if (_M_xpicto_seq_state == _XPicto::_Failed)
	return;

      auto __next_state = _XPicto::_Failed;
      if (_M_xpicto_seq_state != _XPicto::_Zwj) // i.e. Init or Matched
	{
	  if (__p == _Gcb_property::_Gcb_ZWJ)
	    {
	      if (_M_xpicto_seq_state == _XPicto::_Matched)
		__next_state = _XPicto::_Zwj;
	      // We check _M_c here so that we do the lookup at most once,
	      // and only for clusters containing at least one ZWJ.
	      else if (__is_extended_pictographic(_M_c))
		__next_state = _XPicto::_Zwj;
	    }
	  else if (__p == _Gcb_property::_Gcb_Extend)
	    __next_state = _M_xpicto_seq_state; // no change
	}
      else // Zwj
	{
	  // This assumes that all \p{Extended_Pictographic} emoji have
	  // Grapheme_Cluster_Break=Other.
	  if (__p == _Gcb_property::_Gcb_Other
		&& __is_extended_pictographic(__c))
	    __next_state = _XPicto::_Matched;
	}
      _M_xpicto_seq_state = __next_state;
    }

    constexpr void
    _M_update_ri_count(_Gcb_property __p)
    {
      if (__p == _Gcb_property::_Gcb_Regional_Indicator)
	++_M_RI_count;
      else
	_M_RI_count = 0;
    }

    constexpr void
    _M_update_incb_state(char32_t __c, _Gcb_property)
    {
      if (__is_incb_linker(__c))
	_M_incb_linker_seen = true;
    }
  };

  // Split a range into extended grapheme clusters.
  template<ranges::forward_range _View> requires ranges::view<_View>
    class _Grapheme_cluster_view
    : public ranges::view_interface<_Grapheme_cluster_view<_View>>
    {
    public:

      constexpr
      _Grapheme_cluster_view(_View __v)
      : _M_begin(_Utf32_view<_View>(std::move(__v)).begin())
      { }

      constexpr auto begin() const { return _M_begin; }
      constexpr auto end() const { return _M_begin.end(); }

    private:
      struct _Iterator : private _Grapheme_cluster_iterator_base
      {
      private:
	// Iterator over the underlying code points.
	using _U32_iterator = ranges::iterator_t<_Utf32_view<_View>>;

      public:
	// TODO: Change value_type to be subrange<_U32_iterator> instead?
	// Alternatively, value_type could be _Utf32_view<iterator_t<_View>>.
	// That would be the whole cluster, not just the first code point.
	// Would need to store two iterators and find end of current cluster
	// on increment, so operator* returns value_type(_M_base, _M_next).
	using value_type = char32_t;
	using iterator_concept = forward_iterator_tag;
	using difference_type = ptrdiff_t;

	constexpr
	_Iterator(_U32_iterator __i)
	: _M_base(__i)
	{
	  if (__i != __i.end())
	    {
	      _M_c = *__i;
	      _M_prop = __grapheme_cluster_break_property(_M_c);
	    }
	}

	// The first code point of the current extended grapheme cluster.
	constexpr value_type
	operator*() const
	{ return _M_c; }

	constexpr auto
	operator->() const
	{ return &_M_c; }

	// Move to the next extended grapheme cluster.
	constexpr _Iterator&
	operator++()
	{
	  const auto __end = _M_base.end();
	  if (_M_base != __end)
	    {
	      auto __p_prev = _M_prop;
	      auto __it = _M_base;
	      while (++__it != __end)
		{
		  char32_t __c = *__it;
		  auto __p = __grapheme_cluster_break_property(*__it);
		  _M_update_xpicto_seq_state(__c, __p);
		  _M_update_ri_count(__p);
		  _M_update_incb_state(__c, __p);
		  if (_M_is_break(__p_prev, __p, __it))
		    {
		      // Found a grapheme cluster break
		      _M_reset(__c, __p);
		      break;
		    }
		  __p_prev = __p;
		}
	      _M_base = __it;
	    }
	  return *this;
	}

	constexpr _Iterator
	operator++(int)
	{
	  auto __tmp = *this;
	  ++this; \\ crashes here **(annotated here for the sake of the issue)**
	  return __tmp;
	}

	constexpr bool
	operator==(const _Iterator& __i) const
	{ return _M_base == __i._M_base; }

	// This supports iter != iter.end()
	constexpr bool
	operator==(const ranges::sentinel_t<_View>& __i) const
	{ return _M_base == __i; }

	// Iterator to the start of the current cluster.
	constexpr auto base() const { return _M_base.base(); }

	// The end of the underlying view (not the end of the current cluster!)
	constexpr auto end() const { return _M_base.end(); }

	// Field width of the first code point in the cluster.
	constexpr int
	width() const noexcept
	{ return __field_width(_M_c); }

      private:
	_U32_iterator _M_base;

	// Implement the Grapheme Cluster Boundary Rules from Unicode Annex #29
	// http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
	// This implements the rules from TR29 revision 43 in Unicode 15.1.0.
	// Return true if there is a break between code point with property p1
	// and code point with property p2.
	constexpr bool
	_M_is_break(_Gcb_property __p1, _Gcb_property __p2,
		    _U32_iterator __curr) const
	{
	  using enum _Gcb_property;

	  if (__p1 == _Gcb_Control || __p1 == _Gcb_LF)
	    return true; // Break after Control or LF.

	  if (__p1 == _Gcb_CR)
	    return __p2 != _Gcb_LF; // Do not break between a CR and LF.

	  // Rule GB5
	  if (__p2 == _Gcb_Control || __p2 == _Gcb_CR || __p2 == _Gcb_LF)
	    return true; // Break before Control, CR or LF.

	  // Rule GB6
	  if (__p1 == _Gcb_L)
	    switch (__p2)
	    {
	      case _Gcb_L:
	      case _Gcb_V:
	      case _Gcb_LV:
	      case _Gcb_LVT:
		return false; // Do not break Hangul syllable sequences.
	      default:
		return true;
	      }

	  // Rule GB7
	  if (__p1 == _Gcb_LV || __p1 == _Gcb_V)
	    switch (__p2)
	    {
	      case _Gcb_V:
	      case _Gcb_T:
		return false; // Do not break Hangul syllable sequences.
	      default:
		return true;
	      }

	  // Rule GB8
	  if (__p1 == _Gcb_LVT || __p1 == _Gcb_T)
	    return __p2 != _Gcb_T; // Do not break Hangul syllable sequences.

	  // Rule GB9
	  if (__p2 == _Gcb_Extend || __p2 == _Gcb_ZWJ)
	    return false; // Do not break before extending characters or ZWJ.

	  // The following GB9x rules only apply to extended grapheme clusters,
	  // which is what the C++ standard uses (not legacy grapheme clusters).

	  // Rule GB9a
	  if (__p2 == _Gcb_SpacingMark)
	    return false; // Do not break before SpacingMarks,
	  // Rule GB9b
	  if (__p1 == _Gcb_Prepend)
	    return false; // or after Prepend characters.

	  // Rule GB9c (Unicode 15.1.0)
	  // Do not break within certain combinations with
	  // Indic_Conjunct_Break (InCB)=Linker.
	  if (_M_incb_linker_seen
		&& __incb_property(_M_c) == _InCB::_Consonant
		&& __incb_property(*__curr) == _InCB::_Consonant)
	    {
	      // Match [_M_base, __curr] against regular expression
	      // Consonant ([Extend Linker]* Linker [Extend Linker]* Consonant)+
	      bool __have_linker = false;
	      auto __it = _M_base;
	      while (++__it != __curr)
		{
		  if (__is_incb_linker(*__it))
		    __have_linker = true;
		  else
		    {
		      auto __incb = __incb_property(*__it);
		      if (__incb == _InCB::_Consonant)
			__have_linker = false;
		      else if (__incb != _InCB::_Extend)
			break;
		    }
		}
	      if (__it == __curr && __have_linker)
		return false;
	    }

	  // Rule GB11
	  // Do not break within emoji modifier sequences
	  // or emoji zwj sequences.
	  if (__p1 == _Gcb_ZWJ && _M_xpicto_seq_state == _XPicto::_Matched)
	    return false;

	  // Rules GB12 and GB13
	  // Do not break within emoji flag sequences. That is, do not break
	  // between regional indicator (RI) symbols if there is an odd number
	  // of RI characters before the break point.
	  if (__p1 == _Gcb_property::_Gcb_Regional_Indicator && __p1 == __p2)
	    return (_M_RI_count & 1) == 0;

	  // Rule GB999
	  return true; // Otherwise, break everywhere.
	}
      };

      _Iterator _M_begin;
    };

} // namespace __v15_1_0

My flags:
CFLAGS: "-flto=full -march=znver4 -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=3 -Wformat -Werror=format-security -fstack-protector-all --param=ssp-buffer-size=4 -fcf-protection -fno-semantic-interposition -Wno-unused-command-line-argument -fplugin=/usr/lib/LLVMPolly.so -mllvm -polly -fwhole-program-vtables -pthread -fopenmp -mllvm -polly-parallel -mllvm -polly-omp-backend=LLVM -mllvm -polly-num-threads=24 -mllvm -polly-vectorizer=stripmine -mllvm -polly-position=before-vectorizer -fopenmp-extensions -foffload-lto=full -funified-lto -foptimize-sibling-calls -funroll-loops -fstrict-aliasing -fstrict-aliasing -fstrict-vtable-pointers -fslp-vectorize -fvectorize -mllvm -polly-reschedule -mllvm -polly-tc-opt -mllvm -polly-tiling -mllvm -polly-2nd-level-tiling -mllvm -polly-matmul-opt -mllvm -polly-optimized-scops -mllvm -polly-pattern-matching-based-opts -mllvm -polly-postopts -mllvm -polly-register-tiling -fPIE -fPIC"
CXXFLAGS: "$CFLAGS -Wp,-D_GLIBCXX_ASSERTIONS"
LDFLAGS: "-v -fuse-ld=lld -Wl,-O1,--sort-common,--as-needed,-z,relro,-z,now,--threads=24,--lto-O3,--lto-CGO3,-lpthread,-lgomp,-z,pack-relative-relocs"
(PCSX2 and most programs I compile do compile and work correctly with these flags)

Can anyone confirm at the least that PCSX2 doesn't compile for you as well on git HEAD of LLVM? Thanks.

My system info:
Kernel: 6.9.0-GIT (patched)
Distro: CachyOS
AMD Ryzen 9 7900X
2x48GB 6800MT/s DDR5
LLVM 621d0f3

Metadata

Metadata

Assignees

No one assigned

    Labels

    clang:frontendLanguage frontend issues, e.g. anything involving "Sema"invalidResolved as invalid, i.e. not a buglibstdc++GNU libstdc++ C++ standard library

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions