From 942591309739c7fd9d8a5c58d180a5a82cb1b957 Mon Sep 17 00:00:00 2001 From: Etgar Perets Date: Mon, 14 Jul 2025 16:18:06 +0300 Subject: [PATCH] SGA-11411 Added unquoted identifier Unicode support in PostgreSQL, MySQL, added a test for that, and adjusted a test to reflect this support --- src/dialect/mysql.rs | 2 ++ src/dialect/postgresql.rs | 7 +++---- src/dialect/redshift.rs | 4 ++-- tests/sqlparser_common.rs | 13 +++++++++++-- 4 files changed, 18 insertions(+), 8 deletions(-) diff --git a/src/dialect/mysql.rs b/src/dialect/mysql.rs index b50c8df50..f7b5f574e 100644 --- a/src/dialect/mysql.rs +++ b/src/dialect/mysql.rs @@ -43,11 +43,13 @@ impl Dialect for MySqlDialect { // See https://dev.mysql.com/doc/refman/8.0/en/identifiers.html. // Identifiers which begin with a digit are recognized while tokenizing numbers, // so they can be distinguished from exponent numeric literals. + // MySQL also allows non-ASCII UTF-8 characters in identifiers ch.is_alphabetic() || ch == '_' || ch == '$' || ch == '@' || ('\u{0080}'..='\u{ffff}').contains(&ch) + || !ch.is_ascii() } fn is_identifier_part(&self, ch: char) -> bool { diff --git a/src/dialect/postgresql.rs b/src/dialect/postgresql.rs index c1f025574..9cea252c8 100644 --- a/src/dialect/postgresql.rs +++ b/src/dialect/postgresql.rs @@ -65,10 +65,9 @@ impl Dialect for PostgreSqlDialect { } fn is_identifier_start(&self, ch: char) -> bool { - // See https://www.postgresql.org/docs/11/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS - // We don't yet support identifiers beginning with "letters with - // diacritical marks" - ch.is_alphabetic() || ch == '_' + ch.is_alphabetic() || ch == '_' || + // PostgreSQL implements Unicode characters in identifiers.
+ !ch.is_ascii() } fn is_identifier_part(&self, ch: char) -> bool { diff --git a/src/dialect/redshift.rs b/src/dialect/redshift.rs index c910e4c77..68e025d18 100644 --- a/src/dialect/redshift.rs +++ b/src/dialect/redshift.rs @@ -80,9 +80,9 @@ impl Dialect for RedshiftSqlDialect { } fn is_identifier_start(&self, ch: char) -> bool { - // Extends Postgres dialect with sharp and UTF-8 multibyte chars + // UTF-8 multibyte characters are supported in identifiers via the PostgreSqlDialect. // https://docs.aws.amazon.com/redshift/latest/dg/r_names.html - PostgreSqlDialect {}.is_identifier_start(ch) || ch == '#' || !ch.is_ascii() + PostgreSqlDialect {}.is_identifier_start(ch) || ch == '#' } fn is_identifier_part(&self, ch: char) -> bool { diff --git a/tests/sqlparser_common.rs b/tests/sqlparser_common.rs index ba72399f9..e95c7e7b6 100644 --- a/tests/sqlparser_common.rs +++ b/tests/sqlparser_common.rs @@ -11151,9 +11151,7 @@ fn parse_non_latin_identifiers() { let supported_dialects = TestedDialects::new(vec![ Box::new(GenericDialect {}), Box::new(DuckDbDialect {}), - Box::new(PostgreSqlDialect {}), Box::new(MsSqlDialect {}), - Box::new(MySqlDialect {}), ]); assert!(supported_dialects .parse_sql_statements("SELECT 💝 FROM table1") @@ -16147,3 +16145,14 @@ fn test_identifier_unicode_support() { ]); let _ = dialects.verified_stmt(sql); } + +#[test] +fn test_identifier_unicode_start() { + let sql = r#"SELECT 💝phone AS 💝 FROM customers"#; + let dialects = TestedDialects::new(vec![ + Box::new(MySqlDialect {}), + Box::new(RedshiftSqlDialect {}), + Box::new(PostgreSqlDialect {}), + ]); + let _ = dialects.verified_stmt(sql); +}