Add identifier start Unicode support for Postgres, MySQL and Redshift (#1944)

etgarperets · web-flow · commit ecd5d88638ef · 2025-07-15T09:26:11.000+02:00
diff --git a/src/dialect/mysql.rs b/src/dialect/mysql.rs
@@ -43,11 +43,13 @@ impl Dialect for MySqlDialect {
         // See https://dev.mysql.com/doc/refman/8.0/en/identifiers.html.
         // Identifiers which begin with a digit are recognized while tokenizing numbers,
         // so they can be distinguished from exponent numeric literals.
+        // MySQL also supports non-ASCII UTF-8 characters in identifiers
         ch.is_alphabetic()
             || ch == '_'
             || ch == '$'
             || ch == '@'
             || ('\u{0080}'..='\u{ffff}').contains(&ch)
+            || !ch.is_ascii()
     }
 
     fn is_identifier_part(&self, ch: char) -> bool {
diff --git a/src/dialect/postgresql.rs b/src/dialect/postgresql.rs
@@ -65,10 +65,9 @@ impl Dialect for PostgreSqlDialect {
     }
 
     fn is_identifier_start(&self, ch: char) -> bool {
-        // See https://www.postgresql.org/docs/11/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS
-        // We don't yet support identifiers beginning with "letters with
-        // diacritical marks"
-        ch.is_alphabetic() || ch == '_'
+        ch.is_alphabetic() || ch == '_' ||
+        // PostgreSQL supports non-ASCII Unicode characters in identifiers.
+        !ch.is_ascii()
     }
 
     fn is_identifier_part(&self, ch: char) -> bool {
diff --git a/src/dialect/redshift.rs b/src/dialect/redshift.rs
@@ -80,9 +80,9 @@ impl Dialect for RedshiftSqlDialect {
     }
 
     fn is_identifier_start(&self, ch: char) -> bool {
-        // Extends Postgres dialect with sharp and UTF-8 multibyte chars
+        // UTF-8 multibyte characters are supported in identifiers via the PostgreSqlDialect.
         // https://docs.aws.amazon.com/redshift/latest/dg/r_names.html
-        PostgreSqlDialect {}.is_identifier_start(ch) || ch == '#' || !ch.is_ascii()
+        PostgreSqlDialect {}.is_identifier_start(ch) || ch == '#'
     }
 
     fn is_identifier_part(&self, ch: char) -> bool {
diff --git a/tests/sqlparser_common.rs b/tests/sqlparser_common.rs
@@ -11151,9 +11151,7 @@ fn parse_non_latin_identifiers() {
     let supported_dialects = TestedDialects::new(vec![
         Box::new(GenericDialect {}),
         Box::new(DuckDbDialect {}),
-        Box::new(PostgreSqlDialect {}),
         Box::new(MsSqlDialect {}),
-        Box::new(MySqlDialect {}),
     ]);
     assert!(supported_dialects
         .parse_sql_statements("SELECT 💝 FROM table1")
@@ -16147,3 +16145,14 @@ fn test_identifier_unicode_support() {
     ]);
     let _ = dialects.verified_stmt(sql);
 }
+
+#[test]
+fn test_identifier_unicode_start() {
+    let sql = r#"SELECT 💝phone AS 💝 FROM customers"#;
+    let dialects = TestedDialects::new(vec![
+        Box::new(MySqlDialect {}),
+        Box::new(RedshiftSqlDialect {}),
+        Box::new(PostgreSqlDialect {}),
+    ]);
+    let _ = dialects.verified_stmt(sql);
+}

Original file line number	Diff line number	Diff line change
`@@ -65,10 +65,9 @@ impl Dialect for PostgreSqlDialect {`
`65`	`65`	`}`
`66`	`66`
`67`	`67`	`fn is_identifier_start(&self, ch: char) -> bool {`
`68`		`- // See https://www.postgresql.org/docs/11/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS`
`69`		`- // We don't yet support identifiers beginning with "letters with`
`70`		`- // diacritical marks"`
`71`		`- ch.is_alphabetic() \|\| ch == '_'`
	`68`	`+ ch.is_alphabetic() \|\| ch == '_' \|\|`
	`69`	`+ // PostgreSQL supports non-ASCII Unicode characters in identifiers.`
	`70`	`+ !ch.is_ascii()`
`72`	`71`	`}`
`73`	`72`
`74`	`73`	`fn is_identifier_part(&self, ch: char) -> bool {`
Original file line number	Diff line number	Diff line change
`@@ -80,9 +80,9 @@ impl Dialect for RedshiftSqlDialect {`
`80`	`80`	`}`
`81`	`81`
`82`	`82`	`fn is_identifier_start(&self, ch: char) -> bool {`
`83`		`- // Extends Postgres dialect with sharp and UTF-8 multibyte chars`
	`83`	`+ // UTF-8 multibyte characters are supported in identifiers via the PostgreSqlDialect.`
`84`	`84`	`// https://docs.aws.amazon.com/redshift/latest/dg/r_names.html`
`85`		`- PostgreSqlDialect {}.is_identifier_start(ch) \|\| ch == '#' \|\| !ch.is_ascii()`
	`85`	`+ PostgreSqlDialect {}.is_identifier_start(ch) \|\| ch == '#'`
`86`	`86`	`}`
`87`	`87`
`88`	`88`	`fn is_identifier_part(&self, ch: char) -> bool {`