-
Notifications
You must be signed in to change notification settings - Fork 2.6k
Format hex code in unicode escape sequences in string literals #2916
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
add30b8
483fc15
cc48d2d
f1dbc96
ef442a6
2ada012
125ebec
af86102
69c9664
7d0e548
52bd904
a5c4e62
221995e
d4dde2e
3557faf
77a48e6
1b9d5fd
3c24427
9f35b61
27d2d86
420a8f9
296cdb9
625c085
1511959
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,7 +5,9 @@ | |
import re | ||
import sys | ||
from functools import lru_cache | ||
from typing import List, Pattern | ||
from typing import List, Match, Pattern | ||
|
||
from blib2to3.pytree import Leaf | ||
|
||
if sys.version_info < (3, 8): | ||
from typing_extensions import Final | ||
|
@@ -18,6 +20,15 @@ | |
r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL | ||
) | ||
FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)") | ||
UNICODE_ESCAPE_RE: Final = re.compile( | ||
r"(?P<backslashes>\\+)(?P<body>" | ||
r"(u(?P<u>[a-fA-F0-9]{4}))" # Character with 16-bit hex value xxxx | ||
r"|(U(?P<U>[a-fA-F0-9]{8}))" # Character with 32-bit hex value xxxxxxxx | ||
r"|(x(?P<x>[a-fA-F0-9]{2}))" # Character with hex value hh | ||
r"|(N\{(?P<N>[a-zA-Z0-9 \-]{2,})\})" # Character named name in the Unicode database | ||
r")", | ||
re.VERBOSE, | ||
) | ||
|
||
|
||
def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str: | ||
|
@@ -236,3 +247,34 @@ def normalize_string_quotes(s: str) -> str: | |
return s # Prefer double quotes | ||
|
||
return f"{prefix}{new_quote}{new_body}{new_quote}" | ||
|
||
|
||
def normalize_unicode_escape_sequences(leaf: Leaf) -> None: | ||
"""Replace hex codes in Unicode escape sequences with lowercase representation.""" | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This will have to be thought out still, as this comment points out. My two cents: I prefer upper case, and since Black formats hex numbers to upper already I think it would be consistent. The Python repr argument is solid too, but we should think about changing hex literals as well then. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'd rather not change hex numbers; we already changed our mind there a few times. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. So if we're not changing numbers (which I agree with), do y'all share the concern for consistency? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. My comments read a bit ambiguously. So to be clear, I'm proposing that we switch the formatting to be upper case to be consistent with hex numbers. Y'all in? |
||
text = leaf.value | ||
prefix = get_string_prefix(text) | ||
JelleZijlstra marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if "r" in prefix.lower(): | ||
return | ||
|
||
def replace(m: Match[str]) -> str:
    r"""Return the normalized spelling of one matched escape sequence.

    ``m`` must come from a pattern that exposes the named groups
    ``backslashes``, ``body``, ``u``, ``U``, ``x`` and ``N``.

    * An even number of leading backslashes means the backslashes escape
      each other, so the match is returned unchanged.
    * ``\uXXXX``, ``\UXXXXXXXX`` and ``\xHH`` get lowercase hex digits.
    * ``\N{name}`` gets an uppercase name.
    * When the text being substituted is a bytes literal (a ``b``/``B``
      before its first quote), only ``\xHH`` is rewritten.

    Raises ``AssertionError`` if none of ``u``, ``U``, ``x`` or ``N``
    captured anything.
    """
    groups = m.groupdict()
    back_slashes = groups["backslashes"]
    body = groups["body"]

    # An even-length run consists solely of escaped backslashes, so ``body``
    # is ordinary text rather than an escape sequence.
    if len(back_slashes) % 2 == 0:
        return back_slashes + body

    # ``m.string`` is the whole text handed to the substitution, i.e. the
    # literal including its prefix; everything before the first quote is
    # that prefix.  No quote at all is treated as "no prefix".
    literal = m.string
    quote_at = min(
        (i for i in (literal.find('"'), literal.find("'")) if i != -1),
        default=0,
    )
    if "b" in literal[:quote_at].lower() and not groups["x"]:
        # Bytes literals only recognize \xhh.  \u, \U and \N{...} are plain
        # characters there, so changing their case would change the value.
        return back_slashes + body

    if groups["u"]:
        # \u
        return back_slashes + "u" + groups["u"].lower()
    if groups["U"]:
        # \U
        return back_slashes + "U" + groups["U"].lower()
    if groups["x"]:
        # \x
        return back_slashes + "x" + groups["x"].lower()

    assert groups["N"], f"Unexpected match: {m}"
    # \N{}
    return back_slashes + "N{" + groups["N"].upper() + "}"
|
||
leaf.value = re.sub(UNICODE_ESCAPE_RE, replace, text) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
x = "\x1F" | ||
x = "\\x1B" | ||
x = "\\\x1B" | ||
x = "\U0001F60E" | ||
x = "\u0001F60E" | ||
x = r"\u0001F60E" | ||
x = "don't format me" | ||
x = "\xA3" | ||
x = "\u2717" | ||
x = "\uFaCe" | ||
x = "\N{ox}\N{OX}" | ||
x = "\N{lAtIn smaLL letteR x}" | ||
x = "\N{CYRILLIC small LETTER BYELORUSSIAN-UKRAINIAN I}" | ||
x = b"\x1Fdon't byte" | ||
x = rb"\x1Fdon't format" | ||
|
||
# output | ||
|
||
x = "\x1f" | ||
x = "\\x1B" | ||
x = "\\\x1b" | ||
x = "\U0001f60e" | ||
x = "\u0001F60E" | ||
x = r"\u0001F60E" | ||
x = "don't format me" | ||
x = "\xa3" | ||
x = "\u2717" | ||
x = "\uface" | ||
x = "\N{OX}\N{OX}" | ||
x = "\N{LATIN SMALL LETTER X}" | ||
x = "\N{CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I}" | ||
x = b"\x1fdon't byte" | ||
x = rb"\x1Fdon't format" |
Uh oh!
There was an error while loading. Please reload this page.