From da5d9289160a9323a5ac368403c888ab28a3d898 Mon Sep 17 00:00:00 2001
From: Wilko Nienhaus <wilko.nienhaus@gmail.com>
Date: Tue, 17 Jun 2025 09:01:53 +0300
Subject: [PATCH] Fix parsing of integer literals with base prefix

MicroPython 1.25.0 introduced a breaking change, aligning the behaviour
of the int() function with that of CPython: a string is parsed as a
decimal number unless a base is specified, and the base is inferred
from the string's prefix only if a base of 0 is specified.

This commit implements a new custom parsing function `parse_int`. It
can correctly parse the following string literals:

* 0x[0-9a-fA-F]+ -> treated as hex
* 0b[01]+ -> treated as binary
* 0o[0-7]+ -> treated as octal (Python style)
* 0[0-7]+ -> treated as octal (GNU as style)
* anything else parsed as decimal

It only handles the GNU as style octal case directly, letting the
original `int()` function handle the other cases (using base 0).

In fact, the GNU as octal case was not handled correctly previously,
and this commit fixes that.

Some new tests for previous functionality were added to show that
both new and previous cases are being handled correctly.

Note: GNU as does not actually accept the octal prefix 0o..., but we
accept it as a convenience, because it is valid in Python code. This
means, however, that our assembler accepts some code which GNU as
rejects. Conversely, we still accept all code that GNU as accepts,
which was one of our goals.
---
 esp32_ulp/assemble.py   |  6 +++---
 esp32_ulp/opcodes.py    | 11 +++++++---
 esp32_ulp/opcodes_s2.py | 11 +++++++---
 esp32_ulp/util.py       | 12 +++++++++++
 tests/opcodes.py        | 15 +++++++++++---
 tests/opcodes_s2.py     | 15 +++++++++++---
 tests/util.py           | 45 ++++++++++++++++++++++++++++++++++++++++-
 7 files changed, 99 insertions(+), 16 deletions(-)

diff --git a/esp32_ulp/assemble.py b/esp32_ulp/assemble.py
index 8b79071..33cec42 100644
--- a/esp32_ulp/assemble.py
+++ b/esp32_ulp/assemble.py
@@ -219,13 +219,13 @@ def fill(self, section, amount, fill_byte):
             raise ValueError('fill in bss section not allowed')
         if section is TEXT:  # TODO: text section should be filled with NOPs
             raise ValueError('fill/skip/align in text section not supported')
-        fill = int(fill_byte or 0).to_bytes(1, 'little') * amount
+        fill = int(self.opcodes.eval_arg(str(fill_byte or 0))).to_bytes(1, 'little') * amount
         self.offsets[section] += len(fill)
         if section is not BSS:
             self.sections[section].append(fill)
 
     def d_skip(self, amount, fill=None):
-        amount = int(amount)
+        amount = int(self.opcodes.eval_arg(amount))
         self.fill(self.section, amount, fill)
 
     d_space = d_skip
@@ -246,7 +246,7 @@ def d_global(self, symbol):
         self.symbols.set_global(symbol)
 
     def append_data(self, wordlen, args):
-        data = [int(arg).to_bytes(wordlen, 'little') for arg in args]
+        data = [int(self.opcodes.eval_arg(arg)).to_bytes(wordlen, 'little') for arg in args]
         self.append_section(b''.join(data))
 
     def d_byte(self, *args):
diff --git a/esp32_ulp/opcodes.py b/esp32_ulp/opcodes.py
index 10fc3d1..03849a3 100644
--- a/esp32_ulp/opcodes.py
+++ b/esp32_ulp/opcodes.py
@@ -13,7 +13,7 @@
 from uctypes import struct, addressof, LITTLE_ENDIAN, UINT32, BFUINT32, BF_POS, BF_LEN
 
 from .soc import *
-from .util import split_tokens, validate_expression
+from .util import split_tokens, validate_expression, parse_int
 
 # XXX dirty hack: use a global for the symbol table
 symbols = None
@@ -285,7 +285,12 @@ def eval_arg(arg):
             _, _, sym_value = symbols.get_sym(token)
             parts.append(str(sym_value))
         else:
-            parts.append(token)
+            try:
+                # attempt to parse, to convert numbers with base prefix correctly
+                int_token = parse_int(token)
+                parts.append(str(int_token))
+            except ValueError:
+                parts.append(token)
     parts = "".join(parts)
     if not validate_expression(parts):
         raise ValueError('Unsupported expression: %s' % parts)
@@ -311,7 +316,7 @@ def arg_qualify(arg):
         if arg_lower in ['--', 'eq', 'ov', 'lt', 'gt', 'ge', 'le']:
             return ARG(COND, arg_lower, arg)
     try:
-        return ARG(IMM, int(arg), arg)
+        return ARG(IMM, parse_int(arg), arg)
     except ValueError:
         pass
     try:
diff --git a/esp32_ulp/opcodes_s2.py b/esp32_ulp/opcodes_s2.py
index 91549af..3a9d643 100644
--- a/esp32_ulp/opcodes_s2.py
+++ b/esp32_ulp/opcodes_s2.py
@@ -12,7 +12,7 @@
 from ucollections import namedtuple
 from uctypes import struct, addressof, LITTLE_ENDIAN, UINT32, BFUINT32, BF_POS, BF_LEN
 
-from .util import split_tokens, validate_expression
+from .util import split_tokens, validate_expression, parse_int
 
 # XXX dirty hack: use a global for the symbol table
 symbols = None
@@ -301,7 +301,12 @@ def eval_arg(arg):
             _, _, sym_value = symbols.get_sym(token)
             parts.append(str(sym_value))
         else:
-            parts.append(token)
+            try:
+                # attempt to parse, to convert numbers with base prefix correctly
+                int_token = parse_int(token)
+                parts.append(str(int_token))
+            except ValueError:
+                parts.append(token)
     parts = "".join(parts)
     if not validate_expression(parts):
         raise ValueError('Unsupported expression: %s' % parts)
@@ -327,7 +332,7 @@ def arg_qualify(arg):
         if arg_lower in ['--', 'eq', 'ov', 'lt', 'gt', 'ge', 'le']:
             return ARG(COND, arg_lower, arg)
     try:
-        return ARG(IMM, int(arg), arg)
+        return ARG(IMM, parse_int(arg), arg)
     except ValueError:
         pass
     try:
diff --git a/esp32_ulp/util.py b/esp32_ulp/util.py
index 78e7c85..5f1628d 100644
--- a/esp32_ulp/util.py
+++ b/esp32_ulp/util.py
@@ -77,6 +77,18 @@ def validate_expression(param):
     return True
 
 
+def parse_int(literal):
+    """
+    GNU as compatible parsing of string literals into integers
+    Specifically, GNU as treats literals starting with 0 as octal
+    All other literals are correctly parsed by Python
+    See: https://sourceware.org/binutils/docs/as/Integers.html
+    """
+    if len(literal) >= 2 and (literal.startswith("0") or literal.startswith("-0")) and literal.lstrip("-0").isdigit():
+        return int(literal, 8)  # optional '-', a leading 0, then only digits: GNU as style octal
+    return int(literal, 0)  # base 0: int() infers the base from a 0x/0o/0b prefix, else decimal
+
+
 def file_exists(filename):
     try:
         os.stat(filename)
diff --git a/tests/opcodes.py b/tests/opcodes.py
index 3dc7453..f8109b5 100644
--- a/tests/opcodes.py
+++ b/tests/opcodes.py
@@ -7,7 +7,7 @@
 
 from uctypes import UINT32, BFUINT32, BF_POS, BF_LEN
 from esp32_ulp.opcodes import make_ins, make_ins_struct_def
-from esp32_ulp.opcodes import get_reg, get_imm, get_cond, arg_qualify, eval_arg, ARG, REG, IMM, SYM, COND
+from esp32_ulp.opcodes import get_reg, get_imm, get_cond, arg_qualify, parse_int, eval_arg, ARG, REG, IMM, SYM, COND
 from esp32_ulp.assemble import SymbolTable, ABS, REL, TEXT
 import esp32_ulp.opcodes as opcodes
 
@@ -46,6 +46,7 @@ def test_arg_qualify():
     assert arg_qualify('-1') == ARG(IMM, -1, '-1')
     assert arg_qualify('1') == ARG(IMM, 1, '1')
     assert arg_qualify('0x20') == ARG(IMM, 32, '0x20')
+    assert arg_qualify('0100') == ARG(IMM, 64, '0100')
     assert arg_qualify('0o100') == ARG(IMM, 64, '0o100')
     assert arg_qualify('0b1000') == ARG(IMM, 8, '0b1000')
     assert arg_qualify('eq') == ARG(COND, 'eq', 'eq')
@@ -96,6 +97,11 @@ def test_eval_arg():
     assert eval_arg('const >> 1') == 21
     assert eval_arg('(const|4)&0xf') == 0xe
 
+    assert eval_arg('0x7') == 7
+    assert eval_arg('010') == 8
+    assert eval_arg('-0x7') == -7  # negative
+    assert eval_arg('~0x7') == -8  # complement
+
     assert_raises(ValueError, eval_arg, 'evil()')
     assert_raises(ValueError, eval_arg, 'def cafe()')
     assert_raises(ValueError, eval_arg, '1 ^ 2')
@@ -105,14 +111,17 @@ def test_eval_arg():
     opcodes.symbols = None
 
 
-def assert_raises(exception, func, *args):
+def assert_raises(exception, func, *args, message=None):
     try:
         func(*args)
-    except exception:
+    except exception as e:
         raised = True
+        actual_message = e.args[0]
     else:
         raised = False
     assert raised
+    if message:
+        assert actual_message == message, '%s == %s' % (actual_message, message)
 
 
 def test_reg_direct_ulp_addressing():
diff --git a/tests/opcodes_s2.py b/tests/opcodes_s2.py
index 4525049..b9d74d3 100644
--- a/tests/opcodes_s2.py
+++ b/tests/opcodes_s2.py
@@ -7,7 +7,7 @@
 
 from uctypes import UINT32, BFUINT32, BF_POS, BF_LEN
 from esp32_ulp.opcodes_s2 import make_ins, make_ins_struct_def
-from esp32_ulp.opcodes_s2 import get_reg, get_imm, get_cond, arg_qualify, eval_arg, ARG, REG, IMM, SYM, COND
+from esp32_ulp.opcodes_s2 import get_reg, get_imm, get_cond, arg_qualify, parse_int, eval_arg, ARG, REG, IMM, SYM, COND
 from esp32_ulp.assemble import SymbolTable, ABS, REL, TEXT
 import esp32_ulp.opcodes_s2 as opcodes
 
@@ -46,6 +46,7 @@ def test_arg_qualify():
     assert arg_qualify('-1') == ARG(IMM, -1, '-1')
     assert arg_qualify('1') == ARG(IMM, 1, '1')
     assert arg_qualify('0x20') == ARG(IMM, 32, '0x20')
+    assert arg_qualify('0100') == ARG(IMM, 64, '0100')
     assert arg_qualify('0o100') == ARG(IMM, 64, '0o100')
     assert arg_qualify('0b1000') == ARG(IMM, 8, '0b1000')
     assert arg_qualify('eq') == ARG(COND, 'eq', 'eq')
@@ -96,6 +97,11 @@ def test_eval_arg():
     assert eval_arg('const >> 1') == 21
     assert eval_arg('(const|4)&0xf') == 0xe
 
+    assert eval_arg('0x7') == 7
+    assert eval_arg('010') == 8
+    assert eval_arg('-0x7') == -7  # negative
+    assert eval_arg('~0x7') == -8  # complement
+
     assert_raises(ValueError, eval_arg, 'evil()')
     assert_raises(ValueError, eval_arg, 'def cafe()')
     assert_raises(ValueError, eval_arg, '1 ^ 2')
@@ -105,14 +111,17 @@ def test_eval_arg():
     opcodes.symbols = None
 
 
-def assert_raises(exception, func, *args):
+def assert_raises(exception, func, *args, message=None):
     try:
         func(*args)
-    except exception:
+    except exception as e:
         raised = True
+        actual_message = e.args[0]
     else:
         raised = False
     assert raised
+    if message:
+        assert actual_message == message, '%s == %s' % (actual_message, message)
 
 
 def test_reg_direct_ulp_addressing():
diff --git a/tests/util.py b/tests/util.py
index 6aadc8b..5f487ae 100644
--- a/tests/util.py
+++ b/tests/util.py
@@ -6,7 +6,7 @@
 # SPDX-License-Identifier: MIT
 
 import os
-from esp32_ulp.util import split_tokens, validate_expression, file_exists
+from esp32_ulp.util import split_tokens, validate_expression, parse_int, file_exists
 
 tests = []
 
@@ -18,6 +18,19 @@ def test(param):
     tests.append(param)
 
 
+def assert_raises(exception, func, *args, message=None):  # assert func(*args) raises `exception`
+    try:
+        func(*args)
+    except exception as e:
+        raised = True
+        actual_message = e.args[0]  # first exception argument, compared against `message` below
+    else:
+        raised = False
+    assert raised  # fails when func(*args) returned without raising `exception`
+    if message:  # the message check is skipped when `message` is None (the default) or empty
+        assert actual_message == message, '%s == %s' % (actual_message, message)
+
+
 @test
 def test_split_tokens():
     assert split_tokens("") == []
@@ -69,6 +82,36 @@ def test_validate_expression():
     assert validate_expression('def CAFE()') is False
 
 
+@test
+def test_parse_int():
+    # decimal (no base prefix and no leading zero before other digits)
+    assert parse_int("0") == 0, "0 == 0"
+    assert parse_int("5") == 5, "5 == 5"
+    assert parse_int("-0") == 0, "-0 == 0"
+    assert parse_int("-5") == -5, "-5 == -5"
+    # hex (0x prefix)
+    assert parse_int("0x5") == 5, "0x5 == 5"
+    assert parse_int("0x5a") == 90, "0x5a == 90"
+    assert parse_int("-0x5a") == -90, "-0x5a == -90"
+    # binary (0b prefix)
+    assert parse_int("0b1001") == 9, "0b1001 == 9"
+    assert parse_int("-0b1001") == -9, "-0b1001 == -9"
+    # octal (GNU as style leading zero, and Python style 0o prefix)
+    assert parse_int("07") == 7, "07 == 7"
+    assert parse_int("0100") == 64, "0100 == 64"
+    assert parse_int("0o210") == 136, "0o210 == 136"
+    assert parse_int("00000010") == 8, "00000010 == 8"
+    assert parse_int("-07") == -7, "-07 == -7"
+    assert parse_int("-0100") == -64, "-0100 == -64"
+    assert parse_int("-0o210") == -136, "-0o210 == -136"
+    assert parse_int("-00000010") == -8, "-00000010 == -8"
+    # negative cases: digits that are invalid for the detected base must raise ValueError
+    assert_raises(ValueError, parse_int, '0b123', message="invalid syntax for integer with base 2: '123'")
+    assert_raises(ValueError, parse_int, '0900', message="invalid syntax for integer with base 8: '0900'")
+    assert_raises(ValueError, parse_int, '0o900', message="invalid syntax for integer with base 8: '900'")
+    assert_raises(ValueError, parse_int, '0xg', message="invalid syntax for integer with base 16: 'g'")
+
+
 @test
 def test_file_exists():
     testfile = '.testfile'