Skip to content

Commit 182c0c7

Browse files
authored
Use MathML attributes for PDFs read in Adobe Acrobat (#17984)
Fixes #17980. Summary of the issue: For PDFs, the code was not retrieving the MathML attributes needed for speech. Description of user-facing changes: The speech for math, and occasionally the braille (e.g., bevelled fractions in Nemeth), was not always correct because attributes were not picked up and their default values were used instead. Description of development approach: Unfortunately, the PDF interface does not allow retrieving all of an element's attributes at once; each attribute must be requested individually. Most attributes don't affect speech or braille, so it is not necessary to get them. I looked at which attributes MathCAT uses and added those.
1 parent e2be6d7 commit 182c0c7

File tree

2 files changed

+65
-29
lines changed

2 files changed

+65
-29
lines changed

source/NVDAObjects/IAccessible/adobeAcrobat.py

Lines changed: 64 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -119,61 +119,96 @@ def _isEqual(self, other):
119119
return self.accID == other.accID
120120
return super(AcrobatNode, self)._isEqual(other)
121121

122-
def _getNodeMathMl(self, node):
122+
@staticmethod
def getMathMLAttributes(node: "IPDDomElement", attrList: list) -> str:
    """Get the MathML attributes in 'attrList' for a 'node' (MathML element).

    Each name in 'attrList' is requested from 'node' individually; attributes
    for which the node returns nothing (or an empty value) are skipped.

    :param node: The element to query; only its ``GetAttribute`` method is used.
    :param attrList: The attribute names to look up, in the order they should be output.
    :return: The attributes that were found, formatted as `` name="value"`` pairs
        (each with a leading space) ready to be placed inside a start tag,
        or an empty string when none were found.
    """
    from xml.sax.saxutils import escape

    found = []
    for attr in attrList:
        # "NSO" comes from the PDF spec
        val = node.GetAttribute(attr, "NSO")
        if val:
            # Escape markup characters (and the double quote used as the delimiter)
            # so that a value such as '<' or '"' cannot produce malformed XML.
            safeVal = escape(str(val), {'"': "&quot;"})
            found.append(f' {attr}="{safeVal}"')
    return "".join(found)
132+
133+
def _getNodeMathMl(self, node: IPDDomElement) -> str:
134+
"""Traverse the MathML tree and return an XML string representing the math"""
135+
123136
tag = node.GetTagName()
124-
yield "<%s" % tag
125-
# Output relevant attributes.
126-
if tag == "mfenced":
127-
for attr in "open", "close", "separators":
128-
val = node.GetAttribute(attr, "XML-1.00")
129-
if val:
130-
yield ' %s="%s"' % (attr, val)
131-
yield ">"
137+
answer = f"<{tag}"
138+
# Output relevant attributes
139+
id = node.GetID()
140+
if id:
141+
answer += f' id="{id}"'
142+
# The PDF interface lacks a way to get all the attributes, so we have to get specific ones
143+
# The attributes below affect accessibility
144+
answer += AcrobatNode.getMathMLAttributes(node, ["intent", "arg"])
145+
match tag:
146+
case "mi" | "mn" | "mo" | "mtext":
147+
answer += AcrobatNode.getMathMLAttributes(node, ["mathvariant"])
148+
case "mfenced":
149+
answer += AcrobatNode.getMathMLAttributes(node, ["open", "close", "separators"])
150+
case "menclose":
151+
answer += AcrobatNode.getMathMLAttributes(node, ["notation", "notationtype"])
152+
case "annotation-xml" | "annotation":
153+
answer += AcrobatNode.getMathMLAttributes(node, ["encoding"])
154+
case "ms":
155+
answer += AcrobatNode.getMathMLAttributes(node, ["open", "close"])
156+
case _:
157+
pass
158+
answer += ">"
132159
val = node.GetValue()
133160
if val:
134-
yield val
161+
answer += val
135162
else:
136163
for childNum in range(node.GetChildCount()):
137164
try:
138165
subNode = node.GetChild(childNum).QueryInterface(IPDDomElement)
139166
except COMError:
140167
continue
141168
for sub in self._getNodeMathMl(subNode):
142-
yield sub
143-
yield "</%s>" % tag
169+
answer += sub
170+
return answer + f"</{tag}>"
144171

145172
def _get_mathMl(self) -> str:
    """Return the MathML associated with a Formula tag.

    There are two ways that MathML can be represented in a PDF:
    1. As a series of nested tags, each with a MathML element as the value.
    2. As a Formula tag with MathML as the value (comes from MathML in an Associated File).

    Tagged MathML (a child whose tag name is "math") is looked for first. Failing that,
    the node's own value is returned if it starts with "<math". Otherwise the value,
    which is hopefully alt text, is returned wrapped in ``<math><mtext>``.

    :return: A MathML string.
    :raises LookupError: If this object has no PDF DOM node.
    """
    from xml.sax.saxutils import escape

    domNode = self.pdDomNode
    if domNode is None:
        log.debugWarning("_get_mathMl: self.pdDomNode is None!")
        raise LookupError
    # Checked once so that debug-only strings are not built when they will be discarded.
    isDebug = log.isEnabledFor(log.DEBUG)

    # See whether MathML tagging is used
    for childNum in range(domNode.GetChildCount()):
        try:
            child = domNode.GetChild(childNum).QueryInterface(IPDDomElement)
        except COMError:
            log.debugWarning(f"COMError trying to get {childNum=}")
            continue
        childTag = child.GetTagName()
        if isDebug:
            log.debug(f"\t(PDF) get_mathMl: tag={childTag}")
        if childTag == "math":
            answer = self._getNodeMathMl(child)
            if isDebug:
                log.debug(f"_get_mathMl (PDF): found tagged MathML = {answer}")
            return answer

    # Treat a missing value as empty text rather than failing on the string methods below.
    mathMl = domNode.GetValue() or ""
    if isDebug:
        log.debug(
            (
                f"_get_mathMl (PDF): math recognized: {mathMl.startswith('<math')}, "
                f"child count={domNode.GetChildCount()},"
                f"\n  name='{domNode.GetName()}', value found from AF ='{mathMl}'"
            ),
        )
    # This test and the replacement don't work if someone uses a namespace prefix on the tag
    # (which they shouldn't, but...)
    if mathMl.startswith("<math"):
        return mathMl.replace('xmlns:mml="http://www.w3.org/1998/Math/MathML"', "")

    # Not MathML -- fall back to returning the contents, which is hopefully alt text, inside an <mtext>.
    # The text is escaped so that characters such as '<' and '&' don't produce malformed XML.
    answer = f"<math><mtext>{escape(mathMl)}</mtext></math>"
    if isDebug:
        log.debug(f"_get_mathMl: didn't find MathML -- returning value as mtext: {answer}")
    return answer
177212

178213

179214
class RootNode(AcrobatNode):

user_docs/en/changes.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -598,6 +598,7 @@ There are many minor bug fixes for applications, such as Thunderbird, Adobe Read
598598

599599
### Bug Fixes
600600

601+
* Fixed math attributes not being read in Adobe Reader, which resulted in poor or wrong speech and braille. (#17980)
601602
* Windows 11 fixes:
602603
* NVDA will once again announce hardware keyboard input suggestions. (#16283, @josephsl)
603604
* In Version 24H2 (2024 Update and Windows Server 2025), mouse and touch interaction can be used in quick settings. (#16348, @josephsl)

0 commit comments

Comments
 (0)