@@ -119,61 +119,96 @@ def _isEqual(self, other):
119
119
return self .accID == other .accID
120
120
return super (AcrobatNode , self )._isEqual (other )
121
121
122
- def _getNodeMathMl (self , node ):
122
+ @staticmethod
123
+ def getMathMLAttributes (node : IPDDomElement , attrList : list ) -> str :
124
+ """Get the MathML attributes in 'attrList' for a 'node' (MathML element)."""
125
+ attrValues = ""
126
+ for attr in attrList :
127
+ # "NSO" comes from the PDF spec
128
+ val = node .GetAttribute (attr , "NSO" )
129
+ if val :
130
+ attrValues += f' { attr } ="{ val } "'
131
+ return attrValues
132
+
133
+ def _getNodeMathMl (self , node : IPDDomElement ) -> str :
134
+ """Traverse the MathML tree and return an XML string representing the math"""
135
+
123
136
tag = node .GetTagName ()
124
- yield "<%s" % tag
125
- # Output relevant attributes.
126
- if tag == "mfenced" :
127
- for attr in "open" , "close" , "separators" :
128
- val = node .GetAttribute (attr , "XML-1.00" )
129
- if val :
130
- yield ' %s="%s"' % (attr , val )
131
- yield ">"
137
+ answer = f"<{ tag } "
138
+ # Output relevant attributes
139
+ id = node .GetID ()
140
+ if id :
141
+ answer += f' id="{ id } "'
142
+ # The PDF interface lacks a way to get all the attributes, so we have to get specific ones
143
+ # The attributes below affect accessibility
144
+ answer += AcrobatNode .getMathMLAttributes (node , ["intent" , "arg" ])
145
+ match tag :
146
+ case "mi" | "mn" | "mo" | "mtext" :
147
+ answer += AcrobatNode .getMathMLAttributes (node , ["mathvariant" ])
148
+ case "mfenced" :
149
+ answer += AcrobatNode .getMathMLAttributes (node , ["open" , "close" , "separators" ])
150
+ case "menclose" :
151
+ answer += AcrobatNode .getMathMLAttributes (node , ["notation" , "notationtype" ])
152
+ case "annotation-xml" | "annotation" :
153
+ answer += AcrobatNode .getMathMLAttributes (node , ["encoding" ])
154
+ case "ms" :
155
+ answer += AcrobatNode .getMathMLAttributes (node , ["open" , "close" ])
156
+ case _:
157
+ pass
158
+ answer += ">"
132
159
val = node .GetValue ()
133
160
if val :
134
- yield val
161
+ answer += val
135
162
else :
136
163
for childNum in range (node .GetChildCount ()):
137
164
try :
138
165
subNode = node .GetChild (childNum ).QueryInterface (IPDDomElement )
139
166
except COMError :
140
167
continue
141
168
for sub in self ._getNodeMathMl (subNode ):
142
- yield sub
143
- yield "</%s>" % tag
169
+ answer += sub
170
+ return answer + f "</{ tag } >"
144
171
145
172
def _get_mathMl(self) -> str:
    """Return the MathML associated with a Formula tag.

    There are two ways that MathML can be represented in a PDF:
    1. As a series of nested tags, each with a MathML element as the value.
    2. As a Formula tag with MathML as the value (comes from MathML in an Associated File).

    The children of the DOM node are checked first for a "math" tag (case 1); the first one
    found is serialized and returned. Otherwise the node's own value is used (case 2) if it
    starts with "<math". Failing both, the value is returned wrapped in <math><mtext>.

    :return: A MathML string.
    :raises LookupError: If this object has no PDF DOM node.
    """
    # Local import: keeps this method independent of the module's import block.
    from xml.sax.saxutils import escape

    if self.pdDomNode is None:
        log.debugWarning("_get_mathMl: self.pdDomNode is None!")
        raise LookupError

    # Checked once so that potentially large debug strings are only built when needed.
    debugEnabled = log.isEnabledFor(log.DEBUG)

    # See if MathML tagging is used
    for childNum in range(self.pdDomNode.GetChildCount()):
        try:
            child = self.pdDomNode.GetChild(childNum).QueryInterface(IPDDomElement)
        except COMError:
            log.debugWarning(f"COMError trying to get {childNum=}")
            continue
        childTag = child.GetTagName()
        if debugEnabled:
            log.debug(f"\t (PDF) get_mathMl: tag={childTag}")
        if childTag == "math":
            answer = self._getNodeMathMl(child)
            if debugEnabled:
                log.debug(f"_get_mathMl (PDF): found tagged MathML = {answer}")
            return answer

    # NOTE(review): GetValue() is treated as possibly returning None -- confirm against the
    # COM wrapper. Normalizing to "" keeps the string operations below from raising.
    mathMl = self.pdDomNode.GetValue() or ""
    if debugEnabled:
        log.debug(
            (
                f"_get_mathMl (PDF) : math recognized: {mathMl.startswith('<math')}, "
                f"child count={self.pdDomNode.GetChildCount()},"
                f"\n name='{self.pdDomNode.GetName()}', value found from AF ='{mathMl}'"
            ),
        )
    # This test and the replacement don't work if someone uses a namespace prefix on the tag
    # (which they shouldn't, but..)
    if mathMl.startswith("<math"):
        return mathMl.replace('xmlns:mml="http://www.w3.org/1998/Math/MathML"', "")

    # Not MathML -- fall back to returning the contents, which is hopefully alt text, inside
    # an <mtext>. The text is escaped so that '<' or '&' in it can't break the markup.
    answer = f"<math><mtext>{escape(mathMl)}</mtext></math>"
    if debugEnabled:
        log.debug(f"_get_mathMl: didn't find MathML -- returning value as mtext: {answer}")
    return answer
177
212
178
213
179
214
class RootNode (AcrobatNode ):
0 commit comments