@@ -119,61 +119,91 @@ def _isEqual(self, other):
119
119
return self .accID == other .accID
120
120
return super (AcrobatNode , self )._isEqual (other )
121
121
122
def _getNodeMathMl(self, node) -> str:
    """Traverse the MathML tree rooted at C{node} and return it serialized as an XML string.

    The element's tag name, its ID (when it has one) and a fixed set of
    attributes are written into the start tag. If the element reports a value,
    that value becomes the element content; otherwise each child that can be
    queried as an C{IPDDomElement} is serialized recursively, in order.

    @param node: The PDF DOM element to serialize.
    @return: The XML text for C{node} and all of its descendants.
    """
    # Attributes copied in addition to the ones every element gets ("intent", "arg").
    # Tags that are not listed here contribute no extra attributes.
    extraAttrsByTag = {
        "mi": ("mathvariant",),
        "mn": ("mathvariant",),
        "mo": ("mathvariant",),
        "mtext": ("mathvariant",),
        "mfenced": ("open", "close", "separators"),
        "menclose": ("notation", "notationtype"),
        "annotation-xml": ("encoding",),
        "annotation": ("encoding",),
        "ms": ("open", "close"),
    }

    def getMathMLAttributes(element, attrList) -> str:
        """Return ' name="value"' pairs for each attribute in attrList that element has a value for."""
        found = []
        for attr in attrList:
            # NOTE(review): "NSO" is presumably the attribute owner name the
            # Acrobat API expects for these attributes -- confirm.
            val = element.GetAttribute(attr, "NSO")
            if val:
                found.append(f' {attr}="{val}"')
        return "".join(found)

    tag = node.GetTagName()
    # Collect the pieces and join once at the end rather than growing a string.
    parts = [f"<{tag}"]
    # Output relevant attributes
    nodeId = node.GetID()
    if nodeId:
        parts.append(f' id="{nodeId}"')
    # NOTE(review): attribute values and element text are emitted exactly as the
    # API returns them, with no XML escaping -- confirm they cannot contain
    # markup characters.
    parts.append(getMathMLAttributes(node, ("intent", "arg")))
    parts.append(getMathMLAttributes(node, extraAttrsByTag.get(tag, ())))
    parts.append(">")

    val = node.GetValue()
    if val:
        parts.append(val)
    else:
        for childNum in range(node.GetChildCount()):
            try:
                subNode = node.GetChild(childNum).QueryInterface(IPDDomElement)
            except COMError:
                # Children that cannot be queried as elements are skipped.
                continue
            # The recursive call returns a complete string; append it whole
            # instead of iterating over it character by character.
            parts.append(self._getNodeMathMl(subNode))
    parts.append(f"</{tag}>")
    return "".join(parts)
144
166
145
167
def _get_mathMl(self) -> str:
    """Return the MathML associated with a Formula tag.

    There are two ways that MathML can be represented in a PDF:
      1. As a series of nested tags, each with a MathML element as the value.
      2. As a Formula tag with MathML as the value (comes from MathML in an
         Associated File).
    Tagged MathML (1) is looked for first, then the tag's own value (2). If
    neither is found, the value is wrapped in C{<math><mtext>} so that what is
    hopefully alt text is still returned.

    @return: A MathML string.
    @raise LookupError: If this object has no C{pdDomNode}.
    """
    # Local import so this method does not depend on the file's import block.
    from xml.sax.saxutils import escape

    if self.pdDomNode is None:
        log.debugWarning("_get_mathMl: self.pdDomNode is None!")
        raise LookupError

    # See whether MathML tagging is used: look for a child "math" element.
    for childNum in range(self.pdDomNode.GetChildCount()):
        try:
            child = self.pdDomNode.GetChild(childNum).QueryInterface(IPDDomElement)
        except COMError:
            log.debugWarning(f"COMError trying to get childNum={childNum}")
            continue
        childTag = child.GetTagName()
        if log.isEnabledFor(log.DEBUG):
            log.debug(f"\t(PDF) get_mathMl: tag={childTag}")
        if childTag == "math":
            answer = "".join(self._getNodeMathMl(child))
            if log.isEnabledFor(log.DEBUG):
                log.debug(f"_get_mathMl (PDF): found tagged MathML = {answer}")
            return answer

    # Coerce a missing value to "" so the string operations below cannot fail.
    mathMl = self.pdDomNode.GetValue() or ""
    if log.isEnabledFor(log.DEBUG):
        log.debug(
            (
                f"_get_mathMl (PDF): math recognized: {mathMl.startswith('<math')}, "
                f"child count={self.pdDomNode.GetChildCount()},"
                f"\n  name='{self.pdDomNode.GetName()}', value found from AF='{mathMl}'"
            ),
        )
    # this test and the replacement doesn't work if someone uses a namespace tag (which they shouldn't, but..)
    if mathMl.startswith("<math"):
        return mathMl.replace('xmlns:mml="http://www.w3.org/1998/Math/MathML"', "")

    # Not MathML -- fall back to returning the contents, which is hopefully alt text,
    # inside an <mtext>. The text is escaped so that characters such as "<" or "&"
    # cannot make the resulting MathML ill-formed.
    answer = f"<math><mtext>{escape(mathMl)}</mtext></math>"
    if log.isEnabledFor(log.DEBUG):
        log.debug(f"_get_mathMl: didn't find MathML -- returning value as mtext: {answer}")
    return answer
177
207
178
208
179
209
class RootNode (AcrobatNode ):
0 commit comments