PK ! R~ mathml2latex/__init__.py__version__ = '0.1.0'
PK ! zc$, , mathml2latex/mathml.py#! /usr/bin/env python3
# -*- coding:utf-8 -*-
import re
import sys
from bs4.element import NavigableString
from mathml2latex.parts_pickup import select_insert_list
from mathml2latex.parts_pickup import select_true_chidren
from mathml2latex.process_each_tag.math import process_math
from mathml2latex.process_each_tag.mrow import process_mrow
from mathml2latex.process_each_tag.msub import process_msub
from mathml2latex.process_each_tag.msup import process_msup
from mathml2latex.process_each_tag.msubsup import process_msubsup
from mathml2latex.process_each_tag.mfrac import process_mfrac
from mathml2latex.process_each_tag.mroot import process_mroot
from mathml2latex.process_each_tag.msqrt import process_msqrt # mrootと同じでいいのか確認(メモ:2019/01/08)
from mathml2latex.process_each_tag.mpadded import process_mpadded
from mathml2latex.process_each_tag.mphantom import process_mphantom
from mathml2latex.process_each_tag.mmultiscripts import process_mmultiscripts
from mathml2latex.process_each_tag.munder import process_munder
from mathml2latex.process_each_tag.mover import process_mover
from mathml2latex.process_each_tag.munderover import process_munderover
from mathml2latex.process_each_tag.mtd import process_mtd
from mathml2latex.process_each_tag.mtr import process_mtr
from mathml2latex.process_each_tag.mtable import process_mtable
from mathml2latex.process_each_tag.mfenced import process_mfenced
from mathml2latex.process_each_tag.mspace import process_mspace
from mathml2latex.process_each_tag.mstyle import process_mstyle
from mathml2latex.process_each_tag.mtext import process_mtext
# Compiled regular expressions keyed by MathML tag name.  Each pattern is the
# tag name followed by `$`, so `pattern[name].match(tagname)` recognises that
# tag name without also accepting longer names that merely start with it.
pattern = {
    tag: re.compile(tag + '$')
    for tag in (
        'math', 'mrow', 'msub', 'msup', 'msubsup', 'mfrac', 'mroot', 'msqrt',
        'mprescripts', 'mpadded', 'mphantom', 'mmultiscripts', 'munder',
        'mover', 'munderover', 'mtd', 'mtr', 'mtable', 'mfenced', 'mspace',
        'none', 'mi', 'mn', 'mo', 'mstyle', 'mtext', 'malignmark',
        'maligngroup',
    )
}
# These objects are all tags that may become parent tags.
def process_mathml(soup):
    """Convert the MathML tags found under `soup` into one string.

    The descendants of `soup` are visited in reverse order.  A tag whose
    `.string` is set only has that string pushed onto a stack.  A tag
    without a string is handed to the matching `process_*` function and
    the result is stored per tag name in `insertion_dict`, from where
    `select_insert_list` picks it up again for an enclosing tag.

    soup: parsed document exposing `.descendants`
        (presumably a BeautifulSoup object -- confirm against the caller).
    return: str (joined results of every `math` tag, stripped)
    """
    # Tags whose converted output is accumulated while walking the tree.
    collected_tags = (
        'math', 'mrow', 'mfrac', 'mroot', 'msqrt', 'msub', 'msup', 'msubsup',
        'mpadded', 'mphantom', 'munder', 'mover', 'munderover', 'mtd', 'mtr',
        'mtable', 'mmultiscripts', 'mspace', 'mstyle', 'mtext', 'mfenced',
    )
    # Tags that never get converted output; they start with one empty entry.
    placeholder_tags = (
        'none', 'mprescripts', 'mi', 'mn', 'mo', 'malignmark', 'maligngroup',
    )
    insertion_dict = {tag: [] for tag in collected_tags}
    for tag in placeholder_tags:
        insertion_dict[tag] = ['']

    # Handlers grouped by the arguments they are called with.
    takes_node = {'mspace': process_mspace}
    takes_node_insertions = {
        'mfrac': process_mfrac,          # numerator, denominator
        'mroot': process_mroot,          # base, index
        'msqrt': process_msqrt,          # base
        'msub': process_msub,            # base, subscript
        'msup': process_msup,            # base, superscript
        'msubsup': process_msubsup,      # base, subscript, superscript
        'mpadded': process_mpadded,      # layout-only tag
        'mphantom': process_mphantom,    # layout-only tag
        'mmultiscripts': process_mmultiscripts,
        'munder': process_munder,        # base, underscript
        'mover': process_mover,          # base, overscript
        'munderover': process_munderover,  # base, underscript, overscript
        'mfenced': process_mfenced,
    }
    takes_node_children_insertions = {
        'mrow': process_mrow,
        'mstyle': process_mstyle,
        'mtext': process_mtext,
    }
    takes_children_insertions = {'mtd': process_mtd}
    takes_insertions = {'mtr': process_mtr, 'mtable': process_mtable}
    # `none` and `mprescripts` only occur inside `mmultiscripts` and are
    # handled there; the remaining ones are plain child tags.
    skipped_tags = frozenset(placeholder_tags)
    # Tag names are tried in this order; the first matching pattern wins.
    known_tags = (
        'mspace', 'mfrac', 'mroot', 'msqrt', 'mrow', 'mstyle', 'mtext',
        'msub', 'msup', 'msubsup', 'mpadded', 'mphantom', 'none',
        'mmultiscripts', 'mprescripts', 'munder', 'mover', 'munderover',
        'mtd', 'mtr', 'mtable', 'mfenced', 'mi', 'mn', 'mo', 'malignmark',
        'maligngroup',
    )

    stack_list = []
    for descendant in reversed(list(soup.descendants)):
        if isinstance(descendant, NavigableString):
            # Bare text nodes are not tags; nothing to do.
            continue

        tagname = descendant.name
        if pattern['math'].match(tagname):
            # End point: the enclosing `math` tag itself.
            insertion_list = select_insert_list(descendant, insertion_dict)
            insertion_dict['math'].append(process_math(descendant, insertion_list))
            continue

        string = descendant.string
        if string:
            # Child tag: `.string` is a str here, so just remember it.
            stack_list.append(string)
            continue

        # Parent tag: `.string` is None.
        # Open question kept from the original: how to treat `child_list`
        # elements that are dropped inside the individual process functions.
        insertion_list = select_insert_list(descendant, insertion_dict)
        child_list = select_true_chidren(descendant, stack_list)
        stack_list = []  # re-initialise for the next parent tag

        matched = next(
            (tag for tag in known_tags if pattern[tag].match(tagname)), None
        )
        if matched is None:
            print(descendant, '\n')
            print('Unknown Tag appeared!! terminate the program.\n')
            print('New tag name is ; ', descendant.name)
            # sys.exit()  # debug
        elif matched in skipped_tags:
            continue
        elif matched in takes_node:
            insertion_dict[matched].append(takes_node[matched](descendant))
        elif matched in takes_node_children_insertions:
            handler = takes_node_children_insertions[matched]
            insertion_dict[matched].append(
                handler(descendant, child_list, insertion_list)
            )
        elif matched in takes_children_insertions:
            handler = takes_children_insertions[matched]
            insertion_dict[matched].append(handler(child_list, insertion_list))
        elif matched in takes_insertions:
            insertion_dict[matched].append(
                takes_insertions[matched](insertion_list)
            )
        else:
            handler = takes_node_insertions[matched]
            insertion_dict[matched].append(handler(descendant, insertion_list))

    # in Production environment (type: string)
    return ''.join(insertion_dict['math']).strip()
PK ! KF F mathml2latex/parts_pickup.py#! /usr/bin/env python3
# -*- coding:utf-8 -*-
from bs4.element import NavigableString
# insert_list: descendantが任意の親タグであったとき、そのsyntaxをリストにしたもの
# 入れ子構造となる部分(親タグがsyntaxに含まれている場合)だけを判定
def select_insert_list(descendant, insertion_dict):
    """
    Pick the stored results that belong to the nested children of `descendant`.

    A child counts as nested when its `.string` is falsy and it is not a
    NavigableString.  Only nesting is detected here; what the returned
    values look like is left to the individual `process_*` functions.

    Known open issue kept from the original notes: the lists held in
    `insertion_dict` are never reset, so memory use may grow.

    return: list object (list of element to insert that parent)
    """
    nested_name_list = []
    for child in list(descendant.children):
        if child.string:
            continue
        if isinstance(child, NavigableString):
            continue
        if child.children:
            nested_name_list.append(child.name)

    # No nesting: there is nothing to insert into this parent.
    if not nested_name_list:
        return []

    # No tag name occurs twice on this level: take the most recently
    # appended result for each name, skipping names with nothing stored.
    if len(set(nested_name_list)) == len(nested_name_list):
        return [
            insertion_dict[name][-1]
            for name in nested_name_list
            if insertion_dict.get(name)
        ]

    # Some tag name repeats on this level: walk backwards through the
    # stored results, one step further for every repeated occurrence.
    selected_list = []
    index = dict.fromkeys(nested_name_list, 1)
    for tagname in nested_name_list:
        stored = insertion_dict[tagname]
        try:
            selected_list.append(stored[-index[tagname]])
        except IndexError:
            # Happens when fewer results are stored than occurrences exist
            # (the original notes attribute this to a placeholder tag).
            if len(stored) == 0:
                print(tagname)
                print('descendant:', descendant)
                print('insertion_block: ', insertion_dict)
            # NOTE(review): this extends element-wise, so a non-empty string
            # would be added character by character -- confirm intent.
            selected_list += stored[-1]
        else:
            index[tagname] += 1
    return selected_list
# 親タグの塊に実際の子タグ以外の子タグ(親タグと同じ階層にある子タグ)が含まれてしまう問題の解決
def select_true_chidren(descendant, stack_list):
    """Return the newest stacked strings, at most one per child of `descendant`.

    `stack_list` is reversed so that the most recently pushed string comes
    first, then cut to as many entries as `descendant` has children.  A
    `descendant` without children therefore yields an empty list.

    return: list object (a new list; `stack_list` itself is not modified)
    """
    # One entry per child; only the number of children matters below.
    child_strings = [child.string for child in descendant.children]
    newest_first = stack_list[::-1]
    return newest_first[:len(child_strings)]
PK ! ) mathml2latex/process_each_tag/__init__.pyPK ! Z Z , mathml2latex/process_each_tag/maligngroup.py#! /usr/bin/env python3
# -*- coding:utf-8 -*-
def process_aligngroup():
    """Placeholder: does nothing and returns None."""
PK ! iZ Z + mathml2latex/process_each_tag/malignmark.py#! /usr/bin/env python3
# -*- coding:utf-8 -*-
def process_malignmark():
    """Placeholder: does nothing and returns None."""
PK ! G % mathml2latex/process_each_tag/math.py#! /usr/bin/env python3
# -*- coding:utf-8 -*-
def process_math(descendant, insertion_list):
    """Join the children of a `math` tag into one string.

    Children without a `.name` are ignored.  A child with a truthy
    `.string` contributes that string; every other child consumes the next
    entry of `insertion_list`, in order.

    return: str
    """
    pieces = []
    next_insert = 0
    for child in descendant.children:
        if not child.name:
            continue
        text = child.string
        if text:
            pieces.append(text)
            continue
        pieces.append(insertion_list[next_insert])
        next_insert += 1
    return ''.join(pieces)
PK ! S[{ { ( mathml2latex/process_each_tag/mfenced.py#! /usr/bin/env python3
# -*- coding:utf-8 -*-
# => 属性から区切り記号を取ってくる必要あり
def _mfenced_attr(descendant, name, default):
    """Return attribute `name` of `descendant`, or `default` when it is absent."""
    try:
        return descendant[name]
    except KeyError:
        return default


def process_mfenced(descendant, insertion_list):
    """Build the text for an `mfenced` tag: fences around separated children.

    descendant: the `mfenced` tag.  Its `open`, `close` and `separators`
        attributes are read with `descendant[...]`; children without a
        `.name` are ignored.
    insertion_list: already converted strings for the children whose
        `.string` is None, in the order those children appear.
    return: str (opening fence + separated children + closing fence)
    raises: IndexError when `insertion_list` has fewer entries than there
        are children without a string.

    Separators follow the MathML `mfenced` rules: the default is ',',
    whitespace inside the attribute is ignored, separators are placed only
    between arguments, the last one is repeated when there are too few,
    surplus ones are dropped, and an empty attribute means no separators.
    """
    pieces = []
    next_insert = 0
    for child in descendant.children:
        if not child.name:
            continue
        text = child.string
        if text is None:
            pieces.append(insertion_list[next_insert])
            next_insert += 1
        else:
            pieces.append(text)

    fence_left = _mfenced_attr(descendant, 'open', '(')
    fence_right = _mfenced_attr(descendant, 'close', ')')
    separators = [
        char for char in _mfenced_attr(descendant, 'separators', ',')
        if not char.isspace()
    ]

    gaps = len(pieces) - 1
    if not separators or gaps < 1:
        body = ''.join(pieces)
    else:
        # Repeat the last separator when too few were given; a negative
        # count adds nothing and `zip` below drops any surplus.
        separators += [separators[-1]] * (gaps - len(separators))
        body = pieces[0] + ''.join(
            sep + piece for sep, piece in zip(separators, pieces[1:])
        )
    return fence_left + body + fence_right
PK ! gi7 7 &