#! /usr/bin/env python
#! -*- coding:utf-8 -*-

# Copyright (c) 2007, PediaPress GmbH
# See README.rst for additional licensing information.

import sys
import inspect
import unicodedata

from mwlib.advtree import removeNewlines
from mwlib.advtree import (Article, ArticleLink, Big, Blockquote, Book, BreakingReturn, Caption, CategoryLink, Cell, Center, Chapter,
                           Cite, Code,DefinitionDescription, DefinitionList, DefinitionTerm, Deleted, Div, Emphasized, Gallery,
                           HorizontalRule, ImageLink, ImageMap, Inserted, InterwikiLink, Italic, Item, ItemList, LangLink, Link,
                           Math, NamedURL, NamespaceLink, Overline, Paragraph, PreFormatted, Reference, ReferenceList,
                           Row, Section, Small, Source, Span, SpecialLink, Strike, Strong, Sub, Sup, Table, Teletyped, Text, Timeline,
                           Underline, URL, Var)

from mwlib.treecleanerhelper import getNodeHeight, splitRow
from mwlib import parser
from mwlib.writer import styleutils, miscutils

def show(n):
    """Debug helper: dump node ``n`` to stdout using ``parser.show`` in verbose mode."""
    parser.show(sys.stdout, n, verbose=True)

def tryRemoveNode(node):
    """Detach ``node`` from its parent, if it has one.

    Returns True when the node was removed from its parent and False when
    the node has no parent, in which case nothing is changed.
    """
    parent = node.parent
    if parent is None:
        # explicit False instead of an implicit None keeps the return type consistent
        return False
    parent.removeChild(node)
    return True


def _all(list):
    for item in list:
        if item == False:
            return False
    return True

def _any(list):
    for x in list:
        if x:
            return True
    return False


class TreeCleaner(object):

    """The TreeCleaner object cleans the parse tree to optimize writer output.

    All transformations should be lossless.

    """


    # Names of the cleaner methods used by cleanAll(). clean() resolves each
    # name with getattr() and applies the methods in exactly this order, so the
    # order matters. Some methods are listed several times on purpose: they are
    # re-run to tidy up after the methods listed before them.
    cleanerMethods = ['cleanVlist',
                      'markInfoboxes',
                      'removeEditLinks',
                      'removeEmptyTextNodes',
                      'removeInvisibleLinks', 
                      'cleanSectionCaptions',
                      'removeChildlessNodes',
                      'removeNoPrintNodes',
                      'removeListOnlyParagraphs',
                      'removeInvalidFiletypes',
                      'fixParagraphs',
                      'simplifyBlockNodes',
                      'removeAbsolutePositionedNode',
                      'removeScrollElements',
                      'galleryFix',
                      'fixNesting',
                      'removeChildlessNodes',
                      'unNestEndingCellContent',
                      'removeCriticalTables',
                      'removeTextlessStyles', 
                      'removeBrokenChildren',
                      'fixTableColspans',
                      'removeEmptyTrailingTableRows',
                      'splitTableLists', 
                      'transformSingleColTables',
                      'splitTableToColumns', 
                      'linearizeWideNestedTables',
                      'removeBreakingReturns', 
                      'removeEmptyReferenceLists',
                      'swapNodes',
                      'removeBigSectionsFromCells',
                      'transformNestedTables',
                      'splitBigTableCells',
                      'limitImageCaptionsize',
                      'removeDuplicateLinksInReferences',
                      'fixItemLists',
                      'fixSubSup',
                      'removeLeadingParaInList',
                      'removeChildlessNodes', # methods above might leave empty nodes behind - clean up
                      'removeNewlines', # imported from advtree - clean up newlines that are not needed
                      'removeBreakingReturns',
                      'removeSeeAlso',
                      'buildDefinitionLists',
                      'restrictChildren',
                      'fixReferenceNodes',
                      'removeBrokenChildren',
                      'fixMathDir',
                      'fixNesting', # pull DefinitionLists out of Paragraphs
                      'fixPreFormatted',
                      'fixListNesting',
                      'handleOnlyInPrint',
                      'removeEmptyTextNodes',
                      'removeChildlessNodes', 
                      'removeBreakingReturns',
                      'removeEmptySections',
                      'markShortParagraph',
                      ]

    # method names that cleanAll() leaves out when the caller does not pass
    # its own (non-empty) skipMethods list
    skipMethods = []


    def __init__(self, tree, save_reports=False, nesting_strictness='loose', status_cb=None, rtl=False):
        """Init with parsetree and set up the cleaner configuration.

        The input tree needs to be an AdvancedTree, generated by advtree.buildAdvancedTree

        tree -- root node of the tree that is cleaned
        save_reports -- if True, the actions reported by the cleaner methods
            are collected and can be fetched with getReports()
        nesting_strictness -- 'loose' or 'strict'; selects the check used by
            _nestingBroken to detect forbidden nesting
        status_cb -- optional callable, invoked by clean() as
            status_cb(progress=<number>)
        rtl -- stored as self.rtl
        """

        self.tree = tree
        # list of actions by the treecleaner
        # each cleaner method has to report its actions
        # this helps debugging and testing the treecleaner
        self.reports = []

        # reports are only saved, if set to True
        self.save_reports = save_reports

        self.status_cb = status_cb
        self.rtl = rtl
        # list of nodes which do not require child nodes
        self.childlessOK = [ArticleLink, BreakingReturn, CategoryLink, Cell, Chapter, Code,
                            HorizontalRule, ImageLink, ImageMap, InterwikiLink, LangLink, Link, Math,
                            NamedURL, NamespaceLink, ReferenceList, Reference, SpecialLink, Text, Timeline, URL]
        # exceptions to the above. if any of the list items is explicitly set as a css style the node is not removed
        self.childless_exceptions = {Div: [u'width', u'height'],
                                     Span: [u'width', u'height'],}

        # FIXME: not used currently. remove if this is not used soon. could be used as reference
        # list nodes that apply styles to their children
        # FIXME: Center node might be problematic. Center is a block node and not inline
        self.inlineStyleNodes = [Big, Center, Cite, Code, Deleted, Emphasized, Inserted, Italic,
                                 Overline, Small, Strike, Strong, Sub, Sup, Teletyped, Underline, Var]


        # USED IN fixNesting if nesting_strictness == 'loose'
        # keys are nodes, that are not allowed to be inside one of the nodes in the value-list
        # ex: pull image links out of preformatted nodes
        # fixme rename to ancestors
        self.forbidden_parents = {ImageLink:[PreFormatted],
                                  ItemList:[Div, PreFormatted],
                                  # build a new list here: appending PreFormatted to
                                  # self.inlineStyleNodes itself would also change that list
                                  Source:self.inlineStyleNodes + [PreFormatted],
                                  DefinitionList:[Paragraph],
                                  Blockquote:[PreFormatted],
                                  Center:[PreFormatted],
                                  Paragraph:[PreFormatted],
                                  Section:[PreFormatted],
                                  Gallery:[PreFormatted, DefinitionDescription, DefinitionList, DefinitionTerm],
                                  Table:[DefinitionList, DefinitionDescription],
                                  PreFormatted: [Code],
                                  }

        # when checking nesting, some Nodes prevent outside nodes to be visible to inner nodes
        # ex: Paragraphs can not be inside Paragraphs. but if the inner paragraph is inside a
        # table which itself is inside a paragraph this is not a problem
        self.outsideParentsInvisible = [Table, Section, Reference]
        self.nesting_strictness = nesting_strictness # loose | strict


        # USED IN removeBrokenChildren
        # ex: delete preformatted nodes which are inside reference nodes,
        # all children of the preformatted node are kept
        self.removeNodes = {PreFormatted: [Reference, PreFormatted],
                            Cite: [Item, Reference],
                            Code: [PreFormatted],
                            ImageLink: [Reference],
                            Div: [Reference, Item],
                            Center:[Reference],
                            # Teletyped used to be listed twice ([Reference] and [Source]).
                            # A dict literal keeps only the last duplicate key, which
                            # silently dropped the Reference rule - both are merged here.
                            Teletyped:[Reference, Source],
                            ReferenceList: [Reference],
                            Table:[ImageLink],
                            Reference:[Reference],
                            Paragraph:[Gallery],
                            }

        self.removeNodesAllChildren = {Table:[ImageLink], # used to indicate that children should be removed
                                       }


        # ex: some tags need to be swapped: center nodes have to be pulled out of underline nodes
        # e.g. but only if the center is a direct and only child
        self.swapNodesMap = { Center:[Underline, Emphasized]} # { ChildClass: [ParentClass, ParentClass2]}


        # list of css classes OR id's which trigger the removal of the node from the tree
        # the following list is wikipedia specific
        self.noDisplayClasses = ['hiddenStructure',
                                 'dablink',
                                 'editlink',
                                 'metadata',
                                 'noprint',
                                 'portal',
                                 'sisterproject',
                                 'NavFrame',
                                 'geo-multi-punct',
                                 'coordinates_3_ObenRechts',
                                 'microformat',
                                 'navbox',
                                 'navbox-vertical',
                                 'Vorlage_Gesundheitshinweis',
                                 ]

        # keys are nodes which can only have child nodes of types inside the valuelist.
        # children of different classes are deleted
        self.allowedChildren = {Gallery: [ImageLink],
                                }

        # parameters handed to treecleanerhelper.getNodeHeight / splitRow by splitBigTableCells
        self.cell_splitter_params = {
            'maxCellHeight': (7*72) * 3/4 ,
            'lineHeight':  26,
            'charsPerLine': 40,
            'paragraphMargin': 2, # add 10 pt margin-safety after each node
            'imgHeight': 6, # approximate image height in units of lineHeights
            }


        self.style_nodes = [Italic, Emphasized, Strong, Overline, Underline, Sub, Sup, Small, Big, Var]

        # list of classes or IDs of table nodes which are split into their content. used by splitTableToColumns
        self.split_table_classIDs = ['mp-upper']

        # remove ImageLinks which end with the following file types
        self.forbidden_file_endings = ['ogg']

        # empty sections are removed by removeEmptySections
        # all node classes that have content but no text need to be listed here to prevent removal
        self.contentWithoutTextClasses = [Gallery, ImageLink]


    def clean(self, cleanerMethods):
        """Clean parse tree using cleaner methods in the methodList."""
        cleanerList = []
        for method in cleanerMethods:
            f = getattr(self, method, None)
            if f:
                cleanerList.append(f)
            else:
                raise 'TreeCleaner has no method: %r' % method

        # FIXME: performance could be improved, if individual articles would be cleaned
        # the algorithm below splits on the first level, if a book is found
        # --> if chapters are used, whole chapters are cleaned which slows things down

        if self.tree.__class__ == Book :
            children = self.tree.children
        else:
            children = [self.tree]

        total_children = len(children)
        for (i, child) in enumerate(children):
            for cleaner in cleanerList:
                try:
                    cleaner(child)
                except Exception, e:
                    self.report('ERROR:', e)
                    print 'TREECLEANER ERROR in %s: %r' % (getattr(child, 'caption', u'').encode('utf-8'),
                                                           repr(e))
                    import traceback
                    traceback.print_exc()
            if self.status_cb:
                self.status_cb(progress=100*i/total_children)

    def cleanAll(self, skipMethods=[]):
        """Clean parse tree using all available cleaner methods."""
        skipMethods = skipMethods or self.skipMethods
        self.clean([cm for cm in self.cleanerMethods if cm not in skipMethods])

    def report(self, *args):
        if not self.save_reports:
            return
        caller = inspect.stack()[1][3]
        msg = ''
        if args:
            msg = ' '.join([repr(arg) for arg in args])
        self.reports.append((caller, msg))

    def getReports(self):
        """Return the list of (caller name, message) tuples collected by report()."""
        return self.reports

    def removeNewlines(self, node):
        """Cleaner method wrapper around the removeNewlines function imported from advtree."""
        removeNewlines(node)

    def removeEmptyTextNodes(self, node):
        """Removes Text nodes which contain no text.

        A Text node is removed if its caption is empty, or if its caption
        consists of whitespace only and both the previous and the next
        sibling are block nodes. All other whitespace-only Text nodes are kept.
        """
        if node.__class__ == Text and node.parent:
            between_blocks = (node.previous and node.previous.isblocknode
                              and node.next and node.next.isblocknode)
            if (between_blocks and not node.caption.strip()) or not node.caption:
                self.report('removed empty text node')
                node.parent.removeChild(node)
                return
        # iterate over a copy: removing a child while looping over the live
        # child list would skip the sibling following the removed node
        for c in node.children[:]:
            self.removeEmptyTextNodes(c)

    def removeListOnlyParagraphs(self, node):
        """Replace Paragraphs whose children are all ItemLists by those lists.

        A Paragraph without any children is covered by this rule as well and
        is therefore replaced by nothing.
        """
        if node.__class__ == Paragraph:
            for child in node.children:
                if child.__class__ != ItemList:
                    break
            else:
                # no child broke the "lists only" rule
                if node.parent:
                    self.report('replaced children:', node, '-->', node.children, 'for node:', node.parent)
                    node.parent.replaceChild(node, node.children)

        for child in node.children[:]:
            self.removeListOnlyParagraphs(child)

    def removeChildlessNodes(self, node):
        """Remove nodes without children, unless their class is in childlessOK.

        Nodes whose class is listed in childless_exceptions are kept if one of
        the listed css properties is set in their style. The first child of a
        Section is never removed. If the removal would leave the ancestors
        without children, the topmost such ancestor is removed instead.
        """
        protected = False
        if node.__class__ in self.childless_exceptions and node.style:
            for style_type in self.childless_exceptions[node.__class__]:
                if style_type in node.style.keys():
                    protected = True
                    break

        removable = not node.children and node.__class__ not in self.childlessOK and not protected
        if removable:
            if node.parent.__class__ == Section and not node.previous:
                return # make sure that the first child of a section is not removed - this is the section caption
            victim = node
            # climb up as long as the node is an only child and the parent is allowed to vanish
            while victim.parent and not victim.siblings and victim.parent.__class__ not in self.childlessOK:
                victim = victim.parent
            if victim.parent:
                self.report('removed:', victim)
                victim.parent.removeChild(victim)

        for child in node.children[:]:
            self.removeChildlessNodes(child)
            
    # FIXME: this method is obsolete as of now. 'navbox' is now a member of noDisplayClasses, so such nodes are removed altogether
    def removeCriticalTables(self, node):
        """Remove problematic table nodes - keep content.

        A Table with the class/id 'navbox' is replaced by the content of its
        cells. The content is preserved if possible and only the outmost
        'container' table is removed: the content that is moved up is not
        searched for further tables.
        """

        if node.__class__ == Table and node.hasClassID(['navbox']):
            children = []
            for row in node.children:
                for cell in row:
                    for n in cell:
                        children.append(n)
            if node.parent:
                self.report('replaced child:', node, children)
                node.parent.replaceChild(node, children)
            return

        # iterate over a copy: a replaced table changes the child list of this
        # node, looping over the live list would skip or re-visit siblings
        for c in node.children[:]:
            self.removeCriticalTables(c)

    def fixTableColspans(self, node):
        """Fix erroneous colspan information and empty ending cells in tables.

        1. SINGLE CELL COLSPAN: if a row contains a single cell, the
           colspan (read from cell.attributes) is limited to the maximum
           table width; the corrected value is written to cell.vlist
        2. if the table consists of one single Row, childless cells at the
           end of this row are removed
        """
        def trailing_empty_cell(row):
            # False for a row without cells, the last cell if it has no
            # children, None otherwise
            if not row.children:
                return False
            candidate = row.children[-1]
            if candidate.children:
                return None
            return candidate

        if node.__class__ == Table:
            # SINGLE CELL COLSPAN
            maxwidth = 0
            for row in node.children:
                spans = [cell.attributes.get('colspan', 1) for cell in row.children]
                if len(spans) > 1:
                    rowwidth = sum(spans)
                else:
                    # a lonely cell counts as one column, whatever its colspan says
                    rowwidth = len(spans)
                maxwidth = max(maxwidth, rowwidth)

            for row in node.children:
                if len(row.children) != 1:
                    continue
                cell = row.children[0]
                colspan = cell.attributes.get('colspan', 1)
                if colspan and colspan > maxwidth:
                    self.report('fixed colspan from', cell.vlist.get('colspan', 'undefined'), 'to', maxwidth)
                    cell.vlist['colspan'] = maxwidth
            # /SINGLE CELL COLSPAN

            # FIX for: http://de.wikipedia.org/w/index.php?title=Benutzer:Volker.haas/Test&oldid=73993014
            if len(node.children) == 1 and node.children[0].__class__ == Row:
                only_row = node.children[0]
                cell = trailing_empty_cell(only_row)
                while cell:
                    cell.parent.removeChild(cell)
                    cell = trailing_empty_cell(only_row)
                    self.report('removed empty cell in single-row table')

        for child in node.children:
            self.fixTableColspans(child)

    def removeBrokenChildren(self, node):
        """Remove Nodes (while keeping their children) which can't be nested with their parents.

        self.removeNodes maps a node class to the ancestor classes it must not
        be nested in. Such a node is replaced by its children. If one of its
        ancestors is listed for the node class in self.removeNodesAllChildren,
        or if the node has no children, the node is removed completely.
        """
        if node.__class__ in self.removeNodes:
            ancestor_classes = [parent.__class__ for parent in node.parents]
            forbidden = self.removeNodes[node.__class__]
            if _any([cls in forbidden for cls in ancestor_classes]):
                drop_all = self.removeNodesAllChildren.get(node.__class__, [])
                if node.children and not _any([cls in drop_all for cls in ancestor_classes]):
                    children = node.children
                    self.report('replaced child', node, children)
                    node.parent.replaceChild(node, newchildren=children)
                else:
                    self.report('removed child', node)
                    node.parent.removeChild(node)

        # iterate over a copy: a child that is replaced or removed changes the
        # live child list, which made the loop skip siblings or visit nodes twice
        for c in node.children[:]:
            self.removeBrokenChildren(c)


    def transformSingleColTables(self, node):
        """Replace single column tables by their content.

        A single column table is transformed if it is long (and not marked as
        infobox), if its cells contain nothing but images, or if it contains
        a gallery. Depending on the amount and kind of content the cells are
        either turned into bordered Div nodes or the cell content is put
        directly in place of the table.
        """
        # "not 'box' in node.attr(class)" is a hack to detect infoboxes and thelike. they are not split into divs.
        # tables like this should be detected and marked in a separate module probably
        if node.__class__ == Table and node.numcols == 1:
            # the text and gallery scans walk the whole subtree, so they are
            # only done for single column tables and not for every node
            content_len = len(node.getAllDisplayText())
            is_long = content_len > 2500
            contains_gallery = len(node.getChildNodesByClass(Gallery)) > 0
            all_images = True
            for row in node.children:
                for cell in row.children:
                    for item in cell.children:
                        if item.__class__ != ImageLink:
                            all_images = False

            if (not getattr(node, 'isInfobox', False) and is_long) or all_images or contains_gallery:
                if not node.parents:
                    return
                divs = []
                items = []
                if content_len > 4000 \
                       or all_images \
                       or (len(node.getChildNodesByClass(Cell)) > 30 and content_len > 500) \
                       or (node.getChildNodesByClass(Section) and node.getChildNodesByClass(ImageLink) and content_len > 1000) \
                       or contains_gallery:
                    div_wrapper = False
                else:
                    div_wrapper = True
                for row in node:
                    for cell in row:
                        if div_wrapper:
                            d = Div()
                            d.border = 1
                            d.vlist = node.vlist
                            for item in cell:
                                d.appendChild(item)
                            divs.append(d)
                        else:
                            for item in cell:
                                items.append(item)
                parent = node.parent
                if div_wrapper:
                    parent.replaceChild(node, divs)
                    self.report('replaced single col table with div. div children:',  parent.children)
                else:
                    parent.replaceChild(node, items)
                    self.report('replaced single col table with items:',  parent.children)

        # iterate over a copy: replacing a table changes the live child list
        # of its parent, which made the loop skip siblings or visit nodes twice
        for c in node.children[:]:
            self.transformSingleColTables(c)
        
    def _getNext(self, node): #FIXME: name collides with advtree.getNext
        if not (node.next or node.parent):
            return
        next = node.next or node.parent.next
        if next and not next.isblocknode:
            if not next.getAllDisplayText().strip():
                return self._getNext(next)
        return next

    def _getPrev(self, node): #FIXME: name collides with advtree.getPrev(ious)
        if not (node.previous or node.parent):
            return
        prev = node.previous or node.parent 
        if prev and not prev.isblocknode:
            if not prev.getAllDisplayText().strip():
                return self._getPrev(prev)
        return prev

    def _nextAdjacentNode(self, node):
        if node and node.next:
            res = node.next.getFirstLeaf() or node.next
            return res
        if node.parent:
            return self._nextAdjacentNode(node.parent)
        return None


    def removeBreakingReturns(self, node): 
        """Remove BreakingReturns that occur around blocknodes or as the first/last element inside a blocknode.

        In addition a BreakingReturn is removed if the node following it in
        the tree is a BreakingReturn too, so that a run of consecutive breaks
        collapses into one.
        """
        if node.isblocknode:
            changed = True
            while changed:
                check_node = [node.getFirstLeaf(),
                             node.getLastLeaf(),
                             self._getNext(node),
                             self._getPrev(node)
                             ]
                changed = False
                for n in check_node:
                    if n.__class__ == BreakingReturn:
                        self.report('removing node', n)
                        # only look again if something was removed - a node that
                        # can not be removed would keep this loop running forever
                        if tryRemoveNode(n):
                            changed = True

        if node.__class__ == BreakingReturn:
            next_node = self._nextAdjacentNode(node)
            if next_node.__class__ == BreakingReturn:
                node.parent.removeChild(node)


        # iterate over a copy: removing a break while looping over the live
        # child list skips the following sibling, which left runs of
        # consecutive breaks only partly collapsed
        for c in node.children[:]:
            self.removeBreakingReturns(c)


    def _fixParagraphs(self, node):
        """Move the first Paragraph found directly after a Section into that section.

        The paragraph is moved to the last child of the preceding Section.
        Returns True as soon as one paragraph was moved, so that the caller
        can restart on the changed tree; otherwise nothing is returned.
        """
        misplaced = (isinstance(node, Paragraph)
                     and isinstance(node.previous, Section)
                     and node.previous is not node.parent)
        if not misplaced:
            for child in node.children[:]:
                if self._fixParagraphs(child):
                    return True
            return

        target = node.previous.getLastChild()
        self.report('moving node', node, 'to', target)
        node.moveto(target)
        return True # changed

    def fixParagraphs(self, node):
        """Call _fixParagraphs repeatedly until it reports no further change."""
        # _fixParagraphs stops after each single move, therefore it is restarted
        # on the changed tree until nothing is left to move
        while self._fixParagraphs(node):
            pass

    def _nestingBroken(self, node):
        """Return the ancestor in which ``node`` must not be nested, or None.

        The ancestors are checked in the reverse order of node.getParents().
        The check stops at the first ancestor whose class is listed in
        outsideParentsInvisible; this ancestor and everything after it is
        ignored.

        'loose':  an ancestor is forbidden if its class is listed for the
                  class of the node in forbidden_parents
        'strict': block nodes (except Sections) must not be inside any other
                  block node
        """
        # FIXME: the list below is used and not node.isblocknode. is there a reason for that?
        blocknodes = (Paragraph, PreFormatted, ItemList, Section, Table,
                      Blockquote, DefinitionList, HorizontalRule, Source)
        ancestors = node.getParents()
        ancestors.reverse()
        visible = []
        for ancestor in ancestors:
            if ancestor.__class__ in self.outsideParentsInvisible:
                break
            visible.append(ancestor)

        if self.nesting_strictness == 'loose':
            forbidden = self.forbidden_parents.get(node.__class__, [])
            for ancestor in visible:
                if ancestor.__class__ in forbidden:
                    return ancestor
        elif self.nesting_strictness == 'strict':
            if node.__class__ != Section and node.__class__ in blocknodes:
                for ancestor in visible:
                    if ancestor.__class__ in blocknodes:
                        return ancestor
        return None
           

    def _markNodes(self, node, divide, problem_node=None):
        got_divide = False
        for c in node.children:
            if getattr(node, 'nesting_pos', None):
                c.nesting_pos = node.nesting_pos
                continue
            if c in divide:
                got_divide = True
                if c == problem_node:
                    c.nesting_pos = 'problem'
                continue
            if not got_divide:
                c.nesting_pos = 'top'
            else:
                c.nesting_pos = 'bottom'
        for c in node.children:
            self._markNodes(c, divide, problem_node=problem_node)

    def _cleanUpMarks(self, node):
        if hasattr(node, 'nesting_pos'):
            del node.nesting_pos
        for c in node.children:
            self._cleanUpMarks(c)
            
    def _filterTree(self, node, nesting_filter=[]):
        if getattr(node, 'nesting_pos', None) in nesting_filter:
            node.parent.removeChild(node)
            return
        for c in node.children[:]:
            self._filterTree(c, nesting_filter=nesting_filter)

    def _isException(self, node):
        try:
            has_direction = node.vlist['style']['direction']
        except (KeyError, AttributeError, TypeError):
            return False
        else:
            return True


    def _fixNesting(self, node):
        """Nesting of nodes is corrected.

        The strictness depends on nesting_strictness which can either be 'loose' or 'strict'.
        Depending on the strictness the _nestingBroken method uses different approaches to
        detect forbidden nesting.

        Example for 'strict' setting: (bn --> blocknode, nbn --> nonblocknode)
        bn_1
         nbn_2
         bn_3
         nbn_4

        becomes:
        bn_1.1
         nbn_2
        bn_3
        bn_1.2
         nbn_4

        Only one problem is fixed per call. Returns True if the tree was
        changed, so that the caller can restart on the changed tree.
        Nodes for which _isException is true are skipped together with
        everything below them (nothing is returned in that case).
        """

        if self._isException(node):
            return

        bad_parent = self._nestingBroken(node)
        if not bad_parent:
            # this node is fine - look for a problem further down
            for c in node.children:
                if self._fixNesting(c):
                    return True
            return False

        # the path from the root down to (and including) the problem node
        # divides the subtree of bad_parent into a top and a bottom part
        divide = node.getParents()
        divide.append(node)
        self._markNodes(bad_parent, divide, problem_node=node)

        # three copies of bad_parent are made, each one is reduced to one part
        # NOTE(review): this relies on copy() carrying over the nesting_pos marks - confirm
        top_tree = bad_parent.copy()
        self._filterTree(top_tree, nesting_filter=['bottom', 'problem'])
        middle_tree = bad_parent.copy()
        self._filterTree(middle_tree, nesting_filter=['top', 'bottom'])
        # drop the copy of bad_parent itself: only its first remaining child is kept
        middle_tree = middle_tree.children[0]
        bottom_tree = bad_parent.copy()
        self._filterTree(bottom_tree, nesting_filter=['top', 'problem'])
        new_tree = [part for part in [top_tree, middle_tree, bottom_tree] if part != None]

        self.report('moved', node, 'from', bad_parent)
        parent = bad_parent.parent
        parent.replaceChild(bad_parent, new_tree)
        # remove the temporary marks again
        self._cleanUpMarks(parent)
        return True
    
    def fixNesting(self, node):
        """Call _fixNesting repeatedly until it reports no further change."""
        # _fixNesting fixes one problem per call, therefore it is restarted
        # on the changed tree until nothing is left to fix
        while self._fixNesting(node):
            pass
        
   
    # ex: some tags need to be swapped: center nodes have to be pulled out of underline nodes
    # e.g. but only if the center is a direct and only child
    def swapNodes(self, node):
        """Swaps two nodes if nesting is problematic.

        Some writers have problems with some node combinations
        ex. <u><center>Text</center></u> --> <center><u>Text</u></center>

        swapNodesMap lists the parent classes for each child class. The swap
        is only done if the node is the only child of its parent and the
        parent itself has a parent.
        """
        swappable_parents = self.swapNodesMap.get(node.__class__)
        if swappable_parents is not None:
            outer = node.parent
            if outer and outer.parent and outer.__class__ in swappable_parents and len(outer.children) == 1:
                self.report('swapping nodes:', node.parent, node)
                assert len(outer.children) == 1 and outer.children[0] is node and node.parent is outer and outer.parent is not None
                # put the node in the place of its parent ...
                grandparent = outer.parent
                grandparent.replaceChild(outer, [node])
                outer.children = [] # outer.removeChild(node) wouldn't work since check for node.parent which already is grandparent fails
                # ... hand the children of the node over to the former parent ...
                for grandchild in node.children:
                    outer.appendChild(grandchild)
                # ... and make the former parent the only child of the node
                node.children = []
                node.appendChild(outer)

        for child in node.children[:]:
            self.swapNodes(child)

    def removeBigSectionsFromCells(self, node):
        """Move very big sections out of table cells.

        It can be assumed that they were not intentionally put inside the
        table. A Section that is a direct child of a Cell is moved (with
        moveto) to the last entry of the Table parents of the cell, if both
        the cell and the section have more than 2000 characters of text.
        """
        if node.__class__ == Cell:
            sections = [child for child in node.children if child.__class__ == Section]
            if len(node.getAllDisplayText()) > 2000 and sections:
                for section in sections:
                    if len(section.getAllDisplayText()) <= 2000:
                        continue
                    parentTable = node.getParentNodesByClass(Table)[-1]
                    self.report('move big section out of table')
                    section.moveto(parentTable)

        for child in node.children:
            self.removeBigSectionsFromCells(child)

    def transformNestedTables(self, node):
        """Remove container tables that only contain large nested tables.

        Only tables that are not nested in another table are checked:
        - a table with one child and one column, whose first cell contains
          nothing but a table, is replaced by that table
        - a table whose cells contain tables with more than 500 characters of
          text, and no other text, is replaced by its captions followed by
          the nested tables
        """
        if node.__class__ == Table and node.parent and not node.getParentNodesByClass(Table):

            # remove tables which only contain a single table
            if len(node.children) == 1 and node.numcols == 1:
                inner = node.children[0].children[0].children
                if len(inner) == 1 and inner[0].__class__ == Table:
                    node.parent.replaceChild(node, inner)
                    return

            parent = node.parent
            captions = [child for child in node.children if child.__class__ == Caption]
            tables = []
            non_tables = []
            for row in node.children:
                if row.__class__ != Row:
                    continue
                for cell in row.children:
                    for item in cell.children:
                        if item.__class__ == Table:
                            tables.append(item)
                        else:
                            non_tables.append(item)

            non_tables_text = None
            if non_tables:
                non_tables_text = ''.join([item.getAllDisplayText() for item in non_tables]).strip()
            tables_text = None
            if tables:
                tables_text = ''.join([item.getAllDisplayText() for item in tables]).strip()

            if tables and (len(tables_text) > 500) and not non_tables_text:
                # the captions are kept and put in front of the tables
                tables[:0] = captions
                parent.replaceChild(node, tables)
                self.report('removed container table around large tables', node, tables)
                return

        for child in node.children:
            self.transformNestedTables(child)
    
            
            
    def splitBigTableCells(self, node):
        """Splits table cells if their height exceeds the output page height.

        This method is only needed for writers that output on a paginated medium.
        Often these writers can not handle tables where a single cell exceeds the page height.
        Using heuristics in the treecleanerhelper.getNodeHeight function the height of a cell
        is estimated. If a cell with more than one child is too high, its row is
        replaced by the rows returned by treecleanerhelper.splitRow.
        Rows are not searched for nested rows.
        """
        if node.__class__ != Row:
            for child in node.children[:]:
                self.splitBigTableCells(child)
            return

        for cell in node.children:
            height = getNodeHeight(cell, self.cell_splitter_params)
            too_high = height > self.cell_splitter_params['maxCellHeight']
            if too_high and len(cell.children) > 1:
                rows = splitRow(node, self.cell_splitter_params)
                self.report('replacing child', node, rows)
                node.parent.replaceChild(node, rows)
                return


    def _getNamedRefs(self, node):
        """Return the Reference nodes below node, plus node itself if it qualifies, that carry a non-empty 'name' attribute."""
        candidates = node.getChildNodesByClass(Reference) + [node]
        return [candidate for candidate in candidates
                if candidate.__class__ == Reference and candidate.attributes.get('name')]

    def _safeRemove(self, node, named_refs):
        if node in named_refs:
            node.no_display = True
            return
        for ref in named_refs:
            ref.no_display = True
            table_parents = node.getParentNodesByClass(Table)
            if table_parents:
                ref.moveto(table_parents[0], prefix=True)
            else:
                ref.moveto(node, prefix=True)
        node.parent.removeChild(node)
            
    def removeNoPrintNodes(self, node):
        if (node.hasClassID(self.noDisplayClasses) or not node.visible) and node.parent:
            named_refs = self._getNamedRefs(node)
            if named_refs:
                self.report('removing child - keeping named reference', node)
                self._safeRemove(node, named_refs)
            else:
                self.report('removing child', node)
                node.parent.removeChild(node)
            return

        for c in node.children[:]:
            self.removeNoPrintNodes(c)


    def cleanSectionCaptions(self, node):
        """Clean up the caption (first child) of Section nodes.

        * A Section that has parents and whose caption has no display text
          is replaced in its parent by a BreakingReturn, followed by all of
          the section's children if there is more than just the caption.
        * Block nodes inside the caption of any Section are dissolved: each
          is replaced by its own children.

        A Section with parents but without children is only reported.
        """

        if node.__class__ == Section and node.parents:
            if not node.children:
                self.report('section contained no children')
                return
            if not node.children[0].getAllDisplayText():
                children = [BreakingReturn()]
                if len(node.children) > 1: # at least one "content" node
                    children.extend(node.children)
                self.report('replaced section with empty title with br node')
                node.parent.replaceChild(node, children)

        # The children check avoids an IndexError for a childless Section
        # that has no parents (the branch above does not cover that case).
        if node.__class__ == Section and node.children:
            caption_node = node.children[0]
            # Snapshot the descendants first: the caption subtree is modified
            # inside the loop, which must not disturb the walk over it.
            children = list(caption_node.getAllChildren())
            for c in children:
                if c.isblocknode:
                    self.report('removed block node', c)
                    c.parent.replaceChild(c, c.children)

        for c in node.children[:]:
            self.cleanSectionCaptions(c)
            

    def buildDefinitionLists(self, node):
        """Collect DefinitionTerm and DefinitionDescription nodes in DefinitionList nodes.

        Such a node is moved behind the last child of a directly preceding
        DefinitionList; if there is none, a new DefinitionList takes the
        node's place and the node becomes its child. Nodes that contain an
        ItemList or are located inside one are left alone and not descended
        into.
        """
        if node.__class__ in (DefinitionTerm, DefinitionDescription):
            list_related = node.getChildNodesByClass(ItemList) or node.getParentNodesByClass(ItemList)
            if list_related:
                return
            previous = node.getPrevious()
            container = node.getParent()
            if previous.__class__ != DefinitionList:
                definition_list = DefinitionList()
                container.replaceChild(node, [definition_list])
                definition_list.appendChild(node)
                self.report('created new definition list')
            else:
                node.moveto(previous.getLastChild())
                self.report('moved node to prev. definition list')

        for child in node.children[:]:
            self.buildDefinitionLists(child)


    def restrictChildren(self, node):

        if node.__class__ in self.allowedChildren.keys():
            for c in node.children[:]:
                if c.__class__ not in self.allowedChildren[node.__class__]:
                    node.removeChild(c)
                    self.report('removed restricted child %s from parent %s' % (c, node))
            return 

        for c in node.children:
            self.restrictChildren(c)


    def simplifyBlockNodes(self, node):
        """Remove paragraphs which have a single block node child - keep the child

        A Paragraph without a parent is left untouched.
        """
        if node.__class__ == Paragraph:
            if len(node.children) == 1 and node.children[0].isblocknode:
                if node.parent:
                    node.parent.replaceChild(node, [node.children[0]])
                    self.report('remove superfluous wrapping paragraph from node:', node.children[0])

        # Iterate over a copy: a child paragraph may replace itself in
        # node.children during the recursive call, and iterating the live
        # list would then skip the sibling that follows it.
        for c in node.children[:]:
            self.simplifyBlockNodes(c)

    def removeTextlessStyles(self, node):
        """Remove style nodes that have no children with text"""
        if node.__class__ in self.style_nodes:
            if not node.getAllDisplayText().strip() and node.parent:
                if node.children:
                    node.parent.replaceChild(node, newchildren=node.children)
                    self.report('remove style', node, 'with text-less children', node.children )
                else:
                    node.parent.removeChild(node)
                    self.report('removed style without children', node)
                return

        for c in node.children[:]:
            self.removeTextlessStyles(c)
        

    def removeInvisibleLinks(self, node):
        """Remove category and language links that are not displayed in the text.

        A CategoryLink or LangLink whose colon attribute is false and which
        has a parent is removed and not descended into.
        """

        link_class = node.__class__ in (CategoryLink, LangLink)
        if link_class and not node.colon and node.parent:
            node.parent.removeChild(node)
            self.report('remove invisible link', node)
            return

        for child in node.children[:]:
            self.removeInvisibleLinks(child)
          

    def fixPreFormatted(self, node):
        """Rearrange PreFormatted nodes.

        A PreFormatted node with a parent and without display text (after
        stripping) is removed. Otherwise every descendant whose caption
        contains newlines is replaced by one Text node per line, separated
        by BreakingReturn nodes. PreFormatted nodes are not descended into
        by the traversal itself.
        """
        if node.__class__ == PreFormatted:
            if not node.getAllDisplayText().strip() and node.parent:
                node.parent.removeChild(node)
                self.report('removed empty preformatted', node)
                # nothing left to rearrange in a node that was just removed
                return
            # Snapshot the descendants first: nodes are replaced inside the
            # loop, which must not disturb the walk over the subtree.
            children = list(node.getAllChildren())
            for c in children:
                lines = c.caption.split('\n')
                if len(lines) > 1:
                    text_nodes = []
                    for line in lines:
                        t = Text(line)
                        text_nodes.append(t)
                        text_nodes.append(BreakingReturn())
                    text_nodes.pop()  # remove last BR
                    c.parent.replaceChild(c, text_nodes)
            return

        # Iterate over a copy: an empty PreFormatted child removes itself
        # from node.children, and iterating the live list would then skip
        # the sibling that follows it.
        for c in node.children[:]:
            self.fixPreFormatted(c)
            
    def fixListNesting(self, node):
        """workaround for #81

        An ItemList whose only item contains nothing but another ItemList is
        replaced by a DefinitionDescription wrapping that inner list.
        """
        if node.__class__ == ItemList and len(node.children) == 1:
            item = node.children[0]
            if len(item.children) == 1 and item.children[0].__class__ == ItemList:
                dd = DefinitionDescription()
                dd.appendChild(item.children[0])
                node.parent.replaceChild(node, [dd])
                self.report('transformed indented list item', node)

        # Iterate over a copy: a child list may replace itself in
        # node.children during the recursive call, and iterating the live
        # list would then skip the sibling that follows it.
        for c in node.children[:]:
            self.fixListNesting(c)


    def linearizeWideNestedTables(self, node):
        """Dissolve the tables wrapped around a wide nested table.

        When a Table with more than 15 columns is nested in other tables,
        each enclosing table is replaced in its parent by the flat list of
        everything its cells contain. Tables flagged as infobox are skipped
        and not descended into.
        """
        if node.__class__ == Table:
            if getattr(node, 'isInfobox', False):
                return
            wrappers = node.getParentNodesByClass(Table)
            if wrappers and node.numcols > 15:
                while wrappers:
                    wrapper = wrappers.pop(0)
                    cell_items = [item
                                  for row in wrapper.children
                                  for cell in row.children
                                  for item in cell.children]
                    self.report('wide nested table linearized. wrapper:', node, ' replaced by items:', cell_items)
                    wrapper.parent.replaceChild(wrapper, cell_items)

        for child in node.children:
            self.linearizeWideNestedTables(child)


    def _isBigCell(self, cell):
        """Tell whether a table cell counts as big.

        A cell is big if any of the following holds:
        * its display text is longer than 5000 divided by (1 + number of
          ImageLinks in the cell)
        * it contains a Table with more than 30 columns or at least 25
          children
        * it contains an ItemList with more than 25 children
        """
        text_length = len(cell.getAllDisplayText())
        image_divisor = 1 + len(cell.getChildNodesByClass(ImageLink))
        if text_length > 5000/image_divisor:
            return True

        for table in cell.getChildNodesByClass(Table):
            if table.numcols > 30 or len(table.children) >= 25:
                return True

        for itemlist in cell.getChildNodesByClass(ItemList):
            if len(itemlist.children) > 25:
                return True

        return False


            
    def splitTableToColumns(self, node):
        """Replace a table by its linearized column content if it is unsuitable as a table.

        A Table that is not flagged as infobox is split if one of these
        holds:
        * one of its cells is big according to _isBigCell
        * it has two columns and contains at least three bordered nested
          tables whose enclosing cell does not have a colspan of 2
        * it carries one of self.split_table_classIDs
        * it has at least three columns and more than 2500 characters of
          display text, every column contains an ItemList and at least one
          column contains a Section or Big node

        The content is emitted column by column: all items of the first
        column, then all items of the second column, and so on.
        """
        if node.__class__ == Table and not getattr(node, 'isInfobox', False):
            big_cells = [self._isBigCell(cell)
                         for row in node.children
                         for cell in row.children]
            split_table = any(big_cells)

            if node.numcols == 2 and not split_table:
                bordered_tables = 0
                for nested in node.getChildNodesByClass(Table):
                    if not styleutils.tableBorder(nested):
                        continue
                    if nested.getParentNodesByClass(Cell)[0].colspan != 2:
                        bordered_tables += 1
                if bordered_tables >= 3:
                    split_table = True

            if node.hasClassID(self.split_table_classIDs):
                split_table = True

            if node.numcols >= 3 and len(node.getAllDisplayText())>2500:
                # table in "impact" section of http://en.wikipedia.org/wiki/Futurama
                col_has_heading = [False]*node.numcols
                col_has_list = [False]*node.numcols
                for row in node.children:
                    for col_idx, cell in enumerate(row.children):
                        if cell.getChildNodesByClass(Section) or cell.getChildNodesByClass(Big):
                            col_has_heading[col_idx] = True
                        if cell.getChildNodesByClass(ItemList):
                            col_has_list[col_idx] = True
                if any(col_has_heading) and all(col_has_list):
                    split_table = True

            if split_table:
                columns = [[] for _ in range(node.numcols)]
                for row in node.children:
                    for col_idx, cell in enumerate(row.children):
                        columns[col_idx].extend(cell.children)

                linearized = [item for column in columns for item in column]
                self.report('removed table. outputting linearize columns')
                node.parent.replaceChild(node, linearized)

        for child in node.children[:]:
            self.splitTableToColumns(child)

    def fixReferenceNodes(self, node):
        """Normalize named Reference nodes below node.

        First pass: surrounding double quotes are stripped from reference
        names (the cleaned name is stored in vlist['name']), and for each
        name the children of the first reference that has any are recorded.

        Second pass: for every name with recorded content, the first
        reference of that name ends up holding the content - if it has no
        children the recorded ones are appended to it. Later references of
        the same name that have children get them cleared.
        """
        ref_nodes = node.getChildNodesByClass(Reference)
        name2children = {}
        for ref_node in ref_nodes:
            ref_name = ref_node.attributes.get('name')
            if ref_name and ref_name != ref_name.strip('"'):
                ref_name = ref_name.strip('"')
                # presumably attributes reflects vlist, so the second pass
                # sees the stripped name - verify against advtree
                ref_node.vlist['name'] = ref_name
            # "in" replaces the deprecated dict.has_key (removed in Python 3)
            if ref_name and ref_node.children and ref_name not in name2children:
                name2children[ref_name] = ref_node.children

        ref_defined = {}
        for ref_node in ref_nodes:
            ref_name = ref_node.attributes.get('name')
            if not ref_name or ref_name not in name2children:
                continue
            if ref_node.children:
                if ref_defined.get(ref_name): # del children
                    ref_node.children = []
                else:
                    ref_defined[ref_name] = True
            else:
                if not ref_defined.get(ref_name): # move ref here
                    children = name2children[ref_name]
                    for child in children:
                        ref_node.appendChild(child)
                    ref_defined[ref_name] = True

    def removeEmptyReferenceLists(self, node):
        """
        Remove sections that contain nothing but an empty ReferenceList.

        For a ReferenceList inside sections, the first Section returned by
        getParentNodesByClass is removed from its parent if none of its
        children after the caption has any display text.
        """
        if node.__class__ == ReferenceList:
            enclosing_sections = node.getParentNodesByClass(Section)
            if enclosing_sections:
                section = enclosing_sections[0]
                body_text = ''.join([child.getAllDisplayText().strip()
                                     for child in section.children[1:]])
                if not body_text.strip() and section.parent:
                    section.parent.removeChild(section)
                    self.report('removed empty reference list')

        for child in node.children:
            self.removeEmptyReferenceLists(child)



    def removeDuplicateLinksInReferences(self, node):
        """Remove repeated links inside a Reference node.

        NamedURL, URL and ArticleLink nodes below a Reference are compared
        by their caption attribute; links without a caption are ignored.
        The first link with a given caption is kept, later ones are removed
        from their parent.
        """
        if node.__class__ == Reference:
            links = node.getChildNodesByClass(NamedURL)
            for link_class in (URL, ArticleLink):
                links.extend(node.getChildNodesByClass(link_class))

            known_targets = set()
            for link in links:
                target = getattr(link, 'caption', None)
                if not target:
                    continue
                if target in known_targets:
                    link.parent.removeChild(link)
                else:
                    known_targets.add(target)

        for child in node.children:
            self.removeDuplicateLinksInReferences(child)
                
    def removeInvalidFiletypes(self, node):
        """Remove ImageLinks whose target ends with one of self.forbidden_file_endings.

        A removed ImageLink is not descended into.
        """
        if node.__class__ == ImageLink:
            # endswith accepts a tuple of suffixes: the node is removed
            # exactly once even if several endings match its target
            if node.target.endswith(tuple(self.forbidden_file_endings)):
                self.report("removed invalid 'image' type with target %r" % (node.target,))
                node.parent.removeChild(node)
                return

        # Iterate over a copy: a child ImageLink removes itself from
        # node.children, and iterating the live list would then skip the
        # sibling that follows it.
        for c in node.children[:]:
            self.removeInvalidFiletypes(c)

    def limitImageCaptionsize(self, node):
        """Remove all BreakingReturn nodes from ImageLinks with more than 500 characters of display text."""

        if node.__class__ == ImageLink and len(node.getAllDisplayText()) > 500:
            line_breaks = node.getChildNodesByClass(BreakingReturn)
            for line_break in line_breaks:
                line_break.parent.removeChild(line_break)
            if line_breaks:
                self.report('removed BreakingReturns from long image caption')

        for child in node.children:
            self.limitImageCaptionsize(child)

    def removeLeadingParaInList(self, node):
        """Replace a Paragraph that is the first child of an Item or Reference node by the paragraph's children."""

        if node.__class__ in (Item, Reference) and node.children:
            leading = node.children[0]
            if leading.__class__ == Paragraph:
                node.replaceChild(leading, leading.children)
                self.report('remove leading Paragraph in Item')

        for child in node.children:
            self.removeLeadingParaInList(child)


    def fixItemLists(self, node):
        """Wrap every direct child of an ItemList that is not an Item in a new Item node."""
        if node.__class__ == ItemList:
            # Iterate over a copy: replaceChild modifies node.children, and
            # iterating the live list could skip the child that follows a
            # wrapped one.
            for child in node.children[:]:
                if child.__class__ != Item:
                    i = Item()
                    node.replaceChild(child, [i])
                    i.appendChild(child)
                    self.report('ItemList contained %r. node wrapped in Item node' % child.__class__.__name__)

        for c in node.children:
            self.fixItemLists(c)
    

    def _isEmptyRow(self, row):
        for cell in row.children:
            if cell.children:
                return False
        return True

    def removeEmptyTrailingTableRows(self, node):
        """Strip rows from the end of a Table as long as the last one is empty according to _isEmptyRow."""

        if node.__class__ == Table:
            while node.children:
                last_row = node.children[-1]
                if not self._isEmptyRow(last_row):
                    break
                node.removeChild(last_row)
                self.report('remove emtpy trailing table row')

        for child in node.children:
            self.removeEmptyTrailingTableRows(child)


    def removeEmptySections(self, node):
        """Remove Section nodes without content.

        Only sections that have a parent and are not located inside a Table
        are considered. A section is removed if it has exactly one child,
        or if it neither contains a node of one of
        self.contentWithoutTextClasses nor has display text in any child
        after the first one. Removed sections are not descended into.
        """
        if node.__class__ == Section and node.parent and not node.getParentNodesByClass(Table):
            if len(node.children) == 1:
                node.parent.removeChild(node)
                self.report('removed empty section')
                return

            has_content = any(node.getChildNodesByClass(klass)
                              for klass in self.contentWithoutTextClasses)
            if not has_content:
                has_content = any(child.getAllDisplayText()
                                  for child in node.children[1:])

            if not has_content:
                self.report('removing empty section')
                node.parent.removeChild(node)
                return

        for child in node.children[:]:
            self.removeEmptySections(child)


    def _splitRow(self, node, max_items, all_items):
        cells = node.children
        node.children = []
        for row_index in range(max_items):
            nr = node.copy()
            for (col_index, cell) in enumerate(cells):
                try:
                    content = all_items[col_index][row_index]
                except IndexError:
                    content = None
                cell.children = []
                nc = cell.copy()
                nc.compact = True
                if content:
                    item_list = ItemList()
                    item_list.appendChild(content)
                    item_list.compact = True
                    nc.appendChild(item_list)
                nr.appendChild(nc)                        
            nr.moveto(node, prefix=True)
            if row_index < max_items-1:
                nr.suppress_bottom_border = True
        node.parent.removeChild(node)

    def splitTableLists(self, node):
        """Split a table row that contains only item lists into multiple rows.

        A Row qualifies if no cell has a rowspan greater than 1 and every
        cell contains nothing but ItemList nodes. Items with fewer than two
        ItemList ancestors are collected per cell; if some cell holds more
        than five of them, the row is split via _splitRow and not descended
        into.
        """

        if node.__class__ == Row:
            splittable = True
            longest = 0
            items_per_cell = []
            for cell in node.children:
                if cell.rowspan > 1:
                    splittable = False
                    break
                cell_items = [item for item in cell.getChildNodesByClass(Item)
                              if len(item.getParentNodesByClass(ItemList)) < 2]
                longest = max(longest, len(cell_items))
                items_per_cell.append(cell_items)
                if any(content.__class__ != ItemList for content in cell):
                    splittable = False
                    break
            if splittable and longest > 5:
                self._splitRow(node, longest, items_per_cell)
                self.report('splitting list only table row')
                return

        for child in node.children:
            self.splitTableLists(child)
                


    def markShortParagraph(self, node):
        """Flag short paragraphs so that writers can treat them specially.

        short_paragraph is set to True on Paragraph nodes with fewer than 80
        characters of display text that are not inside a Table and have no
        block node among their direct children.
        """
        if node.__class__ == Paragraph and len(node.getAllDisplayText()) < 80:
            if not node.getParentNodesByClass(Table):
                if not _any([child.isblocknode for child in node.children]):
                    node.short_paragraph = True

        for child in node.children:
            self.markShortParagraph(child)

    def handleOnlyInPrint(self, node):
        '''Remove nodes with the css class "printonly" which contain URLs.

        printonly nodes are used in citations for example to explicitly print out URLs.
        Since we handle URLs differently, we can ignore printonly nodes

        A printonly node is removed if one of its direct children is a URL,
        NamedURL, ArticleLink, NamespaceLink, InterwikiLink or SpecialLink;
        removed nodes are not descended into.
        '''
        if 'printonly' in node.attributes.get('class', ''):
            if _any([c.__class__ in [URL,
                                     NamedURL,
                                     ArticleLink,
                                     NamespaceLink,
                                     InterwikiLink,
                                     SpecialLink] for c in node.children]):
                self.report('removed "printonly" node:', node)
                node.parent.removeChild(node)
                return
        # Iterate over a copy: a printonly child removes itself from
        # node.children, and iterating the live list would then skip the
        # sibling that follows it.
        for c in node.children[:]:
            self.handleOnlyInPrint(c)


    def markInfoboxes(self, node):
        """Flag the infobox tables of each Article with isInfobox = True.

        Every table for which miscutils.hasInfoboxAttrs is true gets
        flagged. If no table was flagged that way, the first table is
        flagged when miscutils.articleStartsWithTable (with
        max_text_until_infobox=200) says so. Nothing is flagged for
        articles whose ns attribute is 100. Articles are not descended
        into.
        """
        if node.__class__ != Article:
            for child in node.children:
                self.markInfoboxes(child)
            return

        namespace = getattr(node, 'ns', 0)
        tables = node.getChildNodesByClass(Table)
        marked_any = False
        for table in tables:
            if miscutils.hasInfoboxAttrs(table) and namespace != 100:
                table.isInfobox = True
                marked_any = True
        if marked_any or not tables:
            return
        if miscutils.articleStartsWithTable(node, max_text_until_infobox=200) and namespace != 100:
            tables[0].isInfobox = True


    def removeAbsolutePositionedNode(self, node):
        def pos(n):
            return n.style.get('position', '').lower().strip()

        if pos(node) == 'relative':
            if all([pos(c) == 'absolute' for c in node.children]):
                if node.parent:
                    node.parent.removeChild(node)
                    self.report('removed absolute positioned node', node)
                    return

        for c in node.children:
            self.removeAbsolutePositionedNode(c)


    def _unNestCond(self, node):
        """Return True if node contains a Table with more than 20 children."""
        for table in node.getChildNodesByClass(Table):
            if len(table.children) > 20:
                return True
        return False


    def unNestEndingCellContent(self, node):
        '''Move the content of a table-wide last cell behind the table.

        Applies to tables that are not nested in another table and whose
        last row consists of a single Cell spanning all columns. If that
        cell satisfies _unNestCond, its content is moved into a Div (border
        1, sharing the cell's vlist) which is placed right after the table.

        example: http://de.wikipedia.org/w/index.php?title=Bahnstrecke_Berlin%E2%80%93Dresden&oldid=72891289
        '''
        if node.__class__ == Table and not node.getParentNodesByClass(Table):
            if not node.children:
                return
            final_row = node.children[-1]
            if not final_row or len(final_row.children) != 1:
                return
            final_cell = final_row.children[0]
            spans_table = final_cell.__class__ == Cell and final_cell.colspan == node.numcols
            if not spans_table:
                return
            if self._unNestCond(final_cell):
                wrapper = Div()
                wrapper.border = 1
                wrapper.vlist = final_cell.vlist
                for item in final_cell.children:
                    wrapper.appendChild(item)
                final_cell.children = []
                wrapper.moveto(node)
                self.report('moved content behind table', wrapper)

        for child in node.children:
            self.unNestEndingCellContent(child)


    def removeScrollElements(self, node):
        '''Dissolve elements styled with overflow:auto and a height above 100.

        Applies to nodes that have a parent, an 'overflow' style of 'auto'
        and a 'height' style that styleutils.scaleLength turns into a value
        greater than 100:

        * If the node is a Table or located inside one, the table (the
          first one returned by getParentNodesByClass, else the node
          itself) is replaced by the content of all its cells and the
          traversal of this branch stops.
        * Any other node is replaced by its children; the traversal then
          continues with the children of the node's former parent.

        examples:
        http://en.wikipedia.org/wiki/Pope_John_Paul_II
        http://de.wikipedia.org/wiki/Portal:Maschinenbau/Themenliste_Maschinenbau
        http://de.wikipedia.org/wiki/Portal:Ethnologie
        '''
        if node.style and node.parent and node.style.get('overflow', '').lower() == 'auto':
            height = styleutils.scaleLength(node.style.get('height', ''))
            if height > 100:
                if node.getParentNodesByClass(Table) or node.__class__ == Table :
                    node.force_tablesplit = True
                    if node.getParentNodesByClass(Table):
                        table_node = node.getParentNodesByClass(Table)[0]
                    else:
                        table_node = node
                    # flatten: collect the children of every cell below the table
                    content = []
                    for cell in table_node.getChildNodesByClass(Cell):
                        content.extend(cell.children)
                    table_node.parent.replaceChild(table_node, content)
                    self.report('removed overflow:auto table')
                    return
                else:
                    # remember the parent before the node is detached from it
                    continue_node = node.parent
                    node.parent.replaceChild(node, node.children)
                    # the loop below now walks the former parent's children
                    node = continue_node
                    self.report('removed overflow:auto element')

        for c in node.children:
            self.removeScrollElements(c)


    def galleryFix(self, node):
        '''Move Gallery nodes out of tables.

        Every Gallery below node that is inside a Table is moved behind the
        first Table returned by getParentNodesByClass.
        '''
        for gallery in node.getChildNodesByClass(Gallery):
            enclosing_tables = gallery.getParentNodesByClass(Table)
            if not enclosing_tables:
                continue
            gallery.moveto(enclosing_tables[0])
            self.report('removed gallery from table')

    def fixSubSup(self, node):
        """Dissolve Sup and Sub nodes with more than 200 characters of display text.

        Such a node is replaced in its parent by its own children.
        """
        if node.__class__ in [Sup, Sub] and node.parent:
            if len(node.getAllDisplayText())>200:
                node.parent.replaceChild(node, node.children)
                self.report('removed long sup/sub')
        # Iterate over a copy: a child may replace itself in node.children
        # during the recursive call, and iterating the live list would then
        # skip the sibling that follows it.
        for c in node.children[:]:
            self.fixSubSup(c)

    def removeEditLinks(self, node):
        """Remove NamedURL nodes whose caption ends with '?action=edit'."""

        if node.__class__ == NamedURL and node.caption.endswith('?action=edit'):
            self.report('removing edit link', node)
            node.parent.removeChild(node)

        # Iterate over a snapshot: a child edit link removes itself from its
        # parent during the recursive call, and iterating the node live
        # could then skip the sibling that follows it.
        for c in list(node):
            self.removeEditLinks(c)

    def removeSeeAlso(self, node):
        """Remove sections titled 'See also'.

        The title is taken from the caption of the first child of the
        section's first child and compared, stripped, against the
        translation of 'See also' (or the literal string if no translation
        function _ is available).
        """
        try:
            seealso_section =  _('See also')
        except NameError:
            seealso_section = 'See also'


        if node.__class__ == Section \
           and len(node.children):
            try:
                section_title = node.children[0].children[0].caption
            except IndexError:
                section_title = ''
            if isinstance(section_title, basestring) and section_title.strip() == seealso_section:
                self.report('removed see also section', node)
                node.parent.removeChild(node)

        # Iterate over a snapshot: a child section removes itself from its
        # parent during the recursive call, and iterating the node live
        # could then skip the sibling that follows it.
        for c in list(node):
            self.removeSeeAlso(c)



    def cleanVlist(self, node):
        if node.vlist:
            for attr, val in node.vlist.items():
                if attr != attr.lower():
                    node.vlist[attr.lower()] = val

        for c in node:
            self.cleanVlist(c)

    def _isLTR(self, node):
        """Return True for Math nodes and for Text nodes whose caption consists only of characters of bidirectional class 'WS' (an empty caption included); False for everything else."""
        if isinstance(node, Math):
            return True
        if not isinstance(node, Text):
            return False
        return all(unicodedata.bidirectional(char) in ['WS'] for char in node.caption)

    def fixMathDir(self, node):
        """Set dir='ltr' on the parents of Math nodes where appropriate.

        Does nothing unless self.rtl is true (presumably set for
        right-to-left content - confirm where rtl is assigned). For every
        Math node below node, the parent's vlist gets 'dir' = 'ltr' if all
        of the parent's children satisfy _isLTR.
        """
        if not self.rtl:
            return
        math_nodes = node.getChildNodesByClass(Math)
        for m in math_nodes:
            p = m.parent
            if all(self._isLTR(c) for c in p.children):
                p.vlist['dir'] = 'ltr'
