import os
import tidylib
import lxml.etree
from fs.walk import Walker
from fs.copy import copy_file

def transform(filename, data):
    """Return the content to store for ``filename``, tidying markup files.

    The decision is made on the (case-insensitive) file extension only:

    * ``.html`` -- ``data`` is run through ``tidylib.tidy_document`` with
      the ``output-xhtml`` option enabled, and the tidied document is what
      gets returned. The result is then parsed with lxml.
    * ``.xml`` -- ``data`` is parsed with lxml and returned unchanged.
    * anything else -- ``data`` is returned unchanged.

    The lxml parse result is not used; the parse is kept so that content
    lxml cannot parse still surfaces as an exception from lxml instead of
    being mirrored silently.

    :param filename: path or name of the file; only its extension is used.
    :param data: raw file content as read from the source file.
    :returns: the (possibly tidied) content to write to the destination.
    """
    ext = os.path.splitext(filename)[1].lower()
    if ext.endswith('.html'):
        # tidy_document returns (document, errors); the error report is
        # not used here.
        data, _errors = tidylib.tidy_document(
            data,
            options={'output-xhtml': True})
        # Parse only to check well-formedness of the tidied output.
        lxml.etree.fromstring(data)
    elif ext.endswith('.xml'):
        # Parse only to check well-formedness; content is passed through.
        lxml.etree.fromstring(data)
    return data

def mirror_fs(src, dst):
    """Copy the tree of filesystem ``src`` into ``dst``, transforming files.

    Every directory yielded by a ``Walker`` over ``src`` is created in
    ``dst`` (existing directories are accepted), and its path is printed.
    Every file is then read in full, passed through ``transform`` and the
    result written to the same path in ``dst``.

    :param src: source filesystem object; must provide ``open`` and be
        walkable by ``fs.walk.Walker``.
    :param dst: destination filesystem object; must provide ``makedir``
        and ``open``.
    """
    walker = Walker()

    # Create the directory skeleton first so the file writes below find
    # their parent directories in place.
    for path in walker.dirs(src):
        # Function-call form prints the same on Python 2 and Python 3.
        print(path)
        dst.makedir(path, recreate=True)

    for path in walker.files(src):
        with src.open(path, 'rb') as fp_in:
            data = fp_in.read()
        # Transform before opening the destination: opening with 'wb'
        # first would leave an empty file behind if transform raised.
        data = transform(path, data)
        with dst.open(path, 'wb') as fp_out:
            fp_out.write(data)


if __name__ == '__main__':
    # Script entry point: mirror the tree under /tmp/sd into /tmp/dest.
    import fs.opener

    # fs.opener.open yields a two-item result; only the first item (the
    # opened filesystem) is needed here.
    source_fs, _ = fs.opener.open('file:///tmp/sd')
    # Alternative source kept for reference -- a zip archive opened
    # read-only:
    #     fs.opener.open('zip:///tmp/sd.zip', writeable=False)
    target_fs, _ = fs.opener.open('file:///tmp/dest')

    mirror_fs(source_fs, target_fs)
