OnJava8-Examples/tools/ProcessEbook.py

# py -3
# -*- coding: utf8 -*-
"""
Ebook Processor. Part of ebook build chain, along with WordCleaner7
Capture Intro and Quote as <blockquote>
"""
from pathlib import Path
import pprint
import os, sys, re, shutil, time
from itertools import chain
from sortedcontainers import SortedSet
from collections import OrderedDict
from betools import CmdLine, visitDir, ruler, head
import webbrowser
import textwrap


ebookName = "onjava"
rootPath = Path(r"C:\Users\Bruce\Dropbox\___OnJava")
docm = rootPath / "OnJava.docm"
ebookBuildPath = rootPath / "ebook_build"
html = ebookBuildPath / (ebookName + ".html")
ebookResources = rootPath / "ebook_resources"
css = ebookResources / (ebookName + ".css")
fonts = ebookResources.glob("ubuntumono-*")
cover = ebookResources / "cover" / "cover.jpg"
example_path = Path(r"C:\Users\Bruce\Dropbox\___OnJava\ExtractedExamples")
tablepath = ebookBuildPath / "tables"

def start_marker(tag):
    return '[${}$]'.format(tag)

def end_marker(tag):
    return '[$end_{}$]'.format(tag)


def show_all_code_tags():
    """
    Shows all html "Code" tag variations used in book.
    """
    tag = re.compile("<.*?>")
    with html.open(encoding="utf8") as ht:
        tags = SortedSet(tag.findall(ht.read()))
    for t in tags:
        if "Code" in t:
            print(t)


style = """
    <style type="text/css">
        @font-face {
            font-family: Ubuntu Mono;
            src: url('ubuntumono-r-webfont.eot');
            src: url('ubuntumono-r-webfont.eot?#iefix') format('embedded-opentype'),
                 url('ubuntumono-r-webfont.woff') format('woff'),
                 url('ubuntumono-r-webfont.ttf') format('truetype'),
                 url('ubuntumono-r-webfont.svg#ubuntu_monoregular') format('svg');
            font-weight: normal;
            font-style: normal;
        }
        blockquote { font-size:130% }
        code { font-size: 85%; font-family:'Ubuntu Mono' }
        thead {
            font-weight: bold;
            font-size: 120%;
        }
        table, th, td {
            border: 2px solid black;
            border-collapse: collapse;
                    padding-left: 10px;
                  padding-right: 10px;      }
    </style>
    </head>
"""

blank_table_row = """\
                </tbody>
                <tr>
                    <td>
                    </td>
                    <td>
                    </td>
                </tr>
            </table>"""

fixed_table_row = """\
                </tbody>
            </table>"""


@CmdLine('f')
def fresh_start():
    """
    Create book build directory and copy resources into it
    """
    print("Cleaning ...")
    if ebookBuildPath.exists():
        shutil.rmtree(str(ebookBuildPath))
    time.sleep(1)
    ebookBuildPath.mkdir()
    shutil.copy(str(docm), str(ebookBuildPath))
    def _cp(src):
        shutil.copy(str(src), str(ebookBuildPath))
    for font in fonts:
        _cp(font)
    _cp(cover)
    _cp(css)


count = 0
# @CmdLine('r')
def rewrite_html():
    """
    Pre-processing HTML tagging and fixups.
    """
    codeblock = re.compile('''(<p class="Code">.*?</p>\s*)+''', re.DOTALL)
    codeline = re.compile('''<p class="Code">(.*?)</p>\s*''', re.DOTALL)
    def rewrite_code_line(matchobj):
        return matchobj.group(1).rstrip() + start_marker("br")
    def rewrite_code_block(matchobj):
        global count
        count += 1
        return  start_marker("code") + \
                codeline.sub(rewrite_code_line, matchobj.group(0)) + \
                "\n" + end_marker("code") + "\n"


    intro = re.compile('''<p class="Intro">(.*?)</p>''', re.DOTALL)
    quote = re.compile('''<p class="Quote">(.*?)</p>''', re.DOTALL)

    def rewrite_bq(matchobj):
        return  start_marker("blockquote") + \
                matchobj.group(1).rstrip() + \
                "\n" + end_marker("blockquote") + "\n"

    with html.open(encoding="utf8") as ht:
        rewritten = codeblock.sub(rewrite_code_block, ht.read())
        rewritten = intro.sub(rewrite_bq, rewritten)
        rewritten = quote.sub(rewrite_bq, rewritten)

    with html.with_name(html.stem + "-2.html").open('w', encoding="utf8") as ht:
        ht.write(rewritten)

    print(count)


@CmdLine('x')
def cleanup_stripped_html():
    """
    Clean up stripped HTML -- final housekeeping
    """
    fixes = [
        (start_marker("code"), "<code>\n"),
        (end_marker("code"), "\n</code>"),
        (start_marker("blockquote"), "<blockquote>"),
        (end_marker("blockquote"), "</blockquote>"),
        ("</head>", style),
        ('<table cellspacing="0" cellpadding="0">', '<table align="center">'),
        (blank_table_row, fixed_table_row),
    ]
    with html.with_name(html.stem + "-2.html").open(encoding="utf8") as ht:
        doc = ht.read()
        for fix in fixes:
            doc = doc.replace(*fix)

    with html.with_name(html.stem + "-3.html").open('w', encoding="utf8") as ht:
        ht.write(doc)

@CmdLine('t')
def extract_and_check_tables():
    """
    Extract tables and view them for checking
    """
    # Extract tables:
    print("extracting tables ...")
    if tablepath.exists():
        shutil.rmtree(str(tablepath))
    time.sleep(1)
    tablepath.mkdir()

    os.chdir(str(tablepath))
    with html.with_name(html.stem + "-3.html").open(encoding="utf8") as ht:
        doc = ht.read()
    # doc = doc.replace("<thead>", "")
    # doc = doc.replace("</thead>", "")
    # doc = doc.replace("<tbody>", "")
    # doc = doc.replace("</tbody>", "")
    tables = re.compile("(<table.*?>)(.*?</table>)", re.DOTALL)
    for n, table in enumerate(tables.findall(doc)):
        fname = "%02d_table.html" % n
        # print(fname)
        with (tablepath / fname).open('w', encoding="utf8") as tablefile:
            tablefile.write(table[0])
            tablefile.write(table[1])
        pandoc = "pandoc {} -t markdown -o {}.md".format(fname, fname.split('.')[0])
        print(pandoc)
        os.system(pandoc)

@CmdLine('v')
def view_tables():
    """
    View tables for checking
    """
    os.chdir(str(tablepath))
    # for html in Path(".").glob("*.html"):
    #     webbrowser.open("ed {}".format(html))
    for md in Path(".").glob("*.md"):
        os.system("ed {}".format(md))


@CmdLine('c')
def convert_to_html():
    "Convert to html"
    os.chdir(str(ebookBuildPath))
    print("Convert to HTML")
    os.system('''WordCleaner7''')
    show_all_code_tags()
    rewrite_html()
    print("TEST Clean up existing HTML and remove formatting")
    os.system('''WordCleaner7''')
    cleanup_stripped_html()


@CmdLine('m')
def convert_to_markdown():
    "Convert to markdown"
    os.chdir(str(ebookBuildPath))
    cmd = "pandoc onjava-3.html -f html -t markdown -o onjava.md"
    print(cmd)
    os.system(cmd)
    # with Path("onjava.md").open(encoding="utf8") as mdown:
    #     markdown = mdown.read()
    # with Path("onjava.md").open('w', encoding="utf8") as mdown:
    #     mdown.write(markdown)


silly = r"""</div>

\
<div>"""

standalone_start_old = r"""
` """
standalone_start_new = r"""
```java
 """
standalone_end_old = r"""
 `
"""
standalone_end_new = r"""
```"""

@CmdLine('s')
def reconstruct_source_code_files():
    "Reconstruct source code from examples, make sure you attach output first"
    print("reconstruct_source_code_files")
    os.chdir(str(ebookBuildPath))
    example = re.compile(r"` //: (.*?\.(java|txt|cpp|py|prop))(.*?)///:~.*?`", re.DOTALL)

    def restore_example(matchobj):
        ename = matchobj.group(1)
        print(ename.encode("utf8"))
        example_source = example_path / Path(ename)
        # print(str(example_source))
        assert example_source.exists(), "{} doesn't exist".format(example_source)
        with example_source.open() as example_code:
            return "```java\n" + \
                example_code.read() + \
                "```\n"

    with Path("onjava.md").open(encoding="utf8", errors="ignore") as md:
        markdown = md.read()
        markdown = markdown.replace(" **** ", "") # Clean out empty bolding
        markdown = markdown.replace("\n**** ", "\n") # Clean out empty bolding
        markdown = markdown.replace(" ****\n", "\n") # Clean out empty bolding
        markdown = markdown.replace(" ** ", "") # Clean out empty italics
        markdown = markdown.replace("\n** ", "\n") # Clean out empty italics
        markdown = markdown.replace(" **\n", "\n") # Clean out empty italics
        restored = example.sub(restore_example, markdown)
        restored = restored.replace(start_marker("br"), "\n")

    restored = restored.replace(silly, "")
    restored = restored.replace(standalone_start_old, standalone_start_new)
    restored = restored.replace(standalone_end_old, standalone_end_new)

    ######### This is the new section:
    codeblocks = re.compile("```java\n(.*?)\n```")
    def dedent(matchobj):
        return "```java\n" + textwrap.dedent(matchobj.group(1)) + "\n```"
    restored = codeblocks.sub(dedent, restored)

    with Path("onjava-2.md").open('w', encoding="utf8") as ojmd2:
        ojmd2.write(restored)

@CmdLine('b')
def break_up_markdown_file():
    "turn markdown file into a collection of chapter-based files"
    os.chdir(str(ebookBuildPath))

    def mdfilename(h1, n):
        fn = h1.replace(": ", "_")
        fn = fn.replace(" ", "") + ".md"
        fn = fn.replace("&", "and")
        return "%02d_" % n + fn

    chapters = re.compile(r"\n([A-Za-z\:\& ]*)\n=+\n")
    with Path("onjava-2.md").open(encoding="utf8") as ojmd2:
        book = ojmd2.read()
    parts = chapters.split(book)
    names = parts[1::2]
    bodies = parts[0::2]
    chaps = OrderedDict()
    chaps["Front"] = bodies[0]
    for i, nm in enumerate(names):
        chaps[nm] = bodies[i + 1]

    for i, p in enumerate(chaps):
        print(mdfilename(p, i).encode("utf8"))
        with Path(mdfilename(p, i)).open('w', encoding="utf8") as chp:
            chp.write(p + "\n")
            chp.write("=" * len(p) + "\n")
            chp.write(chaps[p])


@CmdLine('w')
def view_in_texts():
    "Show all separate .md files in wysiwyg markdown editor"
    os.chdir(str(ebookBuildPath))
    for md in Path(".").glob("*.md"):
        os.system("texts {}".format(md))


@CmdLine('e')
def everything():
    """Produce Markdown file from Word doc"""
    # fresh_start()
    convert_to_html()
    convert_to_markdown()
    reconstruct_source_code_files()
    break_up_markdown_file()


@CmdLine('r')
def reassemble_and_convert_to_epub():
    """
    Put markdown files together, then pandoc to epub
    """
    output_name = "onjava-assembled.md"
    os.chdir(str(ebookBuildPath))
    assembled = ""
    for md in Path(".").glob("[0-9][0-9]_*.md"):
        print(str(md))
        with md.open(encoding="utf8") as part:
            assembled += part.read() + "\n"
    with Path(output_name).open('w', encoding="utf8") as book:
        book.write(assembled)
    shutil.copy(str(css), ".")
    pandoc = ("pandoc {} -f markdown-native_divs -t epub -o OnJava.epub" + \
        " --epub-cover-image=cover.jpg " + \
        " --epub-embed-font=ubuntumono-r-webfont.ttf " + \
        " --epub-chapter-level=1 " + \
        " --toc-depth=2 " + \
        # " --no-highlight " + \
        " --epub-stylesheet=onjava.css "
        ).format(output_name)
    print(pandoc)
    os.system(pandoc)
    shutil.copy("OnJava.epub", "OnJava.zip")
    os.system("unzip OnJava.zip -d epub_files")


if __name__ == '__main__': CmdLine.run()