# py -3 # -*- coding: utf8 -*- """ Ebook Processor. Part of ebook build chain, along with WordCleaner7 Capture Intro and Quote as
""" from pathlib import Path import pprint import os, sys, re, shutil, time from itertools import chain from sortedcontainers import SortedSet from collections import OrderedDict from betools import CmdLine, visitDir, ruler, head import webbrowser import textwrap ebookName = "onjava" rootPath = Path(r"C:\Users\Bruce\Dropbox\___OnJava") docm = rootPath / "OnJava.docm" ebookBuildPath = rootPath / "ebook_build" html = ebookBuildPath / (ebookName + ".html") ebookResources = rootPath / "ebook_resources" css = ebookResources / (ebookName + ".css") fonts = ebookResources.glob("ubuntumono-*") cover = ebookResources / "cover" / "cover.jpg" example_path = Path(r"C:\Users\Bruce\Dropbox\___OnJava\ExtractedExamples") tablepath = ebookBuildPath / "tables" def start_marker(tag): return '[${}$]'.format(tag) def end_marker(tag): return '[$end_{}$]'.format(tag) def show_all_code_tags(): """ Shows all html "Code" tag variations used in book. """ tag = re.compile("<.*?>") with html.open(encoding="utf8") as ht: tags = SortedSet(tag.findall(ht.read())) for t in tags: if "Code" in t: print(t) style = """ """ blank_table_row = """\""" fixed_table_row = """\ """ @CmdLine('f') def fresh_start(): """ Create book build directory and copy resources into it """ print("Cleaning ...") if ebookBuildPath.exists(): shutil.rmtree(str(ebookBuildPath)) time.sleep(1) ebookBuildPath.mkdir() shutil.copy(str(docm), str(ebookBuildPath)) def _cp(src): shutil.copy(str(src), str(ebookBuildPath)) for font in fonts: _cp(font) _cp(cover) _cp(css) count = 0 # @CmdLine('r') def rewrite_html(): """ Pre-processing HTML tagging and fixups. """ codeblock = re.compile('''( .*?
\s*)+''', re.DOTALL) codeline = re.compile('''(.*?)
\s*''', re.DOTALL) def rewrite_code_line(matchobj): return matchobj.group(1).rstrip() + start_marker("br") def rewrite_code_block(matchobj): global count count += 1 return start_marker("code") + \ codeline.sub(rewrite_code_line, matchobj.group(0)) + \ "\n" + end_marker("code") + "\n" intro = re.compile('''(.*?)
''', re.DOTALL) quote = re.compile('''(.*?)
''', re.DOTALL) def rewrite_bq(matchobj): return start_marker("blockquote") + \ matchobj.group(1).rstrip() + \ "\n" + end_marker("blockquote") + "\n" with html.open(encoding="utf8") as ht: rewritten = codeblock.sub(rewrite_code_block, ht.read()) rewritten = intro.sub(rewrite_bq, rewritten) rewritten = quote.sub(rewrite_bq, rewritten) with html.with_name(html.stem + "-2.html").open('w', encoding="utf8") as ht: ht.write(rewritten) print(count) @CmdLine('x') def cleanup_stripped_html(): """ Clean up stripped HTML -- final housekeeping """ fixes = [ (start_marker("code"), "\n"), (end_marker("code"), "\n
"), (start_marker("blockquote"), ""), (end_marker("blockquote"), ""), ("", style), ('', '
'), (blank_table_row, fixed_table_row), ] with html.with_name(html.stem + "-2.html").open(encoding="utf8") as ht: doc = ht.read() for fix in fixes: doc = doc.replace(*fix) with html.with_name(html.stem + "-3.html").open('w', encoding="utf8") as ht: ht.write(doc) @CmdLine('t') def extract_and_check_tables(): """ Extract tables and view them for checking """ # Extract tables: print("extracting tables ...") if tablepath.exists(): shutil.rmtree(str(tablepath)) time.sleep(1) tablepath.mkdir() os.chdir(str(tablepath)) with html.with_name(html.stem + "-3.html").open(encoding="utf8") as ht: doc = ht.read() # doc = doc.replace("", "") # doc = doc.replace("", "") # doc = doc.replace("", "") # doc = doc.replace("", "") tables = re.compile("(
)", re.DOTALL) for n, table in enumerate(tables.findall(doc)): fname = "%02d_table.html" % n # print(fname) with (tablepath / fname).open('w', encoding="utf8") as tablefile: tablefile.write(table[0]) tablefile.write(table[1]) pandoc = "pandoc {} -t markdown -o {}.md".format(fname, fname.split('.')[0]) print(pandoc) os.system(pandoc) @CmdLine('v') def view_tables(): """ View tables for checking """ os.chdir(str(tablepath)) # for html in Path(".").glob("*.html"): # webbrowser.open("ed {}".format(html)) for md in Path(".").glob("*.md"): os.system("ed {}".format(md)) @CmdLine('c') def convert_to_html(): "Convert to html" os.chdir(str(ebookBuildPath)) print("Convert to HTML") os.system('''WordCleaner7''') show_all_code_tags() rewrite_html() print("TEST Clean up existing HTML and remove formatting") os.system('''WordCleaner7''') cleanup_stripped_html() @CmdLine('m') def convert_to_markdown(): "Convert to markdown" os.chdir(str(ebookBuildPath)) cmd = "pandoc onjava-3.html -f html -t markdown -o onjava.md" print(cmd) os.system(cmd) # with Path("onjava.md").open(encoding="utf8") as mdown: # markdown = mdown.read() # with Path("onjava.md").open('w', encoding="utf8") as mdown: # mdown.write(markdown) silly = r""" \)(.*? """ standalone_start_old = r""" ` """ standalone_start_new = r""" ```java """ standalone_end_old = r""" ` """ standalone_end_new = r""" ```""" @CmdLine('s') def reconstruct_source_code_files(): "Reconstruct source code from examples, make sure you attach output first" print("reconstruct_source_code_files") os.chdir(str(ebookBuildPath)) example = re.compile(r"` //: (.*?\.(java|txt|cpp|py|prop))(.*?)///:~.*?`", re.DOTALL) def restore_example(matchobj): ename = matchobj.group(1) print(ename.encode("utf8")) example_source = example_path / Path(ename) # print(str(example_source)) assert example_source.exists(), "{} doesn't exist".format(example_source) with example_source.open() as example_code: return "```java\n" + \ example_code.read() + \ "```\n" with Path("onjava.md").open(encoding="utf8", errors="ignore") as md: markdown = md.read() markdown = markdown.replace(" **** ", "") # Clean out empty bolding markdown = markdown.replace("\n**** ", "\n") # Clean out empty bolding markdown = markdown.replace(" ****\n", "\n") # Clean out empty bolding markdown = markdown.replace(" ** ", "") # Clean out empty italics markdown = markdown.replace("\n** ", "\n") # Clean out empty italics markdown = markdown.replace(" **\n", "\n") # Clean out empty italics restored = example.sub(restore_example, markdown) restored = restored.replace(start_marker("br"), "\n") restored = restored.replace(silly, "") restored = restored.replace(standalone_start_old, standalone_start_new) restored = restored.replace(standalone_end_old, standalone_end_new) ######### This is the new section: codeblocks = re.compile("```java\n(.*?)\n```") def dedent(matchobj): return "```java\n" + textwrap.dedent(matchobj.group(1)) + "\n```" restored = codeblocks.sub(dedent, restored) with Path("onjava-2.md").open('w', encoding="utf8") as ojmd2: ojmd2.write(restored) @CmdLine('b') def break_up_markdown_file(): "turn markdown file into a collection of chapter-based files" os.chdir(str(ebookBuildPath)) def mdfilename(h1, n): fn = h1.replace(": ", "_") fn = fn.replace(" ", "") + ".md" fn = fn.replace("&", "and") return "%02d_" % n + fn chapters = re.compile(r"\n([A-Za-z\:\& ]*)\n=+\n") with Path("onjava-2.md").open(encoding="utf8") as ojmd2: book = ojmd2.read() parts = chapters.split(book) names = parts[1::2] bodies = parts[0::2] chaps = OrderedDict() chaps["Front"] = bodies[0] for i, nm in enumerate(names): chaps[nm] = bodies[i + 1] for i, p in enumerate(chaps): print(mdfilename(p, i).encode("utf8")) with Path(mdfilename(p, i)).open('w', encoding="utf8") as chp: chp.write(p + "\n") chp.write("=" * len(p) + "\n") chp.write(chaps[p]) @CmdLine('w') def view_in_texts(): "Show all separate .md files in wysiwyg markdown editor" os.chdir(str(ebookBuildPath)) for md in Path(".").glob("*.md"): os.system("texts {}".format(md)) @CmdLine('e') def everything(): """Produce Markdown file from Word doc""" # fresh_start() convert_to_html() convert_to_markdown() reconstruct_source_code_files() break_up_markdown_file() @CmdLine('r') def reassemble_and_convert_to_epub(): """ Put markdown files together, then pandoc to epub """ output_name = "onjava-assembled.md" os.chdir(str(ebookBuildPath)) assembled = "" for md in Path(".").glob("[0-9][0-9]_*.md"): print(str(md)) with md.open(encoding="utf8") as part: assembled += part.read() + "\n" with Path(output_name).open('w', encoding="utf8") as book: book.write(assembled) shutil.copy(str(css), ".") pandoc = ("pandoc {} -f markdown-native_divs -t epub -o OnJava.epub" + \ " --epub-cover-image=cover.jpg " + \ " --epub-embed-font=ubuntumono-r-webfont.ttf " + \ " --epub-chapter-level=1 " + \ " --toc-depth=2 " + \ # " --no-highlight " + \ " --epub-stylesheet=onjava.css " ).format(output_name) print(pandoc) os.system(pandoc) shutil.copy("OnJava.epub", "OnJava.zip") os.system("unzip OnJava.zip -d epub_files") if __name__ == '__main__': CmdLine.run()