OnJava8-Examples/tools/ProcessEbook.py

369 lines
11 KiB
Python
Raw Normal View History

2015-06-15 15:55:34 -07:00
# py -3
# -*- coding: utf8 -*-
"""
Ebook Processor. Part of the ebook build chain, along with WordCleaner7.
Captures Intro and Quote paragraphs as <blockquote>.
"""
from pathlib import Path
import pprint
import os, sys, re, shutil, time
from itertools import chain
from sortedcontainers import SortedSet
2015-06-16 23:21:17 -07:00
from collections import OrderedDict
2015-06-15 15:55:34 -07:00
from betools import CmdLine, visitDir, ruler, head
2015-06-17 17:37:28 -07:00
import webbrowser
2015-06-18 21:26:52 -07:00
import textwrap
2015-06-17 17:37:28 -07:00
2015-06-15 15:55:34 -07:00
# Base name shared by the generated HTML file and the stylesheet.
ebookName = "onjava"

# Root folder of the book project and the Word source document inside it.
rootPath = Path(r"C:\Users\Bruce\Dropbox\___OnJava")
docm = rootPath.joinpath("OnJava.docm")

# Build directory, the HTML file inside it, and the table-extraction folder.
ebookBuildPath = rootPath.joinpath("ebook_build")
html = ebookBuildPath.joinpath("{}.html".format(ebookName))
tablepath = ebookBuildPath.joinpath("tables")

# Static resources that are copied into the build directory.
ebookResources = rootPath.joinpath("ebook_resources")
css = ebookResources.joinpath("{}.css".format(ebookName))
cover = ebookResources.joinpath("cover", "cover.jpg")
# NOTE: glob() returns a one-shot iterator; it can be consumed only once.
fonts = ebookResources.glob("ubuntumono-*")

# Location of the extracted example sources used to restore code listings.
example_path = Path(r"C:\Users\Bruce\Dropbox\___OnJava\ExtractedExamples")
2015-06-16 23:21:17 -07:00
def start_marker(tag):
    """Return the opening text placeholder for *tag*, e.g. ``[$code$]``."""
    return "[$" + format(tag) + "$]"
def end_marker(tag):
    """Return the closing text placeholder for *tag*, e.g. ``[$end_code$]``."""
    return "[$end_" + format(tag) + "$]"
2015-06-15 15:55:34 -07:00
def show_all_code_tags():
    """
    Print every distinct HTML tag in the book file that contains "Code".

    Reads the file at ``html``, collects all ``<...>`` tags into a sorted,
    de-duplicated set, and prints the ones mentioning "Code", one per line.
    """
    tag_pattern = re.compile("<.*?>")
    with html.open(encoding="utf8") as source:
        markup = source.read()
    for found in SortedSet(tag_pattern.findall(markup)):
        if "Code" not in found:
            continue
        print(found)
# Inline stylesheet plus the closing </head> tag. cleanup_stripped_html()
# substitutes this whole string for the document's "</head>", which places
# the <style> element at the end of the HTML head.
# The literal must contain only CSS/HTML: stray non-CSS lines (commit
# timestamps) that had crept into it were removed.
style = """
<style type="text/css">
@font-face {
font-family: Ubuntu Mono;
src: url('ubuntumono-r-webfont.eot');
src: url('ubuntumono-r-webfont.eot?#iefix') format('embedded-opentype'),
url('ubuntumono-r-webfont.woff') format('woff'),
url('ubuntumono-r-webfont.ttf') format('truetype'),
url('ubuntumono-r-webfont.svg#ubuntu_monoregular') format('svg');
font-weight: normal;
font-style: normal;
}
blockquote { font-size:130% }
code { font-size: 85%; font-family:'Ubuntu Mono' }
thead {
font-weight: bold;
font-size: 120%;
}
table, th, td {
border: 2px solid black;
border-collapse: collapse;
padding-left: 10px;
padding-right: 10px; }
</style>
</head>
"""
# HTML tail of a table whose final row is empty: the closing </tbody>,
# one <tr> holding two empty <td> cells, then </table>.
# cleanup_stripped_html() searches for this exact text with str.replace(),
# so the literal has to match the HTML character for character.
# NOTE(review): any indentation inside this literal may have been lost;
# confirm it still matches the HTML actually produced.
blank_table_row = """\
</tbody>
<tr>
<td>
</td>
<td>
</td>
</tr>
</table>"""
# Replacement for blank_table_row: the same table tail without the empty row.
fixed_table_row = """\
</tbody>
</table>"""
2015-06-16 23:21:17 -07:00
@CmdLine('f')
def fresh_start():
    """
    Recreate the book build directory and populate it with resources.

    Any existing build directory is deleted first. The Word document, the
    fonts, the cover image and the stylesheet are then copied in, in that
    order.
    """
    print("Cleaning ...")
    build_dir = str(ebookBuildPath)
    if ebookBuildPath.exists():
        shutil.rmtree(build_dir)
        # Pause after the delete before the directory is recreated.
        time.sleep(1)
    ebookBuildPath.mkdir()
    # chain() keeps the font glob lazy, exactly as a plain for-loop would.
    for resource in chain([docm], fonts, [cover, css]):
        shutil.copy(str(resource), build_dir)
2015-06-16 23:21:17 -07:00
# Number of code blocks rewritten by rewrite_html(). It is never reset, so it
# accumulates if rewrite_html() runs more than once in the same process.
count = 0


# The @CmdLine('r') registration is disabled for this function; the letter
# 'r' is bound to reassemble_and_convert_to_epub() further down the file.
def rewrite_html():
    """
    Pre-processing HTML tagging and fixups.

    Reads the HTML file at ``html`` and writes the result next to it as
    ``<stem>-2.html``:

    * each run of consecutive ``<p class="Code">`` paragraphs becomes one
      block wrapped in the "code" start/end markers, with every original
      paragraph terminated by the "br" start marker;
    * ``<p class="Intro">`` and ``<p class="Quote">`` paragraphs are wrapped
      in the "blockquote" start/end markers.

    Prints the running total of code blocks rewritten (module-level
    ``count``).
    """
    # Raw strings: "\s" in a normal string literal is an invalid escape
    # sequence, which newer Python versions warn about. The compiled pattern
    # is unchanged.
    codeblock = re.compile(r'''(<p class="Code">.*?</p>\s*)+''', re.DOTALL)
    codeline = re.compile(r'''<p class="Code">(.*?)</p>\s*''', re.DOTALL)

    def rewrite_code_line(matchobj):
        # Keep the paragraph text, drop trailing whitespace, mark the break.
        return matchobj.group(1).rstrip() + start_marker("br")

    def rewrite_code_block(matchobj):
        global count
        count += 1
        return start_marker("code") + \
            codeline.sub(rewrite_code_line, matchobj.group(0)) + \
            "\n" + end_marker("code") + "\n"

    intro = re.compile('''<p class="Intro">(.*?)</p>''', re.DOTALL)
    quote = re.compile('''<p class="Quote">(.*?)</p>''', re.DOTALL)

    def rewrite_bq(matchobj):
        return start_marker("blockquote") + \
            matchobj.group(1).rstrip() + \
            "\n" + end_marker("blockquote") + "\n"

    with html.open(encoding="utf8") as ht:
        rewritten = codeblock.sub(rewrite_code_block, ht.read())
    rewritten = intro.sub(rewrite_bq, rewritten)
    rewritten = quote.sub(rewrite_bq, rewritten)
    with html.with_name(html.stem + "-2.html").open('w', encoding="utf8") as ht:
        ht.write(rewritten)
    print(count)
@CmdLine('x')
def cleanup_stripped_html():
    """
    Clean up stripped HTML -- final housekeeping.

    Reads ``<stem>-2.html``, applies a fixed list of literal text
    replacements in order, and writes the result to ``<stem>-3.html``.
    """
    source = html.with_name(html.stem + "-2.html")
    target = html.with_name(html.stem + "-3.html")
    # (old text, new text) pairs, applied top to bottom.
    replacements = (
        (start_marker("code"), "<code>\n"),
        (end_marker("code"), "\n</code>"),
        (start_marker("blockquote"), "<blockquote>"),
        (end_marker("blockquote"), "</blockquote>"),
        ("</head>", style),
        ('<table cellspacing="0" cellpadding="0">', '<table align="center">'),
        (blank_table_row, fixed_table_row),
    )
    with source.open(encoding="utf8") as infile:
        text = infile.read()
    for old, new in replacements:
        text = text.replace(old, new)
    with target.open('w', encoding="utf8") as outfile:
        outfile.write(text)
2015-06-17 17:37:28 -07:00
@CmdLine('t')
def extract_and_check_tables():
    """
    Extract tables and view them for checking.

    Recreates the ``tablepath`` directory, writes every ``<table>`` element
    found in ``<stem>-3.html`` to its own ``NN_table.html`` file there, and
    runs pandoc on each file to produce ``NN_table.md``.
    """
    print("extracting tables ...")
    table_dir = str(tablepath)
    if tablepath.exists():
        shutil.rmtree(table_dir)
        time.sleep(1)
    tablepath.mkdir()
    os.chdir(table_dir)
    with html.with_name(html.stem + "-3.html").open(encoding="utf8") as ht:
        doc = ht.read()
    # <thead>/<tbody> tags are left in place (stripping them is disabled).
    table_pattern = re.compile("(<table.*?>)(.*?</table>)", re.DOTALL)
    for index, found in enumerate(table_pattern.finditer(doc)):
        stem = "%02d_table" % index
        fname = stem + ".html"
        with (tablepath / fname).open('w', encoding="utf8") as tablefile:
            # The two groups are adjacent, so group(0) is opening tag + rest.
            tablefile.write(found.group(0))
        pandoc = "pandoc {} -t markdown -o {}.md".format(fname, stem)
        print(pandoc)
        os.system(pandoc)
2015-06-17 23:51:32 -07:00
@CmdLine('v')
def view_tables():
    """
    View tables for checking.

    Changes into ``tablepath`` and runs the ``ed`` command on every ``.md``
    file found there, one at a time.
    """
    os.chdir(str(tablepath))
    # Only the markdown versions are opened; the .html files are not.
    for markdown_file in Path(".").glob("*.md"):
        os.system("ed " + str(markdown_file))
2015-06-17 17:37:28 -07:00
2015-06-15 15:55:34 -07:00
2015-06-16 23:21:17 -07:00
@CmdLine('c')
def convert_to_html():
    """
    Convert to html.

    Runs from the build directory: WordCleaner7, a listing of the "Code"
    tags, the marker-inserting rewrite, WordCleaner7 a second time, and the
    final marker cleanup -- in that order.
    """
    word_cleaner = "WordCleaner7"
    os.chdir(str(ebookBuildPath))

    print("Convert to HTML")
    os.system(word_cleaner)
    show_all_code_tags()
    rewrite_html()

    print("TEST Clean up existing HTML and remove formatting")
    os.system(word_cleaner)
    cleanup_stripped_html()
2015-06-16 23:21:17 -07:00
@CmdLine('m')
def convert_to_markdown():
    """
    Convert to markdown.

    Runs pandoc on ``onjava-3.html`` in the build directory to produce
    ``onjava.md``, then rewrites that file with every ``****`` removed.
    """
    os.chdir(str(ebookBuildPath))
    source_name, target_stem = "onjava-3.html", "onjava"
    cmd = "pandoc {} -f html -t markdown -o {}.md".format(source_name, target_stem)
    print(cmd)
    os.system(cmd)

    md_path = Path("onjava.md")
    with md_path.open(encoding="utf8") as mdown:
        original = mdown.read()
    # Clean out redundant bolding
    cleaned = original.replace("****", "")
    with md_path.open('w', encoding="utf8") as mdown:
        mdown.write(cleaned)
2015-06-16 23:21:17 -07:00
# Literal fragment that reconstruct_source_code_files() deletes from the
# markdown: a closing </div>, a line holding a single backslash, and an
# opening <div>.
silly = r"""</div>
\
<div>"""

# Old/new pairs that reconstruct_source_code_files() uses to turn
# single-backtick delimiters into fenced ```java blocks.
# NOTE(review): whitespace inside the *_old literals must match the pandoc
# output exactly; confirm against a real onjava.md.
standalone_start_old = r"""
` """
standalone_start_new = r"""
```java
"""
standalone_end_old = r"""
`
"""
# Must be exactly a newline followed by the closing fence; a stray
# non-markdown line (a commit timestamp) inside this literal was removed.
standalone_end_new = r"""
```"""
2015-06-17 00:23:37 -07:00
2015-06-16 23:21:17 -07:00
@CmdLine('s')
def reconstruct_source_code_files():
    """
    Reconstruct source code from examples, make sure you attach output first.

    Reads ``onjava.md`` from the build directory and writes ``onjava-2.md``:

    * every listing delimited by backticks that starts with ``//: <name>``
      and contains ``///:~`` is replaced by the content of the file
      ``example_path / <name>``, wrapped in a fenced java block;
    * the "br" start markers become newlines;
    * the ``silly`` fragment is removed;
    * standalone single-backtick delimiters become java fences;
    * the content of every java fence is dedented.

    Raises AssertionError if a referenced example file does not exist.
    """
    print("reconstruct_source_code_files")
    os.chdir(str(ebookBuildPath))
    example = re.compile(r"` //: (.*?\.(java|txt|cpp|py|prop))(.*?)///:~.*?`", re.DOTALL)

    def restore_example(matchobj):
        ename = matchobj.group(1)
        # Encoded before printing so non-ASCII names cannot break the console.
        print(ename.encode("utf8"))
        example_source = example_path / Path(ename)
        assert example_source.exists(), "{} doesn't exist".format(example_source)
        # Explicit utf8, like every other file in this script; the previous
        # bare open() used the locale's default encoding instead.
        # NOTE(review): assumes the extracted examples are stored as UTF-8.
        with example_source.open(encoding="utf8") as example_code:
            return "```java\n" + \
                example_code.read() + \
                "```\n"

    with Path("onjava.md").open(encoding="utf8", errors="ignore") as md:
        restored = example.sub(restore_example, md.read())
    restored = restored.replace(start_marker("br"), "\n")
    restored = restored.replace(silly, "")
    restored = restored.replace(standalone_start_old, standalone_start_new)
    restored = restored.replace(standalone_end_old, standalone_end_new)

    # re.DOTALL lets "." cross line breaks. Without it this pattern could
    # only match a fence holding a single line, so multi-line blocks were
    # never dedented.
    codeblocks = re.compile("```java\n(.*?)\n```", re.DOTALL)

    def dedent(matchobj):
        return "```java\n" + textwrap.dedent(matchobj.group(1)) + "\n```"

    restored = codeblocks.sub(dedent, restored)
    with Path("onjava-2.md").open('w', encoding="utf8") as ojmd2:
        ojmd2.write(restored)
@CmdLine('b')
def break_up_markdown_file():
    """
    Turn the markdown file into a collection of chapter-based files.

    Splits ``onjava-2.md`` at every heading underlined with ``=``. The text
    before the first heading is stored under the name "Front". Each part is
    written to ``NN_<name>.md``, starting with its title and an ``=``
    underline of the same length.
    """
    os.chdir(str(ebookBuildPath))

    def mdfilename(h1, n):
        # ": " -> "_", spaces removed, ".md" appended, then "&" -> "and".
        base = h1.replace(": ", "_").replace(" ", "")
        return "{:02d}_{}".format(n, (base + ".md").replace("&", "and"))

    heading = re.compile(r"\n([A-Za-z\:\& ]*)\n=+\n")
    with Path("onjava-2.md").open(encoding="utf8") as ojmd2:
        pieces = heading.split(ojmd2.read())
    # split() with one capture group alternates body, title, body, title, ...
    titles, texts = pieces[1::2], pieces[0::2]
    chaps = OrderedDict([("Front", texts[0])])
    chaps.update(zip(titles, texts[1:]))

    for index, (title, text) in enumerate(chaps.items()):
        fname = mdfilename(title, index)
        print(fname.encode("utf8"))
        with Path(fname).open('w', encoding="utf8") as chp:
            chp.write(title + "\n")
            chp.write("=" * len(title) + "\n")
            chp.write(text)
2015-06-15 15:55:34 -07:00
2015-06-18 21:26:52 -07:00
2015-06-17 23:51:32 -07:00
@CmdLine('w')
def view_in_texts():
    """
    Show all separate .md files in wysiwyg markdown editor.

    Runs the ``texts`` command once per ``.md`` file in the build directory.
    """
    os.chdir(str(ebookBuildPath))
    for markdown_file in Path(".").glob("*.md"):
        os.system("texts " + str(markdown_file))
2015-06-17 00:23:37 -07:00
@CmdLine('e')
def everything():
    """Produce Markdown file from Word doc"""
    # fresh_start() is not part of this chain; its call is disabled here.
    pipeline = (
        convert_to_html,
        convert_to_markdown,
        reconstruct_source_code_files,
        break_up_markdown_file,
    )
    for step in pipeline:
        step()
2015-06-15 15:55:34 -07:00
2015-06-18 21:26:52 -07:00
@CmdLine('r')
def reassemble_and_convert_to_epub():
    """
    Put markdown files together, then pandoc to epub.

    Concatenates every ``NN_*.md`` file in the build directory, in ascending
    file-name order, into ``onjava-assembled.md`` with a newline after each
    part. It then runs pandoc to build ``OnJava.epub``, copies the epub to
    ``OnJava.zip`` and unzips it into ``epub_files`` for inspection.
    """
    output_name = "onjava-assembled.md"
    os.chdir(str(ebookBuildPath))
    # glob() yields entries in filesystem order, which is not guaranteed to
    # be alphabetical. Sort so the NN_ prefixes decide the chapter order.
    chapter_files = sorted(Path(".").glob("[0-9][0-9]_*.md"))
    sections = []
    for md in chapter_files:
        print(str(md))
        with md.open(encoding="utf8") as part:
            sections.append(part.read() + "\n")
    with Path(output_name).open('w', encoding="utf8") as book:
        book.write("".join(sections))
    pandoc = ("pandoc {} -f markdown-native_divs -t epub -o OnJava.epub" + \
        " --epub-cover-image=cover.jpg " + \
        " --epub-embed-font=ubuntumono-r-webfont.ttf " + \
        " --epub-chapter-level=1 " + \
        " --toc-depth=2 " + \
        " --no-highlight " + \
        " --epub-stylesheet=onjava.css "
        ).format(output_name)
    print(pandoc)
    os.system(pandoc)
    # An epub is a zip archive; unpack a copy so its contents can be checked.
    shutil.copy("OnJava.epub", "OnJava.zip")
    os.system("unzip OnJava.zip -d epub_files")
2015-06-15 15:55:34 -07:00
# Script entry point. CmdLine comes from betools; presumably run() dispatches
# to the @CmdLine-registered function selected on the command line.
if __name__ == '__main__':
    CmdLine.run()