[Python-modules-commits] [python-odf] 87/118: Many improvements to EPub output

Fri Oct 3 21:27:27 UTC 2014

This is an automated email from the git hooks/post-receive script.

debacle pushed a commit to reference refs/remotes/upstream/master
in repository python-odf.

commit 09b1834c73f4c41fea9a7adde0f002df14f0e641
Author: Søren Roug <soren.roug at eea.europa.eu>
Date:   Sun May 16 16:42:33 2010 +0000

    Many improvements to EPub output
---
 contrib/odf2epub/odf2epub |  84 +++++++++++++++++++----------
 odf/load.py               |   2 +-
 odf/odf2xhtml.py          | 131 ++++++++++++++++++++++++----------------------
 3 files changed, 127 insertions(+), 90 deletions(-)

diff --git a/contrib/odf2epub/odf2epub b/contrib/odf2epub/odf2epub
index 8a77b57..53f51fb 100755
--- a/contrib/odf2epub/odf2epub
+++ b/contrib/odf2epub/odf2epub
@@ -19,6 +19,7 @@
 #
 from odf.odf2xhtml import ODF2XHTML
 from odf.namespaces import TEXTNS, XLINKNS
+from odf.opendocument import load
 import sys, getopt, time, zipfile
 from StringIO import StringIO
 from cgi import escape
@@ -68,12 +69,27 @@ class ODF2EPUB(ODF2XHTML):
 #       pass
 
     def s_text_h(self, tag, attrs):
+        """ Handle a heading
+            If the heading is a level 1 heading, then split the HTML file.
+            We have to be careful, because the heading can be inside a frame or a table.
+        """
         level = int(attrs[(TEXTNS,'outline-level')])
         if level == 1:
-            self.closetag('body')
-            self.closetag('html')
+            tags_to_keep = self.htmlstack[:]
+            tags_to_close = self.htmlstack[:]
+            tags_to_close.reverse()
+            for htag,hattrs,hblock in tags_to_close:
+                if htag == 'body':
+                    self.generate_footnotes()
+                    self._resetfootnotes()
+                self.closetag(htag)
+            # I have to do this rather ugly, as the saved header part doesn't
+            # go through the self.opentag() method
             self.chapters.append(''.join(self.headerpart + self.lines))
             self.lines = []
+            self.htmlstack = tags_to_keep[:2] # Only <html> and <body>
+            for htag,hattrs,hblock in tags_to_keep[2:]:
+                self.opentag(htag,hattrs,hblock)
         return ODF2XHTML.s_text_h(self, tag, attrs)
 
     def e_text_h(self, tag, attrs):
@@ -83,12 +99,16 @@ class ODF2EPUB(ODF2XHTML):
         if level < 1: level = 1
         lev = self.headinglevels[1:level+1]
         outline = '.'.join(map(str,lev) )
-        anchor = self.get_anchor("%s.%s" % ( outline, ''.join(self.data)))
-        n = NavpointEntry(anchor, ''.join(self.data), len(self.chapters), level)
+        heading = ''.join(self.data)
+        anchor = self.get_anchor("%s.%s" % ( outline, heading))
+        n = NavpointEntry(anchor, heading, len(self.chapters), level)
         self.navpoint_list.append(n)
         return ODF2XHTML.e_text_h(self, tag, attrs)
 
     def s_office_text(self, tag, attrs):
+        """ Save all the lines up to and including the <body> tag
+            so I can split the file into more files
+        """
         ODF2XHTML.s_office_text(self, tag, attrs)
         self.headerpart = self.lines
         self.lines = []
@@ -130,16 +150,18 @@ class EPublication:
       <dc:language>%s</dc:language>
       <dc:identifier id="BookID" opf:scheme="URI">%s</dc:identifier>
       <dc:creator>%s</dc:creator>
+      %s
     </metadata>
     <manifest>
-        <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>"""
+        <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
+        <item id="styles-css" href="styles.css" media-type="text/css"/>"""
 
     toc_ncx_head = """<?xml version="1.0"?>
 <!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN"
    "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
 
 <ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
-       <head>
+    <head>
         <meta name="dtb:uid" content="%s"/>
         <meta name="dtb:depth" content="2"/>
         <meta name="dtb:totalPageCount" content="0"/>
@@ -159,10 +181,11 @@ class EPublication:
 </ncx>"""
 
     def __init__(self, filename, coverimage):
-        self.filename = filename
+        self.doc = load(filename)
         self.coverimage = coverimage
         self.odhandler = ODF2EPUB(True, False)
-        self.odhandler.load(filename)
+        self.odhandler.set_style_file("styles.css")
+        self.odhandler.load(self.doc)
 
     def _zipwrite(self, outputfp):
         """ Write the document to an open file pointer """
@@ -181,6 +204,12 @@ class EPublication:
         zout.external_attr = UNIXPERMS
         outputfp.writestr(zout, self.container)
 
+        # Write CSS part
+        zout = zipfile.ZipInfo('OEBPS/styles.css', now)
+        zout.compress_type = zipfile.ZIP_DEFLATED
+        zout.external_attr = UNIXPERMS
+        outputfp.writestr(zout, self.odhandler.css())
+
         # Write HTML parts
         for chapter in range(len(self.odhandler.chapters)):
             zout = zipfile.ZipInfo('OEBPS/chapter%d.xhtml' % chapter, now)
@@ -190,35 +219,37 @@ class EPublication:
             outputfp.writestr(zout, xhtml)
 
         # Copy images over to output
-        z = zipfile.ZipFile(self.filename)
-        for zinfo in z.infolist():
-            if zinfo.filename[0:9] == 'Pictures/':
-                zipinfo = zipfile.ZipInfo("OEBPS/" + zinfo.filename, now)
-                zipinfo.external_attr = UNIXPERMS
-                outputfp.writestr(zipinfo, z.read(zinfo.filename))
+        for arcname, picturerec in self.doc.Pictures.items():
+            what_it_is, fileobj, mediatype = picturerec
+            zi = zipfile.ZipInfo("OEBPS/" + str(arcname), now)
+            zi.compress_type = zipfile.ZIP_STORED
+            zi.external_attr = UNIXPERMS
+            outputfp.writestr(zi, fileobj)
 
         # Write content.opf
         zout = zipfile.ZipInfo('OEBPS/content.opf', now)
         zout.compress_type = zipfile.ZIP_DEFLATED
         zout.external_attr = UNIXPERMS
         opf = []
+        if self.coverimage:
+            covermeta = """<meta name="cover" content="cover-image"/>"""
+        else:
+            covermeta = ""
         opf.append(self.content_opf_head % (escaped(self.odhandler.title), escaped(self.odhandler.language),
-                escaped(args[0]), escaped(self.odhandler.creator)))
+                escaped(args[0]), escaped(self.odhandler.creator), covermeta))
         if self.coverimage:
-            opf.append("""        <item id="cover"       href="cover.xhtml"    media-type="application/xhtml+xml"/>""")
+            opf.append("""        <item id="cover-page"       href="cover.xhtml"    media-type="application/xhtml+xml"/>""")
             opf.append("""        <item id="cover-image" href="Pictures/cover.jpg" media-type="image/jpeg"/>""")
         for chapter in range(len(self.odhandler.chapters)):
             opf.append("""        <item id="chapter%d.xhtml" href="chapter%d.xhtml" media-type="application/xhtml+xml"/>""" % (chapter, chapter))
         # Write manifest of images.
-        # FIXME: Set correct media-type
-        # FIXME: Provide a valid identifier
-        for zname in z.namelist():
-            if zname[0:9] == 'Pictures/':
-                opf.append("""        <item id="%s" href="%s" media-type="image/jpeg"/>""" % (zname, zname))
+        for arcname, picturerec in self.doc.Pictures.items():
+            what_it_is, fileobj, mediatype = picturerec
+            opf.append("""        <item id="%s" href="%s" media-type="%s"/>""" % (arcname.replace('/','_'), arcname, mediatype))
         opf.append("""</manifest>""")
         opf.append("""<spine toc="ncx">""")
         if self.coverimage:
-            opf.append("""        <itemref idref="cover" linear="no"/>""")
+            opf.append("""        <itemref idref="cover-page" linear="no"/>""")
         for chapter in range(len(self.odhandler.chapters)):
             opf.append("""        <itemref idref="chapter%d.xhtml"/>""" % chapter)
         opf.append("""</spine>""")
@@ -231,8 +262,6 @@ class EPublication:
         opf.append('</package>')
         outputfp.writestr(zout, '\n'.join(opf))
 
-        z.close()
-
         # Write toc.ncx
         zout = zipfile.ZipInfo('OEBPS/toc.ncx', now)
         zout.compress_type = zipfile.ZIP_DEFLATED
@@ -246,16 +275,17 @@ class EPublication:
         for np in self.odhandler.navpoint_list:
             if np_inx == 2:
                 np.level = 1
+            if np.level > 2: np.level = 2
             if np_inx != 2 and np.level <= np_level:
-                opf.append("""        </navPoint> <!-- same level -->""");
+                opf.append("""        </navPoint>""");
             if np_inx != 2 and np.level < np_level:
                 opf.append("""        </navPoint>""");
-            opf.append("""        <navPoint id="navPoint-%d" playOrder="%d">
+            opf.append("""        <navPoint id="navPoint-%d" playOrder="%d"> <!-- L%d -->
             <navLabel>
                 <text>%s</text>
             </navLabel>
             <content src="chapter%d.xhtml#%s"/>
-        """ % (np_inx, np_inx, escaped(np.title), np.chapter, np.anchor))
+        """ % (np_inx, np_inx, np.level, escaped(np.title), np.chapter, np.anchor))
             np_inx += 1
             np_level = np.level
         opf.append("""        </navPoint>""");
diff --git a/odf/load.py b/odf/load.py
index 1f0e45e..cdcc794 100644
--- a/odf/load.py
+++ b/odf/load.py
@@ -63,7 +63,7 @@ class LoadParser(handler.ContentHandler):
 
         self.level = self.level + 1
         # Add any accumulated text content
-        content = ''.join(self.data).strip()
+        content = ''.join(self.data)
         if len(content) > 0:
             self.parent.addText(content, check_grammar=False)
             self.data = []
diff --git a/odf/odf2xhtml.py b/odf/odf2xhtml.py
index 85397de..cb15844 100644
--- a/odf/odf2xhtml.py
+++ b/odf/odf2xhtml.py
@@ -20,13 +20,11 @@
 #
 #import pdb
 #pdb.set_trace()
-import zipfile
-import xml.sax
-from xml.sax import handler, expatreader
-from xml.sax.xmlreader import InputSource
+from xml.sax import handler
 from xml.sax.saxutils import escape, quoteattr
 from xml.dom import Node
-from cStringIO import StringIO
+
+from opendocument import load
 
 from namespaces import ANIMNS, CHARTNS, CONFIGNS, DCNS, DR3DNS, DRAWNS, FONS, \
   FORMNS, MATHNS, METANS, NUMBERNS, OFFICENS, PRESENTATIONNS, SCRIPTNS, \
@@ -358,6 +356,7 @@ class ODF2XHTML(handler.ContentHandler):
         (NUMBERNS, "date-style"):(self.s_ignorexml, None),
         (NUMBERNS, "number-style"):(self.s_ignorexml, None),
         (NUMBERNS, "text-style"):(self.s_ignorexml, None),
+        (OFFICENS, "annotation"):(self.s_ignorexml, None),
         (OFFICENS, "automatic-styles"):(self.s_office_automatic_styles, None),
         (OFFICENS, "document"):(self.s_office_document_content, self.e_office_document_content),
         (OFFICENS, "document-content"):(self.s_office_document_content, self.e_office_document_content),
@@ -432,6 +431,12 @@ class ODF2XHTML(handler.ContentHandler):
             self.elements[(OFFICENS, u"document-content")] = (None,None)
         self._resetobject()
 
+    def _resetfootnotes(self):
+        # Footnotes and endnotes
+        self.notedict = {}
+        self.currentnote = 0
+        self.notebody = ''
+
     def _resetobject(self):
         self.lines = []
         self._wfunc = self._wlines
@@ -441,11 +446,13 @@ class ODF2XHTML(handler.ContentHandler):
         self.creator = ''
         self.data = []
         self.tagstack = TagStack()
+        self.htmlstack = []
         self.pstack = []
         self.processelem = True
         self.processcont = True
         self.listtypes = {}
         self.headinglevels = [0, 0,0,0,0,0, 0,0,0,0,0] # level 0 to 10
+        self.use_internal_css = True
         self.cs = StyleToCSS()
         self.anchors = {}
 
@@ -454,10 +461,7 @@ class ODF2XHTML(handler.ContentHandler):
         self.styledict = {}
         self.currentstyle = None
 
-        # Footnotes and endnotes
-        self.notedict = {}
-        self.currentnote = 0
-        self.notebody = ''
+        self._resetfootnotes()
 
         # Tags from meta.xml
         self.metatags = []
@@ -474,6 +478,7 @@ class ODF2XHTML(handler.ContentHandler):
 
     def opentag(self, tag, attrs={}, block=False):
         """ Create an open HTML tag """
+        self.htmlstack.append((tag,attrs,block))
         a = []
         for key,val in attrs.items():
             a.append('''%s=%s''' % (key, quoteattr(val)))
@@ -485,6 +490,8 @@ class ODF2XHTML(handler.ContentHandler):
             self.writeout("\n")
 
     def closetag(self, tag, block=True):
+        """ Close an open HTML tag """
+        self.htmlstack.pop()
         self.writeout("</%s>" % tag)
         if block == True:
             self.writeout("\n")
@@ -574,7 +581,7 @@ class ODF2XHTML(handler.ContentHandler):
         """ Get the title from the meta data and create a HTML <title>
         """
         self.title = ''.join(self.data)
-        self.metatags.append('<title>%s</title>\n' % escape(self.title))
+        #self.metatags.append('<title>%s</title>\n' % escape(self.title))
         self.data = []
 
     def e_dc_metatag(self, tag, attrs):
@@ -729,13 +736,9 @@ class ODF2XHTML(handler.ContentHandler):
 
     def html_body(self, tag, attrs):
         self.writedata()
-        if self.generate_css:
+        if self.generate_css and self.use_internal_css:
             self.opentag('style', {'type':"text/css"}, True)
             self.writeout('/*<![CDATA[*/\n')
-            self.writeout('\nimg { width: 100%; height: 100%; }\n')
-            self.writeout('* { padding: 0; margin: 0;  background-color:white; }\n')
-            self.writeout('body { margin: 0 1em; }\n')
-            self.writeout('ol, ul { padding-left: 2em; }\n')
             self.generate_stylesheet()
             self.writeout('/*]]>*/\n')
             self.closetag('style')
@@ -743,6 +746,13 @@ class ODF2XHTML(handler.ContentHandler):
         self.closetag('head')
         self.opentag('body', block=True)
 
+    default_styles = """
+img { width: 100%; height: 100%; }
+* { padding: 0; margin: 0;  background-color:white; }
+body { margin: 0 1em; }
+ol, ul { padding-left: 2em; }
+"""
+
     def generate_stylesheet(self):
         for name in self.stylestack:
             styles = self.styledict.get(name)
@@ -762,6 +772,7 @@ class ODF2XHTML(handler.ContentHandler):
                 styles = parentstyle
             self.styledict[name] = styles
         # Write the styles to HTML
+        self.writeout(self.default_styles)
         for name in self.stylestack:
             styles = self.styledict.get(name)
             css2 = self.cs.convert_styles(styles)
@@ -803,6 +814,7 @@ class ODF2XHTML(handler.ContentHandler):
         self.emptytag('meta', { 'http-equiv':"Content-Type", 'content':"text/html;charset=UTF-8"})
         for metaline in self.metatags:
             self.writeout(metaline)
+        self.writeout('<title>%s</title>\n' % escape(self.title))
 
     def e_office_document_content(self, tag, attrs):
         """ Last tag """
@@ -1090,20 +1102,26 @@ class ODF2XHTML(handler.ContentHandler):
         self.purgedata()
 
     def e_text_h(self, tag, attrs):
-        """ Headings end """
+        """ Headings end
+            Side-effect: If there is no title in the metadata, then it is taken
+            from the first heading of any level.
+        """
         self.writedata()
         level = int(attrs[(TEXTNS,'outline-level')])
         if level > 6: level = 6 # Heading levels go only to 6 in XHTML
         if level < 1: level = 1
         lev = self.headinglevels[1:level+1]
         outline = '.'.join(map(str,lev) )
-        anchor = self.get_anchor("%s.%s" % ( outline, ''.join(self.data)))
+        heading = ''.join(self.data)
+        if self.title == '': self.title = heading
+        anchor = self.get_anchor("%s.%s" % ( outline, heading))
         self.opentag('a', {'id': anchor} )
         self.closetag('a', False)
         self.closetag('h%s' % level)
         self.purgedata()
 
     def s_text_line_break(self, tag, attrs):
+        """ Force a line break (<br/>) """
         self.writedata()
         self.emptytag('br')
         self.purgedata()
@@ -1123,9 +1141,9 @@ class ODF2XHTML(handler.ContentHandler):
             name = self.tagstack.rfindattr( (TEXTNS,'style-name') )
         list_class = "%s_%d" % (name, level)
         if self.generate_css:
-            self.opentag('%s' % self.listtypes.get(list_class,'UL'), {'class': list_class })
+            self.opentag('%s' % self.listtypes.get(list_class,'ul'), {'class': list_class })
         else:
-            self.opentag('%s' % self.listtypes.get(list_class,'UL'))
+            self.opentag('%s' % self.listtypes.get(list_class,'ul'))
         self.purgedata()
 
     def e_text_list(self, tag, attrs):
@@ -1140,7 +1158,7 @@ class ODF2XHTML(handler.ContentHandler):
             # textbox itself may be nested within another list.
             name = self.tagstack.rfindattr( (TEXTNS,'style-name') )
         list_class = "%s_%d" % (name, level)
-        self.closetag(self.listtypes.get(list_class,'UL'))
+        self.closetag(self.listtypes.get(list_class,'ul'))
         self.purgedata()
 
     def s_text_list_item(self, tag, attrs):
@@ -1319,25 +1337,14 @@ class ODF2XHTML(handler.ContentHandler):
 #-----------------------------------------------------------------------------
 
     def load(self, odffile):
-        self._odffile = odffile
-
-    def newcss(self, doc):
-        self._wfunc = self._writenothing
+        self.lines = []
+        self._wfunc = self._wlines
+        if isinstance(odffile, basestring):
+            doc = load(odffile)
+        else:
+            doc = odffile
         self._walknode(doc.topnode)
-        self._csslines = []
-        self._wfunc = self._writecss
-        self.generate_stylesheet()
-        res = ''.join(self._csslines)
-        del self._csslines
-        return res
 
-    def newxhtml(self, doc):
-        """ Takes a document opened with load() and parses it
-            The return value is the xhtml output
-        """
-        self._walknode(doc.topnode)
-        return ''.join(self.lines)
-        
     def _walknode(self, node):
         if node.nodeType == Node.ELEMENT_NODE:
             self.startElementNS(node.qname, node.tagName, node.attributes)
@@ -1347,26 +1354,6 @@ class ODF2XHTML(handler.ContentHandler):
         if node.nodeType == Node.TEXT_NODE or node.nodeType == Node.CDATA_SECTION_NODE:
             self.characters(unicode(node))
 
-    def parseodf(self):
-        self._resetobject()
-        # Extract the interesting files
-        z = zipfile.ZipFile(self._odffile)
-
-        # For some reason Trac has trouble when xml.sax.make_parser() is used.
-        # Could it be because PyXML is installed, and therefore a different parser
-        # might be chosen? By calling expatreader directly we avoid this issue
-        parser = expatreader.create_parser()
-        parser.setFeature(handler.feature_namespaces, 1)
-        parser.setContentHandler(self)
-        parser.setErrorHandler(handler.ErrorHandler())
-        inpsrc = InputSource()
-
-        for xmlfile in ('meta.xml', 'styles.xml', 'content.xml'):
-            self.xmlfile = xmlfile
-            content = z.read(xmlfile)
-            inpsrc.setByteStream(StringIO(content))
-            parser.parse(inpsrc)
-        z.close()
 
     def odf2xhtml(self, odffile):
         """ Load a file and return XHTML
@@ -1378,9 +1365,13 @@ class ODF2XHTML(handler.ContentHandler):
         if s != '': self.lines.append(s)
 
     def xhtml(self):
-        self.lines = []
-        self._wfunc = self._wlines
-        self.parseodf()
+        """ Parses the document and returns the HTML content """
+        return ''.join(self.lines)
+
+    def newxhtml(self, doc):
+        """ Takes a document opened with load() and parses it
+            The return value is the xhtml output
+        """
         return ''.join(self.lines)
 
     def _writecss(self, s):
@@ -1389,12 +1380,28 @@ class ODF2XHTML(handler.ContentHandler):
     def _writenothing(self, s):
         pass
 
+    def newcss(self, doc):
+        self._csslines = []
+        self._wfunc = self._writecss
+        self.generate_stylesheet()
+        res = ''.join(self._csslines)
+        self._wfunc = self._wlines
+        del self._csslines
+        return res
+
     def css(self):
-        self._wfunc = self._writenothing
-        self.parseodf()
+        """ Parses the document and returns the CSS content """
         self._csslines = []
         self._wfunc = self._writecss
         self.generate_stylesheet()
         res = ''.join(self._csslines)
+        self._wfunc = self._wlines
         del self._csslines
         return res
+
+    def set_style_file(self, stylefilename, media=None):
+        self.use_internal_css = False
+        if media:
+            self.metatags.append('<link rel="stylesheet" type="text/css" href="%s" media="%s"/>\n' % (stylefilename,media))
+        else:
+            self.metatags.append('<link rel="stylesheet" type="text/css" href="%s"/>\n' % (stylefilename))

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/python-modules/packages/python-odf.git