[Python-modules-commits] [python-odf] 26/118: More work on html parser
Wolfgang Borgert
debacle at moszumanska.debian.org
Fri Oct 3 21:27:18 UTC 2014
This is an automated email from the git hooks/post-receive script.
debacle pushed a commit to reference refs/remotes/upstream/master
in repository python-odf.
commit 5d3a0a69fd14b680db130888d44b44aad6fc0a16
Author: Søren Roug <soren.roug at eea.europa.eu>
Date: Sat Dec 20 17:27:01 2008 +0000
More work on html parser
---
contrib/html2odt/htmlstyles.py | 14 ++++---
contrib/html2odt/shtml2odt.py | 88 ++++++++++++++++++++----------------------
2 files changed, 50 insertions(+), 52 deletions(-)
diff --git a/contrib/html2odt/htmlstyles.py b/contrib/html2odt/htmlstyles.py
index 08d8391..665d035 100644
--- a/contrib/html2odt/htmlstyles.py
+++ b/contrib/html2odt/htmlstyles.py
@@ -9,18 +9,20 @@ def addStandardStyles(doc):
style.addElement(p)
doc.styles.addElement(style)
- style = Style(name="Text_20_body_20_indent", displayname="Text body indent", family="paragraph", parentstylename="Text_20_body", attributes={'class':"text"})
- p = ParagraphProperties(marginleft="0.499cm", marginright="0cm", textindent="0cm", autotextindent="false")
+ style = Style(name="List_20_Contents", displayname="List Contents", family="paragraph", parentstylename="Standard", attributes={'class':"html"})
+ p = ParagraphProperties(marginleft="1cm", marginright="0cm", textindent="0cm", autotextindent="false")
style.addElement(p)
doc.styles.addElement(style)
- style = Style(name="Salutation", family="paragraph", parentstylename="Standard", attributes={'class':"text"})
- p = ParagraphProperties(numberlines="false", linenumber=0)
+
+ style = Style(name="List_20_Heading", displayname="List Heading", family="paragraph", parentstylename="Standard",
+ nextstylename="List_20_Contents", attributes={'class':"html"})
+ p = ParagraphProperties(marginleft="0cm", marginright="0cm", textindent="0cm", autotextindent="false")
style.addElement(p)
doc.styles.addElement(style)
- style = Style(name="Signature", family="paragraph", parentstylename="Standard", attributes={'class':"text"})
- p = ParagraphProperties(numberlines="false", linenumber=0)
+ style = Style(name="Text_20_body_20_indent", displayname="Text body indent", family="paragraph", parentstylename="Text_20_body", attributes={'class':"text"})
+ p = ParagraphProperties(marginleft="0.499cm", marginright="0cm", textindent="0cm", autotextindent="false")
style.addElement(p)
doc.styles.addElement(style)
diff --git a/contrib/html2odt/shtml2odt.py b/contrib/html2odt/shtml2odt.py
index 1717a43..4cb1adf 100644
--- a/contrib/html2odt/shtml2odt.py
+++ b/contrib/html2odt/shtml2odt.py
@@ -26,7 +26,7 @@ from cgi import escape,parse_header
from types import StringType
from odf.opendocument import OpenDocumentText
-from odf import dc,text
+from odf import dc, text, table
import htmlstyles
@@ -106,38 +106,37 @@ class HTML2ODTParser(HTMLParser):
self.processcont = True
self.__data = []
self.elements = {
- 'a': (self.s_html_a, self.e_html_a),
+ 'a': (self.s_html_a, self.close_tag),
'base': ( self.output_base, None),
- 'b': ( self.s_html_emphasis, self.e_html_emphasis),
+ 'b': ( self.s_html_emphasis, self.close_tag),
'br': ( self.output_br, None),
- 'caption': ( self.output_caption, None),
'col': ( self.s_html_col, None),
- 'dd': ( self.s_html_dd, None),
+ 'dd': ( self.s_html_dd, self.close_tag),
'dt': ( self.s_html_dt, None),
'div': ( self.s_html_section, self.e_html_section),
- 'em': ( self.s_html_emphasis, self.e_html_emphasis),
- 'h1': ( self.s_html_headline, self.e_html_headline),
- 'h2': ( self.s_html_headline, self.e_html_headline),
- 'h3': ( self.s_html_headline, self.e_html_headline),
- 'h4': ( self.s_html_headline, self.e_html_headline),
- 'h5': ( self.s_html_headline, self.e_html_headline),
- 'h6': ( self.s_html_headline, self.e_html_headline),
+ 'em': ( self.s_html_emphasis, self.close_tag),
+ 'h1': ( self.s_html_headline, self.close_tag),
+ 'h2': ( self.s_html_headline, self.close_tag),
+ 'h3': ( self.s_html_headline, self.close_tag),
+ 'h4': ( self.s_html_headline, self.close_tag),
+ 'h5': ( self.s_html_headline, self.close_tag),
+ 'h6': ( self.s_html_headline, self.close_tag),
'head': ( self.s_ignorexml, None),
- 'i': ( self.s_html_emphasis, self.e_html_emphasis),
+ 'i': ( self.s_html_emphasis, self.close_tag),
'img': ( self.output_img, None),
'li': ( self.s_html_li, self.e_html_li),
'meta': ( self.meta_encoding, None),
'ol': ( self.output_ol, self.e_html_list),
'p': ( self.s_html_block, self.e_html_block),
- 'span': ( self.s_html_span, self.e_html_span),
- 'strong':( self.s_html_emphasis, self.e_html_emphasis),
+ 'span': ( self.s_html_span, self.close_tag),
+ 'strong':( self.s_html_emphasis, self.close_tag),
'table':( self.s_html_table, self.e_html_table),
- 'td': ( self.s_html_td, self.e_html_td),
- 'th': ( self.s_html_td, self.e_html_td),
+ 'td': ( self.s_html_td, self.close_tag),
+ 'th': ( self.s_html_td, self.close_tag),
'title':( self.s_html_title, self.e_html_title),
- 'tr': ( self.s_html_tr, self.e_html_tr),
+ 'tr': ( self.s_html_tr, self.close_tag),
'ul': ( self.output_ul, self.e_html_list),
- 'var': ( self.s_html_emphasis, self.e_html_emphasis),
+ 'var': ( self.s_html_emphasis, self.close_tag),
'input':( self.output_input, None),
'select':( self.output_select, None),
'textarea':( self.output_textarea, None),
@@ -218,17 +217,11 @@ class HTML2ODTParser(HTMLParser):
self.curr.addElement(e)
self.curr = e
- def e_html_emphasis(self, tag):
- self.curr = self.curr.parentNode
-
def s_html_span(self, tag, attrs):
e = text.Span()
self.curr.addElement(e)
self.curr = e
- def e_html_span(self, tag):
- self.curr = self.curr.parentNode
-
def s_html_title(self, tag, attrs):
e = dc.Title()
self.doc.meta.addElement(e)
@@ -256,14 +249,20 @@ class HTML2ODTParser(HTMLParser):
e = text.A(type="simple", href=href)
else:
e = text.A()
+# if self.curr.parentNode.qname != text.P().qname:
+# p = text.P()
+# self.curr.addElement(p)
+# self.curr = p
self.curr.addElement(e)
self.curr = e
- def e_html_a(self, tag):
+ def close_tag(self, tag):
self.curr = self.curr.parentNode
def s_html_dd(self, tag, attrs):
- self.write_odt(u'<text:p text:style-name="List_20_Contents">')
+ e = text.P(stylename="List_20_Contents")
+ self.curr.addElement(e)
+ self.curr = e
def s_html_dt(self, tag, attrs):
self.write_odt(u'<text:p text:style-name="List_20_Heading">')
@@ -316,32 +315,27 @@ class HTML2ODTParser(HTMLParser):
self.curr.addElement(e)
self.curr = e
- def e_html_headline(self, tag):
- self.curr = self.curr.parentNode
-
def s_html_table(self, tag, attrs):
- self.write_odt(u'<table:table>')
+ e = table.Table()
+ self.curr.addElement(e)
+ self.curr = e
def e_html_table(self, tag):
- self.write_odt(u'</table:table>')
+ self.curr = self.curr.parentNode
def s_html_td(self, tag, attrs):
- self.write_odt(u'<table:table-cell>')
-
- def e_html_td(self, tag):
- self.write_odt(u'</table:table-cell>')
+ e = table.TableCell()
+ self.curr.addElement(e)
+ self.curr = e
def s_html_tr(self, tag, attrs):
- self.write_odt(u'<table:table-row>')
-
- def e_html_tr(self, tag):
- self.write_odt(u'</table:table-row>')
+ e = table.TableRow()
+ self.curr.addElement(e)
+ self.curr = e
def s_html_col(self, tag, attrs):
- self.write_odt(u'<table:table-column/>')
-
- def output_caption(self, tag, attrs):
- self.write_odt(u'Caption: ')
+ e = table.TableColumn()
+ self.curr.addElement(e)
def s_html_section(self, tag, attrs):
""" Outputs block tag such as <p> and <div> """
@@ -349,11 +343,13 @@ class HTML2ODTParser(HTMLParser):
if name is None:
self.sectnum = self.sectnum + 1
name = "Sect%d" % self.sectnum
- self.write_odt(u'<text:section text:name="%s">' % name)
+ e = text.Section(name=name)
+ self.curr.addElement(e)
+ self.curr = e
def e_html_section(self, tag):
""" Outputs block tag such as <p> and <div> """
- self.write_odt(u'</text:section>')
+ self.curr = self.curr.parentNode
def s_html_block(self, tag, attrs):
""" Outputs block tag such as <p> and <div> """
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/python-modules/packages/python-odf.git
More information about the Python-modules-commits mailing list