[med-svn] [python-latexcodec] 01/04: Imported Upstream version 1.0.1
Kevin Murray
daube-guest at moszumanska.debian.org
Fri Oct 9 10:20:51 UTC 2015
This is an automated email from the git hooks/post-receive script.
daube-guest pushed a commit to branch master
in repository python-latexcodec.
commit 83b86f3e5938b89a36ccd7d37b58f0d01b4b714a
Author: Kevin Murray <spam at kdmurray.id.au>
Date: Thu Oct 8 22:26:36 2015 +1100
Imported Upstream version 1.0.1
---
.travis.yml | 27 ++
AUTHORS.rst | 26 ++
CHANGELOG.rst | 54 +++
INSTALL.rst | 43 +++
LICENSE.rst | 23 ++
MANIFEST.in | 14 +
README.rst | 32 ++
VERSION | 1 +
doc/Makefile | 153 ++++++++
doc/_build/.gitignore | 0
doc/api.rst | 8 +
doc/api/codec.rst | 1 +
doc/api/lexer.rst | 1 +
doc/authors.rst | 5 +
doc/changes.rst | 7 +
doc/conf.py | 41 +++
doc/index.rst | 25 ++
doc/license.rst | 11 +
doc/make.bat | 190 ++++++++++
doc/quickstart.rst | 13 +
latexcodec/__init__.py | 2 +
latexcodec/codec.py | 810 +++++++++++++++++++++++++++++++++++++++++++
latexcodec/lexer.py | 420 ++++++++++++++++++++++
requirements.txt | 1 +
setup.cfg | 8 +
setup.py | 45 +++
test/test_install_example.py | 19 +
test/test_latex_codec.py | 362 +++++++++++++++++++
test/test_latex_lexer.py | 442 +++++++++++++++++++++++
29 files changed, 2784 insertions(+)
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..46a3160
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,27 @@
+language: python
+python:
+ - "3.4"
+ - "3.3"
+ - "2.7"
+ - "2.6"
+ - "pypy"
+branches:
+ only:
+ - develop
+install:
+ - "pip install ."
+ - "if [[ $TRAVIS_PYTHON_VERSION == '2.7' ]]; then pip install coveralls check-manifest flake8 Sphinx; fi"
+script:
+ - "if [[ $TRAVIS_PYTHON_VERSION == '2.7' ]]; then check-manifest; fi"
+ - "if [[ $TRAVIS_PYTHON_VERSION == '2.7' ]]; then flake8; fi"
+ - "pushd doc"
+ - "if [[ $TRAVIS_PYTHON_VERSION == '2.7' ]]; then make html; fi"
+ - "popd"
+ - "pushd test"
+ - "if [[ $TRAVIS_PYTHON_VERSION == '2.7' ]]; then coverage run --source=latexcodec `type -p nosetests`; fi"
+ - "if [[ $TRAVIS_PYTHON_VERSION != '2.7' ]]; then nosetests; fi"
+ - "popd"
+after_success:
+ - "pushd test"
+ - "if [[ $TRAVIS_PYTHON_VERSION == '2.7' ]]; then coveralls; fi"
+ - "popd"
diff --git a/AUTHORS.rst b/AUTHORS.rst
new file mode 100644
index 0000000..d97e846
--- /dev/null
+++ b/AUTHORS.rst
@@ -0,0 +1,26 @@
+Main authors:
+
+* David Eppstein
+
+ - wrote the original LaTeX codec as a recipe on ActiveState
+ http://code.activestate.com/recipes/252124-latex-codec/
+
+* Peter Tröger
+
+ - wrote the original latexcodec package, which contained a simple
+ but very effective LaTeX encoder
+
+* Matthias Troffaes (matthias.troffaes at gmail.com)
+
+ - wrote the lexer
+
+ - integrated codec with the lexer for a simpler and more robust
+ design
+
+ - various bugfixes
+
+Contributors:
+
+* Michael Radziej
+
+* Philipp Spitzer
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
new file mode 100644
index 0000000..c9b7570
--- /dev/null
+++ b/CHANGELOG.rst
@@ -0,0 +1,54 @@
+1.0.1 (24 September 2014)
+-------------------------
+
+* br"\par" is now decoded using two newlines (see issue #26, reported
+ by Jorrit Wronski).
+
+* Fix encoding and decoding of the ogonek (see issue #24, reported by
+ beltiste).
+
+1.0.0 (5 August 2014)
+---------------------
+
+* Add Python 3.4 support.
+
+* Fix "DZ" decoding (see issue #21, reported and fixed by Philipp
+ Spitzer).
+
+0.3.2 (17 April 2014)
+---------------------
+
+* Fix underscore "\_" encoding (see issue #17, reported and fixed by
+ Michael Radziej).
+
+0.3.1 (5 February 2014)
+-----------------------
+
+* Drop Python 3.2 support.
+
+* Drop 2to3 and instead use six to support both Python 2 and 3 from a
+ single code base.
+
+* Fix control space "\ " decoding.
+
+* Fix LaTeX encoding of number sign "#" and other special ascii
+ characters (see issues #11 and #13, reported by beltiste).
+
+0.3.0 (19 August 2013)
+----------------------
+
+* Copied lexer and codec from sphinxcontrib-bibtex.
+
+* Initial usage and API documentation.
+
+* Some small bugs fixed.
+
+0.2 (28 September 2012)
+-----------------------
+
+* Added an additional codec with brackets around special characters.
+
+0.1 (26 May 2012)
+-----------------
+
+* Initial release.
diff --git a/INSTALL.rst b/INSTALL.rst
new file mode 100644
index 0000000..5f0503a
--- /dev/null
+++ b/INSTALL.rst
@@ -0,0 +1,43 @@
+Install the module with ``pip install latexcodec``, or from
+source using ``python setup.py install``.
+
+Minimal Example
+---------------
+
+Simply import the :mod:`latexcodec` module to enable ``"latex"``
+to be used as an encoding:
+
+.. code-block:: python
+
+ import latexcodec
+ text_latex = br"\'el\`eve"
+ assert text_latex.decode("latex") == u"élève"
+ text_unicode = u"ångström"
+ assert text_unicode.encode("latex") == br'\aa ngstr\"om'
+
+By default, the LaTeX input is assumed to be ascii, as per standard LaTeX.
+However, you can also request a combined codec
+as ``latex+<encoding>``, where ``<encoding>`` names another encoding.
+In that case, characters are translated to and from that encoding
+whenever possible.
+The following code snippet demonstrates this behaviour:
+
+.. code-block:: python
+
+ import latexcodec
+ text_latex = b"\xfe"
+ assert text_latex.decode("latex+latin1") == u"þ"
+ assert text_latex.decode("latex+latin2") == u"ţ"
+ text_unicode = u"ţ"
+ assert text_unicode.encode("latex+latin1") == b'\\c t' # ţ is not latin1
+ assert text_unicode.encode("latex+latin2") == b'\xfe' # but it is latin2
+
+Limitations
+-----------
+
+* Not all unicode characters are registered. If you find any missing,
+ please report them on the tracker:
+
+ https://github.com/mcmtroffaes/latexcodec/issues
+
+* Unicode combining characters are currently not handled.
diff --git a/LICENSE.rst b/LICENSE.rst
new file mode 100644
index 0000000..8e9e89e
--- /dev/null
+++ b/LICENSE.rst
@@ -0,0 +1,23 @@
+| latexcodec is a lexer and codec to work with LaTeX code in Python
+| Copyright (c) 2011-2014 by Matthias C. M. Troffaes
+
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation
+files (the "Software"), to deal in the Software without
+restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..8fe92ed
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,14 @@
+include VERSION
+include README.rst
+include INSTALL.rst
+include CHANGELOG.rst
+include LICENSE.rst
+include AUTHORS.rst
+include requirements.txt
+include tox.ini
+recursive-include doc *
+recursive-include test *
+global-exclude *.pyc
+global-exclude .gitignore
+prune doc/_build
+exclude .travis.yml
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..26e55b2
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,32 @@
+latexcodec
+==========
+
+|travis| |coveralls| |downloads| |version| |license|
+
+A lexer and codec to work with LaTeX code in Python.
+
+* Download: http://pypi.python.org/pypi/latexcodec/#downloads
+
+* Documentation: http://latexcodec.readthedocs.org/
+
+* Development: http://github.com/mcmtroffaes/latexcodec/
+
+.. |travis| image:: https://travis-ci.org/mcmtroffaes/latexcodec.png?branch=develop
+ :target: https://travis-ci.org/mcmtroffaes/latexcodec
+ :alt: travis-ci
+
+.. |coveralls| image:: https://coveralls.io/repos/mcmtroffaes/latexcodec/badge.png?branch=develop
+ :target: https://coveralls.io/r/mcmtroffaes/latexcodec?branch=develop
+ :alt: coveralls.io
+
+.. |downloads| image:: https://pypip.in/d/latexcodec/badge.png
+ :target: http://pypi.python.org/pypi/latexcodec/
+ :alt: downloads
+
+.. |version| image:: https://pypip.in/v/latexcodec/badge.png
+ :target: http://pypi.python.org/pypi/latexcodec/
+ :alt: latest version
+
+.. |license| image:: https://pypip.in/license/latexcodec/badge.png
+ :target: http://pypi.python.org/pypi/latexcodec/
+ :alt: license
diff --git a/VERSION b/VERSION
new file mode 100644
index 0000000..7dea76e
--- /dev/null
+++ b/VERSION
@@ -0,0 +1 @@
+1.0.1
diff --git a/doc/Makefile b/doc/Makefile
new file mode 100644
index 0000000..57c9fc5
--- /dev/null
+++ b/doc/Makefile
@@ -0,0 +1,153 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS =
+SPHINXBUILD = sphinx-build
+PAPER =
+BUILDDIR = _build
+
+# Internal variables.
+PAPEROPT_a4 = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+# the i18n builder cannot share the environment and doctrees with the others
+I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+
+.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
+
+help:
+ @echo "Please use \`make <target>' where <target> is one of"
+ @echo " html to make standalone HTML files"
+ @echo " dirhtml to make HTML files named index.html in directories"
+ @echo " singlehtml to make a single large HTML file"
+ @echo " pickle to make pickle files"
+ @echo " json to make JSON files"
+ @echo " htmlhelp to make HTML files and a HTML help project"
+ @echo " qthelp to make HTML files and a qthelp project"
+ @echo " devhelp to make HTML files and a Devhelp project"
+ @echo " epub to make an epub"
+ @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+ @echo " latexpdf to make LaTeX files and run them through pdflatex"
+ @echo " text to make text files"
+ @echo " man to make manual pages"
+ @echo " texinfo to make Texinfo files"
+ @echo " info to make Texinfo files and run them through makeinfo"
+ @echo " gettext to make PO message catalogs"
+ @echo " changes to make an overview of all changed/added/deprecated items"
+ @echo " linkcheck to check all external links for integrity"
+ @echo " doctest to run all doctests embedded in the documentation (if enabled)"
+
+clean:
+ -rm -rf $(BUILDDIR)/*
+
+html:
+ $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+ @echo
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+dirhtml:
+ $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+ @echo
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+singlehtml:
+ $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+ @echo
+ @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+pickle:
+ $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+ @echo
+ @echo "Build finished; now you can process the pickle files."
+
+json:
+ $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+ @echo
+ @echo "Build finished; now you can process the JSON files."
+
+htmlhelp:
+ $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+ @echo
+ @echo "Build finished; now you can run HTML Help Workshop with the" \
+ ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+qthelp:
+ $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+ @echo
+ @echo "Build finished; now you can run "qcollectiongenerator" with the" \
+ ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+ @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/latexcodec.qhcp"
+ @echo "To view the help file:"
+ @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/latexcodec.qhc"
+
+devhelp:
+ $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+ @echo
+ @echo "Build finished."
+ @echo "To view the help file:"
+ @echo "# mkdir -p $$HOME/.local/share/devhelp/latexcodec"
+ @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/latexcodec"
+ @echo "# devhelp"
+
+epub:
+ $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+ @echo
+ @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+latex:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+ @echo
+ @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+ @echo "Run \`make' in that directory to run these through (pdf)latex" \
+ "(use \`make latexpdf' here to do that automatically)."
+
+latexpdf:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+ @echo "Running LaTeX files through pdflatex..."
+ $(MAKE) -C $(BUILDDIR)/latex all-pdf
+ @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+text:
+ $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+ @echo
+ @echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+man:
+ $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+ @echo
+ @echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+texinfo:
+ $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+ @echo
+ @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
+ @echo "Run \`make' in that directory to run these through makeinfo" \
+ "(use \`make info' here to do that automatically)."
+
+info:
+ $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+ @echo "Running Texinfo files through makeinfo..."
+ make -C $(BUILDDIR)/texinfo info
+ @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
+
+gettext:
+ $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
+ @echo
+ @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
+
+changes:
+ $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+ @echo
+ @echo "The overview file is in $(BUILDDIR)/changes."
+
+linkcheck:
+ $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+ @echo
+ @echo "Link check complete; look for any errors in the above output " \
+ "or in $(BUILDDIR)/linkcheck/output.txt."
+
+doctest:
+ $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+ @echo "Testing of doctests in the sources finished, look at the " \
+ "results in $(BUILDDIR)/doctest/output.txt."
diff --git a/doc/_build/.gitignore b/doc/_build/.gitignore
new file mode 100644
index 0000000..e69de29
diff --git a/doc/api.rst b/doc/api.rst
new file mode 100644
index 0000000..c5c989a
--- /dev/null
+++ b/doc/api.rst
@@ -0,0 +1,8 @@
+API
+~~~
+
+.. toctree::
+ :maxdepth: 2
+
+ api/codec
+ api/lexer
diff --git a/doc/api/codec.rst b/doc/api/codec.rst
new file mode 100644
index 0000000..ff39d09
--- /dev/null
+++ b/doc/api/codec.rst
@@ -0,0 +1 @@
+.. automodule:: latexcodec.codec
diff --git a/doc/api/lexer.rst b/doc/api/lexer.rst
new file mode 100644
index 0000000..89f9cbc
--- /dev/null
+++ b/doc/api/lexer.rst
@@ -0,0 +1 @@
+.. automodule:: latexcodec.lexer
diff --git a/doc/authors.rst b/doc/authors.rst
new file mode 100644
index 0000000..45122fc
--- /dev/null
+++ b/doc/authors.rst
@@ -0,0 +1,5 @@
+Authors
+=======
+
+.. include:: ../AUTHORS.rst
+
diff --git a/doc/changes.rst b/doc/changes.rst
new file mode 100644
index 0000000..2eb28cc
--- /dev/null
+++ b/doc/changes.rst
@@ -0,0 +1,7 @@
+:tocdepth: 1
+
+Changes
+=======
+
+.. include:: ../CHANGELOG.rst
+
diff --git a/doc/conf.py b/doc/conf.py
new file mode 100644
index 0000000..0f3942f
--- /dev/null
+++ b/doc/conf.py
@@ -0,0 +1,41 @@
+# -*- coding: utf-8 -*-
+#
+# latexcodec documentation build configuration file, created by
+# sphinx-quickstart on Wed Aug 3 15:45:22 2011.
+
+extensions = [
+ 'sphinx.ext.autodoc',
+ 'sphinx.ext.doctest',
+ 'sphinx.ext.intersphinx',
+ 'sphinx.ext.todo',
+ 'sphinx.ext.coverage',
+ 'sphinx.ext.pngmath',
+ 'sphinx.ext.viewcode']
+source_suffix = '.rst'
+master_doc = 'index'
+project = u'latexcodec'
+copyright = u'2011-2014, Matthias C. M. Troffaes'
+with open("../VERSION", "rb") as version_file:
+ release = version_file.read().strip()
+version = '.'.join(release.split('.')[:2])
+exclude_patterns = ['_build']
+pygments_style = 'sphinx'
+html_theme = 'default'
+htmlhelp_basename = 'latexcodecdoc'
+latex_documents = [
+ ('index', 'latexcodec.tex',
+ u'latexcodec Documentation',
+ u'Matthias C. M. Troffaes', 'manual'),
+]
+man_pages = [
+ ('index', 'latexcodec', u'latexcodec Documentation',
+ [u'Matthias C. M. Troffaes'], 1)
+]
+texinfo_documents = [
+ ('index', 'latexcodec', u'latexcodec Documentation',
+ u'Matthias C. M. Troffaes',
+ 'latexcodec', 'A lexer and codec to work with LaTeX code in Python.', 'Miscellaneous'),
+]
+intersphinx_mapping = {
+ 'python': ('http://docs.python.org/', None),
+}
diff --git a/doc/index.rst b/doc/index.rst
new file mode 100644
index 0000000..05bd2cf
--- /dev/null
+++ b/doc/index.rst
@@ -0,0 +1,25 @@
+Welcome to latexcodec's documentation!
+======================================
+
+:Release: |release|
+:Date: |today|
+
+Contents
+--------
+
+.. toctree::
+ :maxdepth: 2
+
+ quickstart
+ api
+ changes
+ authors
+ license
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
+
diff --git a/doc/license.rst b/doc/license.rst
new file mode 100644
index 0000000..81a43fc
--- /dev/null
+++ b/doc/license.rst
@@ -0,0 +1,11 @@
+License
+=======
+
+.. include:: ../LICENSE.rst
+
+.. rubric:: Remark
+
+Versions 0.1 and 0.2 of the latexcodec package were written by
+Peter Tröger, and were released under the Academic Free License 3.0.
+The current version of the latexcodec package shares no code with those
+earlier versions.
diff --git a/doc/make.bat b/doc/make.bat
new file mode 100644
index 0000000..b280cac
--- /dev/null
+++ b/doc/make.bat
@@ -0,0 +1,190 @@
+@ECHO OFF
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+ set SPHINXBUILD=sphinx-build
+)
+set BUILDDIR=_build
+set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
+set I18NSPHINXOPTS=%SPHINXOPTS% .
+if NOT "%PAPER%" == "" (
+ set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
+ set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
+)
+
+if "%1" == "" goto help
+
+if "%1" == "help" (
+ :help
+ echo.Please use `make ^<target^>` where ^<target^> is one of
+ echo. html to make standalone HTML files
+ echo. dirhtml to make HTML files named index.html in directories
+ echo. singlehtml to make a single large HTML file
+ echo. pickle to make pickle files
+ echo. json to make JSON files
+ echo. htmlhelp to make HTML files and an HTML help project
+ echo. qthelp to make HTML files and a qthelp project
+ echo. devhelp to make HTML files and a Devhelp project
+ echo. epub to make an epub
+ echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter
+ echo. text to make text files
+ echo. man to make manual pages
+ echo. texinfo to make Texinfo files
+ echo. gettext to make PO message catalogs
+ echo. changes to make an overview of all changed/added/deprecated items
+ echo. linkcheck to check all external links for integrity
+ echo. doctest to run all doctests embedded in the documentation if enabled
+ goto end
+)
+
+if "%1" == "clean" (
+ for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
+ del /q /s %BUILDDIR%\*
+ goto end
+)
+
+if "%1" == "html" (
+ %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The HTML pages are in %BUILDDIR%/html.
+ goto end
+)
+
+if "%1" == "dirhtml" (
+ %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
+ goto end
+)
+
+if "%1" == "singlehtml" (
+ %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
+ goto end
+)
+
+if "%1" == "pickle" (
+ %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; now you can process the pickle files.
+ goto end
+)
+
+if "%1" == "json" (
+ %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; now you can process the JSON files.
+ goto end
+)
+
+if "%1" == "htmlhelp" (
+ %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; now you can run HTML Help Workshop with the ^
+.hhp project file in %BUILDDIR%/htmlhelp.
+ goto end
+)
+
+if "%1" == "qthelp" (
+ %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; now you can run "qcollectiongenerator" with the ^
+.qhcp project file in %BUILDDIR%/qthelp, like this:
+ echo.^> qcollectiongenerator %BUILDDIR%\qthelp\latexcodec.qhcp
+ echo.To view the help file:
+ echo.^> assistant -collectionFile %BUILDDIR%\qthelp\latexcodec.ghc
+ goto end
+)
+
+if "%1" == "devhelp" (
+ %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished.
+ goto end
+)
+
+if "%1" == "epub" (
+ %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The epub file is in %BUILDDIR%/epub.
+ goto end
+)
+
+if "%1" == "latex" (
+ %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
+ goto end
+)
+
+if "%1" == "text" (
+ %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The text files are in %BUILDDIR%/text.
+ goto end
+)
+
+if "%1" == "man" (
+ %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The manual pages are in %BUILDDIR%/man.
+ goto end
+)
+
+if "%1" == "texinfo" (
+ %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
+ goto end
+)
+
+if "%1" == "gettext" (
+ %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
+ goto end
+)
+
+if "%1" == "changes" (
+ %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.The overview file is in %BUILDDIR%/changes.
+ goto end
+)
+
+if "%1" == "linkcheck" (
+ %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Link check complete; look for any errors in the above output ^
+or in %BUILDDIR%/linkcheck/output.txt.
+ goto end
+)
+
+if "%1" == "doctest" (
+ %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Testing of doctests in the sources finished, look at the ^
+results in %BUILDDIR%/doctest/output.txt.
+ goto end
+)
+
+:end
diff --git a/doc/quickstart.rst b/doc/quickstart.rst
new file mode 100644
index 0000000..d8680a8
--- /dev/null
+++ b/doc/quickstart.rst
@@ -0,0 +1,13 @@
+Getting Started
+===============
+
+Overview
+--------
+
+.. include:: ../README.rst
+ :start-line: 5
+
+Installation
+------------
+
+.. include:: ../INSTALL.rst
diff --git a/latexcodec/__init__.py b/latexcodec/__init__.py
new file mode 100644
index 0000000..9ef80c3
--- /dev/null
+++ b/latexcodec/__init__.py
@@ -0,0 +1,2 @@
+import latexcodec.codec
+latexcodec.codec.register()
diff --git a/latexcodec/codec.py b/latexcodec/codec.py
new file mode 100644
index 0000000..173989e
--- /dev/null
+++ b/latexcodec/codec.py
@@ -0,0 +1,810 @@
+# -*- coding: utf-8 -*-
+"""
+ LaTeX Codec
+ ~~~~~~~~~~~
+
+ The :mod:`latexcodec.codec` module
+ contains all classes and functions for LaTeX code
+ translation. For practical use,
+ you should only ever need to import the :mod:`latexcodec` module,
+ which will automatically register the codec
+ so it can be used by :meth:`str.encode`, :meth:`str.decode`,
+ and any of the functions defined in the :mod:`codecs` module
+ such as :func:`codecs.open` and so on.
+ The other functions and classes
+ are exposed in case someone would want to extend them.
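+
+ A minimal usage sketch (importing :mod:`latexcodec` is what
+ registers the codec; the file name below is hypothetical):
+
+ .. code-block:: python
+
+ import codecs
+ import latexcodec # registering happens on import
+
+ assert u"élève".encode("latex") == br"\'el\`eve"
+ # streams work too, via the registered StreamReader:
+ # with codecs.open("notes.tex", encoding="latex") as f:
+ # text = f.read()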
+
+ .. autofunction:: register
+
+ .. autofunction:: find_latex
+
+ .. autoclass:: LatexIncrementalEncoder
+ :show-inheritance:
+ :members:
+
+ .. autoclass:: LatexIncrementalDecoder
+ :show-inheritance:
+ :members:
+
+ .. autoclass:: LatexCodec
+ :show-inheritance:
+ :members:
+
+ .. autoclass:: LatexUnicodeTable
+ :members:
+"""
+
+# Copyright (c) 2003, 2008 David Eppstein
+# Copyright (c) 2011-2014 Matthias C. M. Troffaes
+#
+# Permission is hereby granted, free of charge, to any person
+# obtaining a copy of this software and associated documentation
+# files (the "Software"), to deal in the Software without
+# restriction, including without limitation the rights to use,
+# copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following
+# conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+from __future__ import print_function
+
+import codecs
+from six import string_types
+from six.moves import range
+
+from latexcodec import lexer
+
+
+def register():
+ """Register the :func:`find_latex` codec search function.
+
+ .. seealso:: :func:`codecs.register`
+ """
+ codecs.register(find_latex)
+
+# getregentry() returns the codec info for the 'latex' encoding;
+# this entry point is only needed if latex_codec.py were ever
+# placed in the stdlib encodings package
+
+
+def getregentry():
+ """Encodings module API."""
+ return find_latex('latex')
+
+
+class LatexUnicodeTable:
+
+ """Tabulates a translation between LaTeX and unicode."""
+
+ def __init__(self, lexer):
+ self.lexer = lexer
+ self.unicode_map = {}
+ self.max_length = 0
+ self.latex_map = {}
+ self.register_all()
+
+ def register_all(self):
+ """Register all symbols and their LaTeX equivalents
+ (called by constructor).
+ """
+ # TODO complete this list
+ # register special symbols
+ self.register(u'\n\n', b' \\par', encode=False)
+ self.register(u'\n\n', b'\\par', encode=False)
+ self.register(u' ', b'\\ ', encode=False)
+ self.register(u'\N{EN DASH}', b'--')
+ self.register(u'\N{EN DASH}', b'\\textendash')
+ self.register(u'\N{EM DASH}', b'---')
+ self.register(u'\N{EM DASH}', b'\\textemdash')
+ self.register(u'\N{LEFT SINGLE QUOTATION MARK}', b'`', decode=False)
+ self.register(u'\N{RIGHT SINGLE QUOTATION MARK}', b"'", decode=False)
+ self.register(u'\N{LEFT DOUBLE QUOTATION MARK}', b'``')
+ self.register(u'\N{RIGHT DOUBLE QUOTATION MARK}', b"''")
+ self.register(u'\N{DAGGER}', b'\\dag')
+ self.register(u'\N{DOUBLE DAGGER}', b'\\ddag')
+
+ self.register(u'\N{BULLET}', b'\\bullet', mode='math')
+ self.register(u'\N{BULLET}', b'\\textbullet', package='textcomp')
+
+ self.register(u'\N{NUMBER SIGN}', b'\\#')
+ self.register(u'\N{LOW LINE}', b'\\_')
+ self.register(u'\N{AMPERSAND}', b'\\&')
+ self.register(u'\N{NO-BREAK SPACE}', b'~')
+ self.register(u'\N{INVERTED EXCLAMATION MARK}', b'!`')
+ self.register(u'\N{CENT SIGN}', b'\\not{c}')
+
+ self.register(u'\N{POUND SIGN}', b'\\pounds')
+ self.register(u'\N{POUND SIGN}', b'\\textsterling', package='textcomp')
+
+ self.register(u'\N{SECTION SIGN}', b'\\S')
+ self.register(u'\N{DIAERESIS}', b'\\"{}')
+ self.register(u'\N{NOT SIGN}', b'\\neg')
+ self.register(u'\N{SOFT HYPHEN}', b'\\-')
+ self.register(u'\N{MACRON}', b'\\={}')
+
+ self.register(u'\N{DEGREE SIGN}', b'^\\circ', mode='math')
+ self.register(u'\N{DEGREE SIGN}', b'\\textdegree', package='textcomp')
+
+ self.register(u'\N{PLUS-MINUS SIGN}', b'\\pm', mode='math')
+ self.register(u'\N{PLUS-MINUS SIGN}', b'\\textpm', package='textcomp')
+
+ self.register(u'\N{SUPERSCRIPT TWO}', b'^2', mode='math')
+ self.register(
+ u'\N{SUPERSCRIPT TWO}',
+ b'\\texttwosuperior',
+ package='textcomp')
+
+ self.register(u'\N{SUPERSCRIPT THREE}', b'^3', mode='math')
+ self.register(
+ u'\N{SUPERSCRIPT THREE}',
+ b'\\textthreesuperior',
+ package='textcomp')
+
+ self.register(u'\N{ACUTE ACCENT}', b"\\'{}")
+
+ self.register(u'\N{MICRO SIGN}', b'\\mu', mode='math')
+ self.register(u'\N{MICRO SIGN}', b'\\micro', package='gensymb')
+
+ self.register(u'\N{PILCROW SIGN}', b'\\P')
+
+ self.register(u'\N{MIDDLE DOT}', b'\\cdot', mode='math')
+ self.register(
+ u'\N{MIDDLE DOT}',
+ b'\\textperiodcentered',
+ package='textcomp')
+
+ self.register(u'\N{CEDILLA}', b'\\c{}')
+
+ self.register(u'\N{SUPERSCRIPT ONE}', b'^1', mode='math')
+ self.register(
+ u'\N{SUPERSCRIPT ONE}',
+ b'\\textonesuperior',
+ package='textcomp')
+
+ self.register(u'\N{INVERTED QUESTION MARK}', b'?`')
+ self.register(u'\N{LATIN CAPITAL LETTER A WITH GRAVE}', b'\\`A')
+ self.register(u'\N{LATIN CAPITAL LETTER A WITH CIRCUMFLEX}', b'\\^A')
+ self.register(u'\N{LATIN CAPITAL LETTER A WITH TILDE}', b'\\~A')
+ self.register(u'\N{LATIN CAPITAL LETTER A WITH DIAERESIS}', b'\\"A')
+ self.register(u'\N{LATIN CAPITAL LETTER A WITH RING ABOVE}', b'\\AA')
+ self.register(u'\N{LATIN CAPITAL LETTER AE}', b'\\AE')
+ self.register(u'\N{LATIN CAPITAL LETTER C WITH CEDILLA}', b'\\c C')
+ self.register(u'\N{LATIN CAPITAL LETTER E WITH GRAVE}', b'\\`E')
+ self.register(u'\N{LATIN CAPITAL LETTER E WITH ACUTE}', b"\\'E")
+ self.register(u'\N{LATIN CAPITAL LETTER E WITH CIRCUMFLEX}', b'\\^E')
+ self.register(u'\N{LATIN CAPITAL LETTER E WITH DIAERESIS}', b'\\"E')
+ self.register(u'\N{LATIN CAPITAL LETTER I WITH GRAVE}', b'\\`I')
+ self.register(u'\N{LATIN CAPITAL LETTER I WITH CIRCUMFLEX}', b'\\^I')
+ self.register(u'\N{LATIN CAPITAL LETTER I WITH DIAERESIS}', b'\\"I')
+ self.register(u'\N{LATIN CAPITAL LETTER N WITH TILDE}', b'\\~N')
+ self.register(u'\N{LATIN CAPITAL LETTER O WITH GRAVE}', b'\\`O')
+ self.register(u'\N{LATIN CAPITAL LETTER O WITH ACUTE}', b"\\'O")
+ self.register(u'\N{LATIN CAPITAL LETTER O WITH CIRCUMFLEX}', b'\\^O')
+ self.register(u'\N{LATIN CAPITAL LETTER O WITH TILDE}', b'\\~O')
+ self.register(u'\N{LATIN CAPITAL LETTER O WITH DIAERESIS}', b'\\"O')
+ self.register(u'\N{MULTIPLICATION SIGN}', b'\\times', mode='math')
+ self.register(u'\N{LATIN CAPITAL LETTER O WITH STROKE}', b'\\O')
+ self.register(u'\N{LATIN CAPITAL LETTER U WITH GRAVE}', b'\\`U')
+ self.register(u'\N{LATIN CAPITAL LETTER U WITH ACUTE}', b"\\'U")
+ self.register(u'\N{LATIN CAPITAL LETTER U WITH CIRCUMFLEX}', b'\\^U')
+ self.register(u'\N{LATIN CAPITAL LETTER U WITH DIAERESIS}', b'\\"U')
+ self.register(u'\N{LATIN CAPITAL LETTER Y WITH ACUTE}', b"\\'Y")
+ self.register(u'\N{LATIN SMALL LETTER SHARP S}', b'\\ss')
+ self.register(u'\N{LATIN SMALL LETTER A WITH GRAVE}', b'\\`a')
+ self.register(u'\N{LATIN SMALL LETTER A WITH ACUTE}', b"\\'a")
+ self.register(u'\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}', b'\\^a')
+ self.register(u'\N{LATIN SMALL LETTER A WITH TILDE}', b'\\~a')
+ self.register(u'\N{LATIN SMALL LETTER A WITH DIAERESIS}', b'\\"a')
+ self.register(u'\N{LATIN SMALL LETTER A WITH RING ABOVE}', b'\\aa')
+ self.register(u'\N{LATIN SMALL LETTER AE}', b'\\ae')
+ self.register(u'\N{LATIN SMALL LETTER C WITH CEDILLA}', b'\\c c')
+ self.register(u'\N{LATIN SMALL LETTER E WITH GRAVE}', b'\\`e')
+ self.register(u'\N{LATIN SMALL LETTER E WITH ACUTE}', b"\\'e")
+ self.register(u'\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}', b'\\^e')
+ self.register(u'\N{LATIN SMALL LETTER E WITH DIAERESIS}', b'\\"e')
+ self.register(u'\N{LATIN SMALL LETTER I WITH GRAVE}', b'\\`\\i')
+ self.register(u'\N{LATIN SMALL LETTER I WITH GRAVE}', b'\\`i')
+ self.register(u'\N{LATIN SMALL LETTER I WITH ACUTE}', b"\\'\\i")
+ self.register(u'\N{LATIN SMALL LETTER I WITH ACUTE}', b"\\'i")
+ self.register(u'\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}', b'\\^\\i')
+ self.register(u'\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}', b'\\^i')
+ self.register(u'\N{LATIN SMALL LETTER I WITH DIAERESIS}', b'\\"\\i')
+ self.register(u'\N{LATIN SMALL LETTER I WITH DIAERESIS}', b'\\"i')
+ self.register(u'\N{LATIN SMALL LETTER N WITH TILDE}', b'\\~n')
+ self.register(u'\N{LATIN SMALL LETTER O WITH GRAVE}', b'\\`o')
+ self.register(u'\N{LATIN SMALL LETTER O WITH ACUTE}', b"\\'o")
+ self.register(u'\N{LATIN SMALL LETTER O WITH CIRCUMFLEX}', b'\\^o')
+ self.register(u'\N{LATIN SMALL LETTER O WITH TILDE}', b'\\~o')
+ self.register(u'\N{LATIN SMALL LETTER O WITH DIAERESIS}', b'\\"o')
+ self.register(u'\N{DIVISION SIGN}', b'\\div', mode='math')
+ self.register(u'\N{LATIN SMALL LETTER O WITH STROKE}', b'\\o')
+ self.register(u'\N{LATIN SMALL LETTER U WITH GRAVE}', b'\\`u')
+ self.register(u'\N{LATIN SMALL LETTER U WITH ACUTE}', b"\\'u")
+ self.register(u'\N{LATIN SMALL LETTER U WITH CIRCUMFLEX}', b'\\^u')
+ self.register(u'\N{LATIN SMALL LETTER U WITH DIAERESIS}', b'\\"u')
+ self.register(u'\N{LATIN SMALL LETTER Y WITH ACUTE}', b"\\'y")
+ self.register(u'\N{LATIN SMALL LETTER Y WITH DIAERESIS}', b'\\"y')
+ self.register(u'\N{LATIN CAPITAL LETTER A WITH MACRON}', b'\\=A')
+ self.register(u'\N{LATIN SMALL LETTER A WITH MACRON}', b'\\=a')
+ self.register(u'\N{LATIN CAPITAL LETTER A WITH BREVE}', b'\\u A')
+ self.register(u'\N{LATIN SMALL LETTER A WITH BREVE}', b'\\u a')
+ self.register(u'\N{LATIN CAPITAL LETTER A WITH OGONEK}', b'\\k A')
+ self.register(u'\N{LATIN SMALL LETTER A WITH OGONEK}', b'\\k a')
+ self.register(u'\N{LATIN CAPITAL LETTER C WITH ACUTE}', b"\\'C")
+ self.register(u'\N{LATIN SMALL LETTER C WITH ACUTE}', b"\\'c")
+ self.register(u'\N{LATIN CAPITAL LETTER C WITH CIRCUMFLEX}', b'\\^C')
+ self.register(u'\N{LATIN SMALL LETTER C WITH CIRCUMFLEX}', b'\\^c')
+ self.register(u'\N{LATIN CAPITAL LETTER C WITH DOT ABOVE}', b'\\.C')
+ self.register(u'\N{LATIN SMALL LETTER C WITH DOT ABOVE}', b'\\.c')
+ self.register(u'\N{LATIN CAPITAL LETTER C WITH CARON}', b'\\v C')
+ self.register(u'\N{LATIN SMALL LETTER C WITH CARON}', b'\\v c')
+ self.register(u'\N{LATIN CAPITAL LETTER D WITH CARON}', b'\\v D')
+ self.register(u'\N{LATIN SMALL LETTER D WITH CARON}', b'\\v d')
+ self.register(u'\N{LATIN CAPITAL LETTER E WITH MACRON}', b'\\=E')
+ self.register(u'\N{LATIN SMALL LETTER E WITH MACRON}', b'\\=e')
+ self.register(u'\N{LATIN CAPITAL LETTER E WITH BREVE}', b'\\u E')
+ self.register(u'\N{LATIN SMALL LETTER E WITH BREVE}', b'\\u e')
+ self.register(u'\N{LATIN CAPITAL LETTER E WITH DOT ABOVE}', b'\\.E')
+ self.register(u'\N{LATIN SMALL LETTER E WITH DOT ABOVE}', b'\\.e')
+ self.register(u'\N{LATIN CAPITAL LETTER E WITH OGONEK}', b'\\k E')
+ self.register(u'\N{LATIN SMALL LETTER E WITH OGONEK}', b'\\k e')
+ self.register(u'\N{LATIN CAPITAL LETTER E WITH CARON}', b'\\v E')
+ self.register(u'\N{LATIN SMALL LETTER E WITH CARON}', b'\\v e')
+ self.register(u'\N{LATIN CAPITAL LETTER G WITH CIRCUMFLEX}', b'\\^G')
+ self.register(u'\N{LATIN SMALL LETTER G WITH CIRCUMFLEX}', b'\\^g')
+ self.register(u'\N{LATIN CAPITAL LETTER G WITH BREVE}', b'\\u G')
+ self.register(u'\N{LATIN SMALL LETTER G WITH BREVE}', b'\\u g')
+ self.register(u'\N{LATIN CAPITAL LETTER G WITH DOT ABOVE}', b'\\.G')
+ self.register(u'\N{LATIN SMALL LETTER G WITH DOT ABOVE}', b'\\.g')
+ self.register(u'\N{LATIN CAPITAL LETTER G WITH CEDILLA}', b'\\c G')
+ self.register(u'\N{LATIN SMALL LETTER G WITH CEDILLA}', b'\\c g')
+ self.register(u'\N{LATIN CAPITAL LETTER H WITH CIRCUMFLEX}', b'\\^H')
+ self.register(u'\N{LATIN SMALL LETTER H WITH CIRCUMFLEX}', b'\\^h')
+ self.register(u'\N{LATIN CAPITAL LETTER I WITH TILDE}', b'\\~I')
+ self.register(u'\N{LATIN SMALL LETTER I WITH TILDE}', b'\\~\\i')
+ self.register(u'\N{LATIN SMALL LETTER I WITH TILDE}', b'\\~i')
+ self.register(u'\N{LATIN CAPITAL LETTER I WITH MACRON}', b'\\=I')
+ self.register(u'\N{LATIN SMALL LETTER I WITH MACRON}', b'\\=\\i')
+ self.register(u'\N{LATIN SMALL LETTER I WITH MACRON}', b'\\=i')
+ self.register(u'\N{LATIN CAPITAL LETTER I WITH BREVE}', b'\\u I')
+ self.register(u'\N{LATIN SMALL LETTER I WITH BREVE}', b'\\u\\i')
+ self.register(u'\N{LATIN SMALL LETTER I WITH BREVE}', b'\\u i')
+ self.register(u'\N{LATIN CAPITAL LETTER I WITH OGONEK}', b'\\k I')
+ self.register(u'\N{LATIN SMALL LETTER I WITH OGONEK}', b'\\k i')
+ self.register(u'\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}', b'\\.I')
+ self.register(u'\N{LATIN SMALL LETTER DOTLESS I}', b'\\i')
+ self.register(u'\N{LATIN CAPITAL LIGATURE IJ}', b'IJ', decode=False)
+ self.register(u'\N{LATIN SMALL LIGATURE IJ}', b'ij', decode=False)
+ self.register(u'\N{LATIN CAPITAL LETTER J WITH CIRCUMFLEX}', b'\\^J')
+ self.register(u'\N{LATIN SMALL LETTER J WITH CIRCUMFLEX}', b'\\^\\j')
+ self.register(u'\N{LATIN SMALL LETTER J WITH CIRCUMFLEX}', b'\\^j')
+ self.register(u'\N{LATIN CAPITAL LETTER K WITH CEDILLA}', b'\\c K')
+ self.register(u'\N{LATIN SMALL LETTER K WITH CEDILLA}', b'\\c k')
+ self.register(u'\N{LATIN CAPITAL LETTER L WITH ACUTE}', b"\\'L")
+ self.register(u'\N{LATIN SMALL LETTER L WITH ACUTE}', b"\\'l")
+ self.register(u'\N{LATIN CAPITAL LETTER L WITH CEDILLA}', b'\\c L')
+ self.register(u'\N{LATIN SMALL LETTER L WITH CEDILLA}', b'\\c l')
+ self.register(u'\N{LATIN CAPITAL LETTER L WITH CARON}', b'\\v L')
+ self.register(u'\N{LATIN SMALL LETTER L WITH CARON}', b'\\v l')
+ self.register(u'\N{LATIN CAPITAL LETTER L WITH STROKE}', b'\\L')
+ self.register(u'\N{LATIN SMALL LETTER L WITH STROKE}', b'\\l')
+ self.register(u'\N{LATIN CAPITAL LETTER N WITH ACUTE}', b"\\'N")
+ self.register(u'\N{LATIN SMALL LETTER N WITH ACUTE}', b"\\'n")
+ self.register(u'\N{LATIN CAPITAL LETTER N WITH CEDILLA}', b'\\c N')
+ self.register(u'\N{LATIN SMALL LETTER N WITH CEDILLA}', b'\\c n')
+ self.register(u'\N{LATIN CAPITAL LETTER N WITH CARON}', b'\\v N')
+ self.register(u'\N{LATIN SMALL LETTER N WITH CARON}', b'\\v n')
+ self.register(u'\N{LATIN CAPITAL LETTER O WITH MACRON}', b'\\=O')
+ self.register(u'\N{LATIN SMALL LETTER O WITH MACRON}', b'\\=o')
+ self.register(u'\N{LATIN CAPITAL LETTER O WITH BREVE}', b'\\u O')
+ self.register(u'\N{LATIN SMALL LETTER O WITH BREVE}', b'\\u o')
+ self.register(
+ u'\N{LATIN CAPITAL LETTER O WITH DOUBLE ACUTE}',
+ b'\\H O')
+ self.register(u'\N{LATIN SMALL LETTER O WITH DOUBLE ACUTE}', b'\\H o')
+ self.register(u'\N{LATIN CAPITAL LIGATURE OE}', b'\\OE')
+ self.register(u'\N{LATIN SMALL LIGATURE OE}', b'\\oe')
+ self.register(u'\N{LATIN CAPITAL LETTER R WITH ACUTE}', b"\\'R")
+ self.register(u'\N{LATIN SMALL LETTER R WITH ACUTE}', b"\\'r")
+ self.register(u'\N{LATIN CAPITAL LETTER R WITH CEDILLA}', b'\\c R')
+ self.register(u'\N{LATIN SMALL LETTER R WITH CEDILLA}', b'\\c r')
+ self.register(u'\N{LATIN CAPITAL LETTER R WITH CARON}', b'\\v R')
+ self.register(u'\N{LATIN SMALL LETTER R WITH CARON}', b'\\v r')
+ self.register(u'\N{LATIN CAPITAL LETTER S WITH ACUTE}', b"\\'S")
+ self.register(u'\N{LATIN SMALL LETTER S WITH ACUTE}', b"\\'s")
+ self.register(u'\N{LATIN CAPITAL LETTER S WITH CIRCUMFLEX}', b'\\^S')
+ self.register(u'\N{LATIN SMALL LETTER S WITH CIRCUMFLEX}', b'\\^s')
+ self.register(u'\N{LATIN CAPITAL LETTER S WITH CEDILLA}', b'\\c S')
+ self.register(u'\N{LATIN SMALL LETTER S WITH CEDILLA}', b'\\c s')
+ self.register(u'\N{LATIN CAPITAL LETTER S WITH CARON}', b'\\v S')
+ self.register(u'\N{LATIN SMALL LETTER S WITH CARON}', b'\\v s')
+ self.register(u'\N{LATIN CAPITAL LETTER T WITH CEDILLA}', b'\\c T')
+ self.register(u'\N{LATIN SMALL LETTER T WITH CEDILLA}', b'\\c t')
+ self.register(u'\N{LATIN CAPITAL LETTER T WITH CARON}', b'\\v T')
+ self.register(u'\N{LATIN SMALL LETTER T WITH CARON}', b'\\v t')
+ self.register(u'\N{LATIN CAPITAL LETTER U WITH TILDE}', b'\\~U')
+ self.register(u'\N{LATIN SMALL LETTER U WITH TILDE}', b'\\~u')
+ self.register(u'\N{LATIN CAPITAL LETTER U WITH MACRON}', b'\\=U')
+ self.register(u'\N{LATIN SMALL LETTER U WITH MACRON}', b'\\=u')
+ self.register(u'\N{LATIN CAPITAL LETTER U WITH BREVE}', b'\\u U')
+ self.register(u'\N{LATIN SMALL LETTER U WITH BREVE}', b'\\u u')
+ self.register(u'\N{LATIN CAPITAL LETTER U WITH RING ABOVE}', b'\\r U')
+ self.register(u'\N{LATIN SMALL LETTER U WITH RING ABOVE}', b'\\r u')
+ self.register(
+ u'\N{LATIN CAPITAL LETTER U WITH DOUBLE ACUTE}',
+ b'\\H U')
+ self.register(u'\N{LATIN SMALL LETTER U WITH DOUBLE ACUTE}', b'\\H u')
+ self.register(u'\N{LATIN CAPITAL LETTER U WITH OGONEK}', b'\\k U')
+ self.register(u'\N{LATIN SMALL LETTER U WITH OGONEK}', b'\\k u')
+ self.register(u'\N{LATIN CAPITAL LETTER W WITH CIRCUMFLEX}', b'\\^W')
+ self.register(u'\N{LATIN SMALL LETTER W WITH CIRCUMFLEX}', b'\\^w')
+ self.register(u'\N{LATIN CAPITAL LETTER Y WITH CIRCUMFLEX}', b'\\^Y')
+ self.register(u'\N{LATIN SMALL LETTER Y WITH CIRCUMFLEX}', b'\\^y')
+ self.register(u'\N{LATIN CAPITAL LETTER Y WITH DIAERESIS}', b'\\"Y')
+ self.register(u'\N{LATIN CAPITAL LETTER Z WITH ACUTE}', b"\\'Z")
+ self.register(u'\N{LATIN SMALL LETTER Z WITH ACUTE}', b"\\'z")
+ self.register(u'\N{LATIN CAPITAL LETTER Z WITH DOT ABOVE}', b'\\.Z')
+ self.register(u'\N{LATIN SMALL LETTER Z WITH DOT ABOVE}', b'\\.z')
+ self.register(u'\N{LATIN CAPITAL LETTER Z WITH CARON}', b'\\v Z')
+ self.register(u'\N{LATIN SMALL LETTER Z WITH CARON}', b'\\v z')
+ self.register(u'\N{LATIN CAPITAL LETTER DZ WITH CARON}', b'D\\v Z')
+ self.register(
+ u'\N{LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON}',
+ b'D\\v z')
+ self.register(u'\N{LATIN SMALL LETTER DZ WITH CARON}', b'd\\v z')
+ self.register(u'\N{LATIN CAPITAL LETTER LJ}', b'LJ', decode=False)
+ self.register(
+ u'\N{LATIN CAPITAL LETTER L WITH SMALL LETTER J}',
+ b'Lj',
+ decode=False)
+ self.register(u'\N{LATIN SMALL LETTER LJ}', b'lj', decode=False)
+ self.register(u'\N{LATIN CAPITAL LETTER NJ}', b'NJ', decode=False)
+ self.register(
+ u'\N{LATIN CAPITAL LETTER N WITH SMALL LETTER J}',
+ b'Nj',
+ decode=False)
+ self.register(u'\N{LATIN SMALL LETTER NJ}', b'nj', decode=False)
+ self.register(u'\N{LATIN CAPITAL LETTER A WITH CARON}', b'\\v A')
+ self.register(u'\N{LATIN SMALL LETTER A WITH CARON}', b'\\v a')
+ self.register(u'\N{LATIN CAPITAL LETTER I WITH CARON}', b'\\v I')
+ self.register(u'\N{LATIN SMALL LETTER I WITH CARON}', b'\\v\\i')
+ self.register(u'\N{LATIN CAPITAL LETTER O WITH CARON}', b'\\v O')
+ self.register(u'\N{LATIN SMALL LETTER O WITH CARON}', b'\\v o')
+ self.register(u'\N{LATIN CAPITAL LETTER U WITH CARON}', b'\\v U')
+ self.register(u'\N{LATIN SMALL LETTER U WITH CARON}', b'\\v u')
+ self.register(u'\N{LATIN CAPITAL LETTER G WITH CARON}', b'\\v G')
+ self.register(u'\N{LATIN SMALL LETTER G WITH CARON}', b'\\v g')
+ self.register(u'\N{LATIN CAPITAL LETTER K WITH CARON}', b'\\v K')
+ self.register(u'\N{LATIN SMALL LETTER K WITH CARON}', b'\\v k')
+ self.register(u'\N{LATIN CAPITAL LETTER O WITH OGONEK}', b'\\k O')
+ self.register(u'\N{LATIN SMALL LETTER O WITH OGONEK}', b'\\k o')
+ self.register(u'\N{LATIN SMALL LETTER J WITH CARON}', b'\\v\\j')
+ self.register(u'\N{LATIN CAPITAL LETTER DZ}', b'DZ', decode=False)
+ self.register(
+ u'\N{LATIN CAPITAL LETTER D WITH SMALL LETTER Z}',
+ b'Dz',
+ decode=False)
+ self.register(u'\N{LATIN SMALL LETTER DZ}', b'dz', decode=False)
+ self.register(u'\N{LATIN CAPITAL LETTER G WITH ACUTE}', b"\\'G")
+ self.register(u'\N{LATIN SMALL LETTER G WITH ACUTE}', b"\\'g")
+ self.register(u'\N{LATIN CAPITAL LETTER AE WITH ACUTE}', b"\\'\\AE")
+ self.register(u'\N{LATIN SMALL LETTER AE WITH ACUTE}', b"\\'\\ae")
+ self.register(
+ u'\N{LATIN CAPITAL LETTER O WITH STROKE AND ACUTE}',
+ b"\\'\\O")
+ self.register(
+ u'\N{LATIN SMALL LETTER O WITH STROKE AND ACUTE}',
+ b"\\'\\o")
+ self.register(u'\N{PARTIAL DIFFERENTIAL}', b'\\partial', mode='math')
+ self.register(u'\N{N-ARY PRODUCT}', b'\\prod', mode='math')
+ self.register(u'\N{N-ARY SUMMATION}', b'\\sum', mode='math')
+ self.register(u'\N{SQUARE ROOT}', b'\\surd', mode='math')
+ self.register(u'\N{INFINITY}', b'\\infty', mode='math')
+ self.register(u'\N{INTEGRAL}', b'\\int', mode='math')
+ self.register(u'\N{INTERSECTION}', b'\\cap', mode='math')
+ self.register(u'\N{UNION}', b'\\cup', mode='math')
+ self.register(u'\N{RIGHTWARDS ARROW}', b'\\rightarrow', mode='math')
+ self.register(
+ u'\N{RIGHTWARDS DOUBLE ARROW}',
+ b'\\Rightarrow',
+ mode='math')
+ self.register(u'\N{LEFTWARDS ARROW}', b'\\leftarrow', mode='math')
+ self.register(
+ u'\N{LEFTWARDS DOUBLE ARROW}',
+ b'\\Leftarrow',
+ mode='math')
+ self.register(u'\N{LOGICAL OR}', b'\\vee', mode='math')
+ self.register(u'\N{LOGICAL AND}', b'\\wedge', mode='math')
+ self.register(u'\N{ALMOST EQUAL TO}', b'\\approx', mode='math')
+ self.register(u'\N{NOT EQUAL TO}', b'\\neq', mode='math')
+ self.register(u'\N{LESS-THAN OR EQUAL TO}', b'\\leq', mode='math')
+ self.register(u'\N{GREATER-THAN OR EQUAL TO}', b'\\geq', mode='math')
+ self.register(u'\N{MODIFIER LETTER CIRCUMFLEX ACCENT}', b'\\^{}')
+ self.register(u'\N{CARON}', b'\\v{}')
+ self.register(u'\N{BREVE}', b'\\u{}')
+ self.register(u'\N{DOT ABOVE}', b'\\.{}')
+ self.register(u'\N{RING ABOVE}', b'\\r{}')
+ self.register(u'\N{OGONEK}', b'\\k{}')
+ self.register(u'\N{SMALL TILDE}', b'\\~{}')
+ self.register(u'\N{DOUBLE ACUTE ACCENT}', b'\\H{}')
+ self.register(u'\N{LATIN SMALL LIGATURE FI}', b'fi', decode=False)
+ self.register(u'\N{LATIN SMALL LIGATURE FL}', b'fl', decode=False)
+ self.register(u'\N{LATIN SMALL LIGATURE FF}', b'ff', decode=False)
+
+ self.register(u'\N{GREEK SMALL LETTER ALPHA}', b'\\alpha', mode='math')
+ self.register(u'\N{GREEK SMALL LETTER BETA}', b'\\beta', mode='math')
+ self.register(u'\N{GREEK SMALL LETTER GAMMA}', b'\\gamma', mode='math')
+ self.register(u'\N{GREEK SMALL LETTER DELTA}', b'\\delta', mode='math')
+ self.register(
+ u'\N{GREEK SMALL LETTER EPSILON}',
+ b'\\epsilon',
+ mode='math')
+ self.register(u'\N{GREEK SMALL LETTER ZETA}', b'\\zeta', mode='math')
+ self.register(u'\N{GREEK SMALL LETTER ETA}', b'\\eta', mode='math')
+ self.register(u'\N{GREEK SMALL LETTER THETA}', b'\\theta', mode='math')
+ self.register(u'\N{GREEK SMALL LETTER IOTA}', b'\\iota', mode='math')
+ self.register(u'\N{GREEK SMALL LETTER KAPPA}', b'\\kappa', mode='math')
+ self.register(
+ u'\N{GREEK SMALL LETTER LAMDA}',
+ b'\\lambda',
+ mode='math') # LAMDA not LAMBDA
+ self.register(u'\N{GREEK SMALL LETTER MU}', b'\\mu', mode='math')
+ self.register(u'\N{GREEK SMALL LETTER NU}', b'\\nu', mode='math')
+ self.register(u'\N{GREEK SMALL LETTER XI}', b'\\xi', mode='math')
+ self.register(
+ u'\N{GREEK SMALL LETTER OMICRON}',
+ b'\\omicron',
+ mode='math')
+ self.register(u'\N{GREEK SMALL LETTER PI}', b'\\pi', mode='math')
+ self.register(u'\N{GREEK SMALL LETTER RHO}', b'\\rho', mode='math')
+ self.register(u'\N{GREEK SMALL LETTER SIGMA}', b'\\sigma', mode='math')
+ self.register(u'\N{GREEK SMALL LETTER TAU}', b'\\tau', mode='math')
+ self.register(
+ u'\N{GREEK SMALL LETTER UPSILON}',
+ b'\\upsilon',
+ mode='math')
+ self.register(u'\N{GREEK SMALL LETTER PHI}', b'\\phi', mode='math')
+ self.register(u'\N{GREEK SMALL LETTER CHI}', b'\\chi', mode='math')
+ self.register(u'\N{GREEK SMALL LETTER PSI}', b'\\psi', mode='math')
+ self.register(u'\N{GREEK SMALL LETTER OMEGA}', b'\\omega', mode='math')
+ self.register(
+ u'\N{GREEK CAPITAL LETTER ALPHA}',
+ b'\\Alpha',
+ mode='math')
+ self.register(u'\N{GREEK CAPITAL LETTER BETA}', b'\\Beta', mode='math')
+ self.register(
+ u'\N{GREEK CAPITAL LETTER GAMMA}',
+ b'\\Gamma',
+ mode='math')
+ self.register(
+ u'\N{GREEK CAPITAL LETTER DELTA}',
+ b'\\Delta',
+ mode='math')
+ self.register(
+ u'\N{GREEK CAPITAL LETTER EPSILON}',
+ b'\\Epsilon',
+ mode='math')
+ self.register(u'\N{GREEK CAPITAL LETTER ZETA}', b'\\Zeta', mode='math')
+ self.register(u'\N{GREEK CAPITAL LETTER ETA}', b'\\Eta', mode='math')
+ self.register(
+ u'\N{GREEK CAPITAL LETTER THETA}',
+ b'\\Theta',
+ mode='math')
+ self.register(u'\N{GREEK CAPITAL LETTER IOTA}', b'\\Iota', mode='math')
+ self.register(
+ u'\N{GREEK CAPITAL LETTER KAPPA}',
+ b'\\Kappa',
+ mode='math')
+ self.register(
+ u'\N{GREEK CAPITAL LETTER LAMDA}',
+ b'\\Lambda',
+ mode='math') # LAMDA not LAMBDA
+ self.register(u'\N{GREEK CAPITAL LETTER MU}', b'\\Mu', mode='math')
+ self.register(u'\N{GREEK CAPITAL LETTER NU}', b'\\Nu', mode='math')
+ self.register(u'\N{GREEK CAPITAL LETTER XI}', b'\\Xi', mode='math')
+ self.register(
+ u'\N{GREEK CAPITAL LETTER OMICRON}',
+ b'\\Omicron',
+ mode='math')
+ self.register(u'\N{GREEK CAPITAL LETTER PI}', b'\\Pi', mode='math')
+ self.register(u'\N{GREEK CAPITAL LETTER RHO}', b'\\Rho', mode='math')
+ self.register(
+ u'\N{GREEK CAPITAL LETTER SIGMA}',
+ b'\\Sigma',
+ mode='math')
+ self.register(u'\N{GREEK CAPITAL LETTER TAU}', b'\\Tau', mode='math')
+ self.register(
+ u'\N{GREEK CAPITAL LETTER UPSILON}',
+ b'\\Upsilon',
+ mode='math')
+ self.register(u'\N{GREEK CAPITAL LETTER PHI}', b'\\Phi', mode='math')
+ self.register(u'\N{GREEK CAPITAL LETTER CHI}', b'\\Chi', mode='math')
+ self.register(u'\N{GREEK CAPITAL LETTER PSI}', b'\\Psi', mode='math')
+ self.register(
+ u'\N{GREEK CAPITAL LETTER OMEGA}',
+ b'\\Omega',
+ mode='math')
+ self.register(u'\N{COPYRIGHT SIGN}', b'\\copyright')
+ self.register(u'\N{COPYRIGHT SIGN}', b'\\textcopyright')
+ self.register(u'\N{LATIN CAPITAL LETTER A WITH ACUTE}', b"\\'A")
+ self.register(u'\N{LATIN CAPITAL LETTER I WITH ACUTE}', b"\\'I")
+ self.register(u'\N{HORIZONTAL ELLIPSIS}', b'\\ldots')
+ self.register(u'\N{TRADE MARK SIGN}', b'^{TM}', mode='math')
+ self.register(
+ u'\N{TRADE MARK SIGN}',
+ b'\\texttrademark',
+ package='textcomp')
+ # \=O and \=o will be translated into Ō and ō before we can
+ # match the full latex string... so decoding disabled for now
+ self.register(u'Ǭ', br'\textogonekcentered{\=O}', decode=False)
+ self.register(u'ǭ', br'\textogonekcentered{\=o}', decode=False)
+
+ def register(self, unicode_text, latex_text, mode='text', package=None,
+ decode=True, encode=True):
+ """Register a correspondence between *unicode_text* and *latex_text*.
+
+ :param str unicode_text: A unicode character.
+ :param bytes latex_text: Its corresponding LaTeX translation.
+ :param str mode: LaTeX mode in which the translation applies
+ (``'text'`` or ``'math'``).
+ :param str package: LaTeX package requirements (currently ignored).
+ :param bool decode: Whether this translation applies to decoding
+ (default: ``True``).
+ :param bool encode: Whether this translation applies to encoding
+ (default: ``True``).
+ """
+ if package is not None:
+ # TODO implement packages
+ pass
+ if mode == 'math':
+ # also register text version
+ self.register(unicode_text, b'$' + latex_text + b'$', mode='text',
+ package=package, decode=decode, encode=encode)
+ # XXX for the time being, we do not perform in-math substitutions
+ return
+ # tokenize, and register unicode translation
+ self.lexer.reset()
+ self.lexer.state = 'M'
+ tokens = tuple(self.lexer.get_tokens(latex_text, final=True))
+ if decode:
+ if tokens not in self.unicode_map:
+ self.max_length = max(self.max_length, len(tokens))
+ self.unicode_map[tokens] = unicode_text
+ # also register token variant with brackets, if appropriate
+ # for instance, "\'{e}" for "\'e", "\c{c}" for "\c c", etc.
+ # note: we do not remove brackets (they sometimes matter,
+ # e.g. bibtex uses them to prevent lower case transformation)
+ if (len(tokens) == 2
+ and tokens[0].name.startswith('control')
+ and tokens[1].name == 'chars'):
+ alt_tokens = (
+ tokens[0], lexer.Token('chars', b'{'),
+ tokens[1], lexer.Token('chars', b'}'),
+ )
+ if alt_tokens not in self.unicode_map:
+ self.max_length = max(self.max_length, len(alt_tokens))
+ self.unicode_map[alt_tokens] = u"{" + unicode_text + u"}"
+ if encode and unicode_text not in self.latex_map:
+ assert len(unicode_text) == 1
+ self.latex_map[unicode_text] = (latex_text, tokens)
+
+_LATEX_UNICODE_TABLE = LatexUnicodeTable(lexer.LatexIncrementalDecoder())
+
+# the incremental encoder translates one character at a time and so
+# does not need a buffer, but the decoder does: a sequence such as
+# \'e only matches the table once all of its tokens have arrived
+
+
+class LatexIncrementalEncoder(lexer.LatexIncrementalEncoder):
+
+ """Translating incremental encoder for latex. Maintains a state to
+ determine whether control spaces etc. need to be inserted.
+ """
+
+ table = _LATEX_UNICODE_TABLE
+ """Translation table."""
+
+ def __init__(self, errors='strict'):
+ lexer.LatexIncrementalEncoder.__init__(self, errors=errors)
+ self.reset()
+
+ def reset(self):
+ self.state = 'M'
+
+ def get_space_bytes(self, bytes_):
+ """Inserts space bytes in space eating mode."""
+ if self.state == 'S':
+ # in space eating mode
+ # control space needed?
+ if bytes_.startswith(b' '):
+ # replace by control space
+ return b'\\ ', bytes_[1:]
+ else:
+ # insert space (it is eaten, but needed for separation)
+ return b' ', bytes_
+ else:
+ return b'', bytes_
+
+ def _get_latex_bytes_tokens_from_char(self, c):
+ # if ascii, try latex equivalents
+ # (this covers \, #, &, and other special LaTeX characters)
+ if ord(c) < 128:
+ try:
+ return self.table.latex_map[c]
+ except KeyError:
+ pass
+ # next, try input encoding
+ try:
+ bytes_ = c.encode(self.inputenc, 'strict')
+ except UnicodeEncodeError:
+ pass
+ else:
+ return bytes_, (lexer.Token(name='chars', text=bytes_),)
+ # next, try latex equivalents of common unicode characters
+ try:
+ return self.table.latex_map[c]
+ except KeyError:
+ # translation failed
+ if self.errors == 'strict':
+ raise UnicodeEncodeError(
+ "latex", # codec
+ c, # problematic input
+ 0, 1, # location of problematic character
+ "don't know how to translate {0} into latex"
+ .format(repr(c)))
+ elif self.errors == 'ignore':
+ return b'', (lexer.Token(),)
+ elif self.errors == 'replace':
+ # use the \\char command
+ # this assumes
+ # \usepackage[T1]{fontenc}
+ # \usepackage[utf8]{inputenc}
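+ # e.g. u"∀".encode("latex", "replace") gives b'{\\char8704}'
+ # (U+2200 FOR ALL is not in the translation table)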
+ bytes_ = b'{\\char' + str(ord(c)).encode("ascii") + b'}'
+ return bytes_, (lexer.Token(name='chars', text=bytes_),)
+ else:
+ raise ValueError(
+ "latex codec does not support {0} errors"
+ .format(self.errors))
+
+ def get_latex_bytes(self, unicode_, final=False):
+ if not isinstance(unicode_, string_types):
+ raise TypeError(
+ "expected unicode for encode input, but got {0} instead"
+ .format(unicode_.__class__.__name__))
+ # convert character by character
+ for pos, c in enumerate(unicode_):
+ bytes_, tokens = self._get_latex_bytes_tokens_from_char(c)
+ space, bytes_ = self.get_space_bytes(bytes_)
+ # update state
+ if tokens[-1].name == 'control_word':
+ # we're eating spaces
+ self.state = 'S'
+ else:
+ self.state = 'M'
+ if space:
+ yield space
+ yield bytes_
+
+
+class LatexIncrementalDecoder(lexer.LatexIncrementalDecoder):
+
+ """Translating incremental decoder for LaTeX."""
+
+ table = _LATEX_UNICODE_TABLE
+ """Translation table."""
+
+ def __init__(self, errors='strict'):
+ lexer.LatexIncrementalDecoder.__init__(self, errors=errors)
+
+ def reset(self):
+ lexer.LatexIncrementalDecoder.reset(self)
+ self.token_buffer = []
+
+ # python codecs API does not support multibuffer incremental decoders
+
+ def getstate(self):
+ raise NotImplementedError
+
+ def setstate(self, state):
+ raise NotImplementedError
+
+ def get_unicode_tokens(self, bytes_, final=False):
+ for token in self.get_tokens(bytes_, final=final):
+ # at this point, token_buffer does not match anything
+ self.token_buffer.append(token)
+ # new token appended at the end, see if we have a match now
+ # note: match is only possible at the *end* of the buffer
+ # because all other positions have already been checked in
+ # earlier iterations
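+ # e.g. when decoding b"\\'e", the buffer first holds just the
+ # control symbol \' (no match); once the 'e' token arrives, the
+ # pair matches the table entry for u'é' and is flushed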
+ for i in range(len(self.token_buffer), 0, -1):
+ last_tokens = tuple(self.token_buffer[-i:]) # last i tokens
+ try:
+ unicode_text = self.table.unicode_map[last_tokens]
+ except KeyError:
+ # no match: continue
+ continue
+ else:
+ # match!! flush buffer, and translate last bit
+ # exclude last i tokens
+ for token in self.token_buffer[:-i]:
+ yield token.decode(self.inputenc)
+ yield unicode_text
+ self.token_buffer = []
+ break
+ # flush tokens that can no longer match
+ while len(self.token_buffer) >= self.table.max_length:
+ yield self.token_buffer.pop(0).decode(self.inputenc)
+ # also flush the buffer at the end
+ if final:
+ for token in self.token_buffer:
+ yield token.decode(self.inputenc)
+ self.token_buffer = []
+
+
+class LatexCodec(codecs.Codec):
+ IncrementalEncoder = None
+ IncrementalDecoder = None
+
+ def encode(self, unicode_, errors='strict'):
+ """Convert unicode string to LaTeX bytes."""
+ encoder = self.IncrementalEncoder(errors=errors)
+ return (
+ encoder.encode(unicode_, final=True),
+ len(unicode_),
+ )
+
+ def decode(self, bytes_, errors='strict'):
+ """Convert LaTeX bytes to unicode string."""
+ decoder = self.IncrementalDecoder(errors=errors)
+ return (
+ decoder.decode(bytes_, final=True),
+ len(bytes_),
+ )
+
+
+def find_latex(encoding):
+ """Return a :class:`codecs.CodecInfo` instance for the requested
+ LaTeX *encoding*, which must be equal to ``latex``,
+ or to ``latex+<encoding>``
+ where ``<encoding>`` describes another encoding.
+ """
+ # check if requested codec info is for latex encoding
+ if not encoding.startswith('latex'):
+ return None
+ # set up all classes with correct latex input encoding
+ inputenc_ = encoding[6:] if encoding.startswith('latex+') else 'ascii'
+
+ class IncrementalEncoder_(LatexIncrementalEncoder):
+ inputenc = inputenc_
+
+ class IncrementalDecoder_(LatexIncrementalDecoder):
+ inputenc = inputenc_
+
+ class Codec(LatexCodec):
+ IncrementalEncoder = IncrementalEncoder_
+ IncrementalDecoder = IncrementalDecoder_
+
+ class StreamWriter(Codec, codecs.StreamWriter):
+ pass
+
+ class StreamReader(Codec, codecs.StreamReader):
+ pass
+
+ return codecs.CodecInfo(
+ encode=Codec().encode,
+ decode=Codec().decode,
+ incrementalencoder=IncrementalEncoder_,
+ incrementaldecoder=IncrementalDecoder_,
+ streamreader=StreamReader,
+ streamwriter=StreamWriter,
+ )
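+
+ # A usage sketch (not a definitive recipe): ``find_latex`` is a codec
+ # search function in the sense of :func:`codecs.register`; once it is
+ # registered, names such as "latex" and "latex+latin1" resolve through it:
+ #
+ # >>> import codecs
+ # >>> codecs.register(find_latex)
+ # >>> b'\xfe'.decode("latex+latin1")
+ # u'þ'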
diff --git a/latexcodec/lexer.py b/latexcodec/lexer.py
new file mode 100644
index 0000000..031f3e6
--- /dev/null
+++ b/latexcodec/lexer.py
@@ -0,0 +1,420 @@
+# -*- coding: utf-8 -*-
+"""
+ LaTeX Lexer
+ ~~~~~~~~~~~
+
+ This module contains all classes for lexing LaTeX code, as well as
+ general purpose base classes for incremental LaTeX decoders and
+ encoders, which could be useful in case you are writing your own
+ custom LaTeX codec.
+
+ .. autoclass:: Token(name, text)
+ :members: decode, __len__, __nonzero__
+
+ .. autoclass:: LatexLexer
+ :show-inheritance:
+ :members:
+
+ .. autoclass:: LatexIncrementalLexer
+ :show-inheritance:
+ :members:
+
+ .. autoclass:: LatexIncrementalDecoder
+ :show-inheritance:
+ :members:
+
+ .. autoclass:: LatexIncrementalEncoder
+ :show-inheritance:
+ :members:
+"""
+
+# Copyright (c) 2003, 2008 David Eppstein
+# Copyright (c) 2011-2014 Matthias C. M. Troffaes
+#
+# Permission is hereby granted, free of charge, to any person
+# obtaining a copy of this software and associated documentation
+# files (the "Software"), to deal in the Software without
+# restriction, including without limitation the rights to use,
+# copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following
+# conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import codecs
+import collections
+import re
+from six import string_types
+
+
+class Token(collections.namedtuple("Token", "name text")):
+
+ """A :func:`collections.namedtuple` storing information about a
+ matched token.
+
+ .. seealso:: :attr:`LatexLexer.tokens`
+
+ .. attribute:: name
+
+ The name of the token as a :class:`str`.
+
+ .. attribute:: text
+
+ The matched token text as :class:`bytes`.
+ The constructor also accepts text as :class:`memoryview`,
+ in which case it is automatically converted to :class:`bytes`.
+ This ensures that the token is hashable.
+ """
+
+ __slots__ = () # efficiency
+
+ def __new__(cls, name=None, text=None):
+ # text can be memoryview; convert to bytes so Token remains hashable
+ return tuple.__new__(
+ cls,
+ (name if name is not None else 'unknown',
+ bytes(text) if text is not None else b''))
+
+ def __nonzero__(self):
+ """Whether the token contains any text."""
+ return bool(self.text)
+
+ def __len__(self):
+ """Length of the token text."""
+ return len(self.text)
+
+ def decode(self, encoding):
+ """Returns the decoded token text in the specified *encoding*.
+
+ .. note::
+
+ Control words get an extra space added at the back to
+ ensure separation from the next token, so that decoded token
+ sequences can be :meth:`str.join`\ ed together.
+
+ For example, the tokens ``b'\\hello'`` and ``b'world'``
+ will correctly result in ``u'\\hello world'`` (remember
+ that LaTeX eats space following control words). If no space
+ were added, this would wrongfully result in
+ ``u'\\helloworld'``.
+
+ """
+ if self.name == 'control_word':
+ return self.text.decode(encoding) + u' '
+ else:
+ return self.text.decode(encoding)
+
+# implementation note: we derive from IncrementalDecoder because this
+# class serves excellently as a base class for incremental decoders,
+# but of course we don't actually decode until later
+
+
+class LatexLexer(codecs.IncrementalDecoder):
+
+ """A very simple lexer for tex/latex code."""
+
+ # implementation note: every token **must** be decodable by inputenc
+ tokens = [
+ # comment: for ease, and for speed, we handle it as a token
+ ('comment', br'%.*?\n'),
+ # control tokens
+ # in latex, some control tokens skip following whitespace
+ # ('control-word' and 'control-symbol')
+ # others do not ('control-symbol-x')
+ # XXX TBT says no control symbols skip whitespace (except '\ ')
+ # XXX but tests reveal otherwise?
+ ('control_word', br'[\\][a-zA-Z]+'),
+ ('control_symbol', br'[\\][~' br"'" br'"` =^!]'),
+ # TODO should only match ascii
+ ('control_symbol_x', br'[\\][^a-zA-Z]'),
+ # parameter tokens
+ # also support a lone hash so we can lex things like b'#a'
+ ('parameter', br'\#[0-9]|\#'),
+ # any remaining characters; for ease we also handle space and
+ # newline as tokens
+ ('space', br' '),
+ ('newline', br'\n'),
+ ('mathshift', br'[$]'),
+ # note: some chars are joined together to make it easier to detect
+ # symbols that have a special function (e.g. --, ---, etc.)
+ ('chars',
+ br'---|--|-|[`][`]'
+ br"|['][']"
+ br'|[?][`]|[!][`]'
+ # separate chars because brackets are optional
+ # e.g. fran\\c cais = fran\\c{c}ais in latex
+ # so only way to detect \\c acting on c only is this way
+ br'|[0-9a-zA-Z{}]'
+ # we have to join everything else together to support
+ # multibyte encodings: every token must be decodable!!
+ # this means for instance that \\c öké is NOT equivalent to
+ # \\c{ö}ké
+ br'|[^ %#$\n\\]+'),
+ # trailing garbage which we cannot decode otherwise
+ # (such as a lone '\' at the end of a buffer)
+ # is never emitted, but used internally by the buffer
+ ('unknown', br'.'),
+ ]
+ """List of token names, and the regular expressions they match."""
+
+ def __init__(self, errors='strict'):
+ """Initialize the codec."""
+ self.errors = errors
+ # regular expression used for matching
+ self.regexp = re.compile(
+ b"|".join(
+ b"(?P<" + name.encode() + b">" + regexp + b")"
+ for name, regexp in self.tokens),
+ re.DOTALL)
+ # reset state
+ self.reset()
+
+ def reset(self):
+ """Reset state."""
+ # buffer for storing last (possibly incomplete) token
+ self.raw_buffer = Token()
+
+ def getstate(self):
+ """Get state."""
+ return (self.raw_buffer.text, 0)
+
+ def setstate(self, state):
+ """Set state. The *state* must correspond to the return value
+ of a previous :meth:`getstate` call.
+ """
+ self.raw_buffer = Token('unknown', state[0])
+
+ def get_raw_tokens(self, bytes_, final=False):
+ """Yield tokens without any further processing. Tokens are one of:
+
+ - ``\\<word>``: a control word (i.e. a command)
+ - ``\\<symbol>``: a control symbol (e.g. \\^)
+ - ``#<n>``: a parameter
+ - a series of byte characters
+ """
+ if self.raw_buffer:
+ bytes_ = self.raw_buffer.text + bytes_
+ self.raw_buffer = Token()
+ for match in self.regexp.finditer(bytes_):
+ for name, regexp in self.tokens:
+ text = match.group(name)
+ if text is not None:
+ # yield the buffer token(s)
+ for token in self.flush_raw_tokens():
+ yield token
+ # fill buffer with next token
+ self.raw_buffer = Token(name, text)
+ break
+ if final:
+ for token in self.flush_raw_tokens():
+ yield token
+
+ def flush_raw_tokens(self):
+ """Flush the raw token buffer."""
+ if self.raw_buffer:
+ yield self.raw_buffer
+ self.raw_buffer = Token()
+
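+ # Example of raw lexing (a sketch based on the lexer tests further down
+ # in this changeset): whitespace is preserved at this stage, and the last
+ # token is only emitted once ``final=True`` confirms it cannot grow:
+ #
+ # >>> lexer = LatexLexer()
+ # >>> [t.text for t in lexer.get_raw_tokens(b'\\hello \\world', final=True)]
+ # [b'\\hello', b' ', b'\\world']
+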
+
+class LatexIncrementalLexer(LatexLexer):
+
+ """A very simple incremental lexer for tex/latex code. Roughly
+ follows the state machine described in TeX by Topic, Chapter 2.
+
+ The generated tokens satisfy:
+
+ * no newline characters: paragraphs are separated by '\\par'
+ * spaces following control tokens are compressed
+ """
+
+ def reset(self):
+ LatexLexer.reset(self)
+ # three possible states:
+ # newline (N), skipping spaces (S), and middle of line (M)
+ self.state = 'N'
+ # inline math mode?
+ self.inline_math = False
+
+ def getstate(self):
+ # state 'M' is most common, so let that be zero
+ return (
+ self.raw_buffer,
+ {'M': 0, 'N': 1, 'S': 2}[self.state]
+ | (4 if self.inline_math else 0)
+ )
+
+ def setstate(self, state):
+ self.raw_buffer = state[0]
+ self.state = {0: 'M', 1: 'N', 2: 'S'}[state[1] & 3]
+ self.inline_math = bool(state[1] & 4)
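+ # note (illustrative): the packing above keeps the mode in the two low
+ # bits and the math flag in bit 2, so state 'S' with inline_math=True
+ # round-trips as 2 | 4 == 6 alongside the raw buffer.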
+
+ def get_tokens(self, bytes_, final=False):
+ """Yield tokens while maintaining a state. Also skip
+ whitespace after control words and (some) control symbols.
+ Replaces newlines by spaces or \\par commands, depending on
+ the context.
+ """
+ # current position relative to the start of bytes_ in the sequence
+ # of bytes that have been decoded
+ pos = -len(self.raw_buffer)
+ for token in self.get_raw_tokens(bytes_, final=final):
+ pos = pos + len(token)
+ assert pos >= 0 # first token includes at least self.raw_buffer
+ if token.name == 'newline':
+ if self.state == 'N':
+ # if state was 'N', generate new paragraph
+ yield Token('control_word', b'\\par')
+ elif self.state == 'S':
+ # switch to 'N' state, do not generate a space
+ self.state = 'N'
+ elif self.state == 'M':
+ # switch to 'N' state, generate a space
+ self.state = 'N'
+ yield Token('space', b' ')
+ else:
+ raise AssertionError(
+ "unknown tex state {0!r}".format(self.state))
+ elif token.name == 'space':
+ if self.state == 'N':
+ # remain in 'N' state, no space token generated
+ pass
+ elif self.state == 'S':
+ # remain in 'S' state, no space token generated
+ pass
+ elif self.state == 'M':
+ # in M mode, generate the space,
+ # but switch to space skip mode
+ self.state = 'S'
+ yield token
+ else:
+ raise AssertionError(
+ "unknown state {0!r}".format(self.state))
+ elif token.name == 'mathshift':
+ self.inline_math = not self.inline_math
+ yield token
+ elif token.name == 'parameter':
+ self.state = 'M'
+ yield token
+ elif token.name == 'control_word':
+ # go to space skip mode
+ self.state = 'S'
+ yield token
+ elif token.name == 'control_symbol':
+ # go to space skip mode
+ self.state = 'S'
+ yield token
+ elif token.name == 'control_symbol_x':
+ # don't skip following space, so go to M mode
+ self.state = 'M'
+ yield token
+ elif token.name == 'comment':
+ # go to newline mode, no token is generated
+ # note: comment includes the newline
+ self.state = 'N'
+ elif token.name == 'chars':
+ self.state = 'M'
+ yield token
+ elif token.name == 'unknown':
+ if self.errors == 'strict':
+ # current position within bytes_
+ # this is the position right after the unknown token
+ raise UnicodeDecodeError(
+ "latex", # codec
+ bytes_, # problematic input
+ pos - len(token), # start of problematic token
+ pos, # end of it
+ "unknown token {0!r}".format(token.text))
+ elif self.errors == 'ignore':
+ # do nothing
+ pass
+ elif self.errors == 'replace':
+ yield Token('chars', b'?' * len(token))
+ else:
+ raise NotImplementedError(
+ "error mode {0!r} not supported".format(self.errors))
+ else:
+ raise AssertionError(
+ "unknown token name {0!r}".format(token.name))
+
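+ # Behaviour sketch (mirroring the lexer tests in this changeset): a blank
+ # line yields a \par token, and a mid-paragraph newline becomes a space:
+ #
+ # >>> lexer = LatexIncrementalLexer()
+ # >>> [t.text for t in lexer.get_tokens(b'hello\n\nworld', final=True)]
+ # [b'h', b'e', b'l', b'l', b'o', b' ', b'\\par', b'w', b'o', b'r', b'l', b'd']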
+
+class LatexIncrementalDecoder(LatexIncrementalLexer):
+
+ """Simple incremental decoder. Transforms lexed LaTeX tokens into
+ unicode.
+
+ To customize decoding, subclass and override
+ :meth:`get_unicode_tokens`.
+ """
+
+ inputenc = "ascii"
+ """Input encoding. **Must** extend ascii."""
+
+ def get_unicode_tokens(self, bytes_, final=False):
+ """Decode every token in :attr:`inputenc` encoding. Override to
+ process the tokens in some other way (for example, for token
+ translation).
+ """
+ for token in self.get_tokens(bytes_, final=final):
+ yield token.decode(self.inputenc)
+
+ def decode(self, bytes_, final=False):
+ """Decode LaTeX *bytes_* into a unicode string.
+
+ This implementation calls :meth:`get_unicode_tokens` and joins
+ the resulting unicode strings together.
+ """
+ try:
+ return u''.join(self.get_unicode_tokens(bytes_, final=final))
+ except UnicodeDecodeError as e:
+ # API requires that the decode method raises a ValueError
+ # in this case
+ raise ValueError(e)
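+ # A decoding sketch (grounded in the tests in this changeset): the token
+ # stream shown above joins into a single unicode string, with control
+ # words keeping their trailing space:
+ #
+ # >>> d = LatexIncrementalDecoder()
+ # >>> d.decode(b'hello\n\nworld', final=True)
+ # u'hello \\par world'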
+
+
+class LatexIncrementalEncoder(codecs.IncrementalEncoder):
+
+ """Simple incremental encoder for LaTeX. Transforms unicode into
+ :class:`bytes`.
+
+ To customize encoding, subclass and override
+ :meth:`get_latex_bytes`.
+ """
+
+ inputenc = "ascii"
+ """Input encoding. **Must** extend ascii."""
+
+ def get_latex_bytes(self, unicode_, final=False):
+ """Encode every character in :attr:`inputenc` encoding. Override to
+ process the unicode in some other way (for example, for character
+ translation).
+ """
+ if not isinstance(unicode_, string_types):
+ raise TypeError(
+ "expected unicode for encode input, but got {0} instead"
+ .format(unicode_.__class__.__name__))
+ for c in unicode_:
+ yield c.encode(self.inputenc, self.errors)
+
+ def encode(self, unicode_, final=False):
+ """Encode the *unicode_* string into LaTeX :class:`bytes`.
+
+ This implementation calls :meth:`get_latex_bytes` and joins
+ the resulting :class:`bytes` together.
+ """
+ try:
+ return b''.join(self.get_latex_bytes(unicode_, final=final))
+ except UnicodeEncodeError as e:
+ # API requires that the encode method raises a ValueError
+ # in this case
+ raise ValueError(e)
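+
+ # A minimal sketch of the base encoder in action (mirroring the encoder
+ # tests in this changeset): ascii input passes straight through, while
+ # untranslatable input surfaces as ValueError via the wrapper above.
+ #
+ # >>> e = LatexIncrementalEncoder()
+ # >>> e.encode(u'hello', final=True)
+ # b'hello'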
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..3fc41aa
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+six>=1.4.1
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..85ffac4
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,8 @@
+[nosetests]
+with-coverage=1
+cover-package=latexcodec
+cover-branches=1
+cover-html=1
+
+[wheel]
+universal = 1
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..f003c0c
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,45 @@
+# -*- coding: utf-8 -*-
+
+import io
+from setuptools import setup, find_packages
+
+
+def readfile(filename):
+ with io.open(filename, encoding="utf-8") as stream:
+ return stream.read().split("\n")
+
+readme = readfile("README.rst")[5:] # skip title and badges
+requires = readfile("requirements.txt")
+version = readfile("VERSION")[0].strip()
+
+setup(
+ name='latexcodec',
+ version=version,
+ url='https://github.com/mcmtroffaes/latexcodec',
+ download_url='http://pypi.python.org/pypi/latexcodec',
+ license='MIT',
+ author='Matthias C. M. Troffaes',
+ author_email='matthias.troffaes at gmail.com',
+ description=readme[0],
+ long_description="\n".join(readme[2:]),
+ zip_safe=True,
+ classifiers=[
+ 'Development Status :: 5 - Production/Stable',
+ 'Environment :: Console',
+ 'Intended Audience :: Developers',
+ 'License :: OSI Approved :: MIT License',
+ 'Operating System :: OS Independent',
+ 'Programming Language :: Python',
+ 'Programming Language :: Python :: 2',
+ 'Programming Language :: Python :: 2.6',
+ 'Programming Language :: Python :: 2.7',
+ 'Programming Language :: Python :: 3',
+ 'Programming Language :: Python :: 3.3',
+ 'Programming Language :: Python :: 3.4',
+ 'Topic :: Text Processing :: Markup :: LaTeX',
+ 'Topic :: Text Processing :: Filters',
+ ],
+ platforms='any',
+ packages=find_packages(),
+ install_requires=requires,
+)
diff --git a/test/test_install_example.py b/test/test_install_example.py
new file mode 100644
index 0000000..b732d4b
--- /dev/null
+++ b/test/test_install_example.py
@@ -0,0 +1,19 @@
+# -*- coding: utf-8 -*-
+
+
+def test_install_example_1():
+ import latexcodec # noqa
+ text_latex = br"\'el\`eve"
+ assert text_latex.decode("latex") == u"élève"
+ text_unicode = u"ångström"
+ assert text_unicode.encode("latex") == br'\aa ngstr\"om'
+
+
+def test_install_example_2():
+ import latexcodec # noqa
+ text_latex = b"\xfe"
+ assert text_latex.decode("latex+latin1") == u"þ"
+ assert text_latex.decode("latex+latin2") == u"ţ"
+ text_unicode = u"ţ"
+ assert text_unicode.encode("latex+latin1") == b'\\c t' # ţ is not latin1
+ assert text_unicode.encode("latex+latin2") == b'\xfe' # but it is latin2
diff --git a/test/test_latex_codec.py b/test/test_latex_codec.py
new file mode 100644
index 0000000..d1a843e
--- /dev/null
+++ b/test/test_latex_codec.py
@@ -0,0 +1,362 @@
+# -*- coding: utf-8 -*-
+"""Tests for the latex codec."""
+
+from __future__ import print_function
+
+import codecs
+import nose.tools
+from six import text_type, binary_type, BytesIO, PY2
+from unittest import TestCase
+
+import latexcodec
+
+
+def test_getregentry():
+ assert latexcodec.codec.getregentry() is not None
+
+
+def test_find_latex():
+ assert latexcodec.codec.find_latex('hello') is None
+
+
+def test_latex_incremental_decoder_getstate():
+ encoder = codecs.getincrementaldecoder('latex')()
+ nose.tools.assert_raises(NotImplementedError, lambda: encoder.getstate())
+
+
+def test_latex_incremental_decoder_setstate():
+ encoder = codecs.getincrementaldecoder('latex')()
+ state = (u'', 0)
+ nose.tools.assert_raises(
+ NotImplementedError,
+ lambda: encoder.setstate(state))
+
+
+def split_input(input_):
+ """Helper function for testing the incremental encoder and decoder."""
+ if not isinstance(input_, (text_type, binary_type)):
+ raise TypeError("expected unicode or bytes input")
+ if input_:
+ for i in range(len(input_)):
+ if i + 1 < len(input_):
+ yield input_[i:i + 1], False
+ else:
+ yield input_[i:i + 1], True
+ else:
+ yield input_, True
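+
+# For example, split_input(b'abc') yields (b'a', False), (b'b', False),
+# (b'c', True), driving the incremental codecs one unit at a time with
+# final=True only on the last piece.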
+
+
+class TestDecoder(TestCase):
+
+ """Stateless decoder tests."""
+ maxDiff = None
+
+ def decode(self, text_utf8, text_latex, inputenc=None):
+ """Main test function."""
+ encoding = 'latex+' + inputenc if inputenc else 'latex'
+ decoded, n = codecs.getdecoder(encoding)(text_latex)
+ self.assertEqual((decoded, n), (text_utf8, len(text_latex)))
+
+ @nose.tools.raises(TypeError)
+ def test_invalid_type(self):
+ self.decode(object(), object())
+
+ @nose.tools.raises(ValueError)
+ def test_invalid_code(self):
+ # b'\xe9' is invalid utf-8 code
+ self.decode(u'', b'\xe9 ', 'utf-8')
+
+ def test_null(self):
+ self.decode(u'', b'')
+
+ def test_maelstrom(self):
+ self.decode(u"mælström", br'm\ae lstr\"om')
+
+ def test_maelstrom_latin1(self):
+ self.decode(u"mælström", b'm\\ae lstr\xf6m', 'latin1')
+
+ def test_laren(self):
+ self.decode(
+ u"© låren av björn",
+ br'\copyright\ l\aa ren av bj\"orn')
+
+ def test_laren_brackets(self):
+ self.decode(
+ u"© l{å}ren av bj{ö}rn",
+ br'\copyright\ l{\aa}ren av bj{\"o}rn')
+
+ def test_laren_latin1(self):
+ self.decode(
+ u"© låren av björn",
+ b'\\copyright\\ l\xe5ren av bj\xf6rn',
+ 'latin1')
+
+ def test_droitcivil(self):
+ self.decode(
+ u"Même s'il a fait l'objet d'adaptations suite à l'évolution, "
+ u"la transformation sociale, économique et politique du pays, "
+ u"le code civil fran{ç}ais est aujourd'hui encore le texte "
+ u"fondateur "
+ u"du droit civil français mais aussi du droit civil belge "
+ u"ainsi que "
+ u"de plusieurs autres droits civils.",
+ b"M\\^eme s'il a fait l'objet d'adaptations suite "
+ b"\\`a l'\\'evolution, \nla transformation sociale, "
+ b"\\'economique et politique du pays, \nle code civil "
+ b"fran\\c{c}ais est aujourd'hui encore le texte fondateur \n"
+ b"du droit civil fran\\c cais mais aussi du droit civil "
+ b"belge ainsi que \nde plusieurs autres droits civils.",
+ )
+
+ def test_oeuf(self):
+ self.decode(
+ u"D'un point de vue diététique, l'œuf apaise la faim.",
+ br"D'un point de vue di\'et\'etique, l'\oe uf apaise la faim.",
+ )
+
+ def test_oeuf_latin1(self):
+ self.decode(
+ u"D'un point de vue diététique, l'œuf apaise la faim.",
+ b"D'un point de vue di\xe9t\xe9tique, l'\\oe uf apaise la faim.",
+ 'latin1'
+ )
+
+ def test_alpha(self):
+ self.decode(u"α", b"$\\alpha$")
+
+ def test_maelstrom_multibyte_encoding(self):
+ self.decode(u"\\c öké", b'\\c \xc3\xb6k\xc3\xa9', 'utf8')
+
+ def test_serafin(self):
+ self.decode(u"Seraf{\xed}n", b"Seraf{\\'i}n")
+
+ def test_astrom(self):
+ self.decode(u"{\xc5}str{\xf6}m", b'{\\AA}str{\\"o}m')
+
+ def test_space_1(self):
+ self.decode(u"ææ", br'\ae \ae')
+
+ def test_space_2(self):
+ self.decode(u"æ æ", br'\ae\ \ae')
+
+ def test_number_sign_1(self):
+ self.decode(u"# hello", br'\#\ hello')
+
+ def test_number_sign_2(self):
+ # LaTeX does not absorb the space following '\#':
+ # check decoding is correct
+ self.decode(u"# hello", br'\# hello')
+
+ def test_number_sign_3(self):
+ # a single '#' is not valid LaTeX:
+ # for the moment we ignore this error and return # unchanged
+ self.decode(u"# hello", br'# hello')
+
+ def test_underscore(self):
+ self.decode(u"_", br'\_')
+
+ def test_dz(self):
+ self.decode(u"DZ", br'DZ')
+
+ def test_newline(self):
+ self.decode(u"hello world", b"hello\nworld")
+
+ def test_par1(self):
+ self.decode(u"hello\n\nworld", b"hello\n\nworld")
+
+ def test_par2(self):
+ self.decode(u"hello\n\nworld", b"hello\\par world")
+
+ def test_par3(self):
+ self.decode(u"hello\n\nworld", b"hello \\par world")
+
+ def test_ogonek1(self):
+ self.decode(u"ĄąĘęĮįǪǫŲų",
+ br'\k A\k a\k E\k e\k I\k i\k O\k o\k U\k u')
+
+ def test_ogonek2(self):
+ # note: should decode into u"Ǭǭ" but can't support this yet...
+ self.decode(u"\\textogonekcentered {Ō}\\textogonekcentered {ō}",
+ br'\textogonekcentered{\=O}\textogonekcentered{\=o}')
+
+
+class TestStreamDecoder(TestDecoder):
+
+ """Stream decoder tests."""
+
+ def decode(self, text_utf8, text_latex, inputenc=None):
+ encoding = 'latex+' + inputenc if inputenc else 'latex'
+ stream = BytesIO(text_latex)
+ reader = codecs.getreader(encoding)(stream)
+ self.assertEqual(text_utf8, reader.read())
+
+ # in this test, BytesIO(object()) is eventually called
+ # this is valid on Python 2, so we skip this test there
+ def test_invalid_type(self):
+ if PY2:
+ raise nose.plugins.skip.SkipTest
+ else:
+ TestDecoder.test_invalid_type(self)
+
+
+class TestIncrementalDecoder(TestDecoder):
+
+ """Incremental decoder tests."""
+
+ def decode(self, text_utf8, text_latex, inputenc=None):
+ encoding = 'latex+' + inputenc if inputenc else 'latex'
+ decoder = codecs.getincrementaldecoder(encoding)()
+ decoded_parts = (
+ decoder.decode(text_latex_part, final)
+ for text_latex_part, final in split_input(text_latex))
+ self.assertEqual(text_utf8, u''.join(decoded_parts))
+
+
+class TestEncoder(TestCase):
+
+ """Stateless encoder tests."""
+
+ def encode(self, text_utf8, text_latex, inputenc=None, errors='strict'):
+ """Main test function."""
+ encoding = 'latex+' + inputenc if inputenc else 'latex'
+ encoded, n = codecs.getencoder(encoding)(text_utf8, errors=errors)
+ self.assertEqual((encoded, n), (text_latex, len(text_utf8)))
+
+ @nose.tools.raises(TypeError)
+ def test_invalid_type(self):
+ self.encode(object(), object())
+
+ # note concerning test_invalid_code_* methods:
+ # u'\u2328' (0x2328 = 9000) is unicode for keyboard symbol
+ # we currently provide no translation for this into LaTeX code
+
+ @nose.tools.raises(ValueError)
+ def test_invalid_code_strict(self):
+ self.encode(u'\u2328', b'', 'ascii', 'strict')
+
+ def test_invalid_code_ignore(self):
+ self.encode(u'\u2328', b'', 'ascii', 'ignore')
+
+ def test_invalid_code_replace(self):
+ self.encode(u'\u2328', b'{\\char9000}', 'ascii', 'replace')
+
+ @nose.tools.raises(ValueError)
+ def test_invalid_code_baderror(self):
+ self.encode(u'\u2328', b'', 'ascii', '**baderror**')
+
+ def test_null(self):
+ self.encode(u'', b'')
+
+ def test_maelstrom(self):
+ self.encode(u"mælström", br'm\ae lstr\"om')
+
+ def test_maelstrom_latin1(self):
+ self.encode(u"mælström", b'm\xe6lstr\xf6m', 'latin1')
+
+ def test_laren(self):
+ self.encode(
+ u"© låren av björn",
+ br'\copyright\ l\aa ren av bj\"orn')
+
+ def test_laren_latin1(self):
+ self.encode(
+ u"© låren av björn",
+ b'\xa9 l\xe5ren av bj\xf6rn',
+ 'latin1')
+
+ def test_droitcivil(self):
+ self.encode(
+ u"Même s'il a fait l'objet d'adaptations suite à l'évolution, \n"
+ u"la transformation sociale, économique et politique du pays, \n"
+ u"le code civil fran{ç}ais est aujourd'hui encore le texte "
+ u"fondateur \n"
+ u"du droit civil français mais aussi du droit civil belge "
+ u"ainsi que \n"
+ u"de plusieurs autres droits civils.",
+ b"M\\^eme s'il a fait l'objet d'adaptations suite "
+ b"\\`a l'\\'evolution, \nla transformation sociale, "
+ b"\\'economique et politique du pays, \nle code civil "
+ b"fran{\\c c}ais est aujourd'hui encore le texte fondateur \n"
+ b"du droit civil fran\\c cais mais aussi du droit civil "
+ b"belge ainsi que \nde plusieurs autres droits civils.",
+ )
+
+ def test_oeuf(self):
+ self.encode(
+ u"D'un point de vue diététique, l'œuf apaise la faim.",
+ br"D'un point de vue di\'et\'etique, l'\oe uf apaise la faim.",
+ )
+
+ def test_oeuf_latin1(self):
+ self.encode(
+ u"D'un point de vue diététique, l'œuf apaise la faim.",
+ b"D'un point de vue di\xe9t\xe9tique, l'\\oe uf apaise la faim.",
+ 'latin1'
+ )
+
+ def test_alpha(self):
+ self.encode(u"α", b"$\\alpha$")
+
+ def test_serafin(self):
+ self.encode(u"Seraf{\xed}n", b"Seraf{\\'\\i }n")
+
+ def test_space_1(self):
+ self.encode(u"ææ", br'\ae \ae')
+
+ def test_space_2(self):
+ self.encode(u"æ æ", br'\ae\ \ae')
+
+ def test_number_sign(self):
+ # note: no need for control space after \#
+ self.encode(u"# hello", br'\# hello')
+
+ def test_underscore(self):
+ self.encode(u"_", br'\_')
+
+ def test_dz1(self):
+ self.encode(u"DZ", br'DZ')
+
+ def test_dz2(self):
+ self.encode(u"DZ", br'DZ')
+
+ def test_newline(self):
+ self.encode(u"hello\nworld", b"hello\nworld")
+
+ def test_par1(self):
+ self.encode(u"hello\n\nworld", b"hello\n\nworld")
+
+ def test_par2(self):
+ self.encode(u"hello\\par world", b"hello\\par world")
+
+ def test_ogonek1(self):
+ self.encode(u"ĄąĘęĮįǪǫŲų",
+ br'\k A\k a\k E\k e\k I\k i\k O\k o\k U\k u')
+
+ def test_ogonek2(self):
+ self.encode(u"Ǭǭ",
+ br'\textogonekcentered{\=O}\textogonekcentered{\=o}')
+
+
+class TestStreamEncoder(TestEncoder):
+
+ """Stream encoder tests."""
+
+ def encode(self, text_utf8, text_latex, inputenc=None, errors='strict'):
+ encoding = 'latex+' + inputenc if inputenc else 'latex'
+ stream = BytesIO()
+ writer = codecs.getwriter(encoding)(stream, errors=errors)
+ writer.write(text_utf8)
+ self.assertEqual(text_latex, stream.getvalue())
+
+
+class TestIncrementalEncoder(TestEncoder):
+
+ """Incremental encoder tests."""
+
+ def encode(self, text_utf8, text_latex, inputenc=None, errors='strict'):
+ encoding = 'latex+' + inputenc if inputenc else 'latex'
+ encoder = codecs.getincrementalencoder(encoding)(errors=errors)
+ encoded_parts = (
+ encoder.encode(text_utf8_part, final)
+ for text_utf8_part, final in split_input(text_utf8))
+ self.assertEqual(text_latex, b''.join(encoded_parts))
diff --git a/test/test_latex_lexer.py b/test/test_latex_lexer.py
new file mode 100644
index 0000000..924171c
--- /dev/null
+++ b/test/test_latex_lexer.py
@@ -0,0 +1,442 @@
+"""Tests for the tex lexer."""
+
+import nose.tools
+from unittest import TestCase
+
+from latexcodec.lexer import (
+ LatexLexer, LatexIncrementalLexer, LatexIncrementalDecoder,
+ LatexIncrementalEncoder, Token)
+
+
+def test_token_create():
+ t = Token()
+ nose.tools.assert_equal(t.name, 'unknown')
+ nose.tools.assert_equal(t.text, b'')
+
+
+def test_token_create_with_args():
+ t = Token('hello', b'world')
+ nose.tools.assert_equal(t.name, 'hello')
+ nose.tools.assert_equal(t.text, b'world')
+
+
+@nose.tools.raises(AttributeError)
+def test_token_assign_name():
+ t = Token()
+ t.name = 'test'
+
+
+@nose.tools.raises(AttributeError)
+def test_token_assign_text():
+ t = Token()
+ t.text = 'test'
+
+
+@nose.tools.raises(AttributeError)
+def test_token_assign_other():
+ t = Token()
+ t.blabla = 'test'
+
+
+class BaseLatexLexerTest(TestCase):
+
+ errors = 'strict'
+
+ def setUp(self):
+ self.lexer = LatexLexer(errors=self.errors)
+
+ def lex_it(self, latex_code, latex_tokens, final=False):
+ tokens = self.lexer.get_raw_tokens(latex_code, final=final)
+ self.assertEqual(
+ list(token.text for token in tokens),
+ latex_tokens)
+
+ def tearDown(self):
+ del self.lexer
+
+
+class LatexLexerTest(BaseLatexLexerTest):
+
+ def test_null(self):
+ self.lex_it(b'', [], final=True)
+
+ def test_hello(self):
+ self.lex_it(
+ b'hello! [#1] This \\is\\ \\^ a \ntest.\n'
+ b' \nHey.\n\n\# x \#x',
+ br'h|e|l|l|o|!| | |[|#1|]| |T|h|i|s| |\is|\ | | |\^| |a| '
+ b'|\n|t|e|s|t|.|\n| | | | |\n|H|e|y|.|\n|\n'
+ br'|\#| |x| |\#|x'.split(b'|'),
+ final=True
+ )
+
+ def test_comment(self):
+ self.lex_it(
+ b'test% some comment\ntest',
+ b't|e|s|t|% some comment\n|t|e|s|t'.split(b'|'),
+ final=True
+ )
+
+ def test_comment_newline(self):
+ self.lex_it(
+ b'test% some comment\n\ntest',
+ b't|e|s|t|% some comment\n|\n|t|e|s|t'.split(b'|'),
+ final=True
+ )
+
+ def test_control(self):
+ self.lex_it(
+ b'\\hello\\world',
+ b'\\hello|\\world'.split(b'|'),
+ final=True
+ )
+
+ def test_control_whitespace(self):
+ self.lex_it(
+ b'\\hello \\world ',
+ b'\\hello| | | |\\world| | | '.split(b'|'),
+ final=True
+ )
+
+ def test_controlx(self):
+ self.lex_it(
+ b'\\#\\&',
+ b'\\#|\\&'.split(b'|'),
+ final=True
+ )
+
+ def test_controlx_whitespace(self):
+ self.lex_it(
+ b'\\# \\& ',
+ b'\\#| | | | |\\&| | | '.split(b'|'),
+ final=True
+ )
+
+ def test_buffer(self):
+ self.lex_it(
+ b'hi\\t',
+ b'h|i'.split(b'|'),
+ )
+ self.lex_it(
+ b'here',
+ [b'\\there'],
+ final=True,
+ )
+
+ def test_state(self):
+ self.lex_it(
+ b'hi\\t',
+ b'h|i'.split(b'|'),
+ )
+ state = self.lexer.getstate()
+ self.lexer.reset()
+ self.lex_it(
+ b'here',
+ b'h|e|r|e'.split(b'|'),
+ final=True,
+ )
+ self.lexer.setstate(state)
+ self.lex_it(
+ b'here',
+ [b'\\there'],
+ final=True,
+ )
+
+ @nose.tools.raises(NotImplementedError)
+ def test_decode(self):
+ self.lexer.decode(b'')
+
+ def test_final_backslash(self):
+ self.lex_it(
+ b'notsogood\\',
+ b'n|o|t|s|o|g|o|o|d|\\'.split(b'|'),
+ final=True
+ )
+
+ def test_final_comment(self):
+ self.lex_it(
+ b'hello%',
+ b'h|e|l|l|o|%'.split(b'|'),
+ final=True
+ )
+
+ def test_hash(self):
+ self.lex_it(b'#', [b'#'], final=True)
+
+
+class BaseTexLexerTest(TestCase):
+
+ """Tex lexer fixture."""
+
+ errors = 'strict'
+
+ def setUp(self):
+ self.lexer = LatexIncrementalDecoder(self.errors)
+
+ def lex_it(self, latex_code, latex_tokens, final=False):
+ tokens = self.lexer.get_tokens(latex_code, final=final)
+ self.assertEqual(
+ list(token.text for token in tokens),
+ latex_tokens)
+
+ def tearDown(self):
+ del self.lexer
+
+
+class TexLexerTest(BaseTexLexerTest):
+
+ def test_null(self):
+ self.lex_it(b'', [], final=True)
+
+ def test_hello(self):
+ self.lex_it(
+ b'hello! [#1] This \\is\\ \\^ a \ntest.\n'
+ b' \nHey.\n\n\# x \#x',
+ br'h|e|l|l|o|!| |[|#1|]| |T|h|i|s| |\is|\ |\^|a| '
+ br'|t|e|s|t|.| |\par|H|e|y|.| '
+ br'|\par|\#| |x| |\#|x'.split(b'|'),
+ final=True
+ )
+
+ def test_comment(self):
+ self.lex_it(
+ b'test% some comment\ntest',
+ b't|e|s|t|t|e|s|t'.split(b'|'),
+ final=True
+ )
+
+ def test_comment_newline(self):
+ self.lex_it(
+ b'test% some comment\n\ntest',
+ b't|e|s|t|\\par|t|e|s|t'.split(b'|'),
+ final=True
+ )
+
+ def test_control(self):
+ self.lex_it(
+ b'\\hello\\world',
+ b'\\hello|\\world'.split(b'|'),
+ final=True
+ )
+
+ def test_control_whitespace(self):
+ self.lex_it(
+ b'\\hello \\world ',
+ b'\\hello|\\world'.split(b'|'),
+ final=True
+ )
+
+ def test_controlx(self):
+ self.lex_it(
+ b'\\#\\&',
+ b'\\#|\\&'.split(b'|'),
+ final=True
+ )
+
+ def test_controlx_whitespace(self):
+ self.lex_it(
+ b'\\# \\& ',
+ b'\\#| |\\&| '.split(b'|'),
+ final=True
+ )
+
+ def test_buffer(self):
+ self.lex_it(
+ b'hi\\t',
+ b'h|i'.split(b'|'),
+ )
+ self.lex_it(
+ b'here',
+ [b'\\there'],
+ final=True,
+ )
+
+ def test_buffer_decode(self):
+ self.assertEqual(
+ self.lexer.decode(b'hello! [#1] This \\i'),
+ u'hello! [#1] This ',
+ )
+ self.assertEqual(
+ self.lexer.decode(b's\\ \\^ a \ntest.\n'),
+ u'\\is \\ \\^a test.',
+ )
+ self.assertEqual(
+ self.lexer.decode(b' \nHey.\n\n\# x \#x', final=True),
+ u' \\par Hey. \\par \\# x \\#x',
+ )
+
+ def test_state_middle(self):
+ self.lex_it(
+ b'hi\\t',
+ b'h|i'.split(b'|'),
+ )
+ state = self.lexer.getstate()
+ self.assertEqual(self.lexer.state, 'M')
+ self.assertEqual(self.lexer.raw_buffer.name, 'control_word')
+ self.assertEqual(self.lexer.raw_buffer.text, b'\\t')
+ self.lexer.reset()
+ self.assertEqual(self.lexer.state, 'N')
+ self.assertEqual(self.lexer.raw_buffer.name, 'unknown')
+ self.assertEqual(self.lexer.raw_buffer.text, b'')
+ self.lex_it(
+ b'here',
+ b'h|e|r|e'.split(b'|'),
+ final=True,
+ )
+ self.lexer.setstate(state)
+ self.assertEqual(self.lexer.state, 'M')
+ self.assertEqual(self.lexer.raw_buffer.name, 'control_word')
+ self.assertEqual(self.lexer.raw_buffer.text, b'\\t')
+ self.lex_it(
+ b'here',
+ [b'\\there'],
+ final=True,
+ )
+
+ def test_state_inline_math(self):
+ self.lex_it(
+ b'hi$t',
+ b'h|i|$'.split(b'|'),
+ )
+ assert self.lexer.inline_math
+ self.lex_it(
+ b'here$',
+ b't|h|e|r|e|$'.split(b'|'),
+ final=True,
+ )
+ assert not self.lexer.inline_math
+
+ # counterintuitive?
+ @nose.tools.raises(UnicodeDecodeError)
+ def test_final_backslash(self):
+ self.lex_it(
+ b'notsogood\\',
+ [b'notsogood'],
+ final=True
+ )
+
+ # counterintuitive?
+ @nose.tools.raises(UnicodeDecodeError)
+ def test_final_comment(self):
+ self.lex_it(
+ b'hello%',
+ [b'hello'],
+ final=True
+ )
+
+ def test_hash(self):
+ self.lex_it(b'#', [b'#'], final=True)
+
+
+class TexLexerReplaceTest(BaseTexLexerTest):
+
+ errors = 'replace'
+
+ def test_errors_replace(self):
+ self.lex_it(
+ b'hello%',
+ b'h|e|l|l|o|?'.split(b'|'),
+ final=True
+ )
+
+
+class TexLexerIgnoreTest(BaseTexLexerTest):
+
+ errors = 'ignore'
+
+ def test_errors_ignore(self):
+ self.lex_it(
+ b'hello%',
+ b'h|e|l|l|o'.split(b'|'),
+ final=True
+ )
+
+
+class TexLexerInvalidErrorTest(BaseTexLexerTest):
+
+ errors = '**baderror**'
+
+ @nose.tools.raises(NotImplementedError)
+ def test_errors_invalid(self):
+ self.lex_it(
+ b'hello%',
+ b'h|e|l|l|o'.split(b'|'),
+ final=True
+ )
+
+
+def invalid_token_test():
+ lexer = LatexIncrementalDecoder()
+ # piggyback an implementation which results in invalid tokens
+ lexer.get_raw_tokens = lambda bytes_, final: [Token('**invalid**', bytes_)]
+ nose.tools.assert_raises(AssertionError, lambda: lexer.decode(b'hello'))
+
+
+def invalid_state_test_1():
+ lexer = LatexIncrementalDecoder()
+ # piggyback invalid state
+ lexer.state = '**invalid**'
+ nose.tools.assert_raises(AssertionError, lambda: lexer.decode(b'\n\n\n'))
+
+
+def invalid_state_test_2():
+ lexer = LatexIncrementalDecoder()
+ # piggyback invalid state
+ lexer.state = '**invalid**'
+ nose.tools.assert_raises(AssertionError, lambda: lexer.decode(b' '))
+
+
+class LatexIncrementalLexerTest(TestCase):
+
+ errors = 'strict'
+
+ def setUp(self):
+ self.lexer = LatexIncrementalLexer(errors=self.errors)
+
+ def lex_it(self, latex_code, latex_tokens, final=False):
+ tokens = self.lexer.get_tokens(latex_code, final=final)
+ self.assertEqual(
+ list(token.text for token in tokens),
+ latex_tokens)
+
+ def tearDown(self):
+ del self.lexer
+
+ def test_newline(self):
+ self.lex_it(
+ b"hello\nworld", b"h|e|l|l|o| |w|o|r|l|d".split(b'|'),
+ final=True)
+
+ def test_par(self):
+ self.lex_it(
+ b"hello\n\nworld", b"h|e|l|l|o| |\\par|w|o|r|l|d".split(b'|'),
+ final=True)
+
+
+class LatexIncrementalEncoderTest(TestCase):
+
+ """Encoder test fixture."""
+
+ errors = 'strict'
+
+ def setUp(self):
+ self.encoder = LatexIncrementalEncoder(self.errors)
+
+ def encode(self, latex_code, latex_bytes, final=False):
+ result = self.encoder.encode(latex_code, final=final)
+ self.assertEqual(result, latex_bytes)
+
+ def tearDown(self):
+ del self.encoder
+
+ @nose.tools.raises(TypeError)
+ def test_invalid_type(self):
+ self.encoder.encode(object())
+
+ @nose.tools.raises(ValueError)
+ def test_invalid_code(self):
+ # default encoding is ascii, \u00ff is not ascii translatable
+ self.encoder.encode(u"\u00ff")
+
+ def test_hello(self):
+ self.encode(u'hello', b'hello')
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/python-latexcodec.git