[Python-modules-commits] r13346 - in packages/pdfminer/trunk/debian (7 files)
jwilk at users.alioth.debian.org
jwilk at users.alioth.debian.org
Tue Jun 8 22:05:04 UTC 2010
Date: Tuesday, June 8, 2010 @ 22:04:48
Author: jwilk
Revision: 13346
Add manpage for pdf2txt.
Added:
packages/pdfminer/trunk/debian/manpages/
packages/pdfminer/trunk/debian/manpages/Makefile
packages/pdfminer/trunk/debian/manpages/pdf2txt.xml
packages/pdfminer/trunk/debian/python-pdfminer.manpages
Modified:
packages/pdfminer/trunk/debian/clean
packages/pdfminer/trunk/debian/control
packages/pdfminer/trunk/debian/rules
Modified: packages/pdfminer/trunk/debian/clean
===================================================================
--- packages/pdfminer/trunk/debian/clean 2010-06-08 21:34:05 UTC (rev 13345)
+++ packages/pdfminer/trunk/debian/clean 2010-06-08 22:04:48 UTC (rev 13346)
@@ -1 +1,2 @@
+debian/manpages/*.[0-9]
docs/changelog
Modified: packages/pdfminer/trunk/debian/control
===================================================================
--- packages/pdfminer/trunk/debian/control 2010-06-08 21:34:05 UTC (rev 13345)
+++ packages/pdfminer/trunk/debian/control 2010-06-08 22:04:48 UTC (rev 13346)
@@ -5,7 +5,8 @@
Uploaders: Debian Python Modules Team <python-modules-team at lists.alioth.debian.org>
Build-Depends: debhelper (>= 7.0.50~),
python-all (>= 2.4), python-support (>= 0.90), python-nose,
- elinks-lite | elinks
+ elinks-lite | elinks,
+ docbook-xsl, docbook-xml, xsltproc, libxml2-utils
XS-Python-Version: >= 2.4
Standards-Version: 3.8.4
Homepage: http://www.unixuser.org/~euske/python/pdfminer/
Added: packages/pdfminer/trunk/debian/manpages/Makefile
===================================================================
--- packages/pdfminer/trunk/debian/manpages/Makefile (rev 0)
+++ packages/pdfminer/trunk/debian/manpages/Makefile 2010-06-08 22:04:48 UTC (rev 13346)
@@ -0,0 +1,27 @@
+# Build section-1 manual pages from the DocBook sources in this directory.
+# The DocBook manpages stylesheet names its output <refentrytitle>.<manvolnum>,
+# so each foo.xml here is expected to describe foo(1) and to yield foo.1.
+
+XML_FILES := $(wildcard *.xml)
+MAN_FILES := $(XML_FILES:.xml=.1)
+
+XSL := http://docbook.sourceforge.net/release/xsl/current/manpages/docbook.xsl
+XSL_PARAMS := --param man.charmap.use.subset 0
+
+# Do not leave a half-written page behind if a recipe fails.
+.DELETE_ON_ERROR:
+
+.PHONY: all
+all: $(MAN_FILES)
+
+# Validate against the DTD first, then render. The target is the file that
+# xsltproc really writes, so make can tell when a page is up to date.
+%.1: %.xml
+	xmllint --valid --noout $<
+	xsltproc $(XSL_PARAMS) $(XSL) $<
+
+.PHONY: clean
+clean:
+	$(RM) $(MAN_FILES)
+
+# vim:ts=4 sw=4 noet
Property changes on: packages/pdfminer/trunk/debian/manpages/Makefile
___________________________________________________________________
Added: svn:mime-type
+ text/x-makefile
Added: packages/pdfminer/trunk/debian/manpages/pdf2txt.xml
===================================================================
--- packages/pdfminer/trunk/debian/manpages/pdf2txt.xml (rev 0)
+++ packages/pdfminer/trunk/debian/manpages/pdf2txt.xml 2010-06-08 22:04:48 UTC (rev 13346)
@@ -0,0 +1,222 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.docbook.org/xml/4.5/docbookx.dtd">
+<refentry>
+ <refentryinfo>
+ <title>PDFMiner Manual</title>
+ <productname>pdf2txt</productname>
+ <authorgroup>
+ <author>
+ <firstname>Jakub</firstname>
+ <surname>Wilk</surname>
+ <contrib>Wrote this manual page for the Debian system.</contrib>
+ <address><email>jwilk at debian.org</email></address>
+ </author>
+ <author>
+ <firstname>Yusuke</firstname>
+ <surname>Shinyama</surname>
+ <contrib>Author of PDFMiner and its original HTML documentation.</contrib>
+ <address><email>yusuke at cs.nyu.edu</email></address>
+ </author>
+ </authorgroup>
+ </refentryinfo>
+ <refmeta>
+ <refentrytitle>pdf2txt</refentrytitle>
+ <manvolnum>1</manvolnum>
+ </refmeta>
+ <refnamediv>
+ <refname>pdf2txt</refname>
+ <refpurpose>extracts text contents of PDF files</refpurpose>
+ </refnamediv>
+ <refsynopsisdiv>
+ <cmdsynopsis>
+ <command>pdf2txt</command>
+ <arg choice='opt' rep='repeat'><replaceable>option</replaceable></arg>
+ <arg choice='plain' rep='repeat'><replaceable>file</replaceable></arg>
+ </cmdsynopsis>
+ </refsynopsisdiv>
+ <refsection>
+ <title>Description</title>
+ <para>
+ <command>pdf2txt</command> extracts text contents from a PDF file. It extracts all the texts
+ that are to be rendered programmatically, i.e. text represented as ASCII or Unicode strings. It
+ cannot recognize texts drawn as images that would require optical character recognition. It
+ also extracts the corresponding locations, font names, font sizes, writing direction
+ (horizontal or vertical) for each text portion. You need to provide a password for protected
+ PDF documents when their access is restricted. You cannot extract any text from a PDF document
+ which does not have extraction permission.
+ </para>
+ </refsection>
+ <refsection>
+ <title>Options</title>
+ <variablelist>
+ <varlistentry>
+ <term><option>-o <replaceable>file</replaceable></option></term>
+ <listitem>
+ <para>Specifies the output file name. The default is to print the extracted contents to
+ standard output in text format.</para>
+ </listitem>
+ </varlistentry>
+ <varlistentry>
+ <term><option>-p <replaceable>pageno</replaceable><replaceable>[,pageno,…]</replaceable></option></term>
+ <listitem>
+ <para>Specifies the comma-separated list of the page numbers to be extracted. Page numbers
+ start from one. By default, it extracts texts from all the pages.</para>
+ </listitem>
+ </varlistentry>
+ <varlistentry>
+ <term><option>-c <replaceable>codec</replaceable></option></term>
+ <listitem>
+ <para>Specifies the output codec.</para>
+ </listitem>
+ </varlistentry>
+ <varlistentry>
+ <term><option>-t <replaceable>type</replaceable></option></term>
+ <listitem>
+ <para>Specifies the output format. The following formats are currently supported:</para>
+ <variablelist>
+ <varlistentry>
+ <term>text</term>
+ <listitem>
+ <para>Text format. This is the default.</para>
+ </listitem>
+ </varlistentry>
+ <varlistentry>
+ <term>html</term>
+ <listitem>
+ <para>HTML format. It is not recommended.</para>
+ </listitem>
+ </varlistentry>
+ <varlistentry>
+ <term>xml</term>
+ <listitem>
+ <para>XML format. It provides the most information available.</para>
+ </listitem>
+ </varlistentry>
+ <varlistentry>
+ <term>tag</term>
+ <listitem>
+ <para>“Tagged PDF” format. A tagged PDF has its own contents annotated with
+ HTML-like tags. <command>pdf2txt</command> tries to extract its content streams
+ rather than inferring its text locations. Tags used here are defined in the <ulink
+ url='http://www.adobe.com/devnet/acrobat/pdfs/pdf_reference_1-7.pdf'>PDF
+ Reference, Sixth Edition</ulink> (§10.7 “Tagged PDF”).</para>
+ </listitem>
+ </varlistentry>
+ </variablelist>
+ </listitem>
+ </varlistentry>
+ <varlistentry>
+ <term><option>-T <replaceable>writing-mode</replaceable></option></term>
+ <listitem>
+ <para>Specifies the writing mode of text outputs:</para>
+ <variablelist>
+ <varlistentry>
+ <term>lr-tb</term>
+ <listitem>
+ <para>Left-to-right, top-to-bottom.</para>
+ </listitem>
+ </varlistentry>
+ <varlistentry>
+ <term>tb-rl</term>
+ <listitem>
+ <para>Top-to-bottom, right-to-left.</para>
+ </listitem>
+ </varlistentry>
+ <varlistentry>
+ <term>auto</term>
+ <listitem>
+ <para>Determine writing mode automatically.</para>
+ </listitem>
+ </varlistentry>
+ </variablelist>
+ </listitem>
+ </varlistentry>
+ <varlistentry>
+ <term><option>-M <replaceable>char-margin</replaceable></option></term>
+ <term><option>-L <replaceable>line-margin</replaceable></option></term>
+ <term><option>-W <replaceable>word-margin</replaceable></option></term>
+ <listitem>
+ <para>
+ These are the parameters used for layout analysis. In an actual PDF file, texts might be
+ split into several chunks in the middle of its running, depending on the authoring
+ software. Therefore, text extraction needs to splice text chunks. In the figure below,
+ two text chunks whose distance is closer than the <replaceable>char-margin</replaceable>
+ are considered continuous and get grouped into one. Also, two lines whose distance is
+ closer than the <replaceable>line-margin</replaceable> are grouped as a text box, which
+ is a rectangular area that contains a “cluster” of texts. Furthermore, it may be
+ required to insert blank characters (spaces) as necessary if the distance between two
+ words is greater than the <replaceable>word-margin</replaceable>, as a blank between
+ words might not be represented as a space, but indicated by the positioning of each word.
+ </para>
+ <para>
+ Each value is specified not as an actual length, but as a proportion of the length to
+ the size of each character in question. The default values are
+ <replaceable>char-margin</replaceable> = 1.0, <replaceable>line-margin</replaceable> =
+ 0.3, and <replaceable>word-margin</replaceable> = 0.2, respectively.
+ </para>
+ </listitem>
+ </varlistentry>
+ <varlistentry>
+ <term><option>-n</option></term>
+ <listitem>
+ <para>Suppress layout analysis.</para>
+ </listitem>
+ </varlistentry>
+ <varlistentry>
+ <term><option>-A</option></term>
+ <listitem>
+ <para>Force to perform layout analysis for all the text strings, including texts contained
+ in figures.</para>
+ </listitem>
+ </varlistentry>
+ <varlistentry>
+ <term><option>-s <replaceable>scale</replaceable></option></term>
+ <listitem>
+ <para>Specifies the output scale. This option can be used in HTML format only.</para>
+ </listitem>
+ </varlistentry>
+ <varlistentry>
+ <term><option>-m <replaceable>n</replaceable></option></term>
+ <listitem>
+ <para>Specifies the maximum number of pages to extract. By default, all the pages in a
+ document are extracted.</para>
+ </listitem>
+ </varlistentry>
+ <varlistentry>
+ <term><option>-P <replaceable>password</replaceable></option></term>
+ <listitem>
+ <para>Provides the user password to access PDF contents.</para>
+ </listitem>
+ </varlistentry>
+ <varlistentry>
+ <term><option>-d</option></term>
+ <listitem>
+ <para>Increase the debug level.</para>
+ </listitem>
+ </varlistentry>
+ </variablelist>
+ </refsection>
+ <refsection>
+ <title>Examples</title>
+ <para>
+ Extract text as an HTML file whose filename is output.html:
+ <screen><prompt>$</prompt> <command>pdf2txt</command> -o output.html samples/naacl06-shinyama.pdf</screen>
+ </para>
+ <para>
+ Extract a Japanese HTML file in vertical writing:
+ <screen><prompt>$</prompt> <command>pdf2txt</command> -c euc-jp -D tb-rl -o output.html samples/jo.pdf</screen>
+ </para>
+ <para>
+ Extract text from an encrypted PDF file:
+ <screen><prompt>$</prompt> <command>pdf2txt</command> -P mypassword -o output.txt secret.pdf</screen>
+ </para>
+ </refsection>
+ <refsection>
+ <title>See also</title>
+ <para>
+ <citerefentry><refentrytitle>dumppdf</refentrytitle><manvolnum>1</manvolnum></citerefentry>
+ </para>
+ </refsection>
+</refentry>
+
+<!-- vim:set ts=2 sw=2 et:-->
Property changes on: packages/pdfminer/trunk/debian/manpages/pdf2txt.xml
___________________________________________________________________
Added: svn:mime-type
+ text/xml
Added: packages/pdfminer/trunk/debian/python-pdfminer.manpages
===================================================================
--- packages/pdfminer/trunk/debian/python-pdfminer.manpages (rev 0)
+++ packages/pdfminer/trunk/debian/python-pdfminer.manpages 2010-06-08 22:04:48 UTC (rev 13346)
@@ -0,0 +1 @@
+debian/manpages/*.[0-9]
Modified: packages/pdfminer/trunk/debian/rules
===================================================================
--- packages/pdfminer/trunk/debian/rules 2010-06-08 21:34:05 UTC (rev 13345)
+++ packages/pdfminer/trunk/debian/rules 2010-06-08 22:04:48 UTC (rev 13346)
@@ -16,6 +16,11 @@
rename.ul .py '' debian/python-pdfminer/usr/bin/*.py
dh_install
+.PHONY: override_dh_installman
+override_dh_installman:
+ $(MAKE) -C debian/manpages/
+ dh_installman
+
.PHONY: override_dh_auto_test
override_dh_auto_test:
ifeq ($(filter nocheck,$(DEB_BUILD_OPTIONS)),)
More information about the Python-modules-commits
mailing list