[Python-modules-commits] r13346 - in packages/pdfminer/trunk/debian (7 files)

jwilk at users.alioth.debian.org jwilk at users.alioth.debian.org
Tue Jun 8 22:05:04 UTC 2010


    Date: Tuesday, June 8, 2010 @ 22:04:48
  Author: jwilk
Revision: 13346

Add manpage for pdf2txt.

Added:
  packages/pdfminer/trunk/debian/manpages/
  packages/pdfminer/trunk/debian/manpages/Makefile
  packages/pdfminer/trunk/debian/manpages/pdf2txt.xml
  packages/pdfminer/trunk/debian/python-pdfminer.manpages
Modified:
  packages/pdfminer/trunk/debian/clean
  packages/pdfminer/trunk/debian/control
  packages/pdfminer/trunk/debian/rules

Modified: packages/pdfminer/trunk/debian/clean
===================================================================
--- packages/pdfminer/trunk/debian/clean	2010-06-08 21:34:05 UTC (rev 13345)
+++ packages/pdfminer/trunk/debian/clean	2010-06-08 22:04:48 UTC (rev 13346)
@@ -1 +1,2 @@
+debian/manpages/*.[0-9]
 docs/changelog

Modified: packages/pdfminer/trunk/debian/control
===================================================================
--- packages/pdfminer/trunk/debian/control	2010-06-08 21:34:05 UTC (rev 13345)
+++ packages/pdfminer/trunk/debian/control	2010-06-08 22:04:48 UTC (rev 13346)
@@ -5,7 +5,8 @@
 Uploaders: Debian Python Modules Team <python-modules-team at lists.alioth.debian.org>
 Build-Depends: debhelper (>= 7.0.50~),
   python-all (>= 2.4), python-support (>= 0.90), python-nose,
-  elinks-lite | elinks
+  elinks-lite | elinks,
+  docbook-xsl, docbook-xml, xsltproc, libxml2-utils
 XS-Python-Version: >= 2.4
 Standards-Version: 3.8.4
 Homepage: http://www.unixuser.org/~euske/python/pdfminer/

Added: packages/pdfminer/trunk/debian/manpages/Makefile
===================================================================
--- packages/pdfminer/trunk/debian/manpages/Makefile	                        (rev 0)
+++ packages/pdfminer/trunk/debian/manpages/Makefile	2010-06-08 22:04:48 UTC (rev 13346)
@@ -0,0 +1,25 @@
+# Build man pages from the DocBook XML sources in this directory.
+# Each NAME.xml is expected to describe a section-1 page named NAME, so that
+# the DocBook manpages stylesheet writes its output to NAME.1.
+XML_FILES := $(wildcard *.xml)
+MAN_FILES := $(XML_FILES:.xml=.1)
+
+XSL = http://docbook.sourceforge.net/release/xsl/current/manpages/docbook.xsl
+XSL_PARAMS = --param man.charmap.use.subset 0
+
+.PHONY: all
+all: $(MAN_FILES)
+
+# Validate the source against its DTD, then render it.  The target is the
+# file xsltproc actually creates, so make can tell when a page is up to date.
+# --nonet forbids network fetches: the DTD and stylesheet URLs must resolve
+# through the local XML catalog (docbook-xml / docbook-xsl build dependencies).
+%.1: %.xml
+	xmllint --nonet --valid --noout $(<)
+	xsltproc --nonet $(XSL_PARAMS) $(XSL) $(<)
+
+.PHONY: clean
+clean:
+	rm -f $(MAN_FILES)
+
+# vim:ts=4 sw=4 noet


Property changes on: packages/pdfminer/trunk/debian/manpages/Makefile
___________________________________________________________________
Added: svn:mime-type
   + text/x-makefile

Added: packages/pdfminer/trunk/debian/manpages/pdf2txt.xml
===================================================================
--- packages/pdfminer/trunk/debian/manpages/pdf2txt.xml	                        (rev 0)
+++ packages/pdfminer/trunk/debian/manpages/pdf2txt.xml	2010-06-08 22:04:48 UTC (rev 13346)
@@ -0,0 +1,222 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.docbook.org/xml/4.5/docbookx.dtd">
+<refentry>
+  <refentryinfo>
+    <title>PDFMiner Manual</title>
+    <productname>pdf2txt</productname>
+    <authorgroup>
+      <author>
+        <firstname>Jakub</firstname>
+        <surname>Wilk</surname>
+        <contrib>Wrote this manual page for the Debian system.</contrib>
+        <address><email>jwilk at debian.org</email></address>
+      </author>
+      <author>
+        <firstname>Yusuke</firstname>
+        <surname>Shinyama</surname>
+        <contrib>Author of PDFMiner and its original HTML documentation.</contrib>
+        <address><email>yusuke at cs.nyu.edu</email></address>
+      </author>
+    </authorgroup>
+  </refentryinfo>
+  <refmeta>
+    <refentrytitle>pdf2txt</refentrytitle>
+    <manvolnum>1</manvolnum>
+  </refmeta>
+  <refnamediv>
+    <refname>pdf2txt</refname>
+    <refpurpose>extracts text contents of PDF files</refpurpose>
+  </refnamediv>
+  <refsynopsisdiv>
+    <cmdsynopsis>
+      <command>pdf2txt</command>
+      <arg choice='opt' rep='repeat'><replaceable>option</replaceable></arg>
+      <arg choice='plain' rep='repeat'><replaceable>file</replaceable></arg>
+    </cmdsynopsis>
+  </refsynopsisdiv>
+  <refsection>
+    <title>Description</title>
+    <para>
+      <command>pdf2txt</command> extracts text contents from a PDF file. It extracts all the texts
+      that are to be rendered programmatically, i.e. text represented as ASCII or Unicode strings. It
+      cannot recognize texts drawn as images that would require optical character recognition. It
+      also extracts the corresponding locations, font names, font sizes, writing direction
+      (horizontal or vertical) for each text portion. You need to provide a password for protected
+      PDF documents when its access is restricted. You cannot extract any text from a PDF document
+      which does not have extraction permission.  
+    </para>  
+  </refsection>
+  <refsection>
+    <title>Options</title>
+    <variablelist>
+      <varlistentry>
+        <term><option>-o <replaceable>file</replaceable></option></term>
+        <listitem>
+          <para>Specifies the output file name. The default is to print the extracted contents to
+            standard output in text format.</para>
+        </listitem>
+      </varlistentry>
+      <varlistentry>
+        <term><option>-p <replaceable>pageno</replaceable><replaceable>[,pageno,…]</replaceable></option></term>
+        <listitem>
+          <para>Specifies the comma-separated list of the page numbers to be extracted. Page numbers
+            start from one. By default, it extracts texts from all the pages.</para>
+        </listitem>
+      </varlistentry>
+      <varlistentry>
+        <term><option>-c <replaceable>codec</replaceable></option></term>
+        <listitem>
+          <para>Specifies the output codec.</para>
+        </listitem>
+      </varlistentry>
+      <varlistentry>
+        <term><option>-t <replaceable>type</replaceable></option></term>
+        <listitem>
+          <para>Specifies the output format. The following formats are currently supported:</para>
+          <variablelist>
+            <varlistentry>
+              <term>text</term>
+              <listitem>
+                <para>Text format. This is the default.</para>
+              </listitem>
+            </varlistentry>
+            <varlistentry>
+              <term>html</term>
+              <listitem>
+                <para>HTML format. It is not recommended.</para>
+              </listitem>
+            </varlistentry>
+            <varlistentry>
+              <term>xml</term>
+              <listitem>
+                <para>XML format. It provides the most information available.</para>
+              </listitem>
+            </varlistentry>
+            <varlistentry>
+              <term>tag</term>
+              <listitem>
+                <para>“Tagged PDF” format. A tagged PDF has its own contents annotated with
+                  HTML-like tags. <command>pdf2txt</command> tries to extract its content streams
+                  rather than inferring its text locations. Tags used here are defined in the <ulink
+                  url='http://www.adobe.com/devnet/acrobat/pdfs/pdf_reference_1-7.pdf'>PDF
+                  Reference, Sixth Edition</ulink> (§10.7 “Tagged PDF”).</para>
+              </listitem>
+            </varlistentry>
+          </variablelist>
+        </listitem>
+      </varlistentry>
+      <varlistentry>
+        <term><option>-T <replaceable>writing-mode</replaceable></option></term>
+        <listitem>
+          <para>Specifies the writing mode of text outputs:</para>
+          <variablelist>
+            <varlistentry>
+              <term>lr-tb</term>
+              <listitem>
+                <para>Left-to-right, top-to-bottom.</para>
+              </listitem>
+            </varlistentry>
+            <varlistentry>
+              <term>tb-rl</term>
+              <listitem>
+                <para>Top-to-bottom, right-to-left.</para>
+              </listitem>
+            </varlistentry>
+            <varlistentry>
+              <term>auto</term>
+              <listitem>
+                <para>Determine writing mode automatically.</para>
+              </listitem>
+            </varlistentry>
+          </variablelist>
+        </listitem>
+      </varlistentry>
+      <varlistentry>
+        <term><option>-M <replaceable>char-margin</replaceable></option></term>
+        <term><option>-L <replaceable>line-margin</replaceable></option></term>
+        <term><option>-W <replaceable>word-margin</replaceable></option></term>
+        <listitem>
+          <para>
+            These are the parameters used for layout analysis. In an actual PDF file, texts might be
+            split into several chunks in the middle of its running, depending on the authoring
+            software. Therefore, text extraction needs to splice text chunks. In the figure below,
+            two text chunks whose distance is closer than the <replaceable>char-margin</replaceable>
+            are considered continuous and get grouped into one. Also, two lines whose distance is
+            closer than the <replaceable>line-margin</replaceable> are grouped as a text box, which
+            is a rectangular area that contains a “cluster” of texts. Furthermore, it may be
+            required to insert blank characters (spaces) as necessary if the distance between two
+            words is greater than the <replaceable>word-margin</replaceable>, as a blank between
+            words might not be represented as a space, but indicated by the positioning of each word.
+          </para>
+          <para>
+            Each value is specified not as an actual length, but as a proportion of the length to
+            the size of each character in question. The default values are
+            <replaceable>char-margin</replaceable> = 1.0, <replaceable>line-margin</replaceable> =
+            0.3, and <replaceable>word-margin</replaceable> = 0.2, respectively.
+          </para>
+        </listitem>
+      </varlistentry>
+      <varlistentry>
+        <term><option>-n</option></term>
+        <listitem>
+          <para>Suppress layout analysis.</para>
+        </listitem>
+      </varlistentry>
+      <varlistentry>
+        <term><option>-A</option></term>
+        <listitem>
+          <para>Force to perform layout analysis for all the text strings, including texts contained
+            in figures.</para>
+        </listitem>
+      </varlistentry>
+      <varlistentry>
+        <term><option>-s <replaceable>scale</replaceable></option></term>
+        <listitem>
+          <para>Specifies the output scale. This option can be used in HTML format only.</para>
+        </listitem>
+      </varlistentry>
+      <varlistentry>
+        <term><option>-m <replaceable>n</replaceable></option></term>
+        <listitem>
+          <para>Specifies the maximum number of pages to extract. By default, all the pages in a
+            document are extracted.</para>
+        </listitem>
+      </varlistentry>
+      <varlistentry>
+        <term><option>-P <replaceable>password</replaceable></option></term>
+        <listitem>
+          <para>Provides the user password to access PDF contents.</para>
+        </listitem>
+      </varlistentry>
+      <varlistentry>
+        <term><option>-d</option></term>
+        <listitem>
+          <para>Increase the debug level.</para>
+        </listitem>
+      </varlistentry>
+    </variablelist>
+  </refsection>
+  <refsection>
+    <title>Examples</title>
+    <para>
+      Extract text as an HTML file whose filename is output.html:
+      <screen><prompt>$</prompt> <command>pdf2txt</command> -o output.html samples/naacl06-shinyama.pdf</screen>
+    </para>
+    <para>
+      Extract a Japanese HTML file in vertical writing:
+      <screen><prompt>$</prompt> <command>pdf2txt</command> -c euc-jp -D tb-rl -o output.html samples/jo.pdf</screen>
+    </para>
+    <para>
+      Extract text from an encrypted PDF file:
+      <screen><prompt>$</prompt> <command>pdf2txt</command> -P mypassword -o output.txt secret.pdf</screen>
+    </para>
+  </refsection>
+  <refsection>
+    <title>See also</title>
+    <para>
+      <citerefentry><refentrytitle>dumppdf</refentrytitle><manvolnum>1</manvolnum></citerefentry>
+    </para>
+  </refsection>
+</refentry>
+
+<!-- vim:set ts=2 sw=2 et:-->


Property changes on: packages/pdfminer/trunk/debian/manpages/pdf2txt.xml
___________________________________________________________________
Added: svn:mime-type
   + text/xml

Added: packages/pdfminer/trunk/debian/python-pdfminer.manpages
===================================================================
--- packages/pdfminer/trunk/debian/python-pdfminer.manpages	                        (rev 0)
+++ packages/pdfminer/trunk/debian/python-pdfminer.manpages	2010-06-08 22:04:48 UTC (rev 13346)
@@ -0,0 +1 @@
+debian/manpages/*.[0-9]

Modified: packages/pdfminer/trunk/debian/rules
===================================================================
--- packages/pdfminer/trunk/debian/rules	2010-06-08 21:34:05 UTC (rev 13345)
+++ packages/pdfminer/trunk/debian/rules	2010-06-08 22:04:48 UTC (rev 13346)
@@ -16,6 +16,11 @@
 	rename.ul .py '' debian/python-pdfminer/usr/bin/*.py
 	dh_install
 
+.PHONY: override_dh_installman
+override_dh_installman:
+	$(MAKE) -C debian/manpages/
+	dh_installman
+
 .PHONY: override_dh_auto_test
 override_dh_auto_test:
 ifeq ($(filter nocheck,$(DEB_BUILD_OPTIONS)),)




More information about the Python-modules-commits mailing list