[Python-modules-commits] r13347 - in packages/pdfminer/trunk/debian/manpages (2 files)

jwilk at users.alioth.debian.org jwilk at users.alioth.debian.org
Tue Jun 8 22:25:32 UTC 2010


    Date: Tuesday, June 8, 2010 @ 22:25:31
  Author: jwilk
Revision: 13347

Rename manpage XML.

Added:
  packages/pdfminer/trunk/debian/manpages/pdf2txt.1.xml
    (from rev 13346, packages/pdfminer/trunk/debian/manpages/pdf2txt.xml)
Deleted:
  packages/pdfminer/trunk/debian/manpages/pdf2txt.xml

Copied: packages/pdfminer/trunk/debian/manpages/pdf2txt.1.xml (from rev 13346, packages/pdfminer/trunk/debian/manpages/pdf2txt.xml)
===================================================================
--- packages/pdfminer/trunk/debian/manpages/pdf2txt.1.xml	                        (rev 0)
+++ packages/pdfminer/trunk/debian/manpages/pdf2txt.1.xml	2010-06-08 22:25:31 UTC (rev 13347)
@@ -0,0 +1,222 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.docbook.org/xml/4.5/docbookx.dtd">
+<refentry>
+  <refentryinfo>
+    <title>PDFMiner Manual</title>
+    <productname>pdf2txt</productname>
+    <authorgroup>
+      <author>
+        <firstname>Jakub</firstname>
+        <surname>Wilk</surname>
+        <contrib>Wrote this manual page for the Debian system.</contrib>
+        <address><email>jwilk at debian.org</email></address>
+      </author>
+      <author>
+        <firstname>Yusuke</firstname>
+        <surname>Shinyama</surname>
+        <contrib>Author of PDFMiner and its original HTML documentation.</contrib>
+        <address><email>yusuke at cs.nyu.edu</email></address>
+      </author>
+    </authorgroup>
+  </refentryinfo>
+  <refmeta>
+    <refentrytitle>pdf2txt</refentrytitle>
+    <manvolnum>1</manvolnum>
+  </refmeta>
+  <refnamediv>
+    <refname>pdf2txt</refname>
+    <refpurpose>extracts text contents of PDF files</refpurpose>
+  </refnamediv>
+  <refsynopsisdiv>
+    <cmdsynopsis>
+      <command>pdf2txt</command>
+      <arg choice='opt' rep='repeat'><replaceable>option</replaceable></arg>
+      <arg choice='plain' rep='repeat'><replaceable>file</replaceable></arg>
+    </cmdsynopsis>
+  </refsynopsisdiv>
+  <refsection>
+    <title>Description</title>
+    <para>
+      <command>pdf2txt</command> extracts text contents from a PDF file. It extracts all the texts
+      that are to be rendered programmatically, i.e. text represented as ASCII or Unicode strings. It
+      cannot recognize texts drawn as images that would require optical character recognition. It
+      also extracts the corresponding locations, font names, font sizes, and writing direction
+      (horizontal or vertical) for each text portion. You need to provide a password for protected
+      PDF documents when their access is restricted. You cannot extract any text from a PDF document
+      which does not have extraction permission.  
+    </para>  
+  </refsection>
+  <refsection>
+    <title>Options</title>
+    <variablelist>
+      <varlistentry>
+        <term><option>-o <replaceable>file</replaceable></option></term>
+        <listitem>
+          <para>Specifies the output file name. The default is to print the extracted contents to
+            standard output in text format.</para>
+        </listitem>
+      </varlistentry>
+      <varlistentry>
+        <term><option>-p <replaceable>pageno</replaceable><replaceable>[,pageno,…]</replaceable></option></term>
+        <listitem>
+          <para>Specifies the comma-separated list of the page numbers to be extracted. Page numbers
+            start from one. By default, it extracts texts from all the pages.</para>
+        </listitem>
+      </varlistentry>
+      <varlistentry>
+        <term><option>-c <replaceable>codec</replaceable></option></term>
+        <listitem>
+          <para>Specifies the output codec.</para>
+        </listitem>
+      </varlistentry>
+      <varlistentry>
+        <term><option>-t <replaceable>type</replaceable></option></term>
+        <listitem>
+          <para>Specifies the output format. The following formats are currently supported:</para>
+          <variablelist>
+            <varlistentry>
+              <term>text</term>
+              <listitem>
+                <para>Text format. This is the default.</para>
+              </listitem>
+            </varlistentry>
+            <varlistentry>
+              <term>html</term>
+              <listitem>
+                <para>HTML format. It is not recommended.</para>
+              </listitem>
+            </varlistentry>
+            <varlistentry>
+              <term>xml</term>
+              <listitem>
+                <para>XML format. It provides the most information available.</para>
+              </listitem>
+            </varlistentry>
+            <varlistentry>
+              <term>tag</term>
+              <listitem>
+                <para>“Tagged PDF” format. A tagged PDF has its own contents annotated with
+                  HTML-like tags. <command>pdf2txt</command> tries to extract its content streams
+                  rather than inferring its text locations. Tags used here are defined in the <ulink
+                  url='http://www.adobe.com/devnet/acrobat/pdfs/pdf_reference_1-7.pdf'>PDF
+                  Reference, Sixth Edition</ulink> (§10.7 “Tagged PDF”).</para>
+              </listitem>
+            </varlistentry>
+          </variablelist>
+        </listitem>
+      </varlistentry>
+      <varlistentry>
+        <term><option>-D <replaceable>writing-mode</replaceable></option></term>
+        <listitem>
+          <para>Specifies the writing mode of text outputs:</para>
+          <variablelist>
+            <varlistentry>
+              <term>lr-tb</term>
+              <listitem>
+                <para>Left-to-right, top-to-bottom.</para>
+              </listitem>
+            </varlistentry>
+            <varlistentry>
+              <term>tb-rl</term>
+              <listitem>
+                <para>Top-to-bottom, right-to-left.</para>
+              </listitem>
+            </varlistentry>
+            <varlistentry>
+              <term>auto</term>
+              <listitem>
+                <para>Determine writing mode automatically.</para>
+              </listitem>
+            </varlistentry>
+          </variablelist>
+        </listitem>
+      </varlistentry>
+      <varlistentry>
+        <term><option>-M <replaceable>char-margin</replaceable></option></term>
+        <term><option>-L <replaceable>line-margin</replaceable></option></term>
+        <term><option>-W <replaceable>word-margin</replaceable></option></term>
+        <listitem>
+          <para>
+            These are the parameters used for layout analysis. In an actual PDF file, texts might be
+            split into several chunks in the middle of its running, depending on the authoring
+            software. Therefore, text extraction needs to splice text chunks. Two text chunks whose
+            distance is closer than the <replaceable>char-margin</replaceable> are considered
+            continuous and get grouped into one. Also, two lines whose distance is closer than the
+            <replaceable>line-margin</replaceable> are grouped as a text box, which
+            is a rectangular area that contains a “cluster” of texts. Furthermore, it may be
+            required to insert blank characters (spaces) as necessary if the distance between two
+            words is greater than the <replaceable>word-margin</replaceable>, as a blank between
+            words might not be represented as a space, but indicated by the positioning of each word.
+          </para>
+          <para>
+            Each value is specified not as an actual length, but as a proportion of the length to
+            the size of each character in question. The default values are
+            <replaceable>char-margin</replaceable> = 1.0, <replaceable>line-margin</replaceable> =
+            0.3, and <replaceable>word-margin</replaceable> = 0.2, respectively.
+          </para>
+        </listitem>
+      </varlistentry>
+      <varlistentry>
+        <term><option>-n</option></term>
+        <listitem>
+          <para>Suppress layout analysis.</para>
+        </listitem>
+      </varlistentry>
+      <varlistentry>
+        <term><option>-A</option></term>
+        <listitem>
+          <para>Forces layout analysis to be performed for all the text strings, including texts
+            contained in figures.</para>
+        </listitem>
+      </varlistentry>
+      <varlistentry>
+        <term><option>-s <replaceable>scale</replaceable></option></term>
+        <listitem>
+          <para>Specifies the output scale. This option can be used in HTML format only.</para>
+        </listitem>
+      </varlistentry>
+      <varlistentry>
+        <term><option>-m <replaceable>n</replaceable></option></term>
+        <listitem>
+          <para>Specifies the maximum number of pages to extract. By default, all the pages in a
+            document are extracted.</para>
+        </listitem>
+      </varlistentry>
+      <varlistentry>
+        <term><option>-P <replaceable>password</replaceable></option></term>
+        <listitem>
+          <para>Provides the user password to access PDF contents.</para>
+        </listitem>
+      </varlistentry>
+      <varlistentry>
+        <term><option>-d</option></term>
+        <listitem>
+          <para>Increase the debug level.</para>
+        </listitem>
+      </varlistentry>
+    </variablelist>
+  </refsection>
+  <refsection>
+    <title>Examples</title>
+    <para>
+      Extract text as an HTML file whose filename is output.html:
+      <screen><prompt>$</prompt> <command>pdf2txt</command> -o output.html samples/naacl06-shinyama.pdf</screen>
+    </para>
+    <para>
+      Extract a Japanese HTML file in vertical writing:
+      <screen><prompt>$</prompt> <command>pdf2txt</command> -c euc-jp -D tb-rl -o output.html samples/jo.pdf</screen>
+    </para>
+    <para>
+      Extract text from an encrypted PDF file:
+      <screen><prompt>$</prompt> <command>pdf2txt</command> -P mypassword -o output.txt secret.pdf</screen>
+    </para>
+  </refsection>
+  <refsection>
+    <title>See also</title>
+    <para>
+      <citerefentry><refentrytitle>dumppdf</refentrytitle><manvolnum>1</manvolnum></citerefentry>
+    </para>
+  </refsection>
+</refentry>
+
+<!-- vim:set ts=2 sw=2 et:-->

Deleted: packages/pdfminer/trunk/debian/manpages/pdf2txt.xml
===================================================================
--- packages/pdfminer/trunk/debian/manpages/pdf2txt.xml	2010-06-08 22:04:48 UTC (rev 13346)
+++ packages/pdfminer/trunk/debian/manpages/pdf2txt.xml	2010-06-08 22:25:31 UTC (rev 13347)
@@ -1,222 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.docbook.org/xml/4.5/docbookx.dtd">
-<refentry>
-  <refentryinfo>
-    <title>PDFMiner Manual</title>
-    <productname>pdf2txt</productname>
-    <authorgroup>
-      <author>
-        <firstname>Jakub</firstname>
-        <surname>Wilk</surname>
-        <contrib>Wrote this manual page for the Debian system.</contrib>
-        <address><email>jwilk at debian.org</email></address>
-      </author>
-      <author>
-        <firstname>Yusuke</firstname>
-        <surname>Shinyama</surname>
-        <contrib>Author of PDFMiner and its original HTML documentation.</contrib>
-        <address><email>yusuke at cs.nyu.edu</email></address>
-      </author>
-    </authorgroup>
-  </refentryinfo>
-  <refmeta>
-    <refentrytitle>pdf2txt</refentrytitle>
-    <manvolnum>1</manvolnum>
-  </refmeta>
-  <refnamediv>
-    <refname>pdf2txt</refname>
-    <refpurpose>extracts text contents of PDF files</refpurpose>
-  </refnamediv>
-  <refsynopsisdiv>
-    <cmdsynopsis>
-      <command>pdf2txt</command>
-      <arg choice='opt' rep='repeat'><replaceable>option</replaceable></arg>
-      <arg choice='plain' rep='repeat'><replaceable>file</replaceable></arg>
-    </cmdsynopsis>
-  </refsynopsisdiv>
-  <refsection>
-    <title>Description</title>
-    <para>
-      <command>pdf2txt</command> extracts text contents from a PDF file. It extracts all the texts
-      that are to be rendered programmatically, ie. text represented as ASCII or Unicode strings. It
-      cannot recognize texts drawn as images that would require optical character recognition. It
-      also extracts the corresponding locations, font names, font sizes, writing direction
-      (horizontal or vertical) for each text portion. You need to provide a password for protected
-      PDF documents when its access is restricted. You cannot extract any text from a PDF document
-      which does not have extraction permission.  
-    </para>  
-  </refsection>
-  <refsection>
-    <title>Options</title>
-    <variablelist>
-      <varlistentry>
-        <term><option>-o <replaceable>file</replaceable></option></term>
-        <listitem>
-          <para>Specifies the output file name. The default is to print the extracted contents to
-            standand output in text format.</para>
-        </listitem>
-      </varlistentry>
-      <varlistentry>
-        <term><option>-p <replaceable>pageno</replaceable><replaceable>[,pageno,…]</replaceable></option></term>
-        <listitem>
-          <para>Specifies the comma-separated list of the page numbers to be extracted. Page numbers
-            are starting from one. By default, it extracts texts from all the pages.</para>
-        </listitem>
-      </varlistentry>
-      <varlistentry>
-        <term><option>-c <replaceable>codec</replaceable></option></term>
-        <listitem>
-          <para>Specifies the output codec.</para>
-        </listitem>
-      </varlistentry>
-      <varlistentry>
-        <term><option>-t <replaceable>type</replaceable></option></term>
-        <listitem>
-          <para>Specifies the output format. The following formats are currently supported:</para>
-          <variablelist>
-            <varlistentry>
-              <term>text</term>
-              <listitem>
-                <para>Text format. This is the default.</para>
-              </listitem>
-            </varlistentry>
-            <varlistentry>
-              <term>html</term>
-              <listitem>
-                <para>HTML format. It is not recommended.</para>
-              </listitem>
-            </varlistentry>
-            <varlistentry>
-              <term>xml</term>
-              <listitem>
-                <para>XML format. It provides the most information available.</para>
-              </listitem>
-            </varlistentry>
-            <varlistentry>
-              <term>tag</term>
-              <listitem>
-                <para>“Tagged PDF” format. A tagged PDF has its own contents annotated with
-                  HTML-like tags. <command>pdf2txt</command> tries to extract its content streams
-                  rather than inferring its text locations. Tags used here are defined in the <ulink
-                  url='http://www.adobe.com/devnet/acrobat/pdfs/pdf_reference_1-7.pdf'>PDF
-                  Reference, Sixth Edition</ulink> (§10.7 “Tagged PDF”).</para>
-              </listitem>
-            </varlistentry>
-          </variablelist>
-        </listitem>
-      </varlistentry>
-      <varlistentry>
-        <term><option>-T <replaceable>writing-mode</replaceable></option></term>
-        <listitem>
-          <para>Specifies the writing mode of text outputs:</para>
-          <variablelist>
-            <varlistentry>
-              <term>lr-tb</term>
-              <listitem>
-                <para>Left-to-right, top-to-bottom.</para>
-              </listitem>
-            </varlistentry>
-            <varlistentry>
-              <term>tb-rl</term>
-              <listitem>
-                <para>Top-to-bottom, right-to-left.</para>
-              </listitem>
-            </varlistentry>
-            <varlistentry>
-              <term>auto</term>
-              <listitem>
-                <para>Determine writing mode automatically</para>
-              </listitem>
-            </varlistentry>
-          </variablelist>
-        </listitem>
-      </varlistentry>
-      <varlistentry>
-        <term><option>-M <replaceable>char-margin</replaceable></option></term>
-        <term><option>-L <replaceable>line-margin</replaceable></option></term>
-        <term><option>-W <replaceable>word-margin</replaceable></option></term>
-        <listitem>
-          <para>
-            These are the parameters used for layout analysis. In an actual PDF file, texts might be
-            split into several chunks in the middle of its running, depending on the authoring
-            software. Therefore, text extraction needs to splice text chunks. In the figure below,
-            two text chunks whose distance is closer than the <replaceable>char-margin</replaceable>
-            is considered continuous and get grouped into one. Also, two lines whose distance is
-            closer than the <replaceable>line-margin</replaceable> is grouped as a text box, which
-            is a rectangular area that contains a “cluster” of texts. Furthermore, it may be
-            required to insert blank characters (spaces) as necessary if the distance between two
-            words is greater than the <replaceable>word-margin</replaceable>, as a blank between
-            words might not be represented as a space, but indicated by the positioning of each word.
-          </para>
-          <para>
-            Each value is specified not as an actual length, but as a proportion of the length to
-            the size of each character in question. The default values are
-            <replaceable>char-margin</replaceable> = 1.0, <replaceable>line-margin</replaceable> =
-            0.3, and <replaceable>W = 0.2</replaceable>, respectively.
-          </para>
-        </listitem>
-      </varlistentry>
-      <varlistentry>
-        <term><option>-n</option></term>
-        <listitem>
-          <para>Suppress layout analysis.</para>
-        </listitem>
-      </varlistentry>
-      <varlistentry>
-        <term><option>-A</option></term>
-        <listitem>
-          <para>Force to perform layout analysis for all the text strings, including texts contained
-            in figures.</para>
-        </listitem>
-      </varlistentry>
-      <varlistentry>
-        <term><option>-s <replaceable>scale</replaceable></option></term>
-        <listitem>
-          <para>Specifies the output scale. This option can be used in HTML format only.</para>
-        </listitem>
-      </varlistentry>
-      <varlistentry>
-        <term><option>-m <replaceable>n</replaceable></option></term>
-        <listitem>
-          <para>Specifies the maximum number of pages to extract. By default, all the pages in a
-            document are extracted.</para>
-        </listitem>
-      </varlistentry>
-      <varlistentry>
-        <term><option>-P <replaceable>password</replaceable></option></term>
-        <listitem>
-          <para>Provides the user password to access PDF contents.</para>
-        </listitem>
-      </varlistentry>
-      <varlistentry>
-        <term><option>-d</option></term>
-        <listitem>
-          <para>Increase the debug level.</para>
-        </listitem>
-      </varlistentry>
-    </variablelist>
-  </refsection>
-  <refsection>
-    <title>Examples</title>
-    <para>
-      Extract text as an HTML file whose filename is output.html:
-      <screen><prompt>$</prompt> <command>pdf2txt</command> -o output.html samples/naacl06-shinyama.pdf</screen>
-    </para>
-    <para>
-      Extract a Japanese HTML file in vertical writing:
-      <screen><prompt>$</prompt> <command>pdf2txt</command> -c euc-jp -D tb-rl -o output.html samples/jo.pdf</screen>
-    </para>
-    <para>
-      Extract text from an encrypted PDF file:
-      <screen><prompt>$</prompt> <command>pdf2txt</command> -P mypassword -o output.txt secret.pdf</screen>
-    </para>
-  </refsection>
-  <refsection>
-    <title>See also</title>
-    <para>
-      <citerefentry><refentrytitle>dumppdf</refentrytitle><manvolnum>1</manvolnum></citerefentry>
-    </para>
-  </refsection>
-</refentry>
-
-<!-- vim:set ts=2 sw=2 et:-->




More information about the Python-modules-commits mailing list