[Python-modules-commits] [pdftables] 01/03: Imported Upstream version 0.0.4

Sandro Tosi morph at moszumanska.debian.org
Tue Jul 7 20:41:36 UTC 2015


This is an automated email from the git hooks/post-receive script.

morph pushed a commit to branch bpo8
in repository pdftables.

commit c8e16958c4abbbe6a6cd9344f99f62061c92fd69
Author: Sandro Tosi <morph at debian.org>
Date:   Tue Jul 7 16:24:07 2015 -0400

    Imported Upstream version 0.0.4
---
 PKG-INFO                                  |  17 +
 pdftables.egg-info/PKG-INFO               |  17 +
 pdftables.egg-info/SOURCES.txt            |  25 ++
 pdftables.egg-info/dependency_links.txt   |   1 +
 pdftables.egg-info/entry_points.txt       |   2 +
 pdftables.egg-info/namespace_packages.txt |   1 +
 pdftables.egg-info/not-zip-safe           |   1 +
 pdftables.egg-info/requires.txt           |   2 +
 pdftables.egg-info/top_level.txt          |   1 +
 pdftables/TableFinder.py                  | 104 +++++
 pdftables/__init__.py                     |   1 +
 pdftables/counter.py                      | 189 +++++++++
 pdftables/display.py                      |  65 +++
 pdftables/pdftables.py                    | 641 ++++++++++++++++++++++++++++++
 pdftables/pdftables_analysis.py           | 140 +++++++
 pdftables/runtables.py                    | 101 +++++
 pdftables/tree.py                         | 104 +++++
 setup.cfg                                 |   5 +
 setup.py                                  |  37 ++
 test/test_Table_class.py                  |  35 ++
 test/test_all_sample_data.py              |  57 +++
 test/test_comb.py                         |  72 ++++
 test/test_contains_tables.py              |  43 ++
 test/test_finds_tables.py                 |  24 ++
 test/test_get_tables.py                   | 128 ++++++
 test/test_tree.py                         |  29 ++
 26 files changed, 1842 insertions(+)

diff --git a/PKG-INFO b/PKG-INFO
new file mode 100644
index 0000000..6a007b1
--- /dev/null
+++ b/PKG-INFO
@@ -0,0 +1,17 @@
+Metadata-Version: 1.1
+Name: pdftables
+Version: 0.0.4
+Summary: Parses PDFs and extracts what it believes to be tables.
+Home-page: http://scraperwiki.com
+Author: ScraperWiki Ltd
+Author-email: feedback at scraperwiki.com
+License: BSD
+Description: 
+        PDFTables helps with extracting tables from PDF files.
+        
+Platform: UNKNOWN
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Operating System :: POSIX :: Linux
+Classifier: Programming Language :: Python
diff --git a/pdftables.egg-info/PKG-INFO b/pdftables.egg-info/PKG-INFO
new file mode 100644
index 0000000..6a007b1
--- /dev/null
+++ b/pdftables.egg-info/PKG-INFO
@@ -0,0 +1,17 @@
+Metadata-Version: 1.1
+Name: pdftables
+Version: 0.0.4
+Summary: Parses PDFs and extracts what it believes to be tables.
+Home-page: http://scraperwiki.com
+Author: ScraperWiki Ltd
+Author-email: feedback at scraperwiki.com
+License: BSD
+Description: 
+        PDFTables helps with extracting tables from PDF files.
+        
+Platform: UNKNOWN
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Operating System :: POSIX :: Linux
+Classifier: Programming Language :: Python
diff --git a/pdftables.egg-info/SOURCES.txt b/pdftables.egg-info/SOURCES.txt
new file mode 100644
index 0000000..d0f30af
--- /dev/null
+++ b/pdftables.egg-info/SOURCES.txt
@@ -0,0 +1,25 @@
+setup.cfg
+setup.py
+pdftables/TableFinder.py
+pdftables/__init__.py
+pdftables/counter.py
+pdftables/display.py
+pdftables/pdftables.py
+pdftables/pdftables_analysis.py
+pdftables/runtables.py
+pdftables/tree.py
+pdftables.egg-info/PKG-INFO
+pdftables.egg-info/SOURCES.txt
+pdftables.egg-info/dependency_links.txt
+pdftables.egg-info/entry_points.txt
+pdftables.egg-info/namespace_packages.txt
+pdftables.egg-info/not-zip-safe
+pdftables.egg-info/requires.txt
+pdftables.egg-info/top_level.txt
+test/test_Table_class.py
+test/test_all_sample_data.py
+test/test_comb.py
+test/test_contains_tables.py
+test/test_finds_tables.py
+test/test_get_tables.py
+test/test_tree.py
\ No newline at end of file
diff --git a/pdftables.egg-info/dependency_links.txt b/pdftables.egg-info/dependency_links.txt
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/pdftables.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
diff --git a/pdftables.egg-info/entry_points.txt b/pdftables.egg-info/entry_points.txt
new file mode 100644
index 0000000..b0dac65
--- /dev/null
+++ b/pdftables.egg-info/entry_points.txt
@@ -0,0 +1,2 @@
+
+    
\ No newline at end of file
diff --git a/pdftables.egg-info/namespace_packages.txt b/pdftables.egg-info/namespace_packages.txt
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/pdftables.egg-info/namespace_packages.txt
@@ -0,0 +1 @@
+
diff --git a/pdftables.egg-info/not-zip-safe b/pdftables.egg-info/not-zip-safe
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/pdftables.egg-info/not-zip-safe
@@ -0,0 +1 @@
+
diff --git a/pdftables.egg-info/requires.txt b/pdftables.egg-info/requires.txt
new file mode 100644
index 0000000..a71be9a
--- /dev/null
+++ b/pdftables.egg-info/requires.txt
@@ -0,0 +1,2 @@
+pdfminer==20110515
+numpy>=1.6.2
\ No newline at end of file
diff --git a/pdftables.egg-info/top_level.txt b/pdftables.egg-info/top_level.txt
new file mode 100644
index 0000000..3b79530
--- /dev/null
+++ b/pdftables.egg-info/top_level.txt
@@ -0,0 +1 @@
+pdftables
diff --git a/pdftables/TableFinder.py b/pdftables/TableFinder.py
new file mode 100644
index 0000000..6207f52
--- /dev/null
+++ b/pdftables/TableFinder.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python
+# ScraperWiki Limited
+# Ian Hopkinson, 2013-06-14
+# -*- coding: utf-8 -*-
+
+"""
+Code to find tables in PDF files
+"""
+
+import os
+# import requests
+import scraperwiki # pdftoxml does not work on Windows
+import lxml.html
+import glob
+import matplotlib.pyplot as plt
+import collections
+from counter import Counter
+
+# TODO - Use pdfminer
+# TODO
+
+def pdftoxml(filename,options):
+    ConverterPath = unicode(r'C:\Users\Ian\BitBucketRepos\0939-AgraInforma\bin\pdftohtml.exe')
+    directory = os.path.split(filename)[0]
+    tmpxml = os.path.join(directory,"temph.xml")
+    if tmpxml in os.listdir('.'):
+        os.remove(tmpxml)
+    cmd = '%s -xml %s "%s" %s' % (ConverterPath, options, filename, os.path.splitext(tmpxml)[0])
+
+    os.system(cmd)
+
+    f = open(tmpxml,'rb')
+    content = f.read()
+    f.close()
+
+    return content
+
+def processpage(page):
+    left=[]
+    width=[]
+    top=[]
+    right=[]
+    for textchunk in (page is not None and page.xpath('text')):
+        thisleft = int(textchunk.attrib.get('left'))
+        thiswidth = int(textchunk.attrib.get('width'))
+        left.append(thisleft)
+        width.append(thiswidth)
+        top.append(pageheight - int(textchunk.attrib.get('top')))
+        right.append(thisleft + thiswidth)
+
+    return pageheight,pagewidth,left,top,right
+
+def plotpage(pageheight,pagewidth,pagenumber,SelectedPDF,left,top,right):
+    fig = plt.figure()
+    ax1 = fig.add_subplot(111)
+    ax1.axis('equal')
+    ax1.plot([0,pagewidth,pagewidth,0,0],[0,0,pageheight,pageheight,0])
+    ax1.scatter(left, top, s=10, c='b', marker="s")
+    ax1.scatter(right, top, s=10, c='r', marker="o")
+    fig.suptitle('%s : Page %d' % (SelectedPDF,pagenumber), fontsize=15)
+    plt.show()
+    return fig
+
+PDF_TEST_FILES = unicode(r'C:\Users\Ian\BitBucketRepos\0939-AgraInforma\fixtures')
+
+# PDFList = glob.glob(os.path.join(PDF_TEST_FILES,'*.pdf'))
+
+# SelectedPDF = 6 # 6 = cit0613.pdf - table is actually an image
+
+# r = requests.get(os.path.join(PDF_TEST_FILES,PDFList[SelectedPDF]))
+# options = ""
+# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,PDFList[SelectedPDF]),options)
+
+# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"cit0613.pdf"),options) # Works but first page is an image
+# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"2012.01.PosRpt.pdf"),options) # PDF to HTML does not like
+# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"COPAWEEKLYJUNE52013.pdf"),options)
+# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"COPAMONTHLYMay2013.pdf"),options) # lxml doesn't like this one, interleaved <b> and <i> tags
+# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"13_06_12_10_36_58_boletim_ingles_junho_2013.pdf"),options) # Long document with many tables
+# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"1359397366Final_Coceral grain estimate_2012_December.pdf"),options)
+# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"ClinicalResearchDisclosureReport2012Q2.pdf"),options) # throws not allowed
+# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"argentina_diputados_voting_record.pdf"),options)
+# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"bo_page24.pdf"),options) # Multi-column text and tables mixed on the page
+# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"tabla_subsidios.pdf"),options) # Multi-column text and tables mixed on the page
+SelectedPDF = "argentina_diputados_voting_record.pdf"
+
+xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,SelectedPDF),options)
+
+root = lxml.etree.fromstring(xmldata)
+pages = list(root)
+
+# This is ok but
+
+
+for page in pages:
+    pagenumber = int(page.attrib.get("number"))
+    pagewidth = int(page.attrib.get("width"))
+    pageheight = int(page.attrib.get("height"))
+
+    pageheight,pagewidth,left,top,right = processpage(page)
+
+    fig = plotpage(pageheight,pagewidth,pagenumber,SelectedPDF,left,top,right)
+
+
+    # counter=Counter(left)
diff --git a/pdftables/__init__.py b/pdftables/__init__.py
new file mode 100644
index 0000000..32e5081
--- /dev/null
+++ b/pdftables/__init__.py
@@ -0,0 +1 @@
+from pdftables import *
diff --git a/pdftables/counter.py b/pdftables/counter.py
new file mode 100644
index 0000000..5fbd5c8
--- /dev/null
+++ b/pdftables/counter.py
@@ -0,0 +1,189 @@
+from operator import itemgetter
+from heapq import nlargest
+from itertools import repeat, ifilter
+
+class Counter(dict):
+    '''Dict subclass for counting hashable objects.  Sometimes called a bag
+    or multiset.  Elements are stored as dictionary keys and their counts
+    are stored as dictionary values.
+
+    >>> Counter('zyzygy')
+    Counter({'y': 3, 'z': 2, 'g': 1})
+
+    '''
+
+    def __init__(self, iterable=None, **kwds):
+        '''Create a new, empty Counter object.  And if given, count elements
+        from an input iterable.  Or, initialize the count from another mapping
+        of elements to their counts.
+
+        >>> c = Counter()                           # a new, empty counter
+        >>> c = Counter('gallahad')                 # a new counter from an iterable
+        >>> c = Counter({'a': 4, 'b': 2})           # a new counter from a mapping
+        >>> c = Counter(a=4, b=2)                   # a new counter from keyword args
+
+        '''        
+        self.update(iterable, **kwds)
+
+    def __missing__(self, key):
+        return 0
+
+    def most_common(self, n=None):
+        '''List the n most common elements and their counts from the most
+        common to the least.  If n is None, then list all element counts.
+
+        >>> Counter('abracadabra').most_common(3)
+        [('a', 5), ('r', 2), ('b', 2)]
+
+        '''        
+        if n is None:
+            return sorted(self.iteritems(), key=itemgetter(1), reverse=True)
+        return nlargest(n, self.iteritems(), key=itemgetter(1))
+
+    def elements(self):
+        '''Iterator over elements repeating each as many times as its count.
+
+        >>> c = Counter('ABCABC')
+        >>> sorted(c.elements())
+        ['A', 'A', 'B', 'B', 'C', 'C']
+
+        If an element's count has been set to zero or is a negative number,
+        elements() will ignore it.
+
+        '''
+        for elem, count in self.iteritems():
+            for _ in repeat(None, count):
+                yield elem
+
+    # Override dict methods where the meaning changes for Counter objects.
+
+    @classmethod
+    def fromkeys(cls, iterable, v=None):
+        raise NotImplementedError(
+            'Counter.fromkeys() is undefined.  Use Counter(iterable) instead.')
+
+    def update(self, iterable=None, **kwds):
+        '''Like dict.update() but add counts instead of replacing them.
+
+        Source can be an iterable, a dictionary, or another Counter instance.
+
+        >>> c = Counter('which')
+        >>> c.update('witch')           # add elements from another iterable
+        >>> d = Counter('watch')
+        >>> c.update(d)                 # add elements from another counter
+        >>> c['h']                      # four 'h' in which, witch, and watch
+        4
+
+        '''        
+        if iterable is not None:
+            if hasattr(iterable, 'iteritems'):
+                if self:
+                    self_get = self.get
+                    for elem, count in iterable.iteritems():
+                        self[elem] = self_get(elem, 0) + count
+                else:
+                    dict.update(self, iterable) # fast path when counter is empty
+            else:
+                self_get = self.get
+                for elem in iterable:
+                    self[elem] = self_get(elem, 0) + 1
+        if kwds:
+            self.update(kwds)
+
+    def copy(self):
+        'Like dict.copy() but returns a Counter instance instead of a dict.'
+        return Counter(self)
+
+    def __delitem__(self, elem):
+        'Like dict.__delitem__() but does not raise KeyError for missing values.'
+        if elem in self:
+            dict.__delitem__(self, elem)
+
+    def __repr__(self):
+        if not self:
+            return '%s()' % self.__class__.__name__
+        items = ', '.join(map('%r: %r'.__mod__, self.most_common()))
+        return '%s({%s})' % (self.__class__.__name__, items)
+
+    # Multiset-style mathematical operations discussed in:
+    #       Knuth TAOCP Volume II section 4.6.3 exercise 19
+    #       and at http://en.wikipedia.org/wiki/Multiset
+    #
+    # Outputs guaranteed to only include positive counts.
+    #
+    # To strip negative and zero counts, add-in an empty counter:
+    #       c += Counter()
+
+    def __add__(self, other):
+        '''Add counts from two counters.
+
+        >>> Counter('abbb') + Counter('bcc')
+        Counter({'b': 4, 'c': 2, 'a': 1})
+
+
+        '''
+        if not isinstance(other, Counter):
+            return NotImplemented
+        result = Counter()
+        for elem in set(self) | set(other):
+            newcount = self[elem] + other[elem]
+            if newcount > 0:
+                result[elem] = newcount
+        return result
+
+    def __sub__(self, other):
+        ''' Subtract count, but keep only results with positive counts.
+
+        >>> Counter('abbbc') - Counter('bccd')
+        Counter({'b': 2, 'a': 1})
+
+        '''
+        if not isinstance(other, Counter):
+            return NotImplemented
+        result = Counter()
+        for elem in set(self) | set(other):
+            newcount = self[elem] - other[elem]
+            if newcount > 0:
+                result[elem] = newcount
+        return result
+
+    def __or__(self, other):
+        '''Union is the maximum of value in either of the input counters.
+
+        >>> Counter('abbb') | Counter('bcc')
+        Counter({'b': 3, 'c': 2, 'a': 1})
+
+        '''
+        if not isinstance(other, Counter):
+            return NotImplemented
+        _max = max
+        result = Counter()
+        for elem in set(self) | set(other):
+            newcount = _max(self[elem], other[elem])
+            if newcount > 0:
+                result[elem] = newcount
+        return result
+
+    def __and__(self, other):
+        ''' Intersection is the minimum of corresponding counts.
+
+        >>> Counter('abbb') & Counter('bcc')
+        Counter({'b': 1})
+
+        '''
+        if not isinstance(other, Counter):
+            return NotImplemented
+        _min = min
+        result = Counter()
+        if len(self) < len(other):
+            self, other = other, self
+        for elem in ifilter(self.__contains__, other):
+            newcount = _min(self[elem], other[elem])
+            if newcount > 0:
+                result[elem] = newcount
+        return result
+
+
+if __name__ == '__main__':
+    import doctest
+    print doctest.testmod()
diff --git a/pdftables/display.py b/pdftables/display.py
new file mode 100755
index 0000000..7b606df
--- /dev/null
+++ b/pdftables/display.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python
+from __future__ import unicode_literals
+from collections import defaultdict
+from StringIO import StringIO
+
+
+def to_string(table):
+    """
+    Returns the table rendered as a unicode string with column and row indices
+    >>> type(to_string([['foo', 'goodbye'], ['llama', 'bar']]))
+    <type 'unicode'>
+    """
+    result = StringIO()
+
+    (columns, rows) = get_dimensions(table)
+        
+    result.write("     {} columns, {} rows\n".format(columns, rows))
+    col_widths = find_column_widths(table)
+    table_width = sum(col_widths) + len(col_widths) + 2
+    hbar = '    {}\n'.format('-' * table_width)
+
+    result.write("      {}\n".format(' '.join(
+        [unicode(col_index).rjust(width, ' ') for (col_index, width)
+         in enumerate(col_widths)])))
+
+    result.write(hbar)
+    for row_index, row in enumerate(table):
+        cells = [cell.rjust(width, ' ') for (cell, width)
+                 in zip(row, col_widths)]
+        result.write("{:>3} | {}|\n".format(row_index, '|'.join(cells)))
+    result.write(hbar)
+    result.seek(0)
+    return unicode(result.read())
+
+
+def get_dimensions(table):
+    """
+    Returns columns, rows for a table.
+    >>> get_dimensions([['row1', 'apple', 'llama'], ['row2', 'plum', 'goat']])
+    (3, 2)
+    >>> get_dimensions([['row1', 'apple', 'llama'], ['row2', 'banana']])
+    (3, 2)
+    """
+    rows = len(table)
+    try:
+        cols = max(len(row) for row in table)
+    except ValueError:
+        cols = 0
+    return (cols, rows)
+
+
+def find_column_widths(table):
+    """
+    Returns a list of the maximum width for each column across all rows
+    >>> find_column_widths([['foo', 'goodbye'], ['llama', 'bar']])
+    [5, 7]
+    """
+    col_widths = defaultdict(lambda: 0)
+    for row_index, row in enumerate(table):
+        for column_index, cell in enumerate(row):
+            col_widths[column_index] = max(col_widths[column_index], len(cell))
+    return [col_widths[col] for col in sorted(col_widths)]
+
+if __name__ == '__main__':
+    print(to_string([['foo', 'goodbye'], ['llama', 'bar']]))
diff --git a/pdftables/pdftables.py b/pdftables/pdftables.py
new file mode 100755
index 0000000..5200ed7
--- /dev/null
+++ b/pdftables/pdftables.py
@@ -0,0 +1,641 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# ScraperWiki Limited
+# Ian Hopkinson, 2013-06-04
+
+from __future__ import unicode_literals
+"""
+Some experiments with pdfminer
+http://www.unixuser.org/~euske/python/pdfminer/programming.html
+Some help here:
+http://denis.papathanasiou.org/2010/08/04/extracting-text-images-from-pdf-files
+"""
+
+# TODO Identify multi-column text, for multicolumn text detect per column
+# TODO Dynamic / smarter thresholding
+# TODO Handle argentina_diputados_voting_record.pdf automatically
+# TODO Handle multiple tables on one page
+
+
+import sys
+import codecs
+
+from pdfminer.pdfparser import PDFParser, PDFDocument
+from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
+from pdfminer.pdfdevice import PDFDevice
+from pdfminer.layout import LAParams, LTPage
+from pdfminer.converter import PDFPageAggregator
+
+import collections
+
+from tree import Leaf, LeafList
+import requests  # TODO: remove this dependency
+from cStringIO import StringIO
+import math
+import numpy # TODO: remove this dependency
+from counter import Counter
+
+IS_TABLE_COLUMN_COUNT_THRESHOLD = 3
+IS_TABLE_ROW_COUNT_THRESHOLD = 3
+
+class TableDiagnosticData(object):
+    def __init__(self, box_list=LeafList(), top_plot=dict(), left_plot=dict(), x_comb=[], y_comb=[]):
+        self.box_list = box_list
+        self.top_plot = top_plot
+        self.left_plot = left_plot
+        self.x_comb = x_comb
+        self.y_comb = y_comb
+
+class Table(list):
+    def __init__(self, content, page, page_total, table_index, table_index_total):
+        super(Table, self).__init__(content)
+        self.page_number = page
+        self.total_pages = page_total
+        self.table_number_on_page = table_index
+        self.total_tables_on_page = table_index_total
+
+LEFT = 0
+TOP = 3
+RIGHT = 2
+BOTTOM = 1
+
+def get_tables(fh):
+    """
+    Return a list of 'tables' from the given file handle, where a table is a
+    list of rows, and a row is a list of strings.
+    """
+    result = []
+    doc, interpreter, device = initialize_pdf_miner(fh)
+    doc_length = len(list(doc.get_pages()))
+    for i, pdf_page in enumerate(doc.get_pages()):
+        #print("Trying page {}".format(i + 1))
+        if not page_contains_tables(pdf_page, interpreter, device):
+            #print("Skipping page {}: no tables.".format(i + 1))
+            continue
+
+        # receive the LTPage object for the page.
+        interpreter.process_page(pdf_page)
+        processed_page = device.get_result()
+
+        (table, _) = page_to_tables(
+            processed_page,
+            extend_y=True,
+            hints=[],
+            atomise=True)
+        crop_table(table)
+        result.append(Table(table,i+1,doc_length,1,1))
+
+    return result
+
+
+def crop_table(table):
+    """
+    Remove empty rows from the top and bottom of the table.
+    """
+    for row in list(table):  # top -> bottom
+        if not any(cell.strip() for cell in row):
+            table.remove(row)
+        else:
+            break
+
+    for row in list(reversed(table)):  # bottom -> top
+        if not any(cell.strip() for cell in row):
+            table.remove(row)
+        else:
+            break
+
+
+def initialize_pdf_miner(fh):
+    # Create a PDF parser object associated with the file object.
+    parser = PDFParser(fh)
+    # Create a PDF document object that stores the document structure.
+    doc = PDFDocument()
+    # Connect the parser and document objects.
+    parser.set_document(doc)
+    doc.set_parser(parser)
+    # Supply the password for initialization.
+    # (If no password is set, give an empty string.)
+    doc.initialize("")
+    # Check if the document allows text extraction. If not, abort.
+    if not doc.is_extractable:
+        raise ValueError("PDFDocument is_extractable was False.")
+    # Create a PDF resource manager object that stores shared resources.
+    rsrcmgr = PDFResourceManager()
+    # Create a PDF device object.
+    device = PDFDevice(rsrcmgr)
+    # Create a PDF interpreter object.
+    interpreter = PDFPageInterpreter(rsrcmgr, device)
+    # Process each page contained in the document.
+    # for page in doc.get_pages():
+    #    interpreter.process_page(page)
+
+    # Set parameters for analysis.
+    laparams = LAParams()
+    laparams.word_margin = 0.0
+    # Create a PDF page aggregator object.
+    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
+    interpreter = PDFPageInterpreter(rsrcmgr, device)
+    return doc, interpreter, device
+
+
+def contains_tables(fh):
+    """
+    contains_tables(fh) takes a file handle and returns a list of booleans, one
+    per page of the document, which is True for pages that contain tables
+    """
+    doc, interpreter, device = initialize_pdf_miner(fh)
+
+    return [page_contains_tables(p, interpreter, device) for
+            p in doc.get_pages()]
+
+
+def page_contains_tables(pdf_page, interpreter, device):
+    # TODO: hide doc, interpreter, device inside a higher level Pdf class. It's
+    # silly that we have to care about these (see function signature!!)
+
+    interpreter.process_page(pdf_page)
+    # receive the LTPage object for the page.
+    layout = device.get_result()
+    box_list = LeafList().populate(layout)
+    for item in box_list:
+        assert isinstance(item, Leaf), "NOT LEAF"
+    yhist = box_list.histogram(Leaf._top).rounder(1)
+
+    test = [k for k, v in yhist.items() if v > IS_TABLE_COLUMN_COUNT_THRESHOLD]
+    return len(test) > IS_TABLE_ROW_COUNT_THRESHOLD
+
+
+def threshold_above(hist, threshold_value):
+    """
+    >>> threshold_above(Counter({518: 10, 520: 20, 530: 20, \
+                                             525: 17}), 15)
+    [520, 530, 525]
+    """
+    if not isinstance(hist, Counter):
+        raise ValueError("requires Counter")  # TypeError then?
+
+    above = [k for k, v in hist.items() if v > threshold_value]
+    return above
+
+
+def comb(combarray, value):
+    """
+    Takes a sorted array and returns the interval number of the value passed to
+    the function
+    """
+    # Raise an error if combarray is not sorted
+    if (combarray != sorted(combarray)) and (combarray != sorted(
+            combarray, reverse=True)):
+        raise Exception("comb: combarray is not sorted")
+
+    index = -1
+    if combarray[0] > combarray[-1]:
+        for i in range(1, len(combarray)):
+            if combarray[i - 1] >= value >= combarray[i]:
+                index = i - 1
+    else:
+        for i in range(1, len(combarray)):
+            if combarray[i - 1] <= value <= combarray[i]:
+                index = i - 1
+
+    return index
+
+
+def apply_combs(box_list, x_comb, y_comb):
+    """Allocates text to table cells using the x and y combs"""
+    ncolumns = len(x_comb) - 1
+    nrows = len(y_comb) - 1
+    table_array = [[''] * ncolumns for j in range(nrows)]
+    for box in box_list:
+        y = round(box.midline)
+        x = round(box.centreline)
+        rowindex = comb(y_comb, y)
+        columnindex = comb(x_comb, x)
+        if rowindex != -1 and columnindex != -1:
+            # there was already some content at this coordinate so we
+            # concatenate (in an arbitrary order!)
+            table_array[rowindex][columnindex] += box.text.rstrip('\n\r')
+
+    return table_array
+
+
+def comb_from_projection(projection, threshold, orientation):
+    """Calculates the boundaries between cells from the projection of the boxes
+    onto either the y axis (for rows) or the x-axis (for columns). These
+    boundaries are known as the comb
+    """
+    if orientation=="row":
+        tol=1
+    elif orientation=="column":
+        tol=3
+
+    projection_threshold = threshold_above(projection, threshold)
+
+    projection_threshold = sorted(projection_threshold)
+    # need to generate a list of uppers (right or top edges)
+    # and a list of lowers (left or bottom edges)
+
+    # uppers = [k for k, v in yhisttop.items() if v > yThreshold]
+    # lowers = [k for k, v in yhistbottom.items() if v > yThreshold]
+    uppers = []
+    lowers = []
+
+    lowers.append(projection_threshold[0])
+    for i in range(1, len(projection_threshold)):
+        if projection_threshold[i] > (
+                projection_threshold[i-1] + 1):
+            uppers.append(projection_threshold[i - 1])
+            lowers.append(projection_threshold[i])
+    uppers.append(projection_threshold[-1])
+
+    comb = comb_from_uppers_and_lowers(uppers, lowers, tol=tol,
+                                       projection = projection)
+    comb.reverse()
+
+    return comb
+
+
+def comb_from_uppers_and_lowers(uppers, lowers, tol=1, projection=dict()):
+    """Called by comb_from_projection to calculate the comb given a set of
+    uppers and lowers, which are upper and lower edges of the thresholded
+    projection"""
+    # tol is a tolerance to remove very small minima, increasing to 2 fouls up
+    # row separation
+    assert len(uppers) == len(lowers)
+    uppers.sort(reverse=True)
+    lowers.sort(reverse=True)
+    comb = []
+    comb.append(uppers[0])
+    for i in range(1, len(uppers)):
+        if (lowers[i - 1]-uppers[i])>tol:
+            comb.append(find_minima(lowers[i - 1], uppers[i], projection))
+            #comb.append(find_minima(lowers[i - 1], uppers[i]))
+
+    comb.append(lowers[-1])
+
+    return comb
+
+def find_minima(lower, upper, projection=dict()):
+
+    #print lower, upper, projection
+    if len(projection)==0:
+        idx = (lower + upper) / 2.0
+    else:
+        profile = []
+        for i in range(upper, lower):
+            #print projection[i]
+            profile.append(projection[i])
+
+        val, idx = min((val, idx) for (idx, val) in enumerate(profile))
+        #val, idx = min(profile)
+        idx = upper + idx
+
+    return idx
+
+def comb_extend(comb, minv, maxv):
+    """Extend the comb to minv and maxv"""
+    # TODO should this truncate if minv>minc or maxv<maxc
+    # print y_comb
+    # Find sort order of comb, convert to ascending
+    reversed = False
+    if comb[0] > comb[-1]:
+        comb.reverse()
+        reversed = True
+    # Find min and max of comb
+    minc = comb[0]
+    maxc = comb[-1]
+    # Get average row spacing
+    rowSpacing = numpy.average(numpy.diff(comb))
+    # Extend minimum
+    if minv < minc:
+        comb.reverse()
+        comb.extend(list(numpy.arange(minc, minv, -rowSpacing))[1:])
+        comb.reverse()
+    # Extend maximum
+    if maxv > maxc:
+        comb.extend(list(numpy.arange(maxc, maxv, rowSpacing))[1:])
+
+    if reversed:
+        comb.reverse()
+    return comb
+
+
+def project_boxes(box_list, orientation, erosion=0):
+    """
+    Take a set of boxes and project their extent onto an axis
+    """
+    if orientation == "column":
+        upper = RIGHT
+        lower = LEFT
+    elif orientation == "row":
+        upper = TOP
+        lower = BOTTOM
+
+    projection = {}
+    minv = round(min([box.bbox[lower]
+                 for box in box_list])) - 2  # ensure some overlap
+    maxv = round(max([box.bbox[upper] for box in box_list])) + 2
+
+    # Initialise projection structure
+    # print minv, maxv
+    coords = range(int(minv), int(maxv))
+    projection = coords
+
+    # print projection
+    for box in box_list:
+        for i in range(int(round(box.bbox[lower])) + erosion,
+                       int(round(box.bbox[upper])) - erosion):
+            # projection[i] += 1
+            projection.append(i)
+
+    return Counter(projection)
+
+
+def get_pdf_page(fh, pagenumber):
+    doc, interpreter, device = initialize_pdf_miner(fh)
+    pages = list(doc.get_pages())
+
+    try:
+        page = pages[pagenumber - 1]
+    except IndexError:
+        raise IndexError("Invalid page number")
+
+    interpreter.process_page(page)
+    # receive the LTPage object for the page.
+    processedPage = device.get_result()
+    return processedPage
+
+# def getTable(fh, page, extend_y=False, hints=[]):
+#    """placeholder for tests, refactor out"""
+#    return page_to_tables(get_pdf_page(fh, page), extend_y, hints)
+
+
+def get_min_and_max_y_from_hints(box_list, top_string, bottom_string):
+    miny = None
+    maxy = None
+    if top_string:
+        top_box = [box for box in box_list if top_string in box.text]
+        if top_box:
+            maxy = top_box[0].top
+    if bottom_string:
+        bottomBox = [box for box in box_list if bottom_string in box.text]
+        if bottomBox:
+            miny = bottomBox[0].bottom
+    return miny, maxy
+
+
+def rounder(val, tol):
+    """
+    Utility function to round numbers to arbitrary tolerance
+    """
+    return round((1.0 * val) / tol) * tol
+
+
+#def filter_box_list_by_type(box_list, flt):
+#    return [box for box in box_list if box.classname in flt]
+
+
+def multi_column_detect(page):
+    #TODO This function is under construction
+    """
+    Test for multiColumns from a box_list, returns an integer number of columns
+    and a set of (left, right) pairs delineating any columns
+    """
+    # Ways to identify multicolumns:
+    # 1. High fill factor compared to tables
+    # 2. Gullies at textwidth/2, (textwidth/3, 2*textwidth/3)...
+    # 3. Histogram of boxwidths with peak at some fraction of page width
+    # This is like project_boxes but we are projecting the length of the
+    # textbox onto the axis
+    box_list = LeafList().populate(
+        page, ['LTPage', 'LTTextLineHorizontal']).purge_empty_text()
+
+    # Should use the LTPage object to get page bounding box
+    box_list = filter_box_list_by_type(box_list, 'LTTextLineHorizontal')
+    pile = {}
+    vstep = 5  # should be scaled by modal row height
+    minv = rounder(
+        min([box.bottom for box in box_list]),
+        5)  # ensure some overlap
+    maxv = rounder(max([box.top for box in box_list]), 5)
+
+    minx = round(min([box.left for box in box_list]))  # ensure some overlap
+    maxx = round(max([box.right for box in box_list]))
+
+    # Initialise projection structure
+    # print minv, maxv
+
+    coords = range(int(minv), int(maxv) + vstep, vstep)
+
+    pile = collections.OrderedDict(zip(coords, [0] * len(coords)))
+    # print projection
+    for box in box_list:
+        # print int(rounder(box.midline, 30)), box.width
+        pile[int(rounder(box.midline, vstep))] += box.width
+
+    for key, value in pile.items():
+        pile[key] = value / (maxx - minx)
+
+    # Box width histogram
+    bstep = 10
+    boxhist = {}
+    boxwidthmin = rounder(min([box.width for box in box_list]), bstep)
+    boxwidthmax = rounder(max([box.width for box in box_list]), bstep)
+
+    coords = range(int(boxwidthmin), int(boxwidthmax) + bstep, bstep)
+    boxhist = collections.OrderedDict(zip(coords, [0] * len(coords)))
+    for box in box_list:
+        # print int(rounder(box.midline, 30)), box.width
+        boxhist[int(rounder(box.width, bstep))] += 1
+
+    nboxes = len(box_list)
+    for key, value in boxhist.items():
... 1035 lines suppressed ...

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/python-modules/packages/pdftables.git



More information about the Python-modules-commits mailing list