[Python-modules-commits] [pdftables] 01/03: Imported Upstream version 0.0.4
Sandro Tosi
morph at moszumanska.debian.org
Tue Jul 7 20:41:36 UTC 2015
This is an automated email from the git hooks/post-receive script.
morph pushed a commit to branch bpo8
in repository pdftables.
commit c8e16958c4abbbe6a6cd9344f99f62061c92fd69
Author: Sandro Tosi <morph at debian.org>
Date: Tue Jul 7 16:24:07 2015 -0400
Imported Upstream version 0.0.4
---
PKG-INFO | 17 +
pdftables.egg-info/PKG-INFO | 17 +
pdftables.egg-info/SOURCES.txt | 25 ++
pdftables.egg-info/dependency_links.txt | 1 +
pdftables.egg-info/entry_points.txt | 2 +
pdftables.egg-info/namespace_packages.txt | 1 +
pdftables.egg-info/not-zip-safe | 1 +
pdftables.egg-info/requires.txt | 2 +
pdftables.egg-info/top_level.txt | 1 +
pdftables/TableFinder.py | 104 +++++
pdftables/__init__.py | 1 +
pdftables/counter.py | 189 +++++++++
pdftables/display.py | 65 +++
pdftables/pdftables.py | 641 ++++++++++++++++++++++++++++++
pdftables/pdftables_analysis.py | 140 +++++++
pdftables/runtables.py | 101 +++++
pdftables/tree.py | 104 +++++
setup.cfg | 5 +
setup.py | 37 ++
test/test_Table_class.py | 35 ++
test/test_all_sample_data.py | 57 +++
test/test_comb.py | 72 ++++
test/test_contains_tables.py | 43 ++
test/test_finds_tables.py | 24 ++
test/test_get_tables.py | 128 ++++++
test/test_tree.py | 29 ++
26 files changed, 1842 insertions(+)
diff --git a/PKG-INFO b/PKG-INFO
new file mode 100644
index 0000000..6a007b1
--- /dev/null
+++ b/PKG-INFO
@@ -0,0 +1,17 @@
+Metadata-Version: 1.1
+Name: pdftables
+Version: 0.0.4
+Summary: Parses PDFs and extracts what it believes to be tables.
+Home-page: http://scraperwiki.com
+Author: ScraperWiki Ltd
+Author-email: feedback at scraperwiki.com
+License: BSD
+Description:
+ PDFTables helps with extracting tables from PDF files.
+
+Platform: UNKNOWN
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Operating System :: POSIX :: Linux
+Classifier: Programming Language :: Python
diff --git a/pdftables.egg-info/PKG-INFO b/pdftables.egg-info/PKG-INFO
new file mode 100644
index 0000000..6a007b1
--- /dev/null
+++ b/pdftables.egg-info/PKG-INFO
@@ -0,0 +1,17 @@
+Metadata-Version: 1.1
+Name: pdftables
+Version: 0.0.4
+Summary: Parses PDFs and extracts what it believes to be tables.
+Home-page: http://scraperwiki.com
+Author: ScraperWiki Ltd
+Author-email: feedback at scraperwiki.com
+License: BSD
+Description:
+ PDFTables helps with extracting tables from PDF files.
+
+Platform: UNKNOWN
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Operating System :: POSIX :: Linux
+Classifier: Programming Language :: Python
diff --git a/pdftables.egg-info/SOURCES.txt b/pdftables.egg-info/SOURCES.txt
new file mode 100644
index 0000000..d0f30af
--- /dev/null
+++ b/pdftables.egg-info/SOURCES.txt
@@ -0,0 +1,25 @@
+setup.cfg
+setup.py
+pdftables/TableFinder.py
+pdftables/__init__.py
+pdftables/counter.py
+pdftables/display.py
+pdftables/pdftables.py
+pdftables/pdftables_analysis.py
+pdftables/runtables.py
+pdftables/tree.py
+pdftables.egg-info/PKG-INFO
+pdftables.egg-info/SOURCES.txt
+pdftables.egg-info/dependency_links.txt
+pdftables.egg-info/entry_points.txt
+pdftables.egg-info/namespace_packages.txt
+pdftables.egg-info/not-zip-safe
+pdftables.egg-info/requires.txt
+pdftables.egg-info/top_level.txt
+test/test_Table_class.py
+test/test_all_sample_data.py
+test/test_comb.py
+test/test_contains_tables.py
+test/test_finds_tables.py
+test/test_get_tables.py
+test/test_tree.py
\ No newline at end of file
diff --git a/pdftables.egg-info/dependency_links.txt b/pdftables.egg-info/dependency_links.txt
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/pdftables.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
diff --git a/pdftables.egg-info/entry_points.txt b/pdftables.egg-info/entry_points.txt
new file mode 100644
index 0000000..b0dac65
--- /dev/null
+++ b/pdftables.egg-info/entry_points.txt
@@ -0,0 +1,2 @@
+
+
\ No newline at end of file
diff --git a/pdftables.egg-info/namespace_packages.txt b/pdftables.egg-info/namespace_packages.txt
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/pdftables.egg-info/namespace_packages.txt
@@ -0,0 +1 @@
+
diff --git a/pdftables.egg-info/not-zip-safe b/pdftables.egg-info/not-zip-safe
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/pdftables.egg-info/not-zip-safe
@@ -0,0 +1 @@
+
diff --git a/pdftables.egg-info/requires.txt b/pdftables.egg-info/requires.txt
new file mode 100644
index 0000000..a71be9a
--- /dev/null
+++ b/pdftables.egg-info/requires.txt
@@ -0,0 +1,2 @@
+pdfminer==20110515
+numpy>=1.6.2
\ No newline at end of file
diff --git a/pdftables.egg-info/top_level.txt b/pdftables.egg-info/top_level.txt
new file mode 100644
index 0000000..3b79530
--- /dev/null
+++ b/pdftables.egg-info/top_level.txt
@@ -0,0 +1 @@
+pdftables
diff --git a/pdftables/TableFinder.py b/pdftables/TableFinder.py
new file mode 100644
index 0000000..6207f52
--- /dev/null
+++ b/pdftables/TableFinder.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python
+# ScraperWiki Limited
+# Ian Hopkinson, 2013-06-14
+# -*- coding: utf-8 -*-
+
+"""
+Code to find tables in PDF files
+"""
+
+import os
+# import requests
+import scraperwiki # pdftoxml does not work on Windows
+import lxml.html
+import glob
+import matplotlib.pyplot as plt
+import collections
+from counter import Counter
+
+# TODO - Use pdfminer
+# TODO
+
+def pdftoxml(filename, options, converter_path=None):
+    """Convert a PDF to XML with the external pdftohtml tool.
+
+    The converter is asked to write its output next to ``filename`` using
+    the base name ``temph``; ``temph.xml`` is then read back and its raw
+    bytes are returned.
+
+    filename -- path of the PDF to convert
+    options -- extra command-line switches as one whitespace-separated string
+    converter_path -- path of the pdftohtml executable; defaults to the
+        developer-machine location that used to be hard-coded here
+    """
+    # Imported locally so the module's import block stays untouched.
+    import subprocess
+
+    if converter_path is None:
+        converter_path = unicode(r'C:\Users\Ian\BitBucketRepos\0939-AgraInforma\bin\pdftohtml.exe')
+    directory = os.path.split(filename)[0]
+    tmpxml = os.path.join(directory, "temph.xml")
+    # Drop stale output from an earlier run.  Testing the path itself fixes
+    # the old check, which looked for the joined path among the bare names
+    # of the current directory and so missed files in other directories.
+    if os.path.exists(tmpxml):
+        os.remove(tmpxml)
+
+    # An argument list avoids shell quoting problems with unusual file names.
+    cmd = [converter_path, '-xml'] + options.split()
+    cmd += [filename, os.path.splitext(tmpxml)[0]]
+    subprocess.call(cmd)
+
+    # The context manager closes the file even if reading fails.
+    with open(tmpxml, 'rb') as f:
+        return f.read()
+
+def processpage(page):
+    """Collect the positions of the text chunks on one page element.
+
+    page -- element exposing ``attrib`` (with "height" and "width") and
+        ``xpath('text')`` yielding chunks whose ``attrib`` carries "left",
+        "width" and "top"
+
+    Returns (pageheight, pagewidth, left, top, right).  The three lists hold
+    one entry per text chunk; ``top`` is stored as pageheight minus the
+    chunk's "top" attribute and ``right`` as left plus width.
+
+    Raises TypeError if page is None.
+    """
+    if page is None:
+        # The old guard made the loop iterate over False, which failed with
+        # an obscure "'bool' object is not iterable".
+        raise TypeError("processpage: page must not be None")
+
+    # Read the page size from the element itself rather than from module
+    # globals that only exist while the script loop is running.
+    pageheight = int(page.attrib.get("height"))
+    pagewidth = int(page.attrib.get("width"))
+
+    left = []
+    top = []
+    right = []
+    for textchunk in page.xpath('text'):
+        thisleft = int(textchunk.attrib.get('left'))
+        thiswidth = int(textchunk.attrib.get('width'))
+        left.append(thisleft)
+        top.append(pageheight - int(textchunk.attrib.get('top')))
+        right.append(thisleft + thiswidth)
+
+    return pageheight, pagewidth, left, top, right
+
+def plotpage(pageheight,pagewidth,pagenumber,SelectedPDF,left,top,right):
+    """Draw one page's text-chunk edges, show the figure and return it.
+
+    The page outline is drawn as a closed rectangle; left edges are plotted
+    as blue squares and right edges as red circles, both against ``top``.
+    The title combines ``SelectedPDF`` and ``pagenumber``.
+    """
+    figure = plt.figure()
+    axes = figure.add_subplot(111)
+    axes.axis('equal')
+    outline_x = [0, pagewidth, pagewidth, 0, 0]
+    outline_y = [0, 0, pageheight, pageheight, 0]
+    axes.plot(outline_x, outline_y)
+    for xs, colour, symbol in ((left, 'b', "s"), (right, 'r', "o")):
+        axes.scatter(xs, top, s=10, c=colour, marker=symbol)
+    title = '%s : Page %d' % (SelectedPDF, pagenumber)
+    figure.suptitle(title, fontsize=15)
+    plt.show()
+    return figure
+
+PDF_TEST_FILES = unicode(r'C:\Users\Ian\BitBucketRepos\0939-AgraInforma\fixtures')
+
+# PDFList = glob.glob(os.path.join(PDF_TEST_FILES,'*.pdf'))
+
+# SelectedPDF = 6 # 6 = cit0613.pdf - table is actually an image
+
+# r = requests.get(os.path.join(PDF_TEST_FILES,PDFList[SelectedPDF]))
+# options = ""
+# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,PDFList[SelectedPDF]),options)
+
+# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"cit0613.pdf"),options) # Works but first page is an image
+# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"2012.01.PosRpt.pdf"),options) # PDF to HTML does not like
+# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"COPAWEEKLYJUNE52013.pdf"),options)
+# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"COPAMONTHLYMay2013.pdf"),options) # lxml doesn't like this one, interleaved <b> and <i> tags
+# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"13_06_12_10_36_58_boletim_ingles_junho_2013.pdf"),options) # Long document with many tables
+# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"1359397366Final_Coceral grain estimate_2012_December.pdf"),options)
+# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"ClinicalResearchDisclosureReport2012Q2.pdf"),options) # throws not allowed
+# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"argentina_diputados_voting_record.pdf"),options)
+# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"bo_page24.pdf"),options) # Multi-column text and tables mixed on the page
+# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"tabla_subsidios.pdf"),options) # Multi-column text and tables mixed on the page
+SelectedPDF = "argentina_diputados_voting_record.pdf"
+
+# No extra converter switches.  This name was previously assigned only in
+# the commented-out lines above, so the call below raised NameError.
+options = ""
+
+xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,SelectedPDF),options)
+
+root = lxml.etree.fromstring(xmldata)
+pages = list(root)
+
+# This is ok but
+
+
+for page in pages:
+    # These module-level names are set per page before processpage() runs.
+    pagenumber = int(page.attrib.get("number"))
+    pagewidth = int(page.attrib.get("width"))
+    pageheight = int(page.attrib.get("height"))
+
+    pageheight,pagewidth,left,top,right = processpage(page)
+
+    fig = plotpage(pageheight,pagewidth,pagenumber,SelectedPDF,left,top,right)
+
+
+    # counter=Counter(left)
diff --git a/pdftables/__init__.py b/pdftables/__init__.py
new file mode 100644
index 0000000..32e5081
--- /dev/null
+++ b/pdftables/__init__.py
@@ -0,0 +1 @@
+from pdftables import *
diff --git a/pdftables/counter.py b/pdftables/counter.py
new file mode 100644
index 0000000..5fbd5c8
--- /dev/null
+++ b/pdftables/counter.py
@@ -0,0 +1,189 @@
+from operator import itemgetter
+from heapq import nlargest
+from itertools import repeat, ifilter
+
+class Counter(dict):
+ '''Dict subclass for counting hashable objects. Sometimes called a bag
+ or multiset. Elements are stored as dictionary keys and their counts
+ are stored as dictionary values.
+
+ >>> Counter('zyzygy')
+ Counter({'y': 3, 'z': 2, 'g': 1})
+
+ '''
+
+ def __init__(self, iterable=None, **kwds):
+ '''Create a new, empty Counter object. And if given, count elements
+ from an input iterable. Or, initialize the count from another mapping
+ of elements to their counts.
+
+ >>> c = Counter() # a new, empty counter
+ >>> c = Counter('gallahad') # a new counter from an iterable
+ >>> c = Counter({'a': 4, 'b': 2}) # a new counter from a mapping
+ >>> c = Counter(a=4, b=2) # a new counter from keyword args
+
+ '''
+ self.update(iterable, **kwds)
+
+ def __missing__(self, key):
+ 'Return a count of 0 for a key that is not present.'
+ return 0
+
+ def most_common(self, n=None):
+ '''List the n most common elements and their counts from the most
+ common to the least. If n is None, then list all element counts.
+
+ >>> Counter('abracadabra').most_common(3)
+ [('a', 5), ('r', 2), ('b', 2)]
+
+ '''
+ if n is None:
+ return sorted(self.iteritems(), key=itemgetter(1), reverse=True)
+ return nlargest(n, self.iteritems(), key=itemgetter(1))
+
+ def elements(self):
+ '''Iterator over elements repeating each as many times as its count.
+
+ >>> c = Counter('ABCABC')
+ >>> sorted(c.elements())
+ ['A', 'A', 'B', 'B', 'C', 'C']
+
+ If an element's count has been set to zero or is a negative number,
+ elements() will ignore it.
+
+ '''
+ for elem, count in self.iteritems():
+ for _ in repeat(None, count):
+ yield elem
+
+ # Override dict methods where the meaning changes for Counter objects.
+
+ @classmethod
+ def fromkeys(cls, iterable, v=None):
+ 'Always raises NotImplementedError; use Counter(iterable) instead.'
+ raise NotImplementedError(
+ 'Counter.fromkeys() is undefined. Use Counter(iterable) instead.')
+
+ def update(self, iterable=None, **kwds):
+ '''Like dict.update() but add counts instead of replacing them.
+
+ Source can be an iterable, a dictionary, or another Counter instance.
+
+ >>> c = Counter('which')
+ >>> c.update('witch') # add elements from another iterable
+ >>> d = Counter('watch')
+ >>> c.update(d) # add elements from another counter
+ >>> c['h'] # four 'h' in which, witch, and watch
+ 4
+
+ '''
+ if iterable is not None:
+ # Anything exposing iteritems() is treated as a mapping of counts.
+ if hasattr(iterable, 'iteritems'):
+ if self:
+ self_get = self.get
+ for elem, count in iterable.iteritems():
+ self[elem] = self_get(elem, 0) + count
+ else:
+ dict.update(self, iterable) # fast path when counter is empty
+ else:
+ self_get = self.get
+ for elem in iterable:
+ self[elem] = self_get(elem, 0) + 1
+ if kwds:
+ # Keyword arguments are added as a mapping of element -> count.
+ self.update(kwds)
+
+ def copy(self):
+ 'Like dict.copy() but returns a Counter instance instead of a dict.'
+ return Counter(self)
+
+ def __delitem__(self, elem):
+ 'Like dict.__delitem__() but does not raise KeyError for missing values.'
+ if elem in self:
+ dict.__delitem__(self, elem)
+
+ def __repr__(self):
+ 'Render as ClassName({...}) in most_common() order, or ClassName() when empty.'
+ if not self:
+ return '%s()' % self.__class__.__name__
+ items = ', '.join(map('%r: %r'.__mod__, self.most_common()))
+ return '%s({%s})' % (self.__class__.__name__, items)
+
+ # Multiset-style mathematical operations discussed in:
+ # Knuth TAOCP Volume II section 4.6.3 exercise 19
+ # and at http://en.wikipedia.org/wiki/Multiset
+ #
+ # Outputs guaranteed to only include positive counts.
+ #
+ # To strip negative and zero counts, add-in an empty counter:
+ # c += Counter()
+
+ def __add__(self, other):
+ '''Add counts from two counters.
+
+ >>> Counter('abbb') + Counter('bcc')
+ Counter({'b': 4, 'c': 2, 'a': 1})
+
+
+ '''
+ if not isinstance(other, Counter):
+ return NotImplemented
+ result = Counter()
+ for elem in set(self) | set(other):
+ newcount = self[elem] + other[elem]
+ if newcount > 0:
+ result[elem] = newcount
+ return result
+
+ def __sub__(self, other):
+ ''' Subtract count, but keep only results with positive counts.
+
+ >>> Counter('abbbc') - Counter('bccd')
+ Counter({'b': 2, 'a': 1})
+
+ '''
+ if not isinstance(other, Counter):
+ return NotImplemented
+ result = Counter()
+ for elem in set(self) | set(other):
+ newcount = self[elem] - other[elem]
+ if newcount > 0:
+ result[elem] = newcount
+ return result
+
+ def __or__(self, other):
+ '''Union is the maximum of value in either of the input counters.
+
+ >>> Counter('abbb') | Counter('bcc')
+ Counter({'b': 3, 'c': 2, 'a': 1})
+
+ '''
+ if not isinstance(other, Counter):
+ return NotImplemented
+ _max = max
+ result = Counter()
+ for elem in set(self) | set(other):
+ newcount = _max(self[elem], other[elem])
+ if newcount > 0:
+ result[elem] = newcount
+ return result
+
+ def __and__(self, other):
+ ''' Intersection is the minimum of corresponding counts.
+
+ >>> Counter('abbb') & Counter('bcc')
+ Counter({'b': 1})
+
+ '''
+ if not isinstance(other, Counter):
+ return NotImplemented
+ _min = min
+ result = Counter()
+ # Swap so the loop walks the shorter counter and probes the longer one.
+ if len(self) < len(other):
+ self, other = other, self
+ for elem in ifilter(self.__contains__, other):
+ newcount = _min(self[elem], other[elem])
+ if newcount > 0:
+ result[elem] = newcount
+ return result
+
+
+if __name__ == '__main__':
+    # Run the doctests embedded in this module and print the summary.
+    import doctest
+    print(doctest.testmod())
diff --git a/pdftables/display.py b/pdftables/display.py
new file mode 100755
index 0000000..7b606df
--- /dev/null
+++ b/pdftables/display.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python
+from __future__ import unicode_literals
+from collections import defaultdict
+from StringIO import StringIO
+
+
+def to_string(table):
+    """
+    Render the table as printable text and return it as unicode.
+
+    The output holds a summary line with the column and row counts, a line
+    of right-justified column indexes, and one line per row between two
+    horizontal rules; each cell is right-justified to its column's width.
+    >>> type(to_string([['foo', 'goodbye'], ['llama', 'bar']]))
+    <type 'unicode'>
+    """
+    (columns, rows) = get_dimensions(table)
+    col_widths = find_column_widths(table)
+    table_width = sum(col_widths) + len(col_widths) + 2
+    hbar = ' {}\n'.format('-' * table_width)
+
+    header = ' '.join(unicode(col_index).rjust(width, ' ')
+                      for (col_index, width) in enumerate(col_widths))
+    pieces = [
+        " {} columns, {} rows\n".format(columns, rows),
+        " {}\n".format(header),
+        hbar,
+    ]
+    for row_index, row in enumerate(table):
+        cells = [cell.rjust(width, ' ') for (cell, width)
+                 in zip(row, col_widths)]
+        pieces.append("{:>3} | {}|\n".format(row_index, '|'.join(cells)))
+    pieces.append(hbar)
+    return unicode(''.join(pieces))
+
+
+def get_dimensions(table):
+    """
+    Returns (columns, rows) for a table, where columns is the length of the
+    longest row and 0 for a table without rows.
+    >>> get_dimensions([['row1', 'apple', 'llama'], ['row2', 'plum', 'goat']])
+    (3, 2)
+    >>> get_dimensions([['row1', 'apple', 'llama'], ['row2', 'banana']])
+    (3, 2)
+    """
+    row_count = len(table)
+    widest = 0
+    for row in table:
+        if len(row) > widest:
+            widest = len(row)
+    return (widest, row_count)
+
+
+def find_column_widths(table):
+    """
+    Returns a list holding, for each column position, the length of the
+    longest cell found at that position in any row.
+    >>> find_column_widths([['foo', 'goodbye'], ['llama', 'bar']])
+    [5, 7]
+    """
+    widths = []
+    for row in table:
+        for position, cell in enumerate(row):
+            if position == len(widths):
+                # First time a row reaches this far: open a new column.
+                widths.append(len(cell))
+            elif len(cell) > widths[position]:
+                widths[position] = len(cell)
+    return widths
+
+if __name__ == '__main__':
+    # Demo: print a small sample table when run as a script.
+    sample = [['foo', 'goodbye'], ['llama', 'bar']]
+    print(to_string(sample))
diff --git a/pdftables/pdftables.py b/pdftables/pdftables.py
new file mode 100755
index 0000000..5200ed7
--- /dev/null
+++ b/pdftables/pdftables.py
@@ -0,0 +1,641 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# ScraperWiki Limited
+# Ian Hopkinson, 2013-06-04
+
+from __future__ import unicode_literals
+"""
+Some experiments with pdfminer
+http://www.unixuser.org/~euske/python/pdfminer/programming.html
+Some help here:
+http://denis.papathanasiou.org/2010/08/04/extracting-text-images-from-pdf-files
+"""
+
+# TODO Identify multi-column text, for multicolumn text detect per column
+# TODO Dynamic / smarter thresholding
+# TODO Handle argentina_diputados_voting_record.pdf automatically
+# TODO Handle multiple tables on one page
+
+
+import sys
+import codecs
+
+from pdfminer.pdfparser import PDFParser, PDFDocument
+from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
+from pdfminer.pdfdevice import PDFDevice
+from pdfminer.layout import LAParams, LTPage
+from pdfminer.converter import PDFPageAggregator
+
+import collections
+
+from tree import Leaf, LeafList
+import requests # TODO: remove this dependency
+from cStringIO import StringIO
+import math
+import numpy # TODO: remove this dependency
+from counter import Counter
+
+# Used by page_contains_tables: a histogram entry is kept when its count is
+# greater than this value.
+IS_TABLE_COLUMN_COUNT_THRESHOLD = 3
+# Used by page_contains_tables: the page is reported as containing tables
+# when more than this many histogram entries were kept.
+IS_TABLE_ROW_COUNT_THRESHOLD = 3
+
+class TableDiagnosticData(object):
+    """Plain holder for a box list, two plots and the x and y combs.
+
+    Every argument is optional.  An omitted (or None) argument is replaced
+    by a fresh empty container: LeafList for box_list, dict for top_plot and
+    left_plot, list for x_comb and y_comb.
+    """
+    def __init__(self, box_list=None, top_plot=None, left_plot=None, x_comb=None, y_comb=None):
+        # Defaults are built per call.  Container defaults written in the
+        # signature are created once and then shared by every instance, so
+        # filling one instance's comb would silently fill all the others.
+        self.box_list = LeafList() if box_list is None else box_list
+        self.top_plot = {} if top_plot is None else top_plot
+        self.left_plot = {} if left_plot is None else left_plot
+        self.x_comb = [] if x_comb is None else x_comb
+        self.y_comb = [] if y_comb is None else y_comb
+
+class Table(list):
+    """A list of rows that also records where the table was found.
+
+    content fills the list itself; the remaining arguments are stored as
+    page_number, total_pages, table_number_on_page and total_tables_on_page.
+    """
+    def __init__(self, content, page, page_total, table_index, table_index_total):
+        list.__init__(self, content)
+        self.page_number, self.total_pages = page, page_total
+        self.table_number_on_page = table_index
+        self.total_tables_on_page = table_index_total
+
+# Positions of the four edges inside a box's ``bbox`` sequence, as indexed
+# by project_boxes.
+# NOTE(review): presumably pdfminer's (x0, y0, x1, y1) ordering - confirm.
+LEFT = 0
+TOP = 3
+RIGHT = 2
+BOTTOM = 1
+
+def get_tables(fh):
+    """
+    Return a list of Table objects from the given file handle, one for each
+    page that page_contains_tables accepts.  A table is a list of rows, and
+    a row is a list of strings.  Each table is cropped with crop_table and
+    tagged with its 1-based page number and the document's page count; the
+    table index and tables-per-page are always recorded as 1.
+    """
+    doc, interpreter, device = initialize_pdf_miner(fh)
+    doc_length = len(list(doc.get_pages()))
+    tables = []
+    for page_index, pdf_page in enumerate(doc.get_pages()):
+        if page_contains_tables(pdf_page, interpreter, device):
+            # receive the LTPage object for the page.
+            interpreter.process_page(pdf_page)
+            processed_page = device.get_result()
+
+            table, _ = page_to_tables(
+                processed_page,
+                extend_y=True,
+                hints=[],
+                atomise=True)
+            crop_table(table)
+            tables.append(Table(table, page_index + 1, doc_length, 1, 1))
+
+    return tables
+
+
+def crop_table(table):
+    """
+    Remove empty rows from the top and bottom of the table, in place.
+
+    A row is empty when every cell is blank after stripping whitespace.
+    Empty rows between non-empty rows are kept.  Returns None.
+    """
+    def is_empty(row):
+        return not any(cell.strip() for cell in row)
+
+    while table and is_empty(table[0]):  # top -> bottom
+        del table[0]
+
+    # Delete by position.  list.remove() drops the first row that compares
+    # equal, so with an identical empty row in the middle of the table it
+    # removed that interior row and left the trailing one behind.
+    while table and is_empty(table[-1]):  # bottom -> top
+        del table[-1]
+
+
+def initialize_pdf_miner(fh):
+ """
+ Set up pdfminer for the PDF file object fh.
+
+ Returns (doc, interpreter, device): the initialised PDFDocument, and a
+ PDFPageInterpreter paired with a PDFPageAggregator that share one
+ PDFResourceManager.
+
+ Raises ValueError when doc.is_extractable is false.
+ """
+ # Create a PDF parser object associated with the file object.
+ parser = PDFParser(fh)
+ # Create a PDF document object that stores the document structure.
+ doc = PDFDocument()
+ # Connect the parser and document objects.
+ parser.set_document(doc)
+ doc.set_parser(parser)
+ # Supply the password for initialization.
+ # (If no password is set, give an empty string.)
+ doc.initialize("")
+ # Check if the document allows text extraction. If not, abort.
+ if not doc.is_extractable:
+ raise ValueError("PDFDocument is_extractable was False.")
+ # Create a PDF resource manager object that stores shared resources.
+ rsrcmgr = PDFResourceManager()
+ # Create a PDF device object.
+ # NOTE(review): this device and the interpreter built from it are
+ # rebound to the aggregator-based pair below and are never returned.
+ device = PDFDevice(rsrcmgr)
+ # Create a PDF interpreter object.
+ interpreter = PDFPageInterpreter(rsrcmgr, device)
+ # Process each page contained in the document.
+ # for page in doc.get_pages():
+ # interpreter.process_page(page)
+
+ # Set parameters for analysis.
+ laparams = LAParams()
+ laparams.word_margin = 0.0
+ # Create a PDF page aggregator object.
+ device = PDFPageAggregator(rsrcmgr, laparams=laparams)
+ interpreter = PDFPageInterpreter(rsrcmgr, device)
+ return doc, interpreter, device
+
+
+def contains_tables(fh):
+    """
+    Take a file handle and return a list with one boolean per page of the
+    document, True for each page that page_contains_tables accepts.
+    """
+    doc, interpreter, device = initialize_pdf_miner(fh)
+
+    flags = []
+    for page in doc.get_pages():
+        flags.append(page_contains_tables(page, interpreter, device))
+    return flags
+
+
+def page_contains_tables(pdf_page, interpreter, device):
+    """
+    Process pdf_page and report whether it appears to hold a table.
+
+    The page's boxes are histogrammed by Leaf._top (rounded with
+    rounder(1)).  The page qualifies when more than
+    IS_TABLE_ROW_COUNT_THRESHOLD histogram entries each have a count above
+    IS_TABLE_COLUMN_COUNT_THRESHOLD.
+    """
+    # TODO: hide doc, interpreter, device inside a higher level Pdf class. It's
+    # silly that we have to care about these (see function signature!!)
+
+    interpreter.process_page(pdf_page)
+    # receive the LTPage object for the page.
+    boxes = LeafList().populate(device.get_result())
+    for item in boxes:
+        assert isinstance(item, Leaf), "NOT LEAF"
+    histogram = boxes.histogram(Leaf._top).rounder(1)
+
+    busy_entries = 0
+    for _, count in histogram.items():
+        if count > IS_TABLE_COLUMN_COUNT_THRESHOLD:
+            busy_entries += 1
+    return busy_entries > IS_TABLE_ROW_COUNT_THRESHOLD
+
+
+def threshold_above(hist, threshold_value):
+    """
+    Return the keys of hist whose count is strictly greater than
+    threshold_value, in the iteration order of hist.
+
+    Raises ValueError when hist is not a Counter.
+    >>> threshold_above(Counter({518: 10, 520: 20, 530: 20, \
+    525: 17}), 15)
+    [520, 530, 525]
+    """
+    if isinstance(hist, Counter):
+        return [key for key in hist if hist[key] > threshold_value]
+    raise ValueError("requires Counter") # TypeError then?
+
+
+def comb(combarray, value):
+    """
+    Takes a sorted sequence of boundaries and returns the index of the
+    interval that contains value, or -1 when no interval does.
+
+    combarray may be ascending or descending; interval i lies between
+    combarray[i] and combarray[i + 1], both ends included.  A value sitting
+    exactly on an interior boundary is assigned to the later interval.  An
+    empty comb has no intervals and yields -1.
+
+    Raises ValueError (a subclass of the Exception raised previously) when
+    combarray is not sorted.
+    """
+    # Compare as a list so tuples and other sequences are not rejected just
+    # because a tuple never equals the list that sorted() returns.
+    values = list(combarray)
+    if not values:
+        return -1
+
+    ascending = sorted(values)
+    if values != ascending and values != ascending[::-1]:
+        raise ValueError("comb: combarray is not sorted")
+
+    descending = values[0] > values[-1]
+    index = -1
+    for i in range(1, len(values)):
+        if descending:
+            inside = values[i - 1] >= value >= values[i]
+        else:
+            inside = values[i - 1] <= value <= values[i]
+        if inside:
+            # No break: on a shared boundary the later interval wins.
+            index = i - 1
+
+    return index
+
+
+def apply_combs(box_list, x_comb, y_comb):
+    """Allocates text to table cells using the x and y combs.
+
+    Returns a list of len(y_comb) - 1 rows with len(x_comb) - 1 string cells
+    each.  A box is placed by its rounded midline (row) and centreline
+    (column); boxes that fall outside either comb are dropped.
+    """
+    column_count = len(x_comb) - 1
+    row_count = len(y_comb) - 1
+    table_array = [[''] * column_count for _ in range(row_count)]
+    for box in box_list:
+        y = round(box.midline)
+        x = round(box.centreline)
+        rowindex = comb(y_comb, y)
+        columnindex = comb(x_comb, x)
+        if rowindex == -1 or columnindex == -1:
+            continue
+        # there may already be some content at this coordinate so we
+        # concatenate (in an arbitrary order!)
+        existing = table_array[rowindex][columnindex]
+        table_array[rowindex][columnindex] = existing + box.text.rstrip('\n\r')
+
+    return table_array
+
+
+def comb_from_projection(projection, threshold, orientation):
+    """Calculates the boundaries between cells from the projection of the boxes
+    onto either the y axis (for rows) or the x-axis (for columns). These
+    boundaries are known as the comb
+
+    projection -- Counter of how often each coordinate is covered
+    threshold -- only coordinates with a count above this value are used
+    orientation -- "row" (tolerance 1) or "column" (tolerance 3)
+
+    Raises ValueError for any other orientation, and IndexError when no
+    coordinate exceeds the threshold.
+    """
+    tolerances = {"row": 1, "column": 3}
+    if orientation not in tolerances:
+        # Previously this fell through and failed later with an unbound
+        # local variable.
+        raise ValueError(
+            "comb_from_projection: orientation must be 'row' or 'column'")
+    tol = tolerances[orientation]
+
+    projection_threshold = sorted(threshold_above(projection, threshold))
+
+    # need to generate a list of uppers (right or top edges)
+    # and a list of lowers (left or bottom edges)
+    uppers = []
+    lowers = [projection_threshold[0]]
+    for previous, current in zip(projection_threshold,
+                                 projection_threshold[1:]):
+        # A jump of more than one coordinate ends one run and starts the next.
+        if current > previous + 1:
+            uppers.append(previous)
+            lowers.append(current)
+    uppers.append(projection_threshold[-1])
+
+    result = comb_from_uppers_and_lowers(uppers, lowers, tol=tol,
+                                         projection=projection)
+    result.reverse()
+
+    return result
+
+
+def comb_from_uppers_and_lowers(uppers, lowers, tol=1, projection=None):
+    """Called by comb_from_projection to calculate the comb given a set of
+    uppers and lowers, which are upper and lower edges of the thresholded
+    projection
+
+    Returns a descending list that starts at the largest upper, ends at the
+    smallest lower, and holds one find_minima() result for every gap between
+    neighbouring runs that is wider than tol.  The input lists are not
+    modified.
+    """
+    # tol is a tolerance to remove very small minima, increasing to 2 fouls up
+    # row separation
+    assert len(uppers) == len(lowers)
+    if projection is None:
+        projection = {}
+    # Sort copies: sorting in place silently reordered the caller's lists.
+    uppers = sorted(uppers, reverse=True)
+    lowers = sorted(lowers, reverse=True)
+    comb = [uppers[0]]
+    for i in range(1, len(uppers)):
+        if (lowers[i - 1] - uppers[i]) > tol:
+            comb.append(find_minima(lowers[i - 1], uppers[i], projection))
+
+    comb.append(lowers[-1])
+
+    return comb
+
+def find_minima(lower, upper, projection=dict()):
+    """Pick the boundary position inside the gap between upper and lower.
+
+    Without a projection the midpoint (lower + upper) / 2.0 is returned.
+    Otherwise the result is the coordinate in range(upper, lower) with the
+    smallest projection value; ties go to the smallest coordinate.
+    """
+    if len(projection) == 0:
+        return (lower + upper) / 2.0
+    return min(range(upper, lower), key=lambda position: projection[position])
+
+def comb_extend(comb, minv, maxv):
+    """Extend the comb towards minv and maxv.
+
+    Extra teeth are added at the comb's average spacing, below its smallest
+    value while they stay above minv and above its largest value while they
+    stay below maxv.  The list is modified in place, keeps its original
+    sort direction, and is also returned.
+    """
+    # TODO should this truncate if minv>minc or maxc>maxc
+    # Work in ascending order; restore the caller's order at the end.
+    descending = comb[0] > comb[-1]
+    if descending:
+        comb.reverse()
+    minc, maxc = comb[0], comb[-1]
+    spacing = numpy.average(numpy.diff(comb))
+
+    if minv < minc:
+        # arange starts at minc itself, so the first value is skipped.
+        below = list(numpy.arange(minc, minv, -spacing))[1:]
+        below.reverse()
+        comb[:0] = below
+    if maxv > maxc:
+        comb.extend(list(numpy.arange(maxc, maxv, spacing))[1:])
+
+    if descending:
+        comb.reverse()
+    return comb
+
+
+def project_boxes(box_list, orientation, erosion=0):
+    """
+    Take a set of boxes and project their extent onto an axis
+
+    box_list -- boxes exposing a ``bbox`` sequence
+    orientation -- "column" projects LEFT..RIGHT, "row" projects BOTTOM..TOP
+    erosion -- amount trimmed from both ends of every box before counting
+
+    Returns a Counter keyed by integer coordinate.  Every coordinate between
+    the smallest lower edge - 2 and the largest upper edge + 2 starts with a
+    count of one, and each box covering it adds one more.
+
+    Raises ValueError for any other orientation.
+    """
+    if orientation == "column":
+        upper, lower = RIGHT, LEFT
+    elif orientation == "row":
+        upper, lower = TOP, BOTTOM
+    else:
+        # Previously this fell through and failed with an unbound local.
+        raise ValueError(
+            "project_boxes: orientation must be 'row' or 'column'")
+
+    minv = round(min([box.bbox[lower]
+                      for box in box_list])) - 2 # ensure some overlap
+    maxv = round(max([box.bbox[upper] for box in box_list])) + 2
+
+    # Initialise projection structure.  list() keeps it appendable where
+    # range() does not return a list.
+    projection = list(range(int(minv), int(maxv)))
+
+    for box in box_list:
+        projection.extend(range(int(round(box.bbox[lower])) + erosion,
+                                int(round(box.bbox[upper])) - erosion))
+
+    return Counter(projection)
+
+
+def get_pdf_page(fh, pagenumber):
+    """
+    Return the processed layout of one page of the PDF in fh.
+
+    pagenumber is 1-based.  Raises IndexError("Invalid page number") when it
+    is smaller than 1 or larger than the number of pages.
+    """
+    doc, interpreter, device = initialize_pdf_miner(fh)
+    pages = list(doc.get_pages())
+
+    # Check the range explicitly: plain indexing accepted 0 and negative
+    # numbers and silently returned pages counted from the end.
+    if pagenumber < 1 or pagenumber > len(pages):
+        raise IndexError("Invalid page number")
+    page = pages[pagenumber - 1]
+
+    interpreter.process_page(page)
+    # receive the LTPage object for the page.
+    processedPage = device.get_result()
+    return processedPage
+
+# def getTable(fh, page, extend_y=False, hints=[]):
+# """placeholder for tests, refactor out"""
+# return page_to_tables(get_pdf_page(fh, page), extend_y, hints)
+
+
+def get_min_and_max_y_from_hints(box_list, top_string, bottom_string):
+    """
+    Return (miny, maxy) derived from two text hints.
+
+    maxy is the ``top`` of the first box whose text contains top_string and
+    miny is the ``bottom`` of the first box whose text contains
+    bottom_string.  Either value is None when its hint is empty or no box
+    matches.
+    """
+    def matching(needle):
+        return [box for box in box_list if needle in box.text]
+
+    top_matches = matching(top_string) if top_string else []
+    bottom_matches = matching(bottom_string) if bottom_string else []
+
+    maxy = top_matches[0].top if top_matches else None
+    miny = bottom_matches[0].bottom if bottom_matches else None
+    return miny, maxy
+
+
+def rounder(val, tol):
+    """
+    Round val to the nearest multiple of tol.
+    """
+    steps = round((1.0 * val) / tol)
+    return steps * tol
+
+
+#def filter_box_list_by_type(box_list, flt):
+# return [box for box in box_list if box.classname in flt]
+
+
+def multi_column_detect(page):
+ #TODO This function is under construction
+ """
+ Test for multiColumns from a box_list, returns an integer number of columns
+ and a set of (left, right) pairs delineating any columns
+ """
+ # Ways to identify multicolumns:
+ # 1. High fill factor compared to tables
+ # 2. Gullies at textwidth/2, (textwidth/3, 2*textwidth/3)...
+ # 3. Histogram of boxwidths with peak at some fraction of page width
+ # This is like project_boxes but we are projecting the length of the
+ # textbox onto the axis
+ box_list = LeafList().populate(
+ page, ['LTPage', 'LTTextLineHorizontal']).purge_empty_text()
+
+ # Should use the LTPage object to get page bounding box
+ box_list = filter_box_list_by_type(box_list, 'LTTextLineHorizontal')
+ pile = {}
+ vstep = 5 # should be scaled by modal row height
+ minv = rounder(
+ min([box.bottom for box in box_list]),
+ 5) # ensure some overlap
+ maxv = rounder(max([box.top for box in box_list]), 5)
+
+ minx = round(min([box.left for box in box_list])) # ensure some overlap
+ maxx = round(max([box.right for box in box_list]))
+
+ # Initialise projection structure
+ # print minv, maxv
+
+ coords = range(int(minv), int(maxv) + vstep, vstep)
+
+ pile = collections.OrderedDict(zip(coords, [0] * len(coords)))
+ # print projection
+ for box in box_list:
+ # print int(rounder(box.midline, 30)), box.width
+ pile[int(rounder(box.midline, vstep))] += box.width
+
+ for key, value in pile.items():
+ pile[key] = value / (maxx - minx)
+
+ # Box width histogram
+ bstep = 10
+ boxhist = {}
+ boxwidthmin = rounder(min([box.width for box in box_list]), bstep)
+ boxwidthmax = rounder(max([box.width for box in box_list]), bstep)
+
+ coords = range(int(boxwidthmin), int(boxwidthmax) + bstep, bstep)
+ boxhist = collections.OrderedDict(zip(coords, [0] * len(coords)))
+ for box in box_list:
+ # print int(rounder(box.midline, 30)), box.width
+ boxhist[int(rounder(box.width, bstep))] += 1
+
+ nboxes = len(box_list)
+ for key, value in boxhist.items():
... 1035 lines suppressed ...
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/python-modules/packages/pdftables.git
More information about the Python-modules-commits
mailing list