[Python-modules-commits] [csvkit] 01/04: Imported Upstream version 0.9.1

Sandro Tosi morph at moszumanska.debian.org
Wed Jul 1 15:34:20 UTC 2015


This is an automated email from the git hooks/post-receive script.

morph pushed a commit to branch bpo80
in repository csvkit.

commit eb0ec9088835d28024fc42370694deaf447444e3
Author: Sandro Tosi <morph at debian.org>
Date:   Wed Jul 1 10:16:42 2015 -0400

    Imported Upstream version 0.9.1
---
 PKG-INFO                             |  39 ++++
 README                               |  11 +
 csvkit.egg-info/PKG-INFO             |  39 ++++
 csvkit.egg-info/SOURCES.txt          |  46 ++++
 csvkit.egg-info/dependency_links.txt |   1 +
 csvkit.egg-info/entry_points.txt     |  16 ++
 csvkit.egg-info/requires.txt         |   6 +
 csvkit.egg-info/top_level.txt        |   1 +
 csvkit/__init__.py                   |  34 +++
 csvkit/cleanup.py                    | 113 ++++++++++
 csvkit/cli.py                        | 396 +++++++++++++++++++++++++++++++++++
 csvkit/convert/__init__.py           |  81 +++++++
 csvkit/convert/csvitself.py          |  18 ++
 csvkit/convert/dbase.py              |  44 ++++
 csvkit/convert/fixed.py              | 140 +++++++++++++
 csvkit/convert/geojs.py              |  73 +++++++
 csvkit/convert/js.py                 |  77 +++++++
 csvkit/convert/ndjs.py               |  76 +++++++
 csvkit/convert/xls.py                | 155 ++++++++++++++
 csvkit/convert/xlsx.py               |  95 +++++++++
 csvkit/exceptions.py                 |  93 ++++++++
 csvkit/grep.py                       | 117 +++++++++++
 csvkit/headers.py                    |   7 +
 csvkit/join.py                       | 185 ++++++++++++++++
 csvkit/py2.py                        | 104 +++++++++
 csvkit/py3.py                        | 119 +++++++++++
 csvkit/sniffer.py                    |  18 ++
 csvkit/sql.py                        | 102 +++++++++
 csvkit/table.py                      | 291 +++++++++++++++++++++++++
 csvkit/typeinference.py              | 248 ++++++++++++++++++++++
 csvkit/unicsv.py                     | 144 +++++++++++++
 csvkit/utilities/__init__.py         |   0
 csvkit/utilities/csvclean.py         |  80 +++++++
 csvkit/utilities/csvcut.py           |  68 ++++++
 csvkit/utilities/csvformat.py        |  64 ++++++
 csvkit/utilities/csvgrep.py          |  69 ++++++
 csvkit/utilities/csvjoin.py          | 115 ++++++++++
 csvkit/utilities/csvjson.py          | 194 +++++++++++++++++
 csvkit/utilities/csvlook.py          |  80 +++++++
 csvkit/utilities/csvpy.py            |  41 ++++
 csvkit/utilities/csvsort.py          |  63 ++++++
 csvkit/utilities/csvsql.py           | 163 ++++++++++++++
 csvkit/utilities/csvstack.py         |  89 ++++++++
 csvkit/utilities/csvstat.py          | 250 ++++++++++++++++++++++
 csvkit/utilities/in2csv.py           |  86 ++++++++
 csvkit/utilities/sql2csv.py          |  68 ++++++
 setup.cfg                            |   5 +
 setup.py                             |  75 +++++++
 48 files changed, 4399 insertions(+)

diff --git a/PKG-INFO b/PKG-INFO
new file mode 100644
index 0000000..e3cef1b
--- /dev/null
+++ b/PKG-INFO
@@ -0,0 +1,39 @@
+Metadata-Version: 1.1
+Name: csvkit
+Version: 0.9.1
+Summary: A library of utilities for working with CSV, the king of tabular file formats.
+Home-page: http://csvkit.rtfd.org/
+Author: Christopher Groskopf
+Author-email: staringmonkey at gmail.com
+License: MIT
+Description: csvkit is a suite of utilities for converting to and working with CSV, the king of tabular file formats.
+        
+        It is inspired by pdftk, gdal and the original csvcut utility by Joe Germuska and Aaron Bycoffe.
+        
+        Important links:
+        
+        * Repository:    https://github.com/onyxfish/csvkit
+        * Issues:        https://github.com/onyxfish/csvkit/issues
+        * Documentation: http://csvkit.rtfd.org/
+        * Schemas:       https://github.com/onyxfish/ffs
+        * Buildbot:      https://travis-ci.org/onyxfish/csvkit
+        
+Platform: UNKNOWN
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Environment :: Console
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: End Users/Desktop
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Natural Language :: English
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 2.6
+Classifier: Programming Language :: Python :: 2.7
+Classifier: Programming Language :: Python :: 3.3
+Classifier: Programming Language :: Python :: 3.4
+Classifier: Programming Language :: Python :: Implementation :: CPython
+Classifier: Programming Language :: Python :: Implementation :: PyPy
+Classifier: Topic :: Scientific/Engineering :: Information Analysis
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Topic :: Utilities
diff --git a/README b/README
new file mode 100644
index 0000000..0d5677a
--- /dev/null
+++ b/README
@@ -0,0 +1,11 @@
+csvkit is a suite of utilities for converting to and working with CSV, the king of tabular file formats.
+
+It is inspired by pdftk, gdal and the original csvcut utility by Joe Germuska and Aaron Bycoffe.
+
+Important links:
+
+* Repository:    https://github.com/onyxfish/csvkit
+* Issues:        https://github.com/onyxfish/csvkit/issues
+* Documentation: http://csvkit.rtfd.org/
+* Schemas:       https://github.com/onyxfish/ffs
+* Buildbot:      https://travis-ci.org/onyxfish/csvkit
diff --git a/csvkit.egg-info/PKG-INFO b/csvkit.egg-info/PKG-INFO
new file mode 100644
index 0000000..e3cef1b
--- /dev/null
+++ b/csvkit.egg-info/PKG-INFO
@@ -0,0 +1,39 @@
+Metadata-Version: 1.1
+Name: csvkit
+Version: 0.9.1
+Summary: A library of utilities for working with CSV, the king of tabular file formats.
+Home-page: http://csvkit.rtfd.org/
+Author: Christopher Groskopf
+Author-email: staringmonkey at gmail.com
+License: MIT
+Description: csvkit is a suite of utilities for converting to and working with CSV, the king of tabular file formats.
+        
+        It is inspired by pdftk, gdal and the original csvcut utility by Joe Germuska and Aaron Bycoffe.
+        
+        Important links:
+        
+        * Repository:    https://github.com/onyxfish/csvkit
+        * Issues:        https://github.com/onyxfish/csvkit/issues
+        * Documentation: http://csvkit.rtfd.org/
+        * Schemas:       https://github.com/onyxfish/ffs
+        * Buildbot:      https://travis-ci.org/onyxfish/csvkit
+        
+Platform: UNKNOWN
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Environment :: Console
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: End Users/Desktop
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Natural Language :: English
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 2.6
+Classifier: Programming Language :: Python :: 2.7
+Classifier: Programming Language :: Python :: 3.3
+Classifier: Programming Language :: Python :: 3.4
+Classifier: Programming Language :: Python :: Implementation :: CPython
+Classifier: Programming Language :: Python :: Implementation :: PyPy
+Classifier: Topic :: Scientific/Engineering :: Information Analysis
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Topic :: Utilities
diff --git a/csvkit.egg-info/SOURCES.txt b/csvkit.egg-info/SOURCES.txt
new file mode 100644
index 0000000..2b15f5e
--- /dev/null
+++ b/csvkit.egg-info/SOURCES.txt
@@ -0,0 +1,46 @@
+README
+setup.py
+csvkit/__init__.py
+csvkit/cleanup.py
+csvkit/cli.py
+csvkit/exceptions.py
+csvkit/grep.py
+csvkit/headers.py
+csvkit/join.py
+csvkit/py2.py
+csvkit/py3.py
+csvkit/sniffer.py
+csvkit/sql.py
+csvkit/table.py
+csvkit/typeinference.py
+csvkit/unicsv.py
+csvkit.egg-info/PKG-INFO
+csvkit.egg-info/SOURCES.txt
+csvkit.egg-info/dependency_links.txt
+csvkit.egg-info/entry_points.txt
+csvkit.egg-info/requires.txt
+csvkit.egg-info/top_level.txt
+csvkit/convert/__init__.py
+csvkit/convert/csvitself.py
+csvkit/convert/dbase.py
+csvkit/convert/fixed.py
+csvkit/convert/geojs.py
+csvkit/convert/js.py
+csvkit/convert/ndjs.py
+csvkit/convert/xls.py
+csvkit/convert/xlsx.py
+csvkit/utilities/__init__.py
+csvkit/utilities/csvclean.py
+csvkit/utilities/csvcut.py
+csvkit/utilities/csvformat.py
+csvkit/utilities/csvgrep.py
+csvkit/utilities/csvjoin.py
+csvkit/utilities/csvjson.py
+csvkit/utilities/csvlook.py
+csvkit/utilities/csvpy.py
+csvkit/utilities/csvsort.py
+csvkit/utilities/csvsql.py
+csvkit/utilities/csvstack.py
+csvkit/utilities/csvstat.py
+csvkit/utilities/in2csv.py
+csvkit/utilities/sql2csv.py
\ No newline at end of file
diff --git a/csvkit.egg-info/dependency_links.txt b/csvkit.egg-info/dependency_links.txt
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/csvkit.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
diff --git a/csvkit.egg-info/entry_points.txt b/csvkit.egg-info/entry_points.txt
new file mode 100644
index 0000000..811415c
--- /dev/null
+++ b/csvkit.egg-info/entry_points.txt
@@ -0,0 +1,16 @@
+[console_scripts]
+csvclean = csvkit.utilities.csvclean:launch_new_instance
+csvcut = csvkit.utilities.csvcut:launch_new_instance
+csvformat = csvkit.utilities.csvformat:launch_new_instance
+csvgrep = csvkit.utilities.csvgrep:launch_new_instance
+csvjoin = csvkit.utilities.csvjoin:launch_new_instance
+csvjson = csvkit.utilities.csvjson:launch_new_instance
+csvlook = csvkit.utilities.csvlook:launch_new_instance
+csvpy = csvkit.utilities.csvpy:launch_new_instance
+csvsort = csvkit.utilities.csvsort:launch_new_instance
+csvsql = csvkit.utilities.csvsql:launch_new_instance
+csvstack = csvkit.utilities.csvstack:launch_new_instance
+csvstat = csvkit.utilities.csvstat:launch_new_instance
+in2csv = csvkit.utilities.in2csv:launch_new_instance
+sql2csv = csvkit.utilities.sql2csv:launch_new_instance
+
diff --git a/csvkit.egg-info/requires.txt b/csvkit.egg-info/requires.txt
new file mode 100644
index 0000000..a15d620
--- /dev/null
+++ b/csvkit.egg-info/requires.txt
@@ -0,0 +1,6 @@
+xlrd>=0.7.1
+sqlalchemy>=0.6.6
+openpyxl==2.2.0-b1
+six>=1.6.1
+python-dateutil==2.2
+dbf==0.94.003
diff --git a/csvkit.egg-info/top_level.txt b/csvkit.egg-info/top_level.txt
new file mode 100644
index 0000000..36204a5
--- /dev/null
+++ b/csvkit.egg-info/top_level.txt
@@ -0,0 +1 @@
+csvkit
diff --git a/csvkit/__init__.py b/csvkit/__init__.py
new file mode 100644
index 0000000..a170278
--- /dev/null
+++ b/csvkit/__init__.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python
+
+"""
+This module contains csvkit's superpowered replacement for the builtin :mod:`csv` module. For Python 2 users, the greatest improvement over the standard library is full unicode support. Python 3's :mod:`csv` module supports unicode internally, so this module is provided primarily for compatibility purposes.
+
+* Python 2: :mod:`csvkit.py2`.
+* Python 3: :mod:`csvkit.py3`.
+"""
+
+import six
+
+if six.PY2:
+    from csvkit import py2
+
+    CSVKitReader = py2.CSVKitReader
+    CSVKitWriter = py2.CSVKitWriter
+    CSVKitDictReader = py2.CSVKitDictReader
+    CSVKitDictWriter = py2.CSVKitDictWriter
+    reader = py2.reader
+    writer = py2.writer
+    DictReader = py2.CSVKitDictReader
+    DictWriter = py2.CSVKitDictWriter
+else:
+    from csvkit import py3
+
+    CSVKitReader = py3.CSVKitReader
+    CSVKitWriter = py3.CSVKitWriter
+    CSVKitDictReader = py3.CSVKitDictReader
+    CSVKitDictWriter = py3.CSVKitDictWriter
+    reader = py3.reader
+    writer = py3.writer
+    DictReader = py3.CSVKitDictReader
+    DictWriter = py3.CSVKitDictWriter
+
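For illustration, the aliases exported above act as a drop-in, unicode-aware replacement for the stdlib csv interface. A minimal sketch, not part of the upstream code; the file names are hypothetical:

    import csvkit

    # csvkit.reader/writer mirror csv.reader/csv.writer but handle unicode on Python 2.
    with open('example.csv') as f:        # hypothetical input file
        rows = list(csvkit.reader(f))

    with open('copy.csv', 'w') as f:      # hypothetical output file
        writer = csvkit.writer(f)
        for row in rows:
            writer.writerow(row)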
diff --git a/csvkit/cleanup.py b/csvkit/cleanup.py
new file mode 100644
index 0000000..6fc241a
--- /dev/null
+++ b/csvkit/cleanup.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python
+
+from csvkit.exceptions import CSVTestException, LengthMismatchError
+
+def join_rows(rows, joiner=' '):
+    """
+    Given a series of rows, return them as a single row where the inner edge cells are merged. By default joins with a single space character, but you can specify new-line, empty string, or anything else with the 'joiner' kwarg.
+    """
+    rows = list(rows)
+    fixed_row = rows[0][:]
+
+    for row in rows[1:]:
+        if len(row) == 0:
+            row = ['']
+        
+        fixed_row[-1] += "%s%s" % (joiner, row[0])
+        fixed_row.extend(row[1:])
+
+    return fixed_row
+        
+def fix_length_errors(errs, target_line_length, joiner=' '):
+    """
+    If possible, transform the rows backed up in the list of errors into rows of the correct length.
+    If the list of errors does not yet produce a row of target_line_length, return an empty array.
+    """
+    if not errs:
+        return []
+
+    fixed_rows = []
+    backlog = []
+    
+    for err in errs:
+        if type(err) is not LengthMismatchError:
+            return [] # give up if any are not length errors
+
+        backlog.append(err)
+        fixed_row = join_rows([err.row for err in backlog])
+
+        if len(fixed_row) == target_line_length:
+            fixed_rows.append(fixed_row)
+            backlog = [] # reset
+        
+    return fixed_rows
+
+def extract_joinable_row_errors(errs):
+    joinable = []
+
+    for err in reversed(errs):
+        if type(err) is not LengthMismatchError:
+            break
+
+        if joinable and err.line_number != joinable[-1].line_number - 1:
+            break
+
+        joinable.append(err)
+
+    joinable.reverse() 
+
+    return joinable
+
+class RowChecker(object):
+    """
+    Iterate over rows of a CSV producing cleaned rows and storing error rows.
+    """
+    def __init__(self, reader):
+        self.reader = reader
+        self.column_names = next(reader)
+
+        self.errors = []
+        self.rows_joined = 0
+        self.joins = 0
+
+    def checked_rows(self):
+        """
+        A generator which yields rows that are ready to be written to output.
+        """
+        line_number = self.reader.line_num
+        
+        for row in self.reader:
+            try:
+                if len(row) != len(self.column_names):
+                    raise LengthMismatchError(line_number, row, len(self.column_names))
+
+                yield row
+            except LengthMismatchError as e:
+                self.errors.append(e)
+
+                joinable_row_errors = extract_joinable_row_errors(self.errors)
+                
+                while joinable_row_errors:
+                    fixed_row = join_rows([err.row for err in joinable_row_errors], joiner=' ')
+
+                    if len(fixed_row) < len(self.column_names):
+                        break
+
+                    if len(fixed_row) == len(self.column_names):
+                        self.rows_joined += len(joinable_row_errors)
+                        self.joins += 1
+
+                        yield fixed_row
+                        
+                        for fixed in joinable_row_errors:
+                            self.errors.remove(fixed)
+                        
+                        break
+
+                    joinable_row_errors = joinable_row_errors[1:] # keep trying in case we're too long because of a straggler
+
+            except CSVTestException as e:
+                self.errors.append(e)
+        
+            line_number = self.reader.line_num
+ 
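For illustration, RowChecker wraps an existing reader, yields rows whose width matches the header, and keeps unrepairable rows around as errors. A minimal sketch, assuming a hypothetical 'messy.csv' and that errors expose the line_number and row values passed to LengthMismatchError above:

    from csvkit import CSVKitReader
    from csvkit.cleanup import RowChecker

    with open('messy.csv') as f:                    # hypothetical input file
        checker = RowChecker(CSVKitReader(f))
        clean_rows = list(checker.checked_rows())   # rows whose width matches the header

    # Anything that could not be joined back to the header width stays in .errors.
    for err in checker.errors:
        print(err.line_number, err.row)             # assumed attributes, per the constructor above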
diff --git a/csvkit/cli.py b/csvkit/cli.py
new file mode 100644
index 0000000..5bd811b
--- /dev/null
+++ b/csvkit/cli.py
@@ -0,0 +1,396 @@
+#!/usr/bin/env python
+
+import argparse
+import bz2
+import codecs
+import gzip
+import os.path
+import sys
+
+import six
+
+from csvkit import CSVKitReader
+from csvkit.exceptions import ColumnIdentifierError, RequiredHeaderError
+
+def lazy_opener(fn):
+    def wrapped(self, *args, **kwargs):
+        self._lazy_open()
+        return fn(self, *args, **kwargs)
+    return wrapped
+
+class LazyFile(six.Iterator):
+    """
+    A proxy for a File object that delays opening it until
+    a read method is called.
+
+    Currently this implements only the minimum methods to be useful,
+    but it could easily be expanded.
+    """
+    def __init__(self, init, *args, **kwargs):
+        self.init = init
+        self.f = None
+        self._is_lazy_opened = False
+
+        self._lazy_args = args
+        self._lazy_kwargs = kwargs
+
+    def __getattr__(self, name):
+        if not self._is_lazy_opened:
+            self.f = self.init(*self._lazy_args, **self._lazy_kwargs)
+            self._is_lazy_opened = True
+
+        return getattr(self.f, name)
+
+    def __iter__(self):
+        return self
+
+    def close(self):
+        self.f.close()
+        self.f = None
+        self._is_lazy_opened = False
+
+    def __next__(self):
+        if not self._is_lazy_opened:
+            self.f = self.init(*self._lazy_args, **self._lazy_kwargs)
+            self._is_lazy_opened = True
+
+        return next(self.f)
+
+class CSVKitUtility(object):
+    description = ''
+    epilog = ''
+    override_flags = ''
+
+    def __init__(self, args=None, output_file=None):
+        """
+        Perform argument processing and other setup for a CSVKitUtility.
+        """
+        self._init_common_parser()
+        self.add_arguments()
+        self.args = self.argparser.parse_args(args)
+
+        if 'f' not in self.override_flags:
+            self.input_file = self._open_input_file(self.args.input_path)
+
+        self.reader_kwargs = self._extract_csv_reader_kwargs()
+        self.writer_kwargs = self._extract_csv_writer_kwargs()
+
+        self._install_exception_handler()
+
+        if output_file is None:
+            self.output_file = sys.stdout
+        else:
+            self.output_file = output_file
+
+        # Ensure SIGPIPE doesn't throw an exception
+        # Prevents [Errno 32] Broken pipe errors, e.g. when piping to 'head'
+        # To test from the shell:
+        #  python -c "for i in range(5000): print 'a,b,c'" | csvlook | head
+        # Without this fix you will see at the end:
+        #  [Errno 32] Broken pipe
+        # With this fix, there should be no error
+        # For details on Python and SIGPIPE, see http://bugs.python.org/issue1652
+        try:
+            import signal
+            signal.signal(signal.SIGPIPE, signal.SIG_DFL)
+        except (ImportError, AttributeError):
+            #Do nothing on platforms that don't have signals or don't have SIGPIPE
+            pass
+
+    def add_arguments(self):
+        """
+        Called upon initialization once the parser for common arguments has been constructed.
+
+        Should be overridden by individual utilities.
+        """
+        raise NotImplementedError('add_arguments must be provided by each subclass of CSVKitUtility.')
+
+    def main(self):
+        """
+        Main loop of the utility.
+
+        Should be overridden by individual utilities and explicitly called by the executing script.
+        """
+        raise NotImplementedError('main must be provided by each subclass of CSVKitUtility.')
+
+    def _init_common_parser(self):
+        """
+        Prepare a base argparse argument parser so that flags are consistent across different shell command tools.
+        If you want to constrain which common arguments are present, set the class attribute 'override_flags' to a string.
+        Any argument whose single-letter form is contained in 'override_flags' will be left out of the configured parser.
+        Use 'f' for file.
+        """
+        self.argparser = argparse.ArgumentParser(description=self.description, epilog=self.epilog)
+
+        # Input
+        if 'f' not in self.override_flags:
+            self.argparser.add_argument(metavar="FILE", nargs='?', dest='input_path',
+                help='The CSV file to operate on. If omitted, will accept input on STDIN.')
+        if 'd' not in self.override_flags:
+            self.argparser.add_argument('-d', '--delimiter', dest='delimiter',
+                help='Delimiting character of the input CSV file.')
+        if 't' not in self.override_flags:
+            self.argparser.add_argument('-t', '--tabs', dest='tabs', action='store_true',
+                help='Specifies that the input CSV file is delimited with tabs. Overrides "-d".')
+        if 'q' not in self.override_flags:
+            self.argparser.add_argument('-q', '--quotechar', dest='quotechar',
+                help='Character used to quote strings in the input CSV file.')
+        if 'u' not in self.override_flags:
+            self.argparser.add_argument('-u', '--quoting', dest='quoting', type=int, choices=[0,1,2,3],
+                help='Quoting style used in the input CSV file. 0 = Quote Minimal, 1 = Quote All, 2 = Quote Non-numeric, 3 = Quote None.')
+        if 'b' not in self.override_flags:
+            self.argparser.add_argument('-b', '--doublequote', dest='doublequote', action='store_true',
+                help='Whether or not double quotes are doubled in the input CSV file.')
+        if 'p' not in self.override_flags:
+            self.argparser.add_argument('-p', '--escapechar', dest='escapechar',
+                help='Character used to escape the delimiter if --quoting 3 ("Quote None") is specified and to escape the QUOTECHAR if --doublequote is not specified.')
+        if 'z' not in self.override_flags:
+            self.argparser.add_argument('-z', '--maxfieldsize', dest='maxfieldsize', type=int,
+                help='Maximum length of a single field in the input CSV file.')
+        if 'e' not in self.override_flags:
+            self.argparser.add_argument('-e', '--encoding', dest='encoding', default='utf-8',
+                help='Specify the encoding of the input CSV file.')
+        if 'S' not in self.override_flags:
+            self.argparser.add_argument('-S', '--skipinitialspace', dest='skipinitialspace', default=False, action='store_true',
+                help='Ignore whitespace immediately following the delimiter.')
+        if 'H' not in self.override_flags:
+            self.argparser.add_argument('-H', '--no-header-row', dest='no_header_row', action='store_true',
+                help='Specifies that the input CSV file has no header row. Will create default headers.')
+        if 'v' not in self.override_flags:
+            self.argparser.add_argument('-v', '--verbose', dest='verbose', action='store_true',
+                help='Print detailed tracebacks when errors occur.')
+
+        # Output
+        if 'l' not in self.override_flags:
+            self.argparser.add_argument('-l', '--linenumbers', dest='line_numbers', action='store_true',
+                                help='Insert a column of line numbers at the front of the output. Useful when piping to grep or as a simple primary key.')
+
+        # Input/Output
+        if 'zero' not in self.override_flags:
+            self.argparser.add_argument('--zero', dest='zero_based', action='store_true',
+                            help='When interpreting or displaying column numbers, use zero-based numbering instead of the default 1-based numbering.')
+        
+    def _open_input_file(self, path):
+        """
+        Open the input file specified on the command line.
+        """
+        if six.PY2:
+            mode = 'rb'
+            kwargs = {}
+        else:
+            mode = 'rt'
+            kwargs = { 'encoding': self.args.encoding }
+
+        if not path or path == '-':
+            f = sys.stdin
+        else:
+            (_, extension) = os.path.splitext(path)
+
+            if extension == u'.gz':
+                f = LazyFile(gzip.open, path, mode, **kwargs)
+            elif extension == '.bz2':
+                if six.PY2:
+                    f = LazyFile(bz2.BZ2File, path, mode, **kwargs)
+                else:
+                    f = LazyFile(bz2.open, path, mode, **kwargs)
+            else:
+                f = LazyFile(open, path, mode, **kwargs)
+
+        return f
+
+    def _extract_csv_reader_kwargs(self):
+        """
+        Extracts those command-line arguments that should be passed through to the input CSV reader(s).
+        """
+        kwargs = {}
+
+        if self.args.tabs:
+            kwargs['delimiter'] = '\t'
+        elif self.args.delimiter:
+            kwargs['delimiter'] = self.args.delimiter
+
+        if self.args.quotechar:
+            kwargs['quotechar'] = self.args.quotechar
+
+        if self.args.quoting:
+            kwargs['quoting'] = self.args.quoting
+
+        if self.args.doublequote:
+            kwargs['doublequote'] = self.args.doublequote
+
+        if self.args.escapechar:
+            kwargs['escapechar'] = self.args.escapechar
+
+        if self.args.maxfieldsize:
+            kwargs['maxfieldsize'] = self.args.maxfieldsize
+
+        if self.args.skipinitialspace:
+            kwargs['skipinitialspace'] = self.args.skipinitialspace
+
+        if six.PY2 and self.args.encoding:
+            kwargs['encoding'] = self.args.encoding
+
+        return kwargs
+
+    def _extract_csv_writer_kwargs(self):
+        """
+        Extracts those command-line arguments that should be passed through to the output CSV writer.
+        """
+        kwargs = {}
+
+        if 'l' not in self.override_flags and self.args.line_numbers:
+            kwargs['line_numbers'] = True
+
+        return kwargs
+
+    def _install_exception_handler(self):
+        """
+        Installs a replacement for sys.excepthook, which handles pretty-printing uncaught exceptions.
+        """
+        if six.PY2:
+            sys.stderr = codecs.getwriter('utf-8')(sys.stderr)
+
+        def handler(t, value, traceback):
+            if self.args.verbose:
+                sys.__excepthook__(t, value, traceback)
+            else:
+                # Special case handling for Unicode errors, which behave very strangely
+                # when cast with unicode()
+                if t == UnicodeDecodeError:
+                    sys.stderr.write('Your file is not "%s" encoded. Please specify the correct encoding with the -e flag. Use the -v flag to see the complete error.\n' % self.args.encoding)
+                else:
+                    sys.stderr.write('%s\n' % six.text_type(value))
+
+        sys.excepthook = handler
+
+    def print_column_names(self):
+        """
+        Pretty-prints the names and indices of all columns to a file-like object (usually sys.stdout).
+        """
+        if self.args.no_header_row:
+            raise RequiredHeaderError('You cannot use --no-header-row with the -n or --names options.')
+
+        f = self.input_file
+        output = self.output_file
+
+        try:
+            zero_based = self.args.zero_based
+        except AttributeError:
+            zero_based = False
+
+        rows = CSVKitReader(f, **self.reader_kwargs)
+        column_names = next(rows)
+
+        for i, c in enumerate(column_names):
+            if not zero_based:
+                i += 1
+            output.write('%3i: %s\n' % (i, c))
+
+
+def match_column_identifier(column_names, c, zero_based=False):
+    """
+    Determine what column a single column id (name or index) matches in a series of column names.
+    Note that integer values are *always* treated as positional identifiers. If you happen to have
+    column names which are also integers, you must specify them using a positional index.
+    """
+    if isinstance(c, six.string_types) and not c.isdigit() and c in column_names:
+        return column_names.index(c)
+    else:
+        try:
+            c = int(c)
+            if not zero_based:
+                c -= 1
+        # Fail out if neither a column name nor an integer
+        except:
+            raise ColumnIdentifierError('Column identifier "%s" is neither an integer, nor an existing column\'s name.' % c)
+
+        # Fail out if index is 0-based
+        if c < 0:
+            raise ColumnIdentifierError('Column 0 is not valid; columns are 1-based.')
+
+        # Fail out if index is out of range
+        if c >= len(column_names):
+            raise ColumnIdentifierError('Index %i is beyond the last named column, "%s" at index %i.' % (c, column_names[-1], len(column_names) - 1))
+
+    return c
+
+def parse_column_identifiers(ids, column_names, zero_based=False, excluded_columns=None):
+    """
+    Parse a comma-separated list of column indices AND/OR names into a list of integer indices.
+    Ranges of integers can be specified with two integers separated by a '-' or ':' character. Ranges of 
+    non-integers (e.g. column names) are not supported.
+    Note: Column indices are 1-based. 
+    """
+    columns = []
+
+    # If not specified, start with all columns 
+    if not ids:
+        columns = range(len(column_names))        
+
+    if columns and not excluded_columns:
+        return columns
+
+    if not columns:
+        for c in ids.split(','):
+            c = c.strip()
+
+            try:
+                columns.append(match_column_identifier(column_names, c, zero_based))
+            except ColumnIdentifierError:
+                if ':' in c:
+                    a,b = c.split(':',1)
+                elif '-' in c:
+                    a,b = c.split('-',1)
+                else:
+                    raise
+                
+                try:
+                    if a:
+                        a = int(a)
+                    else:
+                        a = 1
+                    if b:
+                        b = int(b) + 1
+                    else:
+                        b = len(column_names) + 1
+                        
+                except ValueError:
+                    raise ColumnIdentifierError("Invalid range %s. Ranges must be two integers separated by a - or : character.")
+                
+                for x in range(a,b):
+                    columns.append(match_column_identifier(column_names, x, zero_based))
+
+    excludes = []
+    
+    if excluded_columns:
+        for c in excluded_columns.split(','):
+            c = c.strip()
+
+            try:
+                excludes.append(match_column_identifier(column_names, c, zero_based))
+            except ColumnIdentifierError:
+                if ':' in c:
+                    a,b = c.split(':',1)
+                elif '-' in c:
+                    a,b = c.split('-',1)
+                else:
+                    raise
+                
+                try:
+                    if a:
+                        a = int(a)
+                    else:
+                        a = 1
+                    if b:
+                        b = int(b) + 1
+                    else:
+                        b = len(column_names)
+                        
+                except ValueError:
+                    raise ColumnIdentifierError("Invalid range %s. Ranges must be two integers separated by a - or : character.")
+                
+                for x in range(a,b):
+                    excludes.append(match_column_identifier(column_names, x, zero_based))
+
+    return [c for c in columns if c not in excludes]
+
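For illustration, the column-identifier helpers above accept 1-based indices, names, and ranges. A minimal sketch with a made-up header:

    from csvkit.cli import match_column_identifier, parse_column_identifiers

    names = ['id', 'name', 'total']                 # hypothetical header row

    match_column_identifier(names, 'name')          # -> 1 (0-based index of the named column)
    match_column_identifier(names, 3)               # -> 2 (integers are treated as 1-based)

    parse_column_identifiers('1,3', names)          # -> [0, 2]
    parse_column_identifiers('2:3', names)          # -> [1, 2] (ranges are inclusive)
    parse_column_identifiers(None, names, excluded_columns='total')  # -> [0, 1]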
diff --git a/csvkit/convert/__init__.py b/csvkit/convert/__init__.py
new file mode 100644
index 0000000..28dd6c3
--- /dev/null
+++ b/csvkit/convert/__init__.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python
+
+import six
+
+from csvkit.convert.csvitself import csv2csv
+from csvkit.convert.fixed import fixed2csv
+from csvkit.convert.geojs import geojson2csv
+from csvkit.convert.js import json2csv
+from csvkit.convert.ndjs import ndjson2csv
+from csvkit.convert.xls import xls2csv
+from csvkit.convert.xlsx import xlsx2csv
+
+SUPPORTED_FORMATS = ['fixed', 'xls', 'xlsx', 'csv', 'json', 'geojson', 'ndjson']
+
+# DBF is supported for Python 2 only
+if six.PY2:
+    from csvkit.convert.dbase import dbf2csv
+
+    SUPPORTED_FORMATS.append('dbf')
+
+def convert(f, format, schema=None, key=None, **kwargs):
+    """
+    Convert a file of a specified format to CSV.
+    """
+    if not f:
+        raise ValueError('f must not be None')
+
+    if not format:
+        raise ValueError('format must not be None')
+
+    if format == 'fixed':
+        if not schema:
+            raise ValueError('schema must not be None when format is "fixed"')
+
+        return fixed2csv(f, schema, **kwargs)
+    elif format == 'xls':
+        return xls2csv(f, **kwargs)
+    elif format == 'xlsx':
+        return xlsx2csv(f, **kwargs)
+    elif format == 'json':
+        return json2csv(f, key, **kwargs)
+    elif format == 'ndjson':
+        return ndjson2csv(f, **kwargs)
+    elif format == 'geojson':
+        return geojson2csv(f, **kwargs)
+    elif format == 'csv':
+        return csv2csv(f, **kwargs)
+    elif format == 'dbf':
+        if six.PY3:
+            raise ValueError('format "dbf" is not supported forthis version of Python.')
+        return dbf2csv(f, **kwargs)
+    else:
+        raise ValueError('format "%s" is not supported' % format)
+
+def guess_format(filename):
+    """
+    Try to guess a file's format based on its extension (or lack thereof).
+    """
+    last_period = filename.rfind('.')
+
+    if last_period == -1:
+        # No extension: assume fixed-width
+        return 'fixed'
+
+    extension = filename[last_period + 1:]
+
+    if extension == 'xls':
+        return extension
+    elif extension == 'xlsx':
+        return extension
+    elif extension in ['json', 'js']:
+        return 'json' 
+    elif extension == 'csv':
+        return extension
+    elif extension == 'fixed':
+        return extension
+    elif extension == 'dbf':
+        return extension
+
+    return None
+
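For illustration, convert() and guess_format() above are what in2csv builds on. A minimal sketch, assuming a hypothetical 'report.xlsx':

    from csvkit.convert import convert, guess_format

    filename = 'report.xlsx'                        # hypothetical input file
    format = guess_format(filename)                 # -> 'xlsx'

    with open(filename, 'rb') as f:
        csv_data = convert(f, format)               # the converted data is returned as a CSV string

    print(csv_data)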
diff --git a/csvkit/convert/csvitself.py b/csvkit/convert/csvitself.py
new file mode 100644
index 0000000..4b6f597
--- /dev/null
+++ b/csvkit/convert/csvitself.py
@@ -0,0 +1,18 @@
+#!/usr/bin/env python
+
+import six
+
+from csvkit import table
+
+def csv2csv(f, **kwargs):
+    """
+    "Convert" a CSV into a new CSV by normalizing types and correcting for other anomalies.
+    """
+    tab = table.Table.from_csv(f, **kwargs) 
+
+    o = six.StringIO()
+    output = tab.to_csv(o)
+    output = o.getvalue()
+    o.close()
+
+    return output
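For illustration, csv2csv round-trips a CSV through csvkit's Table and type inference. A minimal sketch; the input file is hypothetical:

    from csvkit.convert.csvitself import csv2csv

    with open('example.csv') as f:                  # hypothetical input file
        normalized = csv2csv(f)                     # re-serialized CSV returned as a string

    print(normalized)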
diff --git a/csvkit/convert/dbase.py b/csvkit/convert/dbase.py
new file mode 100644
index 0000000..e2e7b77
--- /dev/null
+++ b/csvkit/convert/dbase.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+
+"""
+Note: dbf is only supported/imported for Python 2.
+"""
+
+import dbf
+import six
+
+from csvkit import table
+
+def dbf2csv(f, **kwargs):
+    """
+    Convert a dBASE .dbf file to csv.
+    """
+    with dbf.Table(f.name) as db:
+        headers = db.field_names
+
+        column_ids = range(len(headers))
+
+        data_columns = [[] for c in headers]
+
+        for row in db:
+            for i, d in enumerate(row):
+                try:
+                    data_columns[i].append(six.text_type(row[column_ids[i]]).strip())
+                except IndexError:
+                    # Non-rectangular data is truncated
+                    break
+
+        columns = []
+
+        for i, c in enumerate(data_columns):
+            columns.append(table.Column(column_ids[i], headers[i], c))
+
+        tab = table.Table(columns=columns) 
+
+        o = six.StringIO()
+        output = tab.to_csv(o)
+        output = o.getvalue()
+        o.close()
+
+        return output
+
diff --git a/csvkit/convert/fixed.py b/csvkit/convert/fixed.py
new file mode 100644
index 0000000..ab7b7af
--- /dev/null
+++ b/csvkit/convert/fixed.py
@@ -0,0 +1,140 @@
+#!/usr/bin/env python
+
+from collections import namedtuple
+from codecs import iterdecode
+
+import six
+
+from csvkit import CSVKitReader, CSVKitWriter
... 3741 lines suppressed ...

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/python-modules/packages/csvkit.git


