[Python-modules-commits] [csvkit] 01/04: Imported Upstream version 0.9.1
Sandro Tosi
morph at moszumanska.debian.org
Wed Jul 1 15:34:20 UTC 2015
This is an automated email from the git hooks/post-receive script.
morph pushed a commit to branch bpo80
in repository csvkit.
commit eb0ec9088835d28024fc42370694deaf447444e3
Author: Sandro Tosi <morph at debian.org>
Date: Wed Jul 1 10:16:42 2015 -0400
Imported Upstream version 0.9.1
---
PKG-INFO | 39 ++++
README | 11 +
csvkit.egg-info/PKG-INFO | 39 ++++
csvkit.egg-info/SOURCES.txt | 46 ++++
csvkit.egg-info/dependency_links.txt | 1 +
csvkit.egg-info/entry_points.txt | 16 ++
csvkit.egg-info/requires.txt | 6 +
csvkit.egg-info/top_level.txt | 1 +
csvkit/__init__.py | 34 +++
csvkit/cleanup.py | 113 ++++++++++
csvkit/cli.py | 396 +++++++++++++++++++++++++++++++++++
csvkit/convert/__init__.py | 81 +++++++
csvkit/convert/csvitself.py | 18 ++
csvkit/convert/dbase.py | 44 ++++
csvkit/convert/fixed.py | 140 +++++++++++++
csvkit/convert/geojs.py | 73 +++++++
csvkit/convert/js.py | 77 +++++++
csvkit/convert/ndjs.py | 76 +++++++
csvkit/convert/xls.py | 155 ++++++++++++++
csvkit/convert/xlsx.py | 95 +++++++++
csvkit/exceptions.py | 93 ++++++++
csvkit/grep.py | 117 +++++++++++
csvkit/headers.py | 7 +
csvkit/join.py | 185 ++++++++++++++++
csvkit/py2.py | 104 +++++++++
csvkit/py3.py | 119 +++++++++++
csvkit/sniffer.py | 18 ++
csvkit/sql.py | 102 +++++++++
csvkit/table.py | 291 +++++++++++++++++++++++++
csvkit/typeinference.py | 248 ++++++++++++++++++++++
csvkit/unicsv.py | 144 +++++++++++++
csvkit/utilities/__init__.py | 0
csvkit/utilities/csvclean.py | 80 +++++++
csvkit/utilities/csvcut.py | 68 ++++++
csvkit/utilities/csvformat.py | 64 ++++++
csvkit/utilities/csvgrep.py | 69 ++++++
csvkit/utilities/csvjoin.py | 115 ++++++++++
csvkit/utilities/csvjson.py | 194 +++++++++++++++++
csvkit/utilities/csvlook.py | 80 +++++++
csvkit/utilities/csvpy.py | 41 ++++
csvkit/utilities/csvsort.py | 63 ++++++
csvkit/utilities/csvsql.py | 163 ++++++++++++++
csvkit/utilities/csvstack.py | 89 ++++++++
csvkit/utilities/csvstat.py | 250 ++++++++++++++++++++++
csvkit/utilities/in2csv.py | 86 ++++++++
csvkit/utilities/sql2csv.py | 68 ++++++
setup.cfg | 5 +
setup.py | 75 +++++++
48 files changed, 4399 insertions(+)
diff --git a/PKG-INFO b/PKG-INFO
new file mode 100644
index 0000000..e3cef1b
--- /dev/null
+++ b/PKG-INFO
@@ -0,0 +1,39 @@
+Metadata-Version: 1.1
+Name: csvkit
+Version: 0.9.1
+Summary: A library of utilities for working with CSV, the king of tabular file formats.
+Home-page: http://csvkit.rtfd.org/
+Author: Christopher Groskopf
+Author-email: staringmonkey at gmail.com
+License: MIT
+Description: csvkit is a suite of utilities for converting to and working with CSV, the king of tabular file formats.
+
+ It is inspired by pdftk, gdal and the original csvcut utility by Joe Germuska and Aaron Bycoffe.
+
+ Important links:
+
+ * Repository: https://github.com/onyxfish/csvkit
+ * Issues: https://github.com/onyxfish/csvkit/issues
+ * Documentation: http://csvkit.rtfd.org/
+ * Schemas: https://github.com/onyxfish/ffs
+ * Buildbot: https://travis-ci.org/onyxfish/csvkit
+
+Platform: UNKNOWN
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Environment :: Console
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: End Users/Desktop
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Natural Language :: English
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 2.6
+Classifier: Programming Language :: Python :: 2.7
+Classifier: Programming Language :: Python :: 3.3
+Classifier: Programming Language :: Python :: 3.4
+Classifier: Programming Language :: Python :: Implementation :: CPython
+Classifier: Programming Language :: Python :: Implementation :: PyPy
+Classifier: Topic :: Scientific/Engineering :: Information Analysis
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Topic :: Utilities
diff --git a/README b/README
new file mode 100644
index 0000000..0d5677a
--- /dev/null
+++ b/README
@@ -0,0 +1,11 @@
+csvkit is a suite of utilities for converting to and working with CSV, the king of tabular file formats.
+
+It is inspired by pdftk, gdal and the original csvcut utility by Joe Germuska and Aaron Bycoffe.
+
+Important links:
+
+* Repository: https://github.com/onyxfish/csvkit
+* Issues: https://github.com/onyxfish/csvkit/issues
+* Documentation: http://csvkit.rtfd.org/
+* Schemas: https://github.com/onyxfish/ffs
+* Buildbot: https://travis-ci.org/onyxfish/csvkit
diff --git a/csvkit.egg-info/PKG-INFO b/csvkit.egg-info/PKG-INFO
new file mode 100644
index 0000000..e3cef1b
--- /dev/null
+++ b/csvkit.egg-info/PKG-INFO
@@ -0,0 +1,39 @@
+Metadata-Version: 1.1
+Name: csvkit
+Version: 0.9.1
+Summary: A library of utilities for working with CSV, the king of tabular file formats.
+Home-page: http://csvkit.rtfd.org/
+Author: Christopher Groskopf
+Author-email: staringmonkey at gmail.com
+License: MIT
+Description: csvkit is a suite of utilities for converting to and working with CSV, the king of tabular file formats.
+
+ It is inspired by pdftk, gdal and the original csvcut utility by Joe Germuska and Aaron Bycoffe.
+
+ Important links:
+
+ * Repository: https://github.com/onyxfish/csvkit
+ * Issues: https://github.com/onyxfish/csvkit/issues
+ * Documentation: http://csvkit.rtfd.org/
+ * Schemas: https://github.com/onyxfish/ffs
+ * Buildbot: https://travis-ci.org/onyxfish/csvkit
+
+Platform: UNKNOWN
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Environment :: Console
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: End Users/Desktop
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Natural Language :: English
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 2.6
+Classifier: Programming Language :: Python :: 2.7
+Classifier: Programming Language :: Python :: 3.3
+Classifier: Programming Language :: Python :: 3.4
+Classifier: Programming Language :: Python :: Implementation :: CPython
+Classifier: Programming Language :: Python :: Implementation :: PyPy
+Classifier: Topic :: Scientific/Engineering :: Information Analysis
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Topic :: Utilities
diff --git a/csvkit.egg-info/SOURCES.txt b/csvkit.egg-info/SOURCES.txt
new file mode 100644
index 0000000..2b15f5e
--- /dev/null
+++ b/csvkit.egg-info/SOURCES.txt
@@ -0,0 +1,46 @@
+README
+setup.py
+csvkit/__init__.py
+csvkit/cleanup.py
+csvkit/cli.py
+csvkit/exceptions.py
+csvkit/grep.py
+csvkit/headers.py
+csvkit/join.py
+csvkit/py2.py
+csvkit/py3.py
+csvkit/sniffer.py
+csvkit/sql.py
+csvkit/table.py
+csvkit/typeinference.py
+csvkit/unicsv.py
+csvkit.egg-info/PKG-INFO
+csvkit.egg-info/SOURCES.txt
+csvkit.egg-info/dependency_links.txt
+csvkit.egg-info/entry_points.txt
+csvkit.egg-info/requires.txt
+csvkit.egg-info/top_level.txt
+csvkit/convert/__init__.py
+csvkit/convert/csvitself.py
+csvkit/convert/dbase.py
+csvkit/convert/fixed.py
+csvkit/convert/geojs.py
+csvkit/convert/js.py
+csvkit/convert/ndjs.py
+csvkit/convert/xls.py
+csvkit/convert/xlsx.py
+csvkit/utilities/__init__.py
+csvkit/utilities/csvclean.py
+csvkit/utilities/csvcut.py
+csvkit/utilities/csvformat.py
+csvkit/utilities/csvgrep.py
+csvkit/utilities/csvjoin.py
+csvkit/utilities/csvjson.py
+csvkit/utilities/csvlook.py
+csvkit/utilities/csvpy.py
+csvkit/utilities/csvsort.py
+csvkit/utilities/csvsql.py
+csvkit/utilities/csvstack.py
+csvkit/utilities/csvstat.py
+csvkit/utilities/in2csv.py
+csvkit/utilities/sql2csv.py
\ No newline at end of file
diff --git a/csvkit.egg-info/dependency_links.txt b/csvkit.egg-info/dependency_links.txt
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/csvkit.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
diff --git a/csvkit.egg-info/entry_points.txt b/csvkit.egg-info/entry_points.txt
new file mode 100644
index 0000000..811415c
--- /dev/null
+++ b/csvkit.egg-info/entry_points.txt
@@ -0,0 +1,16 @@
+[console_scripts]
+csvclean = csvkit.utilities.csvclean:launch_new_instance
+csvcut = csvkit.utilities.csvcut:launch_new_instance
+csvformat = csvkit.utilities.csvformat:launch_new_instance
+csvgrep = csvkit.utilities.csvgrep:launch_new_instance
+csvjoin = csvkit.utilities.csvjoin:launch_new_instance
+csvjson = csvkit.utilities.csvjson:launch_new_instance
+csvlook = csvkit.utilities.csvlook:launch_new_instance
+csvpy = csvkit.utilities.csvpy:launch_new_instance
+csvsort = csvkit.utilities.csvsort:launch_new_instance
+csvsql = csvkit.utilities.csvsql:launch_new_instance
+csvstack = csvkit.utilities.csvstack:launch_new_instance
+csvstat = csvkit.utilities.csvstat:launch_new_instance
+in2csv = csvkit.utilities.in2csv:launch_new_instance
+sql2csv = csvkit.utilities.sql2csv:launch_new_instance
+
diff --git a/csvkit.egg-info/requires.txt b/csvkit.egg-info/requires.txt
new file mode 100644
index 0000000..a15d620
--- /dev/null
+++ b/csvkit.egg-info/requires.txt
@@ -0,0 +1,6 @@
+xlrd>=0.7.1
+sqlalchemy>=0.6.6
+openpyxl==2.2.0-b1
+six>=1.6.1
+python-dateutil==2.2
+dbf==0.94.003
diff --git a/csvkit.egg-info/top_level.txt b/csvkit.egg-info/top_level.txt
new file mode 100644
index 0000000..36204a5
--- /dev/null
+++ b/csvkit.egg-info/top_level.txt
@@ -0,0 +1 @@
+csvkit
diff --git a/csvkit/__init__.py b/csvkit/__init__.py
new file mode 100644
index 0000000..a170278
--- /dev/null
+++ b/csvkit/__init__.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python
+
+"""
+This module contains csvkit's superpowered replacement for the builtin :mod:`csv` module. For Python 2 users, the greatest improvement over the standard library is full unicode support. Python 3's :mod:`csv` module supports unicode internally, so this module is provided primarily for compatibility purposes.
+
+* Python 2: :mod:`csvkit.py2`.
+* Python 3: :mod:`csvkit.py3`.
+"""
+
+import six
+
+if six.PY2:
+ from csvkit import py2
+
+ CSVKitReader = py2.CSVKitReader
+ CSVKitWriter = py2.CSVKitWriter
+ CSVKitDictReader = py2.CSVKitDictReader
+ CSVKitDictWriter = py2.CSVKitDictWriter
+ reader = py2.reader
+ writer = py2.writer
+ DictReader = py2.CSVKitDictReader
+ DictWriter = py2.CSVKitDictWriter
+else:
+ from csvkit import py3
+
+ CSVKitReader = py3.CSVKitReader
+ CSVKitWriter = py3.CSVKitWriter
+ CSVKitDictReader = py3.CSVKitDictReader
+ CSVKitDictWriter = py3.CSVKitDictWriter
+ reader = py3.reader
+ writer = py3.writer
+ DictReader = py3.CSVKitDictReader
+ DictWriter = py3.CSVKitDictWriter
+
diff --git a/csvkit/cleanup.py b/csvkit/cleanup.py
new file mode 100644
index 0000000..6fc241a
--- /dev/null
+++ b/csvkit/cleanup.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python
+
+from csvkit.exceptions import CSVTestException, LengthMismatchError
+
+def join_rows(rows, joiner=' '):
+ """
+ Given a series of rows, return them as a single row where the inner edge cells are merged. By default joins with a single space character, but you can specify new-line, empty string, or anything else with the 'joiner' kwarg.
+ """
+ rows = list(rows)
+ fixed_row = rows[0][:]
+
+ for row in rows[1:]:
+ if len(row) == 0:
+ row = ['']
+
+ fixed_row[-1] += "%s%s" % (joiner, row[0])
+ fixed_row.extend(row[1:])
+
+ return fixed_row
+
+def fix_length_errors(errs, target_line_length, joiner=' '):
+ """
+ If possible, transform the rows backed up in the list of errors into rows of the correct length.
+ If the list of errors does not yet produce a row of target_line_length, return an empty array.
+ """
+ if not errs:
+ return []
+
+ fixed_rows = []
+ backlog = []
+
+ for err in errs:
+ if type(err) is not LengthMismatchError:
+ return [] # give up if any are not length errors
+
+ backlog.append(err)
+ fixed_row = join_rows([err.row for err in backlog])
+
+ if len(fixed_row) == target_line_length:
+ fixed_rows.append(fixed_row)
+ backlog = [] # reset
+
+ return fixed_rows
+
+def extract_joinable_row_errors(errs):
+ joinable = []
+
+ for err in reversed(errs):
+ if type(err) is not LengthMismatchError:
+ break
+
+ if joinable and err.line_number != joinable[-1].line_number - 1:
+ break
+
+ joinable.append(err)
+
+ joinable.reverse()
+
+ return joinable
+
+class RowChecker(object):
+ """
+ Iterate over rows of a CSV producing cleaned rows and storing error rows.
+ """
+ def __init__(self, reader):
+ self.reader = reader
+ self.column_names = next(reader)
+
+ self.errors = []
+ self.rows_joined = 0
+ self.joins = 0
+
+ def checked_rows(self):
+ """
+ A generator which yields rows which are ready to write to output.
+ """
+ line_number = self.reader.line_num
+
+ for row in self.reader:
+ try:
+ if len(row) != len(self.column_names):
+ raise LengthMismatchError(line_number, row, len(self.column_names))
+
+ yield row
+ except LengthMismatchError as e:
+ self.errors.append(e)
+
+ joinable_row_errors = extract_joinable_row_errors(self.errors)
+
+ while joinable_row_errors:
+ fixed_row = join_rows([err.row for err in joinable_row_errors], joiner=' ')
+
+ if len(fixed_row) < len(self.column_names):
+ break
+
+ if len(fixed_row) == len(self.column_names):
+ self.rows_joined += len(joinable_row_errors)
+ self.joins += 1
+
+ yield fixed_row
+
+ for fixed in joinable_row_errors:
+ self.errors.remove(fixed)
+
+ break
+
+ joinable_row_errors = joinable_row_errors[1:] # keep trying in case we're too long because of a straggler
+
+ except CSVTestException as e:
+ self.errors.append(e)
+
+ line_number = self.reader.line_num
+
diff --git a/csvkit/cli.py b/csvkit/cli.py
new file mode 100644
index 0000000..5bd811b
--- /dev/null
+++ b/csvkit/cli.py
@@ -0,0 +1,396 @@
+#!/usr/bin/env python
+
+import argparse
+import bz2
+import codecs
+import gzip
+import os.path
+import sys
+
+import six
+
+from csvkit import CSVKitReader
+from csvkit.exceptions import ColumnIdentifierError, RequiredHeaderError
+
+def lazy_opener(fn):
+ def wrapped(self, *args, **kwargs):
+ self._lazy_open()
+ fn(*args, **kwargs)
+ return wrapped
+
+class LazyFile(six.Iterator):
+ """
+ A proxy for a File object that delays opening it until
+ a read method is called.
+
+ Currently this implements only the minimum methods to be useful,
+ but it could easily be expanded.
+ """
+ def __init__(self, init, *args, **kwargs):
+ self.init = init
+ self.f = None
+ self._is_lazy_opened = False
+
+ self._lazy_args = args
+ self._lazy_kwargs = kwargs
+
+ def __getattr__(self, name):
+ if not self._is_lazy_opened:
+ self.f = self.init(*self._lazy_args, **self._lazy_kwargs)
+ self._is_lazy_opened = True
+
+ return getattr(self.f, name)
+
+ def __iter__(self):
+ return self
+
+ def close(self):
+ self.f.close()
+ self.f = None
+ self._is_lazy_opened = False
+
+ def __next__(self):
+ if not self._is_lazy_opened:
+ self.f = self.init(*self._lazy_args, **self._lazy_kwargs)
+ self._is_lazy_opened = True
+
+ return next(self.f)
+
+class CSVKitUtility(object):
+ description = ''
+ epilog = ''
+ override_flags = ''
+
+ def __init__(self, args=None, output_file=None):
+ """
+ Perform argument processing and other setup for a CSVKitUtility.
+ """
+ self._init_common_parser()
+ self.add_arguments()
+ self.args = self.argparser.parse_args(args)
+
+ if 'f' not in self.override_flags:
+ self.input_file = self._open_input_file(self.args.input_path)
+
+ self.reader_kwargs = self._extract_csv_reader_kwargs()
+ self.writer_kwargs = self._extract_csv_writer_kwargs()
+
+ self._install_exception_handler()
+
+ if output_file is None:
+ self.output_file = sys.stdout
+ else:
+ self.output_file = output_file
+
+ # Ensure SIGPIPE doesn't throw an exception
+ # Prevents [Errno 32] Broken pipe errors, e.g. when piping to 'head'
+ # To test from the shell:
+ # python -c "for i in range(5000): print 'a,b,c'" | csvlook | head
+ # Without this fix you will see at the end:
+ # [Errno 32] Broken pipe
+ # With this fix, there should be no error
+ # For details on Python and SIGPIPE, see http://bugs.python.org/issue1652
+ try:
+ import signal
+ signal.signal(signal.SIGPIPE, signal.SIG_DFL)
+ except (ImportError, AttributeError):
+ #Do nothing on platforms that don't have signals or don't have SIGPIPE
+ pass
+
+ def add_arguments(self):
+ """
+ Called upon initialization once the parser for common arguments has been constructed.
+
+ Should be overridden by individual utilities.
+ """
+ raise NotImplementedError('add_arguments must be provided by each subclass of CSVKitUtility.')
+
+ def main(self):
+ """
+ Main loop of the utility.
+
+ Should be overridden by individual utilities and explicitly called by the executing script.
+ """
+ raise NotImplementedError(' must be provided by each subclass of CSVKitUtility.')
+
+ def _init_common_parser(self):
+ """
+ Prepare a base argparse argument parser so that flags are consistent across different shell command tools.
+ If you want to constrain which common args are present, set the class attribute 'override_flags' to a string. Any argument
+ whose single-letter form is contained in 'override_flags' will be left out of the configured parser. Use 'f' for
+ file.
+ """
+ self.argparser = argparse.ArgumentParser(description=self.description, epilog=self.epilog)
+
+ # Input
+ if 'f' not in self.override_flags:
+ self.argparser.add_argument(metavar="FILE", nargs='?', dest='input_path',
+ help='The CSV file to operate on. If omitted, will accept input on STDIN.')
+ if 'd' not in self.override_flags:
+ self.argparser.add_argument('-d', '--delimiter', dest='delimiter',
+ help='Delimiting character of the input CSV file.')
+ if 't' not in self.override_flags:
+ self.argparser.add_argument('-t', '--tabs', dest='tabs', action='store_true',
+ help='Specifies that the input CSV file is delimited with tabs. Overrides "-d".')
+ if 'q' not in self.override_flags:
+ self.argparser.add_argument('-q', '--quotechar', dest='quotechar',
+ help='Character used to quote strings in the input CSV file.')
+ if 'u' not in self.override_flags:
+ self.argparser.add_argument('-u', '--quoting', dest='quoting', type=int, choices=[0,1,2,3],
+ help='Quoting style used in the input CSV file. 0 = Quote Minimal, 1 = Quote All, 2 = Quote Non-numeric, 3 = Quote None.')
+ if 'b' not in self.override_flags:
+ self.argparser.add_argument('-b', '--doublequote', dest='doublequote', action='store_true',
+ help='Whether or not double quotes are doubled in the input CSV file.')
+ if 'p' not in self.override_flags:
+ self.argparser.add_argument('-p', '--escapechar', dest='escapechar',
+ help='Character used to escape the delimiter if --quoting 3 ("Quote None") is specified and to escape the QUOTECHAR if --doublequote is not specified.')
+ if 'z' not in self.override_flags:
+ self.argparser.add_argument('-z', '--maxfieldsize', dest='maxfieldsize', type=int,
+ help='Maximum length of a single field in the input CSV file.')
+ if 'e' not in self.override_flags:
+ self.argparser.add_argument('-e', '--encoding', dest='encoding', default='utf-8',
+ help='Specify the encoding the input CSV file.')
+ if 'S' not in self.override_flags:
+ self.argparser.add_argument('-S', '--skipinitialspace', dest='skipinitialspace', default=False, action='store_true',
+ help='Ignore whitespace immediately following the delimiter.')
+ if 'H' not in self.override_flags:
+ self.argparser.add_argument('-H', '--no-header-row', dest='no_header_row', action='store_true',
+ help='Specifies that the input CSV file has no header row. Will create default headers.')
+ if 'v' not in self.override_flags:
+ self.argparser.add_argument('-v', '--verbose', dest='verbose', action='store_true',
+ help='Print detailed tracebacks when errors occur.')
+
+ # Output
+ if 'l' not in self.override_flags:
+ self.argparser.add_argument('-l', '--linenumbers', dest='line_numbers', action='store_true',
+ help='Insert a column of line numbers at the front of the output. Useful when piping to grep or as a simple primary key.')
+
+ # Input/Output
+ if 'zero' not in self.override_flags:
+ self.argparser.add_argument('--zero', dest='zero_based', action='store_true',
+ help='When interpreting or displaying column numbers, use zero-based numbering instead of the default 1-based numbering.')
+
+ def _open_input_file(self, path):
+ """
+ Open the input file specified on the command line.
+ """
+ if six.PY2:
+ mode = 'rb'
+ kwargs = {}
+ else:
+ mode = 'rt'
+ kwargs = { 'encoding': self.args.encoding }
+
+ if not path or path == '-':
+ f = sys.stdin
+ else:
+ (_, extension) = os.path.splitext(path)
+
+ if extension == u'.gz':
+ f = LazyFile(gzip.open, path, mode, **kwargs)
+ elif extension == '.bz2':
+ if six.PY2:
+ f = LazyFile(bz2.BZ2File, path, mode, **kwargs)
+ else:
+ f = LazyFile(bz2.open, path, mode, **kwargs)
+ else:
+ f = LazyFile(open, path, mode, **kwargs)
+
+ return f
+
+ def _extract_csv_reader_kwargs(self):
+ """
+ Extracts from the command-line arguments those that should be passed through to the input CSV reader(s).
+ """
+ kwargs = {}
+
+ if self.args.tabs:
+ kwargs['delimiter'] = '\t'
+ elif self.args.delimiter:
+ kwargs['delimiter'] = self.args.delimiter
+
+ if self.args.quotechar:
+ kwargs['quotechar'] = self.args.quotechar
+
+ if self.args.quoting:
+ kwargs['quoting'] = self.args.quoting
+
+ if self.args.doublequote:
+ kwargs['doublequote'] = self.args.doublequote
+
+ if self.args.escapechar:
+ kwargs['escapechar'] = self.args.escapechar
+
+ if self.args.maxfieldsize:
+ kwargs['maxfieldsize'] = self.args.maxfieldsize
+
+ if self.args.skipinitialspace:
+ kwargs['skipinitialspace'] = self.args.skipinitialspace
+
+ if six.PY2 and self.args.encoding:
+ kwargs['encoding'] = self.args.encoding
+
+ return kwargs
+
+ def _extract_csv_writer_kwargs(self):
+ """
+ Extracts from the command-line arguments those that should be passed through to the output CSV writer.
+ """
+ kwargs = {}
+
+ if 'l' not in self.override_flags and self.args.line_numbers:
+ kwargs['line_numbers'] = True
+
+ return kwargs
+
+ def _install_exception_handler(self):
+ """
+ Installs a replacement for sys.excepthook, which handles pretty-printing uncaught exceptions.
+ """
+ if six.PY2:
+ sys.stderr = codecs.getwriter('utf-8')(sys.stderr)
+
+ def handler(t, value, traceback):
+ if self.args.verbose:
+ sys.__excepthook__(t, value, traceback)
+ else:
+ # Special case handling for Unicode errors, which behave very strangely
+ # when cast with unicode()
+ if t == UnicodeDecodeError:
+ sys.stderr.write('Your file is not "%s" encoded. Please specify the correct encoding with the -e flag. Use the -v flag to see the complete error.\n' % self.args.encoding)
+ else:
+ sys.stderr.write('%s\n' % six.text_type(value))
+
+ sys.excepthook = handler
+
+ def print_column_names(self):
+ """
+ Pretty-prints the names and indices of all columns to a file-like object (usually sys.stdout).
+ """
+ if self.args.no_header_row:
+ raise RequiredHeaderError('You cannot use --no-header-row with the -n or --names options.')
+
+ f = self.input_file
+ output = self.output_file
+
+ try:
+ zero_based=self.args.zero_based
+ except:
+ zero_based=False
+
+ rows = CSVKitReader(f, **self.reader_kwargs)
+ column_names = next(rows)
+
+ for i, c in enumerate(column_names):
+ if not zero_based:
+ i += 1
+ output.write('%3i: %s\n' % (i, c))
+
+
+def match_column_identifier(column_names, c, zero_based=False):
+ """
+ Determine what column a single column id (name or index) matches in a series of column names.
+ Note that integer values are *always* treated as positional identifiers. If you happen to have
+ column names which are also integers, you must specify them using a positional index.
+ """
+ if isinstance(c, six.string_types) and not c.isdigit() and c in column_names:
+ return column_names.index(c)
+ else:
+ try:
+ c = int(c)
+ if not zero_based:
+ c -= 1
+ # Fail out if neither a column name nor an integer
+ except:
+ raise ColumnIdentifierError('Column identifier "%s" is neither an integer, nor a existing column\'s name.' % c)
+
+ # Fail out if index is 0-based
+ if c < 0:
+ raise ColumnIdentifierError('Column 0 is not valid; columns are 1-based.')
+
+ # Fail out if index is out of range
+ if c >= len(column_names):
+ raise ColumnIdentifierError('Index %i is beyond the last named column, "%s" at index %i.' % (c, column_names[-1], len(column_names) - 1))
+
+ return c
+
+def parse_column_identifiers(ids, column_names, zero_based=False, excluded_columns=None):
+ """
+ Parse a comma-separated list of column indices AND/OR names into a list of integer indices.
+ Ranges of integers can be specified with two integers separated by a '-' or ':' character. Ranges of
+ non-integers (e.g. column names) are not supported.
+ Note: Column indices are 1-based.
+ """
+ columns = []
+
+ # If not specified, start with all columns
+ if not ids:
+ columns = range(len(column_names))
+
+ if columns and not excluded_columns:
+ return columns
+
+ if not columns:
+ for c in ids.split(','):
+ c = c.strip()
+
+ try:
+ columns.append(match_column_identifier(column_names, c, zero_based))
+ except ColumnIdentifierError:
+ if ':' in c:
+ a,b = c.split(':',1)
+ elif '-' in c:
+ a,b = c.split('-',1)
+ else:
+ raise
+
+ try:
+ if a:
+ a = int(a)
+ else:
+ a = 1
+ if b:
+ b = int(b) + 1
+ else:
+ b = len(column_names) + 1
+
+ except ValueError:
+ raise ColumnIdentifierError("Invalid range %s. Ranges must be two integers separated by a - or : character.")
+
+ for x in range(a,b):
+ columns.append(match_column_identifier(column_names, x, zero_based))
+
+ excludes = []
+
+ if excluded_columns:
+ for c in excluded_columns.split(','):
+ c = c.strip()
+
+ try:
+ excludes.append(match_column_identifier(column_names, c, zero_based))
+ except ColumnIdentifierError:
+ if ':' in c:
+ a,b = c.split(':',1)
+ elif '-' in c:
+ a,b = c.split('-',1)
+ else:
+ raise
+
+ try:
+ if a:
+ a = int(a)
+ else:
+ a = 1
+ if b:
+ b = int(b) + 1
+ else:
+ b = len(column_names)
+
+ except ValueError:
+ raise ColumnIdentifierError("Invalid range %s. Ranges must be two integers separated by a - or : character.")
+
+ for x in range(a,b):
+ excludes.append(match_column_identifier(column_names, x, zero_based))
+
+ return [c for c in columns if c not in excludes]
+
diff --git a/csvkit/convert/__init__.py b/csvkit/convert/__init__.py
new file mode 100644
index 0000000..28dd6c3
--- /dev/null
+++ b/csvkit/convert/__init__.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python
+
+import six
+
+from csvkit.convert.csvitself import csv2csv
+from csvkit.convert.fixed import fixed2csv
+from csvkit.convert.geojs import geojson2csv
+from csvkit.convert.js import json2csv
+from csvkit.convert.ndjs import ndjson2csv
+from csvkit.convert.xls import xls2csv
+from csvkit.convert.xlsx import xlsx2csv
+
+SUPPORTED_FORMATS = ['fixed', 'xls', 'xlsx', 'csv', 'json', 'geojson', 'ndjson']
+
+# DBF is supported for Python 2 only
+if six.PY2:
+ from csvkit.convert.dbase import dbf2csv
+
+ SUPPORTED_FORMATS.append('dbf')
+
+def convert(f, format, schema=None, key=None, **kwargs):
+ """
+ Convert a file of a specified format to CSV.
+ """
+ if not f:
+ raise ValueError('f must not be None')
+
+ if not format:
+ raise ValueError('format must not be None')
+
+ if format == 'fixed':
+ if not schema:
+ raise ValueError('schema must not be null when format is "fixed"')
+
+ return fixed2csv(f, schema, **kwargs)
+ elif format == 'xls':
+ return xls2csv(f, **kwargs)
+ elif format == 'xlsx':
+ return xlsx2csv(f, **kwargs)
+ elif format == 'json':
+ return json2csv(f, key, **kwargs)
+ elif format == 'ndjson':
+ return ndjson2csv(f, **kwargs)
+ elif format == 'geojson':
+ return geojson2csv(f, **kwargs)
+ elif format == 'csv':
+ return csv2csv(f, **kwargs)
+ elif format == 'dbf':
+ if six.PY3:
+ raise ValueError('format "dbf" is not supported forthis version of Python.')
+ return dbf2csv(f, **kwargs)
+ else:
+ raise ValueError('format "%s" is not supported' % format)
+
+def guess_format(filename):
+ """
+ Try to guess a file's format based on its extension (or lack thereof).
+ """
+ last_period = filename.rfind('.')
+
+ if last_period == -1:
+ # No extension: assume fixed-width
+ return 'fixed'
+
+ extension = filename[last_period + 1:]
+
+ if extension == 'xls':
+ return extension
+ elif extension == 'xlsx':
+ return extension
+ elif extension in ['json', 'js']:
+ return 'json'
+ elif extension == 'csv':
+ return extension
+ elif extension == 'fixed':
+ return extension
+ elif extension == 'dbf':
+ return extension
+
+ return None
+
diff --git a/csvkit/convert/csvitself.py b/csvkit/convert/csvitself.py
new file mode 100644
index 0000000..4b6f597
--- /dev/null
+++ b/csvkit/convert/csvitself.py
@@ -0,0 +1,18 @@
+#!/usr/bin/env python
+
+import six
+
+from csvkit import table
+
+def csv2csv(f, **kwargs):
+ """
+ "Convert" a CSV into a new CSV by normalizing types and correcting for other anomalies.
+ """
+ tab = table.Table.from_csv(f, **kwargs)
+
+ o = six.StringIO()
+ output = tab.to_csv(o)
+ output = o.getvalue()
+ o.close()
+
+ return output
diff --git a/csvkit/convert/dbase.py b/csvkit/convert/dbase.py
new file mode 100644
index 0000000..e2e7b77
--- /dev/null
+++ b/csvkit/convert/dbase.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+
+"""
+Note: dbf is only supported/imported for Python 2.
+"""
+
+import dbf
+import six
+
+from csvkit import table
+
+def dbf2csv(f, **kwargs):
+ """
+ Convert a dBASE .dbf file to csv.
+ """
+ with dbf.Table(f.name) as db:
+ headers = db.field_names
+
+ column_ids = range(len(headers))
+
+ data_columns = [[] for c in headers]
+
+ for row in db:
+ for i, d in enumerate(row):
+ try:
+ data_columns[i].append(six.text_type(row[column_ids[i]]).strip())
+ except IndexError:
+ # Non-rectangular data is truncated
+ break
+
+ columns = []
+
+ for i, c in enumerate(data_columns):
+ columns.append(table.Column(column_ids[i], headers[i], c))
+
+ tab = table.Table(columns=columns)
+
+ o = six.StringIO()
+ output = tab.to_csv(o)
+ output = o.getvalue()
+ o.close()
+
+ return output
+
diff --git a/csvkit/convert/fixed.py b/csvkit/convert/fixed.py
new file mode 100644
index 0000000..ab7b7af
--- /dev/null
+++ b/csvkit/convert/fixed.py
@@ -0,0 +1,140 @@
+#!/usr/bin/env python
+
+from collections import namedtuple
+from codecs import iterdecode
+
+import six
+
+from csvkit import CSVKitReader, CSVKitWriter
... 3741 lines suppressed ...
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/python-modules/packages/csvkit.git
More information about the Python-modules-commits
mailing list