[tika] 10/11: Ignore some tika-parser classes...
Markus Koschany
apo-guest at moszumanska.debian.org
Tue Dec 1 19:19:50 UTC 2015
This is an automated email from the git hooks/post-receive script.
apo-guest pushed a commit to branch master
in repository tika.
commit 4cc0552258fb048eb7ab4b6104c9c0ed5a88e5aa
Author: Markus Koschany <apo at debian.org>
Date: Tue Dec 1 19:16:02 2015 +0100
Ignore some tika-parser classes...
---
debian/patches/ignore-com.drew.imaging.webp.patch | 563 +++++++++++
.../ignore-com.github.junrar.exception.patch | 125 +++
.../ignore-com.healthmarketscience.jackcess.patch | 497 ++++++++++
debian/patches/ignore-com.pff.patch | 218 +++++
debian/patches/ignore-javax.ws.rs.core.patch | 127 +++
debian/patches/ignore-opennlp.tools.namefind.patch | 142 +++
debian/patches/ignore-org.apache.ctakes.patch | 1014 ++++++++++++++++++++
.../ignore-org.apache.poi.hslf.usermodel.patch | 353 +++++++
.../ignore-org.apache.poi.hssf.extractor.patch | 112 +++
debian/patches/ignore-org.json.XML.patch | 908 ++++++++++++++++++
.../ignore-package-org.apache.poi.xwpf.patch | 647 +++++++++++++
debian/patches/ignore-sqlite-jdbc.patch | 125 +++
debian/patches/ignore-ucar.nc2.patch | 137 +++
debian/patches/series | 13 +
14 files changed, 4981 insertions(+)
diff --git a/debian/patches/ignore-com.drew.imaging.webp.patch b/debian/patches/ignore-com.drew.imaging.webp.patch
new file mode 100644
index 0000000..f58b1d1
--- /dev/null
+++ b/debian/patches/ignore-com.drew.imaging.webp.patch
@@ -0,0 +1,563 @@
+From: Markus Koschany <apo at debian.org>
+Date: Tue, 1 Dec 2015 19:10:52 +0100
+Subject: ignore com.drew.imaging.webp
+
+---
+ .../tika/parser/image/ImageMetadataExtractor.java | 548 ---------------------
+ 1 file changed, 548 deletions(-)
+ delete mode 100644 tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
+
+diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
+deleted file mode 100644
+index dd732f4..0000000
+--- a/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
++++ /dev/null
+@@ -1,548 +0,0 @@
+-/*
+- * Licensed to the Apache Software Foundation (ASF) under one or more
+- * contributor license agreements. See the NOTICE file distributed with
+- * this work for additional information regarding copyright ownership.
+- * The ASF licenses this file to You under the Apache License, Version 2.0
+- * (the "License"); you may not use this file except in compliance with
+- * the License. You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-package org.apache.tika.parser.image;
+-
+-import java.io.File;
+-import java.io.IOException;
+-import java.io.InputStream;
+-import java.text.DecimalFormat;
+-import java.text.DecimalFormatSymbols;
+-import java.text.SimpleDateFormat;
+-import java.util.Date;
+-import java.util.Iterator;
+-import java.util.Locale;
+-import java.util.regex.Matcher;
+-import java.util.regex.Pattern;
+-
+-import com.drew.imaging.jpeg.JpegMetadataReader;
+-import com.drew.imaging.jpeg.JpegProcessingException;
+-import com.drew.imaging.riff.RiffProcessingException;
+-import com.drew.imaging.tiff.TiffMetadataReader;
+-import com.drew.imaging.tiff.TiffProcessingException;
+-import com.drew.imaging.webp.WebpMetadataReader;
+-import com.drew.lang.ByteArrayReader;
+-import com.drew.lang.GeoLocation;
+-import com.drew.lang.Rational;
+-import com.drew.metadata.Directory;
+-import com.drew.metadata.MetadataException;
+-import com.drew.metadata.Tag;
+-import com.drew.metadata.exif.ExifIFD0Directory;
+-import com.drew.metadata.exif.ExifReader;
+-import com.drew.metadata.exif.ExifSubIFDDirectory;
+-import com.drew.metadata.exif.ExifThumbnailDirectory;
+-import com.drew.metadata.exif.GpsDirectory;
+-import com.drew.metadata.iptc.IptcDirectory;
+-import com.drew.metadata.jpeg.JpegCommentDirectory;
+-import com.drew.metadata.jpeg.JpegDirectory;
+-import com.drew.metadata.xmp.XmpReader;
+-import org.apache.poi.util.IOUtils;
+-import org.apache.tika.exception.TikaException;
+-import org.apache.tika.metadata.IPTC;
+-import org.apache.tika.metadata.Metadata;
+-import org.apache.tika.metadata.Property;
+-import org.apache.tika.metadata.TikaCoreProperties;
+-import org.xml.sax.SAXException;
+-
+-/**
+- * Uses the <a href="http://www.drewnoakes.com/code/exif/">Metadata Extractor</a> library
+- * to read EXIF and IPTC image metadata and map to Tika fields.
+- * <p/>
+- * As of 2.4.0 the library supports jpeg and tiff.
+- * As of 2.8.0 the library supports webp.
+- */
+-public class ImageMetadataExtractor {
+-
+- private static final String GEO_DECIMAL_FORMAT_STRING = "#.######"; // 6 dp seems to be reasonable
+- private final Metadata metadata;
+- private DirectoryHandler[] handlers;
+-
+- /**
+- * @param metadata to extract to, using default directory handlers
+- */
+- public ImageMetadataExtractor(Metadata metadata) {
+- this(metadata,
+- new CopyUnknownFieldsHandler(),
+- new JpegCommentHandler(),
+- new ExifHandler(),
+- new DimensionsHandler(),
+- new GeotagHandler(),
+- new IptcHandler()
+- );
+- }
+-
+- /**
+- * @param metadata to extract to
+- * @param handlers handlers in order, note that handlers may override values from earlier handlers
+- */
+- public ImageMetadataExtractor(Metadata metadata, DirectoryHandler... handlers) {
+- this.metadata = metadata;
+- this.handlers = handlers;
+- }
+-
+- private static String trimPixels(String s) {
+- //if height/width appears as "100 pixels", trim " pixels"
+- if (s != null) {
+- int i = s.lastIndexOf(" pixels");
+- s = s.substring(0, i);
+- }
+- return s;
+- }
+-
+- public void parseJpeg(File file)
+- throws IOException, SAXException, TikaException {
+- try {
+- com.drew.metadata.Metadata jpegMetadata = JpegMetadataReader.readMetadata(file);
+- handle(jpegMetadata);
+- } catch (JpegProcessingException e) {
+- throw new TikaException("Can't read JPEG metadata", e);
+- } catch (MetadataException e) {
+- throw new TikaException("Can't read JPEG metadata", e);
+- }
+- }
+-
+- public void parseTiff(File file)
+- throws IOException, SAXException, TikaException {
+- try {
+- com.drew.metadata.Metadata tiffMetadata = TiffMetadataReader.readMetadata(file);
+- handle(tiffMetadata);
+- } catch (MetadataException e) {
+- throw new TikaException("Can't read TIFF metadata", e);
+- } catch (TiffProcessingException e) {
+- throw new TikaException("Can't read TIFF metadata", e);
+- }
+- }
+-
+- public void parseWebP(File file) throws IOException, TikaException {
+-
+- try {
+- com.drew.metadata.Metadata webPMetadata = new com.drew.metadata.Metadata();
+- webPMetadata = WebpMetadataReader.readMetadata(file);
+- handle(webPMetadata);
+- } catch (IOException e) {
+- throw e;
+- } catch (RiffProcessingException e) {
+- throw new TikaException("Can't process Riff data", e);
+- } catch (MetadataException e) {
+- throw new TikaException("Can't process Riff data", e);
+- }
+- }
+-
+- public void parseRawExif(InputStream stream, int length, boolean needsExifHeader)
+- throws IOException, SAXException, TikaException {
+- byte[] exif;
+- if (needsExifHeader) {
+- exif = new byte[length + 6];
+- exif[0] = (byte) 'E';
+- exif[1] = (byte) 'x';
+- exif[2] = (byte) 'i';
+- exif[3] = (byte) 'f';
+- IOUtils.readFully(stream, exif, 6, length);
+- } else {
+- exif = new byte[length];
+- IOUtils.readFully(stream, exif, 0, length);
+- }
+- parseRawExif(exif);
+- }
+-
+- public void parseRawExif(byte[] exifData)
+- throws IOException, SAXException, TikaException {
+- com.drew.metadata.Metadata metadata = new com.drew.metadata.Metadata();
+- ExifReader reader = new ExifReader();
+- reader.extract(new ByteArrayReader(exifData), metadata, ExifReader.JPEG_SEGMENT_PREAMBLE.length());
+-
+- try {
+- handle(metadata);
+- } catch (MetadataException e) {
+- throw new TikaException("Can't process the EXIF Data", e);
+- }
+- }
+-
+- public void parseRawXMP(byte[] xmpData)
+- throws IOException, SAXException, TikaException {
+- com.drew.metadata.Metadata metadata = new com.drew.metadata.Metadata();
+- XmpReader reader = new XmpReader();
+- reader.extract(xmpData, metadata);
+-
+- try {
+- handle(metadata);
+- } catch (MetadataException e) {
+- throw new TikaException("Can't process the XMP Data", e);
+- }
+- }
+-
+- /**
+- * Copies extracted tags to tika metadata using registered handlers.
+- *
+- * @param metadataExtractor Tag directories from a Metadata Extractor "reader"
+- * @throws MetadataException This method does not handle exceptions from Metadata Extractor
+- */
+- protected void handle(com.drew.metadata.Metadata metadataExtractor)
+- throws MetadataException {
+- handle(metadataExtractor.getDirectories().iterator());
+- }
+-
+- /**
+- * Copies extracted tags to tika metadata using registered handlers.
+- *
+- * @param directories Metadata Extractor {@link com.drew.metadata.Directory} instances.
+- * @throws MetadataException This method does not handle exceptions from Metadata Extractor
+- */
+- protected void handle(Iterator<Directory> directories) throws MetadataException {
+- while (directories.hasNext()) {
+- Directory directory = directories.next();
+- for (DirectoryHandler handler : handlers) {
+- if (handler.supports(directory.getClass())) {
+- handler.handle(directory, metadata);
+- }
+- }
+- }
+- }
+-
+- /**
+- * Reads one or more type of Metadata Extractor fields.
+- */
+- static interface DirectoryHandler {
+- /**
+- * @param directoryType A Metadata Extractor directory class
+- * @return true if the directory type is supported by this handler
+- */
+- boolean supports(Class<? extends Directory> directoryType);
+-
+- /**
+- * @param directory extracted tags
+- * @param metadata current tika metadata
+- * @throws MetadataException typically field extraction error, aborts all further extraction
+- */
+- void handle(Directory directory, Metadata metadata)
+- throws MetadataException;
+- }
+-
+- /**
+- * Mimics the behavior from TIKA-314 of copying all extracted tags
+- * to tika metadata using field names from Metadata Extractor.
+- */
+- static class CopyAllFieldsHandler implements DirectoryHandler {
+- public boolean supports(Class<? extends Directory> directoryType) {
+- return true;
+- }
+-
+- public void handle(Directory directory, Metadata metadata)
+- throws MetadataException {
+- if (directory.getTags() != null) {
+- for (Tag tag : directory.getTags()) {
+- metadata.set(tag.getTagName(), tag.getDescription());
+- }
+- }
+- }
+- }
+-
+- /**
+- * Copies all fields regardless of directory, if the tag name
+- * is not identical to a known Metadata field name.
+- * This leads to more predictable behavior than {@link CopyAllFieldsHandler}.
+- */
+- static class CopyUnknownFieldsHandler implements DirectoryHandler {
+- public boolean supports(Class<? extends Directory> directoryType) {
+- return true;
+- }
+-
+- public void handle(Directory directory, Metadata metadata)
+- throws MetadataException {
+- if (directory.getTags() != null) {
+- for (Tag tag : directory.getTags()) {
+- String name = tag.getTagName();
+- if (!MetadataFields.isMetadataField(name) && tag.getDescription() != null) {
+- String value = tag.getDescription().trim();
+- if (Boolean.TRUE.toString().equalsIgnoreCase(value)) {
+- value = Boolean.TRUE.toString();
+- } else if (Boolean.FALSE.toString().equalsIgnoreCase(value)) {
+- value = Boolean.FALSE.toString();
+- }
+- metadata.set(name, value);
+- }
+- }
+- }
+- }
+- }
+-
+- /**
+- * Basic image properties for TIFF and JPEG, at least.
+- */
+- static class DimensionsHandler implements DirectoryHandler {
+- private final Pattern LEADING_NUMBERS = Pattern.compile("(\\d+)\\s*.*");
+-
+- public boolean supports(Class<? extends Directory> directoryType) {
+- return directoryType == JpegDirectory.class ||
+- directoryType == ExifSubIFDDirectory.class ||
+- directoryType == ExifThumbnailDirectory.class ||
+- directoryType == ExifIFD0Directory.class;
+- }
+-
+- public void handle(Directory directory, Metadata metadata) throws MetadataException {
+- // The test TIFF has width and height stored as follows according to exiv2
+- //Exif.Image.ImageWidth Short 1 100
+- //Exif.Image.ImageLength Short 1 75
+- // and the values are found in "Thumbnail Image Width" (and Height) from Metadata Extractor
+- set(directory, metadata, JpegDirectory.TAG_IMAGE_WIDTH, Metadata.IMAGE_WIDTH);
+- set(directory, metadata, JpegDirectory.TAG_IMAGE_HEIGHT, Metadata.IMAGE_LENGTH);
+- // Bits per sample, two methods of extracting, exif overrides jpeg
+- set(directory, metadata, JpegDirectory.TAG_DATA_PRECISION, Metadata.BITS_PER_SAMPLE);
+- set(directory, metadata, ExifSubIFDDirectory.TAG_BITS_PER_SAMPLE, Metadata.BITS_PER_SAMPLE);
+- // Straightforward
+- set(directory, metadata, ExifSubIFDDirectory.TAG_SAMPLES_PER_PIXEL, Metadata.SAMPLES_PER_PIXEL);
+- }
+-
+- private void set(Directory directory, Metadata metadata, int extractTag, Property metadataField) {
+- if (directory.containsTag(extractTag)) {
+- Matcher m = LEADING_NUMBERS.matcher(directory.getString(extractTag));
+- if (m.matches()) {
+- metadata.set(metadataField, m.group(1));
+- }
+- }
+- }
+- }
+-
+- static class JpegCommentHandler implements DirectoryHandler {
+- public boolean supports(Class<? extends Directory> directoryType) {
+- return directoryType == JpegCommentDirectory.class;
+- }
+-
+- public void handle(Directory directory, Metadata metadata) throws MetadataException {
+- if (directory.containsTag(JpegCommentDirectory.TAG_COMMENT)) {
+- metadata.add(TikaCoreProperties.COMMENTS, directory.getString(JpegCommentDirectory.TAG_COMMENT));
+- }
+- }
+- }
+-
+- static class ExifHandler implements DirectoryHandler {
+- // There's a new ExifHandler for each file processed, so this is thread safe
+- private static final ThreadLocal<SimpleDateFormat> DATE_UNSPECIFIED_TZ = new ThreadLocal<SimpleDateFormat>() {
+- @Override
+- protected SimpleDateFormat initialValue() {
+- return new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.US);
+- }
+- };
+-
+- public boolean supports(Class<? extends Directory> directoryType) {
+- return directoryType == ExifIFD0Directory.class ||
+- directoryType == ExifSubIFDDirectory.class;
+- }
+-
+- public void handle(Directory directory, Metadata metadata) {
+- try {
+- handleDateTags(directory, metadata);
+- handlePhotoTags(directory, metadata);
+- handleCommentTags(directory, metadata);
+- } catch (MetadataException e) {
+- // ignore date parse errors and proceed with other tags
+- }
+- }
+-
+- /**
+- * EXIF may contain image description, although with undefined encoding.
+- * Use IPTC for other annotation fields, and XMP for unicode support.
+- */
+- public void handleCommentTags(Directory directory, Metadata metadata) {
+- if (metadata.get(TikaCoreProperties.DESCRIPTION) == null &&
+- directory.containsTag(ExifIFD0Directory.TAG_IMAGE_DESCRIPTION)) {
+- metadata.set(TikaCoreProperties.DESCRIPTION,
+- directory.getString(ExifIFD0Directory.TAG_IMAGE_DESCRIPTION));
+- }
+- }
+-
+- /**
+- * Maps common TIFF and EXIF tags onto the Tika
+- * TIFF image metadata namespace.
+- */
+- public void handlePhotoTags(Directory directory, Metadata metadata) {
+- if (directory.containsTag(ExifSubIFDDirectory.TAG_EXPOSURE_TIME)) {
+- Object exposure = directory.getObject(ExifSubIFDDirectory.TAG_EXPOSURE_TIME);
+- if (exposure instanceof Rational) {
+- metadata.set(Metadata.EXPOSURE_TIME, ((Rational) exposure).doubleValue());
+- } else {
+- metadata.set(Metadata.EXPOSURE_TIME, directory.getString(ExifSubIFDDirectory.TAG_EXPOSURE_TIME));
+- }
+- }
+-
+- if (directory.containsTag(ExifSubIFDDirectory.TAG_FLASH)) {
+- String flash = directory.getDescription(ExifSubIFDDirectory.TAG_FLASH);
+- if (flash.contains("Flash fired")) {
+- metadata.set(Metadata.FLASH_FIRED, Boolean.TRUE.toString());
+- } else if (flash.contains("Flash did not fire")) {
+- metadata.set(Metadata.FLASH_FIRED, Boolean.FALSE.toString());
+- } else {
+- metadata.set(Metadata.FLASH_FIRED, flash);
+- }
+- }
+-
+- if (directory.containsTag(ExifSubIFDDirectory.TAG_FNUMBER)) {
+- Object fnumber = directory.getObject(ExifSubIFDDirectory.TAG_FNUMBER);
+- if (fnumber instanceof Rational) {
+- metadata.set(Metadata.F_NUMBER, ((Rational) fnumber).doubleValue());
+- } else {
+- metadata.set(Metadata.F_NUMBER, directory.getString(ExifSubIFDDirectory.TAG_FNUMBER));
+- }
+- }
+-
+- if (directory.containsTag(ExifSubIFDDirectory.TAG_FOCAL_LENGTH)) {
+- Object length = directory.getObject(ExifSubIFDDirectory.TAG_FOCAL_LENGTH);
+- if (length instanceof Rational) {
+- metadata.set(Metadata.FOCAL_LENGTH, ((Rational) length).doubleValue());
+- } else {
+- metadata.set(Metadata.FOCAL_LENGTH, directory.getString(ExifSubIFDDirectory.TAG_FOCAL_LENGTH));
+- }
+- }
+-
+- if (directory.containsTag(ExifSubIFDDirectory.TAG_ISO_EQUIVALENT)) {
+- metadata.set(Metadata.ISO_SPEED_RATINGS, directory.getString(ExifSubIFDDirectory.TAG_ISO_EQUIVALENT));
+- }
+-
+- if (directory.containsTag(ExifIFD0Directory.TAG_MAKE)) {
+- metadata.set(Metadata.EQUIPMENT_MAKE, directory.getString(ExifIFD0Directory.TAG_MAKE));
+- }
+- if (directory.containsTag(ExifIFD0Directory.TAG_MODEL)) {
+- metadata.set(Metadata.EQUIPMENT_MODEL, directory.getString(ExifIFD0Directory.TAG_MODEL));
+- }
+-
+- if (directory.containsTag(ExifIFD0Directory.TAG_ORIENTATION)) {
+- Object length = directory.getObject(ExifIFD0Directory.TAG_ORIENTATION);
+- if (length instanceof Integer) {
+- metadata.set(Metadata.ORIENTATION, Integer.toString((Integer) length));
+- } else {
+- metadata.set(Metadata.ORIENTATION, directory.getString(ExifIFD0Directory.TAG_ORIENTATION));
+- }
+- }
+-
+- if (directory.containsTag(ExifIFD0Directory.TAG_SOFTWARE)) {
+- metadata.set(Metadata.SOFTWARE, directory.getString(ExifIFD0Directory.TAG_SOFTWARE));
+- }
+-
+- if (directory.containsTag(ExifIFD0Directory.TAG_X_RESOLUTION)) {
+- Object resolution = directory.getObject(ExifIFD0Directory.TAG_X_RESOLUTION);
+- if (resolution instanceof Rational) {
+- metadata.set(Metadata.RESOLUTION_HORIZONTAL, ((Rational) resolution).doubleValue());
+- } else {
+- metadata.set(Metadata.RESOLUTION_HORIZONTAL, directory.getString(ExifIFD0Directory.TAG_X_RESOLUTION));
+- }
+- }
+- if (directory.containsTag(ExifIFD0Directory.TAG_Y_RESOLUTION)) {
+- Object resolution = directory.getObject(ExifIFD0Directory.TAG_Y_RESOLUTION);
+- if (resolution instanceof Rational) {
+- metadata.set(Metadata.RESOLUTION_VERTICAL, ((Rational) resolution).doubleValue());
+- } else {
+- metadata.set(Metadata.RESOLUTION_VERTICAL, directory.getString(ExifIFD0Directory.TAG_Y_RESOLUTION));
+- }
+- }
+- if (directory.containsTag(ExifIFD0Directory.TAG_RESOLUTION_UNIT)) {
+- metadata.set(Metadata.RESOLUTION_UNIT, directory.getDescription(ExifIFD0Directory.TAG_RESOLUTION_UNIT));
+- }
+- if (directory.containsTag(ExifThumbnailDirectory.TAG_IMAGE_WIDTH)) {
+- metadata.set(Metadata.IMAGE_WIDTH,
+- trimPixels(directory.getDescription(ExifThumbnailDirectory.TAG_IMAGE_WIDTH)));
+- }
+- if (directory.containsTag(ExifThumbnailDirectory.TAG_IMAGE_HEIGHT)) {
+- metadata.set(Metadata.IMAGE_LENGTH,
+- trimPixels(directory.getDescription(ExifThumbnailDirectory.TAG_IMAGE_HEIGHT)));
+- }
+- }
+-
+- /**
+- * Maps exif dates to metadata fields.
+- */
+- public void handleDateTags(Directory directory, Metadata metadata)
+- throws MetadataException {
+- // Date/Time Original overrides value from ExifDirectory.TAG_DATETIME
+- Date original = null;
+- if (directory.containsTag(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL)) {
+- original = directory.getDate(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL);
+- // Unless we have GPS time we don't know the time zone so date must be set
+- // as ISO 8601 datetime without timezone suffix (no Z or +/-)
+- if (original != null) {
+- String datetimeNoTimeZone = DATE_UNSPECIFIED_TZ.get().format(original); // Same time zone as Metadata Extractor uses
+- metadata.set(TikaCoreProperties.CREATED, datetimeNoTimeZone);
+- metadata.set(Metadata.ORIGINAL_DATE, datetimeNoTimeZone);
+- }
+- }
+- if (directory.containsTag(ExifIFD0Directory.TAG_DATETIME)) {
+- Date datetime = directory.getDate(ExifIFD0Directory.TAG_DATETIME);
+- if (datetime != null) {
+- String datetimeNoTimeZone = DATE_UNSPECIFIED_TZ.get().format(datetime);
+- metadata.set(TikaCoreProperties.MODIFIED, datetimeNoTimeZone);
+- // If Date/Time Original does not exist this might be creation date
+- if (metadata.get(TikaCoreProperties.CREATED) == null) {
+- metadata.set(TikaCoreProperties.CREATED, datetimeNoTimeZone);
+- }
+- }
+- }
+- }
+- }
+-
+- /**
+- * Reads image comments, originally TIKA-472.
+- * Metadata Extractor does not read XMP so we need to use the values from Iptc or EXIF
+- */
+- static class IptcHandler implements DirectoryHandler {
+- public boolean supports(Class<? extends Directory> directoryType) {
+- return directoryType == IptcDirectory.class;
+- }
+-
+- public void handle(Directory directory, Metadata metadata)
+- throws MetadataException {
+- if (directory.containsTag(IptcDirectory.TAG_KEYWORDS)) {
+- String[] keywords = directory.getStringArray(IptcDirectory.TAG_KEYWORDS);
+- for (String k : keywords) {
+- metadata.add(TikaCoreProperties.KEYWORDS, k);
+- }
+- }
+- if (directory.containsTag(IptcDirectory.TAG_HEADLINE)) {
+- metadata.set(TikaCoreProperties.TITLE, directory.getString(IptcDirectory.TAG_HEADLINE));
+- } else if (directory.containsTag(IptcDirectory.TAG_OBJECT_NAME)) {
+- metadata.set(TikaCoreProperties.TITLE, directory.getString(IptcDirectory.TAG_OBJECT_NAME));
+- }
+- if (directory.containsTag(IptcDirectory.TAG_BY_LINE)) {
+- metadata.set(TikaCoreProperties.CREATOR, directory.getString(IptcDirectory.TAG_BY_LINE));
+- metadata.set(IPTC.CREATOR, directory.getString(IptcDirectory.TAG_BY_LINE));
+- }
+- if (directory.containsTag(IptcDirectory.TAG_CAPTION)) {
+- metadata.set(TikaCoreProperties.DESCRIPTION,
+- // Looks like metadata extractor returns IPTC newlines as a single carriage return,
+- // but the exiv2 command does not so we change to line feed here because that is less surprising to users
+- directory.getString(IptcDirectory.TAG_CAPTION).replaceAll("\r\n?", "\n"));
+- }
+- }
+- }
+-
+- /**
+- * Maps EXIF Geo Tags onto the Tika Geo metadata namespace.
+- */
+- static class GeotagHandler implements DirectoryHandler {
+- public boolean supports(Class<? extends Directory> directoryType) {
+- return directoryType == GpsDirectory.class;
+- }
+-
+- public void handle(Directory directory, Metadata metadata) throws MetadataException {
+- GeoLocation geoLocation = ((GpsDirectory) directory).getGeoLocation();
+- if (geoLocation != null) {
+- DecimalFormat geoDecimalFormat = new DecimalFormat(GEO_DECIMAL_FORMAT_STRING,
+- new DecimalFormatSymbols(Locale.ENGLISH));
+- metadata.set(TikaCoreProperties.LATITUDE, geoDecimalFormat.format(geoLocation.getLatitude()));
+- metadata.set(TikaCoreProperties.LONGITUDE, geoDecimalFormat.format(geoLocation.getLongitude()));
+- }
+- }
+- }
+-
+-}
diff --git a/debian/patches/ignore-com.github.junrar.exception.patch b/debian/patches/ignore-com.github.junrar.exception.patch
new file mode 100644
index 0000000..8cf7cd3
--- /dev/null
+++ b/debian/patches/ignore-com.github.junrar.exception.patch
@@ -0,0 +1,125 @@
+From: Markus Koschany <apo at debian.org>
+Date: Tue, 1 Dec 2015 19:13:43 +0100
+Subject: ignore com.github.junrar.exception
+
+---
+ .../java/org/apache/tika/parser/pkg/RarParser.java | 110 ---------------------
+ 1 file changed, 110 deletions(-)
+ delete mode 100644 tika-parsers/src/main/java/org/apache/tika/parser/pkg/RarParser.java
+
+diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/RarParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/RarParser.java
+deleted file mode 100644
+index 99508b0..0000000
+--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/RarParser.java
++++ /dev/null
+@@ -1,110 +0,0 @@
+-/*
+- * Licensed to the Apache Software Foundation (ASF) under one or more
+- * contributor license agreements. See the NOTICE file distributed with
+- * this work for additional information regarding copyright ownership.
+- * The ASF licenses this file to You under the Apache License, Version 2.0
+- * (the "License"); you may not use this file except in compliance with
+- * the License. You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-package org.apache.tika.parser.pkg;
+-
+-import java.io.IOException;
+-import java.io.InputStream;
+-import java.util.Collections;
+-import java.util.Set;
+-
+-import org.apache.tika.exception.EncryptedDocumentException;
+-import org.apache.tika.exception.TikaException;
+-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+-import org.apache.tika.io.TemporaryResources;
+-import org.apache.tika.io.TikaInputStream;
+-import org.apache.tika.metadata.Metadata;
+-import org.apache.tika.mime.MediaType;
+-import org.apache.tika.parser.AbstractParser;
+-import org.apache.tika.parser.ParseContext;
+-import org.apache.tika.sax.XHTMLContentHandler;
+-import org.xml.sax.ContentHandler;
+-import org.xml.sax.SAXException;
+-
+-import com.github.junrar.Archive;
+-import com.github.junrar.exception.RarException;
+-import com.github.junrar.rarfile.FileHeader;
+-
+-/**
+- * Parser for Rar files.
+- */
+-public class RarParser extends AbstractParser {
+- private static final long serialVersionUID = 6157727985054451501L;
+-
+- private static final Set<MediaType> SUPPORTED_TYPES = Collections
+- .singleton(MediaType.application("x-rar-compressed"));
+-
+- @Override
+- public Set<MediaType> getSupportedTypes(ParseContext arg0) {
+- return SUPPORTED_TYPES;
+- }
+-
+- @Override
+- public void parse(InputStream stream, ContentHandler handler,
+- Metadata metadata, ParseContext context) throws IOException,
+- SAXException, TikaException {
+-
+- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+- xhtml.startDocument();
+-
+- EmbeddedDocumentExtractor extractor = context.get(
+- EmbeddedDocumentExtractor.class,
+- new ParsingEmbeddedDocumentExtractor(context));
+-
+- Archive rar = null;
+- try (TemporaryResources tmp = new TemporaryResources()) {
+- TikaInputStream tis = TikaInputStream.get(stream, tmp);
+- rar = new Archive(tis.getFile());
+-
+- if (rar.isEncrypted()) {
+- throw new EncryptedDocumentException();
+- }
+-
+- //Without this BodyContentHandler does not work
+- xhtml.element("div", " ");
+-
+- FileHeader header = rar.nextFileHeader();
+- while (header != null && !Thread.currentThread().isInterrupted()) {
+- if (!header.isDirectory()) {
+- try (InputStream subFile = rar.getInputStream(header)) {
+- Metadata entrydata = PackageParser.handleEntryMetadata(
+- "".equals(header.getFileNameW()) ? header.getFileNameString() : header.getFileNameW(),
+- header.getCTime(), header.getMTime(),
+- header.getFullUnpackSize(),
+- xhtml
+- );
+-
+- if (extractor.shouldParseEmbedded(entrydata)) {
+- extractor.parseEmbedded(subFile, handler, entrydata, true);
+- }
+- }
+- }
+-
+- header = rar.nextFileHeader();
+- }
+-
+- } catch (RarException e) {
+- throw new TikaException("RarParser Exception", e);
+- } finally {
+- if (rar != null)
+- rar.close();
+-
+- }
+-
+- xhtml.endDocument();
+- }
+-}
diff --git a/debian/patches/ignore-com.healthmarketscience.jackcess.patch b/debian/patches/ignore-com.healthmarketscience.jackcess.patch
new file mode 100644
index 0000000..f3d771b
--- /dev/null
+++ b/debian/patches/ignore-com.healthmarketscience.jackcess.patch
@@ -0,0 +1,497 @@
+From: Markus Koschany <apo at debian.org>
+Date: Tue, 1 Dec 2015 19:14:07 +0100
+Subject: ignore com.healthmarketscience.jackcess
+
+---
+ .../tika/parser/microsoft/JackcessExtractor.java | 345 ---------------------
+ .../tika/parser/microsoft/JackcessParser.java | 129 --------
+ 2 files changed, 474 deletions(-)
+ delete mode 100644 tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
+ delete mode 100644 tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessParser.java
+
+diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
+deleted file mode 100644
+index e224d54..0000000
+--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
++++ /dev/null
+@@ -1,345 +0,0 @@
+-/*
+- * Licensed to the Apache Software Foundation (ASF) under one or more
+- * contributor license agreements. See the NOTICE file distributed with
+- * this work for additional information regarding copyright ownership.
+- * The ASF licenses this file to You under the Apache License, Version 2.0
+- * (the "License"); you may not use this file except in compliance with
+- * the License. You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-package org.apache.tika.parser.microsoft;
+-
+-
+-import static java.nio.charset.StandardCharsets.UTF_8;
+-
+-import java.io.ByteArrayInputStream;
+-import java.io.IOException;
+-import java.math.BigDecimal;
+-import java.text.DateFormat;
+-import java.text.NumberFormat;
+-import java.util.Date;
+-import java.util.HashSet;
+-import java.util.Iterator;
+-import java.util.List;
+-import java.util.Locale;
+-import java.util.Set;
+-
+-import com.healthmarketscience.jackcess.Column;
+-import com.healthmarketscience.jackcess.DataType;
+-import com.healthmarketscience.jackcess.Database;
+-import com.healthmarketscience.jackcess.PropertyMap;
+-import com.healthmarketscience.jackcess.Row;
+-import com.healthmarketscience.jackcess.Table;
+-import com.healthmarketscience.jackcess.query.Query;
+-import com.healthmarketscience.jackcess.util.OleBlob;
+-import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+-import org.apache.tika.exception.TikaException;
+-import org.apache.tika.io.TikaInputStream;
+-import org.apache.tika.metadata.Metadata;
+-import org.apache.tika.metadata.OfficeOpenXMLExtended;
+-import org.apache.tika.metadata.TikaCoreProperties;
+-import org.apache.tika.parser.ParseContext;
+-import org.apache.tika.parser.html.HtmlParser;
+-import org.apache.tika.sax.BodyContentHandler;
+-import org.apache.tika.sax.XHTMLContentHandler;
+-import org.xml.sax.SAXException;
+-
+-/**
+- * Internal class. Needs to be instantiated for each parse because of
+- * the lack of thread safety with the dateTimeFormatter
+- */
+-class JackcessExtractor extends AbstractPOIFSExtractor {
+-
+- final static String TITLE_PROP_KEY = "Title";
+- final static String AUTHOR_PROP_KEY = "Author";
+- final static String COMPANY_PROP_KEY = "Company";
+-
+- final static String TEXT_FORMAT_KEY = "TextFormat";
+- final static String CURRENCY_FORMAT_KEY = "Format";
+- final static byte TEXT_FORMAT = 0;
+- final static byte RICH_TEXT_FORMAT = 1;
+- final static ParseContext EMPTY_PARSE_CONTEXT = new ParseContext();
+-
+- final NumberFormat currencyFormatter;
+- final DateFormat shortDateTimeFormatter;
+-
+- final HtmlParser htmlParser = new HtmlParser();
+-
+- protected JackcessExtractor(ParseContext context, Locale locale) {
+- super(context);
+- currencyFormatter = NumberFormat.getCurrencyInstance(locale);
+- shortDateTimeFormatter = DateFormat.getDateInstance(DateFormat.SHORT, locale);
+- }
+-
+- public void parse(Database db, XHTMLContentHandler xhtml, Metadata metadata) throws IOException, SAXException, TikaException {
+-
+-
+- String pw = db.getDatabasePassword();
+- if (pw != null) {
+- metadata.set(JackcessParser.MDB_PW, pw);
+- }
+-
+- PropertyMap dbp = db.getDatabaseProperties();
+- for (PropertyMap.Property p : dbp) {
+- metadata.add(JackcessParser.MDB_PROPERTY_PREFIX + p.getName(),
+- toString(p.getValue(), p.getType()));
+- }
+-
+- PropertyMap up = db.getUserDefinedProperties();
+- for (PropertyMap.Property p : up) {
+- metadata.add(JackcessParser.USER_DEFINED_PROPERTY_PREFIX+ p.getName(),
+- toString(p.getValue(), p.getType()));
+- }
+-
+- Set<String> found = new HashSet<>();
+- PropertyMap summaryProperties = db.getSummaryProperties();
+- if (summaryProperties != null) {
+- //try to get core properties
+- PropertyMap.Property title = summaryProperties.get(TITLE_PROP_KEY);
+- if (title != null) {
+- metadata.set(TikaCoreProperties.TITLE, toString(title.getValue(), title.getType()));
+- found.add(title.getName());
+- }
+- PropertyMap.Property author = summaryProperties.get(AUTHOR_PROP_KEY);
+- if (author != null && author.getValue() != null) {
+- String authorString = toString(author.getValue(), author.getType());
+- SummaryExtractor.addMulti(metadata, TikaCoreProperties.CREATOR, authorString);
+- found.add(author.getName());
+- }
+- PropertyMap.Property company = summaryProperties.get(COMPANY_PROP_KEY);
+- if (company != null) {
+- metadata.set(OfficeOpenXMLExtended.COMPANY, toString(company.getValue(), company.getType()));
+- found.add(company.getName());
+- }
+-
+- for (PropertyMap.Property p : db.getSummaryProperties()) {
+- if (! found.contains(p.getName())) {
+- metadata.add(JackcessParser.SUMMARY_PROPERTY_PREFIX + p.getName(),
+- toString(p.getValue(), p.getType()));
+- }
+- }
+-
+- }
+-
+- Iterator<Table> it = db.newIterable().
+- setIncludeLinkedTables(false).
+- setIncludeSystemTables(false).iterator();
+-
+- while (it.hasNext()) {
+- Table table = it.next();
+- String tableName = table.getName();
+- List<? extends Column> columns = table.getColumns();
+- xhtml.startElement("table", "name", tableName);
+- addHeaders(columns, xhtml);
+- xhtml.startElement("tbody");
+-
+- Row r = table.getNextRow();
+-
+- while (r != null) {
+- xhtml.startElement("tr");
+- for (Column c : columns) {
+- handleCell(r, c, xhtml);
+- }
+- xhtml.endElement("tr");
+- r = table.getNextRow();
+- }
+- xhtml.endElement("tbody");
+- xhtml.endElement("table");
+- }
+-
+- for (Query q : db.getQueries()) {
+- xhtml.startElement("div", "type", "sqlQuery");
+- xhtml.characters(q.toSQLString());
+- xhtml.endElement("div");
+- }
+- }
+-
+- private void addHeaders(List<? extends Column> columns, XHTMLContentHandler xhtml) throws SAXException {
+- xhtml.startElement("thead");
+- xhtml.startElement("tr");
+- for (Column c : columns) {
+- xhtml.startElement("th");
+- xhtml.characters(c.getName());
+- xhtml.endElement("th");
+- }
+- xhtml.endElement("tr");
+- xhtml.endElement("thead");
+-
+- }
+-
+- private void handleCell(Row r, Column c, XHTMLContentHandler handler)
+- throws SAXException, IOException, TikaException {
+-
+- handler.startElement("td");
+- if (c.getType().equals(DataType.OLE)) {
+- handleOLE(r, c.getName(), handler);
+- } else if (c.getType().equals(DataType.BINARY)) {
+- Object obj = r.get(c.getName());
+- if (obj != null) {
+- byte[] bytes = (byte[])obj;
+- handleEmbeddedResource(
+- TikaInputStream.get(bytes),
+- null,//filename
+- null,//relationshipId
+- null,//mediatype
+- handler, false);
+- }
+- } else {
+- Object obj = r.get(c.getName());
+- String v = toString(obj, c.getType());
+- if (isRichText(c)) {
+- BodyContentHandler h = new BodyContentHandler();
+- Metadata m = new Metadata();
+- m.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8");
+- try {
+- htmlParser.parse(new ByteArrayInputStream(v.getBytes(UTF_8)),
+- h,
+- m, EMPTY_PARSE_CONTEXT);
+- handler.characters(h.toString());
+- } catch (SAXException e) {
+- //if something went wrong in htmlparser, just append the characters
+- handler.characters(v);
+- }
+- } else {
+- handler.characters(v);
+- }
+- }
+- handler.endElement("td");
+- }
+-
+- private boolean isRichText(Column c) throws IOException {
+-
+- if (c == null) {
+- return false;
+- }
+-
+- PropertyMap m = c.getProperties();
+- if (m == null) {
+- return false;
+- }
+- if (c.getType() == null || ! c.getType().equals(DataType.MEMO)) {
+- return false;
+- }
+- Object b = m.getValue(TEXT_FORMAT_KEY);
+- if (b instanceof Byte) {
+- if (((Byte)b).byteValue() == RICH_TEXT_FORMAT) {
+- return true;
+- }
+- }
+- return false;
+- }
+-
+- private String toString(Object value, DataType type) {
+- if (value == null) {
+- return "";
+- }
+- if (type == null) {
+- //this shouldn't happen
+- return value.toString();
+- }
+- switch (type) {
+- case LONG:
+- return Integer.toString((Integer)value);
+- case TEXT:
+- return (String)value;
+- case MONEY:
+- //TODO: consider getting parsing "Format" field from
+- //field properties.
+- return formatCurrency(((BigDecimal)value).doubleValue(), type);
+- case SHORT_DATE_TIME:
+- return formatShortDateTime((Date)value);
+- case BOOLEAN:
+- return Boolean.toString((Boolean) value);
+- case MEMO:
+- return (String)value;
+- case INT:
+- return Short.toString((Short)value);
+- case DOUBLE:
+- return Double.toString((Double)value);
+- case FLOAT:
+- return Float.toString((Float)value);
+- case NUMERIC:
+- return value.toString();
+- case BYTE:
+- return Byte.toString((Byte)value);
+- case GUID:
+- return value.toString();
+- case COMPLEX_TYPE: //skip all these
+- case UNKNOWN_0D:
+- case UNKNOWN_11:
+- case UNSUPPORTED_FIXEDLEN:
+- case UNSUPPORTED_VARLEN:
+- default:
+- return "";
+-
+- }
+- }
+-
+- private void handleOLE(Row row, String cName, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
+- OleBlob blob = row.getBlob(cName);
+- //lifted shamelessly from Jackcess's OleBlobTest
+- if (blob == null)
+- return;
+-
+- OleBlob.Content content = blob.getContent();
+- if (content == null)
+- return;
+-
+- switch (content.getType()) {
+- case LINK:
+- xhtml.characters(((OleBlob.LinkContent) content).getLinkPath());
+- break;
+- case SIMPLE_PACKAGE:
+- OleBlob.SimplePackageContent spc = (OleBlob.SimplePackageContent) content;
+-
+- handleEmbeddedResource(
+- TikaInputStream.get(spc.getStream()),
+- spc.getFileName(),//filename
+- null,//relationshipId
+- spc.getTypeName(),//mediatype
+- xhtml, false);
+- break;
+- case OTHER:
+- OleBlob.OtherContent oc = (OleBlob.OtherContent) content;
+- handleEmbeddedResource(
+- TikaInputStream.get(oc.getStream()),
+- null,//filename
+- null,//relationshipId
+- oc.getTypeName(),//mediatype
+- xhtml, false);
+- break;
+- case COMPOUND_STORAGE:
+- OleBlob.CompoundContent cc = (OleBlob.CompoundContent) content;
+- handleCompoundContent(cc, xhtml);
+- break;
+- }
+- }
+-
+- private void handleCompoundContent(OleBlob.CompoundContent cc, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
+- NPOIFSFileSystem nfs = new NPOIFSFileSystem(cc.getStream());
+- handleEmbeddedOfficeDoc(nfs.getRoot(), xhtml);
+- }
+-
+- String formatCurrency(Double d, DataType type) {
+- if (d == null) {
+- return "";
+- }
+- return currencyFormatter.format(d);
+- }
+-
+- String formatShortDateTime(Date d) {
+- if (d == null) {
+- return "";
+- }
+- return shortDateTimeFormatter.format(d);
+- }
+-}
+-
+diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessParser.java
+deleted file mode 100644
+index 9704fbb..0000000
+--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessParser.java
++++ /dev/null
+@@ -1,129 +0,0 @@
+-/*
+- * Licensed to the Apache Software Foundation (ASF) under one or more
+- * contributor license agreements. See the NOTICE file distributed with
+- * this work for additional information regarding copyright ownership.
+- * The ASF licenses this file to You under the Apache License, Version 2.0
+- * (the "License"); you may not use this file except in compliance with
+- * the License. You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-package org.apache.tika.parser.microsoft;
+-
+-
+-import java.io.IOException;
+-import java.io.InputStream;
+-import java.util.Collections;
+-import java.util.Locale;
+-import java.util.Set;
+-
+-import com.healthmarketscience.jackcess.CryptCodecProvider;
+-import com.healthmarketscience.jackcess.Database;
+-import com.healthmarketscience.jackcess.DatabaseBuilder;
+-import com.healthmarketscience.jackcess.util.LinkResolver;
+-import org.apache.tika.exception.EncryptedDocumentException;
+-import org.apache.tika.exception.TikaException;
+-import org.apache.tika.io.TikaInputStream;
+-import org.apache.tika.metadata.Metadata;
+-import org.apache.tika.metadata.Property;
+-import org.apache.tika.mime.MediaType;
+-import org.apache.tika.parser.AbstractParser;
+-import org.apache.tika.parser.ParseContext;
+-import org.apache.tika.parser.PasswordProvider;
+-import org.apache.tika.sax.XHTMLContentHandler;
+-import org.xml.sax.ContentHandler;
+-import org.xml.sax.SAXException;
+-
+-/**
+- * Parser that handles Microsoft Access files via
+- * <a href="http://jackcess.sourceforge.net/>Jackcess</a>
+- * <p>
+- * Many, many thanks to LexisNexis®/Health Market Science (HMS), Brian O'Neill,
+- * and James Ahlborn for relicensing Jackcess to Apache v2.0!
+- */
+-public class JackcessParser extends AbstractParser {
+-
+- public static final String SUMMARY_PROPERTY_PREFIX = "MDB_SUMMARY_PROP" + Metadata.NAMESPACE_PREFIX_DELIMITER;
+- public static String MDB_PROPERTY_PREFIX = "MDB_PROP" + Metadata.NAMESPACE_PREFIX_DELIMITER;
+- public static String USER_DEFINED_PROPERTY_PREFIX = "MDB_USER_PROP" + Metadata.NAMESPACE_PREFIX_DELIMITER;
+- public static Property MDB_PW = Property.externalText("Password");
+- private final static LinkResolver IGNORE_LINK_RESOLVER = new IgnoreLinkResolver();
+-
+- //TODO: figure out how to get this info
+- // public static Property LINKED_DATABASES = Property.externalTextBag("LinkedDatabases");
+-
+- private static final long serialVersionUID = -752276948656079347L;
+-
+- private static final MediaType MEDIA_TYPE = MediaType.application("x-msaccess");
+-
+- private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MEDIA_TYPE);
+-
+- private Locale locale = Locale.ROOT;
+-
+- @Override
+- public Set<MediaType> getSupportedTypes(ParseContext context) {
+- return SUPPORTED_TYPES;
+- }
+-
+- @Override
+- public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+- ParseContext context) throws IOException, SAXException, TikaException {
+- TikaInputStream tis = TikaInputStream.get(stream);
+- Database db = null;
+- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+- xhtml.startDocument();
+-
+- String password = null;
+- PasswordProvider passwordProvider = context.get(PasswordProvider.class);
+- if (passwordProvider != null) {
+- password = passwordProvider.getPassword(metadata);
+- }
+- try {
+- if (password == null) {
+- //do this to ensure encryption/wrong password exception vs. more generic
+- //"need right codec" error message.
+- db = new DatabaseBuilder(tis.getFile())
+- .setCodecProvider(new CryptCodecProvider())
+- .setReadOnly(true).open();
+- } else {
+- db = new DatabaseBuilder(tis.getFile())
+- .setCodecProvider(new CryptCodecProvider(password))
+- .setReadOnly(true).open();
+- }
+- db.setLinkResolver(IGNORE_LINK_RESOLVER);//just in case
+- JackcessExtractor ex = new JackcessExtractor(context, locale);
+- ex.parse(db, xhtml, metadata);
+- } catch (IllegalStateException e) {
+- if (e.getMessage() != null && e.getMessage().contains("Incorrect password")) {
+- throw new EncryptedDocumentException(e);
+- }
+- throw e;
+- } finally {
+- if (db != null) {
+- try {
+- db.close();
+- } catch (IOException e) {
+- //swallow = silent close
+- }
+- }
+- }
+- xhtml.endDocument();
+- }
+-
+- private static final class IgnoreLinkResolver implements LinkResolver {
+- //If links are resolved, Jackcess might try to open and process
+- //any file on the current system that is specified as a linked db.
+- //This could be a nasty security issue.
+- @Override
+- public Database resolveLinkedDatabase(Database database, String s) throws IOException {
+- throw new AssertionError("DO NOT ALLOW RESOLVING OF LINKS!!!");
+- }
+- }
+-}
diff --git a/debian/patches/ignore-com.pff.patch b/debian/patches/ignore-com.pff.patch
new file mode 100644
index 0000000..77d7a95
--- /dev/null
+++ b/debian/patches/ignore-com.pff.patch
@@ -0,0 +1,218 @@
+From: Markus Koschany <apo at debian.org>
+Date: Tue, 1 Dec 2015 19:08:44 +0100
+Subject: ignore com.pff
+
+---
+ .../apache/tika/parser/mbox/OutlookPSTParser.java | 203 ---------------------
+ 1 file changed, 203 deletions(-)
+ delete mode 100644 tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
+
+diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
+deleted file mode 100644
+index 5883bd5..0000000
+--- a/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
++++ /dev/null
+@@ -1,203 +0,0 @@
+-/*
+- * Licensed to the Apache Software Foundation (ASF) under one or more
+- * contributor license agreements. See the NOTICE file distributed with
+- * this work for additional information regarding copyright ownership.
+- * The ASF licenses this file to You under the Apache License, Version 2.0
+- * (the "License"); you may not use this file except in compliance with
+- * the License. You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-package org.apache.tika.parser.mbox;
+-
+-import static java.lang.String.valueOf;
+-import static java.nio.charset.StandardCharsets.UTF_8;
+-import static java.util.Collections.singleton;
+-
+-import java.io.ByteArrayInputStream;
+-import java.io.File;
+-import java.io.IOException;
+-import java.io.InputStream;
+-import java.util.Set;
+-
+-import com.pff.PSTAttachment;
+-import com.pff.PSTFile;
+-import com.pff.PSTFolder;
+-import com.pff.PSTMessage;
+-import org.apache.tika.exception.TikaException;
+-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+-import org.apache.tika.io.TemporaryResources;
+-import org.apache.tika.io.TikaInputStream;
+-import org.apache.tika.metadata.Metadata;
+-import org.apache.tika.metadata.TikaCoreProperties;
+-import org.apache.tika.mime.MediaType;
+-import org.apache.tika.parser.AbstractParser;
+-import org.apache.tika.parser.ParseContext;
+-import org.apache.tika.sax.XHTMLContentHandler;
+-import org.xml.sax.ContentHandler;
+-import org.xml.sax.SAXException;
+-import org.xml.sax.helpers.AttributesImpl;
+-
+-/**
+- * Parser for MS Outlook PST email storage files
+- */
+-public class OutlookPSTParser extends AbstractParser {
+-
+- private static final long serialVersionUID = 620998217748364063L;
+-
+- public static final MediaType MS_OUTLOOK_PST_MIMETYPE = MediaType.application("vnd.ms-outlook-pst");
+- private static final Set<MediaType> SUPPORTED_TYPES = singleton(MS_OUTLOOK_PST_MIMETYPE);
+-
+- private static AttributesImpl createAttribute(String attName, String attValue) {
+- AttributesImpl attributes = new AttributesImpl();
+- attributes.addAttribute("", attName, attName, "CDATA", attValue);
+- return attributes;
+- }
+-
+- public Set<MediaType> getSupportedTypes(ParseContext context) {
+- return SUPPORTED_TYPES;
+- }
+-
+- public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+- throws IOException, SAXException, TikaException {
+-
+- // Use the delegate parser to parse the contained document
+- EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class,
+- new ParsingEmbeddedDocumentExtractor(context));
+-
+- metadata.set(Metadata.CONTENT_TYPE, MS_OUTLOOK_PST_MIMETYPE.toString());
+-
+- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+- xhtml.startDocument();
+-
+- TikaInputStream in = TikaInputStream.get(stream);
+- PSTFile pstFile = null;
+- try {
+- pstFile = new PSTFile(in.getFile().getPath());
+- metadata.set(Metadata.CONTENT_LENGTH, valueOf(pstFile.getFileHandle().length()));
+- boolean isValid = pstFile.getFileHandle().getFD().valid();
+- metadata.set("isValid", valueOf(isValid));
+- if (isValid) {
+- parseFolder(xhtml, pstFile.getRootFolder(), embeddedExtractor);
+- }
+- } catch (Exception e) {
+- throw new TikaException(e.getMessage(), e);
+- } finally {
+- if (pstFile != null && pstFile.getFileHandle() != null) {
+- try {
+- pstFile.getFileHandle().close();
+- } catch (IOException e) {
+- //swallow closing exception
+- }
+- }
+- }
+-
+- xhtml.endDocument();
+- }
+-
+- private void parseFolder(XHTMLContentHandler handler, PSTFolder pstFolder, EmbeddedDocumentExtractor embeddedExtractor)
+- throws Exception {
+- if (pstFolder.getContentCount() > 0) {
+- PSTMessage pstMail = (PSTMessage) pstFolder.getNextChild();
+- while (pstMail != null) {
+- AttributesImpl attributes = new AttributesImpl();
+- attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+- attributes.addAttribute("", "id", "id", "CDATA", pstMail.getInternetMessageId());
+- handler.startElement("div", attributes);
+- handler.element("h1", pstMail.getSubject());
+-
+- parserMailItem(handler, pstMail, embeddedExtractor);
+- parseMailAttachments(handler, pstMail, embeddedExtractor);
+-
+- handler.endElement("div");
+-
+- pstMail = (PSTMessage) pstFolder.getNextChild();
+- }
+- }
+-
+- if (pstFolder.hasSubfolders()) {
+- for (PSTFolder pstSubFolder : pstFolder.getSubFolders()) {
+- handler.startElement("div", createAttribute("class", "email-folder"));
+- handler.element("h1", pstSubFolder.getDisplayName());
+- parseFolder(handler, pstSubFolder, embeddedExtractor);
+- handler.endElement("div");
+- }
+- }
+- }
+-
+- private void parserMailItem(XHTMLContentHandler handler, PSTMessage pstMail, EmbeddedDocumentExtractor embeddedExtractor) throws SAXException, IOException {
+- Metadata mailMetadata = new Metadata();
+- mailMetadata.set(Metadata.RESOURCE_NAME_KEY, pstMail.getInternetMessageId());
+- mailMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, pstMail.getInternetMessageId());
+- mailMetadata.set(TikaCoreProperties.IDENTIFIER, pstMail.getInternetMessageId());
+- mailMetadata.set(TikaCoreProperties.TITLE, pstMail.getSubject());
+- mailMetadata.set(Metadata.MESSAGE_FROM, pstMail.getSenderName());
+- mailMetadata.set(TikaCoreProperties.CREATOR, pstMail.getSenderName());
+- mailMetadata.set(TikaCoreProperties.CREATED, pstMail.getCreationTime());
+- mailMetadata.set(TikaCoreProperties.MODIFIED, pstMail.getLastModificationTime());
+- mailMetadata.set(TikaCoreProperties.COMMENTS, pstMail.getComment());
+- mailMetadata.set("descriptorNodeId", valueOf(pstMail.getDescriptorNodeId()));
+- mailMetadata.set("senderEmailAddress", pstMail.getSenderEmailAddress());
+- mailMetadata.set("recipients", pstMail.getRecipientsString());
+- mailMetadata.set("displayTo", pstMail.getDisplayTo());
+- mailMetadata.set("displayCC", pstMail.getDisplayCC());
+- mailMetadata.set("displayBCC", pstMail.getDisplayBCC());
+- mailMetadata.set("importance", valueOf(pstMail.getImportance()));
+- mailMetadata.set("priority", valueOf(pstMail.getPriority()));
+- mailMetadata.set("flagged", valueOf(pstMail.isFlagged()));
+-
+- byte[] mailContent = pstMail.getBody().getBytes(UTF_8);
+- embeddedExtractor.parseEmbedded(new ByteArrayInputStream(mailContent), handler, mailMetadata, true);
+- }
+-
+- private void parseMailAttachments(XHTMLContentHandler xhtml, PSTMessage email, EmbeddedDocumentExtractor embeddedExtractor)
+- throws TikaException {
+- int numberOfAttachments = email.getNumberOfAttachments();
+- for (int i = 0; i < numberOfAttachments; i++) {
+- File tempFile = null;
+- try {
+- PSTAttachment attach = email.getAttachment(i);
+-
+- // Get the filename; both long and short filenames can be used for attachments
+- String filename = attach.getLongFilename();
+- if (filename.isEmpty()) {
+- filename = attach.getFilename();
+- }
+-
+- xhtml.element("p", filename);
+-
+- Metadata attachMeta = new Metadata();
+- attachMeta.set(Metadata.RESOURCE_NAME_KEY, filename);
+- attachMeta.set(Metadata.EMBEDDED_RELATIONSHIP_ID, filename);
+- AttributesImpl attributes = new AttributesImpl();
+- attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+- attributes.addAttribute("", "id", "id", "CDATA", filename);
+- xhtml.startElement("div", attributes);
+- if (embeddedExtractor.shouldParseEmbedded(attachMeta)) {
+- TemporaryResources tmp = new TemporaryResources();
+- try {
+- TikaInputStream tis = TikaInputStream.get(attach.getFileInputStream(), tmp);
+- embeddedExtractor.parseEmbedded(tis, xhtml, attachMeta, true);
+- } finally {
+- tmp.dispose();
+- }
+- }
+- xhtml.endElement("div");
+-
+- } catch (Exception e) {
+- throw new TikaException("Unable to unpack document stream", e);
+- } finally {
+- if (tempFile != null)
+- tempFile.delete();
+- }
+- }
+- }
+-
+-}
diff --git a/debian/patches/ignore-javax.ws.rs.core.patch b/debian/patches/ignore-javax.ws.rs.core.patch
new file mode 100644
index 0000000..07ca9ff
--- /dev/null
+++ b/debian/patches/ignore-javax.ws.rs.core.patch
@@ -0,0 +1,127 @@
+From: Markus Koschany <apo at debian.org>
+Date: Tue, 1 Dec 2015 19:12:46 +0100
+Subject: ignore javax.ws.rs.core
+
+---
+ .../tika/parser/journal/GrobidRESTParser.java | 112 ---------------------
+ 1 file changed, 112 deletions(-)
+ delete mode 100644 tika-parsers/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java
+
+diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java
+deleted file mode 100644
+index 05b09fc..0000000
+--- a/tika-parsers/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java
++++ /dev/null
+@@ -1,112 +0,0 @@
+-/**
+- * Licensed to the Apache Software Foundation (ASF) under one or more
+- * contributor license agreements. See the NOTICE file distributed with
+- * this work for additional information regarding copyright ownership.
+- * The ASF licenses this file to You under the Apache License, Version 2.0
+- * (the "License"); you may not use this file except in compliance with
+- * the License. You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-package org.apache.tika.parser.journal;
+-
+-import java.io.File;
+-import java.io.FileInputStream;
+-import java.io.FileNotFoundException;
+-import java.io.IOException;
+-import java.util.Properties;
+-
+-import javax.ws.rs.core.MediaType;
+-import javax.ws.rs.core.Response;
+-
+-import org.apache.cxf.jaxrs.client.WebClient;
+-import org.apache.cxf.jaxrs.ext.multipart.Attachment;
+-import org.apache.cxf.jaxrs.ext.multipart.ContentDisposition;
+-import org.apache.cxf.jaxrs.ext.multipart.MultipartBody;
+-import org.apache.tika.metadata.Metadata;
+-import org.apache.tika.parser.ParseContext;
+-import org.xml.sax.ContentHandler;
+-
+-public class GrobidRESTParser {
+-
+- private static final String GROBID_REST_HOST = "http://localhost:8080";
+-
+- private static final String GROBID_ISALIVE_PATH = "/grobid"; // isalive
+- // doesn't work
+- // nfc why
+-
+- private static final String GROBID_PROCESSHEADER_PATH = "/processHeaderDocument";
+-
+- private String restHostUrlStr;
+-
+- public GrobidRESTParser() {
+- String restHostUrlStr = null;
+- try {
+- restHostUrlStr = readRestUrl();
+- } catch (IOException e) {
+- e.printStackTrace();
+- }
+-
+- if (restHostUrlStr == null
+- || (restHostUrlStr != null && restHostUrlStr.equals(""))) {
+- this.restHostUrlStr = GROBID_REST_HOST;
+- } else {
+- this.restHostUrlStr = restHostUrlStr;
+- }
+- }
+-
+- public void parse(String filePath, ContentHandler handler, Metadata metadata,
+- ParseContext context) throws FileNotFoundException {
+-
+- File pdfFile = new File(filePath);
+- ContentDisposition cd = new ContentDisposition(
+- "form-data; name=\"input\"; filename=\"" + pdfFile.getName() + "\"");
+- Attachment att = new Attachment("input", new FileInputStream(pdfFile), cd);
+- MultipartBody body = new MultipartBody(att);
+-
+- Response response = WebClient
+- .create(restHostUrlStr + GROBID_PROCESSHEADER_PATH)
+- .accept(MediaType.APPLICATION_XML).type(MediaType.MULTIPART_FORM_DATA)
+- .post(body);
+-
+- try {
+- String resp = response.readEntity(String.class);
+- Metadata teiMet = new TEIParser().parse(resp);
+- for (String key : teiMet.names()) {
+- metadata.add("grobid:header_" + key, teiMet.get(key));
+- }
+- } catch (Exception e) {
+- e.printStackTrace();
+- }
+- }
+-
+- private static String readRestUrl() throws IOException {
+- Properties grobidProperties = new Properties();
+- grobidProperties.load(GrobidRESTParser.class
+- .getResourceAsStream("GrobidExtractor.properties"));
+-
+- return grobidProperties.getProperty("grobid.server.url");
+- }
+-
+- protected static boolean canRun() {
+- Response response = null;
+-
+- try {
+- response = WebClient.create(readRestUrl() + GROBID_ISALIVE_PATH)
+- .accept(MediaType.TEXT_HTML).get();
+- String resp = response.readEntity(String.class);
+- return resp != null && !resp.equals("") && resp.startsWith("<h4>");
+- } catch (Exception e) {
+- e.printStackTrace();
+- return false;
+- }
+- }
+-
+-}
diff --git a/debian/patches/ignore-opennlp.tools.namefind.patch b/debian/patches/ignore-opennlp.tools.namefind.patch
new file mode 100644
index 0000000..912fa44
--- /dev/null
+++ b/debian/patches/ignore-opennlp.tools.namefind.patch
@@ -0,0 +1,142 @@
+From: Markus Koschany <apo at debian.org>
+Date: Tue, 1 Dec 2015 19:12:18 +0100
+Subject: ignore opennlp.tools.namefind
+
+---
+ .../tika/parser/geo/topic/NameEntityExtractor.java | 127 ---------------------
+ 1 file changed, 127 deletions(-)
+ delete mode 100644 tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java
+
+diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java
+deleted file mode 100644
+index e7435d1..0000000
+--- a/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java
++++ /dev/null
+@@ -1,127 +0,0 @@
+-/*
+- * Licensed to the Apache Software Foundation (ASF) under one or more
+- * contributor license agreements. See the NOTICE file distributed with
+- * this work for additional information regarding copyright ownership.
+- * The ASF licenses this file to You under the Apache License, Version 2.0
+- * (the "License"); you may not use this file except in compliance with
+- * the License. You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-package org.apache.tika.parser.geo.topic;
+-
+-import java.io.FileInputStream;
+-import java.io.IOException;
+-import java.io.InputStream;
+-import java.util.ArrayList;
+-import java.util.Arrays;
+-import java.util.Collections;
+-import java.util.Comparator;
+-import java.util.HashMap;
+-import java.util.List;
+-import java.util.Map;
+-
+-import opennlp.tools.namefind.NameFinderME;
+-import opennlp.tools.namefind.TokenNameFinderModel;
+-import opennlp.tools.util.InvalidFormatException;
+-import opennlp.tools.util.Span;
+-
+-import org.apache.commons.io.IOUtils;
+-
+-import static java.nio.charset.StandardCharsets.UTF_8;
+-
+-public class NameEntityExtractor {
+- private String nerModelPath = null;
+- ArrayList<String> locationNameEntities;
+- String bestNameEntity;
+- private HashMap<String, Integer> tf;
+-
+- public NameEntityExtractor(String nerModelpath) {
+- this.locationNameEntities = new ArrayList<String>();
+- this.bestNameEntity = null;
+- this.nerModelPath = nerModelpath;
+- tf = new HashMap<String, Integer>();
+-
+- }
+-
+- /*
+- * Use OpenNLP to extract location names that's appearing in the steam.
+- * OpenNLP's default Name Finder accuracy is not very good, please refer to
+- * its documentation.
+- *
+- * @param stream stream that passed from this.parse()
+- */
+-
+- public void getAllNameEntitiesfromInput(InputStream stream)
+- throws InvalidFormatException, IOException {
+-
+- InputStream modelIn = new FileInputStream(nerModelPath);
+- TokenNameFinderModel model = new TokenNameFinderModel(modelIn);
+- NameFinderME nameFinder = new NameFinderME(model);
+- String[] in = IOUtils.toString(stream, UTF_8).split(" ");
+-
+- Span nameE[] = nameFinder.find(in);
+-
+- String spanNames = Arrays.toString(Span.spansToStrings(nameE, in));
+- spanNames = spanNames.substring(1, spanNames.length() - 1);
+- modelIn.close();
+- String[] tmp = spanNames.split(",");
+-
+- for (String name : tmp) {
+- name = name.trim();
+- this.locationNameEntities.add(name);
+- }
+-
+- }
+-
+- /*
+- * Get the best location entity extracted from the input stream. Simply
+- * return the most frequent entity, If there several highest frequent
+- * entity, pick one randomly. May not be the optimal solution, but works.
+- *
+- * @param locationNameEntities OpenNLP name finder's results, stored in
+- * ArrayList
+- */
+- public void getBestNameEntity() {
+- if (this.locationNameEntities.size() == 0)
+- return;
+-
+- for (int i = 0; i < this.locationNameEntities.size(); ++i) {
+- if (tf.containsKey(this.locationNameEntities.get(i)))
+- tf.put(this.locationNameEntities.get(i),
+- tf.get(this.locationNameEntities.get(i)) + 1);
+- else
+- tf.put(this.locationNameEntities.get(i), 1);
+- }
+- int max = 0;
+- List<Map.Entry<String, Integer>> list = new ArrayList<Map.Entry<String, Integer>>(
+- tf.entrySet());
+- Collections.shuffle(list);
+- Collections.sort(list, new Comparator<Map.Entry<String, Integer>>() {
+- public int compare(Map.Entry<String, Integer> o1,
+- Map.Entry<String, Integer> o2) {
+- return o2.getValue().compareTo(o1.getValue()); // descending
+- // order
+-
+- }
+- });
+-
+- this.locationNameEntities.clear();// update so that they are in
+- // descending order
+- for (Map.Entry<String, Integer> entry : list) {
+- this.locationNameEntities.add(entry.getKey());
+- if (entry.getValue() > max) {
+- max = entry.getValue();
+- this.bestNameEntity = entry.getKey();
+- }
+- }
+- }
+-
+-}
diff --git a/debian/patches/ignore-org.apache.ctakes.patch b/debian/patches/ignore-org.apache.ctakes.patch
new file mode 100644
index 0000000..d362d8d
--- /dev/null
+++ b/debian/patches/ignore-org.apache.ctakes.patch
@@ -0,0 +1,1014 @@
+From: Markus Koschany <apo at debian.org>
+Date: Tue, 1 Dec 2015 19:06:44 +0100
+Subject: ignore org.apache.ctakes
+
+---
+ .../parser/ctakes/CTAKESAnnotationProperty.java | 46 ---
+ .../apache/tika/parser/ctakes/CTAKESConfig.java | 336 ---------------------
+ .../tika/parser/ctakes/CTAKESContentHandler.java | 176 -----------
+ .../apache/tika/parser/ctakes/CTAKESParser.java | 92 ------
+ .../tika/parser/ctakes/CTAKESSerializer.java | 42 ---
+ .../org/apache/tika/parser/ctakes/CTAKESUtils.java | 265 ----------------
+ 6 files changed, 957 deletions(-)
+ delete mode 100644 tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java
+ delete mode 100644 tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java
+ delete mode 100644 tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESContentHandler.java
+ delete mode 100644 tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java
+ delete mode 100644 tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESSerializer.java
+ delete mode 100644 tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java
+
+diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java b/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java
+deleted file mode 100644
+index e6d261d..0000000
+--- a/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java
++++ /dev/null
+@@ -1,46 +0,0 @@
+-/*
+- * Licensed to the Apache Software Foundation (ASF) under one or more
+- * contributor license agreements. See the NOTICE file distributed with
+- * this work for additional information regarding copyright ownership.
+- * The ASF licenses this file to You under the Apache License, Version 2.0
+- * (the "License"); you may not use this file except in compliance with
+- * the License. You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-package org.apache.tika.parser.ctakes;
+-
+-import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+-
+-/**
+- * This enumeration includes the properties that an {@see IdentifiedAnnotation} object can provide.
+- *
+- */
+-public enum CTAKESAnnotationProperty {
+- BEGIN("start"),
+- END("end"),
+- CONDITIONAL("conditional"),
+- CONFIDENCE("confidence"),
+- DISCOVERY_TECNIQUE("discoveryTechnique"),
+- GENERIC("generic"),
+- HISTORY_OF("historyOf"),
+- ID("id"),
+- ONTOLOGY_CONCEPT_ARR("ontologyConceptArr"),
+- POLARITY("polarity");
+-
+- private String name;
+-
+- CTAKESAnnotationProperty(String name) {
+- this.name = name;
+- }
+-
+- public String getName() {
+- return name;
+- }
+-}
+\ No newline at end of file
+diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java
+deleted file mode 100644
+index 67ba993..0000000
+--- a/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java
++++ /dev/null
+@@ -1,336 +0,0 @@
+-/*
+- * Licensed to the Apache Software Foundation (ASF) under one or more
+- * contributor license agreements. See the NOTICE file distributed with
+- * this work for additional information regarding copyright ownership.
+- * The ASF licenses this file to You under the Apache License, Version 2.0
+- * (the "License"); you may not use this file except in compliance with
+- * the License. You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-package org.apache.tika.parser.ctakes;
+-
+-import java.io.IOException;
+-import java.io.InputStream;
+-import java.io.OutputStream;
+-import java.io.Serializable;
+-import java.util.Properties;
+-
+-import static org.apache.commons.io.output.NullOutputStream.NULL_OUTPUT_STREAM;
+-
+-/**
+- * Configuration for {@see CTAKESContentHandler}.
+- *
+- * This class allows to enable cTAKES and set its parameters.
+- */
+-public class CTAKESConfig implements Serializable {
+- /**
+- * Serial version UID
+- */
+- private static final long serialVersionUID = -1599741171775528923L;
+-
+- // Path to XML descriptor for AnalysisEngine
+- private String aeDescriptorPath = "/ctakes-core/desc/analysis_engine/SentencesAndTokensAggregate.xml";
+-
+- // UMLS username
+- private String UMLSUser = "";
+-
+- // UMLS password
+- private String UMLSPass = "";
+-
+- // Enables formatted output
+- private boolean prettyPrint = true;
+-
+- // Type of cTAKES (UIMA) serializer
+- private CTAKESSerializer serializerType = CTAKESSerializer.XMI;
+-
+- // OutputStream object used for CAS serialization
+- private OutputStream stream = NULL_OUTPUT_STREAM;
+-
+- // Enables CAS serialization
+- private boolean serialize = false;
+-
+- // Enables text analysis using cTAKES
+- private boolean text = true;
+-
+- // List of metadata to analyze using cTAKES
+- private String[] metadata = null;
+-
+- // List of annotation properties to add to metadata in addition to text covered by an annotation
+- private CTAKESAnnotationProperty[] annotationProps = null;
+-
+- // Character used to separate the annotation properties into metadata
+- private char separatorChar = ':';
+-
+- /**
+- * Default constructor.
+- */
+- public CTAKESConfig() {
+- init(this.getClass().getResourceAsStream("CTAKESConfig.properties"));
+- }
+-
+- /**
+- * Loads properties from InputStream and then tries to close InputStream.
+- * @param stream {@see InputStream} object used to read properties.
+- */
+- public CTAKESConfig(InputStream stream) {
+- init(stream);
+- }
+-
+- private void init(InputStream stream) {
+- if (stream == null) {
+- return;
+- }
+- Properties props = new Properties();
+-
+- try {
+- props.load(stream);
+- } catch (IOException e) {
+- // TODO warning
+- } finally {
+- if (stream != null) {
+- try {
+- stream.close();
+- } catch (IOException ioe) {
+- // TODO warning
+- }
+- }
+- }
+-
+- setAeDescriptorPath(props.getProperty("aeDescriptorPath", getAeDescriptorPath()));
+- setUMLSUser(props.getProperty("UMLSUser", getUMLSUser()));
+- setUMLSPass(props.getProperty("UMLSPass", getUMLSPass()));
+- setText(Boolean.valueOf(props.getProperty("text", Boolean.toString(isText()))));
+- setMetadata(props.getProperty("metadata", getMetadataAsString()).split(","));
+- setAnnotationProps(props.getProperty("annotationProps", getAnnotationPropsAsString()).split(","));
+- setSeparatorChar(props.getProperty("separatorChar", Character.toString(getSeparatorChar())).charAt(0));
+- }
+-
+- /**
+- * Returns the path to XML descriptor for AnalysisEngine.
+- * @return the path to XML descriptor for AnalysisEngine.
+- */
+- public String getAeDescriptorPath() {
+- return aeDescriptorPath;
+- }
+-
+- /**
+- * Returns the UMLS username.
+- * @return the UMLS username.
+- */
+- public String getUMLSUser() {
+- return UMLSUser;
+- }
+-
+- /**
+- * Returns the UMLS password.
+- * @return the UMLS password.
+- */
+- public String getUMLSPass() {
+- return UMLSPass;
+- }
+-
+- /**
+- * Returns {@code true} if formatted output is enabled, {@code false} otherwise.
+- * @return {@code true} if formatted output is enabled, {@code false} otherwise.
+- */
+- public boolean isPrettyPrint() {
+- return prettyPrint;
+- }
+-
+- /**
+- * Returns the type of cTAKES (UIMA) serializer used to write the CAS.
+- * @return the type of cTAKES serializer.
+- */
+- public CTAKESSerializer getSerializerType() {
+- return serializerType;
+- }
+-
+- /**
+- * Returns an {@see OutputStream} object used write the CAS.
+- * @return {@see OutputStream} object used write the CAS.
+- */
+- public OutputStream getOutputStream() {
+- return stream;
+- }
+-
+- /**
+- * Returns {@code true} if CAS serialization is enabled, {@code false} otherwise.
+- * @return {@code true} if CAS serialization output is enabled, {@code false} otherwise.
+- */
+- public boolean isSerialize() {
+- return serialize;
+- }
+-
+- /**
+- * Returns {@code true} if content text analysis is enabled {@code false} otherwise.
+- * @return {@code true} if content text analysis is enabled {@code false} otherwise.
+- */
+- public boolean isText() {
+- return text;
+- }
+-
+- /**
+- * Returns an array of metadata whose values will be analyzed using cTAKES.
+- * @return an array of metadata whose values will be analyzed using cTAKES.
+- */
+- public String[] getMetadata() {
+- return metadata;
+- }
+-
+- /**
+- * Returns a string containing a comma-separated list of metadata whose values will be analyzed using cTAKES.
+- * @return a string containing a comma-separated list of metadata whose values will be analyzed using cTAKES.
+- */
+- public String getMetadataAsString() {
+- if (metadata == null) {
+- return "";
+- }
+- StringBuilder sb = new StringBuilder();
+- for (int i = 0; i < metadata.length; i++) {
+- sb.append(metadata[i]);
+- if (i < metadata.length-1) {
+- sb.append(",");
+- }
+- }
+- return sb.toString();
+- }
+-
+- /**
+- * Returns an array of {@see CTAKESAnnotationProperty}'s that will be included into cTAKES metadata.
+- * @return an array of {@see CTAKESAnnotationProperty}'s that will be included into cTAKES metadata.
+- */
+- public CTAKESAnnotationProperty[] getAnnotationProps() {
+- return annotationProps;
+- }
+-
+- /**
+- * Returns a string containing a comma-separated list of {@see CTAKESAnnotationProperty} names that will be included into cTAKES metadata.
+- * @return
+- */
+- public String getAnnotationPropsAsString() {
+- StringBuilder sb = new StringBuilder();
+- sb.append("coveredText");
+- if (annotationProps != null) {
+- for (CTAKESAnnotationProperty property : annotationProps) {
+- sb.append(separatorChar);
+- sb.append(property.getName());
+- }
+- }
+- return sb.toString();
+- }
+-
+- /**
+- * Returns the separator character used for annotation properties.
+- * @return the separator character used for annotation properties.
+- */
+- public char getSeparatorChar() {
+- return separatorChar;
+- }
+-
+- /**
+- * Sets the path to XML descriptor for AnalysisEngine.
+- * @param aeDescriptorPath the path to XML descriptor for AnalysisEngine.
+- */
+- public void setAeDescriptorPath(String aeDescriptorPath) {
+- this.aeDescriptorPath = aeDescriptorPath;
+- }
+-
+- /**
+- * Sets the UMLS username.
+- * @param uMLSUser the UMLS username.
+- */
+- public void setUMLSUser(String uMLSUser) {
+- this.UMLSUser = uMLSUser;
+- }
+-
+- /**
+- * Sets the UMLS password.
+- * @param uMLSPass the UMLS password.
+- */
+- public void setUMLSPass(String uMLSPass) {
+- this.UMLSPass = uMLSPass;
+- }
+-
+- /**
+- * Enables the formatted output for serializer.
+- * @param prettyPrint {@true} to enable formatted output, {@code false} otherwise.
+- */
+- public void setPrettyPrint(boolean prettyPrint) {
+- this.prettyPrint = prettyPrint;
+- }
+-
+- /**
+- * Sets the type of cTAKES (UIMA) serializer used to write CAS.
+- * @param serializerType the type of cTAKES serializer.
+- */
+- public void setSerializerType(CTAKESSerializer serializerType) {
+- this.serializerType = serializerType;
+- }
+-
+- /**
+- * Sets the {@see OutputStream} object used to write the CAS.
+- * @param stream the {@see OutputStream} object used to write the CAS.
+- */
+- public void setOutputStream(OutputStream stream) {
+- this.stream = stream;
+- }
+-
+- /**
+- * Enables CAS serialization.
+- * @param serialize {@true} to enable CAS serialization, {@code false} otherwise.
+- */
+- public void setSerialize(boolean serialize) {
+- this.serialize = serialize;
+- }
+-
+- /**
+- * Enables content text analysis using cTAKES.
+- * @param text {@true} to enable content text analysis, {@code false} otherwise.
+- */
+- public void setText(boolean text) {
+- this.text = text;
+- }
+-
+- /**
+- * Sets the metadata whose values will be analyzed using cTAKES.
+- * @param metadata the metadata whose values will be analyzed using cTAKES.
+- */
+- public void setMetadata(String[] metadata) {
+- this.metadata = metadata;
+- }
+-
+- /**
+- * Sets the {@see CTAKESAnnotationProperty}'s that will be included into cTAKES metadata.
+- * @param annotationProps the {@see CTAKESAnnotationProperty}'s that will be included into cTAKES metadata.
+- */
+- public void setAnnotationProps(CTAKESAnnotationProperty[] annotationProps) {
+- this.annotationProps = annotationProps;
+- }
+-
+- /**
+- * ets the {@see CTAKESAnnotationProperty}'s that will be included into cTAKES metadata.
+- * @param annotationProps the {@see CTAKESAnnotationProperty}'s that will be included into cTAKES metadata.
+- */
+- public void setAnnotationProps(String[] annotationProps) {
+- CTAKESAnnotationProperty[] properties = new CTAKESAnnotationProperty[annotationProps.length];
+- for (int i = 0; i < annotationProps.length; i++) {
+- properties[i] = CTAKESAnnotationProperty.valueOf(annotationProps[i]);
+- }
+- setAnnotationProps(properties);
+- }
+-
+- /**
+- * Sets the separator character used for annotation properties.
+- * @param separatorChar the separator character used for annotation properties.
+- */
+- public void setSeparatorChar(char separatorChar) {
+- this.separatorChar = separatorChar;
+- }
+-}
+\ No newline at end of file
+diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESContentHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESContentHandler.java
+deleted file mode 100644
+index 38326e3..0000000
+--- a/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESContentHandler.java
++++ /dev/null
+@@ -1,176 +0,0 @@
+-/*
+- * Licensed to the Apache Software Foundation (ASF) under one or more
+- * contributor license agreements. See the NOTICE file distributed with
+- * this work for additional information regarding copyright ownership.
+- * The ASF licenses this file to You under the Apache License, Version 2.0
+- * (the "License"); you may not use this file except in compliance with
+- * the License. You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-package org.apache.tika.parser.ctakes;
+-
+-import java.util.Collection;
+-import java.util.Iterator;
+-
+-import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+-import org.apache.tika.metadata.Metadata;
+-import org.apache.tika.sax.ContentHandlerDecorator;
+-import org.apache.uima.analysis_engine.AnalysisEngine;
+-import org.apache.uima.fit.util.JCasUtil;
+-import org.apache.uima.jcas.JCas;
+-import org.xml.sax.ContentHandler;
+-import org.xml.sax.SAXException;
+-import org.xml.sax.helpers.DefaultHandler;
+-
+-/**
+- * Class used to extract biomedical information while parsing.
+- *
+- * <p>
+- * This class relies on <a href="http://ctakes.apache.org/">Apache cTAKES</a>
+- * that is a natural language processing system for extraction of information
+- * from electronic medical record clinical free-text.
+- * </p>
+- */
+-public class CTAKESContentHandler extends ContentHandlerDecorator {
+- // Prefix used for metadata including cTAKES annotations
+- public static String CTAKES_META_PREFIX = "ctakes:";
+-
+- // Configuration object for CTAKESContentHandler
+- private CTAKESConfig config = null;
+-
+- // StringBuilder object used to build the clinical free-text for cTAKES
+- private StringBuilder sb = null;
+-
+- // Metadata object used for cTAKES annotations
+- private Metadata metadata = null;
+-
+- // UIMA Analysis Engine
+- private AnalysisEngine ae = null;
+-
+- // JCas object for working with the CAS (Common Analysis System)
+- private JCas jcas = null;
+-
+- /**
+- * Creates a new {@see CTAKESContentHandler} for the given {@see
+- * ContentHandler} and Metadata objects.
+- *
+- * @param handler
+- * the {@see ContentHandler} object to be decorated.
+- * @param metadata
+- * the {@see Metadata} object that will be populated using
+- * biomedical information extracted by cTAKES.
+- * @param config
+- * the {@see CTAKESConfig} object used to configure the handler.
+- */
+- public CTAKESContentHandler(ContentHandler handler, Metadata metadata,
+- CTAKESConfig config) {
+- super(handler);
+- this.metadata = metadata;
+- this.config = config;
+- this.sb = new StringBuilder();
+- }
+-
+- /**
+- * Creates a new {@see CTAKESContentHandler} for the given {@see
+- * ContentHandler} and Metadata objects.
+- *
+- * @param handler
+- * the {@see ContentHandler} object to be decorated.
+- * @param metadata
+- * the {@see Metadata} object that will be populated using
+- * biomedical information extracted by cTAKES.
+- */
+- public CTAKESContentHandler(ContentHandler handler, Metadata metadata) {
+- this(handler, metadata, new CTAKESConfig());
+- }
+-
+- /**
+- * Default constructor.
+- */
+- public CTAKESContentHandler() {
+- this(new DefaultHandler(), new Metadata());
+- }
+-
+- @Override
+- public void characters(char[] ch, int start, int length)
+- throws SAXException {
+- if (config.isText()) {
+- sb.append(ch, start, length);
+- }
+- super.characters(ch, start, length);
+- }
+-
+- @Override
+- public void endDocument() throws SAXException {
+- try {
+- // create an Analysis Engine
+- if (ae == null) {
+- ae = CTAKESUtils.getAnalysisEngine(config.getAeDescriptorPath(), config.getUMLSUser(), config.getUMLSPass());
+- }
+-
+- // create a JCas, given an AE
+- if (jcas == null) {
+- jcas = CTAKESUtils.getJCas(ae);
+- }
+-
+- // get metadata to process
+- StringBuilder metaText = new StringBuilder();
+- String[] metadataToProcess = config.getMetadata();
+- if (metadataToProcess != null) {
+- for (String name : config.getMetadata()) {
+- for (String value : metadata.getValues(name)) {
+- metaText.append(value);
+- metaText.append(System.lineSeparator());
+- }
+- }
+- }
+-
+- // analyze text
+- jcas.setDocumentText(metaText.toString() + sb.toString());
+- ae.process(jcas);
+-
+- // add annotations to metadata
+- metadata.add(CTAKES_META_PREFIX + "schema", config.getAnnotationPropsAsString());
+- CTAKESAnnotationProperty[] annotationPros = config.getAnnotationProps();
+- Collection<IdentifiedAnnotation> collection = JCasUtil.select(jcas, IdentifiedAnnotation.class);
+- Iterator<IdentifiedAnnotation> iterator = collection.iterator();
+- while (iterator.hasNext()) {
+- IdentifiedAnnotation annotation = iterator.next();
+- StringBuilder annotationBuilder = new StringBuilder();
+- annotationBuilder.append(annotation.getCoveredText());
+- if (annotationPros != null) {
+- for (CTAKESAnnotationProperty property : annotationPros) {
+- annotationBuilder.append(config.getSeparatorChar());
+- annotationBuilder.append(CTAKESUtils.getAnnotationProperty(annotation, property));
+- }
+- }
+- metadata.add(CTAKES_META_PREFIX + annotation.getType().getShortName(), annotationBuilder.toString());
+- }
+-
+- if (config.isSerialize()) {
+- // serialize data
+- CTAKESUtils.serialize(jcas, config.getSerializerType(), config.isPrettyPrint(), config.getOutputStream());
+- }
+- } catch (Exception e) {
+- throw new SAXException(e.getMessage());
+- } finally {
+- CTAKESUtils.resetCAS(jcas);
+- }
+- }
+-
+- /**
+- * Returns metadata that includes cTAKES annotations.
+- *
+- * @return {@Metadata} object that includes cTAKES annotations.
+- */
+- public Metadata getMetadata() {
+- return metadata;
+- }
+-}
+diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java
+deleted file mode 100644
+index acd1965..0000000
+--- a/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java
++++ /dev/null
+@@ -1,92 +0,0 @@
+-/*
+- * Licensed to the Apache Software Foundation (ASF) under one or more
+- * contributor license agreements. See the NOTICE file distributed with
+- * this work for additional information regarding copyright ownership.
+- * The ASF licenses this file to You under the Apache License, Version 2.0
+- * (the "License"); you may not use this file except in compliance with
+- * the License. You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-package org.apache.tika.parser.ctakes;
+-
+-import java.io.IOException;
+-import java.io.InputStream;
+-
+-import org.apache.tika.config.TikaConfig;
+-import org.apache.tika.exception.TikaException;
+-import org.apache.tika.metadata.Metadata;
+-import org.apache.tika.parser.AutoDetectParser;
+-import org.apache.tika.parser.ParseContext;
+-import org.apache.tika.parser.Parser;
+-import org.apache.tika.parser.ParserDecorator;
+-import org.xml.sax.ContentHandler;
+-import org.xml.sax.SAXException;
+-
+-/**
+- * CTAKESParser decorates a {@see Parser} and leverages on
+- * {@see CTAKESContentHandler} to extract biomedical information from
+- * clinical text using Apache cTAKES.
+- * <p>It is normally called by supplying an instance to
+- * {@link AutoDetectParser}, such as:
+- * <code>AutoDetectParser parser = new AutoDetectParser(new CTAKESParser());</code>
+- * <p>It can also be used by giving a Tika Config file similar to:
+- * <code>
+- * <properties>
+- * <parsers>
+- * <parser class="org.apache.tika.parser.ctakes.CTAKESParser">
+- * <parser class="org.apache.tika.parser.DefaultParser"/>
+- * </parser>
+- * </parsers>
+- * </properties>
+- * </code>
+- * <p>Because this is a Parser Decorator, and not a normal Parser in
+- * it's own right, it isn't normally selected via the Parser Service Loader.
+- */
+-public class CTAKESParser extends ParserDecorator {
+- /**
+- * Serial version UID
+- */
+- private static final long serialVersionUID = -2313482748027097961L;
+-
+- /**
+- * Wraps the default Parser
+- */
+- public CTAKESParser() {
+- this(TikaConfig.getDefaultConfig());
+- }
+- /**
+- * Wraps the default Parser for this Config
+- */
+- public CTAKESParser(TikaConfig config) {
+- this(config.getParser());
+- }
+- /**
+- * Wraps the specified Parser
+- */
+- public CTAKESParser(Parser parser) {
+- super(parser);
+- }
+-
+- @Override
+- public void parse(InputStream stream, ContentHandler handler,
+- Metadata metadata, ParseContext context) throws IOException,
+- SAXException, TikaException {
+- CTAKESConfig config = context.get(CTAKESConfig.class,
+- new CTAKESConfig());
+- CTAKESContentHandler ctakesHandler = new CTAKESContentHandler(handler,
+- metadata, config);
+- super.parse(stream, ctakesHandler, metadata, context);
+- }
+-
+- //@Override
+- public String getDecorationName() {
+- return "CTakes";
+- }
+-}
+diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESSerializer.java b/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESSerializer.java
+deleted file mode 100644
+index 4d4e4e2..0000000
+--- a/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESSerializer.java
++++ /dev/null
+@@ -1,42 +0,0 @@
+-/*
+- * Licensed to the Apache Software Foundation (ASF) under one or more
+- * contributor license agreements. See the NOTICE file distributed with
+- * this work for additional information regarding copyright ownership.
+- * The ASF licenses this file to You under the Apache License, Version 2.0
+- * (the "License"); you may not use this file except in compliance with
+- * the License. You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-package org.apache.tika.parser.ctakes;
+-
+-import org.apache.uima.cas.impl.XCASSerializer;
+-import org.apache.uima.cas.impl.XmiCasSerializer;
+-import org.apache.uima.util.XmlCasSerializer;
+-
+-/**
+- * Enumeration for types of cTAKES (UIMA) CAS serializer supported by cTAKES.
+- *
+- * A CAS serializer writes a CAS in the given format.
+- */
+-public enum CTAKESSerializer {
+- XCAS(XCASSerializer.class.getName()),
+- XMI(XmiCasSerializer.class.getName()),
+- XML(XmlCasSerializer.class.getName());
+-
+- private final String className;
+-
+- private CTAKESSerializer(String className) {
+- this.className = className;
+- }
+-
+- public String getClassName() {
+- return className;
+- }
+-}
+diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java b/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java
+deleted file mode 100644
+index 23f281a..0000000
+--- a/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java
++++ /dev/null
+@@ -1,265 +0,0 @@
+-/*
+- * Licensed to the Apache Software Foundation (ASF) under one or more
+- * contributor license agreements. See the NOTICE file distributed with
+- * this work for additional information regarding copyright ownership.
+- * The ASF licenses this file to You under the Apache License, Version 2.0
+- * (the "License"); you may not use this file except in compliance with
+- * the License. You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-package org.apache.tika.parser.ctakes;
+-
+-import java.io.IOException;
+-import java.io.OutputStream;
+-import java.net.URISyntaxException;
+-
+-import org.apache.ctakes.typesystem.type.refsem.UmlsConcept;
+-import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+-import org.apache.uima.UIMAFramework;
+-import org.apache.uima.analysis_engine.AnalysisEngine;
+-import org.apache.uima.cas.impl.XCASSerializer;
+-import org.apache.uima.cas.impl.XmiCasSerializer;
+-import org.apache.uima.cas.impl.XmiSerializationSharedData;
+-import org.apache.uima.jcas.JCas;
+-import org.apache.uima.jcas.cas.FSArray;
+-import org.apache.uima.resource.ResourceInitializationException;
+-import org.apache.uima.resource.ResourceSpecifier;
+-import org.apache.uima.util.InvalidXMLException;
+-import org.apache.uima.util.XMLInputSource;
+-import org.apache.uima.util.XmlCasSerializer;
+-import org.xml.sax.SAXException;
+-
+-/**
+- * This class provides methods to extract biomedical information from plain text
+- * using {@see CTAKESContentHandler} that relies on Apache cTAKES.
+- *
+- * <p>
+- * Apache cTAKES is built on top of <a href="https://uima.apache.org/">Apache
+- * UIMA</a> framework and <a href="https://opennlp.apache.org/">OpenNLP</a>
+- * toolkit.
+- * </p>
+- */
+-public class CTAKESUtils {
+- // UMLS username property
+- private final static String CTAKES_UMLS_USER = "ctakes.umlsuser";
+-
+- // UMLS password property
+- private final static String CTAKES_UMLS_PASS = "ctakes.umlspw";
+-
+- /**
+- * Returns a new UIMA Analysis Engine (AE). This method ensures that only
+- * one instance of an AE is created.
+- *
+- * <p>
+- * An Analysis Engine is a component responsible for analyzing unstructured
+- * information, discovering and representing semantic content. Unstructured
+- * information includes, but is not restricted to, text documents.
+- * </p>
+- *
+- * @param aeDescriptor
+- * pathname for XML file including an AnalysisEngineDescription
+- * that contains all of the information needed to instantiate and
+- * use an AnalysisEngine.
+- * @param umlsUser
+- * UMLS username for NLM database
+- * @param umlsPass
+- * UMLS password for NLM database
+- * @return an Analysis Engine for analyzing unstructured information.
+- * @throws IOException
+- * if any I/O error occurs.
+- * @throws InvalidXMLException
+- * if the input XML is not valid or does not specify a valid
+- * ResourceSpecifier.
+- * @throws ResourceInitializationException
+- * if a failure occurred during production of the resource.
+- * @throws URISyntaxException
+- * if URL of the resource is not formatted strictly according to
+- * to RFC2396 and cannot be converted to a URI.
+- */
+- public static AnalysisEngine getAnalysisEngine(String aeDescriptor,
+- String umlsUser, String umlsPass) throws IOException,
+- InvalidXMLException, ResourceInitializationException,
+- URISyntaxException {
+- // UMLS user ID and password.
+- String aeDescriptorPath = CTAKESUtils.class.getResource(aeDescriptor)
+- .toURI().getPath();
+-
+- // get Resource Specifier from XML
+- XMLInputSource aeIputSource = new XMLInputSource(aeDescriptorPath);
+- ResourceSpecifier aeSpecifier = UIMAFramework.getXMLParser()
+- .parseResourceSpecifier(aeIputSource);
+-
+- // UMLS user ID and password
+- if ((umlsUser != null) && (!umlsUser.isEmpty()) && (umlsPass != null)
+- && (!umlsPass.isEmpty())) {
+- /*
+- * It is highly recommended that you change UMLS credentials in the
+- * XML configuration file instead of giving user and password using
+- * CTAKESConfig.
+- */
+- System.setProperty(CTAKES_UMLS_USER, umlsUser);
+- System.setProperty(CTAKES_UMLS_PASS, umlsPass);
+- }
+-
+- // create AE
+- AnalysisEngine ae = UIMAFramework.produceAnalysisEngine(aeSpecifier);
+-
+- return ae;
+- }
+-
+- /**
+- * Returns a new JCas () appropriate for the given Analysis Engine. This
+- * method ensures that only one instance of a JCas is created. A Jcas is a
+- * Java Cover Classes based Object-oriented CAS (Common Analysis System)
+- * API.
+- *
+- * <p>
+- * Important: It is highly recommended that you reuse CAS objects rather
+- * than creating new CAS objects prior to each analysis. This is because CAS
+- * objects may be expensive to create and may consume a significant amount
+- * of memory.
+- * </p>
+- *
+- * @param ae
+- * AnalysisEngine used to create an appropriate JCas object.
+- * @return a JCas object appropriate for the given AnalysisEngine.
+- * @throws ResourceInitializationException
+- * if a CAS could not be created because this AnalysisEngine's
+- * CAS metadata (type system, type priorities, or FS indexes)
+- * are invalid.
+- */
+- public static JCas getJCas(AnalysisEngine ae)
+- throws ResourceInitializationException {
+- JCas jcas = ae.newJCas();
+-
+- return jcas;
+- }
+-
+- /**
+- * Serializes a CAS in the given format.
+- *
+- * @param jcas
+- * CAS (Common Analysis System) to be serialized.
+- * @param type
+- * type of cTAKES (UIMA) serializer used to write CAS.
+- * @param prettyPrint
+- * {@code true} to do pretty printing of output.
+- * @param stream
+- * {@see OutputStream} object used to print out information
+- * extracted by using cTAKES.
+- * @throws SAXException
+- * if there was a SAX exception.
+- * @throws IOException
+- * if any I/O error occurs.
+- */
+- public static void serialize(JCas jcas, CTAKESSerializer type, boolean prettyPrint,
+- OutputStream stream) throws SAXException, IOException {
+- if (type == CTAKESSerializer.XCAS) {
+- XCASSerializer.serialize(jcas.getCas(), stream, prettyPrint);
+- } else if (type == CTAKESSerializer.XMI) {
+- XmiCasSerializer.serialize(jcas.getCas(), jcas.getTypeSystem(),
+- stream, prettyPrint, new XmiSerializationSharedData());
+- } else {
+- XmlCasSerializer.serialize(jcas.getCas(), jcas.getTypeSystem(),
+- stream);
+- }
+- }
+-
+- /**
+- * Returns the annotation value based on the given annotation type.
+- *
+- * @param annotation
+- * {@see IdentifiedAnnotation} object.
+- * @param property
+- * {@see CTAKESAnnotationProperty} enum used to identify the
+- * annotation type.
+- * @return the annotation value.
+- */
+- public static String getAnnotationProperty(IdentifiedAnnotation annotation,
+- CTAKESAnnotationProperty property) {
+- String value = null;
+- if (property == CTAKESAnnotationProperty.BEGIN) {
+- value = Integer.toString(annotation.getBegin());
+- } else if (property == CTAKESAnnotationProperty.END) {
+- value = Integer.toString(annotation.getEnd());
+- } else if (property == CTAKESAnnotationProperty.CONDITIONAL) {
+- value = Boolean.toString(annotation.getConditional());
+- } else if (property == CTAKESAnnotationProperty.CONFIDENCE) {
+- value = Float.toString(annotation.getConfidence());
+- } else if (property == CTAKESAnnotationProperty.DISCOVERY_TECNIQUE) {
+- value = Integer.toString(annotation.getDiscoveryTechnique());
+- } else if (property == CTAKESAnnotationProperty.GENERIC) {
+- value = Boolean.toString(annotation.getGeneric());
+- } else if (property == CTAKESAnnotationProperty.HISTORY_OF) {
+- value = Integer.toString(annotation.getHistoryOf());
+- } else if (property == CTAKESAnnotationProperty.ID) {
+- value = Integer.toString(annotation.getId());
+- } else if (property == CTAKESAnnotationProperty.ONTOLOGY_CONCEPT_ARR) {
+- FSArray mentions = annotation.getOntologyConceptArr();
+- StringBuilder sb = new StringBuilder();
+- if (mentions != null) {
+- for (int i = 0; i < mentions.size(); i++) {
+- if (mentions.get(i) instanceof UmlsConcept) {
+- UmlsConcept concept = (UmlsConcept) mentions.get(i);
+- sb.append(concept.getCui());
+- if (i < mentions.size() - 1) {
+- sb.append(",");
+- }
+- }
+- }
+- }
+- value = sb.toString();
+- } else if (property == CTAKESAnnotationProperty.POLARITY) {
+- value = Integer.toString(annotation.getPolarity());
+- }
+- return value;
+- }
+-
+- /**
+- * Resets cTAKES objects, if created. This method ensures that new cTAKES
+- * objects (a.k.a., Analysis Engine and JCas) will be created if getters of
+- * this class are called.
+- *
+- * @param ae UIMA Analysis Engine
+- * @param jcas JCas object
+- */
+- public static void reset(AnalysisEngine ae, JCas jcas) {
+- // Analysis Engine
+- resetAE(ae);
+-
+- // JCas
+- resetCAS(jcas);
+- jcas = null;
+- }
+-
+- /**
+- * Resets the CAS (Common Analysis System), emptying it of all content.
+- *
+- * @param jcas JCas object
+- */
+- public static void resetCAS(JCas jcas) {
+- if (jcas != null) {
+- jcas.reset();
+- }
+- }
+-
+- /**
+- * Resets the AE (AnalysisEngine), releasing all resources held by the
+- * current AE.
+- *
+- * @param ae UIMA Analysis Engine
+- */
+- public static void resetAE(AnalysisEngine ae) {
+- if (ae != null) {
+- ae.destroy();
+- ae = null;
+- }
+- }
+-}
diff --git a/debian/patches/ignore-org.apache.poi.hslf.usermodel.patch b/debian/patches/ignore-org.apache.poi.hslf.usermodel.patch
new file mode 100644
index 0000000..f02907b
--- /dev/null
+++ b/debian/patches/ignore-org.apache.poi.hslf.usermodel.patch
@@ -0,0 +1,353 @@
+From: Markus Koschany <apo at debian.org>
+Date: Tue, 1 Dec 2015 19:15:33 +0100
+Subject: ignore org.apache.poi.hslf.usermodel
+
+---
+ .../tika/parser/microsoft/HSLFExtractor.java | 338 ---------------------
+ 1 file changed, 338 deletions(-)
+ delete mode 100644 tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
+
+diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
+deleted file mode 100644
+index dedb135..0000000
+--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
++++ /dev/null
+@@ -1,338 +0,0 @@
+-/*
+- * Licensed to the Apache Software Foundation (ASF) under one or more
+- * contributor license agreements. See the NOTICE file distributed with
+- * this work for additional information regarding copyright ownership.
+- * The ASF licenses this file to You under the Apache License, Version 2.0
+- * (the "License"); you may not use this file except in compliance with
+- * the License. You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-package org.apache.tika.parser.microsoft;
+-
+-import java.io.IOException;
+-import java.util.HashSet;
+-import java.util.List;
+-
+-import org.apache.poi.hslf.model.Comment;
+-import org.apache.poi.hslf.model.HeadersFooters;
+-import org.apache.poi.hslf.model.OLEShape;
+-import org.apache.poi.hslf.usermodel.HSLFMasterSheet;
+-import org.apache.poi.hslf.usermodel.HSLFNotes;
+-import org.apache.poi.hslf.usermodel.HSLFObjectData;
+-import org.apache.poi.hslf.usermodel.HSLFPictureData;
+-import org.apache.poi.hslf.usermodel.HSLFShape;
+-import org.apache.poi.hslf.usermodel.HSLFSlide;
+-import org.apache.poi.hslf.usermodel.HSLFSlideShow;
+-import org.apache.poi.hslf.usermodel.HSLFTable;
+-import org.apache.poi.hslf.usermodel.HSLFTableCell;
+-import org.apache.poi.hslf.usermodel.HSLFTextParagraph;
+-import org.apache.poi.hslf.usermodel.HSLFTextRun;
+-import org.apache.poi.hslf.usermodel.HSLFTextShape;
+-import org.apache.poi.poifs.filesystem.DirectoryNode;
+-import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+-import org.apache.tika.exception.TikaException;
+-import org.apache.tika.io.TikaInputStream;
+-import org.apache.tika.parser.ParseContext;
+-import org.apache.tika.sax.XHTMLContentHandler;
+-import org.xml.sax.SAXException;
+-import org.xml.sax.helpers.AttributesImpl;
+-
+-public class HSLFExtractor extends AbstractPOIFSExtractor {
+- public HSLFExtractor(ParseContext context) {
+- super(context);
+- }
+-
+- protected void parse(
+- NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
+- throws IOException, SAXException, TikaException {
+- parse(filesystem.getRoot(), xhtml);
+- }
+-
+- protected void parse(
+- DirectoryNode root, XHTMLContentHandler xhtml)
+- throws IOException, SAXException, TikaException {
+- HSLFSlideShow ss = new HSLFSlideShow(root);
+- List<HSLFSlide> _slides = ss.getSlides();
+-
+- xhtml.startElement("div", "class", "slideShow");
+-
+- /* Iterate over slides and extract text */
+- for (HSLFSlide slide : _slides) {
+- xhtml.startElement("div", "class", "slide");
+-
+- // Slide header, if present
+- HeadersFooters hf = slide.getHeadersFooters();
+- if (hf != null && hf.isHeaderVisible() && hf.getHeaderText() != null) {
+- xhtml.startElement("p", "class", "slide-header");
+-
+- xhtml.characters(hf.getHeaderText());
+-
+- xhtml.endElement("p");
+- }
+-
+- // Slide master, if present
+- extractMaster(xhtml, slide.getMasterSheet());
+-
+- // Slide text
+- {
+- xhtml.startElement("div", "class", "slide-content");
+-
+- textRunsToText(xhtml, slide.getTextParagraphs());
+-
+- xhtml.endElement("div");
+- }
+-
+- // Table text
+- for (HSLFShape shape : slide.getShapes()) {
+- if (shape instanceof HSLFTable) {
+- extractTableText(xhtml, (HSLFTable) shape);
+- }
+- }
+-
+- // Slide footer, if present
+- if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) {
+- xhtml.startElement("p", "class", "slide-footer");
+-
+- xhtml.characters(hf.getFooterText());
+-
+- xhtml.endElement("p");
+- }
+-
+- // Comments, if present
+- StringBuilder authorStringBuilder = new StringBuilder();
+- for (Comment comment : slide.getComments()) {
+- authorStringBuilder.setLength(0);
+- xhtml.startElement("p", "class", "slide-comment");
+-
+- if (comment.getAuthor() != null) {
+- authorStringBuilder.append(comment.getAuthor());
+- }
+- if (comment.getAuthorInitials() != null) {
+- if (authorStringBuilder.length() > 0) {
+- authorStringBuilder.append(" ");
+- }
+- authorStringBuilder.append("("+comment.getAuthorInitials()+")");
+- }
+- if (authorStringBuilder.length() > 0) {
+- if (comment.getText() != null) {
+- authorStringBuilder.append(" - ");
+- }
+- xhtml.startElement("b");
+- xhtml.characters(authorStringBuilder.toString());
+- xhtml.endElement("b");
+- }
+- if (comment.getText() != null) {
+- xhtml.characters(comment.getText());
+- }
+- xhtml.endElement("p");
+- }
+-
+- // Now any embedded resources
+- handleSlideEmbeddedResources(slide, xhtml);
+-
+- // TODO Find the Notes for this slide and extract inline
+-
+- // Slide complete
+- xhtml.endElement("div");
+- }
+-
+- // All slides done
+- xhtml.endElement("div");
+-
+- /* notes */
+- xhtml.startElement("div", "class", "slide-notes");
+- HashSet<Integer> seenNotes = new HashSet<>();
+- HeadersFooters hf = ss.getNotesHeadersFooters();
+-
+- for (HSLFSlide slide : _slides) {
+- HSLFNotes notes = slide.getNotes();
+- if (notes == null) {
+- continue;
+- }
+- Integer id = notes._getSheetNumber();
+- if (seenNotes.contains(id)) {
+- continue;
+- }
+- seenNotes.add(id);
+-
+- // Repeat the Notes header, if set
+- if (hf != null && hf.isHeaderVisible() && hf.getHeaderText() != null) {
+- xhtml.startElement("p", "class", "slide-note-header");
+- xhtml.characters(hf.getHeaderText());
+- xhtml.endElement("p");
+- }
+-
+- // Notes text
+- textRunsToText(xhtml, notes.getTextParagraphs());
+-
+- // Repeat the notes footer, if set
+- if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) {
+- xhtml.startElement("p", "class", "slide-note-footer");
+- xhtml.characters(hf.getFooterText());
+- xhtml.endElement("p");
+- }
+- }
+-
+- handleSlideEmbeddedPictures(ss, xhtml);
+-
+- xhtml.endElement("div");
+- }
+-
+- private void extractMaster(XHTMLContentHandler xhtml, HSLFMasterSheet master) throws SAXException {
+- if (master == null) {
+- return;
+- }
+- List<HSLFShape> shapes = master.getShapes();
+- if (shapes == null || shapes.isEmpty()) {
+- return;
+- }
+-
+- xhtml.startElement("div", "class", "slide-master-content");
+- for (HSLFShape shape : shapes) {
+- if (shape != null && !HSLFMasterSheet.isPlaceholder(shape)) {
+- if (shape instanceof HSLFTextShape) {
+- HSLFTextShape tsh = (HSLFTextShape) shape;
+- String text = tsh.getText();
+- if (text != null) {
+- xhtml.element("p", text);
+- }
+- }
+- }
+- }
+- xhtml.endElement("div");
+- }
+-
+- private void extractTableText(XHTMLContentHandler xhtml, HSLFTable shape) throws SAXException {
+- xhtml.startElement("table");
+- for (int row = 0; row < shape.getNumberOfRows(); row++) {
+- xhtml.startElement("tr");
+- for (int col = 0; col < shape.getNumberOfColumns(); col++) {
+- HSLFTableCell cell = shape.getCell(row, col);
+- //insert empty string for empty cell if cell is null
+- String txt = "";
+- if (cell != null) {
+- txt = cell.getText();
+- }
+- xhtml.element("td", txt);
+- }
+- xhtml.endElement("tr");
+- }
+- xhtml.endElement("table");
+- }
+-
+- private void textRunsToText(XHTMLContentHandler xhtml, List<List<HSLFTextParagraph>> paragraphsList) throws SAXException {
+- if (paragraphsList == null) {
+- return;
+- }
+-
+- for (List<HSLFTextParagraph> run : paragraphsList) {
+- // Leaving in wisdom from TIKA-712 for easy revert.
+- // Avoid boiler-plate text on the master slide (0
+- // = TextHeaderAtom.TITLE_TYPE, 1 = TextHeaderAtom.BODY_TYPE):
+- //if (!isMaster || (run.getRunType() != 0 && run.getRunType() != 1)) {
+-
+- for (HSLFTextParagraph htp : run) {
+- xhtml.startElement("p");
+-
+- for (HSLFTextRun htr : htp.getTextRuns()) {
+- String line = htr.getRawText();
+- if (line != null) {
+- boolean isfirst = true;
+- for (String fragment : line.split("\\u000b")){
+- if (!isfirst) {
+- xhtml.startElement("br");
+- xhtml.endElement("br");
+- }
+- isfirst = false;
+- xhtml.characters(fragment.trim());
+- }
+- }
+- }
+- xhtml.endElement("p");
+-
+- }
+-
+- }
+- }
+-
+- private void handleSlideEmbeddedPictures(HSLFSlideShow slideshow, XHTMLContentHandler xhtml)
+- throws TikaException, SAXException, IOException {
+- for (HSLFPictureData pic : slideshow.getPictureData()) {
+- String mediaType;
+-
+- switch (pic.getType()) {
+- case EMF:
+- mediaType = "application/x-emf";
+- break;
+- case WMF:
+- mediaType = "application/x-msmetafile";
+- break;
+- case DIB:
+- mediaType = "image/bmp";
+- break;
+- default:
+- mediaType = pic.getContentType();
+- break;
+- }
+-
+- handleEmbeddedResource(
+- TikaInputStream.get(pic.getData()), null, null,
+- mediaType, xhtml, false);
+- }
+- }
+-
+- private void handleSlideEmbeddedResources(HSLFSlide slide, XHTMLContentHandler xhtml)
+- throws TikaException, SAXException, IOException {
+- List<HSLFShape> shapes;
+- try {
+- shapes = slide.getShapes();
+- } catch (NullPointerException e) {
+- // Sometimes HSLF hits problems
+- // Please open POI bugs for any you come across!
+- return;
+- }
+-
+- for (HSLFShape shape : shapes) {
+- if (shape instanceof OLEShape) {
+- OLEShape oleShape = (OLEShape) shape;
+- HSLFObjectData data = null;
+- try {
+- data = oleShape.getObjectData();
+- } catch (NullPointerException e) {
+- /* getObjectData throws NPE some times. */
+- }
+-
+- if (data != null) {
+- String objID = Integer.toString(oleShape.getObjectID());
+-
+- // Embedded Object: add a <div
+- // class="embedded" id="X"/> so consumer can see where
+- // in the main text each embedded document
+- // occurred:
+- AttributesImpl attributes = new AttributesImpl();
+- attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+- attributes.addAttribute("", "id", "id", "CDATA", objID);
+- xhtml.startElement("div", attributes);
+- xhtml.endElement("div");
+-
+- try (TikaInputStream stream = TikaInputStream.get(data.getData())) {
+- String mediaType = null;
+- if ("Excel.Chart.8".equals(oleShape.getProgID())) {
+- mediaType = "application/vnd.ms-excel";
+- }
+- handleEmbeddedResource(
+- stream, objID, objID,
+- mediaType, xhtml, false);
+- }
+- }
+- }
+- }
+- }
+-}
diff --git a/debian/patches/ignore-org.apache.poi.hssf.extractor.patch b/debian/patches/ignore-org.apache.poi.hssf.extractor.patch
new file mode 100644
index 0000000..b92ad89
--- /dev/null
+++ b/debian/patches/ignore-org.apache.poi.hssf.extractor.patch
@@ -0,0 +1,112 @@
+From: Markus Koschany <apo at debian.org>
+Date: Tue, 1 Dec 2015 19:05:03 +0100
+Subject: ignore org.apache.poi.hssf.extractor
+
+---
+ .../tika/parser/microsoft/OldExcelParser.java | 97 ----------------------
+ 1 file changed, 97 deletions(-)
+ delete mode 100644 tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java
+
+diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java
+deleted file mode 100644
+index 446eea9..0000000
+--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java
++++ /dev/null
+@@ -1,97 +0,0 @@
+-/*
+- * Licensed to the Apache Software Foundation (ASF) under one or more
+- * contributor license agreements. See the NOTICE file distributed with
+- * this work for additional information regarding copyright ownership.
+- * The ASF licenses this file to You under the Apache License, Version 2.0
+- * (the "License"); you may not use this file except in compliance with
+- * the License. You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-package org.apache.tika.parser.microsoft;
+-
+-import java.io.BufferedReader;
+-import java.io.IOException;
+-import java.io.InputStream;
+-import java.io.StringReader;
+-import java.util.Arrays;
+-import java.util.Collections;
+-import java.util.HashSet;
+-import java.util.Set;
+-
+-import org.apache.poi.hssf.extractor.OldExcelExtractor;
+-import org.apache.tika.exception.TikaException;
+-import org.apache.tika.metadata.Metadata;
+-import org.apache.tika.mime.MediaType;
+-import org.apache.tika.parser.AbstractParser;
+-import org.apache.tika.parser.ParseContext;
+-import org.apache.tika.sax.XHTMLContentHandler;
+-import org.xml.sax.ContentHandler;
+-import org.xml.sax.SAXException;
+-
+-/**
+- * A POI-powered Tika Parser for very old versions of Excel, from
+- * pre-OLE2 days, such as Excel 4.
+- */
+-public class OldExcelParser extends AbstractParser {
+- private static final long serialVersionUID = 4611820730372823452L;
+-
+- private static final Set<MediaType> SUPPORTED_TYPES =
+- Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+- MediaType.application("vnd.ms-excel.sheet.4"),
+- MediaType.application("vnd.ms-excel.workspace.4"),
+- MediaType.application("vnd.ms-excel.sheet.3"),
+- MediaType.application("vnd.ms-excel.workspace.3"),
+- MediaType.application("vnd.ms-excel.sheet.2")
+- )));
+-
+- protected static void parse(OldExcelExtractor extractor,
+- XHTMLContentHandler xhtml) throws TikaException, IOException, SAXException {
+- // Get the whole text, as a single string
+- String text = extractor.getText();
+-
+- // Split and output
+- xhtml.startDocument();
+-
+- String line;
+- BufferedReader reader = new BufferedReader(new StringReader(text));
+- while ((line = reader.readLine()) != null) {
+- xhtml.startElement("p");
+- xhtml.characters(line);
+- xhtml.endElement("p");
+- }
+-
+- xhtml.endDocument();
+- }
+-
+- public Set<MediaType> getSupportedTypes(ParseContext context) {
+- return SUPPORTED_TYPES;
+- }
+-
+- /**
+- * Extracts properties and text from an MS Document input stream
+- */
+- public void parse(
+- InputStream stream, ContentHandler handler,
+- Metadata metadata, ParseContext context)
+- throws IOException, SAXException, TikaException {
+- // Open the POI provided extractor
+- OldExcelExtractor extractor = new OldExcelExtractor(stream);
+-
+- // We can't do anything about metadata, as these old formats
+- // didn't have any stored with them
+-
+- // Set the content type
+- // TODO Get the version and type, to set as the Content Type
+-
+- // Have the text extracted and given to our Content Handler
+- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+- parse(extractor, xhtml);
+- }
+-}
diff --git a/debian/patches/ignore-org.json.XML.patch b/debian/patches/ignore-org.json.XML.patch
new file mode 100644
index 0000000..5199d54
--- /dev/null
+++ b/debian/patches/ignore-org.json.XML.patch
@@ -0,0 +1,908 @@
+From: Markus Koschany <apo at debian.org>
+Date: Tue, 1 Dec 2015 19:06:12 +0100
+Subject: ignore org.json.XML
+
+---
+ .../org/apache/tika/parser/journal/TEIParser.java | 893 ---------------------
+ 1 file changed, 893 deletions(-)
+ delete mode 100644 tika-parsers/src/main/java/org/apache/tika/parser/journal/TEIParser.java
+
+diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/journal/TEIParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/journal/TEIParser.java
+deleted file mode 100644
+index 04d5195..0000000
+--- a/tika-parsers/src/main/java/org/apache/tika/parser/journal/TEIParser.java
++++ /dev/null
+@@ -1,893 +0,0 @@
+-/**
+- * Licensed to the Apache Software Foundation (ASF) under one or more
+- * contributor license agreements. See the NOTICE file distributed with
+- * this work for additional information regarding copyright ownership.
+- * The ASF licenses this file to You under the Apache License, Version 2.0
+- * (the "License"); you may not use this file except in compliance with
+- * the License. You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-package org.apache.tika.parser.journal;
+-
+-import java.util.ArrayList;
+-import java.util.List;
+-
+-import org.apache.tika.metadata.Metadata;
+-import org.json.JSONArray;
+-import org.json.JSONObject;
+-import org.json.XML;
+-
+-public class TEIParser {
+-
+- public TEIParser() {
+- }
+-
+- public Metadata parse(String source) {
+- JSONObject obj = XML.toJSONObject(source);
+- Metadata metadata = new Metadata();
+- createGrobidMetadata(source, obj, metadata);
+- return metadata;
+- }
+-
+- private void createGrobidMetadata(String source, JSONObject obj,
+- Metadata metadata) {
+- if (obj != null) {
+- JSONObject teiHeader = obj.getJSONObject("TEI")
+- .getJSONObject("teiHeader");
+- if (teiHeader.has("text")) {
+- parseText(teiHeader.getJSONObject("text"), metadata);
+- }
+-
+- if (teiHeader.has("fileDesc")) {
+- parseFileDesc(teiHeader.getJSONObject("fileDesc"), metadata);
+-
+- }
+- if (teiHeader.has("profileDesc")) {
+- parseProfileDesc(teiHeader.getJSONObject("profileDesc"), metadata);
+- }
+- }
+-
+- addStaticMet(source, obj, metadata);
+- }
+-
+- private void addStaticMet(String source, JSONObject obj, Metadata metadata) {
+- metadata.add("Class", Metadata.class.getName());
+- metadata.add("TEIJSONSource", obj.toString());
+- metadata.add("TEIXMLSource", source);
+- }
+-
+- private void parseText(JSONObject text, Metadata metadata) {
+- if (text.has("xml:lang")) {
+- metadata.add("Language", text.getString("xml:lang"));
+- }
+- }
+-
+- private void parseFileDesc(JSONObject fileDesc, Metadata metadata) {
+- if (fileDesc.has("titleStmt")) {
+- parseTitleStmt(fileDesc.getJSONObject("titleStmt"), metadata);
+- }
+-
+- if (fileDesc.has("sourceDesc")) {
+- parseSourceDesc(fileDesc.getJSONObject("sourceDesc"), metadata);
+- }
+- }
+-
+- private void parseTitleStmt(JSONObject titleStmt, Metadata metadata) {
+- if (titleStmt.has("title")) {
+- JSONObject title = titleStmt.getJSONObject("title");
+- if (title.has("content")) {
+- metadata.add("Title", title.getString("content"));
+- }
+- }
+- }
+-
+- private void parseSourceDesc(JSONObject sourceDesc, Metadata metadata) {
+- if (sourceDesc.has("biblStruct")) {
+- parseBiblStruct(sourceDesc.getJSONObject("biblStruct"), metadata);
+- }
+- }
+-
+- private void parseBiblStruct(JSONObject biblStruct, Metadata metadata) {
+- if (biblStruct.has("analytic")
+- && biblStruct.get("analytic") instanceof JSONObject) {
+- JSONObject analytic = biblStruct.getJSONObject("analytic");
+- if (analytic.has("author")) {
+- Object authorObj = analytic.get("author");
+-
+- List<Author> authorList = new ArrayList<Author>();
+- if (authorObj instanceof JSONObject) {
+- parseAuthor((JSONObject) authorObj, authorList);
+- } else if (authorObj instanceof JSONArray) {
+- JSONArray authors = (JSONArray) authorObj;
+- if (authors.length() > 0) {
+- for (int i = 0; i < authors.length(); i++) {
+- JSONObject author = authors.getJSONObject(i);
+- parseAuthor(author, authorList);
+- }
+- }
+-
+- metadata.add("Address", getMetadataAddresses(authorList));
+- metadata.add("Affiliation", getMetadataAffiliations(authorList));
+- metadata.add("Authors", getMetadataAuthors(authorList));
+- metadata.add("FullAffiliations",
+- getMetadataFullAffiliations(authorList));
+- }
+-
+- }
+- } else {
+- metadata.add("Error", "Unable to parse: no analytic section in JSON");
+- }
+-
+- }
+-
+- private String getMetadataFullAffiliations(List<Author> authorList) {
+- List<Affiliation> unique = new ArrayList<Affiliation>();
+- StringBuilder metAffils = new StringBuilder();
+-
+- for (Author a : authorList) {
+- for (Affiliation af : a.getAffiliations()) {
+- if (!unique.contains(af)) {
+- unique.add(af);
+- }
+- }
+- }
+- metAffils.append("[");
+- for (Affiliation af : unique) {
+- metAffils.append(af.toString());
+- metAffils.append(",");
+- }
+- metAffils.append(metAffils.deleteCharAt(metAffils.length() - 1));
+- metAffils.append("]");
+- return metAffils.toString();
+- }
+-
+- private String getMetadataAuthors(List<Author> authorList) {
+- // generates Chris A. Mattmann 1, 2 Daniel J. Crichton 1 Nenad Medvidovic 2
+- // Steve Hughes 1
+- List<Affiliation> unique = new ArrayList<Affiliation>();
+- StringBuilder metAuthors = new StringBuilder();
+-
+- for (Author a : authorList) {
+- for (Affiliation af : a.getAffiliations()) {
+- if (!unique.contains(af)) {
+- unique.add(af);
+- }
+- }
+- }
+-
+- for (Author a : authorList) {
+- metAuthors.append(printOrBlank(a.getFirstName()));
+- metAuthors.append(printOrBlank(a.getMiddleName()));
+- metAuthors.append(printOrBlank(a.getSurName()));
+-
+- StringBuilder affilBuilder = new StringBuilder();
+- for (int idx = 0; idx < unique.size(); idx++) {
+- Affiliation af = unique.get(idx);
+- if (a.getAffiliations().contains(af)) {
+- affilBuilder.append((idx + 1));
+- affilBuilder.append(",");
+- }
+- }
+-
+- if (affilBuilder.length() > 0)
+- affilBuilder.deleteCharAt(affilBuilder.length() - 1);
+-
+- metAuthors.append(affilBuilder.toString());
+- metAuthors.append(" ");
+- }
+-
+- return metAuthors.toString();
+- }
+-
+- private String getMetadataAffiliations(List<Author> authorList) {
+- // generates 1 Jet Propulsion Laboratory California Institute of Technology
+- // ; 2 Computer Science Department University of Southern California
+- List<Affiliation> unique = new ArrayList<Affiliation>();
+- StringBuilder metAffil = new StringBuilder();
+-
+- for (Author a : authorList) {
+- for (Affiliation af : a.getAffiliations()) {
+- if (!unique.contains(af)) {
+- unique.add(af);
+- }
+- }
+- }
+-
+- int count = 1;
+- for (Affiliation a : unique) {
+- metAffil.append(count);
+- metAffil.append(" ");
+- metAffil.append(a.getOrgName().toString());
+- metAffil.deleteCharAt(metAffil.length() - 1);
+- metAffil.append("; ");
+- count++;
+- }
+-
+- if (count > 1) {
+- metAffil.deleteCharAt(metAffil.length() - 1);
+- metAffil.deleteCharAt(metAffil.length() - 1);
+- }
+-
+- return metAffil.toString();
+- }
+-
+- private String getMetadataAddresses(List<Author> authorList) {
+- // generates: "Pasadena, CA 91109, USA Los Angeles, CA 90089, USA",
+- List<Address> unique = new ArrayList<Address>();
+- StringBuilder metAddress = new StringBuilder();
+-
+- for (Author a : authorList) {
+- for (Affiliation af : a.getAffiliations()) {
+- if (!unique.contains(af.getAddress())) {
+- unique.add(af.getAddress());
+- }
+- }
+- }
+-
+- for (Address ad : unique) {
+- metAddress.append(ad.toString());
+- metAddress.append(" ");
+- }
+-
+- return metAddress.toString();
+- }
+-
+- private void parseAuthor(JSONObject authorObj, List<Author> authorList) {
+- Author author = new Author();
+-
+- if (authorObj.has("persName")) {
+- JSONObject persName = authorObj.getJSONObject("persName");
+-
+- if (persName.has("forename")) {
+-
+- Object foreNameObj = persName.get("forename");
+-
+- if (foreNameObj instanceof JSONObject) {
+- parseNamePart((JSONObject) foreNameObj, author);
+- } else if (foreNameObj instanceof JSONArray) {
+- JSONArray foreName = persName.getJSONArray("forename");
+-
+- if (foreName.length() > 0) {
+- for (int i = 0; i < foreName.length(); i++) {
+- JSONObject namePart = foreName.getJSONObject(i);
+- parseNamePart(namePart, author);
+- }
+- }
+- }
+- }
+-
+- if (persName.has("surname")) {
+- author.setSurName(persName.getString("surname"));
+- }
+-
+- if (authorObj.has("affiliation")) {
+- parseAffiliation(authorObj.get("affiliation"), author);
+- }
+-
+- }
+-
+- authorList.add(author);
+- }
+-
+- private void parseNamePart(JSONObject namePart, Author author) {
+- if (namePart.has("type") && namePart.has("content")) {
+- String type = namePart.getString("type");
+- String content = namePart.getString("content");
+-
+- if (type.equals("first")) {
+- author.setFirstName(content);
+- }
+-
+- if (type.equals("middle")) {
+- author.setMiddleName(content);
+- }
+- }
+- }
+-
+- private void parseAffiliation(Object affiliationJSON, Author author) {
+- if (affiliationJSON instanceof JSONObject) {
+- parseOneAffiliation((JSONObject) affiliationJSON, author);
+- } else if (affiliationJSON instanceof JSONArray) {
+- JSONArray affiliationArray = (JSONArray) affiliationJSON;
+- if (affiliationArray != null && affiliationArray.length() > 0) {
+- for (int i = 0; i < affiliationArray.length(); i++) {
+- JSONObject affiliationObj = affiliationArray.getJSONObject(i);
+- parseOneAffiliation(affiliationObj, author);
+- }
+- }
+- }
+- }
+-
+- private void parseOneAffiliation(JSONObject affiliationObj, Author author) {
+-
+- Affiliation affiliation = new Affiliation();
+- if (affiliationObj.has("address")) {
+- parseAddress(affiliationObj.getJSONObject("address"), affiliation);
+- }
+-
+- if (affiliationObj.has("orgName")) {
+- OrgName orgName = new OrgName();
+- Object orgObject = affiliationObj.get("orgName");
+- if (orgObject instanceof JSONObject) {
+- parseOrgName((JSONObject) orgObject, orgName);
+- } else if (orgObject instanceof JSONArray) {
+- JSONArray orgNames = (JSONArray) orgObject;
+- if (orgNames != null && orgNames.length() > 0) {
+- for (int i = 0; i < orgNames.length(); i++) {
+- parseOrgName(orgNames.getJSONObject(i), orgName);
+- }
+- }
+-
+- affiliation.setOrgName(orgName);
+- }
+-
+- }
+-
+- author.getAffiliations().add(affiliation);
+- }
+-
+- private void parseAddress(JSONObject addressObj, Affiliation affiliation) {
+- Address address = new Address();
+-
+- if (addressObj.has("region")) {
+- address.setRegion(addressObj.getString("region"));
+- }
+-
+- if (addressObj.has("postCode")) {
+- address.setPostCode(JSONObject.valueToString(addressObj.get("postCode")));
+- }
+-
+- if (addressObj.has("settlement")) {
+- address.setSettlment(addressObj.getString("settlement"));
+- }
+-
+- if (addressObj.has("country")) {
+- Country country = new Country();
+- Object countryObj = addressObj.get("country");
+-
+- if (countryObj instanceof JSONObject) {
+- JSONObject countryJson = addressObj.getJSONObject("country");
+-
+- if (countryJson.has("content")) {
+- country.setContent(countryJson.getString("content"));
+- }
+-
+- if (countryJson.has("key")) {
+- country.setKey(countryJson.getString("key"));
+- }
+- } else if (countryObj instanceof String) {
+- country.setContent((String) countryObj);
+- }
+- address.setCountry(country);
+- }
+-
+- affiliation.setAddress(address);
+- }
+-
+- private void parseOrgName(JSONObject orgObj, OrgName orgName) {
+- OrgTypeName typeName = new OrgTypeName();
+- if (orgObj.has("content")) {
+- typeName.setName(orgObj.getString("content"));
+- }
+-
+- if (orgObj.has("type")) {
+- typeName.setType(orgObj.getString("type"));
+- }
+-
+- orgName.getTypeNames().add(typeName);
+- }
+-
+- private void parseProfileDesc(JSONObject profileDesc, Metadata metadata) {
+- if (profileDesc.has("abstract")) {
+- if (profileDesc.has("p")) {
+- metadata.add("Abstract", profileDesc.getString("p"));
+- }
+- }
+-
+- if (profileDesc.has("textClass")) {
+- JSONObject textClass = profileDesc.getJSONObject("textClass");
+-
+- if (textClass.has("keywords")) {
+- Object keywordsObj = textClass.get("keywords");
+- // test AJ15.pdf
+- if (keywordsObj instanceof String) {
+- metadata.add("Keyword", (String) keywordsObj);
+- } else if (keywordsObj instanceof JSONObject) {
+- JSONObject keywords = textClass.getJSONObject("keywords");
+- if (keywords.has("term")) {
+- JSONArray termArr = keywords.getJSONArray("term");
+- for (int i = 0; i < termArr.length(); i++) {
+- metadata.add("Keyword", JSONObject.valueToString(termArr.get(i)));
+- }
+- }
+- }
+-
+- }
+- }
+-
+- }
+-
+- private String printOrBlank(String val) {
+- if (val != null && !val.equals("")) {
+- return val + " ";
+- } else
+- return " ";
+- }
+-
+- class Author {
+-
+- private String surName;
+-
+- private String middleName;
+-
+- private String firstName;
+-
+- private List<Affiliation> affiliations;
+-
+- public Author() {
+- this.surName = null;
+- this.middleName = null;
+- this.firstName = null;
+- this.affiliations = new ArrayList<Affiliation>();
+- }
+-
+- /**
+- * @return the surName
+- */
+- public String getSurName() {
+- return surName;
+- }
+-
+- /**
+- * @param surName
+- * the surName to set
+- */
+- public void setSurName(String surName) {
+- this.surName = surName;
+- }
+-
+- /**
+- * @return the middleName
+- */
+- public String getMiddleName() {
+- return middleName;
+- }
+-
+- /**
+- * @param middleName
+- * the middleName to set
+- */
+- public void setMiddleName(String middleName) {
+- this.middleName = middleName;
+- }
+-
+- /**
+- * @return the firstName
+- */
+- public String getFirstName() {
+- return firstName;
+- }
+-
+- /**
+- * @param firstName
+- * the firstName to set
+- */
+- public void setFirstName(String firstName) {
+- this.firstName = firstName;
+- }
+-
+- /**
+- * @return the affiliations
+- */
+- public List<Affiliation> getAffiliations() {
+- return affiliations;
+- }
+-
+- /**
+- * @param affiliations
+- * the affiliations to set
+- */
+- public void setAffiliations(List<Affiliation> affiliations) {
+- this.affiliations = affiliations;
+- }
+-
+- /*
+- * (non-Javadoc)
+- *
+- * @see java.lang.Object#toString()
+- */
+- @Override
+- public String toString() {
+- return "Author [surName=" + surName + ", middleName=" + middleName != null ? middleName
+- : "" + ", firstName=" + firstName + ", affiliations=" + affiliations
+- + "]";
+- }
+-
+- }
+-
+- class Affiliation {
+-
+- private OrgName orgName;
+-
+- private Address address;
+-
+- public Affiliation() {
+- this.orgName = new OrgName();
+- this.address = new Address();
+- }
+-
+- /**
+- * @return the orgName
+- */
+- public OrgName getOrgName() {
+- return orgName;
+- }
+-
+- /**
+- * @param orgName
+- * the orgName to set
+- */
+- public void setOrgName(OrgName orgName) {
+- this.orgName = orgName;
+- }
+-
+- /**
+- * @return the address
+- */
+- public Address getAddress() {
+- return address;
+- }
+-
+- /**
+- * @param address
+- * the address to set
+- */
+- public void setAddress(Address address) {
+- this.address = address;
+- }
+-
+- /*
+- * (non-Javadoc)
+- *
+- * @see java.lang.Object#equals(java.lang.Object)
+- */
+- @Override
+- public boolean equals(Object obj) {
+- Affiliation otherA = (Affiliation) obj;
+- return this.getAddress().equals(otherA.getAddress())
+- && this.getOrgName().equals(otherA.getOrgName());
+-
+- }
+-
+- /*
+- * (non-Javadoc)
+- *
+- * @see java.lang.Object#toString()
+- */
+- @Override
+- public String toString() {
+- return "Affiliation {orgName=" + orgName + ", address=" + address + "}";
+- }
+-
+- }
+-
+- class OrgName {
+- private List<OrgTypeName> typeNames;
+-
+- public OrgName() {
+- this.typeNames = new ArrayList<OrgTypeName>();
+- }
+-
+- /**
+- * @return the typeNames
+- */
+- public List<OrgTypeName> getTypeNames() {
+- return typeNames;
+- }
+-
+- /**
+- * @param typeNames
+- * the typeNames to set
+- */
+- public void setTypeNames(List<OrgTypeName> typeNames) {
+- this.typeNames = typeNames;
+- }
+-
+- /*
+- * (non-Javadoc)
+- *
+- * @see java.lang.Object#toString()
+- */
+-
+- @Override
+- public String toString() {
+- StringBuilder builder = new StringBuilder();
+- for (OrgTypeName on : this.typeNames) {
+- builder.append(on.getName());
+- builder.append(" ");
+- }
+- return builder.toString();
+- }
+-
+- /*
+- * (non-Javadoc)
+- *
+- * @see java.lang.Object#equals(java.lang.Object)
+- */
+- @Override
+- public boolean equals(Object obj) {
+- OrgName otherA = (OrgName) obj;
+-
+- if (otherA.getTypeNames() != null) {
+- if (this.typeNames == null) {
+- return false;
+- } else {
+- return this.typeNames.size() == otherA.getTypeNames().size();
+- }
+- } else {
+- if (this.typeNames == null) {
+- return true;
+- } else
+- return false;
+- }
+-
+- }
+-
+- }
+-
+- class OrgTypeName {
+- private String name;
+- private String type;
+-
+- public OrgTypeName() {
+- this.name = null;
+- this.type = null;
+- }
+-
+- /**
+- * @return the name
+- */
+- public String getName() {
+- return name;
+- }
+-
+- /**
+- * @param name
+- * the name to set
+- */
+- public void setName(String name) {
+- this.name = name;
+- }
+-
+- /**
+- * @return the type
+- */
+- public String getType() {
+- return type;
+- }
+-
+- /**
+- * @param type
+- * the type to set
+- */
+- public void setType(String type) {
+- this.type = type;
+- }
+-
+- /*
+- * (non-Javadoc)
+- *
+- * @see java.lang.Object#equals(java.lang.Object)
+- */
+- @Override
+- public boolean equals(Object obj) {
+- OrgTypeName otherOrgName = (OrgTypeName) obj;
+- return this.type.equals(otherOrgName.getType())
+- && this.name.equals(otherOrgName.getName());
+- }
+-
+- }
+-
+- private class Address {
+-
+- private String region;
+- private String postCode;
+- private String settlment;
+- private Country country;
+-
+- public Address() {
+- this.region = null;
+- this.postCode = null;
+- this.settlment = null;
+- this.country = new Country();
+- }
+-
+- /**
+- * @return the region
+- */
+- public String getRegion() {
+- return region;
+- }
+-
+- /**
+- * @param region
+- * the region to set
+- */
+- public void setRegion(String region) {
+- this.region = region;
+- }
+-
+- /**
+- * @return the postCode
+- */
+- public String getPostCode() {
+- return postCode;
+- }
+-
+- /**
+- * @param postCode
+- * the postCode to set
+- */
+- public void setPostCode(String postCode) {
+- this.postCode = postCode;
+- }
+-
+- /**
+- * @return the settlment
+- */
+- public String getSettlment() {
+- return settlment;
+- }
+-
+- /**
+- * @param settlment
+- * the settlment to set
+- */
+- public void setSettlment(String settlment) {
+- this.settlment = settlment;
+- }
+-
+- /**
+- * @return the country
+- */
+- public Country getCountry() {
+- return country;
+- }
+-
+- /**
+- * @param country
+- * the country to set
+- */
+- public void setCountry(Country country) {
+- this.country = country;
+- }
+-
+- /*
+- * (non-Javadoc)
+- *
+- * @see java.lang.Object#equals(java.lang.Object)
+- */
+- @Override
+- public boolean equals(Object obj) {
+- Address otherA = (Address) obj;
+- if (this.settlment == null) {
+- return otherA.getSettlment() == null;
+- } else if (this.country == null) {
+- return otherA.getCountry() == null;
+- } else if (this.postCode == null) {
+- return otherA.getPostCode() == null;
+- } else if (this.region == null) {
+- return otherA.getRegion() == null;
+- }
+-
+- return this.settlment.equals(otherA.getSettlment())
+- && this.country.equals(otherA.getCountry())
+- && this.postCode.equals(otherA.getPostCode())
+- && this.region.equals(otherA.getRegion());
+- }
+-
+- /*
+- * (non-Javadoc)
+- *
+- * @see java.lang.Object#toString()
+- */
+- @Override
+- public String toString() {
+- StringBuilder builder = new StringBuilder();
+- builder.append(settlment);
+- builder.append(", ");
+- builder.append(region);
+- builder.append(" ");
+- builder.append(postCode);
+- builder.append(" ");
+- builder.append(country.getContent());
+- return builder.toString();
+- }
+- }
+-
+- private class Country {
+- private String key;
+- private String content;
+-
+- public Country() {
+- this.key = null;
+- this.content = null;
+- }
+-
+- /**
+- * @return the key
+- */
+- public String getKey() {
+- return key;
+- }
+-
+- /**
+- * @param key
+- * the key to set
+- */
+- public void setKey(String key) {
+- this.key = key;
+- }
+-
+- /**
+- * @return the content
+- */
+- public String getContent() {
+- return content;
+- }
+-
+- /**
+- * @param content
+- * the content to set
+- */
+- public void setContent(String content) {
+- this.content = content;
+- }
+-
+- /*
+- * (non-Javadoc)
+- *
+- * @see java.lang.Object#equals(java.lang.Object)
+- */
+- @Override
+- public boolean equals(Object obj) {
+- Country otherC = (Country) obj;
+-
+- if (this.key == null) {
+- if (otherC.getKey() != null) {
+- return false;
+- } else {
+- if (this.content == null) {
+- if (otherC.getContent() != null) {
+- return false;
+- } else {
+- return true;
+- }
+- } else {
+- return content.equals(otherC.getContent());
+- }
+- }
+- } else {
+- if (this.content == null) {
+- if (otherC.getContent() != null) {
+- return false;
+- } else {
+- return this.key.equals(otherC.getKey());
+- }
+- } else {
+- return this.key.equals(otherC.getKey())
+- && this.content.equals(otherC.getContent());
+- }
+- }
+- }
+-
+- }
+-}
diff --git a/debian/patches/ignore-package-org.apache.poi.xwpf.patch b/debian/patches/ignore-package-org.apache.poi.xwpf.patch
new file mode 100644
index 0000000..3f120a7
--- /dev/null
+++ b/debian/patches/ignore-package-org.apache.poi.xwpf.patch
@@ -0,0 +1,647 @@
+From: Markus Koschany <apo at debian.org>
+Date: Tue, 1 Dec 2015 19:11:40 +0100
+Subject: ignore package org.apache.poi.xwpf
+
+---
+ .../parser/microsoft/ooxml/XWPFListManager.java | 165 --------
+ .../ooxml/XWPFWordExtractorDecorator.java | 459 ---------------------
+ 2 files changed, 624 deletions(-)
+ delete mode 100644 tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
+ delete mode 100644 tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
+
+diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
+deleted file mode 100644
+index 5654378..0000000
+--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
++++ /dev/null
+@@ -1,165 +0,0 @@
+-/*
+- * Licensed to the Apache Software Foundation (ASF) under one or more
+- * contributor license agreements. See the NOTICE file distributed with
+- * this work for additional information regarding copyright ownership.
+- * The ASF licenses this file to You under the Apache License, Version 2.0
+- * (the "License"); you may not use this file except in compliance with
+- * the License. You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-package org.apache.tika.parser.microsoft.ooxml;
+-
+-import org.apache.poi.xwpf.usermodel.XWPFAbstractNum;
+-import org.apache.poi.xwpf.usermodel.XWPFDocument;
+-import org.apache.poi.xwpf.usermodel.XWPFNum;
+-import org.apache.poi.xwpf.usermodel.XWPFNumbering;
+-import org.apache.poi.xwpf.usermodel.XWPFParagraph;
+-import org.apache.tika.parser.microsoft.AbstractListManager;
+-import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTAbstractNum;
+-import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDecimalNumber;
+-import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTLvl;
+-import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTNum;
+-import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTNumLvl;
+-
+-
+-public class XWPFListManager extends AbstractListManager {
+- private final static boolean OVERRIDE_AVAILABLE;
+- private final static String SKIP_FORMAT = Character.toString((char) 61623);//if this shows up as the lvlText, don't show a number
+-
+- static {
+- boolean b = false;
+- try {
+- Class.forName("org.openxmlformats.schemas.wordprocessingml.x2006.main.CTNumLvl");
+- b = true;
+- } catch (ClassNotFoundException e) {
+- }
+- b = OVERRIDE_AVAILABLE = false;
+-
+- }
+-
+- private final XWPFNumbering numbering;
+-
+- //map of numId (which paragraph series is this a member of?), levelcounts
+- public XWPFListManager(XWPFDocument document) {
+- numbering = document.getNumbering();
+- }
+-
+- /**
+- *
+- * @param paragraph paragraph
+- * @return the formatted number or an empty string if something went wrong
+- */
+- public String getFormattedNumber(final XWPFParagraph paragraph) {
+- int currNumId = paragraph.getNumID().intValue();
+- XWPFNum xwpfNum = numbering.getNum(paragraph.getNumID());
+- if (xwpfNum == null) {
+- return "";
+- }
+- CTNum ctNum = xwpfNum.getCTNum();
+- CTDecimalNumber abNum = ctNum.getAbstractNumId();
+- int currAbNumId = abNum.getVal().intValue();
+-
+- ParagraphLevelCounter lc = listLevelMap.get(currAbNumId);
+- LevelTuple[] overrideTuples = overrideTupleMap.get(currNumId);
+- if (lc == null) {
+- lc = loadLevelTuples(abNum);
+- }
+- if (overrideTuples == null) {
+- overrideTuples = loadOverrideTuples(ctNum, lc.getNumberOfLevels());
+- }
+-
+- String formattedString = lc.incrementLevel(paragraph.getNumIlvl().intValue(), overrideTuples);
+-
+- listLevelMap.put(currAbNumId, lc);
+- overrideTupleMap.put(currNumId, overrideTuples);
+-
+- return formattedString;
+- }
+-
+- private LevelTuple[] loadOverrideTuples(CTNum ctNum, int length) {
+- LevelTuple[] levelTuples = new LevelTuple[length];
+- int overrideLength = ctNum.sizeOfLvlOverrideArray();
+- if (overrideLength == 0) {
+- return null;
+- }
+- for (int i = 0; i < length; i++) {
+- LevelTuple tuple;
+- if (i >= overrideLength) {
+- tuple = new LevelTuple("%"+i+".");
+- } else {
+- CTNumLvl ctNumLvl = ctNum.getLvlOverrideArray(i);
+- if (ctNumLvl != null) {
+- tuple = buildTuple(i, ctNumLvl.getLvl());
+- } else {
+- tuple = new LevelTuple("%"+i+".");
+- }
+- }
+- levelTuples[i] = tuple;
+- }
+- return levelTuples;
+- }
+-
+-
+- private ParagraphLevelCounter loadLevelTuples(CTDecimalNumber abNum) {
+- //Unfortunately, we need to go this far into the underlying structure
+- //to get the abstract num information for the edge case where
+- //someone skips a level and the format is not context-free, e.g. "1.B.i".
+- XWPFAbstractNum abstractNum = numbering.getAbstractNum(abNum.getVal());
+- CTAbstractNum ctAbstractNum = abstractNum.getCTAbstractNum();
+-
+- LevelTuple[] levels = new LevelTuple[ctAbstractNum.sizeOfLvlArray()];
+- for (int i = 0; i < levels.length; i++) {
+- levels[i] = buildTuple(i, ctAbstractNum.getLvlArray(i));
+- }
+- return new ParagraphLevelCounter(levels);
+- }
+-
+- private LevelTuple buildTuple(int level, CTLvl ctLvl) {
+- boolean isLegal = false;
+- int start = 1;
+- int restart = -1;
+- String lvlText = "%" + level + ".";
+- String numFmt = "decimal";
+-
+-
+- if (ctLvl != null && ctLvl.getIsLgl() != null) {
+- isLegal = true;
+- }
+-
+- if (ctLvl != null && ctLvl.getNumFmt() != null &&
+- ctLvl.getNumFmt().getVal() != null) {
+- numFmt = ctLvl.getNumFmt().getVal().toString();
+- }
+- if (ctLvl != null && ctLvl.getLvlRestart() != null &&
+- ctLvl.getLvlRestart().getVal() != null) {
+- restart = ctLvl.getLvlRestart().getVal().intValue();
+- }
+- if (ctLvl != null && ctLvl.getStart() != null &&
+- ctLvl.getStart().getVal() != null) {
+- start = ctLvl.getStart().getVal().intValue();
+- } else {
+-
+- //this is a hack. Currently, this gets the lowest possible
+- //start for a given numFmt. We should probably try to grab the
+- //restartNumberingAfterBreak value in
+- //e.g. <w:abstractNum w:abstractNumId="12" w15:restartNumberingAfterBreak="0">???
+- if ("decimal".equals(numFmt) || "ordinal".equals(numFmt) || "decimalZero".equals(numFmt)) {
+- start = 0;
+- } else {
+- start = 1;
+- }
+- }
+- if (ctLvl != null && ctLvl.getLvlText() != null && ctLvl.getLvlText().getVal() != null) {
+- lvlText = ctLvl.getLvlText().getVal();
+- }
+- return new LevelTuple(start, restart, lvlText, numFmt, isLegal);
+- }
+-
+-}
+diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
+deleted file mode 100644
+index 6caf803..0000000
+--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
++++ /dev/null
+@@ -1,459 +0,0 @@
+-/*
+- * Licensed to the Apache Software Foundation (ASF) under one or more
+- * contributor license agreements. See the NOTICE file distributed with
+- * this work for additional information regarding copyright ownership.
+- * The ASF licenses this file to You under the Apache License, Version 2.0
+- * (the "License"); you may not use this file except in compliance with
+- * the License. You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-package org.apache.tika.parser.microsoft.ooxml;
+-
+-import javax.xml.namespace.QName;
+-import java.io.IOException;
+-import java.util.ArrayList;
+-import java.util.List;
+-
+-import org.apache.poi.openxml4j.opc.PackagePart;
+-import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
+-import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
+-import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
+-import org.apache.poi.xwpf.usermodel.BodyType;
+-import org.apache.poi.xwpf.usermodel.IBody;
+-import org.apache.poi.xwpf.usermodel.IBodyElement;
+-import org.apache.poi.xwpf.usermodel.ICell;
+-import org.apache.poi.xwpf.usermodel.IRunElement;
+-import org.apache.poi.xwpf.usermodel.ISDTContent;
+-import org.apache.poi.xwpf.usermodel.XWPFDocument;
+-import org.apache.poi.xwpf.usermodel.XWPFHeaderFooter;
+-import org.apache.poi.xwpf.usermodel.XWPFHyperlink;
+-import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun;
+-import org.apache.poi.xwpf.usermodel.XWPFParagraph;
+-import org.apache.poi.xwpf.usermodel.XWPFPicture;
+-import org.apache.poi.xwpf.usermodel.XWPFPictureData;
+-import org.apache.poi.xwpf.usermodel.XWPFRun;
+-import org.apache.poi.xwpf.usermodel.XWPFSDT;
+-import org.apache.poi.xwpf.usermodel.XWPFSDTCell;
+-import org.apache.poi.xwpf.usermodel.XWPFStyle;
+-import org.apache.poi.xwpf.usermodel.XWPFStyles;
+-import org.apache.poi.xwpf.usermodel.XWPFTable;
+-import org.apache.poi.xwpf.usermodel.XWPFTableCell;
+-import org.apache.poi.xwpf.usermodel.XWPFTableRow;
+-import org.apache.tika.parser.ParseContext;
+-import org.apache.tika.parser.microsoft.WordExtractor;
+-import org.apache.tika.parser.microsoft.WordExtractor.TagAndStyle;
+-import org.apache.tika.sax.XHTMLContentHandler;
+-import org.apache.xmlbeans.XmlCursor;
+-import org.apache.xmlbeans.XmlException;
+-import org.apache.xmlbeans.XmlObject;
+-import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBookmark;
+-import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTObject;
+-import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
+-import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr;
+-import org.xml.sax.SAXException;
+-import org.xml.sax.helpers.AttributesImpl;
+-
+-public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
+-
+- // could be improved by using the real delimiter in xchFollow [MS-DOC], v20140721, 2.4.6.3, Part 3, Step 3
+- private static final String LIST_DELIMITER = " ";
+-
+-
+- private XWPFDocument document;
+- private XWPFStyles styles;
+-
+- public XWPFWordExtractorDecorator(ParseContext context, XWPFWordExtractor extractor) {
+- super(context, extractor);
+-
+- document = (XWPFDocument) extractor.getDocument();
+- styles = document.getStyles();
+- }
+-
+- /**
+- * @see org.apache.poi.xwpf.extractor.XWPFWordExtractor#getText()
+- */
+- @Override
+- protected void buildXHTML(XHTMLContentHandler xhtml)
+- throws SAXException, XmlException, IOException {
+- XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();
+- XWPFListManager listManager = new XWPFListManager(document);
+- // headers
+- if (hfPolicy != null) {
+- extractHeaders(xhtml, hfPolicy, listManager);
+- }
+-
+- // process text in the order that it occurs in
+- extractIBodyText(document, listManager, xhtml);
+-
+- // then all document tables
+- if (hfPolicy != null) {
+- extractFooters(xhtml, hfPolicy, listManager);
+- }
+- }
+-
+- private void extractIBodyText(IBody bodyElement, XWPFListManager listManager,
+- XHTMLContentHandler xhtml)
+- throws SAXException, XmlException, IOException {
+- for (IBodyElement element : bodyElement.getBodyElements()) {
+- if (element instanceof XWPFParagraph) {
+- XWPFParagraph paragraph = (XWPFParagraph) element;
+- extractParagraph(paragraph, listManager, xhtml);
+- }
+- if (element instanceof XWPFTable) {
+- XWPFTable table = (XWPFTable) element;
+- extractTable(table, listManager, xhtml);
+- }
+- if (element instanceof XWPFSDT) {
+- extractSDT((XWPFSDT) element, xhtml);
+- }
+-
+- }
+- }
+-
+- private void extractSDT(XWPFSDT element, XHTMLContentHandler xhtml) throws SAXException,
+- XmlException, IOException {
+- ISDTContent content = element.getContent();
+- String tag = "p";
+- xhtml.startElement(tag);
+- xhtml.characters(content.getText());
+- xhtml.endElement(tag);
+- }
+-
+- private void extractParagraph(XWPFParagraph paragraph, XWPFListManager listManager,
+- XHTMLContentHandler xhtml)
+- throws SAXException, XmlException, IOException {
+- // If this paragraph is actually a whole new section, then
+- // it could have its own headers and footers
+- // Check and handle if so
+- XWPFHeaderFooterPolicy headerFooterPolicy = null;
+- if (paragraph.getCTP().getPPr() != null) {
+- CTSectPr ctSectPr = paragraph.getCTP().getPPr().getSectPr();
+- if (ctSectPr != null) {
+- headerFooterPolicy =
+- new XWPFHeaderFooterPolicy(document, ctSectPr);
+- extractHeaders(xhtml, headerFooterPolicy, listManager);
+- }
+- }
+-
+- // Is this a paragraph, or a heading?
+- String tag = "p";
+- String styleClass = null;
+- if (paragraph.getStyleID() != null) {
+- XWPFStyle style = styles.getStyle(
+- paragraph.getStyleID()
+- );
+-
+- if (style != null && style.getName() != null) {
+- TagAndStyle tas = WordExtractor.buildParagraphTagAndStyle(
+- style.getName(), paragraph.getPartType() == BodyType.TABLECELL
+- );
+- tag = tas.getTag();
+- styleClass = tas.getStyleClass();
+- }
+- }
+-
+- if (styleClass == null) {
+- xhtml.startElement(tag);
+- } else {
+- xhtml.startElement(tag, "class", styleClass);
+- }
+-
+- writeParagraphNumber(paragraph, listManager, xhtml);
+- // Output placeholder for any embedded docs:
+-
+- // TODO: replace w/ XPath/XQuery:
+- for (XWPFRun run : paragraph.getRuns()) {
+- XmlCursor c = run.getCTR().newCursor();
+- c.selectPath("./*");
+- while (c.toNextSelection()) {
+- XmlObject o = c.getObject();
+- if (o instanceof CTObject) {
+- XmlCursor c2 = o.newCursor();
+- c2.selectPath("./*");
+- while (c2.toNextSelection()) {
+- XmlObject o2 = c2.getObject();
+-
+- XmlObject embedAtt = o2.selectAttribute(new QName("Type"));
+- if (embedAtt != null && embedAtt.getDomNode().getNodeValue().equals("Embed")) {
+- // Type is "Embed"
+- XmlObject relIDAtt = o2.selectAttribute(new QName("http://schemas.openxmlformats.org/officeDocument/2006/relationships", "id"));
+- if (relIDAtt != null) {
+- String relID = relIDAtt.getDomNode().getNodeValue();
+- AttributesImpl attributes = new AttributesImpl();
+- attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+- attributes.addAttribute("", "id", "id", "CDATA", relID);
+- xhtml.startElement("div", attributes);
+- xhtml.endElement("div");
+- }
+- }
+- }
+- c2.dispose();
+- }
+- }
+-
+- c.dispose();
+- }
+-
+- // Attach bookmarks for the paragraph
+- // (In future, we might put them in the right place, for now
+- // we just put them in the correct paragraph)
+- for (int i = 0; i < paragraph.getCTP().sizeOfBookmarkStartArray(); i++) {
+- CTBookmark bookmark = paragraph.getCTP().getBookmarkStartArray(i);
+- xhtml.startElement("a", "name", bookmark.getName());
+- xhtml.endElement("a");
+- }
+-
+- TmpFormatting fmtg = new TmpFormatting(false, false);
+-
+- // Do the iruns
+- for (IRunElement run : paragraph.getIRuns()) {
+- if (run instanceof XWPFSDT) {
+- fmtg = closeStyleTags(xhtml, fmtg);
+- processSDTRun((XWPFSDT) run, xhtml);
+- //for now, we're ignoring formatting in sdt
+- //if you hit an sdt reset to false
+- fmtg.setBold(false);
+- fmtg.setItalic(false);
+- } else {
+- fmtg = processRun((XWPFRun) run, paragraph, xhtml, fmtg);
+- }
+- }
+- closeStyleTags(xhtml, fmtg);
+-
+-
+- // Now do any comments for the paragraph
+- XWPFCommentsDecorator comments = new XWPFCommentsDecorator(paragraph, null);
+- String commentText = comments.getCommentText();
+- if (commentText != null && commentText.length() > 0) {
+- xhtml.characters(commentText);
+- }
+-
+- String footnameText = paragraph.getFootnoteText();
+- if (footnameText != null && footnameText.length() > 0) {
+- xhtml.characters(footnameText + "\n");
+- }
+-
+- // Also extract any paragraphs embedded in text boxes:
+- for (XmlObject embeddedParagraph : paragraph.getCTP().selectPath("declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' declare namespace wps='http://schemas.microsoft.com/office/word/2010/wordprocessingShape' .//*/wps:txbx/w:txbxContent/w:p")) {
+- extractParagraph(new XWPFParagraph(CTP.Factory.parse(embeddedParagraph.xmlText()), paragraph.getBody()), listManager, xhtml);
+- }
+-
+- // Finish this paragraph
+- xhtml.endElement(tag);
+-
+- if (headerFooterPolicy != null) {
+- extractFooters(xhtml, headerFooterPolicy, listManager);
+- }
+- }
+-
+- private void writeParagraphNumber(XWPFParagraph paragraph,
+- XWPFListManager listManager,
+- XHTMLContentHandler xhtml) throws SAXException {
+- if (paragraph.getNumIlvl() == null) {
+- return;
+- }
+- String number = listManager.getFormattedNumber(paragraph);
+- if (number != null) {
+- xhtml.characters(number);
+- }
+-
+- }
+-
+- private TmpFormatting closeStyleTags(XHTMLContentHandler xhtml,
+- TmpFormatting fmtg) throws SAXException {
+- // Close any still open style tags
+- if (fmtg.isItalic()) {
+- xhtml.endElement("i");
+- fmtg.setItalic(false);
+- }
+- if (fmtg.isBold()) {
+- xhtml.endElement("b");
+- fmtg.setBold(false);
+- }
+- return fmtg;
+- }
+-
+- private TmpFormatting processRun(XWPFRun run, XWPFParagraph paragraph,
+- XHTMLContentHandler xhtml, TmpFormatting tfmtg)
+- throws SAXException, XmlException, IOException {
+- // True if we are currently in the named style tag:
+- if (run.isBold() != tfmtg.isBold()) {
+- if (tfmtg.isItalic()) {
+- xhtml.endElement("i");
+- tfmtg.setItalic(false);
+- }
+- if (run.isBold()) {
+- xhtml.startElement("b");
+- } else {
+- xhtml.endElement("b");
+- }
+- tfmtg.setBold(run.isBold());
+- }
+-
+- if (run.isItalic() != tfmtg.isItalic()) {
+- if (run.isItalic()) {
+- xhtml.startElement("i");
+- } else {
+- xhtml.endElement("i");
+- }
+- tfmtg.setItalic(run.isItalic());
+- }
+-
+- boolean addedHREF = false;
+- if (run instanceof XWPFHyperlinkRun) {
+- XWPFHyperlinkRun linkRun = (XWPFHyperlinkRun) run;
+- XWPFHyperlink link = linkRun.getHyperlink(document);
+- if (link != null && link.getURL() != null) {
+- xhtml.startElement("a", "href", link.getURL());
+- addedHREF = true;
+- } else if (linkRun.getAnchor() != null && linkRun.getAnchor().length() > 0) {
+- xhtml.startElement("a", "href", "#" + linkRun.getAnchor());
+- addedHREF = true;
+- }
+- }
+-
+- xhtml.characters(run.toString());
+-
+- // If we have any pictures, output them
+- for (XWPFPicture picture : run.getEmbeddedPictures()) {
+- if (paragraph.getDocument() != null) {
+- XWPFPictureData data = picture.getPictureData();
+- if (data != null) {
+- AttributesImpl attr = new AttributesImpl();
+-
+- attr.addAttribute("", "src", "src", "CDATA", "embedded:" + data.getFileName());
+- attr.addAttribute("", "alt", "alt", "CDATA", picture.getDescription());
+-
+- xhtml.startElement("img", attr);
+- xhtml.endElement("img");
+- }
+- }
+- }
+-
+- if (addedHREF) {
+- xhtml.endElement("a");
+- }
+-
+- return tfmtg;
+- }
+-
+- private void processSDTRun(XWPFSDT run, XHTMLContentHandler xhtml)
+- throws SAXException, XmlException, IOException {
+- xhtml.characters(run.getContent().getText());
+- }
+-
+- private void extractTable(XWPFTable table, XWPFListManager listManager,
+- XHTMLContentHandler xhtml)
+- throws SAXException, XmlException, IOException {
+- xhtml.startElement("table");
+- xhtml.startElement("tbody");
+- for (XWPFTableRow row : table.getRows()) {
+- xhtml.startElement("tr");
+- for (ICell cell : row.getTableICells()) {
+- xhtml.startElement("td");
+- if (cell instanceof XWPFTableCell) {
+- extractIBodyText((XWPFTableCell) cell, listManager, xhtml);
+- } else if (cell instanceof XWPFSDTCell) {
+- xhtml.characters(((XWPFSDTCell) cell).getContent().getText());
+- }
+- xhtml.endElement("td");
+- }
+- xhtml.endElement("tr");
+- }
+- xhtml.endElement("tbody");
+- xhtml.endElement("table");
+- }
+-
+- private void extractFooters(
+- XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy,
+- XWPFListManager listManager)
+- throws SAXException, XmlException, IOException {
+- // footers
+- if (hfPolicy.getFirstPageFooter() != null) {
+- extractHeaderText(xhtml, hfPolicy.getFirstPageFooter(), listManager);
+- }
+- if (hfPolicy.getEvenPageFooter() != null) {
+- extractHeaderText(xhtml, hfPolicy.getEvenPageFooter(), listManager);
+- }
+- if (hfPolicy.getDefaultFooter() != null) {
+- extractHeaderText(xhtml, hfPolicy.getDefaultFooter(), listManager);
+- }
+- }
+-
+- private void extractHeaders(
+- XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy, XWPFListManager listManager)
+- throws SAXException, XmlException, IOException {
+- if (hfPolicy == null) return;
+-
+- if (hfPolicy.getFirstPageHeader() != null) {
+- extractHeaderText(xhtml, hfPolicy.getFirstPageHeader(), listManager);
+- }
+-
+- if (hfPolicy.getEvenPageHeader() != null) {
+- extractHeaderText(xhtml, hfPolicy.getEvenPageHeader(), listManager);
+- }
+-
+- if (hfPolicy.getDefaultHeader() != null) {
+- extractHeaderText(xhtml, hfPolicy.getDefaultHeader(), listManager);
+- }
+- }
+-
+- private void extractHeaderText(XHTMLContentHandler xhtml, XWPFHeaderFooter header, XWPFListManager listManager) throws SAXException, XmlException, IOException {
+-
+- for (IBodyElement e : header.getBodyElements()) {
+- if (e instanceof XWPFParagraph) {
+- extractParagraph((XWPFParagraph) e, listManager, xhtml);
+- } else if (e instanceof XWPFTable) {
+- extractTable((XWPFTable) e, listManager, xhtml);
+- } else if (e instanceof XWPFSDT) {
+- extractSDT((XWPFSDT) e, xhtml);
+- }
+- }
+- }
+-
+- /**
+- * Word documents are simple, they only have the one
+- * main part
+- */
+- @Override
+- protected List<PackagePart> getMainDocumentParts() {
+- List<PackagePart> parts = new ArrayList<PackagePart>();
+- parts.add(document.getPackagePart());
+- return parts;
+- }
+-
+- private class TmpFormatting {
+- private boolean bold = false;
+- private boolean italic = false;
+-
+- private TmpFormatting(boolean bold, boolean italic) {
+- this.bold = bold;
+- this.italic = italic;
+- }
+-
+- public boolean isBold() {
+- return bold;
+- }
+-
+- public void setBold(boolean bold) {
+- this.bold = bold;
+- }
+-
+- public boolean isItalic() {
+- return italic;
+- }
+-
+- public void setItalic(boolean italic) {
+- this.italic = italic;
+- }
+-
+- }
+-
+-}
diff --git a/debian/patches/ignore-sqlite-jdbc.patch b/debian/patches/ignore-sqlite-jdbc.patch
new file mode 100644
index 0000000..3b8f7a3
--- /dev/null
+++ b/debian/patches/ignore-sqlite-jdbc.patch
@@ -0,0 +1,125 @@
+From: Markus Koschany <apo at debian.org>
+Date: Tue, 1 Dec 2015 19:09:08 +0100
+Subject: ignore sqlite jdbc
+
+---
+ .../apache/tika/parser/jdbc/SQLite3DBParser.java | 110 ---------------------
+ 1 file changed, 110 deletions(-)
+ delete mode 100644 tika-parsers/src/main/java/org/apache/tika/parser/jdbc/SQLite3DBParser.java
+
+diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/jdbc/SQLite3DBParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/jdbc/SQLite3DBParser.java
+deleted file mode 100644
+index 4ea8f30..0000000
+--- a/tika-parsers/src/main/java/org/apache/tika/parser/jdbc/SQLite3DBParser.java
++++ /dev/null
+@@ -1,110 +0,0 @@
+-package org.apache.tika.parser.jdbc;
+-/*
+- * Licensed to the Apache Software Foundation (ASF) under one or more
+- * contributor license agreements. See the NOTICE file distributed with
+- * this work for additional information regarding copyright ownership.
+- * The ASF licenses this file to You under the Apache License, Version 2.0
+- * (the "License"); you may not use this file except in compliance with
+- * the License. You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-import java.io.File;
+-import java.io.IOException;
+-import java.io.InputStream;
+-import java.sql.Connection;
+-import java.sql.ResultSet;
+-import java.sql.SQLException;
+-import java.sql.Statement;
+-import java.util.LinkedList;
+-import java.util.List;
+-import java.util.Set;
+-
+-import org.apache.commons.io.IOExceptionWithCause;
+-import org.apache.tika.io.TikaInputStream;
+-import org.apache.tika.metadata.Metadata;
+-import org.apache.tika.mime.MediaType;
+-import org.apache.tika.parser.ParseContext;
+-import org.sqlite.SQLiteConfig;
+-
+-/**
+- * This is the implementation of the db parser for SQLite.
+- * <p/>
+- * This parser is internal only; it should not be registered in the services
+- * file or configured in the TikaConfig xml file.
+- */
+-class SQLite3DBParser extends AbstractDBParser {
+-
+- protected static final String SQLITE_CLASS_NAME = "org.sqlite.JDBC";
+-
+- /**
+- * @param context context
+- * @return null (always)
+- */
+- @Override
+- public Set<MediaType> getSupportedTypes(ParseContext context) {
+- return null;
+- }
+-
+- @Override
+- protected Connection getConnection(InputStream stream, Metadata metadata, ParseContext context) throws IOException {
+- String connectionString = getConnectionString(stream, metadata, context);
+-
+- Connection connection = null;
+- try {
+- Class.forName(getJDBCClassName());
+- } catch (ClassNotFoundException e) {
+- throw new IOExceptionWithCause(e);
+- }
+- try {
+- SQLiteConfig config = new SQLiteConfig();
+-
+- //good habit, but effectively meaningless here
+- config.setReadOnly(true);
+- connection = config.createConnection(connectionString);
+-
+- } catch (SQLException e) {
+- throw new IOException(e.getMessage());
+- }
+- return connection;
+- }
+-
+- @Override
+- protected String getConnectionString(InputStream is, Metadata metadata, ParseContext context) throws IOException {
+- File dbFile = TikaInputStream.get(is).getFile();
+- return "jdbc:sqlite:" + dbFile.getAbsolutePath();
+- }
+-
+- @Override
+- protected String getJDBCClassName() {
+- return SQLITE_CLASS_NAME;
+- }
+-
+- @Override
+- protected List<String> getTableNames(Connection connection, Metadata metadata,
+- ParseContext context) throws SQLException {
+- List<String> tableNames = new LinkedList<String>();
+-
+- try (Statement st = connection.createStatement()) {
+- String sql = "SELECT name FROM sqlite_master WHERE type='table'";
+- ResultSet rs = st.executeQuery(sql);
+-
+- while (rs.next()) {
+- tableNames.add(rs.getString(1));
+- }
+- }
+- return tableNames;
+- }
+-
+- @Override
+- public JDBCTableReader getTableReader(Connection connection, String tableName, ParseContext context) {
+- return new SQLite3TableReader(connection, tableName, context);
+- }
+-}
diff --git a/debian/patches/ignore-ucar.nc2.patch b/debian/patches/ignore-ucar.nc2.patch
new file mode 100644
index 0000000..1bb0771
--- /dev/null
+++ b/debian/patches/ignore-ucar.nc2.patch
@@ -0,0 +1,137 @@
+From: Markus Koschany <apo at debian.org>
+Date: Tue, 1 Dec 2015 19:13:14 +0100
+Subject: ignore ucar.nc2
+
+---
+ .../org/apache/tika/parser/grib/GribParser.java | 121 ---------------------
+ 1 file changed, 121 deletions(-)
+ delete mode 100644 tika-parsers/src/main/java/org/apache/tika/parser/grib/GribParser.java
+
+diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/grib/GribParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/grib/GribParser.java
+deleted file mode 100644
+index 6f8756d..0000000
+--- a/tika-parsers/src/main/java/org/apache/tika/parser/grib/GribParser.java
++++ /dev/null
+@@ -1,121 +0,0 @@
+-/*
+- * Licensed to the Apache Software Foundation (ASF) under one or more
+- * contributor license agreements. See the NOTICE file distributed with
+- * this work for additional information regarding copyright ownership.
+- * The ASF licenses this file to You under the Apache License, Version 2.0
+- * (the "License"); you may not use this file except in compliance with
+- * the License. You may obtain a copy of the License at
+- *
+- * http://www.apache.org/licenses/LICENSE-2.0
+- *
+- * Unless required by applicable law or agreed to in writing, software
+- * distributed under the License is distributed on an "AS IS" BASIS,
+- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+- * See the License for the specific language governing permissions and
+- * limitations under the License.
+- */
+-
+-package org.apache.tika.parser.grib;
+-
+-import java.io.IOException;
+-import java.io.InputStream;
+-import java.io.File;
+-import java.util.Collections;
+-import java.util.Set;
+-import org.apache.tika.exception.TikaException;
+-import org.apache.tika.io.TemporaryResources;
+-import org.apache.tika.io.TikaInputStream;
+-import org.apache.tika.metadata.Metadata;
+-import org.apache.tika.metadata.Property;
+-import org.apache.tika.metadata.TikaCoreProperties;
+-import org.apache.tika.mime.MediaType;
+-import org.apache.tika.parser.AbstractParser;
+-import org.apache.tika.parser.ParseContext;
+-import org.apache.tika.sax.XHTMLContentHandler;
+-import org.xml.sax.ContentHandler;
+-import org.xml.sax.SAXException;
+-import ucar.nc2.Attribute;
+-import ucar.nc2.Dimension;
+-import ucar.nc2.NetcdfFile;
+-import ucar.nc2.Variable;
+-import ucar.nc2.dataset.NetcdfDataset;
+-
+-public class GribParser extends AbstractParser {
+-
+- private static final long serialVersionUID = 7855458954474247655L;
+-
+- public static final String GRIB_MIME_TYPE = "application/x-grib2";
+-
+- private final Set<MediaType> SUPPORTED_TYPES =
+- Collections.singleton(MediaType.application("x-grib2"));
+-
+- public Set<MediaType> getSupportedTypes(ParseContext context) {
+- return SUPPORTED_TYPES;
+- }
+-
+- public void parse(InputStream stream, ContentHandler handler,
+- Metadata metadata, ParseContext context) throws IOException,
+- SAXException, TikaException {
+-
+- //Set MIME type as grib2
+- metadata.set(Metadata.CONTENT_TYPE, GRIB_MIME_TYPE);
+-
+- TikaInputStream tis = TikaInputStream.get(stream, new TemporaryResources());
+- File gribFile = tis.getFile();
+-
+- try {
+- NetcdfFile ncFile = NetcdfDataset.openFile(gribFile.getAbsolutePath(), null);
+-
+- // first parse out the set of global attributes
+- for (Attribute attr : ncFile.getGlobalAttributes()) {
+- Property property = resolveMetadataKey(attr.getFullName());
+- if (attr.getDataType().isString()) {
+- metadata.add(property, attr.getStringValue());
+- } else if (attr.getDataType().isNumeric()) {
+- int value = attr.getNumericValue().intValue();
+- metadata.add(property, String.valueOf(value));
+- }
+- }
+-
+- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+-
+- xhtml.startDocument();
+-
+- xhtml.newline();
+- xhtml.startElement("ul");
+- xhtml.characters("dimensions:");
+- xhtml.newline();
+-
+- for (Dimension dim : ncFile.getDimensions()){
+- xhtml.element("li", dim.getFullName() + "=" + String.valueOf(dim.getLength()) + ";");
+- xhtml.newline();
+- }
+-
+- xhtml.startElement("ul");
+- xhtml.characters("variables:");
+- xhtml.newline();
+-
+- for (Variable var : ncFile.getVariables()){
+- xhtml.element("p", String.valueOf(var.getDataType()) + var.getNameAndDimensions() + ";");
+- for(Attribute element : var.getAttributes()){
+- xhtml.element("li", " :" + element + ";");
+- xhtml.newline();
+- }
+- }
+- xhtml.endElement("ul");
+- xhtml.endElement("ul");
+- xhtml.endDocument();
+-
+- } catch (IOException e) {
+- throw new TikaException("NetCDF parse error", e);
+- }
+- }
+-
+- private Property resolveMetadataKey(String localName) {
+- if ("title".equals(localName)) {
+- return TikaCoreProperties.TITLE;
+- }
+- return Property.internalText(localName);
+- }
+-
+-}
+\ No newline at end of file
diff --git a/debian/patches/series b/debian/patches/series
index 0418cf5..44c83e4 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -3,3 +3,16 @@
03-ignore-netcdf.patch
05-osgi-compatibility.patch
06-optional-parser-dependencies.patch
+ignore-org.apache.poi.hssf.extractor.patch
+ignore-org.json.XML.patch
+ignore-org.apache.ctakes.patch
+ignore-com.pff.patch
+ignore-sqlite-jdbc.patch
+ignore-com.drew.imaging.webp.patch
+ignore-package-org.apache.poi.xwpf.patch
+ignore-opennlp.tools.namefind.patch
+ignore-javax.ws.rs.core.patch
+ignore-ucar.nc2.patch
+ignore-com.github.junrar.exception.patch
+ignore-com.healthmarketscience.jackcess.patch
+ignore-org.apache.poi.hslf.usermodel.patch
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-java/tika.git
More information about the pkg-java-commits
mailing list