diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 00000000..a217b347
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,7 @@
+version: 2
+updates:
+- package-ecosystem: maven
+ directory: "/"
+ schedule:
+ interval: daily
+ open-pull-requests-limit: 10
diff --git a/.github/workflows/tests-windows.yml b/.github/workflows/tests-windows.yml
new file mode 100644
index 00000000..5cc1031a
--- /dev/null
+++ b/.github/workflows/tests-windows.yml
@@ -0,0 +1,23 @@
+name: Java CI (Windows)
+
+on: [push]
+
+jobs:
+ build:
+ runs-on: windows-latest
+
+ steps:
+ # https://github.com/actions/checkout/issues/135#issuecomment-602171132
+ - name: Set git to use LF
+ run: |
+ git config --global core.autocrlf false
+ git config --global core.eol lf
+ - uses: actions/checkout@v3
+ - name: Set up JDK 11
+ uses: actions/setup-java@v3
+ with:
+ java-version: '11'
+ distribution: 'adopt'
+ cache: maven
+ - name: Build with Maven
+ run: mvn --batch-mode test
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
new file mode 100644
index 00000000..da2d019b
--- /dev/null
+++ b/.github/workflows/tests.yml
@@ -0,0 +1,18 @@
+name: Java CI
+
+on: [push, pull_request]
+
+jobs:
+ build:
+ runs-on: ubuntu-latest
+
+ steps:
+ - uses: actions/checkout@v3
+ - name: Set up JDK 11
+ uses: actions/setup-java@v3
+ with:
+ java-version: '11'
+ distribution: 'adopt'
+ cache: maven
+ - name: Build with Maven
+ run: mvn --batch-mode test
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index b2e1366a..00000000
--- a/.travis.yml
+++ /dev/null
@@ -1,11 +0,0 @@
-language: java
-install: mvn install -DskipTests=true -Dmaven.javadoc.skip=true -Dgpg.skip=true -B -V
-script: mvn test -Dgpg.skip=true
-jdk:
- - oraclejdk7
- - openjdk7
- - oraclejdk8
-sudo: false
-
-
-
diff --git a/README.md b/README.md
index 755497df..db7b0023 100644
--- a/README.md
+++ b/README.md
@@ -1,72 +1,126 @@
-tabula-java [](https://travis-ci.org/tabulapdf/tabula-java) [](https://gitter.im/tabulapdf/tabula-java?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
+tabula-java [](https://travis-ci.org/tabulapdf/tabula-java)
===========
-`tabula-java` is a library for extracting tables from PDF files — it is the table extraction engine that used to power [Tabula](http://tabula.technology/) ([repo](http://github.com/tabulapdf/tabula)). You can use `tabula-java` as a command-line tool to programmatically extract tables from PDFs.
+`tabula-java` is a library for extracting tables from PDF files — it is the table extraction engine that powers [Tabula](http://tabula.technology/) ([repo](http://github.com/tabulapdf/tabula)). You can use `tabula-java` as a command-line tool to programmatically extract tables from PDFs.
-(This is the new version of the extraction engine; the previous code can be found at [`tabula-extractor`](http://github.com/tabulapdf/tabula-extractor).)
-
-© 2014-2016 Manuel Aristarán. Available under MIT License. See [`LICENSE`](LICENSE).
+© 2014-2020 Manuel Aristarán. Available under MIT License. See [`LICENSE`](LICENSE).
## Download
Download a version of the tabula-java's jar, with all dependencies included, that works on Mac, Windows and Linux from our [releases page](../../releases).
-## Usage Examples
+## Commandline Usage Examples
`tabula-java` provides a command line application:
```
-$ java -jar ./target/tabula-0.9.1-jar-with-dependencies.jar --help
-
-usage: tabula [-a ] [-b ] [-c ] [-d] [-f ] [-g] [-h] [-i]
- [-n] [-o ] [-p ] [-r] [-s ] [-u] [-v]
+$ java -jar target/tabula-1.0.5-jar-with-dependencies.jar --help
+usage: tabula [-a <AREA>] [-b <DIRECTORY>] [-c <COLUMNS>] [-f <FORMAT>]
+ [-g] [-h] [-i] [-l] [-n] [-o <OUTFILE>] [-p <PAGES>] [-r] [-s
+ <PASSWORD>] [-t] [-u] [-v]
Tabula helps you extract tables from PDFs
- -a,--area Portion of the page to analyze
- (top,left,bottom,right). Example: --area
- 269.875,12.75,790.5,561. Default is entire
- page
- -c,--columns X coordinates of column boundaries. Example
- --columns 10.1,20.2,30.3
- -d,--debug Print detected table areas instead of
- processing.
- -b,--batch Convert all .pdfs in the provided directory
+ -a,--area -a/--area = Portion of the page to analyze.
+ Example: --area 269.875,12.75,790.5,561.
+ Accepts top,left,bottom,right i.e. y1,x1,y2,x2
+ where all values are in points relative to the
+ top left corner. If all values are between
+ 0-100 (inclusive) and preceded by '%', input
+ will be taken as % of actual height or width
+ of the page. Example: --area %0,0,100,50. To
+ specify multiple areas, -a option should be
+ repeated. Default is entire page
+ -b,--batch Convert all .pdfs in the provided directory.
+ -c,--columns X coordinates of column boundaries. Example
+ --columns 10.1,20.2,30.3. If all values are
+ between 0-100 (inclusive) and preceded by '%',
+ input will be taken as % of actual width of
+ the page. Example: --columns %25,50,80.6
-f,--format Output format: (CSV,TSV,JSON). Default: CSV
-g,--guess Guess the portion of the page to analyze per
page.
-h,--help Print this help text.
-i,--silent Suppress all stderr output.
- -n,--no-spreadsheet Force PDF not to be extracted using
- spreadsheet-style extraction (if there are
- ruling lines separating each cell, as in a PDF
- of an Excel spreadsheet)
+ -l,--lattice Force PDF to be extracted using lattice-mode
+ extraction (if there are ruling lines
+ separating each cell, as in a PDF of an Excel
+ spreadsheet)
+ -n,--no-spreadsheet [Deprecated in favor of -t/--stream] Force PDF
+ not to be extracted using spreadsheet-style
+ extraction (if there are no ruling lines
+ separating each cell)
-o,--outfile Write output to instead of STDOUT.
Default: -
-p,--pages Comma separated list of ranges, or all.
Examples: --pages 1-3,5-7, --pages 3 or
--pages all. Default is --pages 1
- -r,--spreadsheet Force PDF to be extracted using
- spreadsheet-style extraction (if there are
- ruling lines separating each cell, as in a PDF
- of an Excel spreadsheet)
+ -r,--spreadsheet [Deprecated in favor of -l/--lattice] Force
+ PDF to be extracted using spreadsheet-style
+ extraction (if there are ruling lines
+ separating each cell, as in a PDF of an Excel
+ spreadsheet)
-s,--password Password to decrypt document. Default is empty
+ -t,--stream Force PDF to be extracted using stream-mode
+ extraction (if there are no ruling lines
+ separating each cell)
-u,--use-line-returns Use embedded line returns in cells. (Only in
spreadsheet mode.)
-v,--version Print version and exit.
-
```
-It also includes a debugging tool, run `java -cp ./target/tabula-0.9.1-jar-with-dependencies.jar technology.tabula.debug.Debug -h` for the available options.
+It also includes a debugging tool, run `java -cp ./target/tabula-1.0.5-jar-with-dependencies.jar technology.tabula.debug.Debug -h` for the available options.
You can also integrate `tabula-java` with any JVM language. For Java examples, see the [`tests`](src/test/java/technology/tabula/) folder.
JVM start-up time is a lot of the cost of the `tabula` command, so if you're trying to extract many tables from PDFs, you have a few options for speeding it up:
+ - the `-b` option, which allows you to convert all PDFs in a given directory
- the [drip](https://github.com/ninjudd/drip) utility
- - the [Ruby](http://github.com/tabulapdf/tabula-extractor), [R](https://github.com/leeper/tabulizer), and [Node.js](https://github.com/ezodude/tabula-js) bindings
+ - the [Ruby](http://github.com/tabulapdf/tabula-extractor), [Python](https://github.com/chezou/tabula-py), [R](https://github.com/leeper/tabulizer), and [Node.js](https://github.com/ezodude/tabula-js) bindings
- writing your own program in any JVM language (Java, JRuby, Scala) that imports tabula-java.
- - waiting for us to implement an API/server-style system (it's on the roadmap)
+ - waiting for us to implement an API/server-style system (it's on the [roadmap](https://github.com/tabulapdf/tabula-api))
+
+## API Usage Examples
+
+A simple Java code example which extracts all rows and cells from all tables of all pages of a PDF document:
+
+```java
+InputStream in = this.getClass().getResourceAsStream("my.pdf");
+try (PDDocument document = PDDocument.load(in)) {
+ SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
+ PageIterator pi = new ObjectExtractor(document).extract();
+ while (pi.hasNext()) {
+ // iterate over the pages of the document
+ Page page = pi.next();
+ List<Table> tables = sea.extract(page);
+ // iterate over the tables of the page
+ for (Table table : tables) {
+ List<List<RectangularTextContainer>> rows = table.getRows();
+ // iterate over the rows of the table
+ for (List<RectangularTextContainer> cells : rows) {
+ // print all column-cells of the row plus linefeed
+ for (RectangularTextContainer content : cells) {
+ // Note: Cell.getText() uses \r to concat text chunks
+ String text = content.getText().replace("\r", " ");
+ System.out.print(text + "|");
+ }
+ System.out.println();
+ }
+ }
+ }
+}
+```
+
+
+For more detailed information, check the Javadoc.
+The Javadoc API documentation can be generated (see also the '_Building from Source_' section) via
+
+```
+mvn javadoc:javadoc
+```
+
+which generates the HTML files in the `target/site/apidocs/` directory.
## Building from Source
@@ -75,3 +129,30 @@ Clone this repo and run:
```
mvn clean compile assembly:single
```
+
+## Contributing
+
+Interested in helping out? We'd love to have your help!
+
+You can help by:
+
+- [Reporting a bug](https://github.com/tabulapdf/tabula-java/issues).
+- Adding or editing documentation.
+- Contributing code via a Pull Request.
+- Spreading the word about `tabula-java` to people who might be able to benefit from using it.
+
+### Backers
+
+You can also support our continued work on `tabula-java` with a one-time or monthly donation [on OpenCollective](https://opencollective.com/tabulapdf#support). Organizations who use `tabula-java` can also [sponsor the project](https://opencollective.com/tabulapdf#support) for acknowledgement on [our official site](http://tabula.technology/) and this README.
+
+Special thanks to the following users and organizations for generously supporting Tabula with donations and grants:
+
+
+
+
+
+
+
+
+
+
diff --git a/jbang-catalog.json b/jbang-catalog.json
new file mode 100644
index 00000000..b7f71347
--- /dev/null
+++ b/jbang-catalog.json
@@ -0,0 +1,8 @@
+{
+ "catalogs": {},
+ "aliases": {
+ "tabula": {
+ "script-ref": "https://github.com/tabulapdf/tabula-java/releases/download/v1.0.4/tabula-1.0.4-jar-with-dependencies.jar"
+ }
+ }
+}
\ No newline at end of file
diff --git a/pom.xml b/pom.xml
index 8a15f77a..211d0d4d 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1,8 +1,9 @@
-
+
4.0.0
technology.tabula
tabula
- 0.9.1
+ 1.0.6-SNAPSHOT
Tabula
Extract tables from PDF files
http://github.com/tabulapdf/tabula-java
@@ -32,21 +33,26 @@
+
+
+ snapshots
+ https://repository.apache.org/content/repositories/snapshots/
+
+ false
+
+
+ true
+
+
+
+
scm:git:git@github.com:tabulapdf/tabula-java.git
scm:git:git@github.com:tabulapdf/tabula-java.git
git@github.com:tabulapdf/tabula-java.git
- tabula-0.9.1
+ v1.0.2
-
-
- sonatype
- Sonatype repository
- https://oss.sonatype.org/content/repositories/snapshots/
-
-
-
UTF-8
UTF-8
@@ -68,7 +74,7 @@
org.apache.maven.plugins
maven-javadoc-plugin
- 2.10.3
+ 3.8.0
true
@@ -81,7 +87,7 @@
org.sonatype.plugins
nexus-staging-maven-plugin
- 1.6.3
+ 1.7.0
true
ossrh
@@ -93,7 +99,7 @@
org.apache.maven.plugins
maven-source-plugin
- 2.2.1
+ 3.3.1
attach-sources
@@ -104,22 +110,25 @@
- org.apache.maven.plugins
- maven-javadoc-plugin
- 2.9.1
-
-
- attach-javadocs
-
- jar
-
-
-
+ org.apache.maven.plugins
+ maven-javadoc-plugin
+ 3.8.0
+
+ 8
+
+
+
+ attach-javadocs
+
+ jar
+
+
+
org.apache.maven.plugins
maven-gpg-plugin
- 1.5
+ 3.2.4
sign-artifacts
@@ -127,15 +136,21 @@
sign
+
+
+ --pinentry-mode
+ loopback
+
+
maven-compiler-plugin
- 3.1
+ 3.13.0
- 1.6
- 1.6
+ 1.8
+ 1.8
@@ -145,134 +160,166 @@
technology.tabula.CommandLineApp
-
-
- jar-with-dependencies
-
-
-
-
- org.apache.maven.plugins
- maven-surefire-plugin
+
+
+ jar-with-dependencies
+
+
+
+
+ org.apache.maven.plugins
+ maven-surefire-plugin
+ 3.3.1
-Xms1024m -Xmx2048m
-
-
-
-
-
-
- release
-
-
+
+
org.apache.maven.plugins
- maven-javadoc-plugin
- 2.9.1
-
-
- attach-javadocs
-
- jar
-
-
-
+ maven-eclipse-plugin
+ 2.10
+
+ true
+ true
+
-
- org.apache.maven.plugins
- maven-source-plugin
- 2.2.1
-
-
- attach-sources
-
- jar-no-fork
-
-
-
-
-
- org.apache.maven.plugins
- maven-gpg-plugin
- 1.5
-
-
- sign-artifacts
- verify
-
- sign
-
-
-
-
-
-
-
-
+
+
+
+
+
+ release
+
+
+
+ org.apache.maven.plugins
+ maven-javadoc-plugin
+ 3.8.0
+
+ 8
+
+
+
+ attach-javadocs
+
+ jar
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-source-plugin
+ 3.3.1
+
+
+ attach-sources
+
+ jar-no-fork
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-gpg-plugin
+ 3.2.4
+
+
+ sign-artifacts
+ verify
+
+ sign
+
+
+
+
+
+
+
+
+
+
+
+ org.locationtech.jts
+ jts-core
+ 1.20.0
+
+
+
+ org.slf4j
+ slf4j-api
+ 2.0.13
+
+
+
+ org.slf4j
+ slf4j-simple
+ 2.0.13
+
-
-
- net.sf.jsi
- jsi
- 1.1.0-SNAPSHOT
-
+
+ org.apache.pdfbox
+ pdfbox
+ 3.0.4
+
-
- org.slf4j
- slf4j-api
- 1.7.20
-
+
+ org.bouncycastle
+ bcprov-jdk18on
+ 1.80
+
-
- org.slf4j
- slf4j-simple
- 1.7.20
-
+
+ org.bouncycastle
+ bcmail-jdk18on
+ 1.80
+
-
- org.apache.pdfbox
- pdfbox
- 1.8.12
-
+
+ junit
+ junit
+ 4.13.2
+ test
+
-
- org.bouncycastle
- bcprov-jdk15on
- 1.54
-
+
+ commons-cli
+ commons-cli
+ 1.8.0
+
-
- org.bouncycastle
- bcmail-jdk15on
- 1.54
-
+
+ org.apache.commons
+ commons-csv
+ 1.11.0
+
-
- junit
- junit
- 4.11
- test
-
+
+ com.google.code.gson
+ gson
+ 2.11.0
+
-
- commons-cli
- commons-cli
- 1.3.1
-
+
+ com.github.jai-imageio
+ jai-imageio-core
+ 1.4.0
+
-
- org.apache.commons
- commons-csv
- 1.2
-
+
+ com.github.jai-imageio
+ jai-imageio-jpeg2000
+ 1.4.0
+
-
- com.google.code.gson
- gson
- 2.6.2
-
-
+
+ org.apache.pdfbox
+ jbig2-imageio
+ 3.0.4
+
+
diff --git a/src/main/java/technology/tabula/Cell.java b/src/main/java/technology/tabula/Cell.java
index b7e568db..d02c8c50 100644
--- a/src/main/java/technology/tabula/Cell.java
+++ b/src/main/java/technology/tabula/Cell.java
@@ -1,75 +1,62 @@
package technology.tabula;
import java.awt.geom.Point2D;
-import java.util.ArrayList;
import java.util.Collections;
-import java.util.List;
@SuppressWarnings("serial")
public class Cell extends RectangularTextContainer {
- private boolean spanning;
- private boolean placeholder;
- private List textElements;
-
- public Cell(float top, float left, float width, float height) {
- super(top, left, width, height);
- this.setPlaceholder(false);
- this.setSpanning(false);
- this.setTextElements(new ArrayList());
- }
-
- public Cell(Point2D topLeft, Point2D bottomRight) {
- super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()), (float) (bottomRight.getY() - topLeft.getY()));
- this.setPlaceholder(false);
- this.setSpanning(false);
- this.setTextElements(new ArrayList());
- }
-
- @Override
- public String getText(boolean useLineReturns) {
- if (this.textElements.size() == 0) {
- return "";
- }
- StringBuilder sb = new StringBuilder();
- Collections.sort(this.textElements);
- double curTop = this.textElements.get(0).getTop();
- for (TextChunk tc: this.textElements) {
- if (useLineReturns && tc.getTop() > curTop) {
- sb.append('\r');
- }
- sb.append(tc.getText());
- curTop = tc.getTop();
- }
- return sb.toString().trim();
- }
-
- public String getText() {
- return getText(true);
- }
-
- public boolean isSpanning() {
- return spanning;
- }
-
- public void setSpanning(boolean spanning) {
- this.spanning = spanning;
- }
-
- public boolean isPlaceholder() {
- return placeholder;
- }
-
- public void setPlaceholder(boolean placeholder) {
- this.placeholder = placeholder;
- }
-
-
- public List getTextElements() {
- return textElements;
- }
-
- public void setTextElements(List textElements) {
- this.textElements = textElements;
- }
+ public Cell(float top, float left, float width, float height) {
+ super(top, left, width, height);
+ this.setPlaceholder(false);
+ this.setSpanning(false);
+ }
+
+ public Cell(Point2D topLeft, Point2D bottomRight) {
+ super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()), (float) (bottomRight.getY() - topLeft.getY()));
+ this.setPlaceholder(false);
+ this.setSpanning(false);
+ }
+
+ private boolean spanning;
+ private boolean placeholder;
+
+ @Override
+ public String getText(boolean useLineReturns) {
+ if (this.textElements.size() == 0) {
+ return "";
+ }
+ StringBuilder sb = new StringBuilder();
+ this.textElements.sort(Rectangle.ILL_DEFINED_ORDER);
+ double curTop = this.textElements.get(0).getTop();
+ for (TextChunk tc : this.textElements) {
+ if (useLineReturns && tc.getTop() > curTop) {
+ sb.append('\r');
+ }
+ sb.append(tc.getText());
+ curTop = tc.getTop();
+ }
+ return sb.toString().trim();
+ }
+
+ @Override
+ public String getText() {
+ return getText(true);
+ }
+
+ public boolean isSpanning() {
+ return spanning;
+ }
+
+ public void setSpanning(boolean spanning) {
+ this.spanning = spanning;
+ }
+
+ public boolean isPlaceholder() {
+ return placeholder;
+ }
+
+ public void setPlaceholder(boolean placeholder) {
+ this.placeholder = placeholder;
+ }
}
diff --git a/src/main/java/technology/tabula/CohenSutherlandClipping.java b/src/main/java/technology/tabula/CohenSutherlandClipping.java
index 5e170ad8..db9153e9 100644
--- a/src/main/java/technology/tabula/CohenSutherlandClipping.java
+++ b/src/main/java/technology/tabula/CohenSutherlandClipping.java
@@ -18,122 +18,124 @@
* Implements the well known Cohen Sutherland line
* clipping algorithm (line against clip rectangle).
*/
-public final class CohenSutherlandClipping
-{
+public final class CohenSutherlandClipping {
+
private double xMin;
private double yMin;
private double xMax;
private double yMax;
+ private static final int INSIDE = 0;
+ private static final int LEFT = 1;
+ private static final int RIGHT = 2;
+ private static final int BOTTOM = 4;
+ private static final int TOP = 8;
+
+ private final static float MINIMUM_DELTA = 0.01f;
+
/**
- * Creates a Cohen Sutherland clipper with clip rect (0, 0, 0, 0).
+ * Creates a Cohen Sutherland clipper with clip window (0, 0, 0, 0).
*/
- public CohenSutherlandClipping() {
- }
+ public CohenSutherlandClipping() {}
/**
- * Creates a Cohen Sutherland clipper with the given clip rectangle.
- * @param clip the clip rectangle to use
+ * Creates a Cohen Sutherland clipper with the given clip window.
+ * @param clipWindow the clip window to use.
*/
- public CohenSutherlandClipping(Rectangle2D clip) {
- setClip(clip);
+ public CohenSutherlandClipping(Rectangle2D clipWindow) {
+ setClip(clipWindow);
}
/**
* Sets the clip rectangle.
- * @param clip the clip rectangle
+ * @param clipWindow the clip window.
*/
- public void setClip(Rectangle2D clip) {
- xMin = clip.getX();
- xMax = xMin + clip.getWidth();
- yMin = clip.getY();
- yMax = yMin + clip.getHeight();
- }
-
- private static final int INSIDE = 0;
- private static final int LEFT = 1;
- private static final int RIGHT = 2;
- private static final int BOTTOM = 4;
- private static final int TOP = 8;
-
- private final int regionCode(double x, double y) {
- int code = x < xMin
- ? LEFT
- : x > xMax
- ? RIGHT
- : INSIDE;
- if (y < yMin) code |= BOTTOM;
- else if (y > yMax) code |= TOP;
- return code;
+ public void setClip(Rectangle2D clipWindow) {
+ xMin = clipWindow.getX();
+ xMax = xMin + clipWindow.getWidth();
+ yMin = clipWindow.getY();
+ yMax = yMin + clipWindow.getHeight();
}
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
/**
- * Clips a given line against the clip rectangle.
+ * Clips a given line against the clip window.
* The modification (if needed) is done in place.
- * @param line the line to clip
+ * @param line the line to clip.
* @return true if line is clipped, false if line is
- * totally outside the clip rect.
+ * totally outside the clip window.
*/
public boolean clip(Line2D.Float line) {
+ Point point1 = new Point(line.getX1(), line.getY1());
+ Point point2 = new Point(line.getX2(), line.getY2());
+ Point outsidePoint = new Point(0d, 0d);
- double p1x = line.getX1();
- double p1y = line.getY1();
- double p2x = line.getX2();
- double p2y = line.getY2();
+ boolean lineIsVertical = (point1.x == point2.x);
+ double lineSlope = lineIsVertical ? 0d : (point2.y-point1.y)/(point2.x-point1.x);
- double qx = 0d;
- double qy = 0d;
+ while (point1.region != INSIDE || point2.region != INSIDE) {
+ if ((point1.region & point2.region) != 0) return false;
- boolean vertical = p1x == p2x;
+ outsidePoint.region = (point1.region == INSIDE) ? point2.region : point1.region;
- double slope = vertical
- ? 0d
- : (p2y-p1y)/(p2x-p1x);
-
- int c1 = regionCode(p1x, p1y);
- int c2 = regionCode(p2x, p2y);
-
- while (c1 != INSIDE || c2 != INSIDE) {
-
- if ((c1 & c2) != INSIDE)
- return false;
-
- int c = c1 == INSIDE ? c2 : c1;
-
- if ((c & LEFT) != INSIDE) {
- qx = xMin;
- qy = (Utils.feq(qx, p1x) ? 0 : qx-p1x)*slope + p1y;
+ if ((outsidePoint.region & LEFT) != 0) {
+ outsidePoint.x = xMin;
+ outsidePoint.y = delta(outsidePoint.x, point1.x)*lineSlope + point1.y;
}
- else if ((c & RIGHT) != INSIDE) {
- qx = xMax;
- qy = (Utils.feq(qx, p1x) ? 0 : qx-p1x)*slope + p1y;
+ else if ((outsidePoint.region & RIGHT) != 0) {
+ outsidePoint.x = xMax;
+ outsidePoint.y = delta(outsidePoint.x, point1.x)*lineSlope + point1.y;
}
- else if ((c & BOTTOM) != INSIDE) {
- qy = yMin;
- qx = vertical
- ? p1x
- : (Utils.feq(qy, p1y) ? 0 : qy-p1y)/slope + p1x;
+ else if ((outsidePoint.region & BOTTOM) != 0) {
+ outsidePoint.y = yMin;
+ outsidePoint.x = lineIsVertical
+ ? point1.x
+ : delta(outsidePoint.y, point1.y)/lineSlope + point1.x;
}
- else if ((c & TOP) != INSIDE) {
- qy = yMax;
- qx = vertical
- ? p1x
- : (Utils.feq(qy, p1y) ? 0 : qy-p1y)/slope + p1x;
+ else if ((outsidePoint.region & TOP) != 0) {
+ outsidePoint.y = yMax;
+ outsidePoint.x = lineIsVertical
+ ? point1.x
+ : delta(outsidePoint.y, point1.y)/lineSlope + point1.x;
}
- if (c == c1) {
- p1x = qx;
- p1y = qy;
- c1 = regionCode(p1x, p1y);
+ if (outsidePoint.isInTheSameRegionAs(point1)) {
+ point1.setPositionAndRegion(outsidePoint.x, outsidePoint.y);
}
else {
- p2x = qx;
- p2y = qy;
- c2 = regionCode(p2x, p2y);
+ point2.setPositionAndRegion(outsidePoint.x, outsidePoint.y);
}
}
- line.setLine(p1x, p1y, p2x, p2y);
+ line.setLine(point1.x, point1.y, point2.x, point2.y);
return true;
}
+
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
+ private static double delta(double value1, double value2) {
+ return (Math.abs(value1 - value2) < MINIMUM_DELTA) ? 0 : (value1 - value2);
+ }
+
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
+ class Point {
+ double x, y;
+ int region;
+
+ Point(double x, double y) {
+ setPositionAndRegion(x, y);
+ }
+
+ void setPositionAndRegion(double x, double y) {
+ this.x = x; this.y = y;
+ region = (x < xMin) ? LEFT : (x > xMax) ? RIGHT : INSIDE;
+ if (y < yMin)
+ region |= BOTTOM;
+ else if (y > yMax)
+ region |= TOP;
+ }
+
+ boolean isInTheSameRegionAs(Point otherPoint) {
+ return this.region == otherPoint.region;
+ }
+ }
+
}
-// end of file
\ No newline at end of file
diff --git a/src/main/java/technology/tabula/CommandLineApp.java b/src/main/java/technology/tabula/CommandLineApp.java
index 940bb4d4..1b422303 100644
--- a/src/main/java/technology/tabula/CommandLineApp.java
+++ b/src/main/java/technology/tabula/CommandLineApp.java
@@ -1,27 +1,25 @@
package technology.tabula;
-import java.awt.geom.Point2D;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FilenameFilter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
-import java.util.Iterator;
import java.util.List;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
-import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
-import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.DefaultParser;
+import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import technology.tabula.detectors.DetectionAlgorithm;
import technology.tabula.detectors.NurminenDetectionAlgorithm;
-import technology.tabula.detectors.SpreadsheetDetectionAlgorithm;
import technology.tabula.extractors.BasicExtractionAlgorithm;
import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
import technology.tabula.writers.CSVWriter;
@@ -32,12 +30,17 @@
public class CommandLineApp {
- private static String VERSION = "0.9.1";
- private static String VERSION_STRING = String.format("tabula %s (c) 2012-2016 Manuel Aristarán", VERSION);
+ private static String VERSION = "1.0.6-SNAPSHOT";
+ private static String VERSION_STRING = String.format("tabula %s (c) 2012-2020 Manuel Aristarán", VERSION);
private static String BANNER = "\nTabula helps you extract tables from PDFs\n\n";
+ private static final int RELATIVE_AREA_CALCULATION_MODE = 0;
+ private static final int ABSOLUTE_AREA_CALCULATION_MODE = 1;
+
+
private Appendable defaultOutput;
- private Rectangle pageArea;
+
+ private List> pageAreas;
private List pages;
private OutputFormat outputFormat;
private String password;
@@ -45,21 +48,21 @@ public class CommandLineApp {
public CommandLineApp(Appendable defaultOutput, CommandLine line) throws ParseException {
this.defaultOutput = defaultOutput;
- this.pageArea = CommandLineApp.whichArea(line);
+ this.pageAreas = CommandLineApp.whichAreas(line);
this.pages = CommandLineApp.whichPages(line);
this.outputFormat = CommandLineApp.whichOutputFormat(line);
this.tableExtractor = CommandLineApp.createExtractor(line);
if (line.hasOption('s')) {
- this.password = line.getOptionValue('s');
+ this.password = line.getOptionValue('s');
}
}
public static void main(String[] args) {
- CommandLineParser parser = new GnuParser();
+ CommandLineParser parser = new DefaultParser();
try {
// parse the command line arguments
- CommandLine line = parser.parse(buildOptions(), args );
+ CommandLine line = parser.parse(buildOptions(), args);
if (line.hasOption('h')) {
printHelp();
@@ -72,7 +75,7 @@ public static void main(String[] args) {
}
new CommandLineApp(System.out, line).extractTables(line);
- } catch(ParseException exp) {
+ } catch (ParseException exp) {
System.err.println("Error: " + exp.getMessage());
System.exit(1);
}
@@ -81,20 +84,20 @@ public static void main(String[] args) {
public void extractTables(CommandLine line) throws ParseException {
if (line.hasOption('b')) {
- if (line.getArgs().length != 0) {
- throw new ParseException("Filename specified with batch\nTry --help for help");
- }
+ if (line.getArgs().length != 0) {
+ throw new ParseException("Filename specified with batch\nTry --help for help");
+ }
- File pdfDirectory = new File(line.getOptionValue('b'));
- if (!pdfDirectory.isDirectory()) {
- throw new ParseException("Directory does not exist or is not a directory");
- }
- extractDirectoryTables(line, pdfDirectory);
- return;
+ File pdfDirectory = new File(line.getOptionValue('b'));
+ if (!pdfDirectory.isDirectory()) {
+ throw new ParseException("Directory does not exist or is not a directory");
+ }
+ extractDirectoryTables(line, pdfDirectory);
+ return;
}
if (line.getArgs().length != 1) {
- throw new ParseException("Need one filename\nTry --help for help");
+ throw new ParseException("Need exactly one filename\nTry --help for help");
}
File pdfFile = new File(line.getArgs()[0]);
@@ -106,22 +109,26 @@ public void extractTables(CommandLine line) throws ParseException {
public void extractDirectoryTables(CommandLine line, File pdfDirectory) throws ParseException {
File[] pdfs = pdfDirectory.listFiles(new FilenameFilter() {
- public boolean accept(File dir, String name) {
- return name.endsWith(".pdf");
- }
+ public boolean accept(File dir, String name) {
+ return name.endsWith(".pdf");
+ }
});
for (File pdfFile : pdfs) {
File outputFile = new File(getOutputFilename(pdfFile));
- extractFileInto(pdfFile, outputFile);
+ try {
+ extractFileInto(pdfFile, outputFile);
+ } catch (ParseException e) {
+ System.err.println("Caught exception while processing file: " + pdfFile.toString());
+ throw e;
+ }
}
}
public void extractFileTables(CommandLine line, File pdfFile) throws ParseException {
- Appendable outFile = this.defaultOutput;
if (!line.hasOption('o')) {
- extractFile(pdfFile, this.defaultOutput);
- return;
+ extractFile(pdfFile, this.defaultOutput);
+ return;
}
File outputFile = new File(line.getOptionValue('o'));
@@ -152,41 +159,52 @@ public void extractFileInto(File pdfFile, File outputFile) throws ParseException
private void extractFile(File pdfFile, Appendable outFile) throws ParseException {
PDDocument pdfDocument = null;
try {
- pdfDocument = PDDocument.load(pdfFile);
+ pdfDocument = this.password == null ? Loader.loadPDF(pdfFile) : Loader.loadPDF(pdfFile,password);
PageIterator pageIterator = getPageIterator(pdfDocument);
- List tables = new ArrayList();
+ List tables = new ArrayList<>();
while (pageIterator.hasNext()) {
Page page = pageIterator.next();
- if (pageArea != null) {
- page = page.getArea(pageArea);
+ if (tableExtractor.verticalRulingPositions != null) {
+ for (Float verticalRulingPosition : tableExtractor.verticalRulingPositions) {
+ page.addRuling(new Ruling(0, verticalRulingPosition, 0.0f, (float) page.getHeight()));
+ }
}
- tables.addAll(tableExtractor.extractTables(page));
+ if (pageAreas != null) {
+ for (Pair areaPair : pageAreas) {
+ Rectangle area = areaPair.getRight();
+ if (areaPair.getLeft() == RELATIVE_AREA_CALCULATION_MODE) {
+ area = new Rectangle((float) (area.getTop() / 100 * page.getHeight()),
+ (float) (area.getLeft() / 100 * page.getWidth()), (float) (area.getWidth() / 100 * page.getWidth()),
+ (float) (area.getHeight() / 100 * page.getHeight()));
+ }
+ tables.addAll(tableExtractor.extractTables(page.getArea(area)));
+ }
+ } else {
+ tables.addAll(tableExtractor.extractTables(page));
+ }
}
writeTables(tables, outFile);
} catch (IOException e) {
throw new ParseException(e.getMessage());
} finally {
- try {
- if (pdfDocument != null) {
- pdfDocument.close();
- }
- } catch (IOException e) {
- System.out.println("Error in closing pdf document" + e);
- }
+ try {
+ if (pdfDocument != null) {
+ pdfDocument.close();
+ }
+ } catch (IOException e) {
+ System.out.println("Error in closing pdf document" + e);
+ }
}
}
private PageIterator getPageIterator(PDDocument pdfDocument) throws IOException {
- ObjectExtractor extractor = (this.password == null) ?
- new ObjectExtractor(pdfDocument) :
- new ObjectExtractor(pdfDocument, this.password);
- PageIterator pageIterator = (pages == null) ?
- extractor.extract() :
- extractor.extract(pages);
- return pageIterator;
+ ObjectExtractor extractor = new ObjectExtractor(pdfDocument);
+ return (pages == null) ?
+ extractor.extract() :
+ extractor.extract(pages);
}
// CommandLine parsing methods
@@ -206,16 +224,28 @@ private static OutputFormat whichOutputFormat(CommandLine line) throws ParseExce
}
}
- private static Rectangle whichArea(CommandLine line) throws ParseException {
+ private static List> whichAreas(CommandLine line) throws ParseException {
if (!line.hasOption('a')) {
- return null;
+ return null;
}
- List f = parseFloatList(line.getOptionValue('a'));
- if (f.size() != 4) {
- throw new ParseException("area parameters must be top,left,bottom,right");
+ String[] optionValues = line.getOptionValues('a');
+
+ List> areaList = new ArrayList>();
+ for (String optionValue : optionValues) {
+ int areaCalculationMode = ABSOLUTE_AREA_CALCULATION_MODE;
+ int startIndex = 0;
+ if (optionValue.startsWith("%")) {
+ startIndex = 1;
+ areaCalculationMode = RELATIVE_AREA_CALCULATION_MODE;
+ }
+ List f = parseFloatList(optionValue.substring(startIndex));
+ if (f.size() != 4) {
+ throw new ParseException("area parameters must be top,left,bottom,right optionally preceded by %");
+ }
+ areaList.add(new Pair(areaCalculationMode, new Rectangle(f.get(0), f.get(1), f.get(3) - f.get(1), f.get(2) - f.get(0))));
}
- return new Rectangle(f.get(0), f.get(1), f.get(3) - f.get(1), f.get(2) - f.get(0));
+ return areaList;
}
private static List whichPages(CommandLine line) throws ParseException {
@@ -224,36 +254,44 @@ private static List whichPages(CommandLine line) throws ParseException
}
private static ExtractionMethod whichExtractionMethod(CommandLine line) {
- if (line.hasOption('r')) {
+ // -r/--spreadsheet [deprecated; use -l] or -l/--lattice
+ if (line.hasOption('r') || line.hasOption('l')) {
return ExtractionMethod.SPREADSHEET;
}
- if (line.hasOption('n') || line.hasOption('c') || line.hasOption('g')) {
+ // -n/--no-spreadsheet [deprecated; use -t] or -c/--columns or -t/--stream
+ if (line.hasOption('n') || line.hasOption('c') || line.hasOption('t')) {
return ExtractionMethod.BASIC;
}
return ExtractionMethod.DECIDE;
}
private static TableExtractor createExtractor(CommandLine line) throws ParseException {
- TableExtractor extractor = new TableExtractor();
- extractor.setGuess(line.hasOption('g'));
- extractor.setMethod(CommandLineApp.whichExtractionMethod(line));
- extractor.setUseLineReturns(line.hasOption('u'));
-
- if (line.hasOption('c')) {
- extractor.setVerticalRulingPositions(parseFloatList(line.getOptionValue('c')));
- }
- return extractor;
+ TableExtractor extractor = new TableExtractor();
+ extractor.setGuess(line.hasOption('g'));
+ extractor.setMethod(CommandLineApp.whichExtractionMethod(line));
+ extractor.setUseLineReturns(line.hasOption('u'));
+
+ if (line.hasOption('c')) {
+ String optionString = line.getOptionValue('c');
+ if (optionString.startsWith("%")) {
+ extractor.setVerticalRulingPositionsRelative(true);
+ optionString = optionString.substring(1);
+ }
+ extractor.setVerticalRulingPositions(parseFloatList(optionString));
+ }
+
+ return extractor;
}
// utilities, etc.
public static List parseFloatList(String option) throws ParseException {
String[] f = option.split(",");
- List rv = new ArrayList();
+ List rv = new ArrayList<>();
try {
- for (int i = 0; i < f.length; i++) {
- rv.add(Float.parseFloat(f[i]));
+ for (final String element : f) {
+ rv.add(Float.parseFloat(element));
}
return rv;
} catch (NumberFormatException e) {
@@ -266,141 +304,173 @@ private static void printHelp() {
formatter.printHelp("tabula", BANNER, buildOptions(), "", true);
}
- @SuppressWarnings("static-access")
public static Options buildOptions() {
Options o = new Options();
o.addOption("v", "version", false, "Print version and exit.");
o.addOption("h", "help", false, "Print this help text.");
o.addOption("g", "guess", false, "Guess the portion of the page to analyze per page.");
- o.addOption("d", "debug", false, "Print detected table areas instead of processing");
- o.addOption("r", "spreadsheet", false, "Force PDF to be extracted using spreadsheet-style extraction (if there are ruling lines separating each cell, as in a PDF of an Excel spreadsheet)");
- o.addOption("n", "no-spreadsheet", false, "Force PDF not to be extracted using spreadsheet-style extraction (if there are ruling lines separating each cell, as in a PDF of an Excel spreadsheet)");
+ o.addOption("r", "spreadsheet", false, "[Deprecated in favor of -l/--lattice] Force PDF to be extracted using spreadsheet-style extraction (if there are ruling lines separating each cell, as in a PDF of an Excel spreadsheet)");
+ o.addOption("n", "no-spreadsheet", false, "[Deprecated in favor of -t/--stream] Force PDF not to be extracted using spreadsheet-style extraction (if there are no ruling lines separating each cell)");
+ o.addOption("l", "lattice", false, "Force PDF to be extracted using lattice-mode extraction (if there are ruling lines separating each cell, as in a PDF of an Excel spreadsheet)");
+ o.addOption("t", "stream", false, "Force PDF to be extracted using stream-mode extraction (if there are no ruling lines separating each cell)");
o.addOption("i", "silent", false, "Suppress all stderr output.");
o.addOption("u", "use-line-returns", false, "Use embedded line returns in cells. (Only in spreadsheet mode.)");
- o.addOption("d", "debug", false, "Print detected table areas instead of processing.");
- o.addOption(OptionBuilder.withLongOpt("batch")
- .withDescription("Convert all .pdfs in the provided directory.")
- .hasArg()
- .withArgName("DIRECTORY")
- .create("b"));
- o.addOption(OptionBuilder.withLongOpt("outfile")
- .withDescription("Write output to instead of STDOUT. Default: -")
- .hasArg()
- .withArgName("OUTFILE")
- .create("o"));
- o.addOption(OptionBuilder.withLongOpt("format")
- .withDescription("Output format: (" + Utils.join(",", OutputFormat.formatNames()) + "). Default: CSV")
- .hasArg()
- .withArgName("FORMAT")
- .create("f"));
- o.addOption(OptionBuilder.withLongOpt("password")
- .withDescription("Password to decrypt document. Default is empty")
- .hasArg()
- .withArgName("PASSWORD")
- .create("s"));
- o.addOption(OptionBuilder.withLongOpt("columns")
- .withDescription("X coordinates of column boundaries. Example --columns 10.1,20.2,30.3")
- .hasArg()
- .withArgName("COLUMNS")
- .create("c"));
- o.addOption(OptionBuilder.withLongOpt("area")
- .withDescription("Portion of the page to analyze (top,left,bottom,right). Example: --area 269.875,12.75,790.5,561. Default is entire page")
- .hasArg()
- .withArgName("AREA")
- .create("a"));
- o.addOption(OptionBuilder.withLongOpt("pages")
- .withDescription("Comma separated list of ranges, or all. Examples: --pages 1-3,5-7, --pages 3 or --pages all. Default is --pages 1")
- .hasArg()
- .withArgName("PAGES")
- .create("p"));
+ // o.addOption("d", "debug", false, "Print detected table areas instead of processing.");
+ o.addOption(Option.builder("b")
+ .longOpt("batch")
+ .desc("Convert all .pdfs in the provided directory.")
+ .hasArg()
+ .argName("DIRECTORY")
+ .build());
+ o.addOption(Option.builder("o")
+ .longOpt("outfile")
+ .desc("Write output to <file> instead of STDOUT. Default: -")
+ .hasArg()
+ .argName("OUTFILE")
+ .build());
+ o.addOption(Option.builder("f")
+ .longOpt("format")
+ .desc("Output format: (" + Utils.join(",", OutputFormat.formatNames()) + "). Default: CSV")
+ .hasArg()
+ .argName("FORMAT")
+ .build());
+ o.addOption(Option.builder("s")
+ .longOpt("password")
+ .desc("Password to decrypt document. Default is empty")
+ .hasArg()
+ .argName("PASSWORD")
+ .build());
+ o.addOption(Option.builder("c")
+ .longOpt("columns")
+ .desc("X coordinates of column boundaries. Example --columns 10.1,20.2,30.3. "
+ + "If all values are between 0-100 (inclusive) and preceded by '%', input will be taken as % of actual width of the page. "
+ + "Example: --columns %25,50,80.6")
+ .hasArg()
+ .argName("COLUMNS")
+ .build());
+ o.addOption(Option.builder("a")
+ .longOpt("area")
+ .desc("-a/--area = Portion of the page to analyze. Example: --area 269.875,12.75,790.5,561. "
+ + "Accepts top,left,bottom,right i.e. y1,x1,y2,x2 where all values are in points relative to the top left corner. "
+ + "If all values are between 0-100 (inclusive) and preceded by '%', input will be taken as % of actual height or width of the page. "
+ + "Example: --area %0,0,100,50. To specify multiple areas, -a option should be repeated. Default is entire page")
+ .hasArg()
+ .argName("AREA")
+ .build());
+ o.addOption(Option.builder("p")
+ .longOpt("pages")
+ .desc("Comma separated list of ranges, or all. Examples: --pages 1-3,5-7, --pages 3 or --pages all. Default is --pages 1")
+ .hasArg()
+ .argName("PAGES")
+ .build());
return o;
}
private static class TableExtractor {
- private boolean guess = false;
- private boolean useLineReturns = false;
- private BasicExtractionAlgorithm basicExtractor = new BasicExtractionAlgorithm();
- private SpreadsheetExtractionAlgorithm spreadsheetExtractor = new SpreadsheetExtractionAlgorithm();
- private List verticalRulingPositions = null;
- private ExtractionMethod method = ExtractionMethod.BASIC;
-
- public TableExtractor() {
- }
-
- public void setVerticalRulingPositions(List positions) {
- this.verticalRulingPositions = positions;
- }
-
- public void setGuess(boolean guess) {
- this.guess = guess;
- }
-
- public void setUseLineReturns(boolean useLineReturns) {
- this.useLineReturns = useLineReturns;
- }
-
- public void setMethod(ExtractionMethod method) {
- this.method = method;
- }
-
- public List extractTables(Page page) {
- ExtractionMethod effectiveMethod = this.method;
- if (effectiveMethod == ExtractionMethod.DECIDE) {
- effectiveMethod = spreadsheetExtractor.isTabular(page) ?
- ExtractionMethod.SPREADSHEET :
- ExtractionMethod.BASIC;
- }
- switch(effectiveMethod) {
- case BASIC:
- return extractTablesBasic(page);
- case SPREADSHEET:
- return extractTablesSpreadsheet(page);
- default:
- return new ArrayList();
- }
- }
-
- public List extractTablesBasic(Page page) {
- if (guess) {
- // guess the page areas to extract using a detection algorithm
- // currently we only have a detector that uses spreadsheets to find table areas
- DetectionAlgorithm detector = new NurminenDetectionAlgorithm();
- List guesses = detector.detect(page);
- List tables = new ArrayList();
-
- for (Rectangle guessRect : guesses) {
- Page guess = page.getArea(guessRect);
- tables.addAll(basicExtractor.extract(guess));
- }
- return tables;
+ private boolean guess = false;
+ private boolean useLineReturns = false;
+ private BasicExtractionAlgorithm basicExtractor = new BasicExtractionAlgorithm();
+ private SpreadsheetExtractionAlgorithm spreadsheetExtractor = new SpreadsheetExtractionAlgorithm();
+
+ private boolean verticalRulingPositionsRelative = false;
+ private List<Float> verticalRulingPositions = null;
+
+ private ExtractionMethod method = ExtractionMethod.BASIC;
+
+ public TableExtractor() {
+ }
+
+ public void setVerticalRulingPositions(List positions) {
+ this.verticalRulingPositions = positions;
+ }
+
+ public void setVerticalRulingPositionsRelative(boolean relative) {
+ this.verticalRulingPositionsRelative = relative;
+ }
+
+ public void setGuess(boolean guess) {
+ this.guess = guess;
+ }
+
+ public void setUseLineReturns(boolean useLineReturns) {
+ this.useLineReturns = useLineReturns;
+ }
+
+ public void setMethod(ExtractionMethod method) {
+ this.method = method;
+ }
+
+ public List extractTables(Page page) {
+ ExtractionMethod effectiveMethod = this.method;
+ if (effectiveMethod == ExtractionMethod.DECIDE) {
+ effectiveMethod = spreadsheetExtractor.isTabular(page) ?
+ ExtractionMethod.SPREADSHEET :
+ ExtractionMethod.BASIC;
+ }
+ switch (effectiveMethod) {
+ case BASIC:
+ return extractTablesBasic(page);
+ case SPREADSHEET:
+ return extractTablesSpreadsheet(page);
+ default:
+ return new ArrayList<>();
+ }
}
- if (verticalRulingPositions != null) {
- return basicExtractor.extract(page, verticalRulingPositions);
+ public List extractTablesBasic(Page page) {
+ if (guess) {
+ // guess the page areas to extract using a detection algorithm
+ // currently we only have a detector that uses spreadsheets to find table areas
+ DetectionAlgorithm detector = new NurminenDetectionAlgorithm();
+ List<Rectangle> guesses = detector.detect(page);
+ List tables = new ArrayList<>();
+
+ for (Rectangle guessRect : guesses) {
+ Page guess = page.getArea(guessRect);
+ tables.addAll(basicExtractor.extract(guess));
+ }
+ return tables;
+ }
+
+ if (verticalRulingPositions != null) {
+ List absoluteRulingPositions;
+
+ if (this.verticalRulingPositionsRelative) {
+ // convert relative to absolute
+ absoluteRulingPositions = new ArrayList<>(verticalRulingPositions.size());
+ for (float relative : this.verticalRulingPositions) {
+ float absolute = (float) (relative / 100.0 * page.getWidth());
+ absoluteRulingPositions.add(absolute);
+ }
+ } else {
+ absoluteRulingPositions = this.verticalRulingPositions;
+ }
+ return basicExtractor.extract(page, absoluteRulingPositions);
+ }
+
+ return basicExtractor.extract(page);
}
- return basicExtractor.extract(page);
- }
- public List extractTablesSpreadsheet(Page page) {
- // TODO add useLineReturns
- return (List)spreadsheetExtractor.extract(page);
- }
+ public List extractTablesSpreadsheet(Page page) {
+ // TODO add useLineReturns
+ return spreadsheetExtractor.extract(page);
+ }
}
private void writeTables(List tables, Appendable out) throws IOException {
Writer writer = null;
switch (outputFormat) {
- case CSV:
- writer = new CSVWriter();
- break;
- case JSON:
- writer = new JSONWriter();
- break;
- case TSV:
- writer = new TSVWriter();
- break;
+ case CSV:
+ writer = new CSVWriter();
+ break;
+ case JSON:
+ writer = new JSONWriter();
+ break;
+ case TSV:
+ writer = new TSVWriter();
+ break;
}
writer.write(out, tables);
}
@@ -408,15 +478,15 @@ private void writeTables(List tables, Appendable out) throws IOException
private String getOutputFilename(File pdfFile) {
String extension = ".csv";
switch (outputFormat) {
- case CSV:
- extension = ".csv";
- break;
- case JSON:
- extension = ".json";
- break;
- case TSV:
- extension = ".tsv";
- break;
+ case CSV:
+ extension = ".csv";
+ break;
+ case JSON:
+ extension = ".json";
+ break;
+ case TSV:
+ extension = ".tsv";
+ break;
}
return pdfFile.getPath().replaceFirst("(\\.pdf|)$", extension);
}
diff --git a/src/main/java/technology/tabula/DummyGraphics2D.java b/src/main/java/technology/tabula/DummyGraphics2D.java
deleted file mode 100644
index 88026fec..00000000
--- a/src/main/java/technology/tabula/DummyGraphics2D.java
+++ /dev/null
@@ -1,461 +0,0 @@
-package technology.tabula;
-
-import java.awt.Color;
-import java.awt.Composite;
-import java.awt.Font;
-import java.awt.FontMetrics;
-import java.awt.Graphics;
-import java.awt.Graphics2D;
-import java.awt.GraphicsConfiguration;
-import java.awt.Image;
-import java.awt.Paint;
-import java.awt.Rectangle;
-import java.awt.RenderingHints;
-import java.awt.RenderingHints.Key;
-import java.awt.Shape;
-import java.awt.Stroke;
-import java.awt.font.FontRenderContext;
-import java.awt.font.GlyphVector;
-import java.awt.geom.AffineTransform;
-import java.awt.image.BufferedImage;
-import java.awt.image.BufferedImageOp;
-import java.awt.image.ImageObserver;
-import java.awt.image.RenderedImage;
-import java.awt.image.renderable.RenderableImage;
-import java.text.AttributedCharacterIterator;
-import java.util.Map;
-
-public class DummyGraphics2D extends Graphics2D {
-
- @Override
- public void addRenderingHints(Map<?, ?> hints) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void clip(Shape s) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void draw(Shape s) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void drawGlyphVector(GlyphVector g, float x, float y) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public boolean drawImage(Image img, AffineTransform xform, ImageObserver obs) {
- // TODO Auto-generated method stub
- return false;
- }
-
- @Override
- public void drawImage(BufferedImage img, BufferedImageOp op, int x, int y) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void drawRenderableImage(RenderableImage img, AffineTransform xform) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void drawRenderedImage(RenderedImage img, AffineTransform xform) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void drawString(String str, int x, int y) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void drawString(String str, float x, float y) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void drawString(AttributedCharacterIterator iterator, int x, int y) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void drawString(AttributedCharacterIterator iterator, float x,
- float y) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void fill(Shape s) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public Color getBackground() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public Composite getComposite() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public GraphicsConfiguration getDeviceConfiguration() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public FontRenderContext getFontRenderContext() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public Paint getPaint() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public Object getRenderingHint(Key hintKey) {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public RenderingHints getRenderingHints() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public Stroke getStroke() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public AffineTransform getTransform() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public boolean hit(Rectangle rect, Shape s, boolean onStroke) {
- // TODO Auto-generated method stub
- return false;
- }
-
- @Override
- public void rotate(double theta) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void rotate(double theta, double x, double y) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void scale(double sx, double sy) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void setBackground(Color color) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void setComposite(Composite comp) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void setPaint(Paint paint) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void setRenderingHint(Key hintKey, Object hintValue) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void setRenderingHints(Map<?, ?> hints) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void setStroke(Stroke s) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void setTransform(AffineTransform Tx) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void shear(double shx, double shy) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void transform(AffineTransform Tx) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void translate(int x, int y) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void translate(double tx, double ty) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void clearRect(int x, int y, int width, int height) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void clipRect(int x, int y, int width, int height) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void copyArea(int x, int y, int width, int height, int dx, int dy) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public Graphics create() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public void dispose() {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void drawArc(int x, int y, int width, int height, int startAngle,
- int arcAngle) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public boolean drawImage(Image img, int x, int y, ImageObserver observer) {
- // TODO Auto-generated method stub
- return false;
- }
-
- @Override
- public boolean drawImage(Image img, int x, int y, Color bgcolor,
- ImageObserver observer) {
- // TODO Auto-generated method stub
- return false;
- }
-
- @Override
- public boolean drawImage(Image img, int x, int y, int width, int height,
- ImageObserver observer) {
- // TODO Auto-generated method stub
- return false;
- }
-
- @Override
- public boolean drawImage(Image img, int x, int y, int width, int height,
- Color bgcolor, ImageObserver observer) {
- // TODO Auto-generated method stub
- return false;
- }
-
- @Override
- public boolean drawImage(Image img, int dx1, int dy1, int dx2, int dy2,
- int sx1, int sy1, int sx2, int sy2, ImageObserver observer) {
- // TODO Auto-generated method stub
- return false;
- }
-
- @Override
- public boolean drawImage(Image img, int dx1, int dy1, int dx2, int dy2,
- int sx1, int sy1, int sx2, int sy2, Color bgcolor,
- ImageObserver observer) {
- // TODO Auto-generated method stub
- return false;
- }
-
- @Override
- public void drawLine(int x1, int y1, int x2, int y2) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void drawOval(int x, int y, int width, int height) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void drawPolygon(int[] xPoints, int[] yPoints, int nPoints) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void drawPolyline(int[] xPoints, int[] yPoints, int nPoints) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void drawRoundRect(int x, int y, int width, int height,
- int arcWidth, int arcHeight) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void fillArc(int x, int y, int width, int height, int startAngle,
- int arcAngle) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void fillOval(int x, int y, int width, int height) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void fillPolygon(int[] xPoints, int[] yPoints, int nPoints) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void fillRect(int x, int y, int width, int height) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void fillRoundRect(int x, int y, int width, int height,
- int arcWidth, int arcHeight) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public Shape getClip() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public Rectangle getClipBounds() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public Color getColor() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public Font getFont() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public FontMetrics getFontMetrics(Font f) {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public void setClip(Shape clip) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void setClip(int x, int y, int width, int height) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void setColor(Color c) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void setFont(Font font) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void setPaintMode() {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void setXORMode(Color c1) {
- // TODO Auto-generated method stub
-
- }
-
-}
diff --git a/src/main/java/technology/tabula/HasText.java b/src/main/java/technology/tabula/HasText.java
index 6f375dbc..1a9bda99 100644
--- a/src/main/java/technology/tabula/HasText.java
+++ b/src/main/java/technology/tabula/HasText.java
@@ -1,7 +1,8 @@
package technology.tabula;
public interface HasText {
-
- String getText();
+
+ String getText();
+ String getText(boolean useLineReturns);
}
diff --git a/src/main/java/technology/tabula/Line.java b/src/main/java/technology/tabula/Line.java
index ed2f6895..31d10529 100644
--- a/src/main/java/technology/tabula/Line.java
+++ b/src/main/java/technology/tabula/Line.java
@@ -8,7 +8,7 @@
@SuppressWarnings("serial")
public class Line extends Rectangle {
- List textChunks = new ArrayList();
+ List<TextChunk> textChunks = new ArrayList<>();
public static final Character[] WHITE_SPACE_CHARS = { ' ', '\t', '\r', '\n', '\f' };
@@ -52,7 +52,7 @@ public void addTextChunk(TextChunk textChunk) {
public String toString() {
StringBuilder sb = new StringBuilder();
String s = super.toString();
- sb.append(s.substring(0, s.length() - 1));
+ sb.append(s, 0, s.length() - 1);
sb.append(",chunks=");
for (TextChunk te: this.textChunks) {
sb.append("'" + te.getText() + "', ");
diff --git a/src/main/java/technology/tabula/ObjectExtractor.java b/src/main/java/technology/tabula/ObjectExtractor.java
index 0e30e2dd..9f3f6a03 100644
--- a/src/main/java/technology/tabula/ObjectExtractor.java
+++ b/src/main/java/technology/tabula/ObjectExtractor.java
@@ -1,427 +1,73 @@
package technology.tabula;
-import java.awt.Image;
-import java.awt.Shape;
-import java.awt.event.KeyEvent;
-import java.awt.geom.AffineTransform;
-import java.awt.geom.GeneralPath;
-import java.awt.geom.Line2D;
-import java.awt.geom.PathIterator;
-import java.awt.geom.Point2D;
-import java.awt.geom.Rectangle2D;
import java.io.IOException;
-import java.lang.reflect.Field;
-import java.util.ArrayList;
-import java.util.Comparator;
-import java.util.List;
-import org.apache.pdfbox.exceptions.CryptographyException;
-import org.apache.pdfbox.pdfviewer.PageDrawer;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
-import org.apache.pdfbox.pdmodel.common.PDRectangle;
-import org.apache.pdfbox.pdmodel.common.PDStream;
-import org.apache.pdfbox.pdmodel.encryption.BadSecurityHandlerException;
-import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
-import org.apache.pdfbox.pdmodel.font.PDFont;
-import org.apache.pdfbox.pdmodel.font.PDType3Font;
-import org.apache.pdfbox.pdmodel.graphics.PDGraphicsState;
-import org.apache.pdfbox.pdmodel.text.PDTextState;
-import org.apache.pdfbox.util.TextPosition;
-public class ObjectExtractor extends org.apache.pdfbox.pdfviewer.PageDrawer {
+public class ObjectExtractor implements java.io.Closeable {
- private static final char[] spaceLikeChars = { ' ', '-', '1', 'i' };
- private static final String NBSP = "\u00A0";
+ private final PDDocument pdfDocument;
- private float minCharWidth;
- private float minCharHeight;
- private List characters;
- private List rulings;
- private RectangleSpatialIndex spatialIndex;
- private AffineTransform pageTransform;
- public List clippingPaths;
- private boolean debugClippingPaths;
- private boolean extractRulingLines;
- private final PDDocument pdf_document;
- protected List pdf_document_pages;
-
-
- public ObjectExtractor(PDDocument pdf_document) throws IOException {
- this(pdf_document, null, true, false);
- }
-
- public ObjectExtractor(PDDocument pdf_document, boolean debugClippingPaths) throws IOException {
- this(pdf_document, null, true, debugClippingPaths);
- }
-
- public ObjectExtractor(PDDocument pdf_document, String password) throws IOException {
- this(pdf_document, password, true, false);
+ public ObjectExtractor(PDDocument pdfDocument) {
+ this.pdfDocument = pdfDocument;
}
- public ObjectExtractor(PDDocument pdf_document, String password, boolean extractRulingLines, boolean debugClippingPaths)
- throws IOException {
- super();
-
- this.clippingPaths = new ArrayList();
- this.debugClippingPaths = debugClippingPaths;
- this.extractRulingLines = extractRulingLines;
-
- this.initialize();
-
- // patch PageDrawer: dummy Graphics2D context so some drawing operators don't complain
- try {
- Field field = PageDrawer.class.getDeclaredField("graphics");
- field.setAccessible(true);
- field.set(this, new DummyGraphics2D());
- }
- catch (Exception e1) {
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
+ protected Page extractPage(Integer pageNumber) throws IOException {
+ if (pageNumber > pdfDocument.getNumberOfPages() || pageNumber < 1) {
+ throw new java.lang.IndexOutOfBoundsException("Page number does not exist.");
}
-
- if (pdf_document.isEncrypted()) {
- try {
- pdf_document
- .openProtection(new StandardDecryptionMaterial(password));
- } catch (BadSecurityHandlerException e) {
- // TODO Auto-generated catch block
- throw new IOException("BadSecurityHandler");
- } catch (CryptographyException e) {
- throw new IOException("Document is encrypted");
- }
- }
- this.pdf_document = pdf_document;
- this.pdf_document_pages = this.pdf_document.getDocumentCatalog()
- .getAllPages();
+ PDPage page = pdfDocument.getPage(pageNumber - 1);
- }
+ ObjectExtractorStreamEngine streamEngine = new ObjectExtractorStreamEngine(page);
+ streamEngine.processPage(page);
+ TextStripper textStripper = new TextStripper(pdfDocument, pageNumber);
+ textStripper.process();
- protected Page extractPage(Integer page_number) throws IOException {
+ Utils.sort(textStripper.getTextElements(), Rectangle.ILL_DEFINED_ORDER);
- if (page_number > this.pdf_document_pages.size() || page_number < 1) {
- throw new java.lang.IndexOutOfBoundsException(
- "Page number does not exist");
+ float width, height;
+ int rotation = page.getRotation();
+ if (Math.abs(rotation) == 90 || Math.abs(rotation) == 270) {
+ width = page.getCropBox().getHeight();
+ height = page.getCropBox().getWidth();
+ } else {
+ width = page.getCropBox().getWidth();
+ height = page.getCropBox().getHeight();
}
- this.initialize();
- PDPage pdPage = (PDPage) this.pdf_document_pages.get(page_number - 1);
- pdPage = this.drawPage(pdPage);
-
- if(pdPage != null) {
-
- Utils.sort(this.characters);
-
- float w, h;
- int pageRotation = pdPage.findRotation();
- if (Math.abs(pageRotation) == 90 || Math.abs(pageRotation) == 270) {
- w = pdPage.findCropBox().getHeight();
- h = pdPage.findCropBox().getWidth();
- }
- else {
- w = pdPage.findCropBox().getWidth();
- h = pdPage.findCropBox().getHeight();
- }
-
- return new Page(0, 0, w, h, pageRotation, page_number, pdPage, this.characters,
- this.rulings, this.minCharWidth, this.minCharHeight,
- this.spatialIndex);
- }
- return null;//TODO: content is empty, return null? or empty Page? or exception?
+ return Page.Builder.newInstance()
+ .withPageDims(PageDims.of(0, 0, width, height))
+ .withRotation(rotation)
+ .withNumber(pageNumber)
+ .withPdPage(page)
+ .withPdDocument(pdfDocument)
+ .withRulings(streamEngine.rulings)
+ .withTextElements(textStripper.getTextElements())
+ .withMinCharWidth(textStripper.getMinCharWidth())
+ .withMinCharHeight(textStripper.getMinCharHeight())
+ .withIndex(textStripper.getSpatialIndex())
+ .build();
}
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
public PageIterator extract(Iterable pages) {
return new PageIterator(this, pages);
}
public PageIterator extract() {
- return extract(Utils.range(1, this.pdf_document_pages.size() + 1));
+ return extract(Utils.range(1, pdfDocument.getNumberOfPages() + 1));
}
public Page extract(int pageNumber) {
return extract(Utils.range(pageNumber, pageNumber + 1)).next();
}
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
public void close() throws IOException {
- this.pdf_document.close();
- }
-
- private PDPage drawPage(PDPage p) throws IOException {
- this.page = p;
- PDStream contents = p.getContents();
- if (contents != null) {
- ensurePageSize();
- this.processStream(p, p.findResources(), contents.getStream());
- return p;
- }
- return null;
- }
-
- private void ensurePageSize() {
- if (this.pageSize == null && this.page != null) {
- PDRectangle cropBox = this.page.findCropBox();
- this.pageSize = cropBox == null ? null : cropBox
- .createDimension();
- }
- }
-
- private void initialize() {
- this.characters = new ArrayList();
- this.rulings = new ArrayList();
- this.pageTransform = null;
- this.spatialIndex = new RectangleSpatialIndex();
- this.minCharWidth = Float.MAX_VALUE;
- this.minCharHeight = Float.MAX_VALUE;
- }
-
- @Override
- public void drawImage(Image awtImage, AffineTransform at) {
- // we just ignore images (for now)
- }
-
- public void strokeOrFillPath(boolean isFill) {
- GeneralPath path = this.getLinePath();
-
- if (!this.extractRulingLines) {
- this.getLinePath().reset();
- return;
- }
-
- PathIterator pi = path.getPathIterator(this.getPageTransform());
- float[] c = new float[6];
- int currentSegment;
-
- // skip paths whose first operation is not a MOVETO
- // or contains operations other than LINETO, MOVETO or CLOSE
- if ((pi.currentSegment(c) != PathIterator.SEG_MOVETO)) {
- path.reset();
- return;
- }
- pi.next();
- while (!pi.isDone()) {
- currentSegment = pi.currentSegment(c);
- if (currentSegment != PathIterator.SEG_LINETO
- && currentSegment != PathIterator.SEG_CLOSE
- && currentSegment != PathIterator.SEG_MOVETO) {
- path.reset();
- return;
- }
- pi.next();
- }
-
- // TODO: how to implement color filter?
-
- // skip the first path operation and save it as the starting position
- float[] first = new float[6];
- pi = path.getPathIterator(this.getPageTransform());
- pi.currentSegment(first);
- // last move
- Point2D.Float start_pos = new Point2D.Float(Utils.round(first[0], 2), Utils.round(first[1], 2));
- Point2D.Float last_move = start_pos;
- Point2D.Float end_pos = null;
- Line2D.Float line;
- PointComparator pc = new PointComparator();
-
- while (!pi.isDone()) {
- pi.next();
- currentSegment = pi.currentSegment(c);
- switch (currentSegment) {
- case PathIterator.SEG_LINETO:
- end_pos = new Point2D.Float(c[0], c[1]);
-
- line = pc.compare(start_pos, end_pos) == -1 ? new Line2D.Float(
- start_pos, end_pos) : new Line2D.Float(end_pos,
- start_pos);
-
- if (line.intersects(this.currentClippingPath())) {
- Ruling r = new Ruling(line.getP1(), line.getP2())
- .intersect(this.currentClippingPath());
-
- if (r.length() > 0.01) {
- this.rulings.add(r);
- }
- }
- break;
- case PathIterator.SEG_MOVETO:
- last_move = new Point2D.Float(c[0], c[1]);
- end_pos = last_move;
- break;
- case PathIterator.SEG_CLOSE:
- // according to PathIterator docs:
- // "the preceding subpath should be closed by appending a line
- // segment
- // back to the point corresponding to the most recent
- // SEG_MOVETO."
- line = pc.compare(end_pos, last_move) == -1 ? new Line2D.Float(
- end_pos, last_move) : new Line2D.Float(last_move,
- end_pos);
-
- if (line.intersects(this.currentClippingPath())) {
- Ruling r = new Ruling(line.getP1(), line.getP2())
- .intersect(this.currentClippingPath());
-
- if (r.length() > 0.01) {
- this.rulings.add(r);
- }
- }
- break;
- }
- start_pos = end_pos;
- }
- path.reset();
- }
-
- @Override
- public void strokePath() throws IOException {
- this.strokeOrFillPath(false);
- }
-
- @Override
- public void fillPath(int windingRule) throws IOException {
- //
- // float[] color_comps =
- // this.getGraphicsState().getNonStrokingColor().getJavaColor().getRGBColorComponents(null);
- // float[] color = this.getGraphicsState().getNonStrokingColor().getJavaColor().getComponents(null);
- // TODO use color_comps as filter_by_color
- this.strokeOrFillPath(true);
- }
-
- private float currentSpaceWidth() {
- PDGraphicsState gs = this.getGraphicsState();
- PDTextState ts = gs.getTextState();
- PDFont font = ts.getFont();
- float fontSizeText = ts.getFontSize();
- float horizontalScalingText = ts.getHorizontalScalingPercent() / 100.0f;
- float spaceWidthText = 1000;
-
- if (font instanceof PDType3Font) {
- // TODO WHAT?
- }
-
- for (int i = 0; i < spaceLikeChars.length; i++) {
- spaceWidthText = font.getFontWidth(spaceLikeChars[i]);
- if (spaceWidthText > 0)
- break;
- }
-
- float ctm00 = gs.getCurrentTransformationMatrix().getValue(0, 0);
-
- return (float) ((spaceWidthText / 1000.0) * fontSizeText
- * horizontalScalingText * (ctm00 == 0 ? 1 : ctm00));
- }
-
- @Override
- protected void processTextPosition(TextPosition textPosition) {
- String c = textPosition.getCharacter();
-
- // if c not printable, return
- if (!isPrintable(c)) {
- return;
- }
-
- Float h = textPosition.getHeightDir();
-
- if (c.equals(NBSP)) { // replace non-breaking space for space
- c = " ";
- }
-
- float wos = textPosition.getWidthOfSpace();
-
- TextElement te = new TextElement(
- Utils.round(textPosition.getYDirAdj() - h, 2),
- Utils.round(textPosition.getXDirAdj(), 2),
- Utils.round(textPosition.getWidthDirAdj(), 2),
- Utils.round(textPosition.getHeightDir(), 2),
- textPosition.getFont(),
- textPosition.getFontSize(),
- c,
- // workaround a possible bug in PDFBox:
- // https://issues.apache.org/jira/browse/PDFBOX-1755
- (Float.isNaN(wos) || wos == 0) ? this.currentSpaceWidth() : wos,
- textPosition.getDir());
-
- if (this.currentClippingPath().intersects(te)) {
-
- this.minCharWidth = (float) Math.min(this.minCharWidth, te.getWidth());
- this.minCharHeight = (float) Math.min(this.minCharHeight, te.getHeight());
-
- this.spatialIndex.add(te);
- this.characters.add(te);
- }
-
- if (this.isDebugClippingPaths() && !this.clippingPaths.contains(this.currentClippingPath())) {
- this.clippingPaths.add(this.currentClippingPath());
- }
-
- }
-
- public AffineTransform getPageTransform() {
-
- if (this.pageTransform != null) {
- return this.pageTransform;
- }
-
- PDRectangle cb = page.findCropBox();
- int rotation = Math.abs(page.findRotation());
-
- this.pageTransform = new AffineTransform();
-
- if (rotation == 90 || rotation == 270) {
- this.pageTransform = AffineTransform.getRotateInstance(rotation * (Math.PI / 180.0), 0, 0);
- this.pageTransform.concatenate(AffineTransform.getScaleInstance(1, -1));
- this.pageTransform.concatenate(AffineTransform.getTranslateInstance(0, cb.getHeight()));
- this.pageTransform.concatenate(AffineTransform.getScaleInstance(1, -1));
- }
- return this.pageTransform;
- }
-
- public Rectangle2D currentClippingPath() {
-
- Shape clippingPath = this.getGraphicsState().getCurrentClippingPath();
- Shape transformedClippingPath = this.getPageTransform()
- .createTransformedShape(clippingPath);
- Rectangle2D transformedClippingPathBounds = transformedClippingPath
- .getBounds2D();
-
- return transformedClippingPathBounds;
- }
-
- public boolean isExtractRulingLines() {
- return extractRulingLines;
- }
-
- private static boolean isPrintable(String s) {
- Character c = s.charAt(0);
- Character.UnicodeBlock block = Character.UnicodeBlock.of(c);
- return (!Character.isISOControl(c)) && c != KeyEvent.CHAR_UNDEFINED
- && block != null && block != Character.UnicodeBlock.SPECIALS;
- }
-
- public boolean isDebugClippingPaths() {
- return debugClippingPaths;
- }
-
- public int getPageCount() {
- return this.pdf_document_pages.size();
+ pdfDocument.close();
}
- class PointComparator implements Comparator {
- @Override
- public int compare(Point2D o1, Point2D o2) {
- float o1X = Utils.round(o1.getX(), 2);
- float o1Y = Utils.round(o1.getY(), 2);
- float o2X = Utils.round(o2.getX(), 2);
- float o2Y = Utils.round(o2.getY(), 2);
-
- if (o1Y > o2Y)
- return 1;
- if (o1Y < o2Y)
- return -1;
- if (o1X > o2X)
- return 1;
- if (o1X < o2X)
- return -1;
- return 0;
- }
- }
-
}
diff --git a/src/main/java/technology/tabula/ObjectExtractorStreamEngine.java b/src/main/java/technology/tabula/ObjectExtractorStreamEngine.java
new file mode 100644
index 00000000..9907eca1
--- /dev/null
+++ b/src/main/java/technology/tabula/ObjectExtractorStreamEngine.java
@@ -0,0 +1,271 @@
+package technology.tabula;
+
+import java.awt.Shape;
+import java.awt.geom.AffineTransform;
+import java.awt.geom.GeneralPath;
+import java.awt.geom.Line2D;
+import java.awt.geom.PathIterator;
+import java.awt.geom.Point2D;
+import java.awt.geom.Rectangle2D;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+
+import org.apache.pdfbox.contentstream.PDFGraphicsStreamEngine;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.common.PDRectangle;
+import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import static java.awt.geom.PathIterator.*;
+
+class ObjectExtractorStreamEngine extends PDFGraphicsStreamEngine {
+
+ protected List rulings;
+ private AffineTransform pageTransform;
+ private boolean extractRulingLines = true;
+ private Logger logger;
+ private int clipWindingRule = -1;
+ private GeneralPath currentPath = new GeneralPath();
+
+ private static final float RULING_MINIMUM_LENGTH = 0.01f;
+
+    protected ObjectExtractorStreamEngine(PDPage page) {
+        super(page);
+        logger = LoggerFactory.getLogger(ObjectExtractorStreamEngine.class);
+        rulings = new ArrayList<>();
+
+        // Build the page transform; only its first step depends on the page rotation.
+        PDRectangle cropBox = getPage().getCropBox();
+        int rotationDegrees = getPage().getRotation();
+        int absoluteRotation = Math.abs(rotationDegrees);
+        boolean quarterTurn = absoluteRotation == 90 || absoluteRotation == 270;
+
+        if (quarterTurn) {
+            pageTransform = AffineTransform.getRotateInstance(rotationDegrees * (Math.PI / 180.0), 0, 0);
+        } else {
+            // Every other rotation value starts from a translation by the crop box height.
+            pageTransform = AffineTransform.getTranslateInstance(0, cropBox.getHeight());
+        }
+
+        // Common tail: mirror the y axis, then offset by the negated lower-left corner of the crop box.
+        pageTransform.concatenate(AffineTransform.getScaleInstance(1, -1));
+        pageTransform.translate(-cropBox.getLowerLeftX(), -cropBox.getLowerLeftY());
+    }
+
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
+    @Override
+    public void appendRectangle(Point2D p0, Point2D p1, Point2D p2, Point2D p3) {
+        currentPath.moveTo((float) p0.getX(), (float) p0.getY()); // subpath starts at p0
+        for (Point2D corner : new Point2D[] {p1, p2, p3}) {
+            currentPath.lineTo((float) corner.getX(), (float) corner.getY());
+        }
+        currentPath.closePath(); // final edge back to p0
+    }
+
+ @Override
+ public void clip(int windingRule) {
+ // The clipping path will not be updated until the succeeding painting
+ // operator is called; here the rule is only remembered (in this class endPath() applies it).
+ clipWindingRule = windingRule;
+ }
+
+ @Override
+ public void closePath() {
+ currentPath.closePath(); // delegates to the GeneralPath being accumulated
+ }
+
+ @Override
+ public void curveTo(float x1, float y1, float x2, float y2, float x3, float y3) {
+ currentPath.curveTo(x1, y1, x2, y2, x3, y3); // recorded as-is; filterPathBySegmentType() later rejects any path holding a curve
+ }
+
+ @Override
+ public void drawImage(PDImage arg0) {} // no-op: this engine does nothing with images
+
+ @Override
+ public void endPath() {
+ if (clipWindingRule != -1) { // a clip() request is pending (-1 means none)
+ currentPath.setWindingRule(clipWindingRule);
+ getGraphicsState().intersectClippingPath(currentPath);
+ clipWindingRule = -1; // the pending request has been consumed
+ }
+ currentPath.reset(); // the path is dropped here without producing rulings
+ }
+
+ @Override
+ public void fillAndStrokePath(int arg0) {
+ strokeOrFillPath(true); // the winding-rule argument is not used
+ }
+
+ @Override
+ public void fillPath(int arg0) {
+ strokeOrFillPath(true); // the winding-rule argument is not used
+ }
+
+ @Override
+ public Point2D getCurrentPoint() {
+ return currentPath.getCurrentPoint(); // last point added to the path under construction
+ }
+
+ @Override
+ public void lineTo(float x, float y) {
+ currentPath.lineTo(x, y); // appended untransformed; the page transform is applied when the path is iterated
+ }
+
+ @Override
+ public void moveTo(float x, float y) {
+ currentPath.moveTo(x, y); // appended untransformed; the page transform is applied when the path is iterated
+ }
+
+ @Override
+ public void shadingFill(COSName arg0) {} // no-op: shading fills produce no rulings
+
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
+ @Override
+ public void strokePath() {
+ strokeOrFillPath(false); // same processing as the fill callbacks; only the flag differs
+ }
+
+ private void strokeOrFillPath(boolean isFill) { // turns the current path's straight segments into rulings; isFill is never read
+ if (!extractRulingLines) {
+ currentPath.reset();
+ return;
+ }
+
+ boolean didNotPassedTheFilter = filterPathBySegmentType(); // true => path rejected and already reset
+ if (didNotPassedTheFilter) return;
+
+ // TODO: how to implement color filter?
+
+ // Skip the first path operation and save it as the starting point.
+ PathIterator pathIterator = currentPath.getPathIterator(getPageTransform());
+
+ float[] coordinates = new float[6];
+ int currentSegment;
+
+ Point2D.Float startPoint = getStartPoint(pathIterator);
+ Point2D.Float last_move = startPoint; // point a SEG_CLOSE draws its closing line back to
+ Point2D.Float endPoint = null;
+ Line2D.Float line;
+ PointComparator pointComparator = new PointComparator();
+
+ while (!pathIterator.isDone()) {
+ pathIterator.next(); // advance first: the opening segment was already read by getStartPoint
+ // This can be the last segment, when pathIterator.isDone, but we need to
+ // process it otherwise us-017.pdf fails the last value.
+ try {
+ currentSegment = pathIterator.currentSegment(coordinates);
+ } catch (IndexOutOfBoundsException ex) {
+ continue; // nothing readable at this position; the loop condition then decides whether to stop
+ }
+ switch (currentSegment) {
+ case SEG_LINETO:
+ endPoint = new Point2D.Float(coordinates[0], coordinates[1]);
+ if (startPoint == null || endPoint == null) {
+ break;
+ }
+ line = getLineBetween(startPoint, endPoint, pointComparator);
+ verifyLineIntersectsClipping(line);
+ break;
+ case SEG_MOVETO:
+ last_move = new Point2D.Float(coordinates[0], coordinates[1]);
+ endPoint = last_move;
+ break;
+ case SEG_CLOSE:
+ // According to PathIterator docs:
+ // "The preceding sub-path should be closed by appending a line
+ // segment back to the point corresponding to the most recent
+ // SEG_MOVETO."
+ if (startPoint == null || endPoint == null) {
+ break;
+ }
+ line = getLineBetween(endPoint, last_move, pointComparator);
+ verifyLineIntersectsClipping(line);
+ break;
+ }
+ startPoint = endPoint; // the next segment starts where this one ended
+ }
+ currentPath.reset();
+ }
+
+    /** Returns true (after resetting the path) unless the path opens with a MOVETO and holds only MOVETO, LINETO and CLOSE segments. */
+    private boolean filterPathBySegmentType() {
+        PathIterator segments = currentPath.getPathIterator(pageTransform);
+        float[] scratch = new float[6];
+        // The very first segment has to be a MOVETO.
+        boolean rejected = segments.currentSegment(scratch) != SEG_MOVETO;
+        while (!rejected) {
+            segments.next();
+            if (segments.isDone()) {
+                break;
+            }
+            int type = segments.currentSegment(scratch);
+            rejected = type != SEG_LINETO && type != SEG_CLOSE && type != SEG_MOVETO;
+        }
+        if (rejected) {
+            currentPath.reset();
+        }
+        return rejected;
+    }
+
+    private Point2D.Float getStartPoint(PathIterator pathIterator) {
+        // Read the iterator's current segment (without advancing it) and round its
+        // first coordinate pair to two decimals.
+        float[] segment = new float[6];
+        pathIterator.currentSegment(segment);
+        return new Point2D.Float(Utils.round(segment[0], 2), Utils.round(segment[1], 2));
+    }
+
+    /** Returns the segment joining the two points, starting from pointA only when the comparator orders it strictly first. */
+    private Line2D.Float getLineBetween(Point2D.Float pointA, Point2D.Float pointB, PointComparator pointComparator) {
+        // A comparator result is only specified by its sign, so test "< 0" rather than "== -1".
+        boolean aComesFirst = pointComparator.compare(pointA, pointB) < 0;
+        return aComesFirst ? new Line2D.Float(pointA, pointB) : new Line2D.Float(pointB, pointA);
+    }
+
+    /** Builds a ruling from the line, intersects it with the current clip bounds and keeps it if longer than RULING_MINIMUM_LENGTH. */
+    private void verifyLineIntersectsClipping(Line2D.Float line) {
+        Rectangle2D clipBounds = currentClippingPath();
+        if (!line.intersects(clipBounds)) return;
+        Ruling clipped = new Ruling(line.getP1(), line.getP2()).intersect(clipBounds);
+        if (clipped.length() > RULING_MINIMUM_LENGTH) {
+            rulings.add(clipped);
+        }
+    }
+
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
+ public AffineTransform getPageTransform() {
+ return pageTransform; // the instance built in the constructor, returned without copying
+ }
+
+    public Rectangle2D currentClippingPath() {
+        // Bounding box of the graphics state's clipping path after mapping it through the page transform.
+        Shape clip = getGraphicsState().getCurrentClippingPath();
+        return getPageTransform().createTransformedShape(clip).getBounds2D();
+    }
+
+ // TODO: repeated in SpreadsheetExtractionAlgorithm.
+ class PointComparator implements Comparator { // orders points by y, then by x, after rounding every coordinate to 2 decimals
+ @Override
+ public int compare(Point2D p1, Point2D p2) {
+ float p1X = Utils.round(p1.getX(), 2);
+ float p1Y = Utils.round(p1.getY(), 2);
+ float p2X = Utils.round(p2.getX(), 2);
+ float p2Y = Utils.round(p2.getY(), 2);
+ // y decides first; x is consulted only when the rounded y values tie. The result is always -1, 0 or 1.
+ if (p1Y > p2Y)
+ return 1;
+ if (p1Y < p2Y)
+ return -1;
+ if (p1X > p2X)
+ return 1;
+ if (p1X < p2X)
+ return -1;
+ return 0;
+ }
+ }
+
+}
diff --git a/src/main/java/technology/tabula/Page.java b/src/main/java/technology/tabula/Page.java
index ab57d938..ed74d14a 100644
--- a/src/main/java/technology/tabula/Page.java
+++ b/src/main/java/technology/tabula/Page.java
@@ -2,135 +2,220 @@
import java.awt.geom.Point2D;
import java.util.ArrayList;
-import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
-import java.util.HashMap;
import java.util.List;
-import java.util.Map;
+import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
+import static java.lang.Float.compare;
+import static java.util.Collections.min;
+
@SuppressWarnings("serial")
// TODO: this class should probably be called "PageArea" or something like that
public class Page extends Rectangle {
+ private int number;
private Integer rotation;
- private int pageNumber;
- private List texts;
- private List rulings, cleanRulings = null, verticalRulingLines = null, horizontalRulingLines = null;
private float minCharWidth;
private float minCharHeight;
- private RectangleSpatialIndex spatial_index;
+
+ private List textElements;
+
+ // TODO: Create a class for 'List ' that encapsulates all of these lists and their behaviors?
+ private List rulings,
+ cleanRulings = null,
+ verticalRulingLines = null,
+ horizontalRulingLines = null;
+
private PDPage pdPage;
+ private PDDocument pdDoc;
- public Page(float top, float left, float width, float height, int rotation, int page_number, PDPage pdPage) {
- super(top, left, width, height);
+ private RectangleSpatialIndex spatialIndex;
+
+ private static final float DEFAULT_MIN_CHAR_LENGTH = 7;
+
+ private Page( // all-arguments constructor; Builder.build() calls it
+ PageDims pageDims,
+ int rotation,
+ int number,
+ PDPage pdPage,
+ PDDocument doc,
+ List characters, // stored in the textElements field
+ List rulings,
+ float minCharWidth,
+ float minCharHeight,
+ RectangleSpatialIndex index
+ ) {
+ super(pageDims.getTop(), pageDims.getLeft(), pageDims.getWidth(), pageDims.getHeight());
this.rotation = rotation;
- this.pageNumber = page_number;
+ this.number = number;
this.pdPage = pdPage;
+ this.pdDoc = doc;
+ this.textElements = characters;
+ this.rulings = rulings;
+ this.minCharWidth = minCharWidth;
+ this.minCharHeight = minCharHeight;
+ this.spatialIndex = index;
}
-
- public Page(float top, float left, float width, float height, int rotation, int page_number, PDPage pdPage,
- List characters, List rulings) {
- this(top, left, width, height, rotation, page_number, pdPage);
- this.texts = characters;
- this.rulings = rulings;
+ /**
+ * Creates a page holding only geometry, rotation, number and the PDFBox page/document; text elements, rulings and the spatial index stay unset.
+ * @deprecated use {@link Builder} instead
+ */
+ @Deprecated
+ public Page(float top, float left, float width, float height, int rotation, int number, PDPage pdPage, PDDocument doc) {
+ super(top, left, width, height);
+ this.rotation = rotation;
+ this.number = number;
+ this.pdPage = pdPage;
+ this.pdDoc = doc;
+ }
+
+    /** Creates a page through the eight-argument constructor, then stores the given text elements and rulings.
+     * @deprecated use {@link Builder} instead
+     */
+    @Deprecated
+    public Page(float top, float left, float width, float height, int rotation, int number, PDPage pdPage, PDDocument doc,
+                List characters, List rulings) {
+        this(top, left, width, height, rotation, number, pdPage, doc);
+        this.textElements = characters;
+        this.rulings = rulings;
}
+    /** Creates a page from the text elements, minimum character sizes and spatial index of {@code textStripper} and the rulings of {@code streamEngine}.
+     * @deprecated use {@link Builder} instead
+     */
+    @Deprecated
+    public Page(float top, float left, float width, float height, int rotation, int number, PDPage pdPage, PDDocument doc,
+                ObjectExtractorStreamEngine streamEngine, TextStripper textStripper) {
+        this(top, left, width, height, rotation, number, pdPage, doc, textStripper.getTextElements(), streamEngine.rulings);
+        this.minCharWidth = textStripper.getMinCharWidth();
+        this.minCharHeight = textStripper.getMinCharHeight();
+        this.spatialIndex = textStripper.getSpatialIndex();
+    }
- public Page(float top, float left, float width, float height, int rotation, int page_number, PDPage pdPage,
- List characters, List rulings,
- float minCharWidth, float minCharHeight, RectangleSpatialIndex index) {
- this(top, left, width, height, rotation, page_number, pdPage, characters, rulings);
- this.minCharHeight = minCharHeight;
- this.minCharWidth = minCharWidth;
- this.spatial_index = index;
+
+    /** Creates a page with text elements and rulings, then stores the given minimum character sizes and spatial index.
+     * @deprecated use {@link Builder} instead
+     */
+    @Deprecated
+    public Page(float top, float left, float width, float height, int rotation, int number, PDPage pdPage, PDDocument doc,
+                List characters, List rulings,
+                float minCharWidth, float minCharHeight, RectangleSpatialIndex index) {
+        this(top, left, width, height, rotation, number, pdPage, doc, characters, rulings);
+        this.minCharHeight = minCharHeight;
+        this.minCharWidth = minCharWidth;
+        this.spatialIndex = index;
}
-
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
public Page getArea(Rectangle area) {
- List t = getText(area);
- Page rv = new Page(
- (float) area.getTop(),
- (float) area.getLeft(),
- (float) area.getWidth(),
- (float) area.getHeight(),
- rotation,
- pageNumber,
- pdPage,
- t,
- Ruling.cropRulingsToArea(getRulings(), area),
-
- Collections.min(t, new Comparator() {
- @Override
- public int compare(TextElement te1, TextElement te2) {
- return java.lang.Float.compare(te1.width, te2.width);
- }}).width,
-
- Collections.min(t, new Comparator() {
- @Override
- public int compare(TextElement te1, TextElement te2) {
- return java.lang.Float.compare(te1.height, te2.height);
- }}).height,
-
- spatial_index);
-
- rv.addRuling(new Ruling(
- new Point2D.Double(rv.getLeft(),
- rv.getTop()),
- new Point2D.Double(rv.getRight(),
- rv.getTop())));
- rv.addRuling(new Ruling(
- new Point2D.Double(rv.getRight(),
- rv.getTop()),
- new Point2D.Double(rv.getRight(),
- rv.getBottom())));
- rv.addRuling(new Ruling(
- new Point2D.Double(rv.getRight(),
- rv.getBottom()),
- new Point2D.Double(rv.getLeft(),
- rv.getBottom())));
- rv.addRuling(new Ruling(
- new Point2D.Double(rv.getLeft(),
- rv.getBottom()),
- new Point2D.Double(rv.getLeft(),
- rv.getTop())));
-
- return rv;
- }
-
- public Page getArea(float top, float left, float bottom, float right) {
- Rectangle area = new Rectangle(top, left, right - left, bottom - top);
- return this.getArea(area);
+ List areaTextElements = getText(area);
+
+ float minimumCharWidth = getMinimumCharWidthFrom(areaTextElements);
+ float minimumCharHeight = getMinimumCharHeightFrom(areaTextElements);
+
+ final Page page = Page.Builder.newInstance()
+ .withPageDims(PageDims.of(area.getTop(), area.getLeft(), (float) area.getWidth(), (float) area.getHeight()))
+ .withRotation(rotation)
+ .withNumber(number)
+ .withPdPage(pdPage)
+ .withPdDocument(pdDoc)
+ .withTextElements(areaTextElements)
+ .withRulings(Ruling.cropRulingsToArea(getRulings(), area))
+ .withMinCharWidth(minimumCharWidth)
+ .withMinCharHeight(minimumCharHeight)
+ .withIndex(spatialIndex)
+ .build();
+
+ addBorderRulingsTo(page);
+
+ return page;
}
-
- public List getText() {
- return texts;
+
+    /** Smallest {@code width} among the given elements, or DEFAULT_MIN_CHAR_LENGTH when the list is empty. */
+    private float getMinimumCharWidthFrom(List areaTextElements) {
+        // An empty list has no minimum, so fall back to the default.
+        if (areaTextElements.isEmpty()) return DEFAULT_MIN_CHAR_LENGTH;
+        return min(areaTextElements, (a, b) -> compare(a.width, b.width)).width;
}
-
- public List getText(Rectangle area) {
- return this.spatial_index.contains(area);
+
+    /** Smallest {@code height} among the given elements, or DEFAULT_MIN_CHAR_LENGTH when the list is empty. */
+    private float getMinimumCharHeightFrom(List areaTextElements) {
+        // An empty list has no minimum, so fall back to the default.
+        if (areaTextElements.isEmpty()) return DEFAULT_MIN_CHAR_LENGTH;
+        return min(areaTextElements, (a, b) -> compare(a.height, b.height)).height;
}
-
- public List getText(float top, float left, float bottom, float right) {
- return this.getText(new Rectangle(top, left, right - left, bottom - top));
+
+    /** Adds four rulings to {@code page}, one along each edge of its bounding rectangle. */
+    private void addBorderRulingsTo(Page page) {
+        Point2D.Double[] corners = {new Point2D.Double(page.getLeft(), page.getTop()),
+            new Point2D.Double(page.getRight(), page.getTop()),
+            new Point2D.Double(page.getRight(), page.getBottom()),
+            new Point2D.Double(page.getLeft(), page.getBottom())};
+        for (int i = 0; i < corners.length; i++) {
+            page.addRuling(new Ruling(corners[i], corners[(i + 1) % corners.length]));
+        }
+    }
+
+    public Page getArea(float top, float left, float bottom, float right) {
+        // Convert the four edges into a top/left/width/height rectangle and delegate.
+        return getArea(new Rectangle(top, left, right - left, bottom - top));
}
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
public Integer getRotation() {
return rotation;
}
public int getPageNumber() {
- return pageNumber;
+ return number;
+ }
+
+ /** Returns the {@code minCharWidth} value held by this page.
+ * @deprecated with no replacement
+ */
+ @Deprecated
+ public float getMinCharWidth() {
+ return minCharWidth;
+ }
+
+ /** Returns the {@code minCharHeight} value held by this page.
+ * @deprecated with no replacement
+ */
+ @Deprecated
+ public float getMinCharHeight() {
+ return minCharHeight;
+ }
+
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
+ public List getText() {
+ return textElements; // the stored list itself, not a copy
+ }
+
+ public List getText(Rectangle area) {
+ return spatialIndex.contains(area); // NOTE(review): spatialIndex is never set by the 8- and 10-argument constructors, so this would throw there
+ }
+
+ /** Builds a rectangle from the four edges (width = right - left, height = bottom - top) and delegates to {@link #getText(Rectangle)}.
+ * @deprecated use {@linkplain #getText(Rectangle)} instead
+ */
+ @Deprecated
+ public List getText(float top, float left, float bottom, float right) {
+ return getText(new Rectangle(top, left, right - left, bottom - top));
}
+ /**
+ * @deprecated use {@linkplain #getText()} instead
+ */
+ @Deprecated
public List getTexts() {
- return texts;
+ return textElements;
}
-
+
/**
* Returns the minimum bounding box that contains all the TextElements on this Page
*/
@@ -138,99 +223,194 @@ public Rectangle getTextBounds() {
List texts = this.getText();
if (!texts.isEmpty()) {
return Utils.bounds(texts);
- }
- else {
+ } else {
return new Rectangle();
}
-
}
+    /** Tells whether this page holds at least one text element; a page constructed without a text list reports false.
+     * @deprecated with no replacement
+     */
+    @Deprecated
+    public boolean hasText() {
+        return textElements != null && !textElements.isEmpty();
+    }
+
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
public List getRulings() {
- if (this.cleanRulings != null) {
- return this.cleanRulings;
- }
-
- if (this.rulings == null || this.rulings.isEmpty()) {
- this.verticalRulingLines = new ArrayList();
- this.horizontalRulingLines = new ArrayList();
- return new ArrayList();
- }
-
- Utils.snapPoints(this.rulings, this.minCharWidth, this.minCharHeight);
-
- List vrs = new ArrayList();
- for (Ruling vr: this.rulings) {
- if (vr.vertical()) {
- vrs.add(vr);
+ if (cleanRulings != null) {
+ return cleanRulings;
+ }
+
+ if (rulings == null || rulings.isEmpty()) {
+ verticalRulingLines = new ArrayList<>();
+ horizontalRulingLines = new ArrayList<>();
+ return new ArrayList<>();
+ }
+
+ // TODO: Move as a static method to the Ruling class?
+ Utils.snapPoints(rulings, minCharWidth, minCharHeight);
+
+ verticalRulingLines = getCollapsedVerticalRulings();
+ horizontalRulingLines = getCollapsedHorizontalRulings();
+
+ cleanRulings = new ArrayList<>(verticalRulingLines);
+ cleanRulings.addAll(horizontalRulingLines);
+
+ return cleanRulings;
+ }
+
+ // TODO: Create a class for 'List ' and encapsulate these behaviors within it?
+ private List getCollapsedVerticalRulings() {
+ List verticalRulings = new ArrayList<>();
+ for (Ruling ruling : rulings) {
+ if (ruling.vertical()) {
+ verticalRulings.add(ruling);
}
}
- this.verticalRulingLines = Ruling.collapseOrientedRulings(vrs);
-
- List hrs = new ArrayList();
- for (Ruling hr: this.rulings) {
- if (hr.horizontal()) {
- hrs.add(hr);
+ return Ruling.collapseOrientedRulings(verticalRulings);
+ }
+
+ private List getCollapsedHorizontalRulings() {
+ List horizontalRulings = new ArrayList<>();
+ for (Ruling ruling : rulings) {
+ if (ruling.horizontal()) {
+ horizontalRulings.add(ruling);
}
}
- this.horizontalRulingLines = Ruling.collapseOrientedRulings(hrs);
-
- this.cleanRulings = new ArrayList(this.verticalRulingLines);
- this.cleanRulings.addAll(this.horizontalRulingLines);
-
- return this.cleanRulings;
-
+ return Ruling.collapseOrientedRulings(horizontalRulings);
}
-
+
public List getVerticalRulings() {
- if (this.verticalRulingLines != null) {
- return this.verticalRulingLines;
+ if (verticalRulingLines != null) {
+ return verticalRulingLines;
}
- this.getRulings();
- return this.verticalRulingLines;
+ getRulings();
+ return verticalRulingLines;
}
-
+
public List getHorizontalRulings() {
- if (this.horizontalRulingLines != null) {
- return this.horizontalRulingLines;
+ if (horizontalRulingLines != null) {
+ return horizontalRulingLines;
}
- this.getRulings();
- return this.horizontalRulingLines;
+ getRulings();
+ return horizontalRulingLines;
}
-
- public void addRuling(Ruling r) {
- if (r.oblique()) {
- throw new UnsupportedOperationException("Can't add an oblique ruling");
+
+ public void addRuling(Ruling ruling) {
+ if (ruling.oblique()) {
+ throw new UnsupportedOperationException("Can't add an oblique ruling.");
}
- this.rulings.add(r);
- // clear caches
- this.verticalRulingLines = null;
- this.horizontalRulingLines = null;
- this.cleanRulings = null;
+ rulings.add(ruling);
+ // Clear caches:
+ verticalRulingLines = null;
+ horizontalRulingLines = null;
+ cleanRulings = null;
}
-
+
public List getUnprocessedRulings() {
- return this.rulings;
+ return rulings;
}
- public float getMinCharWidth() {
- return minCharWidth;
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
+ public PDPage getPDPage() {
+ return pdPage;
}
- public float getMinCharHeight() {
- return minCharHeight;
+ public PDDocument getPDDoc() {
+ return pdDoc;
}
- public PDPage getPDPage() {
- return pdPage;
- }
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
+ /**
+ * @deprecated with no replacement
+ */
+ @Deprecated
public RectangleSpatialIndex getSpatialIndex() {
- return this.spatial_index;
+ return spatialIndex;
}
-
- public boolean hasText() {
- return this.texts.size() > 0;
+
+ public static class Builder { // fluent builder for Page: set values with the with* methods, then call build()
+ private PageDims pageDims;
+ private int rotation;
+ private int number;
+ private PDPage pdPage;
+ private PDDocument pdDocument;
+ private List textElements;
+ private List rulings;
+ private float minCharWidth;
+ private float minCharHeight;
+ private RectangleSpatialIndex index;
+
+ private Builder() {} // instances are obtained through newInstance()
+
+ public static Builder newInstance() {
+ return new Builder(); // every value starts at its Java default (0 or null)
+ }
+
+ /** Sets the page dimensions and returns this builder for chaining. */
+ public Builder withPageDims(PageDims pageDims) {
+ this.pageDims = pageDims;
+ return this;
+ }
+
+ /** Sets the rotation and returns this builder for chaining. */
+ public Builder withRotation(int rotation) {
+ this.rotation = rotation;
+ return this;
+ }
+
+ /** Sets the page number and returns this builder for chaining. */
+ public Builder withNumber(int number) {
+ this.number = number;
+ return this;
+ }
+
+ /** Sets the PDFBox page and returns this builder for chaining. */
+ public Builder withPdPage(PDPage pdPage) {
+ this.pdPage = pdPage;
+ return this;
+ }
+
+ /** Sets the PDFBox document and returns this builder for chaining. */
+ public Builder withPdDocument(PDDocument pdDocument) {
+ this.pdDocument = pdDocument;
+ return this;
+ }
+
+ /** Sets the text element list (stored by reference) and returns this builder for chaining. */
+ public Builder withTextElements(List textElements) {
+ this.textElements = textElements;
+ return this;
+ }
+
+ /** Sets the ruling list (stored by reference) and returns this builder for chaining. */
+ public Builder withRulings(List rulings) {
+ this.rulings = rulings;
+ return this;
+ }
+
+ /** Sets the minimum character width and returns this builder for chaining. */
+ public Builder withMinCharWidth(float minCharWidth) {
+ this.minCharWidth = minCharWidth;
+ return this;
+ }
+
+ /** Sets the minimum character height and returns this builder for chaining. */
+ public Builder withMinCharHeight(float minCharHeight) {
+ this.minCharHeight = minCharHeight;
+ return this;
+ }
+
+ /** Sets the spatial index and returns this builder for chaining. */
+ public Builder withIndex(RectangleSpatialIndex index) {
+ this.index = index;
+ return this;
+ }
+
+ public Page build() { // passes the collected values, unvalidated, to the private Page constructor
+ return new Page(pageDims, rotation, number, pdPage, pdDocument, textElements, rulings, minCharWidth, minCharHeight, index);
+ }
}
-
-
}
diff --git a/src/main/java/technology/tabula/PageDims.java b/src/main/java/technology/tabula/PageDims.java
new file mode 100644
index 00000000..1598d125
--- /dev/null
+++ b/src/main/java/technology/tabula/PageDims.java
@@ -0,0 +1,35 @@
+package technology.tabula;
+
+public class PageDims {
+ private final float top;
+ private final float left;
+ private final float width;
+ private final float height;
+
+ private PageDims(final float top, final float left, final float width, final float height) {
+ this.top = top;
+ this.left = left;
+ this.width = width;
+ this.height = height;
+ }
+
+ public static PageDims of(final float top, final float left, final float width, final float height) {
+ return new PageDims(top, left, width, height);
+ }
+
+ public float getTop() {
+ return top;
+ }
+
+ public float getLeft() {
+ return left;
+ }
+
+ public float getWidth() {
+ return width;
+ }
+
+ public float getHeight() {
+ return height;
+ }
+}
diff --git a/src/main/java/technology/tabula/PageIterator.java b/src/main/java/technology/tabula/PageIterator.java
index 5fec2a77..052ed54a 100644
--- a/src/main/java/technology/tabula/PageIterator.java
+++ b/src/main/java/technology/tabula/PageIterator.java
@@ -5,39 +5,39 @@
public class PageIterator implements Iterator {
- private ObjectExtractor oe;
+ private ObjectExtractor objectExtractor;
private Iterator pageIndexIterator;
-
- public PageIterator(ObjectExtractor oe, Iterable pages) {
+
+ public PageIterator(ObjectExtractor objectExtractor, Iterable pages) {
super();
- this.oe = oe;
+ this.objectExtractor = objectExtractor;
this.pageIndexIterator = pages.iterator();
}
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
@Override
public boolean hasNext() {
- return this.pageIndexIterator.hasNext();
+ return pageIndexIterator.hasNext();
}
@Override
public Page next() {
- Page page = null;
+ Page nextPage = null;
if (!this.hasNext()) {
throw new IllegalStateException();
}
try {
- page = oe.extractPage(this.pageIndexIterator.next());
+ nextPage = objectExtractor.extractPage(pageIndexIterator.next());
} catch (IOException e) {
- // TODO Auto-generated catch block
e.printStackTrace();
}
- return page;
+ return nextPage;
}
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
@Override
public void remove() {
throw new UnsupportedOperationException();
-
}
-}
\ No newline at end of file
+}
diff --git a/src/main/java/technology/tabula/Pair.java b/src/main/java/technology/tabula/Pair.java
new file mode 100644
index 00000000..d54cbbe5
--- /dev/null
+++ b/src/main/java/technology/tabula/Pair.java
@@ -0,0 +1,19 @@
+package technology.tabula;
+
+public class Pair {
+ private final L left;
+ private final R right;
+
+ public Pair(L left, R right) {
+ this.left = left;
+ this.right = right;
+ }
+
+ public L getLeft() {
+ return this.left;
+ }
+
+ public R getRight() {
+ return this.right;
+ }
+}
diff --git a/src/main/java/technology/tabula/ProjectionProfile.java b/src/main/java/technology/tabula/ProjectionProfile.java
index 6479964d..39ab9e41 100644
--- a/src/main/java/technology/tabula/ProjectionProfile.java
+++ b/src/main/java/technology/tabula/ProjectionProfile.java
@@ -5,6 +5,8 @@
import java.util.List;
+// NOTE: this class is currently not used by the extraction algorithms;
+// it is kept for potential future use.
public class ProjectionProfile {
public static final int DECIMAL_PLACES = 1; // fixed <-> float conversion precision
@@ -71,7 +73,7 @@ public float[] getHorizontalProjection() {
public float[] findVerticalSeparators(float minColumnWidth) {
boolean foundNarrower = false;
- List verticalSeparators = new ArrayList();
+ List verticalSeparators = new ArrayList<>();
for (Ruling r: area.getVerticalRulings()) {
if (r.length() / this.textBounds.getHeight() >= 0.95) {
verticalSeparators.add(toFixed(r.getPosition() - this.areaLeft));
@@ -103,7 +105,7 @@ public float[] findVerticalSeparators(float minColumnWidth) {
public float[] findHorizontalSeparators(float minRowHeight) {
boolean foundShorter = false;
- List horizontalSeparators = new ArrayList();
+ List horizontalSeparators = new ArrayList<>();
for (Ruling r: area.getHorizontalRulings()) {
System.out.println(r.length() / this.textBounds.getWidth());
if (r.length() / this.textBounds.getWidth() >= 0.95) {
@@ -134,7 +136,7 @@ public float[] findHorizontalSeparators(float minRowHeight) {
}
private static List findSeparatorsFromProjection(float[] derivative) {
- List separators = new ArrayList();
+ List separators = new ArrayList<>();
Integer lastNeg = null;
float s;
boolean positiveSlope = false;
@@ -165,7 +167,7 @@ public static float[] smooth(float[] data, int kernelSize) {
+ kernelSize / 2, data.length); j++) {
s += data[j];
}
- rv[i] = (float) Math.floor(s / (float) kernelSize);
+ rv[i] = (float) Math.floor(s / kernelSize);
}
}
return rv;
@@ -211,7 +213,7 @@ private static int toFixed(double value) {
}
private static double toDouble(int value) {
- return (double) value / Math.pow(10, DECIMAL_PLACES);
+ return value / Math.pow(10, DECIMAL_PLACES);
}
}
diff --git a/src/main/java/technology/tabula/QuickSort.java b/src/main/java/technology/tabula/QuickSort.java
index 21d26dd5..03388a15 100644
--- a/src/main/java/technology/tabula/QuickSort.java
+++ b/src/main/java/technology/tabula/QuickSort.java
@@ -16,94 +16,97 @@
*/
package technology.tabula;
+import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
+import java.util.RandomAccess;
import java.util.Stack;
/**
- * see http://de.wikipedia.org/wiki/Quicksort.
+ * An implementation of Quicksort.
+ *
+ * @see <a href="http://de.wikipedia.org/wiki/Quicksort">wikipedia</a>
*
* @author UWe Pachler
*/
-public class QuickSort
-{
-
- private QuickSort()
- {
- }
-
- private static final Comparator extends Comparable> objComp = new Comparator()
- {
- public int compare(Comparable object1, Comparable object2)
- {
- return object1.compareTo(object2);
- }
- };
+public final class QuickSort {
+
+ private QuickSort() {
+ // utility class: not meant to be instantiated
+ }
+
+ /**
+ * Sorts the given list according to natural order.
+ */
+ public static > void sort(List list) {
+ sort(list, QuickSort.naturalOrder()); // JAVA_8 replace with Comparator.naturalOrder() (and cleanup)
+ }
+
+ /**
+ * Sorts the given list using the given comparator.
+ */
+ public static void sort(List list, Comparator super T> comparator) {
+ if (list instanceof RandomAccess) {
+ quicksort(list, comparator);
+ } else {
+ List copy = new ArrayList<>(list);
+ quicksort(copy, comparator);
+ list.clear();
+ list.addAll(copy);
+ }
+ }
- /**
- * Sorts the given list using the given comparator.
- */
- public static void sort(List list, Comparator cmp)
- {
- quicksort(list, cmp);
- }
+ private static void quicksort(List list, Comparator super T> cmp) {
+ Stack stack = new Stack<>();
+ stack.push(0);
+ stack.push(list.size());
+ while (!stack.isEmpty()) {
+ int right = stack.pop();
+ int left = stack.pop();
+
+ if (right - left < 2) continue;
+ int p = left + ((right - left) / 2);
+ p = partition(list, cmp, p, left, right);
- /**
- * Sorts the given list using compareTo as comparator.
- */
- public static void sort(List list)
- {
- sort(list, (Comparator) objComp);
- }
+ stack.push(p + 1);
+ stack.push(right);
- private static void quicksort(List list, Comparator cmp)
- {
- Stack stack = new Stack();
- stack.push(0);
- stack.push(list.size());
- while (!stack.isEmpty()) {
- int right = stack.pop();
- int left = stack.pop();
- if (right - left < 2) continue;
- int p = left + ((right-left)/2);
- p = partition(list, cmp, p, left, right);
-
- stack.push(p+1);
- stack.push(right);
+ stack.push(left);
+ stack.push(p);
+ }
+ }
- stack.push(left);
- stack.push(p);
+ private static int partition(List list, Comparator super T> cmp, int p, int start, int end) {
+ int l = start;
+ int h = end - 2;
+ T piv = list.get(p);
+ swap(list, p, end - 1);
- }
- }
-
- private static int partition(List list, Comparator cmp, int p, int start, int end) {
- int l = start;
- int h = end - 2;
- T piv = list.get(p);
- swap(list,p,end-1);
+ while (l < h) {
+ if (cmp.compare(list.get(l), piv) <= 0) l++;
+ else if (cmp.compare(piv, list.get(h)) <= 0) h--;
+ else swap(list, l, h);
+ }
+ int idx = h;
+ if (cmp.compare(list.get(h), piv) < 0) idx++;
+ swap(list, end - 1, idx);
+ return idx;
+ }
- while (l < h) {
- if (cmp.compare(list.get(l), piv) <= 0) {
- l++;
- } else if (cmp.compare(piv, list.get(h)) <= 0) {
- h--;
- } else {
- swap(list,l,h);
- }
- }
- int idx = h;
- if (cmp.compare(list.get(h), piv) < 0) idx++;
- swap(list,end-1,idx);
- return idx;
- }
-
+ private static void swap(List list, int i, int j) {
+ T tmp = list.get(i);
+ list.set(i, list.get(j));
+ list.set(j, tmp);
+ }
- private static void swap(List list, int i, int j)
- {
- T tmp = list.get(i);
- list.set(i, list.get(j));
- list.set(j, tmp);
- }
+ @SuppressWarnings({ "rawtypes", "unchecked" })
+ private static final Comparator NATURAL_ORDER = new Comparator() {
+ @Override public int compare(Object l, Object r) { return ((Comparable) l).compareTo(r); }
+ };
+
+ @SuppressWarnings("unchecked")
+ private static > Comparator naturalOrder() {
+ return NATURAL_ORDER;
+ }
}
diff --git a/src/main/java/technology/tabula/Rectangle.java b/src/main/java/technology/tabula/Rectangle.java
index 41b79374..b96fcd77 100644
--- a/src/main/java/technology/tabula/Rectangle.java
+++ b/src/main/java/technology/tabula/Rectangle.java
@@ -2,171 +2,177 @@
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
+import java.util.Comparator;
import java.util.List;
+import java.util.Locale;
@SuppressWarnings("serial")
-public class Rectangle extends Rectangle2D.Float implements Comparable {
-
- protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;
-
- public Rectangle() {
- super();
- }
-
- public Rectangle(float top, float left, float width, float height) {
- super();
- this.setRect(left, top, width, height);
- }
-
- @Override
- public int compareTo(Rectangle other) {
- double thisBottom = this.getBottom();
- double otherBottom = other.getBottom();
- int rv;
-
- if (this.equals(other)) return 0;
-
- if (this.verticalOverlap(other) > VERTICAL_COMPARISON_THRESHOLD) {
- rv = java.lang.Double.compare(this.getX(), other.getX());
- }
- else {
- rv = java.lang.Double.compare(thisBottom, otherBottom);
- }
- return rv;
- }
-
- // I'm bad at Java and need this for fancy sorting in technology.tabula.TextChunk.
- public int isLtrDominant(){
- return 0;
- }
-
-
- public float getArea() {
- return this.width * this.height;
- }
-
- public float verticalOverlap(Rectangle other) {
- return (float) Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
- }
-
- public boolean verticallyOverlaps(Rectangle other) {
- return verticalOverlap(other) > 0;
- }
-
- public float horizontalOverlap(Rectangle other) {
- return (float) Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
- }
-
- public boolean horizontallyOverlaps(Rectangle other) {
- return horizontalOverlap(other) > 0;
- }
-
- public float verticalOverlapRatio(Rectangle other) {
- float rv = 0,
- delta = (float) Math.min(this.getBottom() - this.getTop(), other.getBottom() - other.getTop());
-
- if (other.getTop() <= this.getTop() && this.getTop() <= other.getBottom() && other.getBottom() <= this.getBottom()) {
- rv = (float) ((other.getBottom() - this.getTop()) / delta);
- }
- else if (this.getTop() <= other.getTop() && other.getTop() <= this.getBottom() && this.getBottom() <= other.getBottom()) {
- rv = (float) ((this.getBottom() - other.getTop()) / delta);
- }
- else if (this.getTop() <= other.getTop() && other.getTop() <= other.getBottom() && other.getBottom() <= this.getBottom()) {
- rv = (float) ((other.getBottom() - other.getTop()) / delta);
- }
- else if (other.getTop() <= this.getTop() && this.getTop() <= this.getBottom() && this.getBottom() <= other.getBottom()) {
- rv = (float) ((this.getBottom() - this.getTop()) / delta);
- }
-
- return rv;
-
- }
-
- public float overlapRatio(Rectangle other) {
- double intersectionWidth = Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
- double intersectionHeight = Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
- double intersectionArea = Math.max(0, intersectionWidth * intersectionHeight);
- double unionArea = this.getArea() + other.getArea() - intersectionArea;
-
- return (float) (intersectionArea / unionArea);
- }
-
- public Rectangle merge(Rectangle other) {
- this.setRect(this.createUnion(other));
- return this;
- }
-
- public float getTop() {
- return (float) this.getMinY();
- }
-
- public void setTop(float top) {
- float deltaHeight = top - this.y;
- this.setRect(this.x, top, this.width, this.height - deltaHeight);
- }
-
- public float getRight() {
- return (float) this.getMaxX();
- }
-
- public void setRight(float right) {
- this.setRect(this.x, this.y, right - this.x, this.height);
- }
-
- public float getLeft() {
- return (float) this.getMinX();
- }
-
- public void setLeft(float left) {
- float deltaWidth = left - this.x;
- this.setRect(left, this.y, this.width - deltaWidth, this.height);
- }
-
- public float getBottom() {
- return (float) this.getMaxY();
- }
-
- public void setBottom(float bottom) {
- this.setRect(this.x, this.y, this.width, bottom - this.y);
- }
-
- public Point2D[] getPoints() {
- return new Point2D[] {
- new Point2D.Float((float) this.getLeft(), (float) this.getTop()),
- new Point2D.Float((float) this.getRight(), (float) this.getTop()),
- new Point2D.Float((float) this.getRight(), (float) this.getBottom()),
- new Point2D.Float((float) this.getLeft(), (float) this.getBottom())
- };
- }
-
- @Override
- public String toString() {
- StringBuilder sb = new StringBuilder();
- String s = super.toString();
- sb.append(s.substring(0, s.length() - 1));
- sb.append(String.format(",bottom=%f,right=%f]", this.getBottom(), this.getRight()));
- return sb.toString();
- }
-
-
- /**
- * @param rectangles
- * @return minimum bounding box that contains all the rectangles
- */
- public static Rectangle boundingBoxOf(List extends Rectangle> rectangles) {
- float minx = java.lang.Float.MAX_VALUE;
- float miny = java.lang.Float.MAX_VALUE;
- float maxx = java.lang.Float.MIN_VALUE;
- float maxy = java.lang.Float.MIN_VALUE;
-
- for (Rectangle r: rectangles) {
- minx = (float) Math.min(r.getMinX(), minx);
- miny = (float) Math.min(r.getMinY(), miny);
- maxx = (float) Math.max(r.getMaxX(), maxx);
- maxy = (float) Math.max(r.getMaxY(), maxy);
- }
- return new Rectangle(miny, minx, maxx - minx, maxy - miny);
- }
-
+public class Rectangle extends Rectangle2D.Float {
+
+ /**
+ * Ill-defined comparator, from when Rectangle was Comparable.
+ *
+ * @see <a href="https://github.com/tabulapdf/tabula-java/pull/116">PR 116</a>
+ * @deprecated with no replacement
+ */
+ @Deprecated
+ public static final Comparator ILL_DEFINED_ORDER = new Comparator() {
+ @Override public int compare(Rectangle o1, Rectangle o2) {
+ if (o1.equals(o2)) return 0;
+ if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD) {
+ return o1.isLtrDominant() == -1 && o2.isLtrDominant() == -1
+ ? - java.lang.Double.compare(o1.getX(), o2.getX())
+ : java.lang.Double.compare(o1.getX(), o2.getX());
+ } else {
+ return java.lang.Float.compare(o1.getBottom(), o2.getBottom());
+ }
+ }
+ };
+
+ protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;
+
+ public Rectangle() {
+ super();
+ }
+
+ public Rectangle(float top, float left, float width, float height) {
+ super();
+ this.setRect(left, top, width, height);
+ }
+
+ public int compareTo(Rectangle other) {
+ return ILL_DEFINED_ORDER.compare(this, other);
+ }
+
+ // Hook consulted by ILL_DEFINED_ORDER and needed for sorting in
+ // technology.tabula.TextChunk; this base implementation always returns 0.
+ public int isLtrDominant() {
+ return 0;
+ }
+
+ public float getArea() {
+ return this.width * this.height;
+ }
+
+ public float verticalOverlap(Rectangle other) {
+ return Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
+ }
+
+ public boolean verticallyOverlaps(Rectangle other) {
+ return verticalOverlap(other) > 0;
+ }
+
+ public float horizontalOverlap(Rectangle other) {
+ return Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
+ }
+
+ public boolean horizontallyOverlaps(Rectangle other) {
+ return horizontalOverlap(other) > 0;
+ }
+
+ public float verticalOverlapRatio(Rectangle other) {
+ float rv = 0, delta = Math.min(this.getBottom() - this.getTop(), other.getBottom() - other.getTop());
+
+ if (other.getTop() <= this.getTop() && this.getTop() <= other.getBottom()
+ && other.getBottom() <= this.getBottom()) {
+ rv = (other.getBottom() - this.getTop()) / delta;
+ } else if (this.getTop() <= other.getTop() && other.getTop() <= this.getBottom()
+ && this.getBottom() <= other.getBottom()) {
+ rv = (this.getBottom() - other.getTop()) / delta;
+ } else if (this.getTop() <= other.getTop() && other.getTop() <= other.getBottom()
+ && other.getBottom() <= this.getBottom()) {
+ rv = (other.getBottom() - other.getTop()) / delta;
+ } else if (other.getTop() <= this.getTop() && this.getTop() <= this.getBottom()
+ && this.getBottom() <= other.getBottom()) {
+ rv = (this.getBottom() - this.getTop()) / delta;
+ }
+
+ return rv;
+
+ }
+
+ public float overlapRatio(Rectangle other) {
+ double intersectionWidth = Math.max(0,
+ Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
+ double intersectionHeight = Math.max(0,
+ Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
+ double intersectionArea = Math.max(0, intersectionWidth * intersectionHeight);
+ double unionArea = this.getArea() + other.getArea() - intersectionArea;
+
+ return (float) (intersectionArea / unionArea);
+ }
+
+ public Rectangle merge(Rectangle other) {
+ this.setRect(this.createUnion(other));
+ return this;
+ }
+
+ public float getTop() {
+ return (float) this.getMinY();
+ }
+
+ public void setTop(float top) {
+ float deltaHeight = top - this.y;
+ this.setRect(this.x, top, this.width, this.height - deltaHeight);
+ }
+
+ public float getRight() {
+ return (float) this.getMaxX();
+ }
+
+ public void setRight(float right) {
+ this.setRect(this.x, this.y, right - this.x, this.height);
+ }
+
+ public float getLeft() {
+ return (float) this.getMinX();
+ }
+
+ public void setLeft(float left) {
+ float deltaWidth = left - this.x;
+ this.setRect(left, this.y, this.width - deltaWidth, this.height);
+ }
+
+ public float getBottom() {
+ return (float) this.getMaxY();
+ }
+
+ public void setBottom(float bottom) {
+ this.setRect(this.x, this.y, this.width, bottom - this.y);
+ }
+
+ public Point2D[] getPoints() {
+ return new Point2D[] { new Point2D.Float(this.getLeft(), this.getTop()),
+ new Point2D.Float(this.getRight(), this.getTop()), new Point2D.Float(this.getRight(), this.getBottom()),
+ new Point2D.Float(this.getLeft(), this.getBottom()) };
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ String s = super.toString();
+ sb.append(s.substring(0, s.length() - 1));
+ sb.append(String.format(Locale.US, ",bottom=%f,right=%f]", this.getBottom(), this.getRight()));
+ return sb.toString();
+ }
+
+ /**
+ * @param rectangles rectangles to enclose (the result is not meaningful for an empty list)
+ * @return minimum bounding box that contains all the rectangles
+ */
+ public static Rectangle boundingBoxOf(List extends Rectangle> rectangles) {
+ float minx = java.lang.Float.MAX_VALUE;
+ float miny = java.lang.Float.MAX_VALUE;
+ float maxx = -java.lang.Float.MAX_VALUE; // not Float.MIN_VALUE: that is the smallest POSITIVE float
+ float maxy = -java.lang.Float.MAX_VALUE; // so rectangles lying entirely at coordinates <= 0 work too
+
+ for (Rectangle r : rectangles) {
+ minx = (float) Math.min(r.getMinX(), minx);
+ miny = (float) Math.min(r.getMinY(), miny);
+ maxx = (float) Math.max(r.getMaxX(), maxx);
+ maxy = (float) Math.max(r.getMaxY(), maxy);
+ }
+ return new Rectangle(miny, minx, maxx - minx, maxy - miny);
+ }
}
diff --git a/src/main/java/technology/tabula/RectangleSpatialIndex.java b/src/main/java/technology/tabula/RectangleSpatialIndex.java
index e3aa633e..0e942545 100644
--- a/src/main/java/technology/tabula/RectangleSpatialIndex.java
+++ b/src/main/java/technology/tabula/RectangleSpatialIndex.java
@@ -1,88 +1,47 @@
package technology.tabula;
-import gnu.trove.procedure.TIntProcedure;
-
import java.util.ArrayList;
-import java.util.Collections;
import java.util.List;
-import net.sf.jsi.SpatialIndex;
-import net.sf.jsi.rtree.RTree;
+import org.locationtech.jts.geom.Envelope;
+import org.locationtech.jts.index.strtree.STRtree;
-class RectangleSpatialIndex {
+public class RectangleSpatialIndex {
- class SaveToListProcedure implements TIntProcedure {
- private List ids = new ArrayList();
- public boolean execute(int id) {
- ids.add(id);
- return true;
- };
-
- private List getIds() {
- return ids;
- }
- };
-
- private final SpatialIndex si;
- private final List rectangles;
- private Rectangle bounds = null;
-
- public RectangleSpatialIndex() {
- si = new RTree();
- si.init(null);
- rectangles = new ArrayList();
- }
-
+ private final STRtree si = new STRtree();
+ private final List rectangles = new ArrayList<>();
+
public void add(T te) {
rectangles.add(te);
- if (bounds == null) {
- bounds = new Rectangle();
- bounds.setRect(te);
- }
- else {
- bounds.merge(te);
- }
- si.add(rectangleToSpatialIndexRectangle(te), rectangles.size() - 1);
+ si.insert(new Envelope(te.getLeft(), te.getRight(), te.getBottom(), te.getTop()), te);
}
public List contains(Rectangle r) {
- SaveToListProcedure proc = new SaveToListProcedure();
- si.contains(rectangleToSpatialIndexRectangle(r), proc);
- ArrayList rv = new ArrayList();
- for (int i : proc.getIds()) {
- rv.add(rectangles.get(i));
+ List intersection = si.query(new Envelope(r.getLeft(), r.getRight(), r.getTop(), r.getBottom()));
+ List rv = new ArrayList();
+
+ for (T ir: intersection) {
+ if (r.contains(ir)) {
+ rv.add(ir);
+ }
}
- Utils.sort(rv);
+
+ Utils.sort(rv, Rectangle.ILL_DEFINED_ORDER);
return rv;
}
public List intersects(Rectangle r) {
- SaveToListProcedure proc = new SaveToListProcedure();
- si.intersects(rectangleToSpatialIndexRectangle(r), proc);
- ArrayList rv = new ArrayList();
- for (int i : proc.getIds()) {
- rv.add(rectangles.get(i));
- }
- Utils.sort(rv);
- return rv;
+ return si.query(new Envelope(r.getLeft(), r.getRight(), r.getTop(), r.getBottom()));
}
- private net.sf.jsi.Rectangle rectangleToSpatialIndexRectangle(Rectangle r) {
- return new net.sf.jsi.Rectangle((float) r.getX(),
- (float) r.getY(),
- (float) (r.getX() + r.getWidth()),
- (float) (r.getY() + r.getHeight()));
- }
-
-
/**
* Minimum bounding box of all the Rectangles contained on this RectangleSpatialIndex
*
* @return a Rectangle
*/
public Rectangle getBounds() {
- return bounds;
+ return Rectangle.boundingBoxOf(rectangles);
}
}
diff --git a/src/main/java/technology/tabula/RectangularTextContainer.java b/src/main/java/technology/tabula/RectangularTextContainer.java
index f9e0036f..934b5f13 100644
--- a/src/main/java/technology/tabula/RectangularTextContainer.java
+++ b/src/main/java/technology/tabula/RectangularTextContainer.java
@@ -1,35 +1,51 @@
package technology.tabula;
+import java.util.ArrayList;
import java.util.List;
@SuppressWarnings("serial")
-public abstract class RectangularTextContainer extends Rectangle {
-
- public RectangularTextContainer(float top, float left, float width, float height) {
- super(top, left, width, height);
- }
-
- public String toString() {
- StringBuilder sb = new StringBuilder();
- String s = super.toString();
- sb.append(s.substring(0, s.length() - 1));
- sb.append(String.format(",text=%s]", this.getText() == null ? "null" : "\"" + this.getText() + "\""));
- return sb.toString();
- }
-
- public RectangularTextContainer merge(RectangularTextContainer other) {
- if (this.compareTo(other) < 0) {
- this.getTextElements().addAll(other.getTextElements());
-
- }
- else {
- this.getTextElements().addAll(0, other.getTextElements());
- }
- super.merge(other);
- return this;
- }
-
- public abstract String getText();
- public abstract String getText(boolean useLineReturns);
- public abstract List getTextElements();
+public class RectangularTextContainer extends Rectangle implements HasText {
+
+ protected List textElements = new ArrayList<>();
+
+ protected RectangularTextContainer(float top, float left, float width, float height) {
+ super(top, left, width, height);
+ }
+
+ public RectangularTextContainer merge(RectangularTextContainer other) {
+ if (compareTo(other) < 0) {
+ this.getTextElements().addAll(other.getTextElements());
+ } else {
+ this.getTextElements().addAll(0, other.getTextElements());
+ }
+ super.merge(other);
+ return this;
+ }
+
+ public List getTextElements() {
+ return textElements;
+ }
+
+ public void setTextElements(List textElements) {
+ this.textElements = textElements;
+ }
+
+ @Override
+ public String getText() {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public String getText(boolean useLineReturns) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override public String toString() {
+ StringBuilder sb = new StringBuilder();
+ String s = super.toString();
+ sb.append(s.substring(0, s.length() - 1));
+ sb.append(String.format(",text=%s]", this.getText() == null ? "null" : "\"" + this.getText() + "\""));
+ return sb.toString();
+ }
+
}
diff --git a/src/main/java/technology/tabula/Ruling.java b/src/main/java/technology/tabula/Ruling.java
index 8d455dfa..213ce87f 100644
--- a/src/main/java/technology/tabula/Ruling.java
+++ b/src/main/java/technology/tabula/Ruling.java
@@ -8,6 +8,7 @@
import java.util.Comparator;
import java.util.Formatter;
import java.util.List;
+import java.util.Locale;
import java.util.Map;
import java.util.TreeMap;
@@ -16,7 +17,7 @@ public class Ruling extends Line2D.Float {
private static int PERPENDICULAR_PIXEL_EXPAND_AMOUNT = 2;
private static int COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT = 1;
- private enum SOType { VERTICAL, HRIGHT, HLEFT };
+ private enum SOType { VERTICAL, HRIGHT, HLEFT }
public Ruling(float top, float left, float width, float height) {
this(new Point2D.Float(left, top), new Point2D.Float(left+width, top+height));
@@ -39,9 +40,6 @@ public void normalize() {
else if (Utils.within(angle, 90, 1) || Utils.within(angle, 270, 1)) { // almost vertical
this.setLine(this.x1, this.y1, this.x1, this.y2);
}
-// else {
-// System.out.println("oblique: " + this + " ("+ this.getAngle() + ")");
-// }
}
public boolean vertical() {
@@ -230,11 +228,6 @@ public boolean equals(Object other) {
return this.getP1().equals(o.getP1()) && this.getP2().equals(o.getP2());
}
- @Override
- public int hashCode() {
- return super.hashCode();
- }
-
public float getTop() {
return this.y1;
}
@@ -291,13 +284,13 @@ public double getAngle() {
public String toString() {
StringBuilder sb = new StringBuilder();
Formatter formatter = new Formatter(sb);
- String rv = formatter.format("%s[x1=%f y1=%f x2=%f y2=%f]", this.getClass().toString(), this.x1, this.y1, this.x2, this.y2).toString();
+ String rv = formatter.format(Locale.US, "%s[x1=%f y1=%f x2=%f y2=%f]", this.getClass().toString(), this.x1, this.y1, this.x2, this.y2).toString();
formatter.close();
return rv;
}
public static List cropRulingsToArea(List rulings, Rectangle2D area) {
- ArrayList rv = new ArrayList();
+ ArrayList rv = new ArrayList<>();
for (Ruling r : rulings) {
if (r.intersects(area)) {
rv.add(r.intersect(area));
@@ -322,15 +315,15 @@ public SortObject(SOType type, float position, Ruling ruling) {
}
}
- List sos = new ArrayList();
+ List sos = new ArrayList<>();
- TreeMap tree = new TreeMap(new Comparator() {
+ TreeMap tree = new TreeMap<>(new Comparator() {
@Override
public int compare(Ruling o1, Ruling o2) {
return java.lang.Double.compare(o1.getTop(), o2.getTop());
}});
- TreeMap rv = new TreeMap(new Comparator() {
+ TreeMap rv = new TreeMap<>(new Comparator() {
@Override
public int compare(Point2D o1, Point2D o2) {
if (o1.getY() > o2.getY()) return 1;
@@ -409,7 +402,7 @@ public static List collapseOrientedRulings(List lines) {
}
public static List collapseOrientedRulings(List lines, int expandAmount) {
- ArrayList rv = new ArrayList();
+ ArrayList rv = new ArrayList<>();
Collections.sort(lines, new Comparator() {
@Override
public int compare(Ruling a, Ruling b) {
diff --git a/src/main/java/technology/tabula/Table.java b/src/main/java/technology/tabula/Table.java
index eda11251..1e73bedf 100644
--- a/src/main/java/technology/tabula/Table.java
+++ b/src/main/java/technology/tabula/Table.java
@@ -8,139 +8,98 @@
@SuppressWarnings("serial")
public class Table extends Rectangle {
-
- class CellPosition implements Comparable {
- int row, col;
- CellPosition(int row, int col) {
- this.row = row; this.col = col;
- }
-
- @Override
- public boolean equals(Object other) {
- if (this == other)
- return true;
- if (!(other instanceof CellPosition))
- return false;
- return other != null && this.row == ((CellPosition) other).row && this.col == ((CellPosition) other).col;
- }
-
- @Override
- public int hashCode() {
- return this.row * 100000 + this.col;
- }
-
- @Override
- public int compareTo(CellPosition other) {
- int rv = 0;
- if(this.row < other.row) {
- rv = -1;
- }
- else if (this.row > other.row) {
- rv = 1;
- }
- else if (this.col > other.col) {
- rv = 1;
- }
- else if (this.col < other.col) {
- rv = -1;
- }
- return rv;
- }
- }
-
- class CellContainer extends TreeMap {
-
- public int maxRow = 0, maxCol = 0;
-
- public RectangularTextContainer get(int row, int col) {
- return this.get(new CellPosition(row, col));
- }
-
- public List getRow(int row) {
- return new ArrayList(this.subMap(new CellPosition(row, 0), new CellPosition(row, maxRow+1)).values());
- }
-
- @Override
- public RectangularTextContainer put(CellPosition cp, RectangularTextContainer value) {
- this.maxRow = Math.max(maxRow, cp.row);
- this.maxCol = Math.max(maxCol, cp.col);
- if (this.containsKey(cp)) { // adding on an existing CellPosition, concatenate content and resize
- value.merge(this.get(cp));
- }
- super.put(cp, value);
- return value;
- }
-
- @Override
- public RectangularTextContainer get(Object key) {
- return this.containsKey(key) ? super.get(key) : TextChunk.EMPTY;
- }
-
- public boolean containsKey(int row, int col) {
- return this.containsKey(new CellPosition(row, col));
- }
-
- }
-
- public static final Table EMPTY = new Table();
-
- CellContainer cellContainer = new CellContainer();
- Page page;
- ExtractionAlgorithm extractionAlgorithm;
- List> rows = null;
-
- public Table() {
- super();
- }
-
- public Table(Page page, ExtractionAlgorithm extractionAlgorithm) {
- this();
- this.page = page;
- this.extractionAlgorithm = extractionAlgorithm;
- }
-
- public void add(RectangularTextContainer tc, int i, int j) {
- this.merge(tc);
- this.cellContainer.put(new CellPosition(i, j), tc);
- this.rows = null; // clear the memoized rows
- }
-
- public List> getRows() {
- if (this.rows != null) {
- return this.rows;
- }
-
- this.rows = new ArrayList>();
- for (int i = 0; i <= this.cellContainer.maxRow; i++) {
- List