From c54ae1782e4b6ff137dfbdff69e02f881da82662 Mon Sep 17 00:00:00 2001 From: Ted Han Date: Fri, 13 Feb 2015 18:12:12 -0600 Subject: [PATCH 1/8] Add a PDFShaver. --- docsplit.gemspec | 2 +- lib/docsplit.rb | 4 +-- lib/docsplit/pdfshaver_extractor.rb | 42 +++++++++++++++++++++++++++++ 3 files changed, 45 insertions(+), 3 deletions(-) create mode 100644 lib/docsplit/pdfshaver_extractor.rb diff --git a/docsplit.gemspec b/docsplit.gemspec index 0a147e9..392687d 100755 --- a/docsplit.gemspec +++ b/docsplit.gemspec @@ -1,6 +1,6 @@ Gem::Specification.new do |s| s.name = 'docsplit' - s.version = '0.7.6' # Keep version in sync with docsplit.rb + s.version = '0.8.0.alpha' # Keep version in sync with docsplit.rb s.date = '2014-11-17' s.homepage = "http://documentcloud.github.com/docsplit/" diff --git a/lib/docsplit.rb b/lib/docsplit.rb index 1c49e91..f7de733 100755 --- a/lib/docsplit.rb +++ b/lib/docsplit.rb @@ -5,7 +5,7 @@ # The Docsplit module delegates to the Java PDF extractors. module Docsplit - VERSION = '0.7.6' # Keep in sync with gemspec. + VERSION = '0.8.0' # Keep in sync with gemspec. ESCAPE = lambda {|x| Shellwords.shellescape(x) } @@ -56,7 +56,7 @@ def self.extract_text(pdfs, opts={}) def self.extract_images(pdfs, opts={}) pdfs = ensure_pdfs(pdfs) opts[:pages] = normalize_value(opts[:pages]) if opts[:pages] - ImageExtractor.new.extract(pdfs, opts) + PDFShaverExtractor.new.extract(pdfs, opts) end # Use JODCConverter to extract the documents as PDFs. diff --git a/lib/docsplit/pdfshaver_extractor.rb b/lib/docsplit/pdfshaver_extractor.rb new file mode 100644 index 0000000..2a33191 --- /dev/null +++ b/lib/docsplit/pdfshaver_extractor.rb @@ -0,0 +1,42 @@ +module Docsplit + class PDShaverExtractor + + + def extract(paths, options={}) + paths.flatten.each |pdf_path| do + pdf = PDFium::Document.new(pdf_path) + pdf.pages.each do |page| + @formats.each do |format| + sizes.each do |size_string| + options = {} + + directory = directory_for(size_string) + pdf_name = File.basename(pdf_path, File.extname(pdf_path)) + filename = "#{pdf_name}_#{page.number}.#{format}" + destination = ESCAPE[File.join(directory, filename)] + + options[:width], options[:height] = extract_size(size_string) + page.render(destination, options) + end + end + end + end + end + + private + def extract_size(size_string) + height = nil + width = nil + + {:height => height, :width => width } + end + + # If there's only one size requested, generate the images directly into + # the output directory. Multiple sizes each get a directory of their own. + def directory_for(size) + path = @sizes.length == 1 ? @output : File.join(@output, size) + File.expand_path(path) + end + + end +end From c392f75229d13e7c8d597bea3e346b19df1658d0 Mon Sep 17 00:00:00 2001 From: Ted Han Date: Fri, 13 Feb 2015 18:13:04 -0600 Subject: [PATCH 2/8] Actually add a PDFShaver. --- lib/docsplit.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/docsplit.rb b/lib/docsplit.rb index f7de733..253d1b0 100755 --- a/lib/docsplit.rb +++ b/lib/docsplit.rb @@ -101,6 +101,7 @@ def self.normalize_value(value) end require "#{Docsplit::ROOT}/lib/docsplit/image_extractor" +require "#{Docsplit::ROOT}/lib/docsplit/pdfshaver_extractor" require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs" require "#{Docsplit::ROOT}/lib/docsplit/text_extractor" require "#{Docsplit::ROOT}/lib/docsplit/page_extractor" From c4decc4038fab7b97210e5b60f2d9599ca394b8c Mon Sep 17 00:00:00 2001 From: Ted Han Date: Mon, 16 Feb 2015 08:01:15 -0600 Subject: [PATCH 3/8] Update tests to add better failure messages. --- test/unit/test_extract_images.rb | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/unit/test_extract_images.rb b/test/unit/test_extract_images.rb index 2b7ca87..9caef77 100755 --- a/test/unit/test_extract_images.rb +++ b/test/unit/test_extract_images.rb @@ -10,23 +10,23 @@ def test_basic_image_extraction def test_image_formatting Docsplit.extract_images('test/fixtures/obama_arts.pdf', :format => [:jpg, :gif], :size => "250x", :output => OUTPUT) - assert Dir["#{OUTPUT}/*.gif"].length == 2 - assert Dir["#{OUTPUT}/*.jpg"].length == 2 + assert_equal 2, Dir["#{OUTPUT}/*.gif"].length + assert_equal 2, Dir["#{OUTPUT}/*.jpg"].length end def test_page_ranges Docsplit.extract_images('test/fixtures/obama_arts.pdf', :format => :gif, :size => "50x", :pages => 2, :output => OUTPUT) - assert Dir["#{OUTPUT}/*.gif"] == ["#{OUTPUT}/obama_arts_2.gif"] + assert_equal Dir["#{OUTPUT}/*.gif"], ["#{OUTPUT}/obama_arts_2.gif"] end def test_image_sizes Docsplit.extract_images('test/fixtures/obama_arts.pdf', :format => :gif, :rolling => true, :size => ["150x", "50x"], :output => OUTPUT) - assert File.size("#{OUTPUT}/50x/obama_arts_1.gif") < File.size("#{OUTPUT}/150x/obama_arts_1.gif") + assert_operator File.size("#{OUTPUT}/50x/obama_arts_1.gif"), :<, File.size("#{OUTPUT}/150x/obama_arts_1.gif") end def test_encrypted_images Docsplit.extract_images('test/fixtures/encrypted.pdf', :format => :gif, :size => "50x", :output => OUTPUT) - assert File.size("#{OUTPUT}/encrypted_1.gif") > 100 + assert_operator File.size("#{OUTPUT}/encrypted_1.gif"), :>, 100 end def test_password_protected_extraction From 39b35f13eff60d7aa1e0c251a18861e8b8e8feaa Mon Sep 17 00:00:00 2001 From: Ted Han Date: Mon, 16 Feb 2015 10:43:58 -0600 Subject: [PATCH 4/8] Round out options for configuring how pages are rendered. --- lib/docsplit/pdfshaver_extractor.rb | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/lib/docsplit/pdfshaver_extractor.rb b/lib/docsplit/pdfshaver_extractor.rb index 2a33191..357aeb1 100644 --- a/lib/docsplit/pdfshaver_extractor.rb +++ b/lib/docsplit/pdfshaver_extractor.rb @@ -1,21 +1,27 @@ +require 'pdfshaver' module Docsplit - class PDShaverExtractor + class PDFShaverExtractor + MEMORY_ARGS = "-limit memory 256MiB -limit map 512MiB" + DEFAULT_FORMAT = :png + DEFAULT_DENSITY = '150' def extract(paths, options={}) - paths.flatten.each |pdf_path| do - pdf = PDFium::Document.new(pdf_path) + extract_options(options) + paths.flatten.each do |pdf_path| + pdf = PDFShaver::Document.new(pdf_path) pdf.pages.each do |page| @formats.each do |format| - sizes.each do |size_string| + @sizes.each do |size_string| options = {} directory = directory_for(size_string) pdf_name = File.basename(pdf_path, File.extname(pdf_path)) filename = "#{pdf_name}_#{page.number}.#{format}" - destination = ESCAPE[File.join(directory, filename)] + destination = File.join(directory, filename) + FileUtils.mkdir_p ESCAPE[directory] - options[:width], options[:height] = extract_size(size_string) + options = options.merge extract_size(size_string) page.render(destination, options) end end @@ -38,5 +44,14 @@ def directory_for(size) File.expand_path(path) end + def extract_options(options) + @output = options[:output] || '.' + @pages = options[:pages] + @density = options[:density] || DEFAULT_DENSITY + @formats = [options[:format] || DEFAULT_FORMAT].flatten + @sizes = [options[:size]].flatten.compact + @sizes = [nil] if @sizes.empty? + @rolling = !!options[:rolling] + end end end From 3c161a47a762ae750e747d5ec4edbf908b2b8c20 Mon Sep 17 00:00:00 2001 From: Ted Han Date: Mon, 16 Feb 2015 16:43:35 -0600 Subject: [PATCH 5/8] Catch PDFShaver errors when opening PDFs and raise ExtractionFailed. --- lib/docsplit/pdfshaver_extractor.rb | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lib/docsplit/pdfshaver_extractor.rb b/lib/docsplit/pdfshaver_extractor.rb index 357aeb1..9d7cf5c 100644 --- a/lib/docsplit/pdfshaver_extractor.rb +++ b/lib/docsplit/pdfshaver_extractor.rb @@ -9,7 +9,11 @@ class PDFShaverExtractor def extract(paths, options={}) extract_options(options) paths.flatten.each do |pdf_path| - pdf = PDFShaver::Document.new(pdf_path) + begin + pdf = PDFShaver::Document.new(pdf_path) + rescue ArgumentError => e + raise ExtractionFailed + end pdf.pages.each do |page| @formats.each do |format| @sizes.each do |size_string| From 95c7c657ff4fd3d9c40882c42ad12e4e217cda82 Mon Sep 17 00:00:00 2001 From: Ted Han Date: Tue, 17 Feb 2015 18:11:13 -0600 Subject: [PATCH 6/8] Fix order of test arguments --- test/unit/test_extract_images.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/unit/test_extract_images.rb b/test/unit/test_extract_images.rb index 9caef77..49dda75 100755 --- a/test/unit/test_extract_images.rb +++ b/test/unit/test_extract_images.rb @@ -16,7 +16,7 @@ def test_image_formatting def test_page_ranges Docsplit.extract_images('test/fixtures/obama_arts.pdf', :format => :gif, :size => "50x", :pages => 2, :output => OUTPUT) - assert_equal Dir["#{OUTPUT}/*.gif"], ["#{OUTPUT}/obama_arts_2.gif"] + assert_equal ["#{OUTPUT}/obama_arts_2.gif"], Dir["#{OUTPUT}/*.gif"] end def test_image_sizes From baa6e2b47f1df377dbb5c73b0912264cefcc1c97 Mon Sep 17 00:00:00 2001 From: Ted Han Date: Tue, 17 Feb 2015 18:11:58 -0600 Subject: [PATCH 7/8] steal page list functionality from ImageExtractor, delegate geometry parsing to PDFShaver. --- lib/docsplit/pdfshaver_extractor.rb | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/lib/docsplit/pdfshaver_extractor.rb b/lib/docsplit/pdfshaver_extractor.rb index 9d7cf5c..fd03680 100644 --- a/lib/docsplit/pdfshaver_extractor.rb +++ b/lib/docsplit/pdfshaver_extractor.rb @@ -14,19 +14,17 @@ def extract(paths, options={}) rescue ArgumentError => e raise ExtractionFailed end - pdf.pages.each do |page| + pdf.pages(extract_page_list(@pages)).each do |page| @formats.each do |format| @sizes.each do |size_string| - options = {} - directory = directory_for(size_string) pdf_name = File.basename(pdf_path, File.extname(pdf_path)) filename = "#{pdf_name}_#{page.number}.#{format}" destination = File.join(directory, filename) FileUtils.mkdir_p ESCAPE[directory] - options = options.merge extract_size(size_string) - page.render(destination, options) + dimensions = page.extract_dimensions_from_gm_geometry_string(size_string) + page.render(destination, dimensions) end end end @@ -34,13 +32,6 @@ def extract(paths, options={}) end private - def extract_size(size_string) - height = nil - width = nil - - {:height => height, :width => width } - end - # If there's only one size requested, generate the images directly into # the output directory. Multiple sizes each get a directory of their own. def directory_for(size) @@ -48,6 +39,19 @@ def directory_for(size) File.expand_path(path) end + # Generate the expanded list of requested page numbers. + def extract_page_list(pages) + return :all if pages.nil? + pages.split(',').map { |range| + if range.include?('-') + range = range.split('-') + Range.new(range.first.to_i, range.last.to_i).to_a.map {|n| n.to_i } + else + range.to_i + end + }.flatten.uniq.sort + end + def extract_options(options) @output = options[:output] || '.' @pages = options[:pages] From 434fb296d7d5b984ea22399d19b539ee533a04c7 Mon Sep 17 00:00:00 2001 From: Ted Han Date: Wed, 18 Feb 2015 07:04:45 -0600 Subject: [PATCH 8/8] Don't escape the per size directory name --- docsplit.gemspec | 2 +- lib/docsplit/pdfshaver_extractor.rb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docsplit.gemspec b/docsplit.gemspec index 392687d..ff2122a 100755 --- a/docsplit.gemspec +++ b/docsplit.gemspec @@ -1,6 +1,6 @@ Gem::Specification.new do |s| s.name = 'docsplit' - s.version = '0.8.0.alpha' # Keep version in sync with docsplit.rb + s.version = '0.8.0.alpha1' # Keep version in sync with docsplit.rb s.date = '2014-11-17' s.homepage = "http://documentcloud.github.com/docsplit/" diff --git a/lib/docsplit/pdfshaver_extractor.rb b/lib/docsplit/pdfshaver_extractor.rb index fd03680..d5111a8 100644 --- a/lib/docsplit/pdfshaver_extractor.rb +++ b/lib/docsplit/pdfshaver_extractor.rb @@ -21,7 +21,7 @@ def extract(paths, options={}) pdf_name = File.basename(pdf_path, File.extname(pdf_path)) filename = "#{pdf_name}_#{page.number}.#{format}" destination = File.join(directory, filename) - FileUtils.mkdir_p ESCAPE[directory] + FileUtils.mkdir_p directory dimensions = page.extract_dimensions_from_gm_geometry_string(size_string) page.render(destination, dimensions)