diff --git a/docsplit.gemspec b/docsplit.gemspec
index 0a147e9..ff2122a 100755
--- a/docsplit.gemspec
+++ b/docsplit.gemspec
@@ -1,6 +1,6 @@
 Gem::Specification.new do |s|
   s.name = 'docsplit'
-  s.version = '0.7.6' # Keep version in sync with docsplit.rb
+  s.version = '0.8.0.alpha1' # Keep version in sync with docsplit.rb
   s.date = '2014-11-17'
 
   s.homepage = "http://documentcloud.github.com/docsplit/"
diff --git a/lib/docsplit.rb b/lib/docsplit.rb
index 1c49e91..253d1b0 100755
--- a/lib/docsplit.rb
+++ b/lib/docsplit.rb
@@ -5,7 +5,7 @@
 # The Docsplit module delegates to the Java PDF extractors.
 module Docsplit
 
-  VERSION = '0.7.6' # Keep in sync with gemspec.
+  VERSION = '0.8.0.alpha1' # Keep in sync with gemspec.
 
   ESCAPE = lambda {|x| Shellwords.shellescape(x) }
 
@@ -56,7 +56,7 @@ def self.extract_text(pdfs, opts={})
   def self.extract_images(pdfs, opts={})
     pdfs = ensure_pdfs(pdfs)
     opts[:pages] = normalize_value(opts[:pages]) if opts[:pages]
-    ImageExtractor.new.extract(pdfs, opts)
+    PDFShaverExtractor.new.extract(pdfs, opts)
   end
 
   # Use JODCConverter to extract the documents as PDFs.
@@ -101,6 +101,7 @@ def self.normalize_value(value)
 end
 
 require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
+require "#{Docsplit::ROOT}/lib/docsplit/pdfshaver_extractor"
 require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
 require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
 require "#{Docsplit::ROOT}/lib/docsplit/page_extractor"
diff --git a/lib/docsplit/pdfshaver_extractor.rb b/lib/docsplit/pdfshaver_extractor.rb
new file mode 100644
index 0000000..d5111a8
--- /dev/null
+++ b/lib/docsplit/pdfshaver_extractor.rb
@@ -0,0 +1,80 @@
+require 'fileutils'
+require 'pdfshaver'
+
+module Docsplit
+  # Renders PDF pages to image files through the PDFShaver library.
+  class PDFShaverExtractor
+
+    MEMORY_ARGS     = "-limit memory 256MiB -limit map 512MiB"
+    DEFAULT_FORMAT  = :png
+    DEFAULT_DENSITY = '150'
+
+    # Render every requested page of each PDF once per format and size.
+    #
+    # paths   - array of PDF paths (flattened before use).
+    # options - hash read by extract_options (:output, :pages, :format, :size).
+    #
+    # Raises ExtractionFailed if PDFShaver::Document.new raises ArgumentError.
+    def extract(paths, options={})
+      extract_options(options)
+      paths.flatten.each do |pdf_path|
+        begin
+          pdf = PDFShaver::Document.new(pdf_path)
+        rescue ArgumentError => e
+          # Carry the underlying reason along instead of raising a bare error.
+          raise ExtractionFailed, e.message
+        end
+        # The base name depends only on the PDF, so compute it once per file.
+        pdf_name = File.basename(pdf_path, File.extname(pdf_path))
+        pdf.pages(extract_page_list(@pages)).each do |page|
+          @formats.each do |format|
+            @sizes.each do |size_string|
+              directory   = directory_for(size_string)
+              filename    = "#{pdf_name}_#{page.number}.#{format}"
+              destination = File.join(directory, filename)
+              FileUtils.mkdir_p directory
+
+              dimensions = page.extract_dimensions_from_gm_geometry_string(size_string)
+              page.render(destination, dimensions)
+            end
+          end
+        end
+      end
+    end
+
+    private
+    # If there's only one size requested, generate the images directly into
+    # the output directory. Multiple sizes each get a directory of their own.
+    def directory_for(size)
+      path = @sizes.length == 1 ? @output : File.join(@output, size)
+      File.expand_path(path)
+    end
+
+    # Generate the expanded list of requested page numbers.
+    # Accepts a comma-separated spec such as "1,3-5"; returns :all when
+    # no pages were requested, otherwise a sorted array of unique integers.
+    def extract_page_list(pages)
+      return :all if pages.nil?
+      # to_s lets callers that bypass Docsplit.extract_images pass an Integer.
+      pages.to_s.split(',').map { |range|
+        if range.include?('-')
+          bounds = range.split('-')
+          (bounds.first.to_i..bounds.last.to_i).to_a
+        else
+          range.to_i
+        end
+      }.flatten.uniq.sort
+    end
+
+    # Read the supported options into instance variables, applying defaults.
+    def extract_options(options)
+      @output  = options[:output] || '.'
+      @pages   = options[:pages]
+      @density = options[:density] || DEFAULT_DENSITY
+      @formats = [options[:format] || DEFAULT_FORMAT].flatten
+      @sizes   = [options[:size]].flatten.compact
+      @sizes   = [nil] if @sizes.empty?
+      @rolling = !!options[:rolling]
+    end
+  end
+end
diff --git a/test/unit/test_extract_images.rb b/test/unit/test_extract_images.rb
index 2b7ca87..49dda75 100755
--- a/test/unit/test_extract_images.rb
+++ b/test/unit/test_extract_images.rb
@@ -10,23 +10,23 @@ def test_basic_image_extraction
 
   def test_image_formatting
     Docsplit.extract_images('test/fixtures/obama_arts.pdf', :format => [:jpg, :gif], :size => "250x", :output => OUTPUT)
-    assert Dir["#{OUTPUT}/*.gif"].length == 2
-    assert Dir["#{OUTPUT}/*.jpg"].length == 2
+    assert_equal 2, Dir["#{OUTPUT}/*.gif"].length
+    assert_equal 2, Dir["#{OUTPUT}/*.jpg"].length
   end
 
   def test_page_ranges
     Docsplit.extract_images('test/fixtures/obama_arts.pdf', :format => :gif, :size => "50x", :pages => 2, :output => OUTPUT)
-    assert Dir["#{OUTPUT}/*.gif"] == ["#{OUTPUT}/obama_arts_2.gif"]
+    assert_equal ["#{OUTPUT}/obama_arts_2.gif"], Dir["#{OUTPUT}/*.gif"]
   end
 
   def test_image_sizes
     Docsplit.extract_images('test/fixtures/obama_arts.pdf', :format => :gif, :rolling => true, :size => ["150x", "50x"], :output => OUTPUT)
-    assert File.size("#{OUTPUT}/50x/obama_arts_1.gif") < File.size("#{OUTPUT}/150x/obama_arts_1.gif")
+    assert_operator File.size("#{OUTPUT}/50x/obama_arts_1.gif"), :<, File.size("#{OUTPUT}/150x/obama_arts_1.gif")
   end
 
   def test_encrypted_images
     Docsplit.extract_images('test/fixtures/encrypted.pdf', :format => :gif, :size => "50x", :output => OUTPUT)
-    assert File.size("#{OUTPUT}/encrypted_1.gif") > 100
+    assert_operator File.size("#{OUTPUT}/encrypted_1.gif"), :>, 100
   end
 
   def test_password_protected_extraction