documentcloud · narutosanjiv · Jul 14, 2014 · Jul 25, 2014
diff --git a/README b/README
@@ -14,9 +14,15 @@
  Installation:
  gem install docsplit

+  Additional options:
+    pdf_opts: a string of command-line options that docsplit passes through to the pdftotext binary
+    For example:
+      Passing the -raw option to pdftotext:
+        Docsplit.extract_text(path, {:pdf_opts => '-raw'})
+
  For documentation, usage, and examples, see:
  http://documentcloud.github.com/docsplit/

  To suggest a feature or report a bug: 
  http://github.com/documentcloud/docsplit/issues/
-
+  
diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb
@@ -103,14 +103,22 @@ def run(command)
    # Extract the full contents of a pdf as a single file, directly.
    def extract_full(pdf)
      text_path = File.join(@output, "#{@pdf_name}.txt")
-      run "pdftotext -enc UTF-8 #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
+      # Raw pdftotext flags from the :pdf_opts option; blank when none were given.
+      # NOTE: they are interpolated unescaped, so they must not be untrusted input.
+      opts = @pdf_txt_opts.to_s.strip
+      opts += ' ' unless opts.empty?
+      run "pdftotext -enc UTF-8 #{opts}#{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
    end

    # Extract the contents of a single page of text, directly, adding it to
    # the `@pages_to_ocr` list if the text length is inadequate.
    def extract_page(pdf, page)
      text_path = File.join(@output, "#{@pdf_name}_#{page}.txt")
-      run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
+      # Raw pdftotext flags from the :pdf_opts option; blank when none were given.
+      # NOTE: they are interpolated unescaped, so they must not be untrusted input.
+      opts = @pdf_txt_opts.to_s.strip
+      opts += ' ' unless opts.empty?
+      run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{opts}#{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
      unless @forbid_ocr
        @pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE
      end
@@ -123,8 +131,9 @@ def extract_options(options)
      @forbid_ocr = options[:ocr] == false
      @clean_ocr  = !(options[:clean] == false)
      @language   = options[:language] || 'eng'
+      @pdf_txt_opts = options[:pdf_opts] || '' 
    end

  end

-end
+end
diff --git a/test/unit/test_extract_text.rb b/test/unit/test_extract_text.rb
@@ -53,5 +53,9 @@ def test_name_escaping_while_extracting_text
    Docsplit.extract_text('test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf', :pages => 'all', :output => OUTPUT)
    assert Dir["#{OUTPUT}/*.txt"].length == 2
  end
-
+
+  def test_name_escaping_while_extracting_text_with_pdf_opts
+    Docsplit.extract_text('test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf', :pages => 'all', :output => OUTPUT, :pdf_opts => '-raw')
+    assert_equal 2, Dir["#{OUTPUT}/*.txt"].length # same file count as the run without :pdf_opts
+  end
 end