diff --git a/LICENSE b/LICENSE
index 38e96bf..76383ba 100755
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,7 @@
 JODConverter ius licensed under the LGPL: gnu.org/licenses/lgpl.html
 
-Copyright (c) 2009 Jeremy Ashkenas, DocumentCloud
+Copyright (c) 2009-2011 Jeremy Ashkenas, DocumentCloud
+Copyright (c) 2011-2013 Jeremy Ashkenas, DocumentCloud and Investigative Reporters & Editors
 
 Permission is hereby granted, free of charge, to any person
 obtaining a copy of this software and associated documentation
diff --git a/README b/README
index 34ce202..81374da 100755
--- a/README
+++ b/README
@@ -15,7 +15,7 @@
   gem install docsplit
   
   For documentation, usage, and examples, see:
-  http://documentcloud.github.com/docsplit/
+  https://documentcloud.github.io/docsplit/
   
   To suggest a feature or report a bug: 
   http://github.com/documentcloud/docsplit/issues/
diff --git a/Rakefile b/Rakefile
index e6c5153..9c0f394 100755
--- a/Rakefile
+++ b/Rakefile
@@ -3,7 +3,7 @@ require 'rake/testtask'
 
 desc 'Run all tests'
 task :test do
-  require 'test/unit'
+  require 'minitest/autorun'
   Dir['./test/*/**/test_*.rb'].each {|test| require test }
 end
 
diff --git a/docsplit.gemspec b/docsplit.gemspec
index f5abcc8..0a147e9 100755
--- a/docsplit.gemspec
+++ b/docsplit.gemspec
@@ -1,7 +1,7 @@
 Gem::Specification.new do |s|
   s.name      = 'docsplit'
-  s.version   = '0.7.0'         # Keep version in sync with docsplit.rb
-  s.date      = '2013-02-21'
+  s.version   = '0.7.6'         # Keep version in sync with docsplit.rb
+  s.date      = '2014-11-17'
 
   s.homepage    = "http://documentcloud.github.com/docsplit/"
   s.summary     = "Break Apart Documents into Images, Text, Pages and PDFs"
@@ -15,6 +15,7 @@ Gem::Specification.new do |s|
   s.authors           = ['Jeremy Ashkenas', 'Samuel Clay', 'Ted Han']
   s.email             = 'opensource@documentcloud.org'
   s.rubyforge_project = 'docsplit'
+  s.license           = 'MIT'
 
   s.require_paths     = ['lib']
   s.executables       = ['docsplit']
diff --git a/index.html b/index.html
index 97aaf16..ccbcb95 100755
--- a/index.html
+++ b/index.html
@@ -87,7 +87,7 @@
 
   <div class="container">
 
-    <h1>Doc<sub style="font-size:150%;">&#9889;</sub>split</h1>
+    <h1>Doc<sub style=""><img style="width:24pt" src="noto_bolt.svg"></sub>split</h1>
 
     <p>
       <a href="http://github.com/documentcloud/docsplit/">Docsplit</a>
@@ -98,7 +98,7 @@ <h1>Doc<sub style="font-size:150%;">&#9889;</sub>split</h1>
       (title, author, number of pages...)
     </p>
 
-    <p>Docsplit is currently at <a href="http://rubygems.org/gems/docsplit">version 0.7.0</a>.</p>
+    <p>Docsplit is currently at <a href="http://rubygems.org/gems/docsplit">version 0.7.6</a>.</p>
 
     <p>
       <i>Docsplit is an open-source component of <a href="http://documentcloud.org/">DocumentCloud</a>.</i>
@@ -149,7 +149,7 @@ <h2 id="installation">Installation &amp; Dependencies</h2>
         (Optional) Install <a href="http://www.accesspdf.com/pdftk/">pdftk</a>.
         On Linux, use <b>aptitude</b>, <b>apt-get</b> or <b>yum</b>:<br />
         <tt>aptitude install pdftk</tt><br />
-        On the Mac, you can <a href="http://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/">download a recent installer</a> for the binary.
+        On the Mac, you can <a href="https://www.pdflabs.com/tools/pdftk-server/">download a recent installer</a> for the binary.
         Without <b>pdftk</b> installed, you can use Docsplit, but won't be able
         to split apart a multi-page PDF into single-page PDFs.
       </li>
@@ -159,6 +159,12 @@ <h2 id="installation">Installation &amp; Dependencies</h2>
         <tt>aptitude install libreoffice</tt><br />
         On the Mac, download and install <a href="http://www.libreoffice.org/download">the latest release</a>.
       </li>
+      <li>
+        (Optional) Install fonts to process documents that use <a href="https://help.ubuntu.com/community/Fonts#Chinese.2C_Japanese.2C_and_Korean_Fonts">Chinese, Japanese, and Korean Fonts</a>.
+        On Linux, use <b>aptitude</b>, <b>apt-get</b> or <b>yum</b>:<br />
+        <tt>aptitude install ttf-wqy-microhei ttf-wqy-zenhei ttf-kochi-gothic ttf-kochi-mincho fonts-nanum</tt><br />
+        On the Mac, the fonts should already be present. However you can always download the TTF files and install them using <a href="http://support.apple.com/en-us/HT201749">Font Book</a>.
+      </li>
     </ol>
 
     <p><i>
@@ -183,7 +189,7 @@ <h2 id="usage">Usage</h2>
       and format. Pass <tt>--pages</tt> or <tt>-p</tt> to choose the specific pages to
       image. Passing<br /> <tt>--size</tt> or <tt>-s</tt> will specify the desired
       image resolution, <tt>--density</tt> or <tt>-d</tt> will specify the DPI to rasterize the images
-      at during conversion by GraphicsMagick, and <tt>--format</tt> or <tt>-f</tt> 
+      at during conversion by GraphicsMagick, and <tt>--format</tt> or <tt>-f</tt>
       will select the format of the final images.
     </p>
 <pre>
@@ -193,7 +199,7 @@ <h2 id="usage">Usage</h2>
 Docsplit.extract_images('example.doc', :size => '1000x', :format => [:png, :jpg])</pre>
 
     <p class="break">
-      <b class="header">text</b><code>--pages --ocr --no-ocr --no-clean</code>
+      <b class="header">text</b><code>--pages --ocr --no-ocr --no-clean --language --no-orientation-detection</code>
       <span class="alias">Ruby: <b>extract_text</b></span>
       <br />
       Extract the complete <b>UTF-8</b>-encoded plain text of a document to a
@@ -201,12 +207,22 @@ <h2 id="usage">Usage</h2>
       pass <tt>--pages all</tt>. You can use the <tt>--ocr</tt> and <tt>--no-ocr</tt>
       flags to force OCR, or disable it, respectively. By default (if Tesseract is installed)
       Docsplit will OCR the text of each page for which it fails to extract text
-      directly from the document. Docsplit will also attempt to clean up garbage 
+      directly from the document. Docsplit will also attempt to clean up garbage
       characters in the OCR'd text &mdash; to disable this, pass the
       <tt>--no-clean</tt> flag.
     </p>
+    <p>
+      By default Tesseract ships only with English extraction data.
+      If <a href="https://code.google.com/p/tesseract-ocr/downloads/list">
+      any additional language models</a> are installed you can select one using
+      the <tt>--language</tt> flag.
+      
+      If <a href="https://code.google.com/p/tesseract-ocr/downloads/detail?name=tesseract-ocr-3.01.osd.tar.gz&amp;can=2&amp;q=">
+      Tesseract's orientation detection model</a> is installed, Docsplit will automatically use it
+      unless you tell it not to with the <tt>--no-orientation-detection</tt> flag.
+    </p>
 <pre>
-docsplit text path/to/doc.pdf --pages all</pre>
+docsplit text path/to/doc.pdf --pages all --language deu</pre>
 <pre>
 docs = Dir['storage/originals/*.doc']
 Docsplit.extract_text(docs, :ocr => false, :output => 'storage/text')</pre>
@@ -262,7 +278,7 @@ <h2 id="internals">Internals</h2>
       <a href="http://poppler.freedesktop.org/">Poppler</a>,
       <a href="http://www.accesspdf.com/pdftk/">PDFTK</a>,
       <a href="http://code.google.com/p/tesseract-ocr/">Tesseract</a>, and
-      <a href="http://www.libreoffice.org/">LibreOffice</a> libraries. 
+      <a href="http://www.libreoffice.org/">LibreOffice</a> libraries.
       Poppler is used to extract text and metadata from PDF documents,
       PDFTK is used to split them apart into pages, and GraphicsMagick is used to generate
       the page images (internally, it's rendering them with
@@ -281,89 +297,107 @@ <h2 id="internals">Internals</h2>
     </p>
 
     <h2 id="changes">Change Log</h2>
-    
+
+    <p>
+      <b class="header">0.7.6</b><small> &ndash; Nov. 16, 2014</small><br />
+      Docsplit will now automatically use Tesseract's orientation detection model
+      if it is installed.
+    </p>
+
+    <p>
+      <b class="header">0.7.5</b><small> &ndash; May 28, 2014</small><br />
+      Docsplit will detect PDFs regardless of extension using magic number-based
+      detection.
+    </p>
+
+    <p>
+      <b class="header">0.7.2</b><small> &ndash; Feb. 23, 2013</small><br />
+      Bug fixes for LibreOffice support.
+    </p>
+
     <p>
-      <b class="header">0.7.0</b><small> &ndash; Feb. 21, 2013</small><br />
+      <b class="header">0.7.0</b><small> &ndash; Feb. 23, 2013</small><br />
       Docsplit now expresses a preference for LibreOffice over OpenOffice, with
-      an eye to removing JODConverter and OpenOffice support in future versions.
+      an eye to removing JODConverter and OpenOffice support in future versions
+      (direct LibreOffice support is substantially faster than JODConverter).
       Improved unicode support now correctly collects non-ascii characters from
       pdfinfo.
     </p>
-    
+
     <p>
       <b class="header">0.6.4</b><small> &ndash; Nov. 12, 2012</small><br />
       Added a language flag for the Docsplit commandline, fixed several bugs,
       and began preparations for the deprecation of pdftk.
     </p>
-    
+
     <p>
       <b class="header">0.6.2</b><small> &ndash; Nov. 22, 2011</small><br />
       Bugfix to escape document names during file type detection.
     </p>
-    
+
     <p>
       <b class="header">0.6.1</b><small> &ndash; Nov. 18, 2011</small><br />
       Docsplit now supports converting documents using LibreOffice
       as well as OpenOffice, through JODConverter 3.0 beta4.
     </p>
-    
+
     <p>
       <b class="header">0.6.0</b><small> &ndash; Sept. 13, 2011</small><br />
-      Docsplit should now handle shelling out for documents with arbitrary 
-      characters in their filenames correctly, thanks to a series of 
+      Docsplit should now handle shelling out for documents with arbitrary
+      characters in their filenames correctly, thanks to a series of
       epic patches from Vladimir Rybas.
-      A <tt>--density</tt> option was added for specifying the resolution of 
+      A <tt>--density</tt> option was added for specifying the resolution of
       rasterization when generating images from documents.
       The image resolution for OCR has been doubled from 200 to 400 DPI &mdash;
-      this shouldn't make a noticeable difference for normal docs, but will make 
+      this shouldn't make a noticeable difference for normal docs, but will make
       a world of difference for the fine print.
       Docsplit now uses GraphicsMagick's <tt>--despeckle</tt> before OCR.
     </p>
-    
+
     <p>
       <b class="header">0.5.2</b><small> &ndash; May 13, 2011</small><br />
       For transparent conversion to PDF, made Docsplit prefer GraphicsMagick
       over OpenOffice, when the file format is one that GraphicsMagick is able
       to read: (png, gif, jpg, jpeg, tif, tiff, bmp, pnm, ppm, svg, eps).
     </p>
-    
+
     <p>
       <b class="header">0.5.1</b><small> &ndash; April 26, 2011</small><br />
       Minor tweaks to the <tt>TextCleaner</tt> to be more lenient about acryonms
       with hyphens, and words with four vowels in a row.
     </p>
-    
+
     <p>
       <b class="header">0.5.0</b><br />
       Added a <tt>Docsplit::TextCleaner</tt> class which is used to post-process
       OCR'd text, and remove garbage characters that are created when Tesseract
       encounters non-english text. To disable the cleanup, pass <tt>--no-clean</tt>.
     </p>
-    
+
     <p>
       <b class="header">0.4.1</b><br />
       Upgraded the JODConverter dependency for PDF conversion via OpenOffice to
-      3.0 beta. Added PNG, GIF, TIF, JPG, and BMP to the list of supported 
+      3.0 beta. Added PNG, GIF, TIF, JPG, and BMP to the list of supported
       formats.
     </p>
-    
+
     <p>
       <b class="header">0.3.4</b><br />
       Adding a suggested optimization from the GraphicsMagick list -- only ever
       generate one page image per GraphicsMagick call. Saves large amounts of
       disk space for tempfiles on long documents.
     </p>
-    
+
     <p>
       <b class="header">0.3.3</b><br />
       Start using the MAGICK_TMPDIR environment variable to prevent parallel
       Docsplit runs from having the potential to clobber each other's temporary
       image files.
     </p>
-    
+
     <p>
       <b class="header">0.3.1</b><br />
-      Added a memory limit to GraphicsMagick while generating the TIFFs for 
+      Added a memory limit to GraphicsMagick while generating the TIFFs for
       Tesseract OCR -- prevents <tt>gm</tt> from gobbling up all available memory
       on large files.
     </p>
diff --git a/lib/docsplit.rb b/lib/docsplit.rb
index c05b5a0..1c49e91 100755
--- a/lib/docsplit.rb
+++ b/lib/docsplit.rb
@@ -5,7 +5,7 @@
 # The Docsplit module delegates to the Java PDF extractors.
 module Docsplit
 
-  VERSION       = '0.7.0' # Keep in sync with gemspec.
+  VERSION       = '0.7.6' # Keep in sync with gemspec.
 
   ESCAPE        = lambda {|x| Shellwords.shellescape(x) }
 
@@ -16,7 +16,7 @@ module Docsplit
   
   GM_FORMATS    = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]
 
-  DEPENDENCIES  = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false}
+  DEPENDENCIES  = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false, :osd => false}
 
   # Check for all dependencies, and note their absence.
   dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
@@ -29,7 +29,14 @@ module Docsplit
     end
   end
 
-  # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
+  # if tesseract is found check for the osd plugin so that we can do orientation independent OCR.
+  if DEPENDENCIES[:tesseract]
+    # osd will be listed in the output of `tesseract --list-langs`
+    val = %x[ #{'tesseract --list-langs'} 2>&1 >/dev/null ]
+    DEPENDENCIES[:osd] = true if val =~ /\bosd\b/
+  end
+
+  # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
   # broke.
   class ExtractionFailed < StandardError; end
 
diff --git a/lib/docsplit/command_line.rb b/lib/docsplit/command_line.rb
index 7c7af08..626fb02 100755
--- a/lib/docsplit/command_line.rb
+++ b/lib/docsplit/command_line.rb
@@ -97,6 +97,9 @@ def parse_options
         opts.on('-l', '--language [LANGUAGE]', 'set the language (ISO 639-2/T code) for text extraction') do |l|
           @options[:language] = l
         end
+        opts.on('--no-orientation-detection', 'turn off automatic orientation detection in tesseract') do |n|
+          @options[:detect_orientation] = false
+        end
         opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
           @options[:rolling] = true
         end
diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb
index 8c29bbc..8bc4d1d 100755
--- a/lib/docsplit/image_extractor.rb
+++ b/lib/docsplit/image_extractor.rb
@@ -33,7 +33,7 @@ def convert(pdf, size, format, previous=nil)
       directory = directory_for(size)
       pages     = @pages || '1-' + Docsplit.extract_length(pdf).to_s
       escaped_pdf = ESCAPE[pdf]
-      FileUtils.mkdir_p(directory) unless File.exists?(directory)
+      FileUtils.mkdir_p(directory) unless File.exist?(directory)
       common    = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
       if previous
         FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
@@ -48,7 +48,7 @@ def convert(pdf, size, format, previous=nil)
         end
       end
     ensure
-      FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
+      FileUtils.remove_entry_secure tempdir if File.exist?(tempdir)
     end
 
 
diff --git a/lib/docsplit/info_extractor.rb b/lib/docsplit/info_extractor.rb
index ce03626..2e34c85 100644
--- a/lib/docsplit/info_extractor.rb
+++ b/lib/docsplit/info_extractor.rb
@@ -27,7 +27,7 @@ def extract_all(pdfs, opts)
       raise ExtractionFailed, result if $? != 0
       # ruby  1.8 (iconv) and 1.9 (String#encode) :
       if String.method_defined?(:encode)
-        result.encode!('UTF-8', 'UTF-8', :invalid => :replace)
+        result.encode!('UTF-8', 'binary', :invalid => :replace, :undef => :replace, :replace => "") unless result.valid_encoding?
       else
         require 'iconv' unless defined?(Iconv)
         ic = Iconv.new('UTF-8//IGNORE','UTF-8')
diff --git a/lib/docsplit/page_extractor.rb b/lib/docsplit/page_extractor.rb
index 1b9bf7f..0aef939 100644
--- a/lib/docsplit/page_extractor.rb
+++ b/lib/docsplit/page_extractor.rb
@@ -9,16 +9,16 @@ def extract(pdfs, opts)
       extract_options opts
       [pdfs].flatten.each do |pdf|
         pdf_name = File.basename(pdf, File.extname(pdf))
-        page_path = File.join(@output, "#{pdf_name}_%d.pdf")
-        FileUtils.mkdir_p @output unless File.exists?(@output)
-        
+        page_path = ESCAPE[File.join(@output, "#{pdf_name}")] + "_%d.pdf"
+        FileUtils.mkdir_p @output unless File.exist?(@output)
+
         cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatability
-          "pdftailor unstitch --output #{ESCAPE[page_path]} #{ESCAPE[pdf]} 2>&1"
+          "pdftailor unstitch --output #{page_path} #{ESCAPE[pdf]} 2>&1"
         else
-          "pdftk #{ESCAPE[pdf]} burst output #{ESCAPE[page_path]} 2>&1"
+          "pdftk #{ESCAPE[pdf]} burst output #{page_path} 2>&1"
         end
         result = `#{cmd}`.chomp
-        FileUtils.rm('doc_data.txt') if File.exists?('doc_data.txt')
+        FileUtils.rm('doc_data.txt') if File.exist?('doc_data.txt')
         raise ExtractionFailed, result if $? != 0
         result
       end
@@ -33,4 +33,4 @@ def extract_options(options)
 
   end
 
-end
\ No newline at end of file
+end
diff --git a/lib/docsplit/pdf_extractor.rb b/lib/docsplit/pdf_extractor.rb
index 5f6afc0..a479265 100644
--- a/lib/docsplit/pdf_extractor.rb
+++ b/lib/docsplit/pdf_extractor.rb
@@ -2,9 +2,11 @@
 
 module Docsplit
   class PdfExtractor
-    @@executable = nil
+    @@executable     = nil
+    @@version_string = nil
 
-    HOST_OS = (defined?("Config") ? Config : RbConfig)::CONFIG['host_os']
+    # Helpers to determine the OS. Prefer RbConfig when it is defined; fall back to the legacy Config constant.
+    HOST_OS = (defined?(RbConfig) ? RbConfig : Config)::CONFIG['host_os']
     def windows?
       !!HOST_OS.match(/mswin|windows|cygwin/i)
     end
@@ -14,48 +16,73 @@ def osx?
     def linux?
       !!HOST_OS.match(/linux/i)
     end
-    
+
+    # The first line of the help output holds the name and version number
+    # of the office software to be used for extraction.
     def version_string
-      @@help ||= `#{office_executable} -h 2>&1`.split("\n").first
+      unless @@version_string
+        null = windows? ? "NUL" : "/dev/null"
+        @@version_string = `#{office_executable} -h 2>#{null}`.split("\n").first
+        if !!@@version_string.to_s.match(/[0-9]*/) # NOTE(review): /[0-9]*/ also matches "", so this is always true and the --version output always replaces the -h output -- confirm intent
+          @@version_string = `#{office_executable} --version`.split("\n").first
+        end
+      end
+      @@version_string
     end
-    
     def libre_office?
       !!version_string.match(/^LibreOffice/)
     end
-
     def open_office?
       !!version_string.match(/^OpenOffice.org/)
     end
-    
+
+    # A set of default locations to search for office software
+    # These have been extracted from JODConverter.  Each listed
+    # path should contain a directory "program" which in turn
+    # contains the "soffice" executable.
+    # see: https://github.com/mirkonasato/jodconverter/blob/master/jodconverter-core/src/main/java/org/artofsolving/jodconverter/office/OfficeUtils.java#L63-L91
     def office_search_paths
       if windows?
         office_names       = ["LibreOffice 3", "LibreOffice 4", "OpenOffice.org 3"]
         program_files_path = ENV["CommonProgramFiles"]
-        search_paths       = office_name.map{ |program| File.join(program_files_path, program) }
+        search_paths       = office_names.map{ |program| File.join(program_files_path, program) }
       elsif osx?
         search_paths = %w(
           /Applications/LibreOffice.app/Contents
           /Applications/OpenOffice.org.app/Contents
         )
       else # probably linux/unix
+        # heroku libreoffice buildpack: https://github.com/rishihahs/heroku-buildpack-libreoffice
         search_paths = %w(
           /usr/lib/libreoffice
+          /usr/lib64/libreoffice
           /opt/libreoffice
           /usr/lib/openoffice
+          /usr/lib64/openoffice
           /opt/openoffice.org3
+          /app/vendor/libreoffice
+          /usr/bin/libreoffice
+          /usr/local/bin
+          /usr/lib64/libreoffice
+          /usr/lib64/openoffice.org3
         )
       end
       search_paths
     end
-    
+
+    # Identify the path to a working office executable.
     def office_executable
       paths = office_search_paths
 
+      # If the OFFICE_PATH environment variable has been set,
+      # raise an error if that path isn't valid, otherwise, add
+      # it to the front of our search paths.
       if ENV['OFFICE_PATH']
-        raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exists? ENV['OFFICE_PATH']
+        raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exist? ENV['OFFICE_PATH']
         paths.unshift(ENV['OFFICE_PATH'])
       end
-      
+
+      # The location of the office executable is OS dependent
       path_pieces = ["soffice"]
       if windows?
         path_pieces += [["program", "soffice.bin"]]
@@ -64,13 +91,15 @@ def office_executable
       else
         path_pieces += [["program", "soffice"]]
       end
-      
+
+      # Search for the first suitable office executable
+      # and short-circuit as soon as an executable is found.
       paths.each do |path|
-        if File.exists? path
+        if File.exist? path
           @@executable ||= path unless File.directory? path
           path_pieces.each do |pieces|
             check_path = File.join(path, pieces)
-            @@executable ||= check_path if File.exists? check_path
+            @@executable ||= check_path if File.exist? check_path
           end
         end
         break if @@executable
@@ -78,14 +107,16 @@ def office_executable
       raise OfficeNotFound, "No office software found" unless @@executable
       @@executable
     end
-    
+
+    # Used to specify the office location for JODConverter
     def office_path
       File.dirname(File.dirname(office_executable))
     end
-    
+
+    # Convert documents to PDF.
     def extract(docs, opts)
       out = opts[:output] || '.'
-      FileUtils.mkdir_p out unless File.exists?(out)
+      FileUtils.mkdir_p out unless File.exist?(out)
       [docs].flatten.each do |doc|
         ext = File.extname(doc)
         basename = File.basename(doc, ext)
@@ -95,12 +126,15 @@ def extract(docs, opts)
           `gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
         else
           if libre_office?
-            options = "--headless --convert-to pdf --outdir #{escaped_out} #{escaped_doc}"
+            # Set the LibreOffice user profile, so that parallel uses of cloudcrowd don't trip over each other.
+            ENV['SYSUSERCONFIG']="file://#{File.expand_path(escaped_out)}"
+
+            options = "--headless --invisible  --norestore --nolockcheck --convert-to pdf --outdir #{escaped_out} #{escaped_doc}"
             cmd = "#{office_executable} #{options} 2>&1"
             result = `#{cmd}`.chomp
             raise ExtractionFailed, result if $? != 0
             true
-          else # open office presumably
+          else # open office presumably, rely on JODConverter to figure it out.
             options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js"
             run_jod "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
           end
@@ -113,9 +147,9 @@ def extract(docs, opts)
     LOGGING       = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties"
 
     HEADLESS      = "-Djava.awt.headless=true"
-    
+
     private
-    
+
     # Runs a Java command, with quieted logging, and the classpath set properly.
     def run_jod(command, pdfs, opts, return_output=false)
 
diff --git a/lib/docsplit/text_cleaner.rb b/lib/docsplit/text_cleaner.rb
index 123f74a..c4aac01 100644
--- a/lib/docsplit/text_cleaner.rb
+++ b/lib/docsplit/text_cleaner.rb
@@ -35,8 +35,13 @@ class TextCleaner
     # For the time being, `clean` uses the regular StringScanner, and not the
     # multibyte-aware version, coercing to ASCII first.
     def clean(text)
-      require 'iconv' unless defined?(Iconv)
-      text    = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
+      if String.method_defined?(:encode)
+        text.encode!('ascii', :invalid => :replace, :undef => :replace, :replace => '?')
+      else
+        require 'iconv' unless defined?(Iconv)
+        text = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
+      end
+
       scanner = StringScanner.new(text)
       cleaned = []
       spaced  = false
diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb
index 0d55f32..f3390e8 100644
--- a/lib/docsplit/text_extractor.rb
+++ b/lib/docsplit/text_extractor.rb
@@ -28,7 +28,7 @@ def initialize
     # Extract text from a list of PDFs.
     def extract(pdfs, opts)
       extract_options opts
-      FileUtils.mkdir_p @output unless File.exists?(@output)
+      FileUtils.mkdir_p @output unless File.exist?(@output)
       [pdfs].flatten.each do |pdf|
         @pdf_name = File.basename(pdf, File.extname(pdf))
         pages = (@pages == 'all') ? 1..Docsplit.extract_length(pdf) : @pages
@@ -60,13 +60,14 @@ def extract_from_ocr(pdf, pages)
       tempdir = Dir.mktmpdir
       base_path = File.join(@output, @pdf_name)
       escaped_pdf = ESCAPE[pdf]
+      psm = @detect_orientation ? "-psm 1" : ""
       if pages
         pages.each do |page|
           tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
           escaped_tiff = ESCAPE[tiff]
           file = "#{base_path}_#{page}"
           run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
-          run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1"
+          run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1"
           clean_text(file + '.txt') if @clean_ocr
           FileUtils.remove_entry_secure tiff
         end
@@ -74,11 +75,12 @@ def extract_from_ocr(pdf, pages)
         tiff = "#{tempdir}/#{@pdf_name}.tif"
         escaped_tiff = ESCAPE[tiff]
         run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
-        run "tesseract #{escaped_tiff} #{base_path} -l #{@language} 2>&1"
+        # psm is empty (no -psm flag is passed) when the user disables orientation detection or the osd plugin is not installed
+        run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{psm} 2>&1"
         clean_text(base_path + '.txt') if @clean_ocr
       end
     ensure
-      FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
+      FileUtils.remove_entry_secure tempdir if File.exist?(tempdir)
     end
 
 
@@ -100,31 +102,42 @@ def run(command)
       result
     end
 
+    # Run pdftotext command
+    def run_pdftotext(pdf, text_path, options=[])
+      options << '-enc UTF-8'
+      options << '-layout' if @keep_layout
+
+      run "pdftotext #{options.join(' ')} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
+    end
+
     # Extract the full contents of a pdf as a single file, directly.
     def extract_full(pdf)
       text_path = File.join(@output, "#{@pdf_name}.txt")
-      run "pdftotext -enc UTF-8 #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
+      run_pdftotext pdf, text_path
     end
 
     # Extract the contents of a single page of text, directly, adding it to
     # the `@pages_to_ocr` list if the text length is inadequate.
     def extract_page(pdf, page)
       text_path = File.join(@output, "#{@pdf_name}_#{page}.txt")
-      run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
+      run_pdftotext pdf, text_path, ["-f #{page}", "-l #{page}"]
+
       unless @forbid_ocr
         @pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE
       end
     end
 
     def extract_options(options)
-      @output     = options[:output] || '.'
-      @pages      = options[:pages]
-      @force_ocr  = options[:ocr] == true
-      @forbid_ocr = options[:ocr] == false
-      @clean_ocr  = !(options[:clean] == false)
-      @language   = options[:language] || 'eng'
+      @output             = options[:output] || '.'
+      @pages              = options[:pages]
+      @force_ocr          = options[:ocr] == true
+      @forbid_ocr         = options[:ocr] == false
+      @language           = options[:language] || 'eng'
+      @clean_ocr          = (!(options[:clean] == false) and @language == 'eng')
+      @detect_orientation = ((options[:detect_orientation] != false) and DEPENDENCIES[:osd])
+      @keep_layout        = options.fetch(:layout, false)
     end
 
   end
 
-end
\ No newline at end of file
+end
diff --git a/lib/docsplit/transparent_pdfs.rb b/lib/docsplit/transparent_pdfs.rb
index f65072d..8987b3b 100755
--- a/lib/docsplit/transparent_pdfs.rb
+++ b/lib/docsplit/transparent_pdfs.rb
@@ -8,19 +8,22 @@ module TransparentPDFs
     # through further extraction.
     def ensure_pdfs(docs)
       [docs].flatten.map do |doc|
-        ext = File.extname(doc)
-        if ext.downcase == '.pdf'
+        if is_pdf?(doc)
           doc
         else
-          tempdir = Dir.mktmpdir
+          tempdir = File.join(Dir.tmpdir, 'docsplit')
           extract_pdf([doc], {:output => tempdir})
-          File.join(tempdir, File.basename(doc, ext) + '.pdf')
+          File.join(tempdir, File.basename(doc, File.extname(doc)) + '.pdf')
         end
       end
     end
 
+    def is_pdf?(doc) # a .pdf extension, or a leading %PDF-<version> magic number (an empty file is not a PDF)
+      File.extname(doc).downcase == '.pdf' || File.open(doc, 'rb') { |f| f.read(16) }.to_s =~ /\A\%PDF-\d+(\.\d+)?/
+    end
+
   end
 
   extend TransparentPDFs
 
-end
\ No newline at end of file
+end
diff --git a/noto_bolt.svg b/noto_bolt.svg
new file mode 100644
index 0000000..226fcdb
--- /dev/null
+++ b/noto_bolt.svg
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!-- Generator: Adobe Illustrator 16.0.0, SVG Export Plug-In . SVG Version: 6.00 Build 0)  -->
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- This file is Apache2 licensed see https://code.google.com/p/noto/ -->
+<svg version="1.1" id="レイヤー_1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px"
+	 y="0px" width="128px" height="128px" viewBox="0 0 128 128" enable-background="new 0 0 128 128" xml:space="preserve">
+<path fill="#FCC21B" d="M115.36,61.84L70.22,50.49L114.45,2.4c0.41-0.45,0.43-1.13,0.05-1.6c-0.39-0.48-1.07-0.59-1.59-0.27
+	L12.3,61.98c-0.41,0.25-0.64,0.72-0.57,1.2c0.06,0.48,0.4,0.87,0.87,1.01l45.07,13.25L13.38,125.6c-0.42,0.46-0.44,1.15-0.04,1.61
+	c0.24,0.29,0.58,0.44,0.94,0.44c0.22,0,0.45-0.06,0.65-0.19l100.78-63.41c0.42-0.26,0.64-0.75,0.56-1.22
+	C116.19,62.34,115.84,61.95,115.36,61.84z"/>
+</svg>
diff --git a/test/fixtures/corrosion.reoriented.pdf b/test/fixtures/corrosion.reoriented.pdf
new file mode 100644
index 0000000..1aecd28
Binary files /dev/null and b/test/fixtures/corrosion.reoriented.pdf differ
diff --git a/test/fixtures/with_pdf_extension/actually_a_doc.pdf b/test/fixtures/with_pdf_extension/actually_a_doc.pdf
new file mode 100755
index 0000000..e7f7abd
Binary files /dev/null and b/test/fixtures/with_pdf_extension/actually_a_doc.pdf differ
diff --git a/test/fixtures/with_pdf_extension/actually_an_image.pdf b/test/fixtures/with_pdf_extension/actually_an_image.pdf
new file mode 100644
index 0000000..3f0ce52
Binary files /dev/null and b/test/fixtures/with_pdf_extension/actually_an_image.pdf differ
diff --git a/test/fixtures/with_pdf_extension/actually_an_rtf.pdf b/test/fixtures/with_pdf_extension/actually_an_rtf.pdf
new file mode 100755
index 0000000..2790993
Binary files /dev/null and b/test/fixtures/with_pdf_extension/actually_an_rtf.pdf differ
diff --git a/test/fixtures/with_pdf_extension/this_ones_a_real_pdf.pdf b/test/fixtures/with_pdf_extension/this_ones_a_real_pdf.pdf
new file mode 100644
index 0000000..02e7f1b
Binary files /dev/null and b/test/fixtures/with_pdf_extension/this_ones_a_real_pdf.pdf differ
diff --git a/test/fixtures/without_pdf_extension/indesign/test_pdf_1_3 b/test/fixtures/without_pdf_extension/indesign/test_pdf_1_3
new file mode 100644
index 0000000..3058d4c
Binary files /dev/null and b/test/fixtures/without_pdf_extension/indesign/test_pdf_1_3 differ
diff --git a/test/fixtures/without_pdf_extension/indesign/test_pdf_1_4 b/test/fixtures/without_pdf_extension/indesign/test_pdf_1_4
new file mode 100644
index 0000000..2031757
Binary files /dev/null and b/test/fixtures/without_pdf_extension/indesign/test_pdf_1_4 differ
diff --git a/test/fixtures/without_pdf_extension/indesign/test_pdf_1_5 b/test/fixtures/without_pdf_extension/indesign/test_pdf_1_5
new file mode 100644
index 0000000..9bbb92c
Binary files /dev/null and b/test/fixtures/without_pdf_extension/indesign/test_pdf_1_5 differ
diff --git a/test/fixtures/without_pdf_extension/indesign/test_pdf_1_6 b/test/fixtures/without_pdf_extension/indesign/test_pdf_1_6
new file mode 100644
index 0000000..9979ab2
Binary files /dev/null and b/test/fixtures/without_pdf_extension/indesign/test_pdf_1_6 differ
diff --git a/test/fixtures/without_pdf_extension/indesign/test_pdf_1_7 b/test/fixtures/without_pdf_extension/indesign/test_pdf_1_7
new file mode 100644
index 0000000..b159e73
Binary files /dev/null and b/test/fixtures/without_pdf_extension/indesign/test_pdf_1_7 differ
diff --git a/test/test_helper.rb b/test/test_helper.rb
index 598e1bb..2357c5a 100755
--- a/test/test_helper.rb
+++ b/test/test_helper.rb
@@ -1,15 +1,16 @@
 here = File.dirname(__FILE__)
 require File.join(here, '..', 'lib', 'docsplit')
 require 'fileutils'
-require 'test/unit'
+require 'minitest'
+require "minitest/autorun"
 
-class Test::Unit::TestCase
+class Minitest::Test
   include Docsplit
 
   OUTPUT = 'test/output'
 
   def clear_output
-    FileUtils.rm_r(OUTPUT) if File.exists?(OUTPUT)
+    FileUtils.rm_r(OUTPUT) if File.exist?(OUTPUT)
   end
 
   def teardown
diff --git a/test/unit/test_convert_to_pdf.rb b/test/unit/test_convert_to_pdf.rb
index a8c1d0b..7a2b1ae 100755
--- a/test/unit/test_convert_to_pdf.rb
+++ b/test/unit/test_convert_to_pdf.rb
@@ -1,7 +1,7 @@
 here = File.expand_path(File.dirname(__FILE__))
 require File.join(here, '..', 'test_helper')
 
-class ConvertToPdfTest < Test::Unit::TestCase
+class ConvertToPdfTest < Minitest::Test
 
   def test_doc_conversion
     Docsplit.extract_pdf('test/fixtures/obama_veterans.doc', :output => OUTPUT)
diff --git a/test/unit/test_extract_images.rb b/test/unit/test_extract_images.rb
index 8ccfc58..2b7ca87 100755
--- a/test/unit/test_extract_images.rb
+++ b/test/unit/test_extract_images.rb
@@ -1,7 +1,7 @@
 here = File.expand_path(File.dirname(__FILE__))
 require File.join(here, '..', 'test_helper')
 
-class ExtractImagesTest < Test::Unit::TestCase
+class ExtractImagesTest < Minitest::Test
 
   def test_basic_image_extraction
     Docsplit.extract_images('test/fixtures/obama_arts.pdf', :format => :gif, :size => "250x", :output => OUTPUT)
diff --git a/test/unit/test_extract_info.rb b/test/unit/test_extract_info.rb
index 880563c..08fdd91 100755
--- a/test/unit/test_extract_info.rb
+++ b/test/unit/test_extract_info.rb
@@ -1,7 +1,7 @@
 here = File.expand_path(File.dirname(__FILE__))
 require File.join(here, '..', 'test_helper')
 
-class ExtractInfoTest < Test::Unit::TestCase
+class ExtractInfoTest < Minitest::Test
 
   def test_title
     assert "PDF Pieces" == Docsplit.extract_title('test/fixtures/encrypted.pdf')
@@ -38,9 +38,7 @@ def test_name_escaping_while_extracting_info
   end
   
   def test_malformed_unicode
-    assert_nothing_raised do
-      Docsplit.extract_date('test/fixtures/Faktura 10.pdf')
-    end
+    assert Docsplit.extract_date('test/fixtures/Faktura 10.pdf')
   end
   
   def test_extract_all
diff --git a/test/unit/test_extract_pages.rb b/test/unit/test_extract_pages.rb
index e0b1015..f5c7c54 100755
--- a/test/unit/test_extract_pages.rb
+++ b/test/unit/test_extract_pages.rb
@@ -1,7 +1,7 @@
 here = File.expand_path(File.dirname(__FILE__))
 require File.join(here, '..', 'test_helper')
 
-class ExtractPagesTest < Test::Unit::TestCase
+class ExtractPagesTest < Minitest::Test
 
   def test_multi_page_extraction
     Docsplit.extract_pages('test/fixtures/obama_arts.pdf', :output => OUTPUT)
diff --git a/test/unit/test_extract_text.rb b/test/unit/test_extract_text.rb
index 00d24e3..fa46180 100755
--- a/test/unit/test_extract_text.rb
+++ b/test/unit/test_extract_text.rb
@@ -2,7 +2,7 @@
 require File.join(here, '..', 'test_helper')
 require 'tmpdir'
 
-class ExtractTextTest < Test::Unit::TestCase
+class ExtractTextTest < Minitest::Test
 
   def test_paged_extraction
     Docsplit.extract_text('test/fixtures/obama_arts.pdf', :pages => 'all', :output => OUTPUT)
@@ -39,7 +39,7 @@ def test_ocr_extraction
   end
 
   def test_ocr_extraction_in_mock_language
-    exception = assert_raise(Docsplit::ExtractionFailed) {Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT, :language => "mock")}
+    exception = assert_raises(Docsplit::ExtractionFailed) {Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT, :language => "mock")}
     assert exception.message.match("tessdata/mock"), "Expected problem with loading data for language 'mock'"
   end
 
@@ -53,5 +53,35 @@ def test_name_escaping_while_extracting_text
     Docsplit.extract_text('test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf', :pages => 'all', :output => OUTPUT)
     assert Dir["#{OUTPUT}/*.txt"].length == 2
   end
+
+  # OCR of the re-oriented fixture should yield mostly letters when Tesseract's
+  # orientation detection module (osd) is installed; skipped otherwise.
+  def test_orientation_detected_ocr_extraction
+    skip "Orientation detection module (osd) for Tesseract isn't installed" unless Docsplit::DEPENDENCIES[:osd]
+
+    pages = 1..4
+    # extract_options reads options[:ocr] to force OCR; it reads no :force_ocr key.
+    Docsplit.extract_text('test/fixtures/corrosion.reoriented.pdf', :output => OUTPUT, :pages => pages, :ocr => true)
+    letters = 0
+    nonletters = 0
+
+    pages.each do |number|
+      # File.read closes the file; a bare File.open left the handle open.
+      File.read(File.join(OUTPUT, "corrosion.reoriented_#{number}.txt")).each_char do |c|
+        case c
+        when /[A-Za-z]/
+          letters += 1
+        when /\s/
+          # whitespace counts toward neither tally
+        else
+          nonletters += 1
+        end
+      end
+    end
+
+    # corrosion.pdf has 6160 letters & 362 nonletters, or ~17:1,
+    # so let's allow a fudge factor of ~half of that, or 8:1.
+    assert letters / 8 > nonletters, "Expected that text extracted with orientation detection would have more letters."
+  end
 
 end
diff --git a/test/unit/test_transparent_pdfs.rb b/test/unit/test_transparent_pdfs.rb
new file mode 100644
index 0000000..32d1ae6
--- /dev/null
+++ b/test/unit/test_transparent_pdfs.rb
@@ -0,0 +1,32 @@
+here = File.expand_path(File.dirname(__FILE__))
+require File.join(here, '..', 'test_helper')
+require 'tmpdir'
+
+# Exercises Docsplit::TransparentPDFs#is_pdf? against fixtures that do and
+# do not carry a '.pdf' file extension.
+class TransparentPDFsTest < Minitest::Test
+
+  # Mix the module into an anonymous class so is_pdf? can be called on an instance.
+  def setup
+    @klass = Class.new
+    @klass.send(:include, Docsplit::TransparentPDFs)
+    @detector = @klass.new
+  end
+
+  def test_files_with_pdf_extension_are_always_considered_a_pdf
+    pdfs = Dir.glob('test/fixtures/with_pdf_extension/*.pdf').select { |path| File.file?(path) }
+    assert pdfs.any?, 'ensure pdfs with extensions are available to test with'
+    pdfs.each do |pdf|
+      assert @detector.is_pdf?(pdf), "#{pdf} with '.pdf' extension is identified as a PDF (regardless of its file contents)"
+    end
+  end
+
+  def test_pdfs_without_the_pdf_file_extension_is_considerd_a_pdf
+    pdfs = Dir.glob('test/fixtures/without_pdf_extension/*/*').select { |path| File.file?(path) }
+    assert pdfs.any?, 'ensure pdfs without extensions are available to test with'
+    pdfs.each do |pdf|
+      assert @detector.is_pdf?(pdf), "#{pdf} without '.pdf' extension is identified as a PDF"
+    end
+  end
+
+end