diff --git a/.carthorse.yml b/.carthorse.yml new file mode 100644 index 0000000..7b6ca85 --- /dev/null +++ b/.carthorse.yml @@ -0,0 +1,9 @@ +carthorse: + version-from: setup.py + tag-format: "{version}" + when: + - version-not-tagged + actions: + - run: "sudo pip install -e .[build]" + - run: "twine upload -u __token__ -p $PYPI_TOKEN dist/*" + - create-tag diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 0000000..73edaec --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,98 @@ +version: 2.1 + +orbs: + python: cjw296/python-ci@2.1 + +jobs: + coverage: + docker: + - image: circleci/python:3.8 + steps: + - checkout + - attach_workspace: + at: coverage_output + - run: + name: "Check coverage" + command: | + sudo pip install coverage + coverage combine coverage_output/ + bash <(curl -s https://codecov.io/bash) + + check-package: + parameters: + image: + type: string + docker: + - image: << parameters.image >> + steps: + - python/check-package: + package: "xlrd" + test: + - run: + name: "Check Import" + command: python -c "import xlrd" + - run: + name: "Check no XLS in wheel" + command: "! unzip -l dist/*.whl | egrep '.xlsx?$'" + - run: + name: "Check no XLS in source dist" + command: "! 
tar tzf dist/*.tar.gz | egrep '.xlsx?$'" + +common: &common + jobs: + + - python/pip-run-tests: + matrix: + parameters: + image: + - circleci/python:2.7 + - circleci/python:3.6 + - circleci/python:3.9 + + - coverage: + name: coverage + requires: + - python/pip-run-tests + + - python/pip-docs: + name: docs + requires: + - coverage + + - python/pip-setuptools-build-package: + name: package + requires: + - docs + filters: + branches: + only: master + + - check-package: + matrix: + parameters: + image: + - circleci/python:2.7 + - circleci/python:3.9 + requires: + - package + + - python/release: + name: release + config: .carthorse.yml + requires: + - check-package + filters: + branches: + only: master + +workflows: + push: + <<: *common + periodic: + <<: *common + triggers: + - schedule: + cron: "0 0 11 * *" + filters: + branches: + only: master diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..3fb98a1 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,10 @@ +[run] +source = xlrd,scripts,tests + +[report] +exclude_lines = + # the original exclude + pragma: no cover + + # debug stuff + if DEBUG: diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 0000000..4bc268f --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,10 @@ +version: 2 +python: + version: 3.8 + install: + - method: pip + path: . 
+ extra_requirements: + - docs +sphinx: + fail_on_warning: true diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index b84e62e..0000000 --- a/.travis.yml +++ /dev/null @@ -1,41 +0,0 @@ -dist: xenial -sudo: false -cache: pip - -language: python - -matrix: - include: - - python: "3.7" - env: TOXENV=py37 - - python: "3.6" - env: TOXENV=py36 - - python: "3.5" - env: TOXENV=py35 - - python: "3.4" - env: TOXENV=py34 - - python: "2.7" - env: TOXENV=py27 - - env: TOXENV=lint - -install: - - pip install tox - -script: - - tox - -after_success: - - if [ $TOXENV != lint ]; then pip install coveralls; fi - - if [ $TOXENV != lint ]; then coveralls; fi - -deploy: - provider: pypi - user: chrisw - password: - secure: BManEisxNCN966HPz8J/oYzFBPuTPFP212d8rUjv9p/W96pT0zWlO9paDce67nZiYWb7Khzg7fF5WSFpYRKsvB4a370KpTYbsAb8EPx7DUBPsz7qFC8B6EUbCXfwbe68XwPugenREby8vEtywUqxzge2SNVRPfH+wKio2hUcfEo= - on: - tags: true - repo: python-excel/xlrd - python: "3.5" - skip_cleanup: true - distributions: "sdist bdist_wheel" diff --git a/CHANGELOG.rst b/CHANGELOG.rst new file mode 100644 index 0000000..e545334 --- /dev/null +++ b/CHANGELOG.rst @@ -0,0 +1,583 @@ +Changes +======= + +2.0.1 (11 December 2020) +------------------------ + +- Use the README as the long description on PyPI. + +2.0.0 (11 December 2020) +------------------------ + +- Remove support for anything other than ``.xls`` files. +- Remove support for ``psyco``. +- Change the default encoding used when no ``CODEPAGE`` record can be found + from ``ascii`` to ``iso-8859-1``. +- Add support for iterating over :class:`~xlrd.book.Book` objects. +- Add support for item access from :class:`~xlrd.book.Book` objects, + where integer indices and string sheet names are supported. +- Non-unicode spaces are now stripped from the "last author" information. +- Workbook corruption errors can now be ignored using the + ``ignore_workbook_corruption`` option to :class:`~xlrd.open_workbook`. 
+- Handle ``WRITEACCESS`` records with invalid trailing characters. +- Officially support Python 3.8 and 3.9. + +Thanks to the following for their contributions to this release: + +- Jon Dufresne +- Tore Lundqvist +- nayyarv +- Michael Davis +- skonik + +1.2.0 (15 December 2018) +------------------------ + +- Added support for Python 3.7. +- Added optional support for defusedxml to help mitigate exploits. +- Automatically convert ``~`` in file paths to the current user's home + directory. +- Removed ``examples`` directory from the installed package. They are still + available in the source distribution. +- Fixed ``time.clock()`` deprecation warning. + +1.1.0 (22 August 2017) +---------------------- + +- Fix for parsing of merged cells containing a single cell reference in xlsx + files. + +- Fix for "invalid literal for int() with base 10: 'true'" when reading some + xlsx files. + +- Make xldate_as_datetime available to import direct from xlrd. + +- Build universal wheels. + +- Sphinx documentation. + +- Document the problem with XML vulnerabilities in xlsx files and mitigation + measures. + +- Fix :class:`NameError` on ``has_defaults is not defined``. + +- Some whitespace and code style tweaks. + +- Make example in README compatible with both Python 2 and 3. + +- Add default value for cells containing errors that caused parsing of some + xlsx files to fail. + +- Add Python 3.6 to the list of supported Python versions, drop 3.3 and 2.6. + +- Use generator expressions to avoid unnecessary lists in memory. + +- Document unicode encoding used in Excel files from Excel 97 onwards. + +- Report hyperlink errors in R1C1 syntax. 
+ +Thanks to the following for their contributions to this release: + +- icereval@gmail.com +- Daniel Rech +- Ville Skyttä +- Yegor Yefremov +- Maxime Lorant +- Alexandr N Zamaraev +- Zhaorong Ma +- Jon Dufresne +- Chris McIntyre +- coltleese@gmail.com +- Ivan Masá + +1.0.0 (2 June 2016) +------------------- + +- Official support, such as it is, is now for 2.6, 2.7, 3.3+ + +- Fixes a bug in looking up non-lowercase sheet filenames by ensuring that the + sheet targets are transformed the same way as the component_names dict keys. + +- Fixes a bug for ``ragged_rows=False`` when merged cells increases the number + of columns in the sheet. This requires all rows to be extended to ensure equal + row lengths that match the number of columns in the sheet. + +- Fixes to enable reading of SAP-generated .xls files. + +- support BIFF4 files with missing FORMAT records. + +- support files with missing WINDOW2 record. + +- Empty cells are now always unicode strings, they were a bytestring on + Python 2 and a unicode string on Python 3. + +- Fix for ```` ``inlineStr`` attribute without ```` child. + +- Fix for a zoom of ``None`` causing problems on Python 3. + +- Fix parsing of bad dimensions. + +- Fix xlsx sheet to comments relationship. + +Thanks to the following for their contributions to this release: + +- Lars-Erik Hannelius +- Deshi Xiao +- Stratos Moro +- Volker Diels-Grabsch +- John McNamara +- Ville Skyttä +- Patrick Fuller +- Dragon Dave McKee +- Gunnlaugur Þór Briem + +0.9.4 (14 July 2015) +-------------------- + +- Automated tests are now run on Python 3.4 + +- Use ``ElementTree.iter()`` if available, instead of the deprecated + ``getiterator()`` when parsing xlsx files. + +- Fix #106 : Exception Value: unorderable types: Name() < Name() + +- Create row generator expression with Sheet.get_rows() + +- Fix for forward slash file separator and lowercase names within xlsx + internals. 
+ +Thanks to the following for their contributions to this release: + +- Corey Farwell +- Jonathan Kamens +- Deepak N +- Brandon R. Stoner +- John McNamara + +0.9.3 (8 Apr 2014) +------------------ + +- Github issue #49 + +- Github issue #64 - skip meaningless chunk of 4 zero bytes between two + otherwise-valid BIFF records + +- Github issue #61 - fix updating of escapement attribute of Font objects read + from workbooks. + +- Implemented ``Sheet.visibility`` for xlsx files + +- Ignore anchors (``$``) in cell references + +- Dropped support for Python 2.5 and earlier, Python 2.6 is now the earliest + Python release supported + +- Read xlsx merged cell elements. + +- Read cell comments in .xlsx files. + +- Added xldate_as_datetime() function to convert from Excel + serial date/time to datetime.datetime object. + +Thanks to the following for their contributions to this release: + +- John Machin +- Caleb Epstein +- Martin Panter +- John McNamara +- Gunnlaugur Þór Briem +- Stephen Lewis + + +0.9.2 (9 Apr 2013) +------------------ + +- Fix some packaging issues that meant docs and examples were missing from the tarball. + +- Fixed a small but serious regression that caused problems opening .xlsx files. + +0.9.1 (5 Apr 2013) +------------------ + +- Many bug fixes in Python 3 support. +- Fix bug where ragged rows needed fixing when formatting info was being parsed. +- Improved handling of aberrant Excel 4.0 Worksheet files. +- Various bug fixes. +- Simplify a lot of the distribution packaging. +- Remove unused and duplicate imports. + +Thanks to the following for their contributions to this release: + +- Thomas Kluyver + +0.9.0 (31 Jan 2013) +------------------- + +- Support for Python 3.2+ +- Many new unit tests added. +- Continuous integration tests are now run. +- Various bug fixes. + +Special thanks to Thomas Kluyver and Martin Panter for their work on +Python 3 compatibility. + +Thanks to Manfred Moitzi for re-licensing his unit tests so we could include +them. 
+ +Thanks to the following for their contributions to this release: + +- "holm" +- Victor Safronovich +- Ross Jones + +0.8.0 (22 Aug 2012) +------------------- + +- More work-arounds for broken source files. +- Support for reading .xlsx files. +- Drop support for Python 2.5 and older. + +0.7.8 (7 June 2012) +------------------- + +- Ignore superfluous zero bytes at end of xls OBJECT record. +- Fix assertion error when reading file with xlwt-written bitmap. + +0.7.7 (13 Apr 2012) +------------------- + +- More packaging changes, this time to support 2to3. + +0.7.6 (3 Apr 2012) +------------------ + +- Fix more packaging issues. + +0.7.5 (3 Apr 2012) +------------------ +- Fix packaging issue that missed ``version.txt`` from the distributions. + +0.7.4 (2 Apr 2012) +------------------ + +- More tolerance of out-of-spec files. +- Fix bugs reading long text formula results. + +0.7.3 (28 Feb 2012) +------------------- + +- Packaging and documentation updates. + +0.7.2 (21 Feb 2012) +------------------- + +- Tolerant handling of files with extra zero bytes at end of NUMBER record. + Sample provided by Jan Kraus. +- Added access to cell notes/comments. Many cross-references added to Sheet + class docs. +- Added code to extract hyperlink (HLINK) records. Based on a patch supplied by + John Morrisey. +- Extraction of rich text formatting info based on code supplied by + Nathan van Gheem. +- added handling of BIFF2 WINDOW2 record. +- Included modified version of page breaks patch from Sam Listopad. +- Added reading of the PANE record. +- Reading SCL record. New attribute ``Sheet.scl_mag_factor``. +- Lots of bug fixes. +- Added ``ragged_rows`` functionality. + +0.7.1 (31 May 2009) +------------------- + +- Backed out "slash'n'burn" of sheet resources in unload_sheet(). + Fixed problem with STYLE records on some Mac Excel files. 
+- quieten warnings +- Integrated on_demand patch by Armando Serrano Lombillo + +0.7.0 (11 March 2009) +--------------------- + ++ colname utility function now supports more than 256 columns. ++ Fix bug where BIFF record type 0x806 was being regarded as a formula + opcode. ++ Ignore PALETTE record when formatting_info is false. ++ Tolerate up to 4 bytes trailing junk on PALETTE record. ++ Fixed bug in unused utility function xldate_from_date_tuple which + affected some years after 2099. ++ Added code for inspecting as-yet-unused record types: FILEPASS, TXO, + NOTE. ++ Added inspection code for add_in function calls. ++ Added support for unnumbered biff_dump (better for doing diffs). ++ ignore distutils cruft ++ Avoid assertion error in compdoc when -1 used instead of -2 for + first_SID of empty SCSS ++ Make version numbers match up. ++ Enhanced recovery from out-of-order/missing/wrong CODEPAGE record. ++ Added Name.area2d convenience method. ++ Avoided some checking of XF info when formatting_info is false. ++ Minor changes in preparation for XLSX support. ++ remove duplicate files that were out of date. ++ Basic support for Excel 2.0 ++ Decouple Book init & load. ++ runxlrd: minor fix for xfc. ++ More Excel 2.x work. ++ is_date_format() tweak. ++ Better detection of IronPython. ++ Better error message (including first 8 bytes of file) when file is + not in a supported format. ++ More BIFF2 formatting: ROW, COLWIDTH, and COLUMNDEFAULT records; ++ finished stage 1 of XF records. ++ More work on supporting BIFF2 (Excel 2.x) files. ++ Added support for Excel 2.x (BIFF2) files. Data only, no formatting + info. Alpha. ++ Wasn't coping with EXTERNSHEET record followed by CONTINUE + record(s). ++ Allow for BIFF2/3-style FORMAT record in BIFF4/8 file ++ Avoid crash when zero-length Unicode string missing options byte. ++ Warning message if sector sizes are extremely large. 
++ Work around corrupt STYLE record ++ Added missing entry for blank cell type to ctype_text ++ Added "fonts" command to runxlrd script ++ Warning: style XF whose parent XF index != 0xFFF ++ Logfile arg wasn't being passed from open_workbook to + compdoc.CompDoc. + + +0.6.1 (10 June 2007) +--------------------- + ++ Version number updated to 0.6.1 ++ Documented runxlrd.py commands in its usage message. Changed + commands: dump to biff_dump, count_records to biff_count. + + +0.6.1a5 +------- + ++ Bug fixed: Missing "<" in a struct.unpack call means can't open + files on bigendian platforms. Discovered by "Mihalis". ++ Removed antique undocumented Book.get_name_dict method and + experimental "trimming" facility. ++ Meaningful exception instead of IndexError if a SAT (sector + allocation table) is corrupted. ++ If no CODEPAGE record in pre-8.0 file, assume ascii and keep going + (instead of raising exception). + + +0.6.1a4 +------- + ++ At least one source of XLS files writes parent style XF records + *after* the child cell XF records that refer to them, triggering + IndexError in 0.5.2 and AssertionError in later versions. Reported + with sample file by Todd O'Bryan. Fixed by changing to two-pass + processing of XF records. ++ Formatting info in pre-BIFF8 files: Ensured appropriate defaults and + lossless conversions to make the info BIFF8-compatible. Fixed bug in + extracting the "used" flags. ++ Fixed problems discovered with opening test files from Planmaker + 2006 (http://www.softmaker.com/english/ofwcomp_en.htm): (1) Four files + have reduced size of PALETTE record (51 and 32 colours; Excel writes + 56 always). xlrd now emits a NOTE to the logfile and continues. (2) + FORMULA records use the Excel 2.x record code 0x0021 instead of + 0x0221. xlrd now continues silently. 
(3) In two files, at the OLE2 + compound document level, the internal directory says that the length + of the Short-Stream Container Stream is 16384 bytes, but the actual + contents are 11264 and 9728 bytes respectively. xlrd now emits a + WARNING to the logfile and continues. ++ After discussion with Daniel Rentz, the concept of two lists of XF + (eXtended Format) objects (raw_xf_list and computed_xf_list) has been + abandoned. There is now a single list, called xf_list + + +0.6.1a3 +------- + ++ Added Book.sheets ... for sheetx, sheet in enumerate(book.sheets): ++ Formatting info: extraction of sheet-level flags from WINDOW2 + record, and sheet.visibility from BOUNDSHEET record. Added Macintosh- + only Font attributes "outline" and "shadow". + + +0.6.1a2 +------- + ++ Added extraction of merged cells info. ++ pyExcelerator uses "general" instead of "General" for the generic + "number format". Worked around. ++ Crystal Reports writes "WORKBOOK" in the OLE2 Compound Document + directory instead of "Workbook". Changed to case-insensitive directory + search. Reported by Vic Simkus. + + +0.6.1a1 (18 Dec 2006) +--------------------- + ++ Added formatting information for cells (font, "number format", + background, border, alignment and protection) and rows/columns + (height/width etc). To save memory and time for those who don't need + it, this information is extracted only if formatting_info=1 is + supplied to the open_workbook() function. The cell records BLANK and + MULBLANKS which contain no data, only formatting information, will + continue to be ignored in the default (no formatting info) case. ++ Ralph Heimburger reported a problem with xlrd being intolerant about + an Excel 4.0 file (created by "some web app") with a DIMENSIONS record + that omitted Microsoft's usual padding with 2 unused bytes. Fixed. + + +0.6.0a4 (not released) +---------------------- + ++ Added extraction of human-readable formulas from NAME records. 
++ Worked around OOo Calc writing 9-byte BOOLERR records instead of 8. + Reported by Rory Campbell-Lange. ++ This history file converted to descending chronological order and + HTML format. + + +0.6.0a3 (19 Sept 2006) +---------------------- + ++ Names: minor bugfixes; added script xlrdnameAPIdemo.py ++ ROW records were being used as additional hints for sizing memory + requirements. In some files the ROW records overstate the number of + used columns, and/or there are ROW records for rows that have no data + in them. This would cause xlrd to report sheet.ncols and/or + sheet.nrows as larger than reasonably expected. Change: ROW records + are ignored. The number of columns/rows is based solely on the highest + column/row index seen in non-empty data records. Empty data records + (types BLANK and MULBLANKS) which contain no data, only formatting + information, have always been ignored, and this will continue. + Consequence: trailing rows and columns which contain only empty cells + will vanish. + + +0.6.0a2 (13 Sept 2006) +---------------------- + + ++ Fixed a bug reported by Rory Campbell-Lange.: "open failed"; + incorrect assumptions about the layout of array formulas which return + strings. ++ Further work on defined names, especially the API. + + +0.6.0a1 (8 Sept 2006) +--------------------- + ++ Sheet objects have two new convenience methods: col_values(colx, + start_rowx=0, end_rowx=None) and the corresponding col_types. + Suggested by Dennis O'Brien. ++ BIFF 8 file missing its CODEPAGE record: xlrd will now assume + utf_16_le encoding (the only possibility) and keep going. ++ Older files missing a CODEPAGE record: an exception will be raised. + Thanks to Sergey Krushinsky for a sample file. The open_workbook() + function has a new argument (encoding_override) which can be used if + the CODEPAGE record is missing or incorrect (for example, + codepage=1251 but the data is actually encoded in koi8_r). 
The + runxlrd.py script takes a corresponding -e argument, for example -e + cp1251 ++ Further work done on parsing "number formats". Thanks to Chris + Withers for the ``"General_)"`` example. ++ Excel 97 introduced the concept of row and column labels, defined by + Insert > Name > Labels. The ranges containing the labels are now + exposed as the Sheet attributes row_label_ranges and col_label_ranges. ++ The major effort in this 0.6.0 release has been the provision of + access to named cell ranges and named constants (Excel: + Insert/Name/Define). Juan C. Mendez provided very useful real-world + sample files. + + +0.5.3a1 (24 May 2006) +--------------------- + ++ John Popplewell and Richard Sharp provided sample files which caused + any reliance at all on DIMENSIONS records and ROW records to be + abandoned. ++ If the file size is not a whole number of OLE sectors, a warning + message is logged. Previously this caused an exception to be raised. + + +0.5.2 (14 March 2006) +--------------------- + ++ public release ++ Updated version numbers, README, HISTORY. + + +0.5.2a3 (13 March 2006) +----------------------- + ++ Gnumeric writes user-defined formats with format codes starting at + 50 instead of 164; worked around. ++ Thanks to Didrik Pinte for reporting the need for xlrd to be more + tolerant of the idiosyncrasies of other software, for supplying sample + files, and for performing alpha testing. ++ '_' character in a format should be treated like an escape + character; fixed. ++ An "empty" formula result means a zero-length string, not an empty + cell! Fixed. + + +0.5.2a2 (9 March 2006) +---------------------- + ++ Found that Gnumeric writes all DIMENSIONS records with nrows and + ncols each 1 less than they should be (except when it clamps ncols at + 256!), and pyXLwriter doesn't write ROW records. Cell memory pre- + allocation was generalised to use ROW records if available with fall- + back to DIMENSIONS records. 
+ + +0.5.2a1 (6 March 2006) +---------------------- + + ++ pyXLwriter writes DIMENSIONS record with antique opcode 0x0000 + instead of 0x0200; worked around ++ A file written by Gnumeric had zeroes in DIMENSIONS record but data + in cell A1; worked around + + +0.5.1 (18 Feb 2006) +-------------------- + ++ released to Journyx ++ Python 2.1 mmap requires file to be opened for update access. Added + fall-back to read-only access without mmap if 2.1 open fails because + "permission denied". + + +0.5 (7 Feb 2006) +---------------- + ++ released to Journyx ++ Now works with Python 2.1. Backporting to Python 2.1 was partially + funded by Journyx - provider of timesheet and project accounting + solutions (http://journyx.com/) ++ open_workbook() can be given the contents of a file instead of its + name. Thanks to Remco Boerma for the suggestion. ++ New module attribute __VERSION__ (as a string; for example "0.5") ++ Minor enhancements to classification of formats as date or not-date. ++ Added warnings about files with inconsistent OLE compound document + structures. Thanks to Roman V. Kiseliov (author of pyExcelerator) for + the tip-off. + + +0.4a1, (7 Sept 2005) +-------------------- + ++ released to Laurent T. ++ Book and sheet objects can now be pickled and unpickled. Instead of + reading a large spreadsheet multiple times, consider pickling it once + and loading the saved pickle; can be much faster. Thanks to Laurent + Thioudellet for the enhancement request. ++ Using the mmap module can be turned off. But you would only do that + for benchmarking purposes. 
++ Handling NUMBER records has been made faster + + +0.3a1 (15 May 2005) +------------------- + +- first public release diff --git a/MANIFEST.in b/MANIFEST.in index 563bde3..86dc99a 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,3 @@ +include CHANGELOG.rst include LICENSE include README.md -graft docs -graft examples -graft tests diff --git a/README.md b/README.md deleted file mode 100644 index 07ed665..0000000 --- a/README.md +++ /dev/null @@ -1,74 +0,0 @@ -[![Build Status](https://travis-ci.org/python-excel/xlrd.svg?branch=master)](https://travis-ci.org/python-excel/xlrd) -[![Coverage Status](https://coveralls.io/repos/github/python-excel/xlrd/badge.svg?branch=master)](https://coveralls.io/github/python-excel/xlrd?branch=master) -[![Documentation Status](https://readthedocs.org/projects/xlrd/badge/?version=latest)](http://xlrd.readthedocs.io/en/latest/?badge=latest) -[![PyPI version](https://badge.fury.io/py/xlrd.svg)](https://badge.fury.io/py/xlrd) - -### xlrd - -Please read this before using this library: https://groups.google.com/d/msg/python-excel/P6TjJgFVjMI/g8d0eWxTBQAJ - -**Purpose**: Provide a library for developers to use to extract data from Microsoft Excel (tm) spreadsheet files. It is not an end-user tool. - -**Author**: John Machin, Lingfo Pty Ltd (sjmachin@lexicon.net) - -**Licence**: BSD-style (see licences.py) - -**Versions of Python supported**: 2.7, 3.4+. - -**External modules required**: - -The package itself is pure Python with no dependencies on modules or packages outside the standard Python distribution. - -**Outside the current scope**: xlrd will safely and reliably ignore any of these if present in the file: - -* Charts, Macros, Pictures, any other embedded object. WARNING: currently this includes embedded worksheets. -* VBA modules -* Formulas (results of formula calculations are extracted, of course). 
-* Comments -* Hyperlinks -* Autofilters, advanced filters, pivot tables, conditional formatting, data validation - -**Unlikely to be done**: - -* Handling password-protected (encrypted) files. - -**Particular emphasis (refer docs for details)**: - -* Operability across OS, regions, platforms -* Handling Excel's date problems, including the Windows / Macintosh four-year differential. -* Providing access to named constants and named groups of cells (from version 0.6.0) -* Providing access to "visual" information: font, "number format", background, border, alignment and protection for cells, height/width etc for rows/columns (from version 0.6.1) - -**Quick start**: - -```python -import xlrd -book = xlrd.open_workbook("myfile.xls") -print("The number of worksheets is {0}".format(book.nsheets)) -print("Worksheet name(s): {0}".format(book.sheet_names())) -sh = book.sheet_by_index(0) -print("{0} {1} {2}".format(sh.name, sh.nrows, sh.ncols)) -print("Cell D30 is {0}".format(sh.cell_value(rowx=29, colx=3))) -for rx in range(sh.nrows): - print(sh.row(rx)) -``` - -**Another quick start**: This will show the first, second and last rows of each sheet in each file: - - python PYDIR/scripts/runxlrd.py 3rows *blah*.xls - -**Installation**: - -* On Windows: use the installer. -* Any OS: Unzip the .zip file into a suitable directory, chdir to that directory, then do "python setup.py install". -* If PYDIR is your Python installation directory: the main files are in PYDIR/Lib/site-packages/xlrd the docs are in the doc subdirectory, and there's a sample script: PYDIR/Scripts/runxlrd.py -* If os.sep != "/": make the appropriate adjustments. - -**Acknowledgements**: - -* This package started life as a translation from C into Python of parts of a utility called "xlreader" developed by David Giffin. "This product includes software developed by David Giffin ." 
-* OpenOffice.org has truly excellent documentation of the Microsoft Excel file formats and Compound Document file format, authored by Daniel Rentz. See http://sc.openoffice.org -* U+5F20 U+654F: over a decade of inspiration, support, and interesting decoding opportunities. -* Ksenia Marasanova: sample Macintosh and non-Latin1 files, alpha testing -* Backporting to Python 2.1 was partially funded by Journyx - provider of timesheet and project accounting solutions (http://journyx.com/). -* Provision of formatting information in version 0.6.1 was funded by Simplistix Ltd (http://www.simplistix.co.uk/) diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..a0551e7 --- /dev/null +++ b/README.rst @@ -0,0 +1,56 @@ +xlrd +==== + +|Build Status|_ |Coverage Status|_ |Documentation|_ |PyPI version|_ + +.. |Build Status| image:: https://circleci.com/gh/python-excel/xlrd/tree/master.svg?style=shield +.. _Build Status: https://circleci.com/gh/python-excel/xlrd/tree/master + +.. |Coverage Status| image:: https://codecov.io/gh/python-excel/xlrd/branch/master/graph/badge.svg?token=lNSqwBBbvk +.. _Coverage Status: https://codecov.io/gh/python-excel/xlrd + +.. |Documentation| image:: https://readthedocs.org/projects/xlrd/badge/?version=latest +.. _Documentation: http://xlrd.readthedocs.io/en/latest/?badge=latest + +.. |PyPI version| image:: https://badge.fury.io/py/xlrd.svg +.. _PyPI version: https://badge.fury.io/py/xlrd + + +xlrd is a library for reading data and formatting information from Excel +files in the historical ``.xls`` format. + +.. warning:: + + This library will no longer read anything other than ``.xls`` files. For + alternatives that read newer file formats, please see http://www.python-excel.org/. + +The following are also not supported but will safely and reliably be ignored: + +* Charts, Macros, Pictures, any other embedded object, **including** embedded worksheets. +* VBA modules +* Formulas, but results of formula calculations are extracted. 
+* Comments +* Hyperlinks +* Autofilters, advanced filters, pivot tables, conditional formatting, data validation + +Password-protected files are not supported and cannot be read by this library. + +Quick start: + +.. code-block:: python + + import xlrd + book = xlrd.open_workbook("myfile.xls") + print("The number of worksheets is {0}".format(book.nsheets)) + print("Worksheet name(s): {0}".format(book.sheet_names())) + sh = book.sheet_by_index(0) + print("{0} {1} {2}".format(sh.name, sh.nrows, sh.ncols)) + print("Cell D30 is {0}".format(sh.cell_value(rowx=29, colx=3))) + for rx in range(sh.nrows): + print(sh.row(rx)) + +From the command line, this will show the first, second and last rows of each sheet in each file: + +.. code-block:: bash + + python PYDIR/scripts/runxlrd.py 3rows *blah*.xls diff --git a/docs/acknowledgements.rst b/docs/acknowledgements.rst index d4d3874..62530f4 100644 --- a/docs/acknowledgements.rst +++ b/docs/acknowledgements.rst @@ -1,6 +1,18 @@ Acknowledgements ================ +Many thanks to John Machin for originally writing :mod:`xlrd` and tirelessly +supporting it for many years before retiring. + +* This package started life as a translation from C into Python of parts of a utility called "xlreader" developed by David Giffin. "This product includes software developed by David Giffin ." +* OpenOffice.org has truly excellent documentation of the Microsoft Excel file formats and Compound Document file format, authored by Daniel Rentz. See http://sc.openoffice.org +* U+5F20 U+654F: over a decade of inspiration, support, and interesting decoding opportunities. +* Ksenia Marasanova: sample Macintosh and non-Latin1 files, alpha testing +* Backporting to Python 2.1 was partially funded by Journyx - provider of timesheet and project accounting solutions (http://journyx.com/). +* Provision of formatting information in version 0.6.1 was funded by `Simplistix Ltd`__. 
+ +__ http://www.simplistix.co.uk + Development of this package would not have been possible without the document OpenOffice.org's Documentation of the Microsoft Excel File Format" ("OOo docs" for short). @@ -18,8 +30,3 @@ Backporting to Python 2.1 was partially funded by `Journyx - provider of timesheet and project accounting solutions`__. __ http://journyx.com/ - -Provision of formatting information in version 0.6.1 was funded by -`Simplistix Ltd`__. - -__ http://www.simplistix.co.uk diff --git a/docs/changes.rst b/docs/changes.rst index 406f045..d712420 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -1,556 +1,4 @@ -Changes -======= .. currentmodule:: xlrd -1.2.0 (15 December 2018) ------------------------- - -- Added support for Python 3.7. -- Added optional support for defusedxml to help mitigate exploits. -- Automatically convert ``~`` in file paths to the current user's home - directory. -- Removed ``examples`` directory from the installed package. They are still - available in the source distribution. -- Fixed ``time.clock()`` deprecation warning. - -1.1.0 (22 August 2017) ----------------------- - -- Fix for parsing of merged cells containing a single cell reference in xlsx - files. - -- Fix for "invalid literal for int() with base 10: 'true'" when reading some - xlsx files. - -- Make xldate_as_datetime available to import direct from xlrd. - -- Build universal wheels. - -- Sphinx documentation. - -- Document the problem with XML vulnerabilities in xlsx files and mitigation - measures. - -- Fix :class:`NameError` on ``has_defaults is not defined``. - -- Some whitespace and code style tweaks. - -- Make example in README compatible with both Python 2 and 3. - -- Add default value for cells containing errors that causeed parsing of some - xlsx files to fail. - -- Add Python 3.6 to the list of supported Python versions, drop 3.3 and 2.6. - -- Use generator expressions to avoid unnecessary lists in memory. 
- -- Document unicode encoding used in Excel files from Excel 97 onwards. - -- Report hyperlink errors in R1C1 syntax. - -Thanks to the following for their contributions to this release: - -- icereval@gmail.com -- Daniel Rech -- Ville Skyttä -- Yegor Yefremov -- Maxime Lorant -- Alexandr N Zamaraev -- Zhaorong Ma -- Jon Dufresne -- Chris McIntyre -- coltleese@gmail.com -- Ivan Masá - -1.0.0 (2 June 2016) -------------------- - -- Official support, such as it is, is now for 2.6, 2.7, 3.3+ - -- Fixes a bug in looking up non-lowercase sheet filenames by ensuring that the - sheet targets are transformed the same way as the component_names dict keys. - -- Fixes a bug for ``ragged_rows=False`` when merged cells increases the number - of columns in the sheet. This requires all rows to be extended to ensure equal - row lengths that match the number of columns in the sheet. - -- Fixes to enable reading of SAP-generated .xls files. - -- support BIFF4 files with missing FORMAT records. - -- support files with missing WINDOW2 record. - -- Empty cells are now always unicode strings, they were a bytestring on - Python 2 and a unicode string on Python 3. - -- Fix for ```` ``inlineStr`` attribute without ```` child. - -- Fix for a zoom of ``None`` causing problems on Python 3. - -- Fix parsing of bad dimensions. - -- Fix xlsx sheet to comments relationship. - -Thanks to the following for their contributions to this release: - -- Lars-Erik Hannelius -- Deshi Xiao -- Stratos Moro -- Volker Diels-Grabsch -- John McNamara -- Ville Skyttä -- Patrick Fuller -- Dragon Dave McKee -- Gunnlaugur Þór Briem - -0.9.4 (14 July 2015) --------------------- - -- Automated tests are now run on Python 3.4 - -- Use ``ElementTree.iter()`` if available, instead of the deprecated - ``getiterator()`` when parsing xlsx files. 
- -- Fix #106 : Exception Value: unorderable types: Name() < Name() - -- Create row generator expression with Sheet.get_rows() - -- Fix for forward slash file separator and lowercase names within xlsx - internals. - -Thanks to the following for their contributions to this release: - -- Corey Farwell -- Jonathan Kamens -- Deepak N -- Brandon R. Stoner -- John McNamara - -0.9.3 (8 Apr 2014) ------------------- - -- Github issue #49 - -- Github issue #64 - skip meaningless chunk of 4 zero bytes between two - otherwise-valid BIFF records - -- Github issue #61 - fix updating of escapement attribute of Font objects read - from workbooks. - -- Implemented ``Sheet.visibility`` for xlsx files - -- Ignore anchors (``$``) in cell references - -- Dropped support for Python 2.5 and earlier, Python 2.6 is now the earliest - Python release supported - -- Read xlsx merged cell elements. - -- Read cell comments in .xlsx files. - -- Added xldate_as_datetime() function to convert from Excel - serial date/time to datetime.datetime object. - -Thanks to the following for their contributions to this release: - -- John Machin -- Caleb Epstein -- Martin Panter -- John McNamara -- Gunnlaugur Þór Briem -- Stephen Lewis - - -0.9.2 (9 Apr 2013) ------------------- - -- Fix some packaging issues that meant docs and examples were missing from the tarball. - -- Fixed a small but serious regression that caused problems opening .xlsx files. - -0.9.1 (5 Apr 2013) ------------------- - -- Many fixes bugs in Python 3 support. -- Fix bug where ragged rows needed fixing when formatting info was being parsed. -- Improved handling of aberrant Excel 4.0 Worksheet files. -- Various bug fixes. -- Simplify a lot of the distribution packaging. -- Remove unused and duplicate imports. - -Thanks to the following for their contributions to this release: - -- Thomas Kluyver - -0.9.0 (31 Jan 2013) -------------------- - -- Support for Python 3.2+ -- Many new unit test added. 
-- Continuous integration tests are now run. -- Various bug fixes. - -Special thanks to Thomas Kluyver and Martin Panter for their work on -Python 3 compatibility. - -Thanks to Manfred Moitzi for re-licensing his unit tests so we could include -them. - -Thanks to the following for their contributions to this release: - -- "holm" -- Victor Safronovich -- Ross Jones - -0.8.0 (22 Aug 2012) -------------------- - -- More work-arounds for broken source files. -- Support for reading .xlsx files. -- Drop support for Python 2.5 and older. - -0.7.8 (7 June 2012) -------------------- - -- Ignore superfluous zero bytes at end of xls OBJECT record. -- Fix assertion error when reading file with xlwt-written bitmap. - -0.7.7 (13 Apr 2012) -------------------- - -- More packaging changes, this time to support 2to3. - -0.7.6 (3 Apr 2012) ------------------- - -- Fix more packaging issues. - -0.7.5 (3 Apr 2012) ------------------- -- Fix packaging issue that missed ``version.txt`` from the distributions. - -0.7.4 (2 Apr 2012) ------------------- - -- More tolerance of out-of-spec files. -- Fix bugs reading long text formula results. - -0.7.3 (28 Feb 2012) -------------------- - -- Packaging and documentation updates. - -0.7.2 (21 Feb 2012) -------------------- - -- Tolerant handling of files with extra zero bytes at end of NUMBER record. - Sample provided by Jan Kraus. -- Added access to cell notes/comments. Many cross-references added to Sheet - class docs. -- Added code to extract hyperlink (HLINK) records. Based on a patch supplied by - John Morrisey. -- Extraction of rich text formatting info based on code supplied by - Nathan van Gheem. -- added handling of BIFF2 WINDOW2 record. -- Included modified version of page breaks patch from Sam Listopad. -- Added reading of the PANE record. -- Reading SCL record. New attribute ``Sheet.scl_mag_factor``. -- Lots of bug fixes. -- Added ``ragged_rows`` functionality. 
- -0.7.1 (31 May 2009) -------------------- - -- Backed out "slash'n'burn" of sheet resources in unload_sheet(). - Fixed problem with STYLE records on some Mac Excel files. -- quieten warnings -- Integrated on_demand patch by Armando Serrano Lombillo - -0.7.0 (11 March 2009) ---------------------- - -+ colname utility function now supports more than 256 columns. -+ Fix bug where BIFF record type 0x806 was being regarded as a formula - opcode. -+ Ignore PALETTE record when formatting_info is false. -+ Tolerate up to 4 bytes trailing junk on PALETTE record. -+ Fixed bug in unused utility function xldate_from_date_tuple which - affected some years after 2099. -+ Added code for inspecting as-yet-unused record types: FILEPASS, TXO, - NOTE. -+ Added inspection code for add_in function calls. -+ Added support for unnumbered biff_dump (better for doing diffs). -+ ignore distutils cruft -+ Avoid assertion error in compdoc when -1 used instead of -2 for - first_SID of empty SCSS -+ Make version numbers match up. -+ Enhanced recovery from out-of-order/missing/wrong CODEPAGE record. -+ Added Name.area2d convenience method. -+ Avoided some checking of XF info when formatting_info is false. -+ Minor changes in preparation for XLSX support. -+ remove duplicate files that were out of date. -+ Basic support for Excel 2.0 -+ Decouple Book init & load. -+ runxlrd: minor fix for xfc. -+ More Excel 2.x work. -+ is_date_format() tweak. -+ Better detection of IronPython. -+ Better error message (including first 8 bytes of file) when file is - not in a supported format. -+ More BIFF2 formatting: ROW, COLWIDTH, and COLUMNDEFAULT records; -+ finished stage 1 of XF records. -+ More work on supporting BIFF2 (Excel 2.x) files. -+ Added support for Excel 2.x (BIFF2) files. Data only, no formatting - info. Alpha. -+ Wasn't coping with EXTERNSHEET record followed by CONTINUE - record(s). 
-+ Allow for BIFF2/3-style FORMAT record in BIFF4/8 file -+ Avoid crash when zero-length Unicode string missing options byte. -+ Warning message if sector sizes are extremely large. -+ Work around corrupt STYLE record -+ Added missing entry for blank cell type to ctype_text -+ Added "fonts" command to runxlrd script -+ Warning: style XF whose parent XF index != 0xFFF -+ Logfile arg wasn't being passed from open_workbook to - compdoc.CompDoc. - - -0.6.1 (10 June 2007) ---------------------- - -+ Version number updated to 0.6.1 -+ Documented runxlrd.py commands in its usage message. Changed - commands: dump to biff_dump, count_records to biff_count. - - -0.6.1a5 -------- - -+ Bug fixed: Missing "<" in a struct.unpack call means can't open - files on bigendian platforms. Discovered by "Mihalis". -+ Removed antique undocumented Book.get_name_dict method and - experimental "trimming" facility. -+ Meaningful exception instead of IndexError if a SAT (sector - allocation table) is corrupted. -+ If no CODEPAGE record in pre-8.0 file, assume ascii and keep going - (instead of raising exception). - - -0.6.1a4 -------- - -+ At least one source of XLS files writes parent style XF records - *after* the child cell XF records that refer to them, triggering - IndexError in 0.5.2 and AssertionError in later versions. Reported - with sample file by Todd O'Bryan. Fixed by changing to two-pass - processing of XF records. -+ Formatting info in pre-BIFF8 files: Ensured appropriate defaults and - lossless conversions to make the info BIFF8-compatible. Fixed bug in - extracting the "used" flags. -+ Fixed problems discovered with opening test files from Planmaker - 2006 (http://www.softmaker.com/english/ofwcomp_en.htm): (1) Four files - have reduced size of PALETTE record (51 and 32 colours; Excel writes - 56 always). xlrd now emits a NOTE to the logfile and continues. (2) - FORMULA records use the Excel 2.x record code 0x0021 instead of - 0x0221. xlrd now continues silently. 
(3) In two files, at the OLE2 - compound document level, the internal directory says that the length - of the Short-Stream Container Stream is 16384 bytes, but the actual - contents are 11264 and 9728 bytes respectively. xlrd now emits a - WARNING to the logfile and continues. -+ After discussion with Daniel Rentz, the concept of two lists of XF - (eXtended Format) objects (raw_xf_list and computed_xf_list) has been - abandoned. There is now a single list, called xf_list - - -0.6.1a3 -------- - -+ Added Book.sheets ... for sheetx, sheet in enumerate(book.sheets): -+ Formatting info: extraction of sheet-level flags from WINDOW2 - record, and sheet.visibility from BOUNDSHEET record. Added Macintosh- - only Font attributes "outline" and "shadow'. - - -0.6.1a2 -------- - -+ Added extraction of merged cells info. -+ pyExcelerator uses "general" instead of "General" for the generic - "number format". Worked around. -+ Crystal Reports writes "WORKBOOK" in the OLE2 Compound Document - directory instead of "Workbook". Changed to case-insensitive directory - search. Reported by Vic Simkus. - - -0.6.1a1 (18 Dec 2006) ---------------------- - -+ Added formatting information for cells (font, "number format", - background, border, alignment and protection) and rows/columns - (height/width etc). To save memory and time for those who don't need - it, this information is extracted only if formatting_info=1 is - supplied to the open_workbook() function. The cell records BLANK and - MULBLANKS which contain no data, only formatting information, will - continue to be ignored in the default (no formatting info) case. -+ Ralph Heimburger reported a problem with xlrd being intolerant about - an Excel 4.0 file (created by "some web app") with a DIMENSIONS record - that omitted Microsoft's usual padding with 2 unused bytes. Fixed. - - -0.6.0a4 (not released) ----------------------- - -+ Added extraction of human-readable formulas from NAME records. 
-+ Worked around OOo Calc writing 9-byte BOOLERR records instead of 8. - Reported by Rory Campbell-Lange. -+ This history file converted to descending chronological order and - HTML format. - - -0.6.0a3 (19 Sept 2006) ----------------------- - -+ Names: minor bugfixes; added script xlrdnameAPIdemo.py -+ ROW records were being used as additional hints for sizing memory - requirements. In some files the ROW records overstate the number of - used columns, and/or there are ROW records for rows that have no data - in them. This would cause xlrd to report sheet.ncols and/or - sheet.nrows as larger than reasonably expected. Change: ROW records - are ignored. The number of columns/rows is based solely on the highest - column/row index seen in non-empty data records. Empty data records - (types BLANK and MULBLANKS) which contain no data, only formatting - information, have always been ignored, and this will continue. - Consequence: trailing rows and columns which contain only empty cells - will vanish. - - -0.6.0a2 (13 Sept 2006) ----------------------- - - -+ Fixed a bug reported by Rory Campbell-Lange.: "open failed"; - incorrect assumptions about the layout of array formulas which return - strings. -+ Further work on defined names, especially the API. - - -0.6.0a1 (8 Sept 2006) ---------------------- - -+ Sheet objects have two new convenience methods: col_values(colx, - start_rowx=0, end_rowx=None) and the corresponding col_types. - Suggested by Dennis O'Brien. -+ BIFF 8 file missing its CODEPAGE record: xlrd will now assume - utf_16_le encoding (the only possibility) and keep going. -+ Older files missing a CODEPAGE record: an exception will be raised. - Thanks to Sergey Krushinsky for a sample file. The open_workbook() - function has a new argument (encoding_override) which can be used if - the CODEPAGE record is missing or incorrect (for example, - codepage=1251 but the data is actually encoded in koi8_r). 
The - runxlrd.py script takes a corresponding -e argument, for example -e - cp1251 -+ Further work done on parsing "number formats". Thanks to Chris - Withers for the ``"General_)"`` example. -+ Excel 97 introduced the concept of row and column labels, defined by - Insert > Name > Labels. The ranges containing the labels are now - exposed as the Sheet attributes row_label_ranges and col_label_ranges. -+ The major effort in this 0.6.0 release has been the provision of - access to named cell ranges and named constants (Excel: - Insert/Name/Define). Juan C. Mendez provided very useful real-world - sample files. - - -0.5.3a1 (24 May 2006) ---------------------- - -+ John Popplewell and Richard Sharp provided sample files which caused - any reliance at all on DIMENSIONS records and ROW records to be - abandoned. -+ If the file size is not a whole number of OLE sectors, a warning - message is logged. Previously this caused an exception to be raised. - - -0.5.2 (14 March 2006) ---------------------- - -+ public release -+ Updated version numbers, README, HISTORY. - - -0.5.2a3 (13 March 2006) ------------------------ - -+ Gnumeric writes user-defined formats with format codes starting at - 50 instead of 164; worked around. -+ Thanks to Didrik Pinte for reporting the need for xlrd to be more - tolerant of the idiosyncracies of other software, for supplying sample - files, and for performing alpha testing. -+ '_' character in a format should be treated like an escape - character; fixed. -+ An "empty" formula result means a zero-length string, not an empty - cell! Fixed. - - -0.5.2a2 (9 March 2006) ----------------------- - -+ Found that Gnumeric writes all DIMENSIONS records with nrows and - ncols each 1 less than they should be (except when it clamps ncols at - 256!), and pyXLwriter doesn't write ROW records. Cell memory pre- - allocation was generalised to use ROW records if available with fall- - back to DIMENSIONS records. 
- - -0.5.2a1 (6 March 2006) ----------------------- - - -+ pyXLwriter writes DIMENSIONS record with antique opcode 0x0000 - instead of 0x0200; worked around -+ A file written by Gnumeric had zeroes in DIMENSIONS record but data - in cell A1; worked around - - -0.5.1 (18 Feb 2006) --------------------- - -+ released to Journyx -+ Python 2.1 mmap requires file to be opened for update access. Added - fall-back to read-only access without mmap if 2.1 open fails because - "permission denied". - - -0.5 (7 Feb 2006) ----------------- - -+ released to Journyx -+ Now works with Python 2.1. Backporting to Python 2.1 was partially - funded by Journyx - provider of timesheet and project accounting - solutions (http://journyx.com/) -+ open_workbook() can be given the contents of a file instead of its - name. Thanks to Remco Boerma for the suggestion. -+ New module attribute __VERSION__ (as a string; for example "0.5") -+ Minor enhancements to classification of formats as date or not-date. -+ Added warnings about files with inconsistent OLE compound document - structures. Thanks to Roman V. Kiseliov (author of pyExcelerator) for - the tip-off. - - -0.4a1, (7 Sept 2005) --------------------- - -+ released to Laurent T. -+ Book and sheet objects can now be pickled and unpickled. Instead of - reading a large spreadsheet multiple times, consider pickling it once - and loading the saved pickle; can be much faster. Thanks to Laurent - Thioudellet for the enhancement request. -+ Using the mmap module can be turned off. But you would only do that - for benchmarking purposes. -+ Handling NUMBER records has been made faster - - -0.3a1 (15 May 2005) -------------------- - -- first public release +.. 
include:: ../CHANGELOG.rst diff --git a/docs/conf.py b/docs/conf.py index cfe8744..38f974a 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,10 +1,9 @@ import datetime import os -import pkginfo +from xlrd.info import __VERSION__ on_rtd = os.environ.get('READTHEDOCS', None) == 'True' -pkg_info = pkginfo.Develop(os.path.join(os.path.dirname(__file__), os.pardir)) intersphinx_mapping = {'http://docs.python.org': None} extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx'] @@ -12,7 +11,7 @@ master_doc = 'index' project = u'xlrd' copyright = '2005-%s Stephen John Machin, Lingfo Pty Ltd' % datetime.datetime.now().year -version = release = pkg_info.version +version = release = __VERSION__ exclude_patterns = ['_build'] pygments_style = 'sphinx' diff --git a/docs/development.rst b/docs/development.rst index 5e2f78f..cf29848 100644 --- a/docs/development.rst +++ b/docs/development.rst @@ -3,11 +3,6 @@ Development .. highlight:: bash -This package is developed using continuous integration which can be -found here: - -https://travis-ci.org/python-excel/xlrd - If you wish to contribute to this project, then you should fork the repository found here: @@ -24,23 +19,15 @@ your checkout into a virtualenv and then install the package in editable form as follows:: $ virtualenv . - $ bin/pip install -e . 
+ $ bin/pip install -e .[test] Running the tests ----------------- Once you've set up a virtualenv, the tests can be run as follows:: - $ python -m unittest discover - -To run tests on all the versions of Python that are supported, you can do:: - - $ bin/tox - -If you change the supported python versions in ``.travis.yml``, please remember -to do the following to update ``tox.ini``:: - - $ bin/panci --to=tox .travis.yml > tox.ini + $ source bin/activate + $ pytest Building the documentation -------------------------- @@ -48,16 +35,18 @@ Building the documentation The Sphinx documentation is built by doing the following, having activated the virtualenv above, from the directory containing setup.py:: + $ source bin/activate $ cd docs $ make html +To check that the description that will be used on PyPI renders properly, +do the following:: + + $ python setup.py --long-description | rst2html.py > desc.html + Making a release ---------------- -To make a release, just update the version in ``xlrd.info.__VERSION__``, -update the change log, tag it, push to https://github.com/python-excel/xlrd -and Travis CI should take care of the rest. - -Once the above is done, make sure to go to -https://readthedocs.org/projects/xlrd/versions/ -and make sure the new release is marked as an Active Version. +To make a release, just update the version in ``xlrd.info.__VERSION__``, update the change log +and push to https://github.com/python-excel/xlrd +and Carthorse should take care of the rest. diff --git a/docs/index.rst b/docs/index.rst index be0294c..06b774c 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,8 +1,10 @@ -xlrd documentation -================== +.. include:: ../README.rst + +You may also wish to consult the `tutorial`__. + +__ https://github.com/python-excel/tutorial -xlrd is a library for reading data and formatting information from Excel -files, whether they are .xls or .xlsx files. +Details: .. 
toctree:: :maxdepth: 1 @@ -12,20 +14,14 @@ files, whether they are .xls or .xlsx files. references.rst formatting.rst on_demand.rst - vulnerabilities.rst api.rst -You may also wish to consult the `tutorial`__. - -__ https://github.com/python-excel/tutorial - -For details of how to install the package or get involved in its -development, please see the sections below: +For details of how to get involved in development of this package, +and other meta-information, please see the sections below: .. toctree:: :maxdepth: 1 - installation.rst development.rst changes.rst acknowledgements.rst diff --git a/docs/installation.rst b/docs/installation.rst deleted file mode 100644 index 1af86e9..0000000 --- a/docs/installation.rst +++ /dev/null @@ -1,18 +0,0 @@ -Installation Instructions -========================= - -If you want to experiment with xlrd, the easiest way to -install it is to do the following in a virtualenv:: - - pip install xlrd - -If your package uses setuptools and you decide to use xlrd, -then you should add it as a requirement by adding an ``install_requires`` -parameter in your call to ``setup`` as follows: - -.. code-block:: python - - setup( - # other stuff here - install_requires=['xlrd'], - ) diff --git a/docs/vulnerabilities.rst b/docs/vulnerabilities.rst deleted file mode 100644 index eea53e3..0000000 --- a/docs/vulnerabilities.rst +++ /dev/null @@ -1,54 +0,0 @@ -XML vulnerabilities and Excel files -=================================== - -If your code ingests ``.xlsx`` files that come from sources in which you do not -have absolute trust, please be aware that ``.xlsx`` files are made up of XML -and, as such, are susceptible to the vulnerabilities of XML. - -xlrd uses ElementTree to parse XML, but as you'll find if you look into it, -there are many different ElementTree implementations. A good summary -of vulnerabilities you should worry can be found here: -:ref:`xml-vulnerabilities`. 
- -For clarity, xlrd will try and import ElementTree from the following sources. -The list is in priority order, with those earlier in the list being preferred -to those later in the list: - -1. `xml.etree.cElementTree`__ - - __ https://docs.python.org/2/library/xml.etree.elementtree.html - -2. `cElementTree`__ - - __ http://effbot.org/zone/celementtree.htm - -3. `lxml.etree`__ - - __ http://lxml.de/api/lxml.etree-module.html - -4. `xml.etree.ElementTree`__ - - __ https://docs.python.org/2/library/xml.etree.elementtree.html - -5. `elementtree.ElementTree`__ - - __ http://effbot.org/zone/element-index.htm - -To guard against these problems, you should consider the `defusedxml`__ -project which can be used as follows: - -__ https://pypi.org/project/defusedxml/ - -.. code-block:: python - - import defusedxml - from defusedxml.common import EntitiesForbidden - from xlrd import open_workbook - defusedxml.defuse_stdlib() - - - def secure_open_workbook(**kwargs): - try: - return open_workbook(**kwargs) - except EntitiesForbidden: - raise ValueError('Please use a xlsx file without XEE') diff --git a/examples/xlrdnameAPIdemo.py b/examples/xlrdnameAPIdemo.py deleted file mode 100644 index a016b4d..0000000 --- a/examples/xlrdnameAPIdemo.py +++ /dev/null @@ -1,181 +0,0 @@ -# -*- coding: utf-8 -*- - -## -# Module/script example of the xlrd API for extracting information -# about named references, named constants, etc. -# -#

Copyright © 2006 Stephen John Machin, Lingfo Pty Ltd

-#

This module is part of the xlrd package, which is released under a BSD-style licence.

-## -from __future__ import print_function - -import glob -import sys - -import xlrd -from xlrd.timemachine import REPR - - -def scope_as_string(book, scope): - if 0 <= scope < book.nsheets: - return "sheet #%d (%r)" % (scope, REPR(book.sheet_names()[scope])) - if scope == -1: - return "Global" - if scope == -2: - return "Macro/VBA" - return "Unknown scope value (%r)" % REPR(scope) - -def do_scope_query(book, scope_strg, show_contents=0, f=sys.stdout): - try: - qscope = int(scope_strg) - except ValueError: - if scope_strg == "*": - qscope = None # means "all' - else: - # so assume it's a sheet name ... - qscope = book.sheet_names().index(scope_strg) - print("%r => %d" % (scope_strg, qscope), file=f) - for nobj in book.name_obj_list: - if qscope is None or nobj.scope == qscope: - show_name_object(book, nobj, show_contents, f) - -def show_name_details(book, name, show_contents=0, f=sys.stdout): - """ - book -- Book object obtained from xlrd.open_workbook(). - name -- The name that's being investigated. - show_contents -- 0: Don't; 1: Non-empty cells only; 2: All cells - f -- Open output file handle. - """ - name_lcase = name.lower() # Excel names are case-insensitive. - nobj_list = book.name_map.get(name_lcase) - if not nobj_list: - print("%r: unknown name" % name, file=f) - return - for nobj in nobj_list: - show_name_object(book, nobj, show_contents, f) - -def show_name_details_in_scope(book, name, scope_strg, show_contents=0, f=sys.stdout): - try: - scope = int(scope_strg) - except ValueError: - # so assume it's a sheet name ... - scope = book.sheet_names().index(scope_strg) - print("%r => %d" % (scope_strg, scope), file=f) - name_lcase = name.lower() # Excel names are case-insensitive. 
- while 1: - nobj = book.name_and_scope_map.get((name_lcase, scope)) - if nobj: - break - print("Name %s not found in scope %d" % (REPR(name), scope), file=f) - if scope == -1: - return - scope = -1 # Try again with global scope - print("Name %s found in scope %d" % (REPR(name), scope), file=f) - show_name_object(book, nobj, show_contents, f) - -def showable_cell_value(celltype, cellvalue, datemode): - if celltype == xlrd.XL_CELL_DATE: - try: - showval = xlrd.xldate_as_tuple(cellvalue, datemode) - except xlrd.XLDateError as e: - showval = "%s:%s" % (type(e).__name__, e) - elif celltype == xlrd.XL_CELL_ERROR: - showval = xlrd.error_text_from_code.get( - cellvalue, '' % cellvalue) - else: - showval = cellvalue - return showval - -def show_name_object(book, nobj, show_contents=0, f=sys.stdout): - print("\nName: %s, scope: %s (%s)" - % (REPR(nobj.name), REPR(nobj.scope), scope_as_string(book, nobj.scope)), file=f) - res = nobj.result - print("Formula eval result: %s" % REPR(res), file=f) - if res is None: - return - # result should be an instance of the Operand class - kind = res.kind - value = res.value - if kind >= 0: - # A scalar, or unknown ... you've seen all there is to see. 
- pass - elif kind == xlrd.oREL: - # A list of Ref3D objects representing *relative* ranges - for i in range(len(value)): - ref3d = value[i] - print("Range %d: %s ==> %s"% (i, REPR(ref3d.coords), REPR(xlrd.rangename3drel(book, ref3d))), file=f) - elif kind == xlrd.oREF: - # A list of Ref3D objects - for i in range(len(value)): - ref3d = value[i] - print("Range %d: %s ==> %s"% (i, REPR(ref3d.coords), REPR(xlrd.rangename3d(book, ref3d))), file=f) - if not show_contents: - continue - datemode = book.datemode - for shx in range(ref3d.shtxlo, ref3d.shtxhi): - sh = book.sheet_by_index(shx) - print(" Sheet #%d (%s)" % (shx, sh.name), file=f) - rowlim = min(ref3d.rowxhi, sh.nrows) - collim = min(ref3d.colxhi, sh.ncols) - for rowx in range(ref3d.rowxlo, rowlim): - for colx in range(ref3d.colxlo, collim): - cty = sh.cell_type(rowx, colx) - if cty == xlrd.XL_CELL_EMPTY and show_contents == 1: - continue - cval = sh.cell_value(rowx, colx) - sval = showable_cell_value(cty, cval, datemode) - print(" (%3d,%3d) %-5s: %s" - % (rowx, colx, xlrd.cellname(rowx, colx), REPR(sval)), file=f) - -if __name__ == "__main__": - def usage(): - text = """ -usage: xlrdnameAIPdemo.py glob_pattern name scope show_contents - -where: - "glob_pattern" designates a set of files - "name" is a name or '*' (all names) - "scope" is -1 (global) or a sheet number - or a sheet name or * (all scopes) - "show_contents" is one of 0 (no show), - 1 (only non-empty cells), or 2 (all cells) - -Examples (script name and glob_pattern arg omitted for brevity) - [Searching through book.name_obj_list] - * * 0 lists all names - * * 1 lists all names, showing referenced non-empty cells - * 1 0 lists all names local to the 2nd sheet - * Northern 0 lists all names local to the 'Northern' sheet - * -1 0 lists all names with global scope - [Initial direct access through book.name_map] - Sales * 0 lists all occurrences of "Sales" in any scope - [Direct access through book.name_and_scope_map] - Revenue -1 0 checks if "Revenue" 
exists in global scope - -""" - sys.stdout.write(text) - - if len(sys.argv) != 5: - usage() - sys.exit(0) - arg_pattern = sys.argv[1] # glob pattern e.g. "foo*.xls" - arg_name = sys.argv[2] # see below - arg_scope = sys.argv[3] # see below - # 0: no show, - # 1: only non-empty cells, - # 2: all cells - arg_show_contents = int(sys.argv[4]) - for fname in glob.glob(arg_pattern): - book = xlrd.open_workbook(fname) - if arg_name == "*": - # Examine book.name_obj_list to find all names - # in a given scope ("*" => all scopes) - do_scope_query(book, arg_scope, arg_show_contents) - elif arg_scope == "*": - # Using book.name_map to find all usage of a name. - show_name_details(book, arg_name, arg_show_contents) - else: - # Using book.name_and_scope_map to find which if any instances - # of a name are visible in the given scope, which can be supplied - # as -1 (global) or a sheet number or a sheet name. - show_name_details_in_scope(book, arg_name, arg_scope, arg_show_contents) diff --git a/scripts/runxlrd.py b/scripts/runxlrd.py index d77b09b..b284a59 100644 --- a/scripts/runxlrd.py +++ b/scripts/runxlrd.py @@ -31,9 +31,6 @@ options = None if __name__ == "__main__": - - PSYCO = 0 - import xlrd import sys import time @@ -232,7 +229,7 @@ def count_xfs(bk): def main(cmd_args): import optparse - global options, PSYCO + global options usage = "\n%prog [options] command [input-file-patterns]\n" + cmd_doc oparser = optparse.OptionParser(usage) oparser.add_option( @@ -321,10 +318,6 @@ def main(cmd_args): n_unreachable = gc.collect() if n_unreachable: print("GC before open:", n_unreachable, "unreachable objects") - if PSYCO: - import psyco - psyco.full() - PSYCO = 0 try: t0 = time.time() bk = xlrd.open_workbook( @@ -413,8 +406,5 @@ def main(cmd_args): import pstats p = pstats.Stats('YYYY.prof') p.strip_dirs().sort_stats('cumulative').print_stats(30) - elif firstarg == "psyco": - PSYCO = 1 - main(av[1:]) else: main(av) diff --git a/setup.cfg b/setup.cfg index 060ef88..ed8a958 100644 
--- a/setup.cfg +++ b/setup.cfg @@ -1,24 +1,5 @@ [bdist_wheel] universal = 1 -[flake8] -ignore = - E126,E128, - E201,E202,E203,E221,E222,E225,E226,E231,E241,E251,E261,E262,E266,E271,E272, - E301,E302,E303,E305, - E501, - E701,E702,E704,E722,E731, - F401,F403,F405, - W504 - -[isort] -combine_as_imports = true -default_section = THIRDPARTY -include_trailing_comma = true -known_first_party = xlrd -multi_line_output = 5 -not_skip = __init__.py -skip = .tox - [metadata] license_file = LICENSE diff --git a/setup.py b/setup.py index b7fd7bd..1a08892 100644 --- a/setup.py +++ b/setup.py @@ -3,29 +3,23 @@ from xlrd.info import __VERSION__ setup( - name = 'xlrd', - version = __VERSION__, - author = 'John Machin', - author_email = 'sjmachin@lexicon.net', - url = 'http://www.python-excel.org/', - packages = ['xlrd'], - scripts = [ + name='xlrd', + version=__VERSION__, + author='Chris Withers', + author_email='chris@withers.org', + url='http://www.python-excel.org/', + packages=['xlrd'], + scripts=[ 'scripts/runxlrd.py', ], - description = ( + description=( 'Library for developers to extract data from ' - 'Microsoft Excel (tm) spreadsheet files' + 'Microsoft Excel (tm) .xls spreadsheet files' ), - long_description = ( - "Extract data from Excel spreadsheets " - "(.xls and .xlsx, versions 2.0 onwards) on any platform. " - "Pure Python (2.7, 3.4+). " - "Strong support for Excel dates. Unicode-aware." 
- ), - platforms = ["Any platform -- don't need Windows"], - license = 'BSD', - keywords = ['xls', 'excel', 'spreadsheet', 'workbook'], - classifiers = [ + long_description=open('README.rst').read(), + license='BSD', + keywords=['xls', 'excel', 'spreadsheet', 'workbook'], + classifiers=[ 'Development Status :: 5 - Production/Stable', 'Intended Audience :: Developers', 'License :: OSI Approved :: BSD License', @@ -33,14 +27,19 @@ 'Programming Language :: Python :: 2', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.4', - 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', 'Operating System :: OS Independent', 'Topic :: Database', 'Topic :: Office/Business', 'Topic :: Software Development :: Libraries :: Python Modules', ], - python_requires=">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*", + python_requires=">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*", + extras_require=dict( + test=['pytest', 'pytest-cov'], + docs=['sphinx'], + build=['wheel', 'twine'] + ) ) diff --git a/tests/apachepoi_49609.xlsx b/tests/apachepoi_49609.xlsx deleted file mode 100644 index 03d9d12..0000000 Binary files a/tests/apachepoi_49609.xlsx and /dev/null differ diff --git a/tests/apachepoi_52348.xlsx b/tests/apachepoi_52348.xlsx deleted file mode 100644 index 356490d..0000000 Binary files a/tests/apachepoi_52348.xlsx and /dev/null differ diff --git a/tests/err_cell_empty.xlsx b/tests/err_cell_empty.xlsx deleted file mode 100644 index 22d09f2..0000000 Binary files a/tests/err_cell_empty.xlsx and /dev/null differ diff --git a/tests/base.py b/tests/helpers.py similarity index 59% rename from tests/base.py rename to tests/helpers.py index 6ec130e..94de831 100644 --- a/tests/base.py +++ b/tests/helpers.py @@ -1,5 +1,5 @@ import os -def from_this_dir(filename): - 
return os.path.join(os.path.dirname(os.path.abspath(__file__)), filename) +def from_sample(filename): + return os.path.join(os.path.dirname(os.path.abspath(__file__)), 'samples', filename) diff --git a/tests/issue150.xlsx b/tests/issue150.xlsx deleted file mode 100644 index b4ae30e..0000000 Binary files a/tests/issue150.xlsx and /dev/null differ diff --git a/tests/merged_cells.xlsx b/tests/merged_cells.xlsx deleted file mode 100644 index 2d94fa2..0000000 Binary files a/tests/merged_cells.xlsx and /dev/null differ diff --git a/tests/reveng1.xlsx b/tests/reveng1.xlsx deleted file mode 100644 index 099e26b..0000000 Binary files a/tests/reveng1.xlsx and /dev/null differ diff --git a/tests/Formate.xls b/tests/samples/Formate.xls similarity index 100% rename from tests/Formate.xls rename to tests/samples/Formate.xls diff --git a/tests/biff4_no_format_no_window2.xls b/tests/samples/biff4_no_format_no_window2.xls similarity index 100% rename from tests/biff4_no_format_no_window2.xls rename to tests/samples/biff4_no_format_no_window2.xls diff --git a/tests/samples/corrupted_error.xls b/tests/samples/corrupted_error.xls new file mode 100644 index 0000000..8da3cfa Binary files /dev/null and b/tests/samples/corrupted_error.xls differ diff --git a/tests/formula_test_names.xls b/tests/samples/formula_test_names.xls similarity index 100% rename from tests/formula_test_names.xls rename to tests/samples/formula_test_names.xls diff --git a/tests/formula_test_sjmachin.xls b/tests/samples/formula_test_sjmachin.xls similarity index 100% rename from tests/formula_test_sjmachin.xls rename to tests/samples/formula_test_sjmachin.xls diff --git a/tests/issue20.xls b/tests/samples/issue20.xls similarity index 100% rename from tests/issue20.xls rename to tests/samples/issue20.xls diff --git a/examples/namesdemo.xls b/tests/samples/namesdemo.xls similarity index 100% rename from examples/namesdemo.xls rename to tests/samples/namesdemo.xls diff --git a/tests/picture_in_cell.xls 
b/tests/samples/picture_in_cell.xls similarity index 100% rename from tests/picture_in_cell.xls rename to tests/samples/picture_in_cell.xls diff --git a/tests/profiles.xls b/tests/samples/profiles.xls similarity index 100% rename from tests/profiles.xls rename to tests/samples/profiles.xls diff --git a/tests/ragged.xls b/tests/samples/ragged.xls similarity index 100% rename from tests/ragged.xls rename to tests/samples/ragged.xls diff --git a/tests/samples/sample.ods b/tests/samples/sample.ods new file mode 100644 index 0000000..b3edf0a Binary files /dev/null and b/tests/samples/sample.ods differ diff --git a/tests/samples/sample.txt b/tests/samples/sample.txt new file mode 100644 index 0000000..5adcd3d --- /dev/null +++ b/tests/samples/sample.txt @@ -0,0 +1 @@ +This is a text file. diff --git a/tests/samples/sample.xlsb b/tests/samples/sample.xlsb new file mode 100644 index 0000000..d47d602 Binary files /dev/null and b/tests/samples/sample.xlsb differ diff --git a/tests/test_comments_gdocs.xlsx b/tests/samples/sample.xlsx similarity index 100% rename from tests/test_comments_gdocs.xlsx rename to tests/samples/sample.xlsx diff --git a/tests/samples/sample.zip b/tests/samples/sample.zip new file mode 100644 index 0000000..808352d Binary files /dev/null and b/tests/samples/sample.zip differ diff --git a/tests/xf_class.xls b/tests/samples/xf_class.xls similarity index 100% rename from tests/xf_class.xls rename to tests/samples/xf_class.xls diff --git a/tests/self_evaluation_report_2014-05-19.xlsx b/tests/self_evaluation_report_2014-05-19.xlsx deleted file mode 100644 index 2e7cce7..0000000 Binary files a/tests/self_evaluation_report_2014-05-19.xlsx and /dev/null differ diff --git a/tests/test_cell.py b/tests/test_cell.py index 2df3394..42a046e 100644 --- a/tests/test_cell.py +++ b/tests/test_cell.py @@ -5,13 +5,13 @@ import xlrd from xlrd.timemachine import UNICODE_LITERAL -from .base import from_this_dir +from .helpers import from_sample class 
TestCell(unittest.TestCase): def setUp(self): - self.book = xlrd.open_workbook(from_this_dir('profiles.xls'), formatting_info=True) + self.book = xlrd.open_workbook(from_sample('profiles.xls'), formatting_info=True) self.sheet = self.book.sheet_by_name('PROFILEDEF') def test_empty_cell(self): @@ -43,31 +43,8 @@ def test_calculated_cell(self): self.assertTrue(cell.xf_index > 0) def test_merged_cells(self): - book = xlrd.open_workbook(from_this_dir('xf_class.xls'), formatting_info=True) + book = xlrd.open_workbook(from_sample('xf_class.xls'), formatting_info=True) sheet3 = book.sheet_by_name('table2') row_lo, row_hi, col_lo, col_hi = sheet3.merged_cells[0] self.assertEqual(sheet3.cell(row_lo, col_lo).value, 'MERGED') self.assertEqual((row_lo, row_hi, col_lo, col_hi), (3, 7, 2, 5)) - - def test_merged_cells_xlsx(self): - book = xlrd.open_workbook(from_this_dir('merged_cells.xlsx')) - - sheet1 = book.sheet_by_name('Sheet1') - expected = [] - got = sheet1.merged_cells - self.assertEqual(expected, got) - - sheet2 = book.sheet_by_name('Sheet2') - expected = [(0, 1, 0, 2)] - got = sheet2.merged_cells - self.assertEqual(expected, got) - - sheet3 = book.sheet_by_name('Sheet3') - expected = [(0, 1, 0, 2), (0, 1, 2, 4), (1, 4, 0, 2), (1, 9, 2, 4)] - got = sheet3.merged_cells - self.assertEqual(expected, got) - - sheet4 = book.sheet_by_name('Sheet4') - expected = [(0, 1, 0, 2), (2, 20, 0, 1), (1, 6, 2, 5)] - got = sheet4.merged_cells - self.assertEqual(expected, got) diff --git a/tests/test_comments_excel.xlsx b/tests/test_comments_excel.xlsx deleted file mode 100644 index cdc2465..0000000 Binary files a/tests/test_comments_excel.xlsx and /dev/null differ diff --git a/tests/test_comments_excel_sheet2.xlsx b/tests/test_comments_excel_sheet2.xlsx deleted file mode 100644 index 5eed9b6..0000000 Binary files a/tests/test_comments_excel_sheet2.xlsx and /dev/null differ diff --git a/tests/test_formats.py b/tests/test_formats.py index c663591..446f643 100644 --- 
a/tests/test_formats.py +++ b/tests/test_formats.py @@ -6,7 +6,7 @@ import xlrd -from .base import from_this_dir +from .helpers import from_sample if sys.version_info[0] >= 3: def u(s): return s @@ -18,7 +18,7 @@ def u(s): class TestCellContent(TestCase): def setUp(self): - self.book = xlrd.open_workbook(from_this_dir('Formate.xls'), formatting_info=True) + self.book = xlrd.open_workbook(from_sample('Formate.xls'), formatting_info=True) self.sheet = self.book.sheet_by_name(u('Blätt1')) def test_text_cells(self): diff --git a/tests/test_formulas.py b/tests/test_formulas.py index b254b7d..c178a8f 100644 --- a/tests/test_formulas.py +++ b/tests/test_formulas.py @@ -5,7 +5,7 @@ import xlrd -from .base import from_this_dir +from .helpers import from_sample try: ascii @@ -20,7 +20,7 @@ def ascii(s): class TestFormulas(TestCase): def setUp(self): - book = xlrd.open_workbook(from_this_dir('formula_test_sjmachin.xls')) + book = xlrd.open_workbook(from_sample('formula_test_sjmachin.xls')) self.sheet = book.sheet_by_index(0) def get_value(self, col, row): @@ -56,7 +56,7 @@ def test_cell_B8(self): class TestNameFormulas(TestCase): def setUp(self): - book = xlrd.open_workbook(from_this_dir('formula_test_names.xls')) + book = xlrd.open_workbook(from_sample('formula_test_names.xls')) self.sheet = book.sheet_by_index(0) def get_value(self, col, row): diff --git a/tests/test_ignore_workbook_corruption_error.py b/tests/test_ignore_workbook_corruption_error.py new file mode 100644 index 0000000..0cc4fd6 --- /dev/null +++ b/tests/test_ignore_workbook_corruption_error.py @@ -0,0 +1,15 @@ +from unittest import TestCase + +import xlrd + +from .helpers import from_sample + + +class TestIgnoreWorkbookCorruption(TestCase): + + def test_not_corrupted(self): + with self.assertRaises(Exception) as context: + xlrd.open_workbook(from_sample('corrupted_error.xls')) + self.assertTrue('Workbook corruption' in str(context.exception)) + + xlrd.open_workbook(from_sample('corrupted_error.xls'), 
ignore_workbook_corruption=True) diff --git a/tests/test_inspect.py b/tests/test_inspect.py new file mode 100644 index 0000000..2d1a303 --- /dev/null +++ b/tests/test_inspect.py @@ -0,0 +1,32 @@ +from xlrd import inspect_format + +from .helpers import from_sample + + +def test_xlsx(): + assert inspect_format(from_sample('sample.xlsx')) == 'xlsx' + + +def test_xlsb(): + assert inspect_format(from_sample('sample.xlsb')) == 'xlsb' + + +def test_ods(): + assert inspect_format(from_sample('sample.ods')) == 'ods' + + +def test_zip(): + assert inspect_format(from_sample('sample.zip')) == 'zip' + + +def test_xls(): + assert inspect_format(from_sample('namesdemo.xls')) == 'xls' + + +def test_content(): + with open(from_sample('sample.xlsx'), 'rb') as source: + assert inspect_format(content=source.read()) == 'xlsx' + + +def test_unknown(): + assert inspect_format(from_sample('sample.txt')) is None diff --git a/tests/test_missing_records.py b/tests/test_missing_records.py index ce4ac9d..9b39473 100644 --- a/tests/test_missing_records.py +++ b/tests/test_missing_records.py @@ -3,13 +3,13 @@ from xlrd import open_workbook from xlrd.biffh import XL_CELL_TEXT -from .base import from_this_dir +from .helpers import from_sample class TestMissingRecords(TestCase): def setUp(self): - path = from_this_dir('biff4_no_format_no_window2.xls') + path = from_sample('biff4_no_format_no_window2.xls') self.book = open_workbook(path) self.sheet = self.book.sheet_by_index(0) diff --git a/tests/test_open_workbook.py b/tests/test_open_workbook.py index e8ef978..771bcec 100644 --- a/tests/test_open_workbook.py +++ b/tests/test_open_workbook.py @@ -3,49 +3,34 @@ import tempfile from unittest import TestCase -from xlrd import open_workbook +import pytest -from .base import from_this_dir +from xlrd import open_workbook, XLRDError +from .helpers import from_sample -class TestOpen(TestCase): + +class TestOpen(object): # test different uses of open_workbook def test_names_demo(self): # For now, we just 
check this doesn't raise an error. - open_workbook( - from_this_dir(os.path.join('..','examples','namesdemo.xls')), - ) - - def test_tilde_path_expansion(self): - with tempfile.NamedTemporaryFile(suffix='.xlsx', dir=os.path.expanduser('~')) as fp: - shutil.copyfile(from_this_dir('text_bar.xlsx'), fp.name) - # For now, we just check this doesn't raise an error. - open_workbook(os.path.join('~', os.path.basename(fp.name))) + open_workbook(from_sample('namesdemo.xls')) def test_ragged_rows_tidied_with_formatting(self): # For now, we just check this doesn't raise an error. - open_workbook(from_this_dir('issue20.xls'), + open_workbook(from_sample('issue20.xls'), formatting_info=True) def test_BYTES_X00(self): # For now, we just check this doesn't raise an error. - open_workbook(from_this_dir('picture_in_cell.xls'), + open_workbook(from_sample('picture_in_cell.xls'), formatting_info=True) - def test_xlsx_simple(self): - # For now, we just check this doesn't raise an error. - open_workbook(from_this_dir('text_bar.xlsx')) - # we should make assertions here that data has been - # correctly processed. - - def test_xlsx(self): - # For now, we just check this doesn't raise an error. - open_workbook(from_this_dir('reveng1.xlsx')) - # we should make assertions here that data has been - # correctly processed. 
- + def test_open_xlsx(self): + with pytest.raises(XLRDError, match='Excel xlsx file; not supported'): + open_workbook(from_sample('sample.xlsx')) - def test_err_cell_empty(self): - # For cell with type "e" (error) but without inner 'val' tags - open_workbook(from_this_dir('err_cell_empty.xlsx')) + def test_open_unknown(self): + with pytest.raises(XLRDError, match="Unsupported format, or corrupt file"): + open_workbook(from_sample('sample.txt')) diff --git a/tests/test_sheet.py b/tests/test_sheet.py index dd46942..877043b 100644 --- a/tests/test_sheet.py +++ b/tests/test_sheet.py @@ -6,7 +6,7 @@ import xlrd from xlrd.timemachine import xrange -from .base import from_this_dir +from .helpers import from_sample SHEETINDEX = 0 NROWS = 15 @@ -22,7 +22,7 @@ class TestSheet(TestCase): 'AXISDATUMLEVELS', 'PROFILELEVELS'] def setUp(self): - self.book = xlrd.open_workbook(from_this_dir('profiles.xls'), formatting_info=True) + self.book = xlrd.open_workbook(from_sample('profiles.xls'), formatting_info=True) def check_sheet_function(self, function): self.assertTrue(function(0, 0)) @@ -91,12 +91,38 @@ def test_row(self): row = sheet.row(0) self.assertEqual(len(row), NCOLS) + def test_getitem_int(self): + sheet = self.book.sheet_by_index(SHEETINDEX) + row = sheet[0] + self.assertEqual(len(row), NCOLS) + + def test_getitem_tuple(self): + sheet = self.book.sheet_by_index(SHEETINDEX) + self.assertNotEqual(xlrd.empty_cell, sheet[0, 0]) + self.assertNotEqual(xlrd.empty_cell, sheet[NROWS-1, NCOLS-1]) + + def test_getitem_failure(self): + sheet = self.book.sheet_by_index(SHEETINDEX) + with self.assertRaises(ValueError): + sheet[0, 0, 0] + + with self.assertRaises(TypeError): + sheet["hi"] + def test_get_rows(self): sheet = self.book.sheet_by_index(SHEETINDEX) rows = sheet.get_rows() self.assertTrue(isinstance(rows, types.GeneratorType), True) self.assertEqual(len(list(rows)), sheet.nrows) + def test_iter(self): + sheet = self.book.sheet_by_index(SHEETINDEX) + rows = [] + # check syntax 
+ for row in sheet: + rows.append(row) + self.assertEqual(len(rows), sheet.nrows) + def test_col_slice(self): sheet = self.book.sheet_by_index(SHEETINDEX) self.check_col_slice(sheet.col_slice) @@ -125,19 +151,10 @@ def test_row_values(self): class TestSheetRagged(TestCase): def test_read_ragged(self): - book = xlrd.open_workbook(from_this_dir('ragged.xls'), ragged_rows=True) + book = xlrd.open_workbook(from_sample('ragged.xls'), ragged_rows=True) sheet = book.sheet_by_index(0) self.assertEqual(sheet.row_len(0), 3) self.assertEqual(sheet.row_len(1), 2) self.assertEqual(sheet.row_len(2), 1) self.assertEqual(sheet.row_len(3), 4) self.assertEqual(sheet.row_len(4), 4) - - -class TestMergedCells(TestCase): - - def test_tidy_dimensions(self): - book = xlrd.open_workbook(from_this_dir('merged_cells.xlsx')) - for sheet in book.sheets(): - for rowx in xrange(sheet.nrows): - self.assertEqual(sheet.row_len(rowx), sheet.ncols) diff --git a/tests/test_workbook.py b/tests/test_workbook.py index b717579..9efd9c2 100644 --- a/tests/test_workbook.py +++ b/tests/test_workbook.py @@ -2,20 +2,24 @@ from unittest import TestCase +import xlrd from xlrd import open_workbook from xlrd.book import Book from xlrd.sheet import Sheet -from .base import from_this_dir +from .helpers import from_sample +SHEETINDEX = 0 +NROWS = 15 +NCOLS = 13 -class TestWorkbook(TestCase): +class TestWorkbook(TestCase): sheetnames = ['PROFILEDEF', 'AXISDEF', 'TRAVERSALCHAINAGE', 'AXISDATUMLEVELS', 'PROFILELEVELS'] def setUp(self): - self.book = open_workbook(from_this_dir('profiles.xls')) + self.book = open_workbook(from_sample('profiles.xls')) def test_open_workbook(self): self.assertTrue(isinstance(self.book, Book)) @@ -43,3 +47,17 @@ def test_sheets(self): def test_sheet_names(self): self.assertEqual(self.sheetnames, self.book.sheet_names()) + + def test_getitem_ix(self): + sheet = self.book[SHEETINDEX] + self.assertNotEqual(xlrd.empty_cell, sheet.cell(0, 0)) + self.assertNotEqual(xlrd.empty_cell, 
sheet.cell(NROWS - 1, NCOLS - 1)) + + def test_getitem_name(self): + sheet = self.book[self.sheetnames[SHEETINDEX]] + self.assertNotEqual(xlrd.empty_cell, sheet.cell(0, 0)) + self.assertNotEqual(xlrd.empty_cell, sheet.cell(NROWS - 1, NCOLS - 1)) + + def test_iter(self): + sheets = [sh.name for sh in self.book] + self.assertEqual(sheets, self.sheetnames) diff --git a/tests/test_xlsx_comments.py b/tests/test_xlsx_comments.py deleted file mode 100644 index 11408b8..0000000 --- a/tests/test_xlsx_comments.py +++ /dev/null @@ -1,54 +0,0 @@ -from unittest import TestCase - -from xlrd import open_workbook - -from .base import from_this_dir - - -class TestXlsxComments(TestCase): - - def test_excel_comments(self): - book = open_workbook(from_this_dir('test_comments_excel.xlsx')) - sheet = book.sheet_by_index(0) - - note_map = sheet.cell_note_map - self.assertEqual(len(note_map), 1) - self.assertEqual(note_map[(0, 1)].text, 'hello') - - def test_excel_comments_multiline(self): - book = open_workbook(from_this_dir('test_comments_excel.xlsx')) - sheet = book.sheet_by_index(1) - - note_map = sheet.cell_note_map - self.assertEqual(note_map[(1, 2)].text, '1st line\n2nd line') - - def test_excel_comments_two_t_elements(self): - book = open_workbook(from_this_dir('test_comments_excel.xlsx')) - sheet = book.sheet_by_index(2) - - note_map = sheet.cell_note_map - self.assertEqual(note_map[(0, 0)].text, 'Author:\nTwo t elements') - - def test_excel_comments_no_t_elements(self): - book = open_workbook(from_this_dir('test_comments_excel.xlsx')) - sheet = book.sheet_by_index(3) - - note_map = sheet.cell_note_map - self.assertEqual(note_map[(0,0)].text, '') - - def test_gdocs_comments(self): - book = open_workbook(from_this_dir('test_comments_gdocs.xlsx')) - sheet = book.sheet_by_index(0) - - note_map = sheet.cell_note_map - self.assertEqual(len(note_map), 1) - self.assertEqual(note_map[(0, 1)].text, 'Just a test') - - def test_excel_comments_with_multi_sheets(self): - book = 
open_workbook(from_this_dir('test_comments_excel_sheet2.xlsx')) - sheet = book.sheet_by_index(1) - - note_map = sheet.cell_note_map - self.assertEqual(len(note_map), 1) - self.assertEqual(note_map[(1, 1)].text, 'Note lives here') - self.assertEqual(len(book.sheet_by_index(0).cell_note_map), 0) diff --git a/tests/test_xlsx_parse.py b/tests/test_xlsx_parse.py deleted file mode 100644 index ac06536..0000000 --- a/tests/test_xlsx_parse.py +++ /dev/null @@ -1,72 +0,0 @@ -############################################################################### -# -# Test the parsing of problematic xlsx files from bug reports. -# - -import unittest - -import xlrd - -from .base import from_this_dir - - -class TestXlsxParse(unittest.TestCase): - # Test parsing of problematic xlsx files. These are usually submitted - # as part of bug reports as noted below. - - def test_for_github_issue_75(self): - # Test inlineStr attribute without child. - # https://github.com/python-excel/xlrd/issues/75 - workbook = xlrd.open_workbook(from_this_dir('apachepoi_52348.xlsx')) - worksheet = workbook.sheet_by_index(0) - - # Test an empty inlineStr cell. - cell = worksheet.cell(0, 0) - self.assertEqual(cell.value, '') - self.assertEqual(cell.ctype, xlrd.book.XL_CELL_EMPTY) - - # Test a non-empty inlineStr cell. - cell = worksheet.cell(1, 2) - self.assertEqual(cell.value, 'Category') - self.assertEqual(cell.ctype, xlrd.book.XL_CELL_TEXT) - - def test_for_github_issue_96(self): - # Test for non-Excel file with forward slash file separator and - # lowercase names. https://github.com/python-excel/xlrd/issues/96 - workbook = xlrd.open_workbook(from_this_dir('apachepoi_49609.xlsx')) - worksheet = workbook.sheet_by_index(0) - - # Test reading sample data from the worksheet. 
- cell = worksheet.cell(0, 1) - self.assertEqual(cell.value, 'Cycle') - self.assertEqual(cell.ctype, xlrd.book.XL_CELL_TEXT) - - cell = worksheet.cell(1, 1) - self.assertEqual(cell.value, 1) - self.assertEqual(cell.ctype, xlrd.book.XL_CELL_NUMBER) - - def test_for_github_issue_101(self): - # Test for non-Excel file with forward slash file separator - # https://github.com/python-excel/xlrd/issues/101 - workbook = xlrd.open_workbook(from_this_dir('self_evaluation_report_2014-05-19.xlsx')) - worksheet = workbook.sheet_by_index(0) - - # Test reading sample data from the worksheet. - cell = worksheet.cell(0, 0) - self.assertEqual(cell.value, 'one') - self.assertEqual(cell.ctype, xlrd.book.XL_CELL_TEXT) - - def test_for_github_issue_150(self): - # Test for non-Excel file with a non-lowercase worksheet filename. - # https://github.com/python-excel/xlrd/issues/150 - workbook = xlrd.open_workbook(from_this_dir('issue150.xlsx')) - worksheet = workbook.sheet_by_index(0) - - # Test reading sample data from the worksheet. 
- cell = worksheet.cell(0, 1) - self.assertEqual(cell.value, 'Cycle') - self.assertEqual(cell.ctype, xlrd.book.XL_CELL_TEXT) - - cell = worksheet.cell(1, 1) - self.assertEqual(cell.value, 1) - self.assertEqual(cell.ctype, xlrd.book.XL_CELL_NUMBER) diff --git a/tests/text_bar.xlsx b/tests/text_bar.xlsx deleted file mode 100644 index 9e30e63..0000000 Binary files a/tests/text_bar.xlsx and /dev/null differ diff --git a/tox.ini b/tox.ini deleted file mode 100644 index c93a6c4..0000000 --- a/tox.ini +++ /dev/null @@ -1,20 +0,0 @@ -[tox] -envlist = - lint - py{37,36,35,34,27} - -[testenv] -deps = coverage -commands = - coverage run -m unittest discover - coverage report - -[testenv:lint] -basepython = python3 -commands = - flake8 - isort --check-only --diff -deps = - flake8 - isort -skip_install = True diff --git a/xlrd/__init__.py b/xlrd/__init__.py index 078c211..84d5f26 100644 --- a/xlrd/__init__.py +++ b/xlrd/__init__.py @@ -12,33 +12,86 @@ XL_CELL_NUMBER, XL_CELL_TEXT, XLRDError, biff_text_from_num, error_text_from_code, ) -from .book import Book, colname +from .book import Book, colname, open_workbook_xls +from .compdoc import SIGNATURE as XLS_SIGNATURE from .formula import * # is constrained by __all__ from .info import __VERSION__, __version__ from .sheet import empty_cell from .xldate import XLDateError, xldate_as_datetime, xldate_as_tuple -from .xlsx import X12Book -if sys.version.startswith("IronPython"): - # print >> sys.stderr, "...importing encodings" - import encodings -try: - import mmap - MMAP_AVAILABLE = 1 -except ImportError: - MMAP_AVAILABLE = 0 -USE_MMAP = MMAP_AVAILABLE +#: descriptions of the file types :mod:`xlrd` can :func:`inspect `. 
+FILE_FORMAT_DESCRIPTIONS = { + 'xls': 'Excel xls', + 'xlsb': 'Excel 2007 xlsb file', + 'xlsx': 'Excel xlsx file', + 'ods': 'Openoffice.org ODS file', + 'zip': 'Unknown ZIP file', + None: 'Unknown file type', +} + +ZIP_SIGNATURE = b"PK\x03\x04" + +PEEK_SIZE = max(len(XLS_SIGNATURE), len(ZIP_SIGNATURE)) + + +def inspect_format(path=None, content=None): + """ + Inspect the content at the supplied path or the :class:`bytes` content provided + and return the file's type as a :class:`str`, or ``None`` if it cannot + be determined. + + :param path: + A :class:`string ` path containing the content to inspect. + ``~`` will be expanded. + + :param content: + The :class:`bytes` content to inspect. + + :returns: + A :class:`str`, or ``None`` if the format cannot be determined. + The return value can always be looked up in :data:`FILE_FORMAT_DESCRIPTIONS` + to return a human-readable description of the format found. + """ + if content: + peek = content[:PEEK_SIZE] + else: + path = os.path.expanduser(path) + with open(path, "rb") as f: + peek = f.read(PEEK_SIZE) + + if peek.startswith(XLS_SIGNATURE): + return 'xls' + + if peek.startswith(ZIP_SIGNATURE): + zf = zipfile.ZipFile(timemachine.BYTES_IO(content) if content else path) + + # Workaround for some third party files that use forward slashes and + # lower case names. We map the expected name in lowercase to the + # actual filename in the zip container. 
+ component_names = {name.replace('\\', '/').lower(): name + for name in zf.namelist()} + + if 'xl/workbook.xml' in component_names: + return 'xlsx' + if 'xl/workbook.bin' in component_names: + return 'xlsb' + if 'content.xml' in component_names: + return 'ods' + return 'zip' + def open_workbook(filename=None, logfile=sys.stdout, verbosity=0, - use_mmap=USE_MMAP, + use_mmap=True, file_contents=None, encoding_override=None, formatting_info=False, on_demand=False, - ragged_rows=False): + ragged_rows=False, + ignore_workbook_corruption=False + ): """ Open a spreadsheet file for data extraction. @@ -100,52 +153,23 @@ def open_workbook(filename=None, This can result in substantial memory savings if rows are of widely varying sizes. See also the :meth:`~xlrd.sheet.Sheet.row_len` method. - :returns: An instance of the :class:`~xlrd.book.Book` class. - """ - peeksz = 4 - if file_contents: - peek = file_contents[:peeksz] - else: - filename = os.path.expanduser(filename) - with open(filename, "rb") as f: - peek = f.read(peeksz) - if peek == b"PK\x03\x04": # a ZIP file - if file_contents: - zf = zipfile.ZipFile(timemachine.BYTES_IO(file_contents)) - else: - zf = zipfile.ZipFile(filename) + :param ignore_workbook_corruption: - # Workaround for some third party files that use forward slashes and - # lower case names. We map the expected name in lowercase to the - # actual filename in the zip container. - component_names = dict([(X12Book.convert_filename(name), name) - for name in zf.namelist()]) + This option allows to read corrupted workbooks. + When ``False`` you may face CompDocError: Workbook corruption. + When ``True`` that exception will be ignored. - if verbosity: - logfile.write('ZIP component_names:\n') - pprint.pprint(component_names, logfile) - if 'xl/workbook.xml' in component_names: - from . 
import xlsx - bk = xlsx.open_workbook_2007_xml( - zf, - component_names, - logfile=logfile, - verbosity=verbosity, - use_mmap=use_mmap, - formatting_info=formatting_info, - on_demand=on_demand, - ragged_rows=ragged_rows, - ) - return bk - if 'xl/workbook.bin' in component_names: - raise XLRDError('Excel 2007 xlsb file; not supported') - if 'content.xml' in component_names: - raise XLRDError('Openoffice.org ODS file; not supported') - raise XLRDError('ZIP file contents not a known type of workbook') + :returns: An instance of the :class:`~xlrd.book.Book` class. + """ - from . import book - bk = book.open_workbook_xls( + file_format = inspect_format(filename, file_contents) + # We have to let unknown file formats pass through here, as some ancient + # files that xlrd can parse don't start with the expected signature. + if file_format and file_format != 'xls': + raise XLRDError(FILE_FORMAT_DESCRIPTIONS[file_format]+'; not supported') + + bk = open_workbook_xls( filename=filename, logfile=logfile, verbosity=verbosity, @@ -155,7 +179,9 @@ def open_workbook(filename=None, formatting_info=formatting_info, on_demand=on_demand, ragged_rows=ragged_rows, + ignore_workbook_corruption=ignore_workbook_corruption, ) + return bk diff --git a/xlrd/book.py b/xlrd/book.py index 0ba8f12..6876a3e 100644 --- a/xlrd/book.py +++ b/xlrd/book.py @@ -4,8 +4,7 @@ from __future__ import print_function -import gc -import sys +import struct from . import compdoc, formatting, sheet from .biffh import * @@ -18,23 +17,13 @@ # Python 2.7 from time import clock as perf_counter -import struct; unpack = struct.unpack +from struct import unpack empty_cell = sheet.empty_cell # for exposure to the world ... 
DEBUG = 0 -USE_FANCY_CD = 1 - -TOGGLE_GC = 0 -# gc.set_debug(gc.DEBUG_STATS) - -try: - import mmap - MMAP_AVAILABLE = 1 -except ImportError: - MMAP_AVAILABLE = 0 -USE_MMAP = MMAP_AVAILABLE +import mmap MY_EOF = 0xF00BAAA # not a 16-bit number @@ -68,15 +57,12 @@ del _bin, _bic, _code_from_builtin_name def open_workbook_xls(filename=None, - logfile=sys.stdout, verbosity=0, use_mmap=USE_MMAP, + logfile=sys.stdout, verbosity=0, use_mmap=True, file_contents=None, encoding_override=None, - formatting_info=False, on_demand=False, ragged_rows=False): + formatting_info=False, on_demand=False, ragged_rows=False, + ignore_workbook_corruption=False): t0 = perf_counter() - if TOGGLE_GC: - orig_gc_enabled = gc.isenabled() - if orig_gc_enabled: - gc.disable() bk = Book() try: bk.biff2_8_load( @@ -86,6 +72,7 @@ def open_workbook_xls(filename=None, formatting_info=formatting_info, on_demand=on_demand, ragged_rows=ragged_rows, + ignore_workbook_corruption=ignore_workbook_corruption ) t1 = perf_counter() bk.load_time_stage_1 = t1 - t0 @@ -126,9 +113,6 @@ def open_workbook_xls(filename=None, "*** Book-level data will be that of the last worksheet.\n", bk.nsheets ) - if TOGGLE_GC: - if orig_gc_enabled: - gc.enable() t2 = perf_counter() bk.load_time_stage_2 = t2 - t1 except: @@ -465,6 +449,14 @@ def sheet_by_index(self, sheetx): """ return self._sheet_list[sheetx] or self.get_sheet(sheetx) + def __iter__(self): + """ + Makes iteration through sheets of a book a little more straightforward. + Don't free resources after use since it can be called like `list(book)` + """ + for i in range(self.nsheets): + yield self.sheet_by_index(i) + def sheet_by_name(self, sheet_name): """ :param sheet_name: Name of the sheet required. @@ -476,6 +468,17 @@ def sheet_by_name(self, sheet_name): raise XLRDError('No sheet named <%r>' % sheet_name) return self.sheet_by_index(sheetx) + def __getitem__(self, item): + """ + Allow indexing with sheet name or index. 
+ :param item: Name or index of sheet enquired upon + :return: :class:`~xlrd.sheet.Sheet`. + """ + if isinstance(item, int): + return self.sheet_by_index(item) + else: + return self.sheet_by_name(item) + def sheet_names(self): """ :returns: @@ -593,15 +596,17 @@ def __init__(self): self.filestr = b'' def biff2_8_load(self, filename=None, file_contents=None, - logfile=sys.stdout, verbosity=0, use_mmap=USE_MMAP, + logfile=sys.stdout, verbosity=0, use_mmap=True, encoding_override=None, formatting_info=False, on_demand=False, - ragged_rows=False): + ragged_rows=False, + ignore_workbook_corruption=False + ): # DEBUG = 0 self.logfile = logfile self.verbosity = verbosity - self.use_mmap = use_mmap and MMAP_AVAILABLE + self.use_mmap = use_mmap self.encoding_override = encoding_override self.formatting_info = formatting_info self.on_demand = on_demand @@ -629,21 +634,15 @@ def biff2_8_load(self, filename=None, file_contents=None, # got this one at the antique store self.mem = self.filestr else: - cd = compdoc.CompDoc(self.filestr, logfile=self.logfile) - if USE_FANCY_CD: - for qname in ['Workbook', 'Book']: - self.mem, self.base, self.stream_len = \ - cd.locate_named_stream(UNICODE_LITERAL(qname)) - if self.mem: break - else: - raise XLRDError("Can't find workbook in OLE2 compound document") + cd = compdoc.CompDoc(self.filestr, logfile=self.logfile, + ignore_workbook_corruption=ignore_workbook_corruption) + for qname in ['Workbook', 'Book']: + self.mem, self.base, self.stream_len = \ + cd.locate_named_stream(UNICODE_LITERAL(qname)) + if self.mem: + break else: - for qname in ['Workbook', 'Book']: - self.mem = cd.get_named_stream(UNICODE_LITERAL(qname)) - if self.mem: break - else: - raise XLRDError("Can't find workbook in OLE2 compound document") - self.stream_len = len(self.mem) + raise XLRDError("Can't find workbook in OLE2 compound document") del cd if self.mem is not self.filestr: if hasattr(self.filestr, "close"): @@ -796,8 +795,8 @@ def derive_encoding(self): elif 
self.codepage is None: if self.biff_version < 80: fprintf(self.logfile, - "*** No CODEPAGE record, no encoding_override: will use 'ascii'\n") - self.encoding = 'ascii' + "*** No CODEPAGE record, no encoding_override: will use 'iso-8859-1'\n") + self.encoding = 'iso-8859-1' else: self.codepage = 1200 # utf16le if self.verbosity >= 2: @@ -808,6 +807,9 @@ def derive_encoding(self): encoding = encoding_from_codepage[codepage] elif 300 <= codepage <= 1999: encoding = 'cp' + str(codepage) + elif self.biff_version >= 80: + self.codepage = 1200 + encoding = 'utf_16_le' else: encoding = 'unknown_codepage_' + str(codepage) if DEBUG or (self.verbosity and encoding != self.encoding) : @@ -1189,7 +1191,11 @@ def handle_writeaccess(self, data): return strg = unpack_string(data, 0, self.encoding, lenlen=1) else: - strg = unpack_unicode(data, 0, lenlen=2) + try: + strg = unpack_unicode(data, 0, lenlen=2) + except UnicodeDecodeError: + # may have invalid trailing characters + strg = unpack_unicode(data.strip(), 0, lenlen=2) if DEBUG: fprintf(self.logfile, "WRITEACCESS: %d bytes; raw=%s %r\n", len(data), self.raw_user_name, strg) strg = strg.rstrip() self.user_name = strg diff --git a/xlrd/compdoc.py b/xlrd/compdoc.py index b4632dc..412a89e 100644 --- a/xlrd/compdoc.py +++ b/xlrd/compdoc.py @@ -81,8 +81,9 @@ class CompDoc(object): """ - def __init__(self, mem, logfile=sys.stdout, DEBUG=0): + def __init__(self, mem, logfile=sys.stdout, DEBUG=0, ignore_workbook_corruption=False): self.logfile = logfile + self.ignore_workbook_corruption = ignore_workbook_corruption self.DEBUG = DEBUG if mem[0:8] != SIGNATURE: raise CompDocError('Not an OLE2 compound document') @@ -423,8 +424,9 @@ def _locate_stream(self, mem, base, sat, sec_size, start_sid, expected_stream_si found_limit = (expected_stream_size + sec_size - 1) // sec_size while s >= 0: if self.seen[s]: - print("_locate_stream(%s): seen" % qname, file=self.logfile); dump_list(self.seen, 20, self.logfile) - raise CompDocError("%s 
corruption: seen[%d] == %d" % (qname, s, self.seen[s])) + if not self.ignore_workbook_corruption: + print("_locate_stream(%s): seen" % qname, file=self.logfile); dump_list(self.seen, 20, self.logfile) + raise CompDocError("%s corruption: seen[%d] == %d" % (qname, s, self.seen[s])) self.seen[s] = seen_id tot_found += 1 if tot_found > found_limit: diff --git a/xlrd/formatting.py b/xlrd/formatting.py index 9e4db6a..ca637b8 100644 --- a/xlrd/formatting.py +++ b/xlrd/formatting.py @@ -175,7 +175,8 @@ class Font(BaseObject, EqNeAttrs): #: 1 = Characters are bold. Redundant; see "weight" attribute. bold = 0 - #: Values:: + #: Values: + #: :: #: #: 0 = ANSI Latin #: 1 = System default @@ -204,7 +205,8 @@ class Font(BaseObject, EqNeAttrs): #: 1 = Superscript, 2 = Subscript. escapement = 0 - #: Values:: + #: Values: + #: :: #: #: 0 = None (unknown or don't care) #: 1 = Roman (variable width, serifed) @@ -230,7 +232,8 @@ class Font(BaseObject, EqNeAttrs): #: 1 = Characters are struck out. struck_out = 0 - #: Values:: + #: Values: + #: :: #: #: 0 = None #: 1 = Single; 0x21 (33) = Single accounting diff --git a/xlrd/info.py b/xlrd/info.py index f63f017..f26b6bb 100644 --- a/xlrd/info.py +++ b/xlrd/info.py @@ -1 +1 @@ -__version__ = __VERSION__ = "1.2.0" +__version__ = __VERSION__ = "2.0.1" diff --git a/xlrd/sheet.py b/xlrd/sheet.py index 79c200d..a831398 100644 --- a/xlrd/sheet.py +++ b/xlrd/sheet.py @@ -237,7 +237,8 @@ class Sheet(BaseObject): #: From the *optional* ``DEFAULTROWHEIGHT`` record. default_additional_space_below = None - #: Visibility of the sheet:: + #: Visibility of the sheet: + #: :: #: #: 0 = visible #: 1 = hidden (can be unhidden by user -- Format -> Sheet -> Unhide) @@ -476,10 +477,27 @@ def row(self, rowx): for colx in xrange(len(self._cell_values[rowx])) ] + def __getitem__(self, item): + """ + Takes either rowindex or (rowindex, colindex) as an index, + and returns either row or cell respectively. 
+ """ + try: + rowix, colix = item + except TypeError: + # it's not a tuple (or of right size), let's try indexing as is + # if this is a problem, let this error propagate back + return self.row(item) + else: + return self.cell(rowix, colix) + def get_rows(self): "Returns a generator for iterating through each row." return (self.row(index) for index in range(self.nrows)) + # makes `for row in sheet` natural and intuitive + __iter__ = get_rows + def row_types(self, rowx, start_colx=0, end_colx=None): """ Returns a slice of the types of the cells in the given row. @@ -721,7 +739,7 @@ def put_cell_unragged(self, rowx, colx, ctype, value, xf_index): if nr < self.nrows: # cell data is not in non-descending row order *AND* # self.ncols has been bumped up. - # This very rare case ruins this optmisation. + # This very rare case ruins this optimisation. self._first_full_rowx = -2 elif rowx > self._first_full_rowx > -2: self._first_full_rowx = rowx @@ -2091,6 +2109,9 @@ def handle_feat11(self, data): rupBuild, unusedShort,listFlags, lPosStmCache, cbStmCache, cchStmCache, lem, rgbHashParam, cchName), file=self.logfile) + def __repr__(self): + return "Sheet {:>2}:<{}>".format(self.number, self.name) + class MSODrawing(BaseObject): pass diff --git a/xlrd/xlsx.py b/xlrd/xlsx.py deleted file mode 100644 index fa1547b..0000000 --- a/xlrd/xlsx.py +++ /dev/null @@ -1,860 +0,0 @@ -## -# Portions copyright (c) 2008-2012 Stephen John Machin, Lingfo Pty Ltd -# This module is part of the xlrd package, which is released under a BSD-style licence. 
-## - -from __future__ import print_function, unicode_literals - -import re -import sys -from os.path import join, normpath - -from .biffh import ( - XL_CELL_BLANK, XL_CELL_BOOLEAN, XL_CELL_ERROR, XL_CELL_TEXT, XLRDError, - error_text_from_code, -) -from .book import Book, Name -from .formatting import XF, Format, is_date_format_string -from .sheet import Sheet -from .timemachine import * - -DEBUG = 0 - - -DLF = sys.stdout # Default Log File - -ET = None -ET_has_iterparse = False -Element_has_iter = False - -def ensure_elementtree_imported(verbosity, logfile): - global ET, ET_has_iterparse, Element_has_iter - if ET is not None: - return - if "IronPython" in sys.version: - import xml.etree.ElementTree as ET - #### 2.7.2.1: fails later with - #### NotImplementedError: iterparse is not supported on IronPython. (CP #31923) - else: - try: import defusedxml.cElementTree as ET - except ImportError: - try: import xml.etree.cElementTree as ET - except ImportError: - try: import cElementTree as ET - except ImportError: - try: import lxml.etree as ET - except ImportError: - try: import xml.etree.ElementTree as ET - except ImportError: - try: import elementtree.ElementTree as ET - except ImportError: - raise Exception("Failed to import an ElementTree implementation") - if hasattr(ET, 'iterparse'): - _dummy_stream = BYTES_IO(b'') - try: - ET.iterparse(_dummy_stream) - ET_has_iterparse = True - except NotImplementedError: - pass - Element_has_iter = hasattr(ET, 'ElementTree') and hasattr(ET.ElementTree, 'iter') - if verbosity: - etree_version = repr([ - (item, getattr(ET, item)) - for item in ET.__dict__.keys() - if item.lower().replace('_', '') == 'version' - ]) - print(ET.__file__, ET.__name__, etree_version, ET_has_iterparse, file=logfile) - -def split_tag(tag): - pos = tag.rfind('}') + 1 - if pos >= 2: - return tag[:pos], tag[pos:] - return '', tag - -def augment_keys(adict, uri): - # uri must already be enclosed in {} - for x in list(adict.keys()): - adict[uri + x] = 
adict[x] - -_UPPERCASE_1_REL_INDEX = {} # Used in fast conversion of column names (e.g. "XFD") to indices (16383) -for _x in xrange(26): - _UPPERCASE_1_REL_INDEX["ABCDEFGHIJKLMNOPQRSTUVWXYZ"[_x]] = _x + 1 -for _x in "123456789": - _UPPERCASE_1_REL_INDEX[_x] = 0 -del _x - -def cell_name_to_rowx_colx(cell_name, letter_value=_UPPERCASE_1_REL_INDEX, - allow_no_col=False): - # Extract column index from cell name - # A => 0, Z =>25, AA => 26, XFD => 16383 - colx = 0 - charx = -1 - try: - for c in cell_name: - charx += 1 - lv = letter_value[c] - if lv: - colx = colx * 26 + lv - else: # start of row number; can't be '0' - if charx == 0: - # there was no col marker - if allow_no_col: - colx = None - break - else: - raise Exception( - 'Missing col in cell name %r', cell_name) - else: - colx = colx - 1 - assert 0 <= colx < X12_MAX_COLS - break - except KeyError: - raise Exception('Unexpected character %r in cell name %r' % (c, cell_name)) - rowx = int(cell_name[charx:]) - 1 - return rowx, colx - -error_code_from_text = {} -for _code, _text in error_text_from_code.items(): - error_code_from_text[_text] = _code - -# === X12 === Excel 2007 .xlsx =============================================== - -U_SSML12 = "{http://schemas.openxmlformats.org/spreadsheetml/2006/main}" -U_ODREL = "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}" -U_PKGREL = "{http://schemas.openxmlformats.org/package/2006/relationships}" -U_CP = "{http://schemas.openxmlformats.org/package/2006/metadata/core-properties}" -U_DC = "{http://purl.org/dc/elements/1.1/}" -U_DCTERMS = "{http://purl.org/dc/terms/}" -XML_SPACE_ATTR = "{http://www.w3.org/XML/1998/namespace}space" -XML_WHITESPACE = "\t\n \r" -X12_MAX_ROWS = 2 ** 20 -X12_MAX_COLS = 2 ** 14 -V_TAG = U_SSML12 + 'v' # cell child: value -F_TAG = U_SSML12 + 'f' # cell child: formula -IS_TAG = U_SSML12 + 'is' # cell child: inline string - -def unescape(s, - subber=re.compile(r'_x[0-9A-Fa-f]{4,4}_', re.UNICODE).sub, - repl=lambda mobj: 
unichr(int(mobj.group(0)[2:6], 16))): - if "_" in s: - return subber(repl, s) - return s - -def cooked_text(self, elem): - t = elem.text - if t is None: - return '' - if elem.get(XML_SPACE_ATTR) != 'preserve': - t = t.strip(XML_WHITESPACE) - return ensure_unicode(unescape(t)) - -def get_text_from_si_or_is(self, elem, r_tag=U_SSML12+'r', t_tag=U_SSML12 +'t'): - "Returns unescaped unicode" - accum = [] - for child in elem: - # self.dump_elem(child) - tag = child.tag - if tag == t_tag: - t = cooked_text(self, child) - if t: # note: .text attribute can be None - accum.append(t) - elif tag == r_tag: - for tnode in child: - if tnode.tag == t_tag: - t = cooked_text(self, tnode) - if t: - accum.append(t) - return ''.join(accum) - -def map_attributes(amap, elem, obj): - for xml_attr, obj_attr, cnv_func_or_const in amap: - if not xml_attr: - setattr(obj, obj_attr, cnv_func_or_const) - continue - if not obj_attr: continue #### FIX ME #### - raw_value = elem.get(xml_attr) - cooked_value = cnv_func_or_const(raw_value) - setattr(obj, obj_attr, cooked_value) - -def cnv_ST_Xstring(s): - if s is None: return "" - return ensure_unicode(s) - -def cnv_xsd_unsignedInt(s): - if not s: - return None - value = int(s) - assert value >= 0 - return value - -def cnv_xsd_boolean(s): - if not s: - return 0 - if s in ("1", "true", "on"): - return 1 - if s in ("0", "false", "off"): - return 0 - raise ValueError("unexpected xsd:boolean value: %r" % s) - - -_defined_name_attribute_map = ( - ("name", "name", cnv_ST_Xstring, ), - ("comment", "", cnv_ST_Xstring, ), - ("customMenu", "", cnv_ST_Xstring, ), - ("description", "", cnv_ST_Xstring, ), - ("help", "", cnv_ST_Xstring, ), - ("statusBar", "", cnv_ST_Xstring, ), - ("localSheetId", "scope", cnv_xsd_unsignedInt, ), - ("hidden", "hidden", cnv_xsd_boolean, ), - ("function", "func", cnv_xsd_boolean, ), - ("vbProcedure", "vbasic", cnv_xsd_boolean, ), - ("xlm", "macro", cnv_xsd_boolean, ), - ("functionGroupId", "funcgroup", cnv_xsd_unsignedInt, ), - 
("shortcutKey", "", cnv_ST_Xstring, ), - ("publishToServer", "", cnv_xsd_boolean, ), - ("workbookParameter", "", cnv_xsd_boolean, ), - ("", "any_err", 0, ), - ("", "any_external", 0, ), - ("", "any_rel", 0, ), - ("", "basic_formula_len", 0, ), - ("", "binary", 0, ), - ("", "builtin", 0, ), - ("", "complex", 0, ), - ("", "evaluated", 0, ), - ("", "excel_sheet_index", 0, ), - ("", "excel_sheet_num", 0, ), - ("", "option_flags", 0, ), - ("", "result", None, ), - ("", "stack", None, ), -) - -def make_name_access_maps(bk): - name_and_scope_map = {} # (name.lower(), scope): Name_object - name_map = {} # name.lower() : list of Name_objects (sorted in scope order) - num_names = len(bk.name_obj_list) - for namex in xrange(num_names): - nobj = bk.name_obj_list[namex] - name_lcase = nobj.name.lower() - key = (name_lcase, nobj.scope) - if key in name_and_scope_map: - msg = 'Duplicate entry %r in name_and_scope_map' % (key, ) - if 0: - raise XLRDError(msg) - else: - if bk.verbosity: - print(msg, file=bk.logfile) - name_and_scope_map[key] = nobj - sort_data = (nobj.scope, namex, nobj) - if name_lcase in name_map: - name_map[name_lcase].append(sort_data) - else: - name_map[name_lcase] = [sort_data] - for key in name_map.keys(): - alist = name_map[key] - alist.sort() - name_map[key] = [x[2] for x in alist] - bk.name_and_scope_map = name_and_scope_map - bk.name_map = name_map - -class X12General(object): - - def process_stream(self, stream, heading=None): - if self.verbosity >= 2 and heading is not None: - fprintf(self.logfile, "\n=== %s ===\n", heading) - self.tree = ET.parse(stream) - getmethod = self.tag2meth.get - for elem in self.tree.iter() if Element_has_iter else self.tree.getiterator(): - if self.verbosity >= 3: - self.dump_elem(elem) - meth = getmethod(elem.tag) - if meth: - meth(self, elem) - self.finish_off() - - def finish_off(self): - pass - - def dump_elem(self, elem): - fprintf(self.logfile, - "===\ntag=%r len=%d attrib=%r text=%r tail=%r\n", - 
split_tag(elem.tag)[1], len(elem), elem.attrib, elem.text, elem.tail) - - def dumpout(self, fmt, *vargs): - text = (12 * ' ' + fmt + '\n') % vargs - self.logfile.write(text) - -class X12Book(X12General): - - def __init__(self, bk, logfile=DLF, verbosity=False): - self.bk = bk - self.logfile = logfile - self.verbosity = verbosity - self.bk.nsheets = 0 - self.bk.props = {} - self.relid2path = {} - self.relid2reltype = {} - self.sheet_targets = [] # indexed by sheetx - self.sheetIds = [] # indexed by sheetx - - core_props_menu = { - U_CP+"lastModifiedBy": ("last_modified_by", cnv_ST_Xstring), - U_DC+"creator": ("creator", cnv_ST_Xstring), - U_DCTERMS+"modified": ("modified", cnv_ST_Xstring), - U_DCTERMS+"created": ("created", cnv_ST_Xstring), - } - - def process_coreprops(self, stream): - if self.verbosity >= 2: - fprintf(self.logfile, "\n=== coreProps ===\n") - self.tree = ET.parse(stream) - getmenu = self.core_props_menu.get - props = {} - for elem in self.tree.iter() if Element_has_iter else self.tree.getiterator(): - if self.verbosity >= 3: - self.dump_elem(elem) - menu = getmenu(elem.tag) - if menu: - attr, func = menu - value = func(elem.text) - props[attr] = value - self.bk.user_name = props.get('last_modified_by') or props.get('creator') - self.bk.props = props - if self.verbosity >= 2: - fprintf(self.logfile, "props: %r\n", props) - self.finish_off() - - @staticmethod - def convert_filename(name): - return name.replace('\\', '/').lower() - - def process_rels(self, stream): - if self.verbosity >= 2: - fprintf(self.logfile, "\n=== Relationships ===\n") - tree = ET.parse(stream) - r_tag = U_PKGREL + 'Relationship' - for elem in tree.findall(r_tag): - rid = elem.get('Id') - target = X12Book.convert_filename(elem.get('Target')) - reltype = elem.get('Type').split('/')[-1] - if self.verbosity >= 2: - self.dumpout('Id=%r Type=%r Target=%r', rid, reltype, target) - self.relid2reltype[rid] = reltype - # self.relid2path[rid] = 'xl/' + target - if target.startswith('/'): 
- self.relid2path[rid] = target[1:] # drop the / - else: - self.relid2path[rid] = 'xl/' + target - - def do_defined_name(self, elem): - #### UNDER CONSTRUCTION #### - if 0 and self.verbosity >= 3: - self.dump_elem(elem) - nobj = Name() - bk = self.bk - nobj.bk = bk - nobj.name_index = len(bk.name_obj_list) - bk.name_obj_list.append(nobj) - nobj.name = elem.get('name') - nobj.raw_formula = None # compiled bytecode formula -- not in XLSX - nobj.formula_text = cooked_text(self, elem) - map_attributes(_defined_name_attribute_map, elem, nobj) - if nobj.scope is None: - nobj.scope = -1 # global - if nobj.name.startswith("_xlnm."): - nobj.builtin = 1 - if self.verbosity >= 2: - nobj.dump(header='=== Name object ===') - - def do_defined_names(self, elem): - for child in elem: - self.do_defined_name(child) - make_name_access_maps(self.bk) - - def do_sheet(self, elem): - bk = self.bk - sheetx = bk.nsheets - # print elem.attrib - rid = elem.get(U_ODREL + 'id') - sheetId = int(elem.get('sheetId')) - name = unescape(ensure_unicode(elem.get('name'))) - reltype = self.relid2reltype[rid] - target = self.relid2path[rid] - if self.verbosity >= 2: - self.dumpout( - 'sheetx=%d sheetId=%r rid=%r type=%r name=%r', - sheetx, sheetId, rid, reltype, name) - if reltype != 'worksheet': - if self.verbosity >= 2: - self.dumpout('Ignoring sheet of type %r (name=%r)', reltype, name) - return - state = elem.get('state') - visibility_map = { - None: 0, - 'visible': 0, - 'hidden': 1, - 'veryHidden': 2, - } - bk._sheet_visibility.append(visibility_map[state]) - sheet = Sheet(bk, position=None, name=name, number=sheetx) - sheet.utter_max_rows = X12_MAX_ROWS - sheet.utter_max_cols = X12_MAX_COLS - bk._sheet_list.append(sheet) - bk._sheet_names.append(name) - bk.nsheets += 1 - self.sheet_targets.append(target) - self.sheetIds.append(sheetId) - - - def do_workbookpr(self, elem): - datemode = cnv_xsd_boolean(elem.get('date1904')) - if self.verbosity >= 2: - self.dumpout('datemode=%r', datemode) - 
self.bk.datemode = datemode - - tag2meth = { - 'definedNames': do_defined_names, - 'workbookPr': do_workbookpr, - 'sheet': do_sheet, - } - augment_keys(tag2meth, U_SSML12) - -class X12SST(X12General): - - def __init__(self, bk, logfile=DLF, verbosity=0): - self.bk = bk - self.logfile = logfile - self.verbosity = verbosity - if ET_has_iterparse: - self.process_stream = self.process_stream_iterparse - else: - self.process_stream = self.process_stream_findall - - def process_stream_iterparse(self, stream, heading=None): - if self.verbosity >= 2 and heading is not None: - fprintf(self.logfile, "\n=== %s ===\n", heading) - si_tag = U_SSML12 + 'si' - elemno = -1 - sst = self.bk._sharedstrings - for event, elem in ET.iterparse(stream): - if elem.tag != si_tag: continue - elemno = elemno + 1 - if self.verbosity >= 3: - fprintf(self.logfile, "element #%d\n", elemno) - self.dump_elem(elem) - result = get_text_from_si_or_is(self, elem) - sst.append(result) - elem.clear() # destroy all child elements - if self.verbosity >= 2: - self.dumpout('Entries in SST: %d', len(sst)) - if self.verbosity >= 3: - for x, s in enumerate(sst): - fprintf(self.logfile, "SST x=%d s=%r\n", x, s) - - def process_stream_findall(self, stream, heading=None): - if self.verbosity >= 2 and heading is not None: - fprintf(self.logfile, "\n=== %s ===\n", heading) - self.tree = ET.parse(stream) - si_tag = U_SSML12 + 'si' - elemno = -1 - sst = self.bk._sharedstrings - for elem in self.tree.findall(si_tag): - elemno = elemno + 1 - if self.verbosity >= 3: - fprintf(self.logfile, "element #%d\n", elemno) - self.dump_elem(elem) - result = get_text_from_si_or_is(self, elem) - sst.append(result) - if self.verbosity >= 2: - self.dumpout('Entries in SST: %d', len(sst)) - -class X12Styles(X12General): - - def __init__(self, bk, logfile=DLF, verbosity=0): - self.bk = bk - self.logfile = logfile - self.verbosity = verbosity - self.xf_counts = [0, 0] - self.xf_type = None - self.fmt_is_date = {} - for x in list(range(14, 
23)) + list(range(45, 48)): #### hard-coding FIX ME #### - self.fmt_is_date[x] = 1 - # dummy entry for XF 0 in case no Styles section - self.bk._xf_index_to_xl_type_map[0] = 2 - # fill_in_standard_formats(bk) #### pre-integration kludge - - def do_cellstylexfs(self, elem): - self.xf_type = 0 - - def do_cellxfs(self, elem): - self.xf_type = 1 - - def do_numfmt(self, elem): - formatCode = ensure_unicode(elem.get('formatCode')) - numFmtId = int(elem.get('numFmtId')) - is_date = is_date_format_string(self.bk, formatCode) - self.fmt_is_date[numFmtId] = is_date - fmt_obj = Format(numFmtId, is_date + 2, formatCode) - self.bk.format_map[numFmtId] = fmt_obj - if self.verbosity >= 3: - self.dumpout('numFmtId=%d formatCode=%r is_date=%d', numFmtId, formatCode, is_date) - - def do_xf(self, elem): - if self.xf_type != 1: - #### ignoring style XFs for the moment - return - xfx = self.xf_counts[self.xf_type] - self.xf_counts[self.xf_type] = xfx + 1 - xf = XF() - self.bk.xf_list.append(xf) - self.bk.xfcount += 1 - numFmtId = int(elem.get('numFmtId', '0')) - xf.format_key = numFmtId - is_date = self.fmt_is_date.get(numFmtId, 0) - self.bk._xf_index_to_xl_type_map[xfx] = is_date + 2 - if self.verbosity >= 3: - self.dumpout('xfx=%d numFmtId=%d', xfx, numFmtId) - self.dumpout(repr(self.bk._xf_index_to_xl_type_map)) - - tag2meth = { - 'cellStyleXfs': do_cellstylexfs, - 'cellXfs': do_cellxfs, - 'numFmt': do_numfmt, - 'xf': do_xf, - } - augment_keys(tag2meth, U_SSML12) - -class X12Sheet(X12General): - - def __init__(self, sheet, logfile=DLF, verbosity=0): - self.sheet = sheet - self.logfile = logfile - self.verbosity = verbosity - self.rowx = -1 # We may need to count them. 
- self.bk = sheet.book - self.sst = self.bk._sharedstrings - self.relid2path = {} - self.relid2reltype = {} - self.merged_cells = sheet.merged_cells - self.warned_no_cell_name = 0 - self.warned_no_row_num = 0 - if ET_has_iterparse: - self.process_stream = self.own_process_stream - - def own_process_stream(self, stream, heading=None): - if self.verbosity >= 2 and heading is not None: - fprintf(self.logfile, "\n=== %s ===\n", heading) - row_tag = U_SSML12 + "row" - self_do_row = self.do_row - for event, elem in ET.iterparse(stream): - if elem.tag == row_tag: - self_do_row(elem) - elem.clear() # destroy all child elements (cells) - elif elem.tag == U_SSML12 + "dimension": - self.do_dimension(elem) - elif elem.tag == U_SSML12 + "mergeCell": - self.do_merge_cell(elem) - self.finish_off() - - def process_rels(self, stream): - if self.verbosity >= 2: - fprintf(self.logfile, "\n=== Sheet Relationships ===\n") - tree = ET.parse(stream) - r_tag = U_PKGREL + 'Relationship' - for elem in tree.findall(r_tag): - rid = elem.get('Id') - target = elem.get('Target') - reltype = elem.get('Type').split('/')[-1] - if self.verbosity >= 2: - self.dumpout('Id=%r Type=%r Target=%r', rid, reltype, target) - self.relid2reltype[rid] = reltype - self.relid2path[rid] = normpath(join('xl/worksheets', target)) - - def process_comments_stream(self, stream): - root = ET.parse(stream).getroot() - author_list = root[0] - assert author_list.tag == U_SSML12 + 'authors' - authors = [elem.text for elem in author_list] - comment_list = root[1] - assert comment_list.tag == U_SSML12 + 'commentList' - cell_note_map = self.sheet.cell_note_map - from .sheet import Note - text_tag = U_SSML12 + 'text' - r_tag = U_SSML12 + 'r' - t_tag = U_SSML12 + 't' - for elem in comment_list.findall(U_SSML12 + 'comment'): - ts = elem.findall('./' + text_tag + '/' + t_tag) - ts += elem.findall('./' + text_tag + '/' + r_tag + '/' + t_tag) - ref = elem.get('ref') - note = Note() - note.author = authors[int(elem.get('authorId'))] 
- note.rowx, note.colx = coords = cell_name_to_rowx_colx(ref) - note.text = '' - for t in ts: - note.text += cooked_text(self, t) - cell_note_map[coords] = note - - def do_dimension(self, elem): - ref = elem.get('ref') # example: "A1:Z99" or just "A1" - if ref: - # print >> self.logfile, "dimension: ref=%r" % ref - last_cell_ref = ref.split(':')[-1] # example: "Z99" - rowx, colx = cell_name_to_rowx_colx( - last_cell_ref, allow_no_col=True) - self.sheet._dimnrows = rowx + 1 - if colx is not None: - self.sheet._dimncols = colx + 1 - - def do_merge_cell(self, elem): - # The ref attribute should be a cell range like "B1:D5". - ref = elem.get('ref') - if ref: - try: - first_cell_ref, last_cell_ref = ref.split(':') - except ValueError: - # encountered a single cell merge, e.g. "B3" - first_cell_ref = ref - last_cell_ref = ref - first_rowx, first_colx = cell_name_to_rowx_colx(first_cell_ref) - last_rowx, last_colx = cell_name_to_rowx_colx(last_cell_ref) - self.merged_cells.append((first_rowx, last_rowx + 1, - first_colx, last_colx + 1)) - - def do_row(self, row_elem): - - def bad_child_tag(child_tag): - raise Exception('cell type %s has unexpected child <%s> at rowx=%r colx=%r' % (cell_type, child_tag, rowx, colx)) - - row_number = row_elem.get('r') - if row_number is None: # Yes, it's optional. - self.rowx += 1 - explicit_row_number = 0 - if self.verbosity and not self.warned_no_row_num: - self.dumpout("no row number; assuming rowx=%d", self.rowx) - self.warned_no_row_num = 1 - else: - self.rowx = int(row_number) - 1 - explicit_row_number = 1 - assert 0 <= self.rowx < X12_MAX_ROWS - rowx = self.rowx - colx = -1 - if self.verbosity >= 3: - self.dumpout(" row_number=%r rowx=%d explicit=%d", - row_number, self.rowx, explicit_row_number) - letter_value = _UPPERCASE_1_REL_INDEX - for cell_elem in row_elem: - cell_name = cell_elem.get('r') - if cell_name is None: # Yes, it's optional. 
- colx += 1 - if self.verbosity and not self.warned_no_cell_name: - self.dumpout("no cellname; assuming rowx=%d colx=%d", rowx, colx) - self.warned_no_cell_name = 1 - else: - # Extract column index from cell name - # A => 0, Z =>25, AA => 26, XFD => 16383 - colx = 0 - charx = -1 - try: - for c in cell_name: - charx += 1 - if c == '$': - continue - lv = letter_value[c] - if lv: - colx = colx * 26 + lv - else: # start of row number; can't be '0' - colx = colx - 1 - assert 0 <= colx < X12_MAX_COLS - break - except KeyError: - raise Exception('Unexpected character %r in cell name %r' % (c, cell_name)) - if explicit_row_number and cell_name[charx:] != row_number: - raise Exception('cell name %r but row number is %r' % (cell_name, row_number)) - xf_index = int(cell_elem.get('s', '0')) - cell_type = cell_elem.get('t', 'n') - tvalue = None - if cell_type == 'n': - # n = number. Most frequent type. - # child contains plain text which can go straight into float() - # OR there's no text in which case it's a BLANK cell - for child in cell_elem: - child_tag = child.tag - if child_tag == V_TAG: - tvalue = child.text - elif child_tag == F_TAG: - # formula - pass - else: - raise Exception('unexpected tag %r' % child_tag) - if not tvalue: - if self.bk.formatting_info: - self.sheet.put_cell(rowx, colx, XL_CELL_BLANK, '', xf_index) - else: - self.sheet.put_cell(rowx, colx, None, float(tvalue), xf_index) - elif cell_type == "s": - # s = index into shared string table. 2nd most frequent type - # child contains plain text which can go straight into int() - for child in cell_elem: - child_tag = child.tag - if child_tag == V_TAG: - tvalue = child.text - elif child_tag == F_TAG: - # formula not expected here, but gnumeric does it. 
- pass - else: - bad_child_tag(child_tag) - if not tvalue: - # - if self.bk.formatting_info: - self.sheet.put_cell(rowx, colx, XL_CELL_BLANK, '', xf_index) - else: - value = self.sst[int(tvalue)] - self.sheet.put_cell(rowx, colx, XL_CELL_TEXT, value, xf_index) - elif cell_type == "str": - # str = string result from formula. - # Should have (formula) child; however in one file, all text cells are str with no formula. - # child can contain escapes - for child in cell_elem: - child_tag = child.tag - if child_tag == V_TAG: - tvalue = cooked_text(self, child) - elif child_tag == F_TAG: - # formula - pass - else: - bad_child_tag(child_tag) - # assert tvalue is not None and formula is not None - # Yuk. Fails with file created by gnumeric -- no tvalue! - self.sheet.put_cell(rowx, colx, XL_CELL_TEXT, tvalue, xf_index) - elif cell_type == "b": - # b = boolean - # child contains "0" or "1" - for child in cell_elem: - child_tag = child.tag - if child_tag == V_TAG: - tvalue = child.text - elif child_tag == F_TAG: - # formula - pass - else: - bad_child_tag(child_tag) - self.sheet.put_cell(rowx, colx, XL_CELL_BOOLEAN, cnv_xsd_boolean(tvalue), xf_index) - elif cell_type == "e": - # e = error - # child contains e.g. "#REF!" - tvalue = '#N/A' - for child in cell_elem: - child_tag = child.tag - if child_tag == V_TAG: - tvalue = child.text - elif child_tag == F_TAG: - # formula - pass - else: - bad_child_tag(child_tag) - value = error_code_from_text[tvalue] - self.sheet.put_cell(rowx, colx, XL_CELL_ERROR, value, xf_index) - elif cell_type == "inlineStr": - # Not expected in files produced by Excel. 
- # It's a way of allowing 3rd party s/w to write text (including rich text) cells - # without having to build a shared string table - for child in cell_elem: - child_tag = child.tag - if child_tag == IS_TAG: - tvalue = get_text_from_si_or_is(self, child) - elif child_tag == V_TAG: - tvalue = child.text - elif child_tag == F_TAG: - # formula - pass - else: - bad_child_tag(child_tag) - if not tvalue: - if self.bk.formatting_info: - self.sheet.put_cell(rowx, colx, XL_CELL_BLANK, '', xf_index) - else: - self.sheet.put_cell(rowx, colx, XL_CELL_TEXT, tvalue, xf_index) - else: - raise Exception("Unknown cell type %r in rowx=%d colx=%d" % (cell_type, rowx, colx)) - - tag2meth = { - 'row': do_row, - } - augment_keys(tag2meth, U_SSML12) - -def open_workbook_2007_xml(zf, - component_names, - logfile=sys.stdout, - verbosity=0, - use_mmap=0, - formatting_info=0, - on_demand=0, - ragged_rows=0): - ensure_elementtree_imported(verbosity, logfile) - bk = Book() - bk.logfile = logfile - bk.verbosity = verbosity - bk.formatting_info = formatting_info - if formatting_info: - raise NotImplementedError("formatting_info=True not yet implemented") - bk.use_mmap = False #### Not supported initially - bk.on_demand = on_demand - if on_demand: - if verbosity: - print("WARNING *** on_demand=True not yet implemented; falling back to False", file=bk.logfile) - bk.on_demand = False - bk.ragged_rows = ragged_rows - - x12book = X12Book(bk, logfile, verbosity) - zflo = zf.open(component_names['xl/_rels/workbook.xml.rels']) - x12book.process_rels(zflo) - del zflo - zflo = zf.open(component_names['xl/workbook.xml']) - x12book.process_stream(zflo, 'Workbook') - del zflo - props_name = 'docprops/core.xml' - if props_name in component_names: - zflo = zf.open(component_names[props_name]) - x12book.process_coreprops(zflo) - - x12sty = X12Styles(bk, logfile, verbosity) - if 'xl/styles.xml' in component_names: - zflo = zf.open(component_names['xl/styles.xml']) - x12sty.process_stream(zflo, 'styles') - del 
zflo - else: - # seen in MS sample file MergedCells.xlsx - pass - - sst_fname = 'xl/sharedstrings.xml' - x12sst = X12SST(bk, logfile, verbosity) - if sst_fname in component_names: - zflo = zf.open(component_names[sst_fname]) - x12sst.process_stream(zflo, 'SST') - del zflo - - for sheetx in range(bk.nsheets): - fname = x12book.sheet_targets[sheetx] - zflo = zf.open(component_names[fname]) - sheet = bk._sheet_list[sheetx] - x12sheet = X12Sheet(sheet, logfile, verbosity) - heading = "Sheet %r (sheetx=%d) from %r" % (sheet.name, sheetx, fname) - x12sheet.process_stream(zflo, heading) - del zflo - - rels_fname = 'xl/worksheets/_rels/%s.rels' % fname.rsplit('/', 1)[-1] - if rels_fname in component_names: - zfrels = zf.open(rels_fname) - x12sheet.process_rels(zfrels) - del zfrels - - for relid, reltype in x12sheet.relid2reltype.items(): - if reltype == 'comments': - comments_fname = x12sheet.relid2path.get(relid) - if comments_fname and comments_fname in component_names: - comments_stream = zf.open(comments_fname) - x12sheet.process_comments_stream(comments_stream) - del comments_stream - - sheet.tidy_dimensions() - - return bk