diff --git a/README.rst b/README.rst index 82d1f0bd7..4f62a435c 100644 --- a/README.rst +++ b/README.rst @@ -8,3 +8,13 @@ More information is available in the `python-docx documentation`_. .. _`python-docx documentation`: https://python-docx.readthedocs.org/en/latest/ + +This fork of the repository includes a merge from `renejsum's fork <https://github.com/renejsum/python-docx>`_ with recent master from `the origin <https://github.com/python-openxml/python-docx>`_ to support read/write access to custom metadata properties of the document. For example:: + + >>> import docx + >>> d = docx.Document('test1.docx') + >>> p = d.custom_properties + >>> print(p['prov_wasDerivedFrom']) + fid://slap.G24X2UWc + >>> p['prov_wasAssociatedWith'] = 'some other value' + >>> d.save('test1.docx') diff --git a/docx/__init__.py b/docx/__init__.py index cfa48729d..6d2212fc8 100644 --- a/docx/__init__.py +++ b/docx/__init__.py @@ -10,6 +10,7 @@ from docx.opc.constants import CONTENT_TYPE as CT, RELATIONSHIP_TYPE as RT from docx.opc.part import PartFactory from docx.opc.parts.coreprops import CorePropertiesPart +from docx.opc.parts.customprops import CustomPropertiesPart from docx.parts.document import DocumentPart from docx.parts.image import ImagePart @@ -26,6 +27,7 @@ def part_class_selector(content_type, reltype): PartFactory.part_class_selector = part_class_selector PartFactory.part_type_for[CT.OPC_CORE_PROPERTIES] = CorePropertiesPart +PartFactory.part_type_for[CT.OPC_CUSTOM_PROPERTIES] = CustomPropertiesPart PartFactory.part_type_for[CT.WML_DOCUMENT_MAIN] = DocumentPart PartFactory.part_type_for[CT.WML_NUMBERING] = NumberingPart PartFactory.part_type_for[CT.WML_SETTINGS] = SettingsPart diff --git a/docx/document.py b/docx/document.py index ba94a7990..21546a7a4 100644 --- a/docx/document.py +++ b/docx/document.py @@ -108,6 +108,14 @@ def core_properties(self): """ return self._part.core_properties + @property + def custom_properties(self): + """ + A |CustomProperties| object providing read/write access to the custom + properties of this document. 
NS_VT = "http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes"


class CustomProperties(object):
    """
    Dictionary-style read/write access to the custom document properties
    held in the root element of the ``/docProps/custom.xml`` part.

    ``props[name]`` returns the text of the first child of the ``property``
    element whose ``name`` attribute equals *name*, or |None| when no such
    property (or no value child) exists. ``props[name] = value`` replaces
    that text, creating the property with a ``vt:lpwstr`` value element when
    it is not present yet.
    """

    # Value written to the ``fmtid`` attribute of every property created here.
    _FMTID = "{D5CDD505-2E9C-101B-9397-08002B2CF9AE}"

    def __init__(self, element):
        # *element* is the root element whose children are the individual
        # ``property`` elements.
        self._element = element

    def __getitem__(self, item):
        """
        Return the text of the value element of the property named *item*,
        or |None| if there is no such property or it has no value element.
        """
        prop = self.lookup(item)
        if prop is None or len(prop) == 0:
            return None
        return prop[0].text

    def __setitem__(self, key, value):
        """
        Set the text of the property named *key* to *value*, adding the
        property (and its ``vt:lpwstr`` value element) when required.
        """
        prop = self.lookup(key)
        if prop is None:
            prop = etree.SubElement(self._element, self._property_tag())
            prop.set("name", key)
            prop.set("fmtid", self._FMTID)
            prop.set("pid", str(self._next_pid()))
        if len(prop) == 0:
            # Either a brand new property or an existing one that carries no
            # value element yet; both need a value child to hold the text.
            value_elm = etree.SubElement(
                prop, '{%s}lpwstr' % NS_VT, nsmap={'vt': NS_VT}
            )
        else:
            value_elm = prop[0]
        value_elm.text = value

    def lookup(self, item):
        """
        Return the first child element whose ``name`` attribute equals
        *item*, or |None| if there is none.
        """
        for child in self._element:
            if child.get("name") == item:
                return child
        return None

    def _property_tag(self):
        """
        Return the tag for a new ``property`` element, placed in the same
        namespace as the root element. A namespace-less child under a
        namespaced root would be serialized with ``xmlns=""`` and so would no
        longer be a sibling of the existing properties.
        """
        root_tag = self._element.tag
        if root_tag.startswith('{'):
            namespace = root_tag[1:].partition('}')[0]
            return '{%s}property' % namespace
        return 'property'

    def _next_pid(self):
        """
        Return a ``pid`` that is not used by any existing property: one more
        than the highest numeric ``pid`` present, and never less than 2.
        Children without a numeric ``pid`` are ignored. Counting children
        instead would hand out a duplicate when the existing ids are not
        contiguous.
        """
        highest = 1
        for child in self._element:
            try:
                pid = int(child.get("pid"))
            except (TypeError, ValueError):
                continue
            if pid > highest:
                highest = pid
        return highest + 1
class CustomPropertiesPart(XmlPart):
    """
    Corresponds to part named ``/docProps/custom.xml``, containing the custom
    document properties for this document package.
    """
    @classmethod
    def default(cls, package):
        """
        Return a new, empty |CustomPropertiesPart| object belonging to
        *package*.
        """
        return cls._new(package)

    @property
    def custom_properties(self):
        """
        A |CustomProperties| object providing read/write access to the custom
        properties contained in this custom properties part. A new wrapper
        around this part's root element is built on each access.
        """
        return CustomProperties(self.element)

    @classmethod
    def load(cls, partname, content_type, blob, package):
        """
        Return an instance of this class for *partname*, with its root
        element obtained by parsing *blob* with ``ct_parse_xml()``.
        """
        element = ct_parse_xml(blob)
        return cls(partname, content_type, element, package)

    @classmethod
    def _new(cls, package):
        """
        Return an instance of this class at ``/docProps/custom.xml`` whose
        root element is a freshly created custom-properties element.
        """
        partname = PackURI('/docProps/custom.xml')
        content_type = CT.OPC_CUSTOM_PROPERTIES
        custom_properties_elm = CT_CustomProperties.new()
        # Instantiate via *cls* rather than the hard-coded class name so a
        # subclass calling default() gets an instance of itself.
        return cls(partname, content_type, custom_properties_elm, package)
class CT_CustomProperties(BaseOxmlElement):
    """
    ``<Properties>`` element, the root element of the Custom Properties
    part stored as ``/docProps/custom.xml``. String elements are
    limited in length to 255 unicode characters.

    NOTE(review): the ``_datetime_of_element``, ``_get_or_add``,
    ``_set_element_*`` and ``_text_of_element`` helpers look up child
    accessors via ``getattr(self, ...)``; no such accessors are declared in
    the visible class body, so they appear to be carried over from the core
    properties element -- confirm before relying on them.
    """

    # NOTE(review): the XML in this template was missing from the reviewed
    # text, which left ``'\n' % nsdecls('vt')`` -- a TypeError at class
    # creation. Reconstructed as an empty root element in the custom
    # properties namespace; confirm against the original commit.
    _customProperties_tmpl = (
        '<Properties xmlns="http://schemas.openxmlformats.org/officeDocument'
        '/2006/custom-properties" %s/>\n' % nsdecls('vt')
    )

    @classmethod
    def new(cls):
        """
        Return a new ``<Properties>`` element, parsed from the class template
        with ``ct_parse_xml()``.
        """
        return ct_parse_xml(cls._customProperties_tmpl)

    def _datetime_of_element(self, property_name):
        """
        Return the |datetime| parsed from the text of the child element
        reached through attribute *property_name*, or |None| if that
        attribute is |None| or its text is not a valid W3CDTF string.
        """
        element = getattr(self, property_name)
        if element is None:
            return None
        try:
            return self._parse_W3CDTF_to_datetime(element.text)
        except ValueError:
            # invalid datetime strings are ignored
            return None

    def _get_or_add(self, prop_name):
        """
        Return element returned by 'get_or_add_' method for *prop_name*.
        """
        get_or_add_method = getattr(self, 'get_or_add_%s' % prop_name)
        return get_or_add_method()

    @classmethod
    def _offset_dt(cls, dt, offset_str):
        """
        Return a |datetime| instance that is offset from datetime *dt* by
        the timezone offset specified in *offset_str*, a string like
        ``'-07:00'``. A positive offset is subtracted and a negative one
        added. Raises |ValueError| if *offset_str* does not match the
        expected pattern.
        """
        match = cls._offset_pattern.match(offset_str)
        if match is None:
            raise ValueError(
                "'%s' is not a valid offset string" % offset_str
            )
        sign, hours_str, minutes_str = match.groups()
        sign_factor = -1 if sign == '+' else 1
        hours = int(hours_str) * sign_factor
        minutes = int(minutes_str) * sign_factor
        return dt + timedelta(hours=hours, minutes=minutes)

    # raw string so ``\d`` reaches the regex engine as written instead of
    # being treated as an (invalid) string escape
    _offset_pattern = re.compile(r'([+-])(\d\d):(\d\d)')

    @classmethod
    def _parse_W3CDTF_to_datetime(cls, w3cdtf_str):
        """
        Return the |datetime| represented by *w3cdtf_str*. Raises
        |ValueError| if the string matches none of the supported layouts.
        """
        # valid W3CDTF date cases:
        # yyyy e.g. '2003'
        # yyyy-mm e.g. '2003-12'
        # yyyy-mm-dd e.g. '2003-12-31'
        # UTC timezone e.g. '2003-12-31T10:14:55Z'
        # numeric timezone e.g. '2003-12-31T10:14:55-08:00'
        templates = (
            '%Y-%m-%dT%H:%M:%S',
            '%Y-%m-%d',
            '%Y-%m',
            '%Y',
        )
        # strptime isn't smart enough to parse literal timezone offsets like
        # '-07:30', so we have to do it ourselves
        parseable_part = w3cdtf_str[:19]
        offset_str = w3cdtf_str[19:]
        dt = None
        for date_format in templates:
            try:
                dt = datetime.strptime(parseable_part, date_format)
            except ValueError:
                continue
            # first layout that parses wins; no need to try the rest
            break
        if dt is None:
            message = "could not parse W3CDTF datetime string '%s'"
            raise ValueError(message % w3cdtf_str)
        # a six-character tail is a numeric offset such as '-08:00'
        if len(offset_str) == 6:
            return cls._offset_dt(dt, offset_str)
        return dt

    def _set_element_datetime(self, prop_name, value):
        """
        Set date/time value of child element having *prop_name* to *value*.
        Raises |ValueError| if *value* is not a |datetime| instance.
        """
        if not isinstance(value, datetime):
            tmpl = (
                "property requires <python-datetime> object, got %s"
            )
            raise ValueError(tmpl % type(value))
        element = self._get_or_add(prop_name)
        element.text = value.strftime('%Y-%m-%dT%H:%M:%SZ')
        if prop_name in ('created', 'modified'):
            # These two require an explicit 'xsi:type="dcterms:W3CDTF"'
            # attribute. The first and last line are a hack required to add
            # the xsi namespace to the root element rather than each child
            # element in which it is referenced
            self.set(qn('xsi:foo'), 'bar')
            element.set(qn('xsi:type'), 'dcterms:W3CDTF')
            del self.attrib[qn('xsi:foo')]

    def _set_element_text(self, prop_name, value):
        """
        Set string value of *prop_name* property to ``str(value)``. Raises
        |ValueError| if that string is longer than 255 characters.
        """
        value = str(value)
        if len(value) > 255:
            tmpl = (
                "exceeded 255 char limit for property, got:\n\n'%s'"
            )
            raise ValueError(tmpl % value)
        element = self._get_or_add(prop_name)
        element.text = value

    def _text_of_element(self, property_name):
        """
        Return the text in the element matching *property_name*, or an empty
        string if the element is not present or contains no text.
        """
        element = getattr(self, property_name)
        if element is None or element.text is None:
            return ''
        return element.text
class DescribeCustomProperties(object):
    """
    Behaviors of |CustomProperties|, the mapping-style accessor for the
    properties stored in ``/docProps/custom.xml``.

    The previous body of this class was a copy of the core-properties suite:
    it built |CoreProperties| objects and never touched the class under test.
    """

    def it_knows_the_value_of_an_existing_property(self, custom_properties):
        assert custom_properties['prov_wasDerivedFrom'] == (
            'fid://slap.G24X2UWc'
        )

    def it_returns_None_for_an_unknown_property(self, custom_properties):
        assert custom_properties['no_such_property'] is None

    def it_can_look_up_a_property_element_by_name(self, custom_properties):
        prop = custom_properties.lookup('prov_wasDerivedFrom')
        assert prop is not None
        assert prop.get('pid') == '2'
        assert custom_properties.lookup('no_such_property') is None

    def it_can_change_the_value_of_an_existing_property(
            self, custom_properties):
        custom_properties['prov_wasDerivedFrom'] = 'fid://slap.changed'
        assert custom_properties['prov_wasDerivedFrom'] == (
            'fid://slap.changed'
        )
        # updating must not add a second element for the same name
        assert len(custom_properties._element) == 1

    def it_can_add_a_new_property(self, custom_properties):
        custom_properties['prov_wasAssociatedWith'] = 'some other value'

        assert custom_properties['prov_wasAssociatedWith'] == (
            'some other value'
        )
        prop = custom_properties.lookup('prov_wasAssociatedWith')
        assert prop.get('pid') == '3'
        assert prop.get('fmtid') == '{D5CDD505-2E9C-101B-9397-08002B2CF9AE}'
        assert len(custom_properties._element) == 2
        # the pre-existing property is left untouched
        assert custom_properties['prov_wasDerivedFrom'] == (
            'fid://slap.G24X2UWc'
        )

    # fixtures -------------------------------------------------------

    @pytest.fixture
    def custom_properties(self):
        # imported here because the module-level imports of this file only
        # provide CoreProperties
        from docx.opc.customprops import CustomProperties

        element = parse_xml(
            b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n'
            b'<Properties xmlns="http://schemas.openxmlformats.org/officeDoc'
            b'ument/2006/custom-properties" xmlns:vt="http://schemas.openxml'
            b'formats.org/officeDocument/2006/docPropsVTypes">\n'
            b'  <property fmtid="{D5CDD505-2E9C-101B-9397-08002B2CF9AE}" pid'
            b'="2" name="prov_wasDerivedFrom">\n'
            b'    <vt:lpwstr>fid://slap.G24X2UWc</vt:lpwstr>\n'
            b'  </property>\n'
            b'</Properties>\n'
        )
        return CustomProperties(element)