diff --git a/doc/users/next_whats_new/type1_subset.rst b/doc/users/next_whats_new/type1_subset.rst new file mode 100644 index 000000000000..b0ab0a4337e6 --- /dev/null +++ b/doc/users/next_whats_new/type1_subset.rst @@ -0,0 +1,9 @@ +PDF files created with usetex now embed subsets of Type 1 fonts +--------------------------------------------------------------- + +When using the PDF backend with the usetex feature, +Matplotlib calls TeX to render the text and formulas in the figure. +The fonts that get used are usually "Type 1" fonts. +They used to be embedded in full +but are now limited to the glyphs that are actually used in the figure. +This reduces the size of the resulting PDF files. diff --git a/lib/matplotlib/_type1font.py b/lib/matplotlib/_type1font.py index b3e08f52c035..8d90f5ab3472 100644 --- a/lib/matplotlib/_type1font.py +++ b/lib/matplotlib/_type1font.py @@ -3,7 +3,7 @@ This version reads pfa and pfb files and splits them for embedding in pdf files. It also supports SlantFont and ExtendFont transformations, -similarly to pdfTeX and friends. There is no support yet for subsetting. +similarly to pdfTeX and friends. Usage:: @@ -11,6 +11,7 @@ clear_part, encrypted_part, finale = font.parts slanted_font = font.transform({'slant': 0.167}) extended_font = font.transform({'extend': 1.2}) + subset_font = font.subset([ord(c) for c in 'Hello World']) Sources: @@ -25,6 +26,7 @@ import binascii import functools +import itertools import logging import re import string @@ -627,8 +629,7 @@ def _parse_subrs(self, tokens, _data): return array, next(tokens).endpos() - @staticmethod - def _parse_charstrings(tokens, _data): + def _parse_charstrings(self, tokens, _data): count_token = next(tokens) if not count_token.is_number(): raise RuntimeError( @@ -650,7 +651,12 @@ def _parse_charstrings(tokens, _data): f"Token following /{glyphname} in CharStrings definition " f"must be a number, was {nbytes_token}" ) - next(tokens) # usually RD or |- + token = next(tokens) + if not token.is_keyword(self._abbr['RD']): + raise RuntimeError( + f"Token preceding charstring must be {self._abbr['RD']}, " + f"was {token}" + ) binary_token = tokens.send(1+nbytes_token.value()) charstrings[glyphname] = binary_token.value() @@ -681,8 +687,7 @@ def _parse_encoding(tokens, _data): continue encoding[index_token.value()] = name_token.value() - @staticmethod - def _parse_othersubrs(tokens, data): + def _parse_othersubrs(self, tokens, data): init_pos = None while True: token = next(tokens) @@ -690,7 +695,7 @@ def _parse_othersubrs(tokens, data): init_pos = token.pos if token.is_delim(): _expression(token, tokens, data) - elif token.is_keyword('def', 'ND', '|-'): + elif token.is_keyword('def', self._abbr['ND']): return data[init_pos:token.endpos()], token.endpos() def transform(self, effects): @@ -745,7 +750,7 @@ def transform(self, effects): fontmatrix = ( f"[{' '.join(_format_approx(x, 6) for x in array)}]" ) - replacements = ( + newparts = self._replace( [(x, f'/FontName/{fontname} def') for x in self._pos['FontName']] + [(x, f'/ItalicAngle {italicangle} def') @@ -755,11 +760,63 @@ def transform(self, effects): + [(x, '') for x in self._pos.get('UniqueID', [])] ) + return Type1Font(( + newparts[0], + self._encrypt(newparts[1], 'eexec'), + self.parts[2] + )) + + def with_encoding(self, encoding): + """ + Change the encoding of the font. + + Parameters + ---------- + encoding : dict + A dictionary mapping character codes to glyph names. + + Returns + ------- + `Type1Font` + """ + newparts = self._replace( + [(x, '') for x in self._pos.get('UniqueID', [])] + + [(self._pos['Encoding'][0], self._postscript_encoding(encoding))] + ) + return Type1Font(( + newparts[0], + self._encrypt(newparts[1], 'eexec'), + self.parts[2] + )) + + def _replace(self, replacements): + """ + Change the font according to `replacements` + + Parameters + ---------- + replacements : list of ((int, int), str) + Each element is ((pos0, pos1), replacement) where pos0 and + pos1 are indices to the original font data (parts[0] and the + decrypted part concatenated). The data in the interval + pos0:pos1 will be replaced by the replacement text. To + accommodate binary data, the replacement is taken to be in + Latin-1 encoding. + + The case where pos0 is inside parts[0] and pos1 inside + the decrypted part is not supported. + + Returns + ------- + (bytes, bytes) + The new parts[0] and decrypted part (which needs to be + encrypted in the transformed font). + """ data = bytearray(self.parts[0]) data.extend(self.decrypted) len0 = len(self.parts[0]) for (pos0, pos1), value in sorted(replacements, reverse=True): - data[pos0:pos1] = value.encode('ascii', 'replace') + data[pos0:pos1] = value.encode('latin-1') if pos0 < len(self.parts[0]): if pos1 >= len(self.parts[0]): raise RuntimeError( @@ -768,13 +825,275 @@ def transform(self, effects): ) len0 += len(value) - pos1 + pos0 - data = bytes(data) - return Type1Font(( - data[:len0], - self._encrypt(data[len0:], 'eexec'), + return bytes(data[:len0]), bytes(data[len0:]) + + def subset(self, characters, name_prefix): + """ + Return a new font that only defines the given characters. + + Parameters + ---------- + characters : sequence of bytes + The subset of characters to include. These are indices into the + font's encoding array. The encoding array of a Type-1 font can + only include 256 characters, but other glyphs may be accessed + via the seac operator. + name_prefix : str + Prefix to prepend to the font name. + + Returns + ------- + `Type1Font` + """ + characters = frozenset(characters) + if _log.isEnabledFor(logging.DEBUG): + _log.debug( + "Subsetting font %s to characters %s = %s", + self.prop['FontName'], + sorted(characters), + [self.prop['Encoding'].get(code) for code in sorted(characters)], + ) + encoding = {code: glyph + for code, glyph in self.prop['Encoding'].items() + if code in characters} + encoding[0] = '.notdef' + # todo and done include strings (glyph names) + todo = set(encoding.values()) + done = set() + seen_subrs = {0, 1, 2, 3} + while todo: + glyph = todo.pop() + called_glyphs, called_subrs = _CharstringSimulator(self).run(glyph) + todo.update(called_glyphs - done) + seen_subrs.update(called_subrs) + done.add(glyph) + + charstrings = self._subset_charstrings(done) + subrs = self._subset_subrs(seen_subrs) + newparts = self._replace( + [(x, f'/FontName /{name_prefix}{self.prop["FontName"]} def') + for x in self._pos['FontName']] + + [(self._pos['CharStrings'][0], charstrings), + (self._pos['Subrs'][0], subrs), + (self._pos['Encoding'][0], self._postscript_encoding(encoding)) + ] + [(x, '') for x in self._pos.get('UniqueID', [])] + ) + return type(self)(( + newparts[0], + self._encrypt(newparts[1], 'eexec'), self.parts[2] )) + @staticmethod + def _charstring_tokens(data): + """Parse a Type-1 charstring + + Yield opcode names and integer parameters. + """ + data = iter(data) + for byte in data: + if 32 <= byte <= 246: + yield byte - 139 + elif 247 <= byte <= 250: + byte2 = next(data) + yield (byte-247) * 256 + byte2 + 108 + elif 251 <= byte <= 254: + byte2 = next(data) + yield -(byte-251)*256 - byte2 - 108 + elif byte == 255: + bs = bytes(itertools.islice(data, 4)) + yield struct.unpack('>i', bs)[0] + elif byte == 12: + byte1 = next(data) + yield { + 0: 'dotsection', + 1: 'vstem3', + 2: 'hstem3', + 6: 'seac', + 7: 'sbw', + 12: 'div', + 16: 'callothersubr', + 17: 'pop', + 33: 'setcurrentpoint' + }[byte1] + else: + yield { + 1: 'hstem', + 3: 'vstem', + 4: 'vmoveto', + 5: 'rlineto', + 6: 'hlineto', + 7: 'vlineto', + 8: 'rrcurveto', + 9: 'closepath', + 10: 'callsubr', + 11: 'return', + 13: 'hsbw', + 14: 'endchar', + 21: 'rmoveto', + 22: 'hmoveto', + 30: 'vhcurveto', + 31: 'hvcurveto' + }[byte] + + def _postscript_encoding(self, encoding): + """Return a PostScript encoding array for the encoding.""" + return '\n'.join([ + '/Encoding 256 array\n0 1 255 { 1 index exch /.notdef put} for', + *( + f'dup {i} /{glyph} put' + for i, glyph in sorted(encoding.items()) + if glyph != '.notdef' + ), + 'readonly def\n', + ]) + + def _subset_charstrings(self, glyphs): + """Return a PostScript CharStrings array for the glyphs.""" + charstrings = self.prop['CharStrings'] + lenIV = self.prop.get('lenIV', 4) + ordered = sorted(glyphs) + encrypted = [ + self._encrypt(charstrings[glyph], 'charstring', lenIV).decode('latin-1') + for glyph in ordered + ] + RD, ND = self._abbr['RD'], self._abbr['ND'] + return '\n'.join([ + f'/CharStrings {len(ordered)} dict dup begin', + *( + f'/{glyph} {len(enc)} {RD} {enc} {ND}' + for glyph, enc in zip(ordered, encrypted) + ), + 'end\n', + ]) + + def _subset_subrs(self, indices): + """Return a PostScript Subrs array for the subroutines.""" + # we can't remove subroutines, we just replace unused ones with a stub + subrs = self.prop['Subrs'] + n_subrs = len(subrs) + lenIV = self.prop.get('lenIV', 4) + stub = self._encrypt(b'\x0b', 'charstring', lenIV).decode('latin-1') + encrypted = [ + self._encrypt(subrs[i], 'charstring', lenIV).decode('latin-1') + if i in indices + else stub + for i in range(n_subrs) + ] + RD, ND, NP = self._abbr['RD'], self._abbr['ND'], self._abbr['NP'] + return '\n'.join([ + f'/Subrs {n_subrs} array', + *( + f'dup {i} {len(enc)} {RD} {enc} {NP}' + for i, enc in enumerate(encrypted) + ), + ]) + + +class _CharstringSimulator: + __slots__ = ('font', 'buildchar_stack', 'postscript_stack', 'glyphs', 'subrs') + + def __init__(self, font): + self.font = font + self.buildchar_stack = [] + self.postscript_stack = [] + self.glyphs = set() + self.subrs = set() + + def run(self, glyph_or_subr): + """Run the charstring interpreter on a glyph or subroutine. + + This does not actually execute the code but simulates it to find out + which subroutines get called when executing the glyph or subroutine. + + Parameters + ---------- + glyph_or_subr : str or int + The name of the glyph or the index of the subroutine to simulate. + + Returns + ------- + glyphs : set[str] + The set of glyph names called by the glyph or subroutine. + subrs : set[int] + The set of subroutines called by the glyph or subroutine. + """ + if isinstance(glyph_or_subr, str): + program = self.font.prop['CharStrings'][glyph_or_subr] + self.glyphs.add(glyph_or_subr) + else: + program = self.font.prop['Subrs'][glyph_or_subr] + self.subrs.add(glyph_or_subr) + for opcode in self.font._charstring_tokens(program): + if opcode in ('return', 'endchar'): + return self.glyphs, self.subrs + self._step(opcode) + else: + font_name = self.font.prop.get('FontName', '(unknown)') + _log.info( + f"Glyph or subr {glyph_or_subr} in font {font_name} does not end " + "with return or endchar" + ) + return self.glyphs, self.subrs + + def _step(self, opcode): + """Run one step in the charstring interpreter.""" + match opcode: + case _ if isinstance(opcode, int): + self.buildchar_stack.append(opcode) + case ( + 'hsbw' | 'sbw' | 'closepath' | 'hlineto' | 'hmoveto' | 'hcurveto' | + 'hvcurveto' | 'rlineto' | 'rmoveto' | 'rrcurveto' | 'vhcurveto' | + 'vlineto' | 'vmoveto' | 'dotsection' | 'hstem' | 'hstem3' | + 'vstem' | 'vstem3' | 'setcurrentpoint' + ): + self.buildchar_stack.clear() + case 'seac': # Standard Encoding Accented Character + codes = self.buildchar_stack[3:5] + self.glyphs.update(_StandardEncoding[int(x)] for x in codes) + self.buildchar_stack.clear() + case 'div': + num1, num2 = self.buildchar_stack[-2:] + if num2 == 0: + _log.warning( + f"Division by zero in font {self.font.prop['FontName']}" + ) + self.buildchar_stack[-2:] = [0] + else: + self.buildchar_stack[-2:] = [num1/num2] + case 'callothersubr': + n, othersubr = self.buildchar_stack[-2:] + if not isinstance(n, int): + _log.warning( + f"callothersubr {othersubr} with non-integer argument " + f"count in font {self.font.prop['FontName']}" + ) + n = int(n) + args = self.buildchar_stack[-2-n:-2] + if othersubr == 3: + self.postscript_stack.append(args[0]) + else: + self.postscript_stack.extend(args[::-1]) + self.buildchar_stack[-2-n:] = [] + case 'callsubr': + subr = self.buildchar_stack.pop() + if not isinstance(subr, int): + _log.warning( + f"callsubr with non-integer argument {subr} in font " + f"{self.font.prop['FontName']}" + ) + subr = int(subr) + self.run(subr) + case 'pop': + if not self.postscript_stack: + _log.warning( + f"pop with empty stack in font {self.font.prop['FontName']}" + ) + self.postscript_stack.append(0) + self.buildchar_stack.append(self.postscript_stack.pop()) + case _: + raise RuntimeError(f'opcode {opcode}') + _StandardEncoding = { **{ord(letter): letter for letter in string.ascii_letters}, diff --git a/lib/matplotlib/backends/backend_pdf.py b/lib/matplotlib/backends/backend_pdf.py index eb9d217c932c..1ecd390f132b 100644 --- a/lib/matplotlib/backends/backend_pdf.py +++ b/lib/matplotlib/backends/backend_pdf.py @@ -808,7 +808,14 @@ def newTextnote(self, text, positionRect=[-100, -100, 0, 0]): } self.pageAnnotations.append(theNote) - def _get_subsetted_psname(self, ps_name, charmap): + @staticmethod + def _get_subset_prefix(charset): + """ + Get a prefix for a subsetted font name. + + The prefix is six uppercase letters followed by a plus sign; + see PDF reference section 5.5.3 Font Subsets. + """ def toStr(n, base): if n < base: return string.ascii_uppercase[n] @@ -818,11 +825,15 @@ def toStr(n, base): ) # encode to string using base 26 - hashed = hash(frozenset(charmap.keys())) % ((sys.maxsize + 1) * 2) + hashed = hash(charset) % ((sys.maxsize + 1) * 2) prefix = toStr(hashed, 26) # get first 6 characters from prefix - return prefix[:6] + "+" + ps_name + return prefix[:6] + "+" + + @staticmethod + def _get_subsetted_psname(ps_name, charmap): + return PdfFile._get_subset_prefix(frozenset(charmap.keys())) + ps_name def finalize(self): """Write out the various deferred objects and the pdf end matter.""" @@ -994,39 +1005,29 @@ def _embedTeXFont(self, fontinfo): _log.debug('Embedding TeX font %s - fontinfo=%s', fontinfo.dvifont.texname, fontinfo.__dict__) - # Widths - widthsObject = self.reserveObject('font widths') - tfm = fontinfo.dvifont._tfm - # convert from TeX's 12.20 representation to 1/1000 text space units. - widths = [(1000 * metrics.tex_width) >> 20 - if (metrics := tfm.get_metrics(char)) else 0 - for char in range(max(tfm._glyph_metrics, default=-1) + 1)] - self.writeObject(widthsObject, widths) - # Font dictionary fontdictObject = self.reserveObject('font dictionary') fontdict = { 'Type': Name('Font'), 'Subtype': Name('Type1'), - 'FirstChar': 0, - 'LastChar': len(widths) - 1, - 'Widths': widthsObject, - } - - # Encoding (if needed) - if fontinfo.encodingfile is not None: - fontdict['Encoding'] = { - 'Type': Name('Encoding'), - 'Differences': [ - 0, *map(Name, dviread._parse_enc(fontinfo.encodingfile))], - } + } # We have a font file to embed - read it in and apply any effects t1font = _type1font.Type1Font(fontinfo.fontfile) + if fontinfo.encodingfile is not None: + t1font = t1font.with_encoding( + {i: c for i, c in enumerate(dviread._parse_enc(fontinfo.encodingfile))} + ) + if fontinfo.effects: t1font = t1font.transform(fontinfo.effects) + chars = frozenset(self._character_tracker.used[fontinfo.dvifont.fname]) + t1font = t1font.subset(chars, self._get_subset_prefix(chars)) fontdict['BaseFont'] = Name(t1font.prop['FontName']) - + encoding = t1font.prop['Encoding'] + fc = fontdict['FirstChar'] = min(encoding.keys(), default=0) + lc = fontdict['LastChar'] = max(encoding.keys(), default=255) + fontdict['Encoding'] = self._generate_encoding(encoding) # Font descriptors may be shared between differently encoded # Type-1 fonts, so only create a new descriptor if there is no # existing descriptor for this font. @@ -1038,9 +1039,32 @@ def _embedTeXFont(self, fontinfo): self._type1Descriptors[(fontinfo.fontfile, effects)] = fontdesc fontdict['FontDescriptor'] = fontdesc + # Use TeX Font Metrics file to get glyph widths (TeX uses its 12.20 fixed point + # representation and we want 1/1000 text space units) + tfm = fontinfo.dvifont._tfm + widths = [(1000 * metrics.tex_width) >> 20 + if (metrics := tfm.get_metrics(char)) else 0 + for char in range(fc, lc + 1)] + fontdict['Widths'] = widthsObject = self.reserveObject('glyph widths') self.writeObject(fontdictObject, fontdict) + self.writeObject(widthsObject, widths) return fontdictObject + + def _generate_encoding(self, encoding): + prev = -2 + result = [] + for code, name in sorted(encoding.items()): + if code != prev + 1: + result.append(code) + prev = code + result.append(Name(name)) + return { + 'Type': Name('Encoding'), + 'Differences': result + } + + def createType1Descriptor(self, t1font, fontfile): # Create and write the font descriptor and the font file # of a Type-1 font @@ -1078,6 +1102,14 @@ def createType1Descriptor(self, t1font, fontfile): ft2font = get_font(fontfile) + encoding = t1font.prop['Encoding'] + charset = ''.join( + sorted( + f'/{c}' for c in encoding.values() + if c != '.notdef' + ) + ) + descriptor = { 'Type': Name('FontDescriptor'), 'FontName': Name(t1font.prop['FontName']), @@ -1091,6 +1123,7 @@ def createType1Descriptor(self, t1font, fontfile): 'FontFile': fontfileObject, 'FontFamily': t1font.prop['FamilyName'], 'StemV': 50, # TODO + 'CharSet': charset, # (see also revision 3874; but not all TeX distros have AFM files!) # 'FontWeight': a number where 400 = Regular, 700 = Bold } @@ -2268,6 +2301,7 @@ def draw_tex(self, gc, x, y, s, prop, angle, *, mtext=None): seq += [['font', pdfname, dvifont.size]] oldfont = dvifont seq += [['text', x1, y1, [bytes([glyph])], x1+width]] + self.file._character_tracker.track(dvifont, chr(glyph)) # Find consecutive text strings with constant y coordinate and # combine into a sequence of strings and kerns, or just one diff --git a/lib/matplotlib/dviread.py b/lib/matplotlib/dviread.py index a588979f5fad..9e8b6a5facf5 100644 --- a/lib/matplotlib/dviread.py +++ b/lib/matplotlib/dviread.py @@ -17,17 +17,17 @@ ... """ -from collections import namedtuple import dataclasses import enum -from functools import cache, lru_cache, partial, wraps import logging import os -from pathlib import Path import re import struct import subprocess import sys +from collections import namedtuple +from functools import cache, lru_cache, partial, wraps +from pathlib import Path import numpy as np @@ -583,6 +583,9 @@ class DviFont: Attributes ---------- texname : bytes + fname : str + Compatibility shim so that DviFont can be used with + ``_backend_pdf_ps.CharacterTracker``; not a real filename. size : float Size of the font in Adobe points, converted from the slightly smaller TeX points. @@ -602,6 +605,18 @@ def __init__(self, scale, tfm, texname, vf): (1000 * self._tfm.width.get(char, 0)) >> 20 for char in range(max(self._tfm.width, default=-1) + 1)])) + @property + def fname(self): + """A fake filename""" + return self.texname.decode('latin-1') + + def _get_fontmap(self, string): + """Get the mapping from characters to the font that includes them. + + Each value maps to self; there is no fallback mechanism for DviFont. + """ + return {char: self for char in string} + def __eq__(self, other): return (type(self) is type(other) and self.texname == other.texname and self.size == other.size) @@ -1161,8 +1176,8 @@ def _fontfile(cls, suffix, texname): if __name__ == '__main__': - from argparse import ArgumentParser import itertools + from argparse import ArgumentParser import fontTools.agl diff --git a/lib/matplotlib/dviread.pyi b/lib/matplotlib/dviread.pyi index 41799c083218..12a9215b5308 100644 --- a/lib/matplotlib/dviread.pyi +++ b/lib/matplotlib/dviread.pyi @@ -66,6 +66,8 @@ class DviFont: def __ne__(self, other: object) -> bool: ... @property def widths(self) -> list[int]: ... + @property + def fname(self) -> str: ... class Vf(Dvi): def __init__(self, filename: str | os.PathLike) -> None: ... diff --git a/lib/matplotlib/tests/baseline_images/test_backend_pdf/font-bitstream-charter.pdf b/lib/matplotlib/tests/baseline_images/test_backend_pdf/font-bitstream-charter.pdf new file mode 100644 index 000000000000..c8f9411fb3d9 Binary files /dev/null and b/lib/matplotlib/tests/baseline_images/test_backend_pdf/font-bitstream-charter.pdf differ diff --git a/lib/matplotlib/tests/baseline_images/test_backend_pdf/font-dejavusans.pdf b/lib/matplotlib/tests/baseline_images/test_backend_pdf/font-dejavusans.pdf new file mode 100644 index 000000000000..fd907dee6687 Binary files /dev/null and b/lib/matplotlib/tests/baseline_images/test_backend_pdf/font-dejavusans.pdf differ diff --git a/lib/matplotlib/tests/baseline_images/test_backend_pdf/font-heuristica.pdf b/lib/matplotlib/tests/baseline_images/test_backend_pdf/font-heuristica.pdf new file mode 100644 index 000000000000..ca9b38d09b89 Binary files /dev/null and b/lib/matplotlib/tests/baseline_images/test_backend_pdf/font-heuristica.pdf differ diff --git a/lib/matplotlib/tests/test_backend_pdf.py b/lib/matplotlib/tests/test_backend_pdf.py index 60169a38c972..079ef16128bb 100644 --- a/lib/matplotlib/tests/test_backend_pdf.py +++ b/lib/matplotlib/tests/test_backend_pdf.py @@ -16,7 +16,7 @@ from matplotlib.backends._backend_pdf_ps import get_glyphs_subset, font_as_file from matplotlib.backends.backend_pdf import PdfPages from matplotlib.patches import Rectangle -from matplotlib.testing import _gen_multi_font_text +from matplotlib.testing import _gen_multi_font_text, _has_tex_package from matplotlib.testing.decorators import check_figures_equal, image_comparison from matplotlib.testing._markers import needs_usetex @@ -428,3 +428,53 @@ def test_truetype_conversion(recwarn): font=Path(__file__).with_name("mpltest.ttf"), fontsize=80) ax.set_xticks([]) ax.set_yticks([]) + + +@pytest.mark.skipif(not _has_tex_package("heuristica"), + reason="LaTeX lacks heuristica package") +@image_comparison(["font-heuristica.pdf"]) +def test_font_heuristica(): + # Heuristica uses the callothersubr operator for some glyphs + mpl.rcParams['text.latex.preamble'] = '\n'.join(( + r'\usepackage{heuristica}', + r'\usepackage[T1]{fontenc}', + r'\usepackage[utf8]{inputenc}' + )) + fig, ax = plt.subplots() + ax.text(0.1, 0.1, r"BHTem fi ffl 1234", usetex=True, fontsize=50) + ax.set_xticks([]) + ax.set_yticks([]) + + +@pytest.mark.skipif(not _has_tex_package("DejaVuSans"), + reason="LaTeX lacks DejaVuSans package") +@image_comparison(["font-dejavusans.pdf"]) +def test_font_dejavusans(): + # DejaVuSans uses the seac operator to compose characters with diacritics + mpl.rcParams['text.latex.preamble'] = '\n'.join(( + r'\usepackage{DejaVuSans}', + r'\usepackage[T1]{fontenc}', + r'\usepackage[utf8]{inputenc}' + )) + + fig, ax = plt.subplots() + ax.text(0.1, 0.1, r"\textsf{ñäö ABCDabcd}", usetex=True, fontsize=50) + ax.text(0.1, 0.3, r"\textsf{fi ffl 1234}", usetex=True, fontsize=50) + ax.set_xticks([]) + ax.set_yticks([]) + + +@pytest.mark.skipif(not _has_tex_package("charter"), + reason="LaTeX lacks charter package") +@image_comparison(["font-bitstream-charter.pdf"]) +def test_font_bitstream_charter(): + mpl.rcParams['text.latex.preamble'] = '\n'.join(( + r'\usepackage{charter}', + r'\usepackage[T1]{fontenc}', + r'\usepackage[utf8]{inputenc}' + )) + fig, ax = plt.subplots() + ax.text(0.1, 0.1, r"åüš ABCDabcd", usetex=True, fontsize=50) + ax.text(0.1, 0.3, r"fi ffl 1234", usetex=True, fontsize=50) + ax.set_xticks([]) + ax.set_yticks([]) diff --git a/lib/matplotlib/tests/test_usetex.py b/lib/matplotlib/tests/test_usetex.py index c7658c4f42ac..62588b00e7f5 100644 --- a/lib/matplotlib/tests/test_usetex.py +++ b/lib/matplotlib/tests/test_usetex.py @@ -1,3 +1,4 @@ +import re from tempfile import TemporaryFile import numpy as np @@ -156,6 +157,69 @@ def test_missing_psfont(fmt, monkeypatch): fig.savefig(tmpfile, format=fmt) +def test_pdf_type1_font_subsetting(): + """Test that fonts in PDF output are properly subset.""" + pikepdf = pytest.importorskip("pikepdf") + + mpl.rcParams["text.usetex"] = True + mpl.rcParams["text.latex.preamble"] = r"\usepackage{amssymb}" + fig, ax = plt.subplots() + ax.text(0.2, 0.7, r"$\int_{-\infty}^{\aleph}\sqrt{\alpha\beta\gamma}\mathrm{d}x$") + ax.text(0.2, 0.5, r"$\mathfrak{x}\circledcirc\mathfrak{y}\in\mathbb{R}$") + + with TemporaryFile() as tmpfile: + fig.savefig(tmpfile, format="pdf") + tmpfile.seek(0) + pdf = pikepdf.Pdf.open(tmpfile) + + length = {} + page = pdf.pages[0] + for font_name, font in page.Resources.Font.items(): + assert font.Subtype == "/Type1", ( + f"Font {font_name}={font} is not a Type 1 font" + ) + + # Subsetted font names have a 6-character tag followed by a '+' + base_font = str(font["/BaseFont"]).removeprefix("/") + assert re.match(r"^[A-Z]{6}\+", base_font), ( + f"Font {font_name}={base_font} lacks a subset indicator tag" + ) + assert "/FontFile" in font.FontDescriptor, ( + f"Type 1 font {font_name}={base_font} is not embedded" + ) + _, original_name = base_font.split("+", 1) + length[original_name] = len(bytes(font["/FontDescriptor"]["/FontFile"])) + + print("Embedded font stream lengths:", length) + # We should have several fonts, each much smaller than the original. + # I get under 10kB on my system for each font, but allow 15kB in case + # of differences in the font files. + assert { + 'CMEX10', + 'CMMI12', + 'CMR12', + 'CMSY10', + 'CMSY8', + 'EUFM10', + 'MSAM10', + 'MSBM10', + }.issubset(length), "Missing expected fonts in the PDF" + for font_name, length in length.items(): + assert length < 15_000, ( + f"Font {font_name}={length} is larger than expected" + ) + + # For comparison, lengths without subsetting on my system: + # 'CMEX10': 29686 + # 'CMMI12': 36176 + # 'CMR12': 32157 + # 'CMSY10': 32004 + # 'CMSY8': 32061 + # 'EUFM10': 20546 + # 'MSAM10': 31199 + # 'MSBM10': 34129 + + try: _old_gs_version = mpl._get_executable_info('gs').version < parse_version('9.55') except mpl.ExecutableNotFoundError: