Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit f6861ad

Browse filesBrowse files
committed
Type-1 subsetting
With this I can produce smaller pdf files with usetex in some small tests, but this obviously needs more extensive testing, thus marking as draft. Give dviread.DviFont a fake filename attribute for character tracking. On top of #20715. Closes #127.
1 parent e98bb83 commit f6861ad
Copy full SHA for f6861ad

File tree

3 files changed

+283
-10
lines changed
Filter options

3 files changed

+283
-10
lines changed

‎lib/matplotlib/backends/backend_pdf.py

Copy file name to clipboardExpand all lines: lib/matplotlib/backends/backend_pdf.py
+3Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -981,6 +981,8 @@ def _embedTeXFont(self, fontinfo):
981981
t1font = type1font.Type1Font(fontinfo.fontfile)
982982
if fontinfo.effects:
983983
t1font = t1font.transform(fontinfo.effects)
984+
chars = self._character_tracker.used[fontinfo.dvifont.fname]
985+
t1font = t1font.subset(chars)
984986
fontdict['BaseFont'] = Name(t1font.prop['FontName'])
985987

986988
# Font descriptors may be shared between differently encoded
@@ -2255,6 +2257,7 @@ def draw_tex(self, gc, x, y, s, prop, angle, *, mtext=None):
22552257
seq += [['font', pdfname, dvifont.size]]
22562258
oldfont = dvifont
22572259
seq += [['text', x1, y1, [bytes([glyph])], x1+width]]
2260+
self.file._character_tracker.track(dvifont, chr(glyph))
22582261

22592262
# Find consecutive text strings with constant y coordinate and
22602263
# combine into a sequence of strings and kerns, or just one

‎lib/matplotlib/dviread.py

Copy file name to clipboardExpand all lines: lib/matplotlib/dviread.py
+8Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -546,6 +546,9 @@ class DviFont:
546546
Attributes
547547
----------
548548
texname : bytes
549+
fname : str
550+
Compatibility shim so that DviFont can be used with
551+
``_backend_pdf_ps.CharacterTracker``; not a real filename.
549552
size : float
550553
Size of the font in Adobe points, converted from the slightly
551554
smaller TeX points.
@@ -570,6 +573,11 @@ def __init__(self, scale, tfm, texname, vf):
570573
self.widths = [(1000*tfm.width.get(char, 0)) >> 20
571574
for char in range(nchars)]
572575

576+
@property
def fname(self):
    """
    Stand-in "filename" for this font; not a path to a real file.

    This is the TeX font name (``texname``) decoded from bytes to str
    using Latin-1.
    """
    return str(self.texname, 'latin-1')
580+
573581
def __eq__(self, other):
574582
return (type(self) == type(other)
575583
and self.texname == other.texname and self.size == other.size)

‎lib/matplotlib/type1font.py

Copy file name to clipboardExpand all lines: lib/matplotlib/type1font.py
+272-10Lines changed: 272 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,10 @@
2121
v1.1, 1993. ISBN 0-201-57044-0.
2222
"""
2323

24+
import base64
2425
import binascii
2526
import functools
27+
import itertools
2628
import logging
2729
import re
2830
import string
@@ -36,6 +38,35 @@
3638
_log = logging.getLogger(__name__)
3739

3840

41+
def _make_tag(set):
42+
"""
43+
Hash set into a six-character tag made of uppercase letters
44+
45+
Useful for adding a tag into subsetted fonts while keeping the code
46+
reproducible. The function always returns the same value for the
47+
same set on the same exact Python version but is not guaranteed to
48+
not have collisions.
49+
50+
Parameters
51+
----------
52+
set : iterable
53+
The set of glyphs present in a font subset
54+
55+
Returns
56+
-------
57+
str
58+
Six uppercase ASCII letters and a plus sign
59+
"""
60+
61+
# freeze the set to make it hashable, interpret the hash as bytes
62+
array = struct.pack("@q", hash(frozenset(set)))
63+
# turn the bytes into characters with b32encode, which uses uppercase
64+
# letters and numbers from 2 to 7 - remap those arbitrarily
65+
trans = str.maketrans('234567', 'MTPLIB', '=')
66+
return (base64.b32encode(array).decode('ascii')
67+
.translate(trans)[:6] + '+')
68+
69+
3970
class _Token:
4071
"""
4172
A token in a PostScript stream
@@ -627,8 +658,7 @@ def _parse_subrs(self, tokens, _data):
627658

628659
return array, next(tokens).endpos()
629660

630-
@staticmethod
631-
def _parse_charstrings(tokens, _data):
661+
def _parse_charstrings(self, tokens, _data):
632662
count_token = next(tokens)
633663
if not count_token.is_number():
634664
raise RuntimeError(
@@ -650,7 +680,12 @@ def _parse_charstrings(tokens, _data):
650680
f"Token following /{glyphname} in CharStrings definition "
651681
f"must be a number, was {nbytes_token}"
652682
)
653-
next(tokens) # usually RD or |-
683+
token = next(tokens)
684+
if not token.is_keyword(self._abbr['RD']):
685+
raise RuntimeError(
686+
"Token preceding charstring must be {self._abbr['RD']}, "
687+
f"was {token}"
688+
)
654689
binary_token = tokens.send(1+nbytes_token.value())
655690
charstrings[glyphname] = binary_token.value()
656691

@@ -681,16 +716,15 @@ def _parse_encoding(tokens, _data):
681716
continue
682717
encoding[index_token.value()] = name_token.value()
683718

684-
@staticmethod
685-
def _parse_othersubrs(tokens, data):
719+
def _parse_othersubrs(self, tokens, data):
686720
init_pos = None
687721
while True:
688722
token = next(tokens)
689723
if init_pos is None:
690724
init_pos = token.pos
691725
if token.is_delim():
692726
_expression(token, tokens, data)
693-
elif token.is_keyword('def', 'ND', '|-'):
727+
elif token.is_keyword('def', self._abbr['ND']):
694728
return data[init_pos:token.endpos()], token.endpos()
695729

696730
def transform(self, effects):
@@ -745,7 +779,7 @@ def transform(self, effects):
745779
fontmatrix = (
746780
'[%s]' % ' '.join(_format_approx(x, 6) for x in array)
747781
)
748-
replacements = (
782+
newparts = self._replace(
749783
[(x, '/FontName/%s def' % fontname)
750784
for x in self._pos['FontName']]
751785
+ [(x, '/ItalicAngle %a def' % italicangle)
@@ -755,11 +789,40 @@ def transform(self, effects):
755789
+ [(x, '') for x in self._pos.get('UniqueID', [])]
756790
)
757791

792+
return Type1Font((
793+
newparts[0],
794+
self._encrypt(newparts[1], 'eexec'),
795+
self.parts[2]
796+
))
797+
798+
def _replace(self, replacements):
799+
"""
800+
Change the font according to `replacements`
801+
802+
Parameters
803+
----------
804+
replacements : list of ((int, int), str)
805+
Each element is ((pos0, pos1), replacement) where pos0 and
806+
pos1 are indices to the original font data (parts[0] and the
807+
decrypted part concatenated). The data in the interval
808+
pos0:pos1 will be replaced by the replacement text. To
809+
accommodate binary data, the replacement is taken to be in
810+
Latin-1 encoding.
811+
812+
The case where pos0 is inside parts[0] and pos1 inside
813+
the decrypted part is not supported.
814+
815+
Returns
816+
-------
817+
(bytes, bytes)
818+
The new parts[0] and decrypted part (which needs to be
819+
encrypted in the transformed font).
820+
"""
758821
data = bytearray(self.parts[0])
759822
data.extend(self.decrypted)
760823
len0 = len(self.parts[0])
761824
for (pos0, pos1), value in sorted(replacements, reverse=True):
762-
data[pos0:pos1] = value.encode('ascii', 'replace')
825+
data[pos0:pos1] = value.encode('latin-1')
763826
if pos0 < len(self.parts[0]):
764827
if pos1 >= len(self.parts[0]):
765828
raise RuntimeError(
@@ -769,12 +832,211 @@ def transform(self, effects):
769832
len0 += len(value) - pos1 + pos0
770833

771834
data = bytes(data)
835+
return data[:len0], data[len0:]
836+
837+
def subset(self, characters):
    """
    Return a new font that only defines the given characters.

    Parameters
    ----------
    characters : iterable
        The character codes to keep. They are matched against the keys
        of ``self.prop['Encoding']``; code 0 is always mapped to
        ``.notdef`` in the subsetted encoding.

    Returns
    -------
    `Type1Font`
    """

    characters = set(characters)
    encoding = {code: glyph
                for code, glyph in self.prop['Encoding'].items()
                if code in characters}
    encoding[0] = '.notdef'

    # `glyphs` holds every glyph name known to be needed; `pending` holds
    # those whose charstrings have not been simulated yet. A worklist
    # avoids recomputing a set difference on every iteration.
    glyphs = set(encoding.values())
    pending = set(glyphs)
    # NOTE(review): subroutines 0-3 are always kept - presumably because
    # they are the conventional flex/hint-replacement entries; confirm.
    seen_subrs = {0, 1, 2, 3}
    while pending:
        glyph = pending.pop()
        called_glyphs, called_subrs, _, _ = self._simulate(glyph, [], [])
        pending.update(
            name for name in called_glyphs if name not in glyphs)
        glyphs.update(called_glyphs)
        seen_subrs.update(called_subrs)

    fontname = _make_tag(glyphs) + self.prop['FontName']
    charstrings = self._subset_charstrings(glyphs)
    subrs = self._subset_subrs(seen_subrs)
    newparts = self._replace(
        [(x, '/FontName/%s def' % fontname)
         for x in self._pos['FontName']]
        + [(self._pos['CharStrings'][0], charstrings),
           (self._pos['Subrs'][0], subrs),
           (self._pos['Encoding'][0], self._subset_encoding(encoding))
           ] + [(x, '') for x in self._pos.get('UniqueID', [])]
    )
    # Only the cleartext and the (re-encrypted) private part change; the
    # trailer in parts[2] is reused as is.
    return Type1Font((
        newparts[0],
        self._encrypt(newparts[1], 'eexec'),
        self.parts[2]
    ))
777883

884+
@staticmethod
885+
def _charstring_tokens(data):
886+
data = iter(data)
887+
for byte in data:
888+
if 32 <= byte <= 246:
889+
yield byte - 139
890+
elif 247 <= byte <= 250:
891+
byte2 = next(data)
892+
yield (byte-247) * 256 + byte2 + 108
893+
elif 251 <= byte <= 254:
894+
byte2 = next(data)
895+
yield -(byte-251)*256 - byte2 - 108
896+
elif byte == 255:
897+
bs = itertools.islice(data, 4)
898+
yield struct.unpack('>i', bs)[0]
899+
elif byte == 12:
900+
byte1 = next(data)
901+
yield {
902+
0: 'dotsection',
903+
1: 'vstem3',
904+
2: 'hstem3',
905+
6: 'seac',
906+
7: 'sbw',
907+
12: 'div',
908+
16: 'callothersubr',
909+
17: 'pop',
910+
33: 'setcurrentpoint'
911+
}[byte1]
912+
else:
913+
yield {
914+
1: 'hstem',
915+
3: 'vstem',
916+
4: 'vmoveto',
917+
5: 'rlineto',
918+
6: 'hlineto',
919+
7: 'vlineto',
920+
8: 'rrcurveto',
921+
9: 'closepath',
922+
10: 'callsubr',
923+
11: 'return',
924+
13: 'hsbw',
925+
14: 'endchar',
926+
21: 'rmoveto',
927+
22: 'hmoveto',
928+
30: 'vhcurveto',
929+
31: 'hvcurveto'
930+
}[byte]
931+
932+
def _step(self, buildchar_stack, postscript_stack, opcode):
933+
if isinstance(opcode, int):
934+
return set(), set(), buildchar_stack + [opcode], postscript_stack
935+
elif opcode in {
936+
'hsbw', 'sbw', 'closepath', 'hlineto', 'hmoveto', 'hcurveto',
937+
'hvcurveto', 'rlineto', 'rmoveto', 'rrcurveto', 'vhcurveto',
938+
'vlineto', 'vmoveto', 'dotsection', 'hstem', 'hstem3', 'vstem',
939+
'vstem3', 'setcurrentpoint'
940+
}:
941+
return set(), set(), [], postscript_stack
942+
elif opcode == 'seac':
943+
codes = buildchar_stack[3:5]
944+
glyphs = [self.prop['Encoding'][x] for x in codes]
945+
return set(glyphs), set(), [], postscript_stack
946+
elif opcode == 'div':
947+
num1, num2 = buildchar_stack[-2:]
948+
return (
949+
set(),
950+
set(),
951+
buildchar_stack[-2:] + [num1/num2], postscript_stack
952+
)
953+
elif opcode == 'callothersubr':
954+
othersubr = buildchar_stack[-1]
955+
n = buildchar_stack[-2]
956+
args = buildchar_stack[-2-n:-2]
957+
if othersubr == 3: # Section 8.1 in Type-1 spec
958+
postscript_stack.append(args[0])
959+
else:
960+
postscript_stack.extend(args[::-1])
961+
return set(), set(), buildchar_stack[:-n-2], postscript_stack
962+
elif opcode == 'callsubr':
963+
subr = buildchar_stack[-1]
964+
glyphs, subrs, new_bc_stack, new_ps_stack = \
965+
self._simulate(subr, buildchar_stack[:-1], postscript_stack)
966+
return set(), subrs | {subr}, new_bc_stack, new_ps_stack
967+
elif opcode == 'pop':
968+
return (
969+
set(),
970+
set(),
971+
buildchar_stack + [postscript_stack[-1]], postscript_stack[:-1]
972+
)
973+
else:
974+
raise RuntimeError(f'opcode {opcode}')
975+
976+
def _simulate(self, glyph_or_subr, buildchar_stack, postscript_stack):
977+
if isinstance(glyph_or_subr, str):
978+
program = self.prop['CharStrings'][glyph_or_subr]
979+
glyphs = {glyph_or_subr}
980+
subrs = set()
981+
else:
982+
program = self.prop['Subrs'][glyph_or_subr]
983+
glyphs = set()
984+
subrs = {glyph_or_subr}
985+
for opcode in self._charstring_tokens(program):
986+
if opcode in ('return', 'endchar'):
987+
return glyphs, subrs, buildchar_stack, postscript_stack
988+
newglyphs, newsubrs, buildchar_stack, postscript_stack = \
989+
self._step(buildchar_stack, postscript_stack, opcode)
990+
glyphs.update(newglyphs)
991+
subrs.update(newsubrs)
992+
993+
def _subset_encoding(self, encoding):
994+
result = [
995+
'/Encoding 256 array\n0 1 255 { 1 index exch /.notdef put } for'
996+
]
997+
result.extend(
998+
f'dup {i} /{glyph} put'
999+
for i, glyph in sorted(encoding.items())
1000+
if glyph != '.notdef'
1001+
)
1002+
result.extend('readonly def\n')
1003+
return '\n'.join(result)
1004+
1005+
def _subset_charstrings(self, glyphs):
1006+
result = [f'/CharStrings {len(glyphs)} dict dup begin']
1007+
encrypted = [self._encrypt(self.prop['CharStrings'][glyph],
1008+
'charstring',
1009+
self.prop.get('lenIV', 4)
1010+
).decode('latin-1')
1011+
for glyph in glyphs]
1012+
RD, ND = self._abbr['RD'], self._abbr['ND']
1013+
result.extend(
1014+
f'/{glyph} {len(enc)} {RD} {enc} {ND}'
1015+
for glyph, enc in zip(glyphs, encrypted)
1016+
)
1017+
result.append('end\n')
1018+
return '\n'.join(result)
1019+
1020+
def _subset_subrs(self, indices):
1021+
# we can't remove subroutines, we just replace unused ones with a stub
1022+
n_subrs = len(self.prop['Subrs'])
1023+
result = [f'/Subrs {n_subrs} array']
1024+
lenIV = self.prop.get('lenIV', 4)
1025+
stub = self._encrypt(b'\x0b', 'charstring', lenIV).decode('latin-1')
1026+
encrypted = [
1027+
self._encrypt(self.prop['Subrs'][i], 'charstring', lenIV
1028+
).decode('latin-1')
1029+
if i in indices else stub
1030+
for i in range(n_subrs)
1031+
]
1032+
RD, ND, NP = self._abbr['RD'], self._abbr['ND'], self._abbr['NP']
1033+
result.extend(
1034+
f'dup {i} {len(enc)} {RD} {enc} {NP}'
1035+
for i, enc in enumerate(encrypted)
1036+
)
1037+
result.extend((ND, ''))
1038+
return '\n'.join(result)
1039+
7781040

7791041
_StandardEncoding = {
7801042
**{ord(letter): letter for letter in string.ascii_letters},

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.