Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

improve pseudo-html entity conversion #2932

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Sep 4, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 0 additions & 41 deletions 41 src/constants/string_mappings.js

This file was deleted.

24 changes: 1 addition & 23 deletions 24 src/lib/html2unicode.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
'use strict';

var toSuperScript = require('superscript-text');
var stringMappings = require('../constants/string_mappings');
var fixEntities = require('./svg_text_utils').convertEntities;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Very nice here.

Now string_mapping is only required in svg_text_utils.js. Would you mind moving the entityToUnicode lookup table to svg_text_utils.js and 🔪 string_mapping.js?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good call -> 🔪 string_mapping.js in a943db2


function fixSuperScript(x) {
var idx = 0;
Expand All @@ -33,28 +33,6 @@ function stripTags(x) {
return x.replace(/\<.*\>/g, '');
}

function fixEntities(x) {
var entityToUnicode = stringMappings.entityToUnicode;
var idx = 0;

while((idx = x.indexOf('&', idx)) >= 0) {
var nidx = x.indexOf(';', idx);
if(nidx < idx) {
idx += 1;
continue;
}

var entity = entityToUnicode[x.slice(idx + 1, nidx)];
if(entity) {
x = x.slice(0, idx) + entity + x.slice(nidx + 1);
} else {
x = x.slice(0, idx) + x.slice(nidx + 1);
}
}

return x;
}

function convertHTMLToUnicode(html) {
return '' +
fixEntities(
Expand Down
109 changes: 80 additions & 29 deletions 109 src/lib/svg_text_utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ var d3 = require('d3');

var Lib = require('../lib');
var xmlnsNamespaces = require('../constants/xmlns_namespaces');
var stringMappings = require('../constants/string_mappings');
var LINE_SPACING = require('../constants/alignment').LINE_SPACING;

// text converter
Expand Down Expand Up @@ -223,13 +222,6 @@ var PROTOCOLS = ['http:', 'https:', 'mailto:', '', undefined, ':'];

var STRIP_TAGS = new RegExp('</?(' + Object.keys(TAG_STYLES).join('|') + ')( [^>]*)?/?>', 'g');

var ENTITY_TO_UNICODE = Object.keys(stringMappings.entityToUnicode).map(function(k) {
return {
regExp: new RegExp('&' + k + ';', 'g'),
sub: stringMappings.entityToUnicode[k]
};
});

var NEWLINES = /(\r\n?|\n)/g;

var SPLIT_TAGS = /(<[^<>]*>)/;
Expand All @@ -254,6 +246,14 @@ var BR_TAG = /<br(\s+.*)?>/i;
*
* Because we hack in other attributes with style (sub & sup), drop any trailing
* semicolon in user-supplied styles so we can consistently append the tag-dependent style
*
* These are for tag attributes; Chrome anyway will convert entities in
* attribute values, but not in attribute names
* you can test this by for example:
* > p = document.createElement('p')
* > p.innerHTML = '<span styl&#x65;="font-color:r&#x65;d;">Hi</span>'
* > p.innerHTML
* <- '<span styl&#x65;="font-color:red;">Hi</span>'
*/
var STYLEMATCH = /(^|[\s"'])style\s*=\s*("([^"]*);?"|'([^']*);?')/i;
var HREFMATCH = /(^|[\s"'])href\s*=\s*("([^"]*)"|'([^']*)')/i;
Expand All @@ -265,7 +265,8 @@ var POPUPMATCH = /(^|[\s"'])popup\s*=\s*("([\w=,]*)"|'([\w=,]*)')/i;
function getQuotedMatch(_str, re) {
if(!_str) return null;
var match = _str.match(re);
return match && (match[3] || match[4]);
var result = match && (match[3] || match[4]);
return result && convertEntities(result);
}

var COLORMATCH = /(^|;)\s*color:/;
Expand All @@ -276,19 +277,70 @@ exports.plainText = function(_str) {
return (_str || '').replace(STRIP_TAGS, ' ');
};

function replaceFromMapObject(_str, list) {
if(!_str) return '';
/*
* N.B. HTML entities are listed without the leading '&' and trailing ';'
* https://www.freeformatter.com/html-entities.html
*
* FWIW if we wanted to support the full set, it has 2261 entries:
* https://www.w3.org/TR/html5/entities.json
* though I notice that some of these are duplicates and/or are missing ";"
* eg: "&amp;", "&amp", "&AMP;", and "&AMP" all map to "&"
* We no longer need to include numeric entities here, these are now handled
* by String.fromCodePoint/fromCharCode
*
* Anyway the only ones that are really important to allow are the HTML special
* chars <, >, and &, because these ones can trigger special processing if not
* replaced by the corresponding entity.
*/
var entityToUnicode = {
mu: 'μ',
amp: '&',
lt: '<',
gt: '>',
nbsp: ' ',
times: '×',
plusmn: '±',
deg: '°'
};

for(var i = 0; i < list.length; i++) {
var item = list[i];
_str = _str.replace(item.regExp, item.sub);
}
// NOTE: in general entities can contain uppercase too (so [a-zA-Z]) but all the
// ones we support use only lowercase. If we ever change that, update the regex.
var ENTITY_MATCH = /&(#\d+|#x[\da-fA-F]+|[a-z]+);/g;
function convertEntities(_str) {
return _str.replace(ENTITY_MATCH, function(fullMatch, innerMatch) {
var outChar;
if(innerMatch.charAt(0) === '#') {
// cannot use String.fromCodePoint in IE
outChar = fromCodePoint(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Support all numbered entities, decimal and hex - including the correct ampersand code!

Brilliant ✨

innerMatch.charAt(1) === 'x' ?
parseInt(innerMatch.substr(2), 16) :
parseInt(innerMatch.substr(1), 10)
);
}
else outChar = entityToUnicode[innerMatch];

return _str;
// as in regular HTML, if we didn't decode the entity just
// leave the raw text in place.
return outChar || fullMatch;
});
}

function convertEntities(_str) {
return replaceFromMapObject(_str, ENTITY_TO_UNICODE);
exports.convertEntities = convertEntities;

function fromCodePoint(code) {
// Don't allow overflow. In Chrome this turns into � but I feel like it's
// more useful to just not convert it at all.
if(code > 0x10FFFF) return;
var stringFromCodePoint = String.fromCodePoint;
if(stringFromCodePoint) return stringFromCodePoint(code);

// IE doesn't have String.fromCodePoint
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To be fair, not just IE doesn't support fromCodePoint, 2015 Chrome and 2014 FF didn't have it either.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fair enough - though we don't support such old versions of Chrome and FF, do we?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we?

We used to have a blob on https://plot.ly/javascript/ (that I can't find right now) saying plotly.js is supported in all SVG-enabled browsers. But yeah, the fraction of users still using Chrome41 and FF29 must be tiny.

// see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/fromCodePoint
var stringFromCharCode = String.fromCharCode;
if(code <= 0xFFFF) return stringFromCharCode(code);
return stringFromCharCode(
(code >> 10) + 0xD7C0,
(code % 0x400) + 0xDC00
);
}

/*
Expand All @@ -302,15 +354,14 @@ function convertEntities(_str) {
* somewhat differently if it does, so just keep track of this when it happens.
*/
function buildSVGText(containerNode, str) {
str = convertEntities(str)
/*
* Normalize behavior between IE and others wrt newlines and whitespace:pre
* this combination makes IE barf https://github.com/plotly/plotly.js/issues/746
* Chrome and FF display \n, \r, or \r\n as a space in this mode.
* I feel like at some point we turned these into <br> but currently we don't so
* I'm just going to cement what we do now in Chrome and FF
*/
.replace(NEWLINES, ' ');
/*
* Normalize behavior between IE and others wrt newlines and whitespace:pre
* this combination makes IE barf https://github.com/plotly/plotly.js/issues/746
* Chrome and FF display \n, \r, or \r\n as a space in this mode.
* I feel like at some point we turned these into <br> but currently we don't so
* I'm just going to cement what we do now in Chrome and FF
*/
str = str.replace(NEWLINES, ' ');

var hasLink = false;

Expand Down Expand Up @@ -435,7 +486,7 @@ function buildSVGText(containerNode, str) {
newLine();
}
else if(tagStyle === undefined) {
addTextNode(currentNode, parti);
addTextNode(currentNode, convertEntities(parti));
}
else {
// tag - open or close
Expand Down
78 changes: 75 additions & 3 deletions 78 test/jasmine/tests/svg_text_utils_test.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,16 @@ describe('svg+text utils', function() {

describe('convertToTspans', function() {

var stringFromCodePoint;

beforeAll(function() {
stringFromCodePoint = String.fromCodePoint;
});

afterEach(function() {
String.fromCodePoint = stringFromCodePoint;
});

function mockTextSVGElement(txt) {
return d3.select('body')
.append('svg')
Expand Down Expand Up @@ -300,16 +310,78 @@ describe('svg+text utils', function() {
'100 &times; 20 &plusmn; 0.5 &deg;'
);

expect(node.text()).toEqual('100μ & < 10 > 0  100 × 20 ± 0.5 °');
expect(node.text()).toBe('100μ & < 10 > 0  100 × 20 ± 0.5 °');
});

it('decodes some HTML entities in text (number case)', function() {
var node = mockTextSVGElement(
'100&#956; &#28; &#60; 10 &#62; 0 &#160;' +
'100&#956; &#38; &#60; 10 &#62; 0 &#160;' +
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, the ampersand is not &#28;...

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oops. My bad.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh haha I assumed I did that 🙈

'100 &#215; 20 &#177; 0.5 &#176;'
);

expect(node.text()).toEqual('100μ & < 10 > 0  100 × 20 ± 0.5 °');
expect(node.text()).toBe('100μ & < 10 > 0  100 × 20 ± 0.5 °');
});

it('decodes arbitrary decimal and hex number entities', function() {
var i = 0;
for(var n = 33; n < 0x10FFFF; n = Math.round(n * 1.03)) {
var node = mockTextSVGElement(
'&#x' + n.toString(16) +
'; = &#' + n.toString() +
'; = &#x' + n.toString(16).toUpperCase() + ';'
);
var char = String.fromCodePoint(n);
expect(node.text()).toBe(char + ' = ' + char + ' = ' + char, n);
i++;
}
// not really necessary to assert this, but we tested 355 characters,
// weighted toward the low end but continuing all the way to the
// end of the unicode definition
expect(i).toBe(355);
});

it('decodes arbitrary decimal and hex number entities (IE case)', function() {
// IE does not have String.fromCodePoint
String.fromCodePoint = undefined;
expect(String.fromCodePoint).toBeUndefined();

var i = 0;
for(var n = 33; n < 0x10FFFF; n = Math.round(n * 1.03)) {
var node = mockTextSVGElement(
'&#x' + n.toString(16) +
'; = &#' + n.toString() +
'; = &#x' + n.toString(16).toUpperCase() + ';'
);
var char = stringFromCodePoint(n);
expect(node.text()).toBe(char + ' = ' + char + ' = ' + char, n);
i++;
}
// not really necessary to assert this, but we tested 355 characters,
// weighted toward the low end but continuing all the way to the
// end of the unicode definition
expect(i).toBe(355);
});

it('does not decode entities prematurely', function() {
var testCases = [
'&lt;b>not bold</b&gt;',
'<b&gt;not bold</b&gt;',
'&lt;b>not bold&lt;/b>',
'<b&gt;not bold&lt;/b>',
'&lt;b&gt;not bold&lt;/b&gt;'
];
testCases.forEach(function(testCase) {
var node = mockTextSVGElement(testCase);

expect(node.html()).toBe(
'&lt;b&gt;not bold&lt;/b&gt;', testCase
);
});

var controlNode = mockTextSVGElement('<b>bold</b>');
expect(controlNode.html()).toBe(
'<tspan style="font-weight:bold">bold</tspan>'
);
});

it('supports superscript by itself', function() {
Expand Down
Morty Proxy This is a proxified and sanitized view of the page, visit original site.