-
-
Notifications
You must be signed in to change notification settings - Fork 1.9k
improve pseudo-html entity conversion #2932
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,7 +15,6 @@ var d3 = require('d3'); | |
|
||
var Lib = require('../lib'); | ||
var xmlnsNamespaces = require('../constants/xmlns_namespaces'); | ||
var stringMappings = require('../constants/string_mappings'); | ||
var LINE_SPACING = require('../constants/alignment').LINE_SPACING; | ||
|
||
// text converter | ||
|
@@ -223,13 +222,6 @@ var PROTOCOLS = ['http:', 'https:', 'mailto:', '', undefined, ':']; | |
|
||
var STRIP_TAGS = new RegExp('</?(' + Object.keys(TAG_STYLES).join('|') + ')( [^>]*)?/?>', 'g'); | ||
|
||
var ENTITY_TO_UNICODE = Object.keys(stringMappings.entityToUnicode).map(function(k) { | ||
return { | ||
regExp: new RegExp('&' + k + ';', 'g'), | ||
sub: stringMappings.entityToUnicode[k] | ||
}; | ||
}); | ||
|
||
var NEWLINES = /(\r\n?|\n)/g; | ||
|
||
var SPLIT_TAGS = /(<[^<>]*>)/; | ||
|
@@ -254,6 +246,14 @@ var BR_TAG = /<br(\s+.*)?>/i; | |
* | ||
* Because we hack in other attributes with style (sub & sup), drop any trailing | ||
* semicolon in user-supplied styles so we can consistently append the tag-dependent style | ||
* | ||
* These are for tag attributes; Chrome anyway will convert entities in | ||
* attribute values, but not in attribute names | ||
* you can test this by for example: | ||
* > p = document.createElement('p') | ||
* > p.innerHTML = '<span style="font-color:red;">Hi</span>' | ||
* > p.innerHTML | ||
* <- '<span style="font-color:red;">Hi</span>' | ||
*/ | ||
var STYLEMATCH = /(^|[\s"'])style\s*=\s*("([^"]*);?"|'([^']*);?')/i; | ||
var HREFMATCH = /(^|[\s"'])href\s*=\s*("([^"]*)"|'([^']*)')/i; | ||
|
@@ -265,7 +265,8 @@ var POPUPMATCH = /(^|[\s"'])popup\s*=\s*("([\w=,]*)"|'([\w=,]*)')/i; | |
function getQuotedMatch(_str, re) { | ||
if(!_str) return null; | ||
var match = _str.match(re); | ||
return match && (match[3] || match[4]); | ||
var result = match && (match[3] || match[4]); | ||
return result && convertEntities(result); | ||
} | ||
|
||
var COLORMATCH = /(^|;)\s*color:/; | ||
|
@@ -276,19 +277,70 @@ exports.plainText = function(_str) { | |
return (_str || '').replace(STRIP_TAGS, ' '); | ||
}; | ||
|
||
function replaceFromMapObject(_str, list) { | ||
if(!_str) return ''; | ||
/* | ||
* N.B. HTML entities are listed without the leading '&' and trailing ';' | ||
* https://www.freeformatter.com/html-entities.html | ||
* | ||
* FWIW if we wanted to support the full set, it has 2261 entries: | ||
* https://www.w3.org/TR/html5/entities.json | ||
* though I notice that some of these are duplicates and/or are missing ";" | ||
* eg: "&", "&", "&", and "&" all map to "&" | ||
* We no longer need to include numeric entities here, these are now handled | ||
* by String.fromCodePoint/fromCharCode | ||
* | ||
* Anyway the only ones that are really important to allow are the HTML special | ||
* chars <, >, and &, because these ones can trigger special processing if not | ||
* replaced by the corresponding entity. | ||
*/ | ||
var entityToUnicode = { | ||
mu: 'μ', | ||
amp: '&', | ||
lt: '<', | ||
gt: '>', | ||
nbsp: ' ', | ||
times: '×', | ||
plusmn: '±', | ||
deg: '°' | ||
}; | ||
|
||
for(var i = 0; i < list.length; i++) { | ||
var item = list[i]; | ||
_str = _str.replace(item.regExp, item.sub); | ||
} | ||
// NOTE: in general entities can contain uppercase too (so [a-zA-Z]) but all the | ||
// ones we support use only lowercase. If we ever change that, update the regex. | ||
var ENTITY_MATCH = /&(#\d+|#x[\da-fA-F]+|[a-z]+);/g; | ||
function convertEntities(_str) { | ||
return _str.replace(ENTITY_MATCH, function(fullMatch, innerMatch) { | ||
var outChar; | ||
if(innerMatch.charAt(0) === '#') { | ||
// cannot use String.fromCodePoint in IE | ||
outChar = fromCodePoint( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Brilliant ✨ |
||
innerMatch.charAt(1) === 'x' ? | ||
parseInt(innerMatch.substr(2), 16) : | ||
parseInt(innerMatch.substr(1), 10) | ||
); | ||
} | ||
else outChar = entityToUnicode[innerMatch]; | ||
|
||
return _str; | ||
// as in regular HTML, if we didn't decode the entity just | ||
// leave the raw text in place. | ||
return outChar || fullMatch; | ||
}); | ||
} | ||
|
||
function convertEntities(_str) { | ||
return replaceFromMapObject(_str, ENTITY_TO_UNICODE); | ||
exports.convertEntities = convertEntities; | ||
|
||
function fromCodePoint(code) { | ||
// Don't allow overflow. In Chrome this turns into � but I feel like it's | ||
// more useful to just not convert it at all. | ||
if(code > 0x10FFFF) return; | ||
var stringFromCodePoint = String.fromCodePoint; | ||
if(stringFromCodePoint) return stringFromCodePoint(code); | ||
|
||
// IE doesn't have String.fromCodePoint | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To be fair, not just IE doesn't support There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fair enough - though we don't support such old versions of Chrome and FF, do we? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
We used to have a blob on https://plot.ly/javascript/ (that I can't find right now) saying plotly.js is supported in all SVG-enabled browsers. But yeah, the fraction of users still using Chrome41 and FF29 must be tiny. |
||
// see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/fromCodePoint | ||
var stringFromCharCode = String.fromCharCode; | ||
if(code <= 0xFFFF) return stringFromCharCode(code); | ||
return stringFromCharCode( | ||
(code >> 10) + 0xD7C0, | ||
(code % 0x400) + 0xDC00 | ||
); | ||
} | ||
|
||
/* | ||
|
@@ -302,15 +354,14 @@ function convertEntities(_str) { | |
* somewhat differently if it does, so just keep track of this when it happens. | ||
*/ | ||
function buildSVGText(containerNode, str) { | ||
str = convertEntities(str) | ||
/* | ||
* Normalize behavior between IE and others wrt newlines and whitespace:pre | ||
* this combination makes IE barf https://github.com/plotly/plotly.js/issues/746 | ||
* Chrome and FF display \n, \r, or \r\n as a space in this mode. | ||
* I feel like at some point we turned these into <br> but currently we don't so | ||
* I'm just going to cement what we do now in Chrome and FF | ||
*/ | ||
.replace(NEWLINES, ' '); | ||
/* | ||
* Normalize behavior between IE and others wrt newlines and whitespace:pre | ||
* this combination makes IE barf https://github.com/plotly/plotly.js/issues/746 | ||
* Chrome and FF display \n, \r, or \r\n as a space in this mode. | ||
* I feel like at some point we turned these into <br> but currently we don't so | ||
* I'm just going to cement what we do now in Chrome and FF | ||
*/ | ||
str = str.replace(NEWLINES, ' '); | ||
|
||
var hasLink = false; | ||
|
||
|
@@ -435,7 +486,7 @@ function buildSVGText(containerNode, str) { | |
newLine(); | ||
} | ||
else if(tagStyle === undefined) { | ||
addTextNode(currentNode, parti); | ||
addTextNode(currentNode, convertEntities(parti)); | ||
} | ||
else { | ||
// tag - open or close | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,6 +8,16 @@ describe('svg+text utils', function() { | |
|
||
describe('convertToTspans', function() { | ||
|
||
var stringFromCodePoint; | ||
|
||
beforeAll(function() { | ||
stringFromCodePoint = String.fromCodePoint; | ||
}); | ||
|
||
afterEach(function() { | ||
String.fromCodePoint = stringFromCodePoint; | ||
}); | ||
|
||
function mockTextSVGElement(txt) { | ||
return d3.select('body') | ||
.append('svg') | ||
|
@@ -300,16 +310,78 @@ describe('svg+text utils', function() { | |
'100 × 20 ± 0.5 °' | ||
); | ||
|
||
expect(node.text()).toEqual('100μ & < 10 > 0 100 × 20 ± 0.5 °'); | ||
expect(node.text()).toBe('100μ & < 10 > 0 100 × 20 ± 0.5 °'); | ||
}); | ||
|
||
it('decodes some HTML entities in text (number case)', function() { | ||
var node = mockTextSVGElement( | ||
'100μ  < 10 > 0  ' + | ||
'100μ & < 10 > 0  ' + | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, the ampersand is not There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oops. My bad. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. oh haha I assumed I did that 🙈 |
||
'100 × 20 ± 0.5 °' | ||
); | ||
|
||
expect(node.text()).toEqual('100μ & < 10 > 0 100 × 20 ± 0.5 °'); | ||
expect(node.text()).toBe('100μ & < 10 > 0 100 × 20 ± 0.5 °'); | ||
}); | ||
|
||
it('decodes arbitrary decimal and hex number entities', function() { | ||
var i = 0; | ||
for(var n = 33; n < 0x10FFFF; n = Math.round(n * 1.03)) { | ||
var node = mockTextSVGElement( | ||
'&#x' + n.toString(16) + | ||
'; = &#' + n.toString() + | ||
'; = &#x' + n.toString(16).toUpperCase() + ';' | ||
); | ||
var char = String.fromCodePoint(n); | ||
expect(node.text()).toBe(char + ' = ' + char + ' = ' + char, n); | ||
i++; | ||
} | ||
// not really necessary to assert this, but we tested 355 characters, | ||
// weighted toward the low end but continuing all the way to the | ||
// end of the unicode definition | ||
expect(i).toBe(355); | ||
}); | ||
|
||
it('decodes arbitrary decimal and hex number entities (IE case)', function() { | ||
// IE does not have String.fromCodePoint | ||
String.fromCodePoint = undefined; | ||
expect(String.fromCodePoint).toBeUndefined(); | ||
|
||
var i = 0; | ||
for(var n = 33; n < 0x10FFFF; n = Math.round(n * 1.03)) { | ||
var node = mockTextSVGElement( | ||
'&#x' + n.toString(16) + | ||
'; = &#' + n.toString() + | ||
'; = &#x' + n.toString(16).toUpperCase() + ';' | ||
); | ||
var char = stringFromCodePoint(n); | ||
expect(node.text()).toBe(char + ' = ' + char + ' = ' + char, n); | ||
i++; | ||
} | ||
// not really necessary to assert this, but we tested 355 characters, | ||
// weighted toward the low end but continuing all the way to the | ||
// end of the unicode definition | ||
expect(i).toBe(355); | ||
}); | ||
|
||
it('does not decode entities prematurely', function() { | ||
var testCases = [ | ||
'<b>not bold</b>', | ||
'<b>not bold</b>', | ||
'<b>not bold</b>', | ||
'<b>not bold</b>', | ||
'<b>not bold</b>' | ||
]; | ||
testCases.forEach(function(testCase) { | ||
var node = mockTextSVGElement(testCase); | ||
|
||
expect(node.html()).toBe( | ||
'<b>not bold</b>', testCase | ||
); | ||
}); | ||
|
||
var controlNode = mockTextSVGElement('<b>bold</b>'); | ||
expect(controlNode.html()).toBe( | ||
'<tspan style="font-weight:bold">bold</tspan>' | ||
); | ||
}); | ||
|
||
it('supports superscript by itself', function() { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Very nice here.
Now
string_mapping
is only required insvg_text_utils.js
. Would you mind moving theentityToUnicode
lookup table tosvg_text_utils.js
and 🔪string_mapping.js
?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
good call -> 🔪
string_mapping.js
in a943db2