From bdb2c6820fb82c7ec9a5c60c18c09cce9ed53ffa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20R=C3=A9mond?= Date: Mon, 30 Jul 2007 17:35:00 +0000 Subject: [PATCH] * src/xml.erl: Better escaping management with CDATA. We only add CDATA enclosure when needed. CDATA end token is properly escaped. SVN Revision: 844 --- ChangeLog | 3 +++ src/xml.erl | 76 +++++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 71 insertions(+), 8 deletions(-) diff --git a/ChangeLog b/ChangeLog index 31f888b52..56e19a51b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,8 @@ 2007-07-30 Mickael Remond + * src/xml.erl: Better escaping management with CDATA. We only add + CDATA enclosure when needed. CDATA end token is properly escaped. + * src/xml.erl: Only wrap xmldata nodes in xml cdata "tag" if bigger than 50 bytes. Shorter xmlcdata nodes will be escaped. diff --git a/src/xml.erl b/src/xml.erl index 2108d6920..1e869be6a 100644 --- a/src/xml.erl +++ b/src/xml.erl @@ -20,9 +20,6 @@ get_path_s/2, replace_tag_attr/3]). -%% XML CDATA bigger than this will be enclosed in CDATA XML "tag" --define(CDATA_BINARY_THRESHOLD, 50). - element_to_string(El) -> case El of {xmlelement, Name, Attrs, Els} -> @@ -35,11 +32,9 @@ element_to_string(El) -> [$<, Name, attrs_to_list(Attrs), $/, $>] end; %% We do not crypt CDATA binary, but we enclose it in XML CDATA - %% if they are long enough to be worth it. - {xmlcdata, CData} when binary(CData), size(CData) > ?CDATA_BINARY_THRESHOLD -> - CDATA1 = <<">, - CDATA2 = <<"]]>">>, - concat_binary([CDATA1, CData, CDATA2]); + {xmlcdata, CData} + when binary(CData) -> + make_text_node(CData); %% We crypt list and short binaries (implies a conversion to %% list). {xmlcdata, CData} -> @@ -63,6 +58,71 @@ crypt(S) when is_list(S) -> end || C <- S]; crypt(S) when is_binary(S) -> crypt(binary_to_list(S)). + +%% Make a cdata_binary depending on what characters it contains +make_text_node(CData) -> + case cdata_need_escape(CData) of + cdata -> + CDATA1 = <<">, + CDATA2 = <<"]]>">>, + concat_binary([CDATA1, CData, CDATA2]); + none -> + CData; + {cdata, EndTokens} -> + EscapedCData = escape_cdata(CData, EndTokens), + concat_binary(EscapedCData) + end. + +%% Returns escape type needed for the text node +%% none, cdata, {cdata, [Positions]} +%% Positions is a list a integer containing positions of CDATA end +%% tokens, so that they can be escaped +cdata_need_escape(CData) -> + cdata_need_escape(CData, 0, false, []). +cdata_need_escape(<<>>, _, false, _) -> + none; +cdata_need_escape(<<>>, _, true, []) -> + cdata; +cdata_need_escape(<<>>, _, true, CDataEndTokens) -> + {cdata, lists:reverse(CDataEndTokens)}; +cdata_need_escape(<<$],$],$>,Rest/binary>>, CurrentPosition, + _XMLEscape, CDataEndTokens) -> + NewPosition = CurrentPosition + 3, + cdata_need_escape(Rest, NewPosition, true, + [CurrentPosition+1|CDataEndTokens]); +%% Only <, & need to be escaped in XML text node +%% See reference: http://www.w3.org/TR/xml11/#syntax +cdata_need_escape(<<$<,Rest/binary>>, CurrentPosition, + _XMLEscape, CDataEndTokens) -> + cdata_need_escape(Rest, CurrentPosition+1, true, CDataEndTokens); +cdata_need_escape(<<$&,Rest/binary>>, CurrentPosition, + _XMLEscape, CDataEndTokens) -> + cdata_need_escape(Rest, CurrentPosition+1, true, CDataEndTokens); +cdata_need_escape(<<_:8,Rest/binary>>, CurrentPosition, + XMLEscape, CDataEndTokens) -> + cdata_need_escape(Rest, CurrentPosition+1, XMLEscape, + CDataEndTokens). + +%% escape cdata that contain CDATA end tokens +%% EndTokens is a list of position of end tokens (integer) +%% This is supposed to be a very rare case: You need to generate several +%% fields, splitting it in the middle of the end token. +%% See example: http://en.wikipedia.org/wiki/CDATA#Uses_of_CDATA_sections +escape_cdata(CData, EndTokens) -> + escape_cdata(CData, 0, EndTokens, []). +escape_cdata(<<>>, _CurrentPosition, [], Acc) -> + lists:reverse(Acc); +escape_cdata(Rest, CurrentPosition, [], Acc) -> + CDATA1 = <<">, + CDATA2 = <<"]]>">>, + escape_cdata(<<>>, CurrentPosition, [], [CDATA2, Rest, CDATA1|Acc]); +escape_cdata(CData, Index, [Pos|Positions], Acc) -> + CDATA1 = <<">, + CDATA2 = <<"]]>">>, + Split = Pos-Index, + {Part, Rest} = split_binary(CData, Split+1), + %% Note: We build the list in reverse to optimize construction + escape_cdata(Rest, Pos+1, Positions, [CDATA2, Part, CDATA1|Acc]). remove_cdata_p({xmlelement, _Name, _Attrs, _Els}) -> true; remove_cdata_p(_) -> false.