* src/xml.erl: Better escaping management with CDATA. We only add CDATA enclosure when needed. CDATA end token is properly escaped.

SVN Revision: 844
This commit is contained in:
Mickaël Rémond 2007-07-30 17:35:00 +00:00
parent f6095694b1
commit bdb2c6820f
2 changed files with 71 additions and 8 deletions

View File

@ -1,5 +1,8 @@
2007-07-30 Mickael Remond <mickael.remond@process-one.net> 2007-07-30 Mickael Remond <mickael.remond@process-one.net>
* src/xml.erl: Better escaping management with CDATA. We only add
CDATA enclosure when needed. CDATA end token is properly escaped.
* src/xml.erl: Only wrap xmldata nodes in xml cdata "tag" if * src/xml.erl: Only wrap xmldata nodes in xml cdata "tag" if
bigger than 50 bytes. Shorter xmlcdata nodes will be escaped. bigger than 50 bytes. Shorter xmlcdata nodes will be escaped.

View File

@ -20,9 +20,6 @@
get_path_s/2, get_path_s/2,
replace_tag_attr/3]). replace_tag_attr/3]).
%% XML CDATA bigger than this will be enclosed in CDATA XML "tag"
-define(CDATA_BINARY_THRESHOLD, 50).
element_to_string(El) -> element_to_string(El) ->
case El of case El of
{xmlelement, Name, Attrs, Els} -> {xmlelement, Name, Attrs, Els} ->
@ -35,11 +32,9 @@ element_to_string(El) ->
[$<, Name, attrs_to_list(Attrs), $/, $>] [$<, Name, attrs_to_list(Attrs), $/, $>]
end; end;
%% We do not crypt CDATA binary, but we enclose it in XML CDATA %% We do not crypt CDATA binary, but we enclose it in XML CDATA
%% if they are long enough to be worth it. {xmlcdata, CData}
{xmlcdata, CData} when binary(CData), size(CData) > ?CDATA_BINARY_THRESHOLD -> when binary(CData) ->
CDATA1 = <<"<![CDATA[">>, make_text_node(CData);
CDATA2 = <<"]]>">>,
concat_binary([CDATA1, CData, CDATA2]);
%% We crypt list and short binaries (implies a conversion to %% We crypt list and short binaries (implies a conversion to
%% list). %% list).
{xmlcdata, CData} -> {xmlcdata, CData} ->
@ -63,6 +58,71 @@ crypt(S) when is_list(S) ->
end || C <- S]; end || C <- S];
crypt(S) when is_binary(S) -> crypt(S) when is_binary(S) ->
crypt(binary_to_list(S)). crypt(binary_to_list(S)).
%% Serialize a binary text node, choosing the encoding from its content
%% as classified by cdata_need_escape/1:
%%   none               -> the binary is returned unchanged
%%   cdata              -> the binary is wrapped in one <![CDATA[ ... ]]>
%%   {cdata, EndTokens} -> the binary is passed to escape_cdata/2 together
%%                         with the reported end-token positions, and the
%%                         resulting pieces are joined
%% Always returns a binary.
%% list_to_binary/1 is used to join the pieces instead of the legacy
%% concat_binary/1 BIF, which is not available in current Erlang/OTP
%% releases; for a list of binaries both produce the same result.
make_text_node(CData) ->
    case cdata_need_escape(CData) of
        none ->
            CData;
        cdata ->
            list_to_binary([<<"<![CDATA[">>, CData, <<"]]>">>]);
        {cdata, EndTokens} ->
            list_to_binary(escape_cdata(CData, EndTokens))
    end.
%% Classify the escaping a binary text node requires. Returns one of:
%%   none               - neither '<', '&' nor "]]>" occurs in the binary
%%   cdata              - '<' or '&' occurs, but "]]>" does not
%%   {cdata, Positions} - "]]>" occurs at least once; Positions is a list
%%                        of integers in ascending order, one per "]]>",
%%                        each being the 0-based offset of the second ']'
%%                        of that end token
%% Only '<' and '&' need to be escaped in an XML text node.
%% See reference: http://www.w3.org/TR/xml11/#syntax
cdata_need_escape(CData) ->
    cdata_need_escape(CData, 0, false, []).

%% Arguments: remaining bytes, offset of the next byte, whether escaping
%% is needed so far, end-token positions found so far (newest first).
cdata_need_escape(<<>>, _Offset, false, _Tokens) ->
    none;
cdata_need_escape(<<>>, _Offset, true, []) ->
    cdata;
cdata_need_escape(<<>>, _Offset, true, Tokens) ->
    {cdata, lists:reverse(Tokens)};
cdata_need_escape(<<"]]>", Tail/binary>>, Offset, _Needed, Tokens) ->
    %% Skip the whole end token, remembering where its second ']' sits.
    cdata_need_escape(Tail, Offset + 3, true, [Offset + 1 | Tokens]);
cdata_need_escape(<<Char, Tail/binary>>, Offset, _Needed, Tokens)
  when Char =:= $<; Char =:= $& ->
    cdata_need_escape(Tail, Offset + 1, true, Tokens);
cdata_need_escape(<<_Char, Tail/binary>>, Offset, Needed, Tokens) ->
    cdata_need_escape(Tail, Offset + 1, Needed, Tokens).
%% Wrap a binary containing CDATA end tokens ("]]>") in several CDATA
%% sections, cutting it inside each end token so that no section
%% contains a complete "]]>".
%% EndTokens is a list of integer positions, one per end token; each
%% chunk is cut just after the byte at that position (0-based, relative
%% to the start of CData).
%% Returns a list of binaries (section openers, chunks and closers, in
%% output order); the caller is expected to join them.
%% This is supposed to be a very rare case.
%% See example: http://en.wikipedia.org/wiki/CDATA#Uses_of_CDATA_sections
escape_cdata(CData, EndTokens) ->
    escape_cdata(CData, 0, EndTokens, []).

%% Arguments: remaining bytes, offset of the first remaining byte,
%% remaining cut positions, pieces emitted so far (in reverse order).
escape_cdata(<<>>, _Offset, [], Pieces) ->
    lists:reverse(Pieces);
escape_cdata(Tail, _Offset, [], Pieces) ->
    %% No cut left: whatever remains becomes the final section.
    lists:reverse([<<"]]>">>, Tail, <<"<![CDATA[">> | Pieces]);
escape_cdata(Bin, Offset, [Token | Tokens], Pieces) ->
    %% Keep every byte up to and including position Token in this section.
    {Head, Tail} = split_binary(Bin, Token - Offset + 1),
    %% Pieces are pushed in reverse and flipped once at the end.
    escape_cdata(Tail, Token + 1, Tokens,
                 [<<"]]>">>, Head, <<"<![CDATA[">> | Pieces]).
remove_cdata_p({xmlelement, _Name, _Attrs, _Els}) -> true; remove_cdata_p({xmlelement, _Name, _Attrs, _Els}) -> true;
remove_cdata_p(_) -> false. remove_cdata_p(_) -> false.