Is there any way to do HTML decode in SQL Server?
The previous version does not work with Japanese, Korean, and similar text. Here is a fixed version:
GO
-- Remove any existing dbo.fn_HTMLDecode so the CREATE FUNCTION below can run again.
IF OBJECT_ID('dbo.fn_HTMLDecode') IS NOT NULL
    DROP FUNCTION dbo.fn_HTMLDecode;
GO
/*
  dbo.fn_HTMLDecode - decode HTML character references in a Unicode string.

  Parameters
    @vcWhat                  text to decode; NULL in gives NULL out.
    @toDecodeMainISOSymbols  1 = decode the common symbol entities
                             (&amp; &quot; &lt; &gt; quotes, dashes, ...).
    @toDecodeISOChars        1 = decode the accented-letter entities
                             (&Agrave; ... &szlig;).

  Returns
    The decoded text as NVARCHAR(MAX). NUL characters are stripped and the
    input is trimmed before decoding.

  Notes
    * Decimal references (&#N; with 1 to 7 digits) are always decoded.
      Hexadecimal references (&#xHHHH;) are not handled.
    * Numeric references are decoded before named ones and '&amp;' is decoded
      before the other named entities, so multiply-encoded input such as
      '&amp;lt;' collapses all the way to '<'.
    * Save this script in a Unicode encoding: the replacement characters are
      written as N'...' literals.
*/
CREATE FUNCTION dbo.fn_HTMLDecode(
    @vcWhat NVARCHAR(MAX)
    ,@toDecodeMainISOSymbols bit = 1
    ,@toDecodeISOChars bit = 1
)
RETURNS NVARCHAR(MAX)
AS
BEGIN
    DECLARE @vcResult NVARCHAR(MAX);
    DECLARE @siPos INT, @vcEncoded NVARCHAR(10), @siChar INT;
    DECLARE @vcDecoded NVARCHAR(2), @vcPattern NVARCHAR(50), @siDigits INT;

    -- Strip NUL characters (binary collation so REPLACE really sees them), then trim.
    SET @vcResult = RTRIM(LTRIM(CAST(REPLACE(@vcWhat COLLATE Latin1_General_BIN, CHAR(0), '') AS NVARCHAR(MAX))));

    -- Non-breaking spaces become ordinary spaces before the generic numeric pass.
    SET @vcResult = REPLACE(REPLACE(@vcResult, N'&#160;', N' '), N'&nbsp;', N' ');

    IF @vcResult = '' RETURN @vcResult;

    -- Decimal character references, longest first: &#ddddddd; down to &#d;
    SET @siDigits = 7;
    WHILE @siDigits >= 1
    BEGIN
        SET @vcPattern = N'%&#' + REPLICATE(N'[0-9]', @siDigits) + N';%';

        -- Binary collation: [0-9] must match ASCII digits only, otherwise the CAST
        -- below can fail. PATINDEX and SUBSTRING use the same collation so that
        -- both count positions the same way.
        SET @siPos = PATINDEX(@vcPattern, @vcResult COLLATE Latin1_General_BIN);
        WHILE @siPos > 0
        BEGIN
            -- '&#' + digits + ';'
            SET @vcEncoded = SUBSTRING(@vcResult COLLATE Latin1_General_BIN, @siPos, @siDigits + 3);
            SET @siChar = CAST(SUBSTRING(@vcEncoded, 3, @siDigits) AS INT);

            -- NCHAR() yields NULL for values it cannot represent (above 65535 unless
            -- the database collation is supplementary-character aware). A NULL
            -- replacement would turn the whole result into NULL, so fall back to a
            -- UTF-16 surrogate pair, and to U+FFFD for values beyond U+10FFFF.
            SET @vcDecoded = NCHAR(@siChar);
            IF @vcDecoded IS NULL AND @siChar BETWEEN 65536 AND 1114111
                SET @vcDecoded = NCHAR(55296 + (@siChar - 65536) / 1024)
                               + NCHAR(56320 + (@siChar - 65536) % 1024);
            IF @vcDecoded IS NULL
                SET @vcDecoded = NCHAR(65533);

            SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, @vcEncoded, @vcDecoded);
            SET @siPos = PATINDEX(@vcPattern, @vcResult COLLATE Latin1_General_BIN);
        END

        SET @siDigits = @siDigits - 1;
    END

    IF @toDecodeMainISOSymbols = 1
    BEGIN
        -- Literal non-breaking space characters (possibly produced above) become spaces.
        SET @vcResult = REPLACE(@vcResult, NCHAR(160), N' ');

        -- Core markup and punctuation entities, matched with the database collation.
        SET @vcResult =
            REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(
            REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(
                @vcResult
                , N'&amp;', N'&')
                , N'&quot;', N'"')
                , N'&lt;', N'<')
                , N'&gt;', N'>')
                , N'&amp;amp;', N'&')
                , N'&ldquo;', N'“')
                , N'&rdquo;', N'”')
                , N'&bdquo;', N'„')
                , N'&ndash;', N'–')
                , N'&mdash;', N'—');

        SET @vcResult =
            REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(
                @vcResult
                , N'&lsquo;', N'‘')
                , N'&rsquo;', N'’')
                , N'&bull;', N'•')
                , N'&hellip;', N'…')
                , N'&permil;', N'‰');

        -- &prime; and &Prime; differ only by case, so this group is matched binary.
        -- The explicit collation on the innermost argument carries through the
        -- nested REPLACE calls.
        SET @vcResult =
            REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(
                @vcResult COLLATE Latin1_General_BIN
                , N'&prime;', N'′')
                , N'&Prime;', N'″')
                , N'&circ;', N'ˆ')
                , N'&tilde;', N'˜')
                , N'&nbsp;', N' ');
    END

    IF @toDecodeISOChars = 1
    BEGIN
        -- Upper- and lower-case entity names map to different letters, so every
        -- group below is matched with a binary collation.
        SET @vcResult =
            REPLACE(REPLACE(REPLACE(REPLACE(
                @vcResult COLLATE Latin1_General_BIN
                , N'&Scaron;', N'Š')
                , N'&scaron;', N'š')
                , N'&Ccedil;', N'Ç')
                , N'&ccedil;', N'ç');

        SET @vcResult =
            REPLACE(REPLACE(REPLACE(REPLACE(
            REPLACE(REPLACE(REPLACE(REPLACE(
                @vcResult COLLATE Latin1_General_BIN
                , N'&Agrave;', N'À')
                , N'&agrave;', N'à')
                , N'&Aacute;', N'Á')
                , N'&aacute;', N'á')
                , N'&Acirc;', N'Â')
                , N'&acirc;', N'â')
                , N'&Atilde;', N'Ã')
                , N'&atilde;', N'ã');

        SET @vcResult =
            REPLACE(REPLACE(REPLACE(
            REPLACE(REPLACE(REPLACE(
                @vcResult COLLATE Latin1_General_BIN
                , N'&Auml;', N'Ä')
                , N'&auml;', N'ä')
                , N'&Aring;', N'Å')
                , N'&aring;', N'å')
                , N'&AElig;', N'Æ')
                , N'&aelig;', N'æ');

        SET @vcResult =
            REPLACE(REPLACE(REPLACE(REPLACE(
            REPLACE(REPLACE(REPLACE(REPLACE(
                @vcResult COLLATE Latin1_General_BIN
                , N'&Egrave;', N'È')
                , N'&egrave;', N'è')
                , N'&Eacute;', N'É')
                , N'&eacute;', N'é')
                , N'&Ecirc;', N'Ê')
                , N'&ecirc;', N'ê')
                , N'&Euml;', N'Ë')
                , N'&euml;', N'ë');

        SET @vcResult =
            REPLACE(REPLACE(REPLACE(REPLACE(
            REPLACE(REPLACE(REPLACE(REPLACE(
                @vcResult COLLATE Latin1_General_BIN
                , N'&Igrave;', N'Ì')
                , N'&igrave;', N'ì')
                , N'&Iacute;', N'Í')
                , N'&iacute;', N'í')
                , N'&Icirc;', N'Î')
                , N'&icirc;', N'î')
                , N'&Iuml;', N'Ï')
                , N'&iuml;', N'ï');

        SET @vcResult =
            REPLACE(REPLACE(REPLACE(REPLACE(
            REPLACE(REPLACE(REPLACE(REPLACE(
                @vcResult COLLATE Latin1_General_BIN
                , N'&Ograve;', N'Ò')
                , N'&ograve;', N'ò')
                , N'&Oacute;', N'Ó')
                , N'&oacute;', N'ó')
                , N'&Ocirc;', N'Ô')
                , N'&ocirc;', N'ô')
                , N'&Otilde;', N'Õ')
                , N'&otilde;', N'õ');

        SET @vcResult =
            REPLACE(REPLACE(REPLACE(REPLACE(
                @vcResult COLLATE Latin1_General_BIN
                , N'&Ouml;', N'Ö')
                , N'&ouml;', N'ö')
                , N'&Oslash;', N'Ø')
                , N'&oslash;', N'ø');

        SET @vcResult =
            REPLACE(REPLACE(REPLACE(REPLACE(
            REPLACE(REPLACE(REPLACE(REPLACE(
                @vcResult COLLATE Latin1_General_BIN
                , N'&Ugrave;', N'Ù')
                , N'&ugrave;', N'ù')
                , N'&Uacute;', N'Ú')
                , N'&uacute;', N'ú')
                , N'&Ucirc;', N'Û')
                , N'&ucirc;', N'û')
                , N'&Uuml;', N'Ü')
                , N'&uuml;', N'ü');

        SET @vcResult =
            REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(
            REPLACE(REPLACE(REPLACE(REPLACE(
                @vcResult COLLATE Latin1_General_BIN
                , N'&ETH;', N'Ð')
                , N'&eth;', N'ð')
                , N'&Ntilde;', N'Ñ')
                , N'&ntilde;', N'ñ')
                , N'&Yacute;', N'Ý')
                , N'&yacute;', N'ý')
                , N'&THORN;', N'Þ')
                , N'&thorn;', N'þ')
                , N'&szlig;', N'ß');
    END

    RETURN @vcResult;
END
-- test:
-- select dbo.fn_HTMLDecode(N'A fine example of man and nature co-existing is Slovenia&#8217;s ecological tourist farms.',1,1)
-- select dbo.fn_HTMLDecode(N'm0 &#50752;&#51064;&#48516;&#50556;&#50640;&#49436; m1 &#44032;&#51109; m2 &#50689;&#54693;&#47141; m3&#51080;&#45716;m10',1,1)
The following SQL function should work in your case, or at least be a good starting point for you to extend. Note, however, that string manipulation inside the database (SQL Server) is slower than the same manipulation in the application layer.
GO
-- Remove any existing dbo.MyHTMLDecode so the CREATE FUNCTION below can run again.
IF OBJECT_ID('dbo.MyHTMLDecode') IS NOT NULL
    DROP FUNCTION dbo.MyHTMLDecode;
GO
/*
  dbo.MyHTMLDecode - decode a small set of HTML character references in a
  VARCHAR string and strip '#nnnn' number suffixes.

  Parameter
    @vcWhat  text to clean; NULL in gives NULL out.

  Returns
    VARCHAR(MAX). Steps, in order:
      1. strip NUL characters and trim;
      2. turn &#160; / &nbsp; into spaces;
      3. decode 3- and 4-digit decimal references (&#ddd; then &#dddd;);
      4. remove every '#' that is followed by four digits, together with
         those four digits (e.g. ' #7561');
      5. turn literal non-breaking-space characters into spaces;
      6. decode &amp; &quot; &lt; &gt; and a leftover &amp;amp;.

  Notes
    * The result is VARCHAR, so a decoded character that the database code
      page cannot represent is lost in the conversion.
    * '&amp;' is decoded before the other entities and '&amp;amp;' afterwards,
      so up to triple-encoded ampersands collapse to a single '&'.
*/
CREATE FUNCTION dbo.MyHTMLDecode (@vcWhat VARCHAR(MAX))
RETURNS VARCHAR(MAX)
AS
BEGIN
    DECLARE @vcResult VARCHAR(MAX);
    DECLARE @siPos INT
        ,@vcEncoded VARCHAR(7)
        ,@siChar INT;
    DECLARE @siDigits INT
        ,@vcPattern VARCHAR(40);

    -- Strip NUL characters (binary collation so REPLACE really sees them), then trim.
    SET @vcResult = RTRIM(LTRIM(CAST(REPLACE(@vcWhat COLLATE Latin1_General_BIN, CHAR(0), '') AS VARCHAR(MAX))));
    SET @vcResult = REPLACE(REPLACE(@vcResult, '&#160;', ' '), '&nbsp;', ' ');

    IF @vcResult = ''
        RETURN @vcResult;

    -- Decimal references: all 3-digit ones first, then all 4-digit ones.
    SET @siDigits = 3;
    WHILE @siDigits <= 4
    BEGIN
        SET @vcPattern = '%&#' + REPLICATE('[0-9]', @siDigits) + ';%';
        SET @siPos = PATINDEX(@vcPattern, @vcResult);

        WHILE @siPos > 0
        BEGIN
            -- '&#' + digits + ';'
            SET @vcEncoded = SUBSTRING(@vcResult, @siPos, @siDigits + 3);
            SET @siChar = CAST(SUBSTRING(@vcEncoded, 3, @siDigits) AS INT);
            SET @vcResult = REPLACE(@vcResult, @vcEncoded, NCHAR(@siChar));
            SET @siPos = PATINDEX(@vcPattern, @vcResult);
        END

        SET @siDigits = @siDigits + 1;
    END

    -- Remove '#dddd' fragments ('#' plus the first four digits that follow it).
    SET @siPos = PATINDEX('%#[0-9][0-9][0-9][0-9]%', @vcResult);
    WHILE @siPos > 0
    BEGIN
        SET @vcEncoded = SUBSTRING(@vcResult, @siPos, 5);
        SET @vcResult = REPLACE(@vcResult, @vcEncoded, '');
        SET @siPos = PATINDEX('%#[0-9][0-9][0-9][0-9]%', @vcResult);
    END

    -- Literal non-breaking spaces become ordinary spaces.
    SET @vcResult = REPLACE(REPLACE(@vcResult, NCHAR(160), ' '), CHAR(160), ' ');

    -- Named entities; the final '&amp;amp;' pass catches triple-encoded ampersands.
    SET @vcResult =
        REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(
            @vcResult
            , '&amp;', '&')
            , '&quot;', '"')
            , '&lt;', '<')
            , '&gt;', '>')
            , '&amp;amp;', '&');

    RETURN @vcResult;
END
GO
Illustration:
-- Sample input: company names with triple-encoded ampersands and '#nnnn' suffixes.
-- The line breaks inside the literal are part of the test data.
DECLARE @s VARCHAR(MAX) = 'CD&amp;amp;amp;M Communications
auburndale oil &amp;amp;amp; propane inc
C F La Fountaine #7561
Laramie County Fire District # 2
AmeriGas Propane LP #2250';

-- The variable name is spelled with the same case as its declaration so the
-- script also runs on case-sensitive server collations.
SELECT dbo.MyHTMLDecode(@s);
OUTPUT:
CD&M Communications
auburndale oil & propane inc
C F La Fountaine
Laramie County Fire District # 2
AmeriGas Propane LP
There is a much easier solution...
SQL Server supports the XML data type, which decodes XML/HTML-encoded entities. If you just cast the string to XML and read its value back, you can use the built-in decoding.
That would look like this:
-- The input must be well-formed XML text: a bare '&' would make the cast fail,
-- so the ampersand is written as the entity that is to be decoded.
select cast('Q &amp; A' as XML).value('.[1]', 'nvarchar(max)');
To turn it into a function for easy use:
-- dbo.xmlDecode: decode XML character references by round-tripping through the xml type.
--   @string  text containing XML-encoded entities (&amp; &lt; &gt; &quot; &apos; and
--            numeric references); NULL in gives NULL out.
--   returns  the text content of @string parsed as XML, as nvarchar(max) so that
--            decoded characters outside the database code page are not lost
--            (the value is extracted as nvarchar(max) anyway).
-- The cast raises an error when @string is not well-formed XML, for example when
-- it contains a bare '&' or an entity that XML does not predefine, such as &nbsp;.
-- Markup in @string is parsed as XML, so only the text content is returned.
create function dbo.xmlDecode (@string nvarchar(max))
returns nvarchar(max)
as
begin
    return cast(@string as XML).value('.[1]', 'nvarchar(max)');
end;
Keep in mind that in OP's example, the string seems to have been encoded 3 times in a row: `&` got turned into `&amp;`, then into `&amp;amp;`, and then into `&amp;amp;amp;`. As a result, to get the "original" string back, you have to use the decode function 3 times.