How to strip HTML tags from a string in SQL Server?
There is a UDF that will do that described here:
User Defined Function to Strip HTML
CREATE FUNCTION [dbo].[udf_StripHTML] (@HTMLText VARCHAR(MAX))
RETURNS VARCHAR(MAX) AS
BEGIN
DECLARE @Start INT
DECLARE @End INT
DECLARE @Length INT
SET @Start = CHARINDEX('<',@HTMLText)
SET @End = CHARINDEX('>',@HTMLText,CHARINDEX('<',@HTMLText))
SET @Length = (@End - @Start) + 1
WHILE @Start > 0 AND @End > 0 AND @Length > 0
BEGIN
SET @HTMLText = STUFF(@HTMLText,@Start,@Length,'')
SET @Start = CHARINDEX('<',@HTMLText)
SET @End = CHARINDEX('>',@HTMLText,CHARINDEX('<',@HTMLText))
SET @Length = (@End - @Start) + 1
END
RETURN LTRIM(RTRIM(@HTMLText))
END
GO
Edit: note this is for SQL Server 2005, but if you change the keyword MAX to something like 4000, it will work in SQL Server 2000 as well.
Derived from @Goner Doug answer, with a few things updated:
- using REPLACE where possible
- conversion of predefined entities like é
(I chose the ones I needed :-)
- some conversion of list tags <ul> and <li>
ALTER FUNCTION [dbo].[udf_StripHTML]
--by Patrick Honorez --- www.idevlop.com
--inspired by http://stackoverflow.com/questions/457701/best-way-to-strip-html-tags-from-a-string-in-sql-server/39253602#39253602
(
@HTMLText varchar(MAX)
)
RETURNS varchar(MAX)
AS
BEGIN
DECLARE @Start int
DECLARE @End int
DECLARE @Length int
set @HTMLText = replace(@htmlText, '<br>',CHAR(13) + CHAR(10))
set @HTMLText = replace(@htmlText, '<br/>',CHAR(13) + CHAR(10))
set @HTMLText = replace(@htmlText, '<br />',CHAR(13) + CHAR(10))
set @HTMLText = replace(@htmlText, '<li>','- ')
set @HTMLText = replace(@htmlText, '</li>',CHAR(13) + CHAR(10))
set @HTMLText = replace(@htmlText, '’' collate Latin1_General_CS_AS, '''' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '"' collate Latin1_General_CS_AS, '"' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '&' collate Latin1_General_CS_AS, '&' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '€' collate Latin1_General_CS_AS, '€' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '<' collate Latin1_General_CS_AS, '<' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '>' collate Latin1_General_CS_AS, '>' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'œ' collate Latin1_General_CS_AS, 'oe' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, ' ' collate Latin1_General_CS_AS, ' ' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '©' collate Latin1_General_CS_AS, '©' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '«' collate Latin1_General_CS_AS, '«' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '®' collate Latin1_General_CS_AS, '®' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '±' collate Latin1_General_CS_AS, '±' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '²' collate Latin1_General_CS_AS, '²' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '³' collate Latin1_General_CS_AS, '³' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'µ' collate Latin1_General_CS_AS, 'µ' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '·' collate Latin1_General_CS_AS, '·' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'º' collate Latin1_General_CS_AS, 'º' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '»' collate Latin1_General_CS_AS, '»' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '¼' collate Latin1_General_CS_AS, '¼' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '½' collate Latin1_General_CS_AS, '½' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '¾' collate Latin1_General_CS_AS, '¾' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '&Aelig' collate Latin1_General_CS_AS, 'Æ' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'Ç' collate Latin1_General_CS_AS, 'Ç' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'È' collate Latin1_General_CS_AS, 'È' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'É' collate Latin1_General_CS_AS, 'É' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'Ê' collate Latin1_General_CS_AS, 'Ê' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'Ö' collate Latin1_General_CS_AS, 'Ö' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'à' collate Latin1_General_CS_AS, 'à' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'â' collate Latin1_General_CS_AS, 'â' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'ä' collate Latin1_General_CS_AS, 'ä' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'æ' collate Latin1_General_CS_AS, 'æ' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'ç' collate Latin1_General_CS_AS, 'ç' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'è' collate Latin1_General_CS_AS, 'è' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'é' collate Latin1_General_CS_AS, 'é' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'ê' collate Latin1_General_CS_AS, 'ê' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'ë' collate Latin1_General_CS_AS, 'ë' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'î' collate Latin1_General_CS_AS, 'î' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'ô' collate Latin1_General_CS_AS, 'ô' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'ö' collate Latin1_General_CS_AS, 'ö' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '÷' collate Latin1_General_CS_AS, '÷' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'ø' collate Latin1_General_CS_AS, 'ø' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'ù' collate Latin1_General_CS_AS, 'ù' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'ú' collate Latin1_General_CS_AS, 'ú' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'û' collate Latin1_General_CS_AS, 'û' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'ü' collate Latin1_General_CS_AS, 'ü' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '"' collate Latin1_General_CS_AS, '"' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '&' collate Latin1_General_CS_AS, '&' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '‹' collate Latin1_General_CS_AS, '<' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '›' collate Latin1_General_CS_AS, '>' collate Latin1_General_CS_AS)
-- Remove anything between <STYLE> tags
SET @Start = CHARINDEX('<STYLE', @HTMLText)
SET @End = CHARINDEX('</STYLE>', @HTMLText, CHARINDEX('<', @HTMLText)) + 7
SET @Length = (@End - @Start) + 1
WHILE (@Start > 0 AND @End > 0 AND @Length > 0) BEGIN
SET @HTMLText = STUFF(@HTMLText, @Start, @Length, '')
SET @Start = CHARINDEX('<STYLE', @HTMLText)
SET @End = CHARINDEX('</STYLE>', @HTMLText, CHARINDEX('</STYLE>', @HTMLText)) + 7
SET @Length = (@End - @Start) + 1
END
-- Remove anything between <whatever> tags
SET @Start = CHARINDEX('<', @HTMLText)
SET @End = CHARINDEX('>', @HTMLText, CHARINDEX('<', @HTMLText))
SET @Length = (@End - @Start) + 1
WHILE (@Start > 0 AND @End > 0 AND @Length > 0) BEGIN
SET @HTMLText = STUFF(@HTMLText, @Start, @Length, '')
SET @Start = CHARINDEX('<', @HTMLText)
SET @End = CHARINDEX('>', @HTMLText, CHARINDEX('<', @HTMLText))
SET @Length = (@End - @Start) + 1
END
RETURN LTRIM(RTRIM(@HTMLText))
END