Stripping out HTML tags from a string
Since HTML is not a regular language (HTML is a context-free language), you cannot use Regular Expressions. See: Using regular expressions to parse HTML: why not?
I would consider using NSAttributedString instead.
let htmlString = "LCD Soundsystem was the musical project of producer <a href='http://www.last.fm/music/James+Murphy' class='bbcode_artist'>James Murphy</a>, co-founder of <a href='http://www.last.fm/tag/dance-punk' class='bbcode_tag' rel='tag'>dance-punk</a> label <a href='http://www.last.fm/label/DFA' class='bbcode_label'>DFA</a> Records. Formed in 2001 in New York City, New York, United States, the music of LCD Soundsystem can also be described as a mix of <a href='http://www.last.fm/tag/alternative%20dance' class='bbcode_tag' rel='tag'>alternative dance</a> and <a href='http://www.last.fm/tag/post%20punk' class='bbcode_tag' rel='tag'>post punk</a>, along with elements of <a href='http://www.last.fm/tag/disco' class='bbcode_tag' rel='tag'>disco</a> and other styles. <br />"
let htmlStringData = htmlString.dataUsingEncoding(NSUTF8StringEncoding)!
let options: [String: AnyObject] = [NSDocumentTypeDocumentAttribute: NSHTMLTextDocumentType, NSCharacterEncodingDocumentAttribute: NSUTF8StringEncoding]
let attributedHTMLString = try! NSAttributedString(data: htmlStringData, options: options, documentAttributes: nil)
let string = attributedHTMLString.string
Or, as Irshad Mohamed in the comments would do it:
let attributed = try NSAttributedString(data: htmlString.data(using: .unicode)!, options: [NSDocumentTypeDocumentAttribute: NSHTMLTextDocumentType], documentAttributes: nil)
print(attributed.string)
I'm using the following extension to remove specific HTML elements:
extension String {
func deleteHTMLTag(tag:String) -> String {
return self.stringByReplacingOccurrencesOfString("(?i)</?\(tag)\\b[^<]*>", withString: "", options: .RegularExpressionSearch, range: nil)
}
func deleteHTMLTags(tags:[String]) -> String {
var mutableString = self
for tag in tags {
mutableString = mutableString.deleteHTMLTag(tag)
}
return mutableString
}
}
This makes it possible to only remove <a>
tags from a string, e.g.:
let string = "my html <a href="">link text</a>"
let withoutHTMLString = string.deleteHTMLTag("a") // Will be "my html link text"
Mohamed solution but as a String extension in Swift 4.
extension String {
func stripOutHtml() -> String? {
do {
guard let data = self.data(using: .unicode) else {
return nil
}
let attributed = try NSAttributedString(data: data, options: [.documentType: NSAttributedString.DocumentType.html, .characterEncoding: String.Encoding.utf8.rawValue], documentAttributes: nil)
return attributed.string
} catch {
return nil
}
}
}
Hmm, I tried your function and it worked on a small example:
var string = "<!DOCTYPE html> <html> <body> <h1>My First Heading</h1> <p>My first paragraph.</p> </body> </html>"
let str = string.stringByReplacingOccurrencesOfString("<[^>]+>", withString: "", options: .RegularExpressionSearch, range: nil)
print(str)
//output " My First Heading My first paragraph. "
Can you give an example of a problem?
Swift 4 and 5 version:
var string = "<!DOCTYPE html> <html> <body> <h1>My First Heading</h1> <p>My first paragraph.</p> </body> </html>"
let str = string.replacingOccurrences(of: "<[^>]+>", with: "", options: .regularExpression, range: nil)