Regex to parse image data URI
I also needed to parse the data URI scheme. As a result, I improved the regular expression given on this page, specifically for C#, so that it fits any data URI scheme (to check the scheme, you can take it from here or here).
Here is my solution for C#:
/// <summary>
/// Holds the individual components of a parsed data URI.
/// </summary>
private class DataUriModel
{
    /// <summary>Full media-type text: type, optional tree, subtype, optional suffix and parameters.</summary>
    public string MediaType { get; set; }

    /// <summary>Top-level type, e.g. "image".</summary>
    public string Type { get; set; }

    /// <summary>Dot-separated tree segments that precede the subtype (e.g. "prs"); null when there are none.</summary>
    public string[] Tree { get; set; }

    /// <summary>Subtype, e.g. "jpeg".</summary>
    public string Subtype { get; set; }

    /// <summary>Structured-syntax suffix without the leading '+', e.g. "gzip"; null when absent.</summary>
    public string Suffix { get; set; }

    /// <summary>Parameters as "name=value" strings, e.g. "charset=UTF-8"; null when there are none.</summary>
    public string[] Params { get; set; }

    /// <summary>Encoding token without the leading ';' (e.g. "base64"); null when absent.</summary>
    public string Encoding { get; set; }

    /// <summary>Payload text that follows the leading comma; null when absent.</summary>
    public string Data { get; set; }
}
/// <summary>
/// Demo entry point: parses a sample data URI and prints its type, subtype and encoding.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args) {
    string s = "data:image/prs.jpeg+gzip;charset=UTF-8;page=21;page=22;base64,/9j/4AAQSkZJRgABAQAAAQABAAD";
    var parsedUri = GetDataURI(s);

    // Read from the variable that actually holds the result; the original
    // referenced an undeclared "decodedUri" and did not compile.
    Console.WriteLine(parsedUri.Type);      // image
    Console.WriteLine(parsedUri.Subtype);   // jpeg
    Console.WriteLine(parsedUri.Encoding);  // base64
}
/// <summary>
/// Pattern for a data URI: optional media type (type/[tree.]subtype[+suffix][;name=value...]),
/// optional ";base64" marker and optional ",data" payload.
/// Built once and reused, because constructing a <see cref="RegexOptions.Compiled"/> regex on
/// every call is expensive. Type, tree and subtype accept digits so that media types such as
/// "video/mp4" or "application/x-7z-compressed" are recognised.
/// Multiline is set, so ^ and $ anchor at line boundaries rather than only at the string ends.
/// </summary>
private static readonly Regex DataUriRegex = new Regex(@"^\s*data:(?<media_type>(?<type>[a-z0-9\-]+){1}\/(?<tree>([a-z0-9\-]+\.)+)?(?<subtype>[a-z0-9\-]+){1}(?<suffix>\+[a-z]+)?(?<params>(;[a-z\-]+\=[a-z0-9\-\+]+)*)?)?(?<encoding>;base64)?(?<data>,+[a-z0-9\\\!\$\&\'\,\(\)\*\+\,\;\=\-\.\~\:\@\/\?\%\s]*\s*)?$", RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.Multiline);

/// <summary>
/// Splits a data URI into its components.
/// </summary>
/// <param name="data">The data URI text to parse.</param>
/// <returns>
/// A populated <see cref="DataUriModel"/>. When <paramref name="data"/> does not match the
/// pattern, a model with all properties left at their defaults (null) is returned.
/// </returns>
private static DataUriModel GetDataURI(string data) {
    var result = new DataUriModel();

    var match = DataUriRegex.Match(data);
    if (!match.Success) {
        return result;
    }

    // Text of a named group with its one-character delimiter ('+', ';' or ',') removed,
    // or null when the group is blank or holds nothing but the delimiter.
    string AfterDelimiter(string groupName) {
        var text = match.Groups[groupName].Value;
        return !string.IsNullOrWhiteSpace(text) && text.Length > 1 ? text[1..] : null;
    }

    // Groups that did not participate in the match yield an empty string here.
    result.MediaType = match.Groups["media_type"].Value;
    result.Type = match.Groups["type"].Value;
    result.Subtype = match.Groups["subtype"].Value;

    // The tree capture ends with a '.', e.g. "prs." -> drop it, then split the segments.
    var tree = match.Groups["tree"].Value;
    result.Tree = !string.IsNullOrWhiteSpace(tree) && tree.Length > 1 ? tree[0..^1].Split(".") : null;

    result.Suffix = AfterDelimiter("suffix");

    // The params capture looks like ";a=b;c=d" -> drop the leading ';', then split.
    result.Params = AfterDelimiter("params")?.Split(";");

    result.Encoding = AfterDelimiter("encoding");
    result.Data = AfterDelimiter("data");

    return result;
}
Actually, you don't need a regex for that. According to Wikipedia, the data URI format is
data:[<MIME-type>][;charset=<encoding>][;base64],<data>
so just do the following:
// Everything after the first comma is the payload. The char overload of IndexOf performs an
// ordinal search, whereas IndexOf(",") with a string argument is culture-sensitive.
// If there is no comma, IndexOf returns -1 and the whole string is passed to the decoder.
byte[] imagedata = Convert.FromBase64String(imageSrc.Substring(imageSrc.IndexOf(',') + 1));
EDIT: expanded to show usage
// Pattern: "data:" + MIME type, then ";" + an encoding word, then "," + the payload.
const string dataUriPattern = @"data:(?<mime>[\w/\-\.]+);(?<encoding>\w+),(?<data>.*)";
Regex regex = new Regex(dataUriPattern, RegexOptions.Compiled);
Match match = regex.Match(input);

// Pull each named capture out of the match; a group that did not match yields an empty string.
GroupCollection captured = match.Groups;
string mime = captured["mime"].Value;
string encoding = captured["encoding"].Value;
string data = captured["data"].Value;
NOTE: The regex applies to the input shown in the question. If a charset were
specified too, it would not match and would have to be rewritten.