Regex to parse image data URI

I also needed to parse the data URI scheme. As a result, I improved the regular expression given on this page, adapting it specifically for C# so that it fits any data URI (to check the scheme, you can take it from here or here).

Here is my solution for C#:

/// <summary>
/// Holds the individual components of a parsed data URI
/// (<c>data:[media type][;params][;base64],data</c>), as filled in by <c>GetDataURI</c>.
/// </summary>
private class DataUriModel {
  /// <summary>Full media-type portion: type, tree, subtype, suffix and parameters together.</summary>
  public string MediaType { get; set; }

  /// <summary>Top-level type, e.g. <c>image</c>.</summary>
  public string Type { get; set; }

  /// <summary>Dot-separated tree segments that precede the subtype (e.g. <c>prs</c>); <c>null</c> when there are none.</summary>
  public string[] Tree { get; set; }

  /// <summary>Subtype, e.g. <c>jpeg</c>.</summary>
  public string Subtype { get; set; }

  /// <summary>Structured-syntax suffix without its leading <c>+</c>; <c>null</c> when there is none.</summary>
  public string Suffix { get; set; }

  /// <summary>Raw <c>name=value</c> parameters, one array element per parameter; <c>null</c> when there are none.</summary>
  public string[] Params { get; set; }

  /// <summary>Encoding token without its leading <c>;</c> (i.e. <c>base64</c>); <c>null</c> when there is none.</summary>
  public string Encoding { get; set; }

  /// <summary>Payload that follows the comma separator; <c>null</c> when there is none.</summary>
  public string Data { get; set; }
}

/// <summary>
/// Demo entry point: parses a sample data URI with <c>GetDataURI</c> and prints its
/// type, subtype and encoding (for the sample: <c>image</c>, <c>jpeg</c>, <c>base64</c>).
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args) {
  string s = "data:image/prs.jpeg+gzip;charset=UTF-8;page=21;page=22;base64,/9j/4AAQSkZJRgABAQAAAQABAAD";
  var parsedUri = GetDataURI(s);

  // Read from the variable that actually holds the result; the identifier
  // `decodedUri` was never declared, so the snippet did not compile.
  Console.WriteLine(parsedUri.Type);
  Console.WriteLine(parsedUri.Subtype);
  Console.WriteLine(parsedUri.Encoding);
}

// Built once and reused. RegexOptions.Compiled trades a costly one-time construction
// for faster matching, so creating the Regex on every call would pay that cost repeatedly.
// NOTE(review): RegexOptions.Multiline lets ^ and $ anchor at line breaks, so multi-line
// input may match on a single line of it; confirm that is intended.
private static readonly Regex DataUriRegex = new Regex(@"^\s*data:(?<media_type>(?<type>[a-z\-]+){1}\/(?<tree>([a-z\-]+\.)+)?(?<subtype>[a-z\-]+){1}(?<suffix>\+[a-z]+)?(?<params>(;[a-z\-]+\=[a-z0-9\-\+]+)*)?)?(?<encoding>;base64)?(?<data>,+[a-z0-9\\\!\$\&\'\,\(\)\*\+\,\;\=\-\.\~\:\@\/\?\%\s]*\s*)?$", RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.Multiline);

/// <summary>
/// Splits a data URI into its media type, type, tree, subtype, suffix, parameters,
/// encoding and data components.
/// </summary>
/// <param name="data">The data URI text to parse.</param>
/// <returns>
/// A populated <c>DataUriModel</c> when the input matches; otherwise a new model
/// whose properties are all left at their defaults.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="data"/> is <c>null</c> (thrown by <c>Regex.Match</c>).</exception>
private static DataUriModel GetDataURI(string data) {
  var match = DataUriRegex.Match(data);
  if (!match.Success) {
    return new DataUriModel();
  }

  var groups = match.Groups;
  string tree = groups["tree"].Value;
  string parameters = groups["params"].Value;

  return new DataUriModel {
    MediaType = groups["media_type"].Value,
    Type = groups["type"].Value,
    // The tree capture ends with a '.', e.g. "prs." -> { "prs" }.
    Tree = HasContent(tree) ? tree[0..^1].Split(".") : null,
    Subtype = groups["subtype"].Value,
    Suffix = DropLeadingDelimiter(groups["suffix"].Value),
    // The params capture starts with ';', e.g. ";a=1;b=2" -> { "a=1", "b=2" }.
    Params = HasContent(parameters) ? parameters[1..].Split(";") : null,
    Encoding = DropLeadingDelimiter(groups["encoding"].Value),
    Data = DropLeadingDelimiter(groups["data"].Value)
  };
}

// True when a capture is non-blank and longer than one character, i.e. holds more than a lone delimiter.
private static bool HasContent(string value) {
  return !string.IsNullOrWhiteSpace(value) && value.Length > 1;
}

// Returns the capture without its first character ('+', ';' or ','), or null when nothing follows it.
private static string DropLeadingDelimiter(string value) {
  return HasContent(value) ? value[1..] : null;
}

Actually, you don't need a regex for that. According to Wikipedia, the data URI format is

data:[<MIME-type>][;charset=<encoding>][;base64],<data>

so just do the following:

// Everything after the first comma is the payload. The char overload of IndexOf is an
// ordinal search; the string overload IndexOf(",") compares using the current culture.
// If there is no comma, IndexOf returns -1 and the whole string is decoded.
int commaIndex = imageSrc.IndexOf(',');
byte[] imagedata = Convert.FromBase64String(imageSrc.Substring(commaIndex + 1));

EDIT: expanded to show usage

// Pattern layout: "data:" <mime> ";" <encoding> "," <data>
Regex regex = new Regex(@"data:(?<mime>[\w/\-\.]+);(?<encoding>\w+),(?<data>.*)", RegexOptions.Compiled);

Match match = regex.Match(input);
GroupCollection groups = match.Groups;

// Each named group yields an empty string when the input does not match.
string mime = groups["mime"].Value;
string encoding = groups["encoding"].Value;
string data = groups["data"].Value;

NOTE: The regex applies to the input shown in the question. If a charset were also specified, it would not match and would have to be rewritten.

Tags:

C#

Image

Uri

Base64