Recursively extract contents of (nested) parentheses in string

I usually do something like the following.

An outer function receives the string as argument and declares outer variables used to iterate it.

The work is done in a recursive inner function which iterates until ).

This way all new strings I create are included in the returned array. No useless work is done.

function shitshow(str) {
  var i = 0;
  function main() {
    var arr = [];
    var startIndex = i;
    function addWord() {
      if (i-1 > startIndex) {
        arr.push(str.slice(startIndex, i-1));
      }
    }
    while (i < str.length) {
      switch(str[i++]) {
        case " ":
          addWord();
          startIndex = i;
          continue;
        case "(":
          arr.push(main());
          startIndex = i;
          continue;
        case ")":
          addWord();
          return arr;
      }
    }
    addWord();
    return arr;
  }
  return main();
}
console.log(shitshow("((developer or engineer ) or (nurse or doctor)) and manager"));
div.as-console-wrapper { max-height: 100%; }


One way may be to reformat it into JSON and then let JSON.parse() do all the work. Something like:

function blah(string) {
	string = string.replace(/\(/g, "[");
	string = string.replace(/\)\s/g, "], ");
	string = string.replace(/\)/g, "]");
	string = string.replace(/\s+/, ", ");
	string = "[" + string + "]";
	string = string.replace(/[^\[\]\,\s]+/g, "\"$&\"");
	string = string.replace(/" /g, "\", ");

	return JSON.parse(string);
}

console.log(blah("((a b) ugh (one two)) pi"));

EDIT: Fixed some problems with regular expressions & posted as running code. Works with the test string.

[[["a","b"],"ugh",["one","two"]],"pi"]

As it stands, it will not cope if your string does not have a space between ")" and a word, nor if there is a space between two "))", but if your input format is known and rigid as above then it would be fine. Otherwise, further regular expression tweaking or substituting could deal with that.

Depending on where the source of your text is from, if it is possible to get it in JSON from the source, that would make it a trivial one-liner solution. I only mention this because your strings are similar to JSON already, but they may be from a source where you don't have that much control over the format.


Just been looking for a more complete solution due to the "manage" vs "manager" bug that Ciantic found in Oriol's answere.

function shitshow(str) {
  var i = 0;
  var trailingWhiteSpace = str[str.length - 1] === " ";
  function main() {
    var arr = [];
    var startIndex = i;
    function addWord() {
      if (i-1 > startIndex) {
        arr.push(str.slice(startIndex, i-1));
      }
    }
    while (i < str.length) {
      switch(str[i++]) {
        case " ":
          addWord();
          startIndex = i;
          continue;
        case "(":
          arr.push(main());
          startIndex = i;
          continue;
        case ")":
          addWord();
          return arr;
      }
    }
    if(!trailingWhiteSpace){
      i = i + 1;
      addWord();
    }
    return arr;
  }
  return main();
}
console.log(shitshow("((developer or engineer ) or (nurse or doctor)) and manager"));
div.as-console-wrapper { max-height: 100%; }