Regex in reverse - decompose regular expressions
Python v2.7 1069 1036 950 925 897 884 871 833 822
This answer seems rather long for a code golf, but there are a lot operators that need to be handled and I know what purpose each byte in this answer does. Since there is no existing answer I submit this as a target for other users to beat. See if you can make a shorter answer :).
The two main functions are f
which parses the regex starting at the i
th character, and d
which generates the matching strings, using r
the sub-regexes we can recursed into, 'a' the array representing the part of the current sub-regex not yet processed, and a string suffix s
which represents the part of the string generated so far.
Also check out the sample output and a test harness.
import sys;V=sys.argv;n=int(V[2]);r=V[1];S=len;R=range;C=R(32,127)
Z=[];z=-1;D='d(r,p,';F='for j in '
def f(i,a):
if i>=S(r):return a,i
c=r[i];x=0;I="|)]".find(c)
if c in"([|":x,i=f(i+1,Z)
if I+1:return([c,a,x],[a],[c,a])[I],i
if'\\'==c:i+=1;x=c+r[i]
return f(i+1,a+[x or c])
def d(r,a,s):
if S(s)>n:return
while a==Z:
if r==Z:print s;return
a=r[z];r=r[:z]
e=a[z];p=a[0:z]
if'|'==a[0]:d(r,a[1],s);d(r,a[2],s)
elif']'==a[0]:
g=a[1];N=g[0]=='^';g=(g,g[1:])[N];B=[0]*127;O=[ord(c[z])for c in g]
for i in R(0,S(g)):
if'-'==g[i]:exec F+'R(O[i-1],O[i+1]):B[j]=1'
else:B[O[i]]=1
for c in C:N^B[c]<1or d(r,Z,chr(c)+s)
elif' '>e:d(r+[p],e,s)
else:c=p[:z];exec{'.':F+'C:'+D+'chr(j)+s)','?':D+'s);d(r,p[:z],s)','*':F+'R(0,n+1):d(r,c,s);c+=[p[z]]','+':"d(r,p+['*',p[z]],s)"}.get(e,D+'e[z]+s)')
d(Z,f(0,Z)[0],"")
Note that tabs in the original solution have been expand
ed. To count the number of characters in the original use unexpand < regex.py | wc
.
Haskell 757 705 700 692 679 667
import Data.List
data R=L Char|A R R|T R R|E
h=[' '..'~']
k(']':s)a=(a,s)
k('^':s)_=l$k[]s
k('-':c:s)(a:b)=k([a..c]++b)s
k('\\':c:s)a=k s$c:a
k(c:s)a=k s$c:a
l(a,b)=(h\\a,b)
c#E=L c
c#r=A(L c)r
o(a,b)=(foldr(#)E a,b)
t%0=E
t%n=A(t%(n-1))$T t$t%(n-1)
d s n=m(fst$r s)[[]] where{m E a=a;m(L c)a=[b++[c]|b<-a,length b<n];m(A r s)x=nub$(m r x)++m s x;m(T r s)a=m s$m r a;r s=w$e s E;w(u,'|':v)=(\(a,b)->(A u a,b))$r v;w x=x;e(')':xs)t=(t,xs);e s@('|':_)t=(t,s);e s@(c:_)t=g t$f$b s;e[]t=(t,[]);g t(u,v)=e v$T t u;f(t,'*':s)=(t%n,s);f(t,'+':s)=(T t$t%n,s);f(t,'?':s)=(A t E,s);f(t,s)=(t,s);b('(':s)=r s;b('\\':s:t)=(L s,t);b('.':s)=o(h,s);b('[':s)=o$k s[];b(s:t)=(L s,t)}
output:
ghci> d ".*" 1
[""," ","!","\"","#","$","%","&","'","(",")","*","+",",","-",".","/","0","1","2","3","4","5","6","7","8","9",":",";","<","=",">","?","@","A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z","[","\\","]","^","_","`","a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z","{","|","}","~"]
ghci> d "w\\w+" 3
["ww","www"]
ghci> d "[abx-z][^ -}][\\\\]" 3
["x~\\","y~\\","z~\\","b~\\","a~\\"]
ghci> d "ab*a|c[de]*" 3
["aa","aba","c","ce","cd","cee","cde","ced","cdd"]
ghci> d "(foo)+(bar)?!?" 6
["foo!","foobar","foo","foofoo"]
ghci> d "(a+|b*c)d" 4
["ad","aad","aaad","cd","bcd","bbcd"]
ghci> d "p+cg" 4
["pcg","ppcg"]
ghci> d "a{3}" 4
["a{3}"]
Explanation: this one's a textbook regex implementation. R is the regex type, with constructors A (alternate), L (literal), T (then) and E (empty/epsilon). The usual 'Star' doesn't appear because I inline it as alternates during the parse (see '%'). 'm' runs the simulation. The parser (start at 'r s=....') is just recursive descent; 'k' parses ranges. The function '#' expands ranges into alternations.
Prolog (SWI), 586 bytes
The built in backtracking ability of Prolog makes it an awesome choice for this challenge. By taking advantage of backtracking, generating strings for a regex becomes exactly the same task as testing if a string is matched by a regex. Unfortunately, much of my golfing effort went into writing a shorter regex parsers. The actual task of decomposing a regular expression we easily golfed.
R-L-S:-R*A,-(B,A,[]),setof(Z,(0/L/M,length(C,M),C+B+[],Z*C),S).
-R-->e+S,S-R.
R-T-->{R=T};"|",e+S,u+R+S-T.
Z+Y-->(".",{setof(C,32/126/C,R)};"[^",!,\E,"]",{setof(C,(32/126/C,\+C^E),R)};"[",\R,"]";"(",-R,")";{R=[C]},([C],{\+C^`\\.[|+?*(`};[92,C])),("*",{S=k*R};"+",{S=c+R+k*R};"?",{S=u+e+R};{S=R}),({Y=c+Z+S};c+Z+S+Y).
\C-->{C=[H|T]},+H,\T;{C=[]};+A,"-",+B,\T,{setof(C,A/B/C,H),append(H,T,C)}.
+C-->[92,C];[C],{\+C^`\\]-`}.
S+e+S.
[C|S]+D+S:-C^D.
S+(B+L+R)+T:-B=c,!,S+L+U,U+R+T;S+L+T;S+R+T.
S+k*K+U:-S=U;S+K+T,S\=T,T+k*K+U.
A/B/C:-between(A,B,C).
A^B:-member(A,B).
A*B:-string_codes(A,B).
Try it online!
Ungolfed Code
generate_string(R, L, S) :-
% parse regex
string_codes(R, RC),
regex_union(RE, RC, []),
% bound string length
between(0, L, M),
length(SC, M),
% find string
match(SC, RE, []),
string_codes(S, SC).
% Parsers
%%%%%%%%%
regex_union(R) -->regex_concat(S), regex_union1(S, R).
regex_union1(R,T) --> [124], regex_concat(S), regex_union1(regex_union(R,S), T).
regex_union1(R, R) --> [].
regex_concat(R) --> regex_op(S), regex_concat1(S, R).
regex_concat1(R, T) --> regex_op(S), regex_concat1(regex_concat(R,S), T).
regex_concat1(R, R) --> [].
regex_op(regex_kleene(R)) --> regex_lit(R), [42].
regex_op(regex_concat(R,regex_kleene(R))) --> regex_lit(R), [43].
regex_op(regex_union(regex_empty,R)) --> regex_lit(R), [63].
regex_op(R) --> regex_lit(R).
regex_lit(regex_char([C])) --> [C], {\+ regex_ctrl(C)}.
regex_lit(regex_char([C])) --> [92], [C].
regex_lit(regex_char(CS)) --> [46], {findall(C, between(32,126, C), CS)}.
regex_lit(regex_char(DS)) -->
[91], [94], !, class_body(CS), [93],
{findall(C, (between(32, 126, C), \+ member(C, CS)), DS)}.
regex_lit(regex_char(CS)) --> [91], class_body(CS), [93].
regex_lit(R) --> [40], regex_union(R), [41].
class_body([C|T]) --> class_lit(C),class_body(T).
class_body(CS) -->
class_lit(C0), [45], class_lit(C1), class_body(T),
{findall(C, between(C0, C1, C), H), append(H,T,CS)}.
class_body([]) --> [].
class_lit(C) --> [C], {\+ class_ctrl(C)}.
class_lit(C) --> [92], [C].
class_ctrl(C) :- string_codes("\\[]-", CS), member(C, CS).
regex_ctrl(C) :- string_codes("\\.[]|+?*()", CS), member(C, CS).
% Regex Engine
%%%%%%%%%%%%%%
% The regex empty matches any string without consuming any characters.
match(S, regex_empty, S).
% A regex consisting of a single character matches any string starting with
% that character. The chanter is consumed.
match([C|S], regex_char(CS), S) :- member(C, CS).
% A union of two regex only needs to satisify one of the branches.
match(S, regex_union(L,R), T) :- match(S, L, T); match(S, R, T).
% A concat of two regex must satisfy the left and then the right.
match(S, regex_concat(L, R), U) :- match(S, L, T), match(T, R, U).
% The kleene closure of a regex can match the regex 0 times or it can the regex
% once before matching the kleene closure again.
match(S, regex_kleene(_), S).
match(S, regex_kleene(K), U) :- match(S, K, T), S \= T, match(T, regex_kleene(K), U).