A Basic Pyth-like Syntax Checker
CJam, 65 bytes
q'"/La+2%{0s*}:Z~_,:L')*+{L{4{)_Z+s/Z}/L{'(\Z')++/Z}/}*')-}2*0s-!
Gosh, I wish CJam had Regex, this could have been completed in less than 50 bytes then
The main idea is to keep reducing stuff to 0
i.e. 10
to 0
, 200
to 0
and so on. Once that is done, we reduce all matched brackets to 0
, i.e. ()
to 0
, (0)
to 0
, (00)
to 0
and so on. The we repeat the cycle L
times, where L
is the input length.
The input string initially goes through extra processing where we adjust for unmatched "
and add lots of )
to compensate for unmatched (
This ensures that after all the iterations, we should only have 0
(and no-op )
) left in the string.
Update - fixed a bug where top level )
no-op are considered harmful
Code expansion
q'"/La+2%{0s*}:Z~_,:L')* "Part 1 - Preprocessing";
q'"/ "Read the input and split it on quote";
La+ "Add an extra empty array to the split array to compensate";
"for unmatched ending quote";
2% "Take every other splitted part, ignoring the contents";
"of the strings in the code";
{0s*}:Z~ "Join the parts by string 0. Also create a function Z";
"that does that for future use. We are now done with";
"reducing "XYZ" to 0 ";
_,:L "Store the length of the remaining code in L";
')* "Append L ) to the string to compensate for unmatched (";
{L{4{)_Z+s/Z}/L{'(\Z')++/Z}/}*')-}2* "Part 2 - the reducing loop";
L{ }* "Run the reduction logic L times";
4{)_Z+s/Z}/ "Part 2a - reducing arities";
4{ }/ "Run the iteration for 0, 1, 2 and 3";
)_ "Increment the iteration number and make a copy";
Z+s "Get that many 0 and append it to the number";
/Z "Split the code onto this number and convert it";
"to 0. This successfully reduces 10, 200, ";
"3000 and 4000 to 0";
L{'(\Z')++/Z}/ "Part 2b - reducing groups";
L{ }/ "Run the iteration for 0 through L - 1";
'(\Z')++ "Get the string '(<iteration number of 0>)'";
/Z "split on it and join by 0. This successfully";
"reduces (), (0), (00) and so on .. to 0";
')- "After running the part 2 loop L times, all";
"reducible arities and brackets should be reduced";
"except for cases like '30)00' where the no-op )";
"breaks the flow. So we remove ) from the code";
"and run Part 2 L times again";
0s-! "Now, if the code was valid, we should only have 0 in the";
"remaining code. If not, the code was invalid";
Try it online here or run the whole suite
Regex, PCRE flavor, 83 bytes
^(?3)*\)*$(()((?(2)|\)*)(\((?1)*(\)|$)|0|"[^"]*.?|(1|(2|(3|4(?3))(?3))(?3))(?3))))?
Try it here.
Regex, PCRE flavor, 85 bytes
^((?(3)|\)*)(?>\((?2)*(()(?1))?(\)|$)|0|"[^"]*.?|(1|(2|(3|4(?1))(?1))(?1))(?1)))*\)*$
Try it here.
Used some ideas in this dan1111's answer.
Some explanations about (?2)*(()(?1))?
.
80386 Assembler, 97 bytes
Hex dump:
0000000: 8b54 2404 5589 e531 c96a ff8a 022c 303c .T$.U..1.j...,0<
0000010: f275 04f6 d530 c084 ed75 263c f875 0141 .u...0...u&<.u.A
0000020: 3cf9 750f 84c9 7419 4958 3c00 7c03 50eb <.u...t.IX<.|.P.
0000030: 2430 c084 c075 0958 483c ff7f 0140 ebf3 $0...u.XH<...@..
0000040: 5042 8a02 84c0 75c5 4a84 edb0 2275 be84 PB....u.J..."u..
0000050: c9b0 2975 b85a 31c0 39e5 7501 4089 ec5d ..)u.Z1.9.u.@..]
0000060: c3 .
This runs through the input once, pushing numbers greater than zero onto the stack and decrementing them when a zero is processed. Unboundeds are processed as a -1.
Function prototype (in C) (function returns 0 if invalid and 1 if valid):
int __cdecl test(char *in);
Equivalent assembly (NASM):
bits 32
; get input pointer into edx
mov edx, [esp+4] ; 8B 54 24 04
; save ebp; set ebp = esp
push ebp ; 55
mov ebp, esp ; 89 E5
; clear ecx
xor ecx, ecx ; 31 C9
; push base -1
push byte(-1) ; 6A FF
; get top char
mov al, [edx] ; 8A 02
sub al, 0x30 ; 2C 30
; if al == quote
cmp al, 0xF2 ; 3C F2
jne $+6 ; 75 04
; set ch (in quote?) to opposite
not ch ; F6 D5
; set value to 0
xor al, al ; 30 C0
; if in quote, continue
test ch, ch ; 84 ED
jnz $+40 ; 75 26
cmp al, 0xF8 ; 3C F8
jne $+3 ; 75 01
; increment cl=depth
inc ecx ; 41
cmp al, 0xF9 ; 3C F9
jne $+17 ; 75 0F
; check depth = 0
test cl, cl ; 84 C9
jz $+27 ; 74 19
; decrement cl=depth
dec ecx ; 49
; pop and check -1
pop eax ; 58
cmp al, 0 ; 3C 00
jl $+5 ; 7C 03
push eax ; 50
jmp $+38 ; EB 24
xor al, al ; 30 C0
test al, al ; 84 C0
jnz $+11 ; 75 09
pop eax ; 58
dec eax ; 48
cmp al, -1 ; 3C FF
jg $+3 ; 7F 01
inc eax ; 40
jmp $-11 ; EB F3
push eax ; 50
inc edx ; 42
mov al, [edx] ; 8A 02
test al, al ; 84 C0
jnz $-57 ; 75 C5
dec edx ; 4A
; in quote?
test ch, ch ; 84 ED
mov al, 0x22 ; B0 22
jnz $-64 ; 75 BE
; depth not zero?
test cl, cl ; 84 C9
mov al, 0x29 ; B0 29
jnz $-70 ; 75 B8
; pop base -1
pop edx ; 5A
; set return value based on ebp/esp comparison
xor eax, eax ; 31 C0
cmp ebp, esp ; 39 E5
jne $+3 ; 75 01
inc eax ; 40
; restore esp
mov esp, ebp ; 89 EC
; restore ebp
pop ebp ; 5D
; return
ret ; C3
The following code in C can be used with GCC on a POSIX system to test:
#include <sys/mman.h>
#include <stdio.h>
#include <string.h>
int main(){
char code[] = {
0x8b, 0x54, 0x24, 0x04, 0x55, 0x89, 0xe5, 0x31, 0xc9, 0x6a, 0xff,
0x8a, 0x02, 0x2c, 0x30, 0x3c, 0xf2, 0x75, 0x04, 0xf6, 0xd5, 0x30,
0xc0, 0x84, 0xed, 0x75, 0x26, 0x3c, 0xf8, 0x75, 0x01, 0x41, 0x3c,
0xf9, 0x75, 0x0f, 0x84, 0xc9, 0x74, 0x19, 0x49, 0x58, 0x3c, 0x00,
0x7c, 0x03, 0x50, 0xeb, 0x24, 0x30, 0xc0, 0x84, 0xc0, 0x75, 0x09,
0x58, 0x48, 0x3c, 0xff, 0x7f, 0x01, 0x40, 0xeb, 0xf3, 0x50, 0x42,
0x8a, 0x02, 0x84, 0xc0, 0x75, 0xc5, 0x4a, 0x84, 0xed, 0xb0, 0x22,
0x75, 0xbe, 0x84, 0xc9, 0xb0, 0x29, 0x75, 0xb8, 0x5a, 0x31, 0xc0,
0x39, 0xe5, 0x75, 0x01, 0x40, 0x89, 0xec, 0x5d, 0xc3,
};
void *mem = mmap(0, sizeof(code), PROT_WRITE|PROT_EXEC, MAP_ANON|MAP_PRIVATE, -1, 0);
memcpy(mem, code, sizeof(code));
int __cdecl (*test)(char *) = (int __cdecl (*)(char *)) mem;
#define TRY(s) printf(s ": %d\n", test(s))
printf("Truthy tests:\n");
TRY("0");
TRY(")");
TRY("(");
TRY("\"");
TRY("()");
TRY("\"\"");
TRY("10");
TRY("400010");
TRY("210010");
TRY("(\"\")00");
TRY("3\"\"\"\"\"");
TRY("(0)))0)1)0");
TRY("2(2(2(0)0)0)0");
TRY("2\"010\"\"44)()4\"");
TRY(")31(0)0(201000100");
TRY("())2)1))0\"3())\"))");
TRY("3(\"4321(\"301(0)21100\"4\")\"123\"00)40\"121\"31000\"\"01010");
printf("\nFalsy tests:\n");
TRY("1");
TRY("1(310");
TRY("(1)0)");
TRY("12(010");
TRY("4\"00010\"");
TRY("3120102100");
TRY("20(2((0)(0)))");
TRY("2(2(2(0)0)0)01)");
TRY("4(0102)00)00000");
TRY("2\"00\"(\"00\"2(\"\"))");
munmap(mem, sizeof(code));
return 0;
}